Example #1
def get_zygosity(sample, bwtfile, threshold, expected_length, ref_te):
    # Requires csv, Levenshtein (imported as lv), and MultiStringBWT, plus
    # the get_kmers/growContext helpers defined elsewhere in this module.
    msbwt = MultiStringBWT.loadBWT(bwtfile, useMemmap=False)
    oc_length = 25  # length of the "other context" window
    ed_th = .2 * oc_length  # edit-distance threshold: 20% of the window
    kmer_list = get_kmers(sample, ref_te)
    zygosity_data = []
    zygo_dict = {}
    other_context = {}
    for TEi_id, my_id, side, chromo, pos, strand, ref_te, context, TE, TSD in kmer_list:
        new_context = MultiStringBWT.reverseComplement(
            context) if side == 'start' else context
        lo, hi = msbwt.findIndicesOfStr(new_context)
        context_List = set()
        context_List = growContext(msbwt, lo, hi, '', oc_length, context_List)
        zygo_dict[TEi_id] = zygo_dict.get(TEi_id, set([1]))

        # Compare each observed context against the matching end of the TE;
        # the comparison window is computed once outside the loop.
        te_window = TE[:oc_length] if side == 'start' else TE[-oc_length:]
        for oc in context_List:
            oc = MultiStringBWT.reverseComplement(
                oc) if side == 'start' else oc
            ed = lv.distance(oc, te_window)

            if ed > ed_th:
                zygo_dict[TEi_id] = zygo_dict[TEi_id] | set([0])
                other_context[TEi_id] = oc
        zygosity_data.append([
            TEi_id, my_id, side, chromo, pos, strand, ref_te, context,
            te_window, TSD
        ])

    individual_file = "IndividualAnalysis/%s_%s.csv" % (sample, ref_te)
    zygosity_data.sort(key=lambda x: (x[3], x[4]))
    header = [
        'TEi_id', 'my_id', 'side', 'chromo', 'pos', 'strand', 'ref_te',
        'context', 'TE', 'TSD', 'other_context', 'zygosity'
    ]
    with open(individual_file, 'wb') as fp:
        a = csv.writer(fp, delimiter=',')
        a.writerow(header)
        for d in zygosity_data:
            zygosity = 'heterozygous' if len(
                zygo_dict[d[0]]) == 2 else 'homozygous'
            d.append(other_context.get(d[0], ''))
            d.append(zygosity)
            a.writerow(d)
    print "Wrote file: %s [%d lines]" % (individual_file, len(zygosity_data))
def runQuery(**kwargs):
    # Datasets are named "<dirIndex>-<name>": the index selects a root from
    # the MSBWTdirs global; the remainder is the BWT directory under it.
    pieces = kwargs["dataset"].split('-')
    directory = MSBWTdirs[int(pieces[0])] + '/' + '-'.join(pieces[1:])
    # load the MSBWT
    msbwt = MSBWT.loadBWT(directory)
    if kwargs['forward'] == "true":
        forwardResults = [
            msbwt.countOccurrencesOfSeq(str(kmer))
            for kmer in kwargs['kmerQueries']
        ]
    else:
        forwardResults = []
    if kwargs['revComp'] == "true":
        rcResults = [
            msbwt.countOccurrencesOfSeq(MSBWT.reverseComplement(str(kmer)))
            for kmer in kwargs['kmerQueries']
        ]
    else:
        rcResults = []
    return [forwardResults, rcResults]
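A hedged usage sketch, assuming MSBWTdirs is a module-level list of BWT root directories (say MSBWTdirs = ['/data/bwts']) and a BWT lives at /data/bwts/sample1:

fwd, rc = runQuery(dataset='0-sample1',
                   kmerQueries=['ACGTACGT'],
                   forward='true',
                   revComp='true')
# fwd and rc are per-kmer count lists, e.g. [12] and [9]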
Example #3
    def firstTimeExtension(self, foundKmers, unexploredPaths, nodes, edges):
        '''
        @param foundKmers - dict of k-mers seen so far; each new k-mer is checked against it and added if absent
        @param unexploredPaths - queue of nodes to explore later; new splits and merge targets are appended here
        @param nodes - the master list of path nodes; newly created nodes are appended here
        @param edges - the master list of path edges; newly created edges are appended here
        '''
        pc = ''
        kmer = self.seq
        terminate = False
        while not terminate:
            if len(kmer) != self.pathK:
                print('ERROR: DIFFERENT SIZED K-MER ' + str(len(kmer)))
                raise Exception('ERROR')

            #First, perform all the counts of paths going both forwards and backwards
            counts = {}
            revCounts = {}

            #maxV - the count of the (k+1)-mer with maxC on it, total is the total counts of valid chars
            maxV = 0
            maxC = ''
            total = 0

            #count the number of forward and reversed paths
            numPaths = 0
            numRevPaths = 0

            for c in self.validChars:
                counts[c] = self.msbwt.countOccurrencesOfSeq(
                    kmer + c) + self.msbwt.countOccurrencesOfSeq(
                        MultiStringBWT.reverseComplement(kmer + c))
                revCounts[c] = self.msbwt.countOccurrencesOfSeq(
                    c + kmer) + self.msbwt.countOccurrencesOfSeq(
                        MultiStringBWT.reverseComplement(c + kmer))

                if self.drawDollarTerminals or c != '$':
                    total += counts[c]
                    if counts[c] > maxV:
                        maxV = counts[c]
                        maxC = c

                    if counts[c] >= self.pathThreshold:
                        numPaths += 1

                    #if we have evidence from the counts OR if the previous character was known to be that character
                    if revCounts[c] >= self.pathThreshold or c == pc:
                        numRevPaths += 1

            #check if we have incoming edges, in which case we need to end this block
            if numRevPaths > 1 and kmer != self.seq:

                #remove the last kmer, because it's actually in the new node we merge into
                self.seq = self.seq[0:-1]

                #this will lead to repeating the same counts later, but that's okay
                newID = len(nodes)
                newHistMers = set([])
                nodes.append(
                    PathNode(newID, kmer, self.msbwt,
                             self.minDistToSeed + len(self.pileups),
                             self.settingsDict))
                edges.append(
                    PathEdge(len(edges), self.nodeID, newID, revCounts[pc],
                             pc + ', ' + str(revCounts)))
                self.termCondition = 'MERGE_' + str(newID)
                foundKmers[kmer] = newID

                unexploredPaths.append(nodes[newID])

                #print 'Ending block for merge'
                terminate = True

            elif total == 0:
                #print 'No strings found.'
                self.termCondition = 'TERMINAL'
                terminate = True
            else:
                #the kmer was found in this block and it may have multiple extensions
                foundKmers[kmer] = self.nodeID
                revMer = MultiStringBWT.reverseComplement(kmer)
                if revMer in foundKmers:
                    otherID = foundKmers[revMer]
                    self.inversionSet.add(otherID)
                    nodes[otherID].inversionSet.add(self.nodeID)

                r1 = self.msbwt.findIndicesOfStr(kmer[-self.countK:])
                r2 = self.msbwt.findIndicesOfStr(
                    MultiStringBWT.reverseComplement(kmer[-self.countK:]))
                kmerCount = (r1[1] - r1[0]) + (r2[1] - r2[0])
                self.pileups.append(kmerCount)
                perc = float(maxV) / total

                if self.trackReads:
                    for i in range(r1[0], r1[1]):
                        self.readSet.add(
                            (int(self.msbwt.getSequenceDollarID(i)), 0))
                    for i in range(r2[0], r2[1]):
                        self.readSet.add(
                            (int(self.msbwt.getSequenceDollarID(i)), 1))

                #if kmerCount > self.overloadThreshold:
                if self.pileups[0] > self.overloadThreshold:
                    #this path is too heavy, we probably won't figure out what's going on downstream
                    self.termCondition = 'OVERLOAD'
                    terminate = True

                elif numPaths > 1:
                    self.termCondition = 'SPLIT'
                    for c in self.validChars:
                        if counts[c] >= self.pathThreshold:
                            newKmer = kmer[1:] + c
                            if newKmer in foundKmers:
                                otherNID = foundKmers[newKmer]
                                nodes[otherNID].minDistToSeed = min(
                                    nodes[otherNID].minDistToSeed,
                                    self.minDistToSeed + len(self.pileups))
                                edges.append(
                                    PathEdge(len(edges), self.nodeID, otherNID,
                                             counts[c],
                                             c + ': ' + str(counts[c])))

                            else:
                                if self.drawDollarTerminals or c != '$':
                                    newID = len(nodes)
                                    newHistMers = set([])
                                    nodes.append(
                                        PathNode(
                                            newID, newKmer, self.msbwt,
                                            self.minDistToSeed +
                                            len(self.pileups),
                                            self.settingsDict))
                                    edges.append(
                                        PathEdge(len(edges), self.nodeID,
                                                 newID, counts[c],
                                                 c + ': ' + str(counts[c])))
                                    foundKmers[newKmer] = newID

                                    if c != '$':
                                        unexploredPaths.append(nodes[newID])
                                    else:
                                        nodes[newID].termCondition = '$ Ext'

                    terminate = True
                else:
                    #this is data pertaining to this k-mer
                    #print ':\t'+kmer+maxC+'\t'+str(perc)+'\t'+str(maxV)+'/'+str(total)+'\t'+str(total-maxV)+'\t'
                    pc = kmer[0]
                    kmer = kmer[1:] + maxC
                    #check if we've found the new k-mer before
                    if kmer in foundKmers:
                        otherNID = foundKmers[kmer]
                        nodes[otherNID].minDistToSeed = min(
                            nodes[otherNID].minDistToSeed,
                            self.minDistToSeed + len(self.pileups))
                        if counts[maxC] >= self.pathThreshold:
                            edges.append(
                                PathEdge(len(edges), self.nodeID, otherNID,
                                         counts[maxC],
                                         pc + ': ' + str(counts[maxC])))
                            self.termCondition = 'MERGE_' + str(otherNID)
                        else:
                            edges.append(
                                PathEdge(len(edges), self.nodeID, otherNID,
                                         counts[maxC],
                                         pc + ': ' + str(counts[maxC]),
                                         'dashed'))
                            self.termCondition = 'MERGE_' + str(
                                otherNID) + ', THRESHOLD'

                        terminate = True
                    else:
                        self.seq += maxC
                        if maxC == '$':
                            self.termCondition = '$ Max'
                            terminate = True
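The core move above is counting every one-character extension of a k-mer on both strands. A standalone sketch of just that step (the function name is ours; it relies only on the countOccurrencesOfSeq and reverseComplement calls already used above):

def count_extensions(msbwt, kmer, valid_chars='$ACGT'):
    # counts[c] = occurrences of kmer+c, summed over both strands
    counts = {}
    for c in valid_chars:
        counts[c] = (msbwt.countOccurrencesOfSeq(kmer + c) +
                     msbwt.countOccurrencesOfSeq(
                         MultiStringBWT.reverseComplement(kmer + c)))
    return counts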
Example #4
def TestForUnique(sample, side, bowtie_dir, species, expected_length):
    global genome
    bt2cmd = "bowtie2 -x %s/%s --no-head -r --end-to-end -k 4 %s.seq > %s.sam"
    designfile = "tmp/bowtie_data/%s_%s.csv" % (sample, side)

    t = designfile.rfind('.')
    root = designfile[:t]
    outfile = root + ".seq"

    fp = open(outfile, 'wb')
    design = pd.read_csv(filepath_or_buffer=designfile, sep=',')
    N = design.shape[0]
    probes = {}
    distinct_context = set()
    for index, row in enumerate(design.values):
        seq = row[1]
        fp.write(seq + "\n")
        distinct_context.add(seq)
    print "TOtal distinct contexts: %d" % (len(distinct_context))
    fp.close()
    print "Wrote %s (%d lines)" % (outfile, N)
    sys.stdout.flush()
    code = subprocess.call(bt2cmd % (bowtie_dir, species, root, root),
                           shell=True)
    if (code == 0):
        print "Alignment completed"
    else:
        print "Alignment failed:" + (bt2cmd %
                                     (bowtie_dir, species, root, root))
        return

    samfile = outfile.replace('.seq', '.sam')
    columns = [str(i) for i in range(20)]
    df = pd.read_csv(filepath_or_buffer=samfile,
                     names=columns,
                     sep='\t',
                     header=None)
    df = df.drop_duplicates(subset=['0'], keep=False)
    #df.to_csv("tmp/bowtie_data/test.csv")
    data = df.iloc[:, ].values
    unique_locations = {}
    unmapped = 0
    pos_set = set()
    new_data = []
    unique = set()
    for fields in data:
        index = fields[0]
        chromo = fields[2]
        pos = fields[3]
        flags = int(fields[1])
        if chromo == '*':
            unmapped += 1
            continue
        alignment_score = -100
        if fields[11].find("AS:i:") == 0:
            alignment_score = int(fields[11].split(":")[-1])
        if alignment_score < 0:
            continue

        strand = '-' if flags & 16 else '+'
        new_seq = revcomp(fields[9]) if flags & 16 else fields[9]
        new_data.append([new_seq, chromo, pos, strand])
        unique.add(new_seq)

    locationfile = outfile.replace('.seq', '_location.csv')
    with open(locationfile, 'wb') as fp:
        a = csv.writer(fp, delimiter=',')
        header = ['context', 'chromo', 'pos', 'strand']
        a.writerow(header)
        for d in new_data:
            a.writerow(d)

    print "Wrote file: %s [%d lines]" % (locationfile, len(new_data))
    print "unmapped: %d" % unmapped
    print "unique: %d" % len(unique)
    ########

    df1 = pd.read_csv(filepath_or_buffer=designfile, sep=',')
    df2 = pd.read_csv(filepath_or_buffer=locationfile, sep=',')

    result = pd.merge(df1, df2, how='right', on=['context'])
    data = result.iloc[:, ].values
    new_data = []

    # ['id', 'side', 'context', 'chromo', 'pos', 'strand']
    for d in data:
        [my_id, context, te, chromo, pos, strand] = d[0:6]
        #chromo = chromo[3:]
        if chromo not in genome:
            continue
        plen = len(context)

        if strand == '+' and side == 'start':
            ref_prefix = genome[chromo][pos:pos + plen]
            ref_suffix = genome[chromo][pos + plen:pos + plen + 25]
            other_context = genome[chromo][pos + plen:pos + plen + 25]
            pos = pos + plen
        elif strand == '+' and side == 'end':
            ref_prefix = genome[chromo][pos - 25:pos]
            ref_suffix = genome[chromo][pos:pos + plen]
            other_context = genome[chromo][pos - 25:pos]
        elif strand == '-' and side == 'start':
            ref_prefix = genome[chromo][pos:pos + 25]
            ref_suffix = genome[chromo][pos - plen:pos]
            other_context = genome[chromo][pos - 25:pos]
        elif strand == '-' and side == 'end':
            ref_prefix = genome[chromo][pos + plen:pos + plen + 25]
            ref_suffix = genome[chromo][pos:pos + plen]
            other_context = genome[chromo][pos + plen:pos + plen + 25]
            pos = pos + plen
        other_context = MultiStringBWT.reverseComplement(
            other_context) if strand == '-' else other_context
        if strand == '-':
            ref_prefix = MultiStringBWT.reverseComplement(ref_prefix)
            ref_suffix = MultiStringBWT.reverseComplement(ref_suffix)

        ed_th = .2 * 25  # edit-distance threshold: 20% of the 25 bp window
        if side == 'start':
            ed = lv.distance(ref_suffix[:25], te[:25])
        else:
            ed = lv.distance(ref_prefix[-25:], te[-25:])
        ref_te = 1 if ed <= ed_th else 0
        new_data.append([
            my_id, context, te, ref_te, ref_prefix, ref_suffix, chromo, pos,
            strand,
            len(context)
        ])
        #new_data.append([my_id,context,te,ref_prefix,ref_suffix,chromo,pos,strand])

    finalfile = locationfile.replace('location', 'UNIQUE')
    with open(finalfile, 'wb') as fp:
        a = csv.writer(fp, delimiter=',')
        header = [
            'my_id', 'context', 'TE', 'ref_te', 'ref_prefix', 'ref_suffix',
            'chromo', 'pos', 'strand', 'clen'
        ]
        a.writerow(header)
        for d in new_data:
            a.writerow(d)
    remove_duplicates(finalfile)
    return
    # Cleanup of the temporary bowtie files (currently disabled by the
    # early return above):
    #os.system("rm ./tmp/bowtie_data/*.seq")
    #os.system("rm ./tmp/bowtie_data/*.sam")
    #os.system("rm ./tmp/bowtie_data/*_location.csv")
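The strand handling above relies on SAM FLAG bit 0x10, which marks an alignment on the reverse strand. A small helper capturing that convention, as a sketch (the function name is ours; revcomp is the helper already used above):

def orient_alignment(flags, seq):
    # SAM FLAG bit 0x10 set -> alignment is on the reverse strand
    if int(flags) & 16:
        return '-', revcomp(seq)
    return '+', seq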
def mainRun():
    '''
    This is the primary function for external typical users to run when the Command Line Interface is used
    '''
    #start up the logger
    initLogger()

    #attempt to parse the arguments
    p = ap.ArgumentParser(description=util.DESC,
                          formatter_class=ap.RawTextHelpFormatter)

    #version data
    p.add_argument('-V', '--version', action='version', version='%(prog)s' + \
                   ' %s in MSBWT %s' % (util.VERSION, util.PKG_VERSION))

    #TODO: do we want subparsers grouped by type or sorted by name? it's by type currently

    sp = p.add_subparsers(dest='subparserID')
    p2 = sp.add_parser('cffq',
                       help='create a MSBWT from FASTQ files (pp + cfpp)')
    p2.add_argument('-p',
                    metavar='numProcesses',
                    dest='numProcesses',
                    type=int,
                    default=1,
                    help='number of processes to run (default: 1)')
    p2.add_argument('-u',
                    '--uniform',
                    dest='areUniform',
                    action='store_true',
                    help='the input sequences have uniform length',
                    default=False)
    p2.add_argument('-c',
                    '--compressed',
                    dest='buildCompressed',
                    action='store_true',
                    help='build the RLE BWT (faster, less disk I/O)',
                    default=False)
    p2.add_argument('outBwtDir',
                    type=util.newDirectory,
                    help='the output MSBWT directory')
    p2.add_argument('inputFastqs',
                    nargs='+',
                    type=util.readableFastqFile,
                    help='the input FASTQ files')

    p7 = sp.add_parser('pp',
                       help='pre-process FASTQ files before BWT creation')
    p7.add_argument('-u',
                    '--uniform',
                    dest='areUniform',
                    action='store_true',
                    help='the input sequences have uniform length',
                    default=False)
    p7.add_argument('outBwtDir',
                    type=util.newDirectory,
                    help='the output MSBWT directory')
    p7.add_argument('inputFastqs',
                    nargs='+',
                    type=util.readableFastqFile,
                    help='the input FASTQ files')

    p3 = sp.add_parser(
        'cfpp', help='create a MSBWT from pre-processed sequences and offsets')
    p3.add_argument('-p',
                    metavar='numProcesses',
                    dest='numProcesses',
                    type=int,
                    default=1,
                    help='number of processes to run (default: 1)')
    p3.add_argument('-u',
                    '--uniform',
                    dest='areUniform',
                    action='store_true',
                    help='the input sequences have uniform length',
                    default=False)
    p3.add_argument('-c',
                    '--compressed',
                    dest='buildCompressed',
                    action='store_true',
                    help='build the RLE BWT (faster, less disk I/O)',
                    default=False)
    p3.add_argument('bwtDir',
                    type=util.existingDirectory,
                    help='the MSBWT directory to process')

    p4 = sp.add_parser('merge', help='merge many MSBWTs into a single MSBWT')
    p4.add_argument('-p',
                    metavar='numProcesses',
                    dest='numProcesses',
                    type=int,
                    default=1,
                    help='number of processes to run (default: 1)')
    p4.add_argument('outBwtDir',
                    type=util.newDirectory,
                    help='the output MSBWT directory')
    p4.add_argument('inputBwtDirs',
                    nargs='+',
                    type=util.existingDirectory,
                    help='input BWT directories to merge')

    p5 = sp.add_parser(
        'query',
        help='search for a sequence in an MSBWT, prints sequence and seqID')
    p5.add_argument('inputBwtDir',
                    type=util.existingDirectory,
                    help='the BWT to query')
    p5.add_argument('kmer',
                    type=util.validKmer,
                    help='the input k-mer to search for')
    p5.add_argument(
        '-d',
        '--dump-seqs',
        dest='dumpSeqs',
        action='store_true',
        help='print all sequences with the given kmer (default=False)',
        default=False)

    p6 = sp.add_parser('massquery',
                       help='search for many sequences in an MSBWT')
    p6.add_argument('inputBwtDir',
                    type=util.existingDirectory,
                    help='the BWT to query')
    p6.add_argument('kmerFile', help='a file with one k-mer per line')
    p6.add_argument('outputFile', help='output file with counts per line')
    p6.add_argument('-r',
                    '--rev-comp',
                    dest='reverseComplement',
                    action='store_true',
                    help='also search for each kmer\'s reverse complement',
                    default=False)

    p8 = sp.add_parser('compress',
                       help='compress a MSBWT from byte/base to RLE')
    p8.add_argument('-p',
                    metavar='numProcesses',
                    dest='numProcesses',
                    type=int,
                    default=1,
                    help='number of processes to run (default: 1)')
    p8.add_argument('srcDir',
                    type=util.existingDirectory,
                    help='the source directory for the BWT to compress')
    p8.add_argument('dstDir',
                    type=util.newDirectory,
                    help='the destination directory')

    p9 = sp.add_parser('decompress',
                       help='decompress a MSBWT from RLE to byte/base')
    p9.add_argument('-p',
                    metavar='numProcesses',
                    dest='numProcesses',
                    type=int,
                    default=1,
                    help='number of processes to run (default: 1)')
    p9.add_argument('srcDir',
                    type=util.existingDirectory,
                    help='the source directory for the BWT to compress')
    p9.add_argument('dstDir',
                    type=util.newDirectory,
                    help='the destination directory')

    p10 = sp.add_parser('convert', help='convert from a raw text input to RLE')
    p10.add_argument('-i',
                     metavar='inputTextFN',
                     dest='inputTextFN',
                     default=None,
                     help='input text filename (default: stdin)')
    p10.add_argument('dstDir',
                     type=util.newDirectory,
                     help='the destination directory')

    args = p.parse_args()

    if args.subparserID == 'cffq':
        logger.info('Inputs:\t' + str(args.inputFastqs))
        logger.info('Uniform:\t' + str(args.areUniform))
        logger.info('Output:\t' + args.outBwtDir)
        logger.info('Output Compressed:\t' + str(args.buildCompressed))
        logger.info('Processes:\t' + str(args.numProcesses))
        if args.numProcesses > 1:
            logger.warning(
                'Using multi-processing with slow disk accesses can lead to slower build times.'
            )
        print
        if args.areUniform:
            #if they are uniform, use the method developed by Bauer et al., it's likely short Illumina seq
            if args.buildCompressed:
                MultiStringBWT.createMSBWTCompFromFastq(
                    args.inputFastqs, args.outBwtDir, args.numProcesses,
                    args.areUniform, logger)
            else:
                MultiStringBWT.createMSBWTFromFastq(args.inputFastqs,
                                                    args.outBwtDir,
                                                    args.numProcesses,
                                                    args.areUniform, logger)
        else:
            #if they aren't uniform, use the merge method by Holt et al., it's likely longer PacBio seq
            if args.buildCompressed:
                logger.error(
                    'No compressed builder for non-uniform datasets, compress after creation.'
                )
            else:
                Multimerge.createMSBWTFromFastq(args.inputFastqs,
                                                args.outBwtDir,
                                                args.numProcesses,
                                                args.areUniform, logger)

    elif args.subparserID == 'pp':
        logger.info('Inputs:\t' + str(args.inputFastqs))
        logger.info('Uniform:\t' + str(args.areUniform))
        logger.info('Output:\t' + args.outBwtDir)
        if args.areUniform:
            #preprocess for Bauer et al. method
            MultiStringBWT.preprocessFastqs(args.inputFastqs, args.outBwtDir,
                                            args.areUniform, logger)
        else:
            #preprocess for Holt et al. method
            numProcs = 1
            Multimerge.preprocessFastqs(args.inputFastqs, args.outBwtDir,
                                        numProcs, args.areUniform, logger)

    elif args.subparserID == 'cfpp':
        logger.info('BWT dir:\t' + args.bwtDir)
        logger.info('Uniform:\t' + str(args.areUniform))
        logger.info('Output Compressed:\t' + str(args.buildCompressed))
        logger.info('Processes:\t' + str(args.numProcesses))
        if args.numProcesses > 1:
            logger.warning(
                'Using multi-processing with slow disk accesses can lead to slower build times.'
            )
        print
        seqFN = args.bwtDir + '/seqs.npy'
        offsetFN = args.bwtDir + '/offsets.npy'
        bwtFN = args.bwtDir + '/msbwt.npy'

        if args.areUniform:
            #process it using the column-wise Bauer et al. method
            if args.buildCompressed:
                MSBWTCompGenCython.createMsbwtFromSeqs(args.bwtDir,
                                                       args.numProcesses,
                                                       logger)
            else:
                MSBWTGenCython.createMsbwtFromSeqs(args.bwtDir,
                                                   args.numProcesses, logger)
        else:
            #process it using the Holt et al. merge method
            if args.buildCompressed:
                logger.error(
                    'No compressed builder for non-uniform datasets, compress after creation.'
                )
            else:
                Multimerge.interleaveLevelMerge(args.bwtDir, args.numProcesses,
                                                args.areUniform, logger)

    elif args.subparserID == 'compress':
        logger.info('Source Directory:' + args.srcDir)
        logger.info('Dest Directory:' + args.dstDir)
        logger.info('Processes:' + str(args.numProcesses))
        if args.srcDir == args.dstDir:
            raise Exception(
                'Source and destination directories cannot be the same directory.'
            )
        print
        MSBWTGen.compressBWT(args.srcDir + '/msbwt.npy',
                             args.dstDir + '/comp_msbwt.npy',
                             args.numProcesses, logger)

    elif args.subparserID == 'decompress':
        logger.info('Source Directory: ' + args.srcDir)
        logger.info('Dest Directory: ' + args.dstDir)
        logger.info('Processes: ' + str(args.numProcesses))
        print
        MSBWTGen.decompressBWT(args.srcDir, args.dstDir, args.numProcesses,
                               logger)
        #TODO: remove if srcdir and dstdir are the same?

    elif args.subparserID == 'merge':
        logger.info('Inputs:\t' + str(args.inputBwtDirs))
        logger.info('Output:\t' + args.outBwtDir)
        logger.info('Processes:\t' + str(args.numProcesses))
        numProcs = 1  # merge is single-process for now, regardless of -p
        if args.numProcesses > 1:
            logger.warning(
                'Multi-processing is not supported at this time, but will be included in a future release.'
            )
        print
        #MSBWTGen.mergeNewMSBWT(args.outBwtDir, args.inputBwtDirs, args.numProcesses, logger)
        if len(args.inputBwtDirs) > 2:
            #this is a deprecated method, it may still work if you feel daring
            #MSBWTGenCython.mergeMsbwts(args.inputBwtDirs, args.outBwtDir, 1, logger)
            logger.error(
                'Merging more than two MSBWTs at once is not currently supported.'
            )
        else:
            GenericMerge.mergeTwoMSBWTs(args.inputBwtDirs[0],
                                        args.inputBwtDirs[1], args.outBwtDir,
                                        numProcs, logger)

    elif args.subparserID == 'query':
        #this is the easiest thing we can do, don't dump the standard info, just do it
        msbwt = MultiStringBWT.loadBWT(args.inputBwtDir, logger=logger)

        #always print how many are found, users can parse it out if they want
        r = msbwt.findIndicesOfStr(args.kmer)
        print r[1] - r[0]

        #dump the seqs if requested
        if args.dumpSeqs:
            for x in xrange(r[0], r[1]):
                dInd = msbwt.getSequenceDollarID(x)
                print msbwt.recoverString(dInd)[1:] + ',' + str(dInd)

    elif args.subparserID == 'massquery':
        logger.info('Input:\t' + str(args.inputBwtDir))
        logger.info('Queries:\t' + str(args.kmerFile))
        logger.info('Output:\t' + args.outputFile)
        logger.info('Rev-comp:\t' + str(args.reverseComplement))
        print
        msbwt = MultiStringBWT.loadBWT(args.inputBwtDir, logger=logger)

        output = open(args.outputFile, 'w+')
        output.write('k-mer,counts')
        if args.reverseComplement:
            output.write(',revCompCounts\n')
        else:
            output.write('\n')

        logger.info('Beginning queries...')
        for line in open(args.kmerFile, 'r'):
            kmer = line.strip('\n')
            c = msbwt.countOccurrencesOfSeq(kmer)
            if args.reverseComplement:
                rc = msbwt.countOccurrencesOfSeq(
                    MultiStringBWT.reverseComplement(kmer))
                output.write(kmer + ',' + str(c) + ',' + str(rc) + '\n')
            else:
                output.write(kmer + ',' + str(c) + '\n')
        logger.info('Queries complete.')

    elif args.subparserID == 'convert':
        if args.inputTextFN is None:
            logger.info('Input: stdin')
        else:
            logger.info('Input: ' + args.inputTextFN)
        logger.info('Output: ' + args.dstDir)
        logger.info('Beginning conversion...')
        CompressToRLE.compressInput(args.inputTextFN, args.dstDir)
        logger.info('Finished conversion.')

    else:
        print args.subparserID + " is currently not implemented, please wait for a future release."
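Assuming this mainRun is exposed as the package's console script (the msbwt package installs one; treat the command name as an assumption), the subcommands defined above would be driven like so:

msbwt cffq -p 4 -u -c /data/out_bwt reads1.fastq reads2.fastq
msbwt query /data/out_bwt ACGTACGTACGT -d
msbwt massquery -r /data/out_bwt kmers.txt counts.csv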
Example #8
def extractHaplotypes(bwt, kmer):
	forwardIndices = bwt.findIndicesOfStr(kmer)
	revComp = MultiStringBWT.reverseComplement(kmer)
	reverseIndices = bwt.findIndicesOfStr(revComp)
	
	readLen = 101  # assumes fixed-length 101 bp reads
	patternLen = len(kmer)
	totalBuffLen = 2*readLen-patternLen
	
	modifiedSeqs = []
	for i in xrange(forwardIndices[0], forwardIndices[1]):
		readSeq = bwt.recoverString(i)
		dollarPos = readSeq.find('$')
		
		#calculate how many trailing '.' characters we need first, then construct the string from that info
		afterPattern = readLen-dollarPos-1
		modSeq = ('.'*(readLen-patternLen-afterPattern)+
				  readSeq[dollarPos+1:].lower()+
				  readSeq[0:patternLen]+
				  readSeq[patternLen:dollarPos+1].lower()+
				  '.'*(afterPattern))
		modifiedSeqs.append(modSeq)
	
	for i in xrange(reverseIndices[0], reverseIndices[1]):
		revCompSeq = bwt.recoverString(i)
		readSeq = MultiStringBWT.reverseComplement(revCompSeq)
		dollarPos = readSeq.find('$')
		
		#beforePattern = readLen-dollarPos
		afterPattern = readLen-dollarPos-patternLen
		modSeq = ('.'*(readLen-patternLen-afterPattern)+
				  readSeq[dollarPos:-patternLen].lower()+
				  readSeq[-patternLen:]+
				  readSeq[0:dollarPos].lower()+
				  '.'*(afterPattern))
		modifiedSeqs.append(modSeq)

	finishedHaps = []

	previousConsensus = 'A'*totalBuffLen
	currentConsensus, currentScorer = conSeq(modifiedSeqs)
	currSeqs = modifiedSeqs

	while len(currSeqs) > 0 and compareShiftedSeqs(previousConsensus, currentConsensus) > 0:
		nextSeqs = []
		consensusSeqs = []
		
		#we will fill in consensus Seqs downstream
		finishedHaps.append((currentConsensus, consensusSeqs, []))
		
		#first get all exact matching reads
		for seq in currSeqs:
			if compareShiftedSeqs(seq, currentConsensus) == 0:
				consensusSeqs.append(seq)
			else:
				nextSeqs.append(seq)
		
		finishedHaps[-1][2].append((0, len(consensusSeqs)))
		
		#update these things
		previousConsensus = currentConsensus
		currSeqs = nextSeqs
		currentConsensus, currentScorer = conSeq(currSeqs)
		
		#check if the next consensus is identical
		acceptedScore = 1
		while len(currSeqs) > 0 and compareShiftedSeqs(currentConsensus, previousConsensus) == 0:
			#print 'triggered', acceptedScore
			nextNextSeqs = []
			minScore = 0xFFFFFFFFFFFFFFFF
			for seq in nextSeqs:
				calcScore = scoreShiftedSeqs(seq, currentConsensus, currentScorer)
				if calcScore < minScore and calcScore > acceptedScore:
					minScore = calcScore
				
				if calcScore <= acceptedScore:
					consensusSeqs.append(seq)
				else:
					nextNextSeqs.append(seq)
			finishedHaps[-1][2].append((acceptedScore, len(nextSeqs)-len(nextNextSeqs)))
			
			nextSeqs = nextNextSeqs
			currSeqs = nextSeqs
			currentConsensus, currentScorer = conSeq(currSeqs)
			
			#acceptedScore += 1
			acceptedScore = minScore

	for seq in currSeqs:
		consensusSeqs.append(seq)

	return finishedHaps
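A hedged driver for extractHaplotypes; the BWT directory and k-mer are placeholders, and conSeq/compareShiftedSeqs/scoreShiftedSeqs are assumed to be the helpers the function already calls:

bwt = MultiStringBWT.loadBWT('/path/to/bwt_dir')
for consensus, seqs, scoreCounts in extractHaplotypes(bwt, 'ACGTACGTACGTACGT'):
    print consensus        # one consensus string per recovered haplotype
    print len(seqs), scoreCounts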
Example #9
def build_bridge(msbwt,
                 seedKmer,
                 targetKmer,
                 tMin=1,
                 branchLim=10,
                 maxBranchLen=250):
    """
	Assemble the short "bridge" between two sequences expected to occur nearby on the template.
	@param msbwt - the MSBWT to use for searchs
	@param seedKmer - a k-mer to seed our bridging
	@param targetKmer - the target we are trying to bridge to
	@param tMin - the minimum k-count needed to consider the path
	@param branchLim - the maximum number of branches we will test
	@param maxBranchLen - the maximum length of a branch before giving up
	@return (ret, numBranched)
		ret - a list of bridges discovered; for most cases this is a list of length one
		numBranched - the number of branches we explored; if numBranched >= branchLim, this function was not 100% exhaustive
	"""

    #initialize to our input kmer
    ret = []
    possBridges = [dna.unmask(dna.ungap(seedKmer))]
    targetKmer = dna.unmask(dna.ungap(targetKmer))
    kmerLen = len(seedKmer)

    #set up some easy values
    validChars = "ACGT"
    counts = np.zeros(dtype='<u8', shape=(len(validChars), ))
    numBranched = 0

    #while we have things to explore, and we haven't explored too many, and we don't have a ridiculous number of possibilities
    while len(possBridges) > 0 and numBranched < branchLim:
        #get the bridge, the kmer, and the reverse kmer
        currBridge = possBridges.pop()
        numBranched += 1

        currKmer = currBridge[len(currBridge) - kmerLen:]
        revKmer = MultiStringBWT.reverseComplement(currKmer)

        #try to extend it on out
        while len(currBridge) < maxBranchLen:
            #get the counts for each possible extension
            for i, c in enumerate(validChars):
                counts[i] = msbwt.countOccurrencesOfSeq(
                    currKmer +
                    c) + msbwt.countOccurrencesOfSeq(dna.revcomp(c) + revKmer)

            #get the highest one
            maxPos = np.argmax(counts)
            maxSym = validChars[maxPos]

            #make sure the highest is high enough for us to consider it
            if counts[maxPos] >= tMin:
                if len(possBridges) < branchLim:
                    #go through all the other possible extensions
                    for i, c in enumerate(validChars):
                        if i != maxPos and counts[i] >= tMin:
                            #add the ones we aren't exploring right now if they're high enough
                            possBridges.append(currBridge + c)

                #this extension passes the threshold, so take it and loop around
                currBridge += maxSym
                currKmer = currKmer[1:] + maxSym
                revKmer = dna.revcomp(maxSym) + revKmer[0:len(revKmer) - 1]
            else:
                #our BEST doesn't pass the threshold on this path, stop following
                break

            if currKmer.startswith(targetKmer):
                #reached the target: record this bridge (the walk continues,
                #so longer bridges through the same target may also be kept)
                ret.append(currBridge)

    #return all our possibilities
    return (ret, numBranched)
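A sketch of a bridging call; the k-mers and BWT path are placeholders, and the dna helper module used inside build_bridge is assumed to be importable:

msbwt = MultiStringBWT.loadBWT('/path/to/bwt_dir')
bridges, explored = build_bridge(msbwt,
                                 'ACGTACGTACGTACGTACGTACGTA',
                                 'TTGCAATTGCAATTGCAATTGCAAT',
                                 tMin=2, branchLim=20, maxBranchLen=300)
if explored >= 20:
    print "warning: branch limit hit, bridging was not exhaustive"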
Example #10
def get_kmer_count(msbwt, kmer):
    # Total occurrences of a k-mer, summed over both strands.
    c1 = msbwt.countOccurrencesOfSeq(kmer)
    c2 = msbwt.countOccurrencesOfSeq(MultiStringBWT.reverseComplement(kmer))
    return c1 + c2
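Typical usage, following the loadBWT pattern from the earlier examples (the directory is a placeholder):

msbwt = MultiStringBWT.loadBWT('/path/to/bwt_dir')
print get_kmer_count(msbwt, 'ACGTACGTACGT')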