Example #1
def main():
    p = argparse.ArgumentParser()
    p.add_argument('A')
    p.add_argument('B')
    #p.add_argument('C')
    p.add_argument('output')
    p.add_argument('-k', '--ksize', default=31, type=int)
    args = p.parse_args()

    # table size should be scaled to the genome; for potato, ~4e9 is a reasonable
    # target. It only matters for the A-B hash.
    a_table = khmer.Nodetable(args.ksize, 10e9, 4)
    b_table = khmer.Nodetable(args.ksize, 10e9, 4)
    b_table.consume_seqfile(args.B)

    n_hashes_loaded = 0
    for record in khmer.utils.clean_input_reads(screed.open(args.A)):
        for hash in a_table.get_kmer_hashes(record.cleaned_seq):
            if not b_table.get(hash):
                a_table.add(hash)
                n_hashes_loaded += 1

    print('loaded {} k-mers in A but not in B'.format(n_hashes_loaded))
    # at this point, a_table contains A - B
    
    out_table = open(args.output, 'w')
    head = 'sample\ttotal-read\thap-kmer-read\ttotal-kmer\tkmer-hits\n'
    out_line = '' # one string to rule them all
    
    for fastq in fastqGrab():
        print('Now querying {}'.format(fastq))
        hitcount = 0 # initialize hitcount
        hashcount = 0 # initialize hashcount
        out_fp = open(fastq+'.subset', 'w')
        n_read = 0
        n_written = 0
        
        for record in khmer.utils.clean_input_reads(screed.open(fastq)):
            n_read += 1
            match = 0  # state 0 = no hits in read; switched to 1 on a match
            
            for hash in a_table.get_kmer_hashes(record.cleaned_seq):
                hashcount += 1
                if a_table.get(hash):
                    match = 1 # switch to state "match"
                    hitcount += 1 # tally k-mer hit
                    
            if match == 1: # write to file if switch tripped
                khmer.utils.write_record(record, out_fp)
                n_written += 1
                #break
                
        out_fp.close()
        print('wrote {} of {} records for {}'.format(n_written, n_read,
                                                      fastq))
        out_line += '{}\t{}\t{}\t{}\t{}\n'.format(fastq, n_read, n_written,
                                                  hashcount, hitcount)

    out_table.write(head + out_line)
    out_table.close()
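The example above calls a fastqGrab() helper that is not shown. A minimal sketch of it, assuming it simply yields the FASTQ filenames to query from the working directory (the glob pattern and default are hypothetical):

import glob

def fastqGrab(pattern='*.fastq'):
    # Hypothetical helper: yield the FASTQ files to screen against the A - B
    # table, here simply everything matching a glob pattern.
    for filename in sorted(glob.glob(pattern)):
        yield filename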
Example #2
    def __init__(self, query_file, ksize, scaled, catlas_name, debug=True):
        self.filename = query_file
        self.ksize = ksize
        self.kmers = set()
        self.name = None
        mh = MinHash(0, ksize, scaled=scaled)
        self.mh = mh
        self.catlas_name = catlas_name
        self.debug = debug

        notify('----')
        notify('QUERY FILE: {}', self.filename)

        # build hashes for all the query k-mers & create signature
        notify('loading query kmers...', end=' ')
        bf = khmer.Nodetable(ksize, 1, 1)

        for record in screed.open(self.filename):
            if self.name is None:
                self.name = record.name
            if len(record.sequence) >= int(ksize):
                self.kmers.update(bf.get_kmer_hashes(record.sequence))
            mh.add_sequence(record.sequence, True)

        self.sig = sourmash.SourmashSignature(mh,
                                              name=self.name,
                                              filename=self.filename)

        notify('got {} k-mers from query', len(self.kmers))

        self.cdbg_match_counts = {}
        self.catlas_match_counts = {}
Example #3
def load_mask(maskfiles,
              ksize,
              memory,
              maxfpr=0.001,
              savefile=None,
              logfile=sys.stderr):
    """Load reference genome and/or contaminant database from a file."""
    if len(maskfiles) == 1 and maskfiles[0].endswith(('.nt', '.nodetable')):
        mask = kevlar.sketch.load(maskfiles[0])
        message = '    nodetable loaded'
    else:
        buckets = memory * khmer._buckets_per_byte['nodegraph'] / 4
        mask = khmer.Nodetable(ksize, buckets, 4)
        nr, nk = 0, 0
        for maskfile in maskfiles:
            numreads, numkmers = mask.consume_seqfile(maskfile)
            nr += numreads
            nk += numkmers
        message = '    {:d} sequences and {:d} k-mers consumed'.format(nr, nk)
    fpr = kevlar.sketch.estimate_fpr(mask)
    message += '; estimated false positive rate is {:1.3f}'.format(fpr)
    print(message, file=logfile)
    if fpr > maxfpr:
        print('[kevlar::filter] FPR too high, bailing out', file=logfile)
        sys.exit(1)
    if savefile:
        mask.save(savefile)
        message = '    nodetable saved to "{:s}"'.format(savefile)
        print(message, file=logfile)
    return mask
Example #4
def count_second_pass(infiles, counts, nthreads=1):
    kevlar.plog('[kevlar::dist] Second pass over the data')
    tracking = khmer.Nodetable(counts.ksize(), 1, 1, primes=counts.hashsizes())
    abund_lists = list()

    def __do_abund_dist(parser):
        abund = counts.abundance_distribution(parser, tracking)
        abund_lists.append(abund)

    for filename in infiles:
        kevlar.plog('    -', filename)
        parser = khmer.ReadParser(filename)
        threads = list()
        for _ in range(nthreads):
            thread = threading.Thread(target=__do_abund_dist, args=(parser, ))
            threads.append(thread)
            thread.start()
        for thread in threads:
            thread.join()

    assert len(abund_lists) == len(infiles) * nthreads
    abundance = defaultdict(int)
    for abund in abund_lists:
        for i, count in enumerate(abund):
            if i > 0 and count > 0:
                abundance[i] += count

    kevlar.plog('[kevlar::dist] Done second pass over input!')

    return abundance
Example #5
def main(argv):
    p = argparse.ArgumentParser()
    p.add_argument('catlas_prefix')
    p.add_argument('-k', '--ksize', default=31, type=int)
    a = p.parse_args(argv)

    kh = khmer.Nodetable(a.ksize, 1, 1)

    contigs_filename = os.path.join(a.catlas_prefix, 'contigs.fa.gz')
    mphf_filename = os.path.join(a.catlas_prefix, 'contigs.fa.gz.mphf')
    array_filename = os.path.join(a.catlas_prefix, 'contigs.fa.gz.indices')

    def create_records_iter():
        print('reading cDBG nodes from {}'.format(contigs_filename))
        return screed.open(contigs_filename)

    x, mphf_to_kmer, mphf_to_cdbg, sizes = build_mphf(kh, create_records_iter)

    print('done! saving to {} and {}'.format(mphf_filename, array_filename))

    x.save(mphf_filename)
    with open(array_filename, 'wb') as fp:
        numpy.savez_compressed(fp,
                               mphf_to_kmer=mphf_to_kmer,
                               kmer_to_cdbg=mphf_to_cdbg,
                               sizes=sizes)
Example #6
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('database')
    parser.add_argument('reads')
    parser.add_argument('-k', '--ksize', type=int, default=31)
    args = parser.parse_args()

    if args.reads == '-':
        args.reads = sys.stdin

    kh = khmer.Nodetable(args.ksize, 1, 1)

    print('loading database {}'.format(args.database))
    with open(args.database, 'rb') as fp:
        family_ids, kmer_to_family_id = pickle.load(fp)

    print('done!')

    n_same = 0
    n_different = 0

    n = 0
    for record in screed.open(args.reads):
        n += 1
        if n % 1000 == 0:
            print('...', n)
            if n > 5000:
                break

        hashvals = kh.get_kmer_hashes(record.sequence)
        if len(hashvals) <= 1:
            continue

        first = hashvals[0]
        last = hashvals[-1]

        # find the first unambiguously assigned k-mer
        first_ids = kmer_to_family_id.get(first, set())
        idx = 1
        while idx < len(hashvals) / 2 and len(first_ids) != 1:
            first = hashvals[idx]
            first_ids = kmer_to_family_id.get(first, set())
            idx += 1

        # find the last unambiguously assigned k-mer
        last_ids = kmer_to_family_id.get(last, set())
        idx = len(hashvals) - 2
        while idx > len(hashvals) / 2 and len(last_ids) != 1:
            last = hashvals[idx]
            last_ids = kmer_to_family_id.get(last, set())
            idx -= 1

        if len(first_ids) == 1 and len(last_ids) == 1 and \
           first_ids == last_ids:
            n_same += 1
        else:
            print('different {} {}'.format(first_ids, last_ids))
            n_different += 1

    print('same:', n_same)
    print('different:', n_different)
Example #7
def test_nodegraph_vs_table():
    x = khmer.Nodetable(4, 21, 3)
    y = khmer.Nodegraph(4, 21, 3)

    assert hasattr(x, 'add')
    assert hasattr(y, 'add')

    assert not hasattr(x, 'consume_and_tag')
    assert hasattr(y, 'consume_and_tag')
Example #8
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("project", help="Project directory", type=str)
    parser.add_argument("-o", "--output", type=str)
    parser.add_argument('-k', '--ksize', default=31, type=int)
    args = parser.parse_args()

    # figure out catlas and domfile information.
    catlas_file = os.path.join(args.project, 'catlas.csv')
    domfile = os.path.join(args.project, 'first_doms.txt')

    # grab contigs
    print('loading contigs...')
    contigs_filename = os.path.join(args.project, 'contigs.fa.gz')
    contigs = {}
    for record in screed.open(contigs_filename):
        contigs[record.name] = record
    print('done. {} contigs.'.format(len(contigs)))

    # load catlas DAG
    catlas = CAtlas(catlas_file, domfile=domfile)
    print('loaded {} nodes from catlas {}'.format(len(catlas), catlas_file))
    print('loaded {} layer 1 catlas nodes'.format(len(catlas.layer1_to_cdbg)))

    # create k-mer hashing machinery
    kh = khmer.Nodetable(args.ksize, 1, 1)

    # function to yield records as if reading from a file
    def create_records_iter():
        return contigs.values()

    # BENCHMARK:
    # build the two indices (kmer to cdbg node, cdbg node ID to pieces)
    start = time.time()
    print('building MPHF index.')
    x, mphf_to_kmer, mphf_to_cdbg, sizes = build_mphf(kh, create_records_iter)
    print('done! {:.1f}s.'.format(time.time() - start))

    print('building index 2 (cdbg node ID to pieces)')
    cdbg_to_pieces = defaultdict(set)
    for node_id in catlas:
        level = catlas.levels[node_id]
        if level == 1:
            pieces = catlas.layer1_to_cdbg.get(node_id)
            for cdbg_node in pieces:
                cdbg_to_pieces[cdbg_node] = set(pieces)
    end = time.time()

    # done. output.

    outfp = sys.stdout
    if args.output:
        outfp = open(args.output, 'at')
    print("{},{},{:.1f},{},indexPieces".format(len(mphf_to_kmer), len(catlas),
                                               end - start, args.project),
          file=outfp)
Example #9
def test_validate_with_mask():
    kmer = 'AGGGGCGTGACTTAATAAG'
    mask = khmer.Nodetable(19, 1e3, 2)
    mask.add(kmer)

    filelist = kevlar.tests.data_glob('collect.beta.?.txt')
    readset, countgraph = kevlar.filter.load_input(filelist, 19, 5e3)
    kevlar.filter.validate_and_print(readset, countgraph, mask)
    assert readset.valid == (3, 24)
    for record in readset:
        for ikmer in record.ikmers:
            assert ikmer.sequence != kmer
            assert kevlar.revcom(ikmer.sequence) != kmer
Example #10
def main():
    p = argparse.ArgumentParser()
    p.add_argument('A')
    p.add_argument('B')
    p.add_argument('C')
    p.add_argument('output')
    p.add_argument('-k', '--ksize', default=31, type=int)
    args = p.parse_args()

    # for potato, would suggest changing 1e6 to 4e9
    a_table = khmer.Nodetable(args.ksize, 1e6, 4)  # 1e6 needs to be changed
    b_table = khmer.Nodetable(args.ksize, 1e6, 4)

    b_table.consume_seqfile(args.B)

    n_hashes_loaded = 0
    for record in khmer.utils.clean_input_reads(screed.open(args.A)):
        for hash in a_table.get_kmer_hashes(record.cleaned_seq):
            if not b_table.get(hash):
                a_table.add(hash)
                n_hashes_loaded += 1

    print('loaded {} k-mers in A but not in B'.format(n_hashes_loaded))
    # at this point, a_table contains A - B

    out_fp = open(args.output, 'w')
    n_read = 0
    n_written = 0

    for record in khmer.utils.clean_input_reads(screed.open(args.C)):
        n_read += 1
        for hash in a_table.get_kmer_hashes(record.cleaned_seq):
            if a_table.get(hash):
                khmer.utils.write_record(record, out_fp)
                n_written += 1
                break

    print('wrote {} of {} records to {}'.format(n_written, n_read,
                                                args.output))
Example #11
def test_consume_with_mask_complement():
    mask = khmer.Nodetable(13, 1e3, 4)
    mask.consume('TGCTTGAAACAAGTG')

    infile = utils.get_test_data('seq-b.fa')
    ct = khmer.Counttable(13, 1e3, 4)
    nr, nk = ct.consume_seqfile_with_mask(infile,
                                          mask,
                                          threshold=1,
                                          consume_masked=True)

    assert ct.get_kmer_counts('TGCTTGAAACAAGTG') == [1, 1, 1]
    assert ct.get_kmer_counts('GAAACAAGTGGATTT') == [0, 0, 0]
Example #12
def main():
    p = argparse.ArgumentParser()
    p.add_argument('readfilelist')
    args = p.parse_args()

    filelist = open(args.readfilelist).readlines()
    filelist = [x.strip() for x in filelist]

    inputfile = filelist.pop()
    while not os.path.exists(inputfile):
        inputfile = filelist.pop()
    print('starting with', inputfile)

    remove_queue = [None, None]
    for pos, filename in enumerate(filelist):
        if not os.path.exists(filename):
            print('skipping', filename)
            continue

        print('loading kh:', filename)
        kh = khmer.Nodetable(K, 2e8, 4)
        kh.consume_seqfile(filename)

        print('iterating over reads:', inputfile)

        outputfile = BASE + '.{}'.format(pos)
        fp = open(outputfile, 'w')

        n = 0
        m = 0
        for n, record in enumerate(clean_input_reads(screed.open(inputfile))):
            if len(record.sequence) < K:
                continue

            if kh.median_at_least(record.cleaned_seq, 1):
                khmer.utils.write_record(record, fp)
                m += 1
        fp.close()
        print('read {}, wrote {}'.format(n, m))

        if inputfile.startswith(BASE):
            remove_queue.append(inputfile)
        remove_name = remove_queue.pop(0)
        if remove_name:
            print('removing', remove_name)
            os.unlink(remove_name)

        inputfile = outputfile

    print('final file is:', inputfile)
Example #13
def test_validate_with_mask():
    kmer = 'AGGGGCGTGACTTAATAAG'
    mask = khmer.Nodetable(19, 1e3, 2)
    mask.add(kmer)

    filelist = kevlar.tests.data_glob('collect.beta.?.txt')
    readset = ReadSet(19, 5e3)
    for record in kevlar.seqio.afxstream(filelist):
        readset.add(record)
    readset.validate(mask=mask)
    assert readset.valid == (3, 24)
    for record in readset:
        for ikmer in record.ikmers:
            assert ikmer.sequence != kmer
            assert kevlar.revcom(ikmer.sequence) != kmer
Example #14
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('extract_from')
    parser.add_argument('queries', nargs='+')
    parser.add_argument('-o', '--output')
    parser.add_argument('-k', '--ksize', default=31, type=int)
    args = parser.parse_args()

    assert args.output, "must provide an argument to -o"

    kh = khmer.Nodetable(args.ksize, 1, 1)

    query_kmers = set()
    for queryfile in args.queries:
        print('loading query', queryfile, file=sys.stderr)
        for record in screed.open(queryfile):
            query_kmers.update(kh.get_kmer_hashes(record.sequence))

    print('loaded {} k-mers'.format(len(query_kmers)), file=sys.stderr)

    bp = 0
    threshold = REPORTING_BP
    m = 0
    n = 0
    bp_out = 0

    with open(args.output, 'wt') as fp:
        print('searching sequences in {}'.format(args.extract_from))
        for record in screed.open(args.extract_from):
            n += 1
            bp += len(record.sequence)
            if bp > threshold:
                threshold += REPORTING_BP
                print('... read {} bp in {} sequences'.format(bp, n),
                      file=sys.stderr)
                print('==> found {} bp in {} sequences'.format(bp_out, m),
                      file=sys.stderr)

            x = set(kh.get_kmer_hashes(record.sequence))
            if x.intersection(query_kmers):
                fp.write('>{}\n{}\n'.format(record.name, record.sequence))
                bp_out += len(record.sequence)
                m += 1

    print('... read {} bp in {} sequences'.format(bp, n), file=sys.stderr)
    print('==> found {} bp in {} sequences'.format(bp_out, m), file=sys.stderr)

    return 0
Example #15
def load_mask(maskfiles,
              ksize,
              memory,
              maxfpr=0.001,
              savefile=None,
              logstream=sys.stderr):
    """Load reference genome and/or contaminant database from a file."""
    if maskfiles is None:
        return None

    timer = kevlar.Timer()
    timer.start('loadmask')
    print('[kevlar::filter] Loading mask from', maskfiles, file=logstream)

    if len(maskfiles) == 1 and maskfiles[0].endswith(('.nt', '.nodetable')):
        mask = kevlar.sketch.load(maskfiles[0])
        message = '    nodetable loaded'
    else:
        buckets = memory * khmer._buckets_per_byte['nodegraph'] / 4
        mask = khmer.Nodetable(ksize, buckets, 4)
        nr, nk = 0, 0
        for maskfile in maskfiles:
            numreads, numkmers = mask.consume_seqfile(maskfile)
            nr += numreads
            nk += numkmers
        message = '    {:d} sequences and {:d} k-mers consumed'.format(nr, nk)
    fpr = kevlar.sketch.estimate_fpr(mask)
    message += '; estimated false positive rate is {:1.3f}'.format(fpr)
    print(message, file=logstream)
    if fpr > maxfpr:
        raise KevlarUnsuitableFPRError('FPR too high, bailing out!!!')
    if savefile:
        mask.save(savefile)
        message = '    nodetable saved to "{:s}"'.format(savefile)
        print(message, file=logstream)

    elapsed = timer.stop('loadmask')
    print('[kevlar::filter]',
          'Mask loaded in {:.2f} sec'.format(elapsed),
          file=logstream)
    return mask
Example #16
    def __init__(self, query_file, ksize):
        self.filename = query_file
        self.ksize = ksize
        self.kmers = set()
        self.name = None
        print('----')
        print('QUERY FILE:', self.filename)

        # build hashes for all the query k-mers
        print('loading query kmers...', end=' ')
        bf = khmer.Nodetable(ksize, 1, 1)

        for record in screed.open(self.filename):
            if self.name is None:
                self.name = record.name
            self.kmers.update(bf.get_kmer_hashes(record.sequence))

        print('got {}'.format(len(self.kmers)))

        self.cdbg_match_counts = {}
        self.catlas_match_counts = {}
Example #17
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('transcriptomes', nargs='+')
    parser.add_argument('-k', '--ksize', type=int, default=31)
    parser.add_argument('-o', '--output')
    args = parser.parse_args()

    assert args.output

    kh = khmer.Nodetable(args.ksize, 1, 1)

    family_ids = {}
    family_counter = 0

    kmer_to_family_id = defaultdict(set)

    n = 0
    for tr_filename in args.transcriptomes:
        for record in screed.open(tr_filename):
            n += 1
            if n % 1000 == 0:
                print('...', n)

            family_name = record.name.split('|')[1]

            family_id = family_ids.get(family_name)
            if family_id is None:
                family_id = family_counter
                family_counter += 1
                family_ids[family_name] = family_id
            
            hashvals = kh.get_kmer_hashes(record.sequence)

            for hashval in hashvals:
                kmer_to_family_id[hashval].add(family_id)

    with open(args.output, 'wb') as fp:
        pickle.dump((family_ids, kmer_to_family_id), fp)
Example #18
def main(argv):
    p = argparse.ArgumentParser()
    p.add_argument('catlas_prefix')
    p.add_argument('-k', '--ksize', default=31, type=int)
    a = p.parse_args(argv)

    kh = khmer.Nodetable(a.ksize, 1, 1)

    contigs_filename = os.path.join(a.catlas_prefix, 'contigs.fa.gz')
    mphf_filename = os.path.join(a.catlas_prefix, 'contigs.fa.gz.mphf')
    array_filename = os.path.join(a.catlas_prefix, 'contigs.fa.gz.indices')

    # build a list of all k-mers in the cDBG
    all_kmers = list()
    print('reading cDBG nodes from {}'.format(contigs_filename))
    for n, record in enumerate(screed.open(contigs_filename)):
        if n % 50000 == 0 and n:
            print('... contig', n, end='\r')

        kmers = kh.get_kmer_hashes(record.sequence)
        all_kmers.extend(list(kmers))

    n_contigs = n + 1
    print('loaded {} contigs.\n'.format(n_contigs))

    # build MPHF (this is the CPU intensive bit)
    print('building MPHF for {} k-mers in {} nodes.'.format(
        len(all_kmers), n_contigs))
    x = bbhash.PyMPHF(all_kmers, len(all_kmers), 4, 1.0)

    # build tables linking:
    # * mphf hash to k-mer hash (for checking exactness)
    # * mphf hash to cDBG ID
    # * cDBG ID to node size (in k-mers)

    mphf_to_kmer = numpy.zeros(len(all_kmers), numpy.uint64)
    mphf_to_cdbg = numpy.zeros(len(all_kmers), numpy.uint32)
    sizes = numpy.zeros(n_contigs, numpy.uint32)

    print('second pass; reading cDBG nodes from {}'.format(contigs_filename))
    for n, record in enumerate(screed.open(contigs_filename)):
        if n % 50000 == 0 and n:
            print('... contig {} of {}'.format(n, n_contigs), end='\r')

        # node ID is record name, must go from 0 to total-1
        cdbg_id = int(record.name)

        # get 64-bit numbers for each k-mer (doesn't really matter what hash)
        kmers = kh.get_kmer_hashes(record.sequence)

        # for each k-mer, find its MPHF hashval, & link to info.
        for kmer in kmers:
            mphf = x.lookup(kmer)
            mphf_to_kmer[mphf] = kmer
            mphf_to_cdbg[mphf] = cdbg_id

        # record each node size, while we're here.
        sizes[cdbg_id] = len(kmers)

    print('loaded {} contigs in pass2.\n'.format(n_contigs))
    assert n == max(mphf_to_cdbg), (n, max(mphf_to_cdbg))

    print('done! saving to {} and {}'.format(mphf_filename, array_filename))

    x.save(mphf_filename)
    with open(array_filename, 'wb') as fp:
        numpy.savez_compressed(fp,
                               mphf_to_kmer=mphf_to_kmer,
                               kmer_to_cdbg=mphf_to_cdbg,
                               sizes=sizes)
Example #19
def main(args=sys.argv[1:]):
    p = argparse.ArgumentParser()
    p.add_argument('catlas_prefix', help='catlas prefix')
    p.add_argument('query')
    p.add_argument('output')
    p.add_argument('--minsize', type=float, default=100)
    p.add_argument('--maxsize', type=float, default=10000)
    p.add_argument('-k', '--ksize', default=31, type=int,
                   help='k-mer size (default: 31)')
    args = p.parse_args(args)

    print('minsize: {:g}'.format(args.minsize))
    print('maxsize: {:g}'.format(args.maxsize))

    basename = os.path.basename(args.catlas_prefix)
    catlas = os.path.join(args.catlas_prefix, 'catlas.csv')
    domfile = os.path.join(args.catlas_prefix, 'first_doms.txt')

    # load catlas DAG
    top_node_id, dag, dag_up, dag_levels, cdbg_to_catlas = search_utils.load_dag(catlas)
    print('loaded {} nodes from catlas {}'.format(len(dag), catlas))

    # load mapping between dom nodes and cDBG/graph nodes:
    layer1_to_cdbg = search_utils.load_layer1_to_cdbg(cdbg_to_catlas, domfile)
    print('loaded {} layer 1 catlas nodes'.format(len(layer1_to_cdbg)))

    # calculate the cDBG shadow sizes for each catlas node.
    print('decorating catlas with shadow size info.')
    node_shadow_sizes = search_utils.decorate_catlas_with_shadow_sizes(layer1_to_cdbg, dag, dag_levels)

    # ...and load cdbg node sizes
    print('loading contig size info')
    cdbg_kmer_sizes, cdbg_weighted_kmer_sizes = search_utils.load_cdbg_size_info(args.catlas_prefix)

    # decorate catlas with cdbg node sizes underneath them
    print('decorating catlas with contig size info.')
    node_kmer_sizes, node_weighted_kmer_sizes = search_utils.decorate_catlas_with_kmer_sizes(layer1_to_cdbg, dag, dag_levels, cdbg_kmer_sizes, cdbg_weighted_kmer_sizes)

    # load k-mer index, query, etc. etc.
    kmer_idx = search_utils.load_kmer_index(args.catlas_prefix)

    bf = khmer.Nodetable(args.ksize, 1, 1)

    query_kmers = set()
    for record in screed.open(args.query):
        query_kmers.update(bf.get_kmer_hashes(record.sequence))

    print('got {}'.format(len(query_kmers)))

    # construct dict cdbg_id -> # of query k-mers
    cdbg_match_counts = kmer_idx.get_match_counts(query_kmers)

    total_match_kmers = sum(cdbg_match_counts.values())
    f_found = total_match_kmers / len(query_kmers)
    print('=> containment: {:.1f}%'.format(f_found * 100))
    print('done loading & counting query k-mers in cDBG.')

    total_kmers_in_cdbg_matches = 0
    for cdbg_id in set(cdbg_match_counts.keys()):
        total_kmers_in_cdbg_matches += kmer_idx.get_cdbg_size(cdbg_id)

    cdbg_sim = total_match_kmers / total_kmers_in_cdbg_matches
    print('cdbg match node similarity: {:.1f}%'.format(cdbg_sim * 100))
    cdbg_min_overhead = (total_kmers_in_cdbg_matches - total_match_kmers) / total_match_kmers
    print('min cdbg overhead: {}'.format(cdbg_min_overhead))

    # calculate the cDBG matching k-mers sizes for each catlas node.
    catlas_match_counts = kmer_idx.build_catlas_match_counts(cdbg_match_counts, dag, dag_levels, layer1_to_cdbg)

    ### ok, the real work: look at articulation of cDBG graph.

    # find highest nodes with kmer size less than given max_size
    def find_terminal_nodes(node_id, max_size):
        node_list = set()
        for sub_id in dag[node_id]:
            size = node_kmer_sizes[sub_id]

            if size < max_size:
                node_list.add(sub_id)
            else:
                children = find_terminal_nodes(sub_id, max_size)
                node_list.update(children)

        return node_list

    print('finding terminal nodes for {}.'.format(args.maxsize))

    terminal = find_terminal_nodes(top_node_id, args.maxsize)
    print('...got {}'.format(len(terminal)))
    terminal = { n for n in terminal if node_kmer_sizes[n] > args.minsize }
    print('...down to {} between {} and {} in size.'.format(len(terminal),
                                                            args.minsize,
                                                            args.maxsize))

    # now, go through all nodes and print out characteristics
    with open(args.output, 'wt') as fp:
        w = csv.writer(fp)

        w.writerow(['node_id', 'contained', 'n_kmers', 'n_weighted_kmers', 'shadow_size'])
        for n in terminal:
            f_contained = catlas_match_counts.get(n, 0) / node_kmer_sizes[n]
            w.writerow([str(n), str(f_contained), str(node_kmer_sizes[n]),
                        str(node_weighted_kmer_sizes[n]),
                        str(node_shadow_sizes[n])])
Example #20
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('database')
    parser.add_argument('input_filenames',
                        metavar='input_sequence_filename',
                        help='Input FAST[AQ] sequence filename.',
                        nargs='+')
    parser.add_argument('-k', '--ksize', type=int, default=31)
    parser.add_argument('-p',
                        '--paired',
                        action='store_true',
                        help='require that all sequences be properly paired')
    parser.add_argument('--force_single',
                        dest='force_single',
                        action='store_true',
                        help='treat all sequences as single-ended/unpaired')
    parser.add_argument('-u',
                        '--unpaired-reads',
                        metavar="unpaired_reads_filename",
                        help='include a file of unpaired reads to which '
                        '-p/--paired does not apply.')
    parser.add_argument('-f',
                        '--force',
                        dest='force',
                        help='continue past file reading errors',
                        action='store_true')
    args = parser.parse_args()

    force_single = args.force_single

    #if args.reads == '-':
    #    args.reads = sys.stdin

    # check that input files exist
    check_valid_file_exists(args.input_filenames)

    filenames = []
    for pathfilename in args.input_filenames:
        filenames.append(pathfilename)

    # make a list of all filenames and if they're paired or not;
    # if we don't know if they're paired, default to allowing but not
    # forcing pairing.
    files = []
    for element in filenames:
        files.append([element, args.paired])
    if args.unpaired_reads:
        files.append([args.unpaired_reads, False])

    # create a khmer Nodetable; it is used only for its k-mer hashing functions
    kh = khmer.Nodetable(args.ksize, 1, 1)

    # load database
    mphf_filename = args.database + '.mphf'
    array_filename = args.database + '.arr'
    print('loading database {}'.format(args.database))

    with open(array_filename, 'rb') as fp:
        mphf_to_kmer, mphf_to_cdbg, family_ids, cdbg_to_family_id = pickle.load(
            fp)
    mphf = bbhash.load_mphf(mphf_filename)

    print('done!')

    def get_kmer_to_family_ids(hashval):
        mphf_hash = mphf.lookup(hashval)
        if mphf_hash is None:
            return set()

        kmer_hash = mphf_to_kmer[mphf_hash]
        if kmer_hash != hashval:
            return set()

        cdbg_id = mphf_to_cdbg[mphf_hash]
        id_list = cdbg_to_family_id[cdbg_id]
        return id_list

    def readFusion(read):
        global n_unmatched, n_same, n_amb_same, n_clear_fusion, n_ambig_fusion, n_mutli_fusion
        flag = None
        lf_ids = set()
        rt_ids = set()
        families = []
        shared_kmers = []
        gaps = []

        hashvals = kh.get_kmer_hashes(read.sequence)

        # find a matching k-mer at the beginning of the read
        lf = hashvals[0]
        lf_ids = get_kmer_to_family_ids(lf)
        idx = 1
        while idx < len(hashvals) and len(lf_ids) == 0:
            lf = hashvals[idx]
            lf_ids = get_kmer_to_family_ids(lf)
            idx += 1

        if len(lf_ids) == 0:
            #print('no single match')
            n_unmatched += 1
            flag = "unmatched"
        elif idx == len(hashvals):
            #print('same, only last kmer matched')
            families.append(lf_ids)
            if len(lf_ids) == 1:
                n_same += 1
                flag = "unique"
            else:
                n_amb_same += 1
                flag = "ambiguous"
        else:  # len(lf_ids) > 0 & idx < len(hashvals)
            # find a matching k-mer at the end of the read
            rt = hashvals[-1]
            rt_ids = get_kmer_to_family_ids(rt)
            idy = len(hashvals) - 2
            while idy >= idx and len(rt_ids) == 0:
                rt = hashvals[idy]
                rt_ids = get_kmer_to_family_ids(rt)
                idy -= 1

            if len(rt_ids) == 0:
                #print('same, only one non-last kmer matched ')
                families.append(lf_ids)
                if len(lf_ids) == 1:
                    n_same += 1
                    flag = "unique"
                else:
                    n_amb_same += 1
                    flag = "ambiguous"
            else:
                intersect_ids = lf_ids.intersection(rt_ids)
                if len(intersect_ids) > 0:
                    families.append(intersect_ids)
                    if len(intersect_ids) == 1:
                        n_same += 1
                        flag = "unique"
                    else:
                        n_amb_same += 1
                        flag = "ambiguous"
                else:  # fusion to be resolved
                    shared_kmer = 1
                    gap_size = 0
                    gap = False
                    while idx <= idy + 1:
                        temp = hashvals[idx]
                        temp_ids = get_kmer_to_family_ids(temp)
                        if len(temp_ids) > 0:
                            intersect_ids = lf_ids.intersection(temp_ids)
                            if len(intersect_ids) > 0:
                                lf_ids = intersect_ids
                                shared_kmer += 1
                                gap_size = 0
                            else:  # len(intersect_ids) == 0
                                families.append(lf_ids)
                                shared_kmers.append(shared_kmer)
                                lf_ids = temp_ids
                                shared_kmer = 1
                                gaps.append(gap_size)
                                gap_size = 0
                        else:
                            gap_size += 1
                        idx += 1

                    families.append(lf_ids)
                    shared_kmers.append(shared_kmer)

                    assert len(families) > 1
                    if len(families) == 2:
                        if len(families[0]) == 1 and len(families[1]) == 1:
                            n_clear_fusion += 1
                            flag = "clear_fusion"
                        else:
                            n_ambig_fusion += 1
                            flag = "ambig_fusion"
                    else:  # len(families) > 2
                        n_mutli_fusion += 1
                        flag = "multi_fusion"

        #if len(families) == 0:
        #    families = "-"

        #if len(shared_kmers) == 0:
        #    shared_kmers = "-"

        return flag, families, shared_kmers, gaps

    fusion_filename = args.database + '_fusion.fa'
    fusion_fp = open(fusion_filename, 'w')
    fusionInfo_filename = args.database + '_fusion.info'
    fusionInfo_fp = open(fusionInfo_filename, 'w')
    print("fileName",
          "recordIndex",
          "whichInPair",
          "align_class",
          "gene_families",
          "shared_kmers",
          "gaps",
          file=fusionInfo_fp,
          sep='\t')
    fusionCalc_filename = args.database + '_fusion.calc'
    fusionCalc_fp = open(fusionCalc_filename, 'w')
    print("fileName",
          "recordIndex",
          "whichInPair",
          "align_class",
          "familiy_A",
          "familiy_B",
          "no_families",
          "len_families",
          "shared_kmers",
          "gaps",
          "sorted_keys",
          file=fusionCalc_fp,
          sep='\t')

    fusionPairs_filename = args.database + '_fusionPairs.fa'
    fusPair_fp = open(fusionPairs_filename, 'w')
    fusionPairsInfo_filename = args.database + '_fusionPairs.info'
    fusPairInfo_fp = open(fusionPairsInfo_filename, 'w')
    print("fileName",
          "recordIndex",
          "fusion_class",
          "R1_family",
          "R2_family",
          file=fusPairInfo_fp,
          sep='\t')
    fusionPairsCalc_filename = args.database + '_fusionPairs.calc'
    fusPairCalc_fp = open(fusionPairsCalc_filename, 'w')
    print("fileName",
          "recordIndex",
          "fusion_class",
          "familiy_A",
          "familiy_B",
          "len_families",
          "sorted_keys",
          file=fusPairCalc_fp,
          sep='\t')

    corrupt_files = []
    family_names = dict(zip(family_ids.values(), family_ids.keys()))
    n = 0
    n_paired_fusion = 0
    sameRef = ("unique", "ambiguous")
    fusion = ("clear_fusion", "ambig_fusion", "multi_fusion")
    for filename, require_paired in files:
        with catch_io_errors(filename, fusion_fp, fusionInfo_fp, fusionCalc_fp,
                             fusPair_fp, fusPairInfo_fp, fusPairCalc_fp,
                             args.force, corrupt_files):
            screed_iter = clean_input_reads(screed.open(filename))
            reader = broken_paired_reader(screed_iter,
                                          min_length=args.ksize,
                                          force_single=force_single,
                                          require_paired=require_paired)

            for r_index, is_paired, read0, read1 in reader:
                n += 1
                if n % 10000 == 0:
                    print('...', n)
                    #if n > 5000:
                    #    break

                flag0, families0, shared_kmers0, gaps0 = readFusion(read0)

                if not is_paired and flag0 in fusion:
                    #families_names0 = []
                    #for gp in families0:
                    #    gp_names = []
                    #    for family_id in gp:
                    #        family_name = family_names[family_id]
                    #        gp_names.append(family_name)
                    #    families_names0.append(gp_names)

                    print(filename,
                          r_index,
                          "single",
                          flag0,
                          families0,
                          shared_kmers0,
                          gaps0,
                          file=fusionInfo_fp,
                          sep='\t')
                    write_record(read0, fusion_fp)

                    #i = 1
                    #while i < len(families0):
                    #    for g1 in families0[i-1]:
                    #        for g2 in families0[i]:
                    #            print(filename, r_index, "single", flag0, sorted([g1,g2]), len(families0), len(families0[i-1]), len(families0[i]),
                    #                  shared_kmers0, gaps0, file=fusionCalc_fp, sep='\t')
                    #    i += 1

                    i = len(families0) - 1
                    for g1 in families0[0]:
                        g1_name = family_names[g1]
                        for g2 in families0[i]:
                            g2_name = family_names[g2]
                            print(filename,
                                  r_index,
                                  "single",
                                  flag0,
                                  '{}:{}'.format(g1, g1_name),
                                  '{}:{}'.format(g2, g2_name),
                                  len(families0), [len(f) for f in families0],
                                  shared_kmers0,
                                  gaps0,
                                  sorted([g1, g2]),
                                  file=fusionCalc_fp,
                                  sep='\t')

                if is_paired:
                    flag1, families1, shared_kmers1, gaps1 = readFusion(read1)

                    if flag0 in fusion or flag1 in fusion:
                        print(filename,
                              r_index,
                              "Read_1",
                              flag0,
                              families0,
                              shared_kmers0,
                              gaps0,
                              file=fusionInfo_fp,
                              sep='\t')
                        write_record(read0, fusion_fp)
                        print(filename,
                              r_index,
                              "Read_2",
                              flag1,
                              families1,
                              shared_kmers1,
                              gaps1,
                              file=fusionInfo_fp,
                              sep='\t')
                        write_record(read1, fusion_fp)

                        if flag0 in fusion:
                            i = len(families0) - 1
                            for g1 in families0[0]:
                                g1_name = family_names[g1]
                                for g2 in families0[i]:
                                    g2_name = family_names[g2]
                                    print(filename,
                                          r_index,
                                          "Read_1",
                                          flag0,
                                          '{}:{}'.format(g1, g1_name),
                                          '{}:{}'.format(g2, g2_name),
                                          len(families0),
                                          [len(f) for f in families0],
                                          shared_kmers0,
                                          gaps0,
                                          sorted([g1, g2]),
                                          file=fusionCalc_fp,
                                          sep='\t')

                        if flag1 in fusion:
                            i = len(families1) - 1
                            for g1 in families1[0]:
                                g1_name = family_names[g1]
                                for g2 in families1[i]:
                                    g2_name = family_names[g2]
                                    print(filename,
                                          r_index,
                                          "Read_2",
                                          flag1,
                                          '{}:{}'.format(g1, g1_name),
                                          '{}:{}'.format(g2, g2_name),
                                          len(families1),
                                          [len(f) for f in families1],
                                          shared_kmers1,
                                          gaps1,
                                          sorted([g1, g2]),
                                          file=fusionCalc_fp,
                                          sep='\t')

                    elif flag0 in sameRef and flag1 in sameRef:
                        if len(families0[0].intersection(families1[0])) == 0:
                            n_paired_fusion += 1

                            if flag0 == "unique" and flag1 == "unique":
                                fusion_class = "clear_fusion"
                            else:
                                fusion_class = "ambig_fusion"

                            print(filename,
                                  r_index,
                                  fusion_class,
                                  families0,
                                  families1,
                                  file=fusPairInfo_fp,
                                  sep='\t')
                            write_record(read0, fusPair_fp)
                            write_record(read1, fusPair_fp)

                            for g1 in families0[0]:
                                g1_name = family_names[g1]
                                for g2 in families1[0]:
                                    g2_name = family_names[g2]
                                    print(filename,
                                          r_index,
                                          fusion_class,
                                          '{}:{}'.format(g1, g1_name),
                                          '{}:{}'.format(g2, g2_name), [
                                              len(f) for f in (families0[0],
                                                               families1[0])
                                          ],
                                          sorted([g1, g2]),
                                          file=fusPairCalc_fp,
                                          sep='\t')

    print('Number of input fragments:', n)
    print('unmatched:', n_unmatched)
    print('Unique:', n_same)
    print('Ambiguous:', n_amb_same)
    print('Single read clear fusion:', n_clear_fusion)
    print('Single read ambiguous fusion:', n_ambig_fusion)
    print('Single read multi fusion:', n_mutli_fusion)
    print('paired read fusion:', n_paired_fusion)
Example #21
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('database')
    parser.add_argument('reads')
    parser.add_argument('-k', '--ksize', type=int, default=31)
    args = parser.parse_args()

    if args.reads == '-':
        args.reads = sys.stdin

    kh = khmer.Nodetable(args.ksize, 1, 1)

    mphf_filename = args.database + '.mphf'
    array_filename = args.database + '.arr'
    print('loading database {}'.format(args.database))

    with open(array_filename, 'rb') as fp:
        mphf_to_kmer, mphf_to_cdbg, family_ids, cdbg_to_family_id = pickle.load(
            fp)
    mphf = bbhash.load_mphf(mphf_filename)

    print('done!')

    def get_kmer_to_family_ids(hashval):
        mphf_hash = mphf.lookup(hashval)
        if mphf_hash is None:
            return set()

        kmer_hash = mphf_to_kmer[mphf_hash]
        if kmer_hash != hashval:
            return set()

        cdbg_id = mphf_to_cdbg[mphf_hash]
        id_list = cdbg_to_family_id[cdbg_id]
        return id_list

    n_same = 0
    n_different = 0

    n = 0
    for record in screed.open(args.reads):
        n += 1
        if n % 1000 == 0:
            print('...', n)
            if n > 5000:
                break

        hashvals = kh.get_kmer_hashes(record.sequence)
        if len(hashvals) <= 1:
            continue

        first = hashvals[0]
        last = hashvals[-1]

        # find the first unambiguously assigned k-mer
        first_ids = get_kmer_to_family_ids(first)
        idx = 1
        while idx < len(hashvals) / 2 and len(first_ids) != 1:
            first = hashvals[idx]
            first_ids = get_kmer_to_family_ids(first)
            idx += 1

        # find the last unambiguously assigned k-mer
        last_ids = get_kmer_to_family_ids(last)
        idx = len(hashvals) - 2
        while idx > len(hashvals) / 2 and len(last_ids) != 1:
            last = hashvals[idx]
            last_ids = get_kmer_to_family_ids(last)
            idx -= 1

        if len(first_ids) == 1 and len(last_ids) == 1 and \
           first_ids == last_ids:
            n_same += 1
        else:
            print('different {} {}'.format(first_ids, last_ids))
            n_different += 1

    print('same:', n_same)
    print('different:', n_different)
Example #22
def main(argv=sys.argv[1:]):
    p = argparse.ArgumentParser()
    p.add_argument('catlas_prefix', help='catlas prefix')
    p.add_argument('queries', nargs='+')
    p.add_argument('-k',
                   '--ksize',
                   default=31,
                   type=int,
                   help='k-mer size (default: 31)')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'))
    args = p.parse_args(argv)

    assert args.output, "must supply -o"

    contigs = os.path.join(args.catlas_prefix, 'contigs.fa.gz')
    x = search_utils.load_cdbg_size_info(args.catlas_prefix)
    cdbg_kmer_sizes, cdbg_weighted_kmer_sizes = x

    # load k-mer MPHF index
    kmer_idx = search_utils.load_kmer_index(args.catlas_prefix)

    # build hashes for all the query k-mers
    print('loading query kmers...')
    bf = khmer.Nodetable(args.ksize, 1, 1)

    print('queryfile,containment,mean_abundance', file=args.output)

    for query in args.queries:
        print('loading', query)
        query_kmers = set()
        for record in screed.open(query):
            query_kmers.update(bf.get_kmer_hashes(record.sequence))

        # find the list of cDBG nodes that contain at least one query k-mer
        cdbg_match_counts = kmer_idx.get_match_counts(query_kmers)

        # calculate number of nodes found -
        cdbg_shadow = set(cdbg_match_counts.keys())

        # calculate the sum total k-mers across all of the matching nodes
        cdbg_node_sizes = {}
        cdbg_total_weighted = 0.
        for cdbg_id in cdbg_shadow:
            cdbg_node_sizes[cdbg_id] = kmer_idx.get_cdbg_size(cdbg_id)
            cdbg_total_weighted += cdbg_weighted_kmer_sizes[cdbg_id]

        # output some stats
        total_found = sum(cdbg_match_counts.values())
        f_found = total_found / len(query_kmers)
        print('...done loading & counting query k-mers in cDBG.')
        print('containment: {:.1f}%'.format(f_found * 100))

        weight = cdbg_total_weighted / total_found
        print('weight:', weight)

        if f_found < 0.5:
            print('skipping output for {}; low containment.'.format(query))
            continue

        print('{},{},{}'.format(query, f_found, weight), file=args.output)

    return 0
Example #23
#!/usr/bin/env python

# A demonstration of using khmer to query a dataset for a k-mer. Typically
# khmer accrues a small false positive rate in order to save substantially on
# memory requirements.

import khmer

ksize = 21
target_table_size = 5e8
num_tables = 4

bloomfilter = khmer.Nodetable(ksize, target_table_size, num_tables)
bloomfilter.consume('GCTGCACCGATGTACGCAAAGCTATTTAAAACCATAACTATTCTCACTTA')

print('count for "GCTGCACCGATGTACGCAAAG" is',
      bloomfilter.get('GCTGCACCGATGTACGCAAAG'))

bloomfilter.count('GCTGCACCGATGTACGCAAAG')

print('count for "GCTGCACCGATGTACGCAAAG" is',
      bloomfilter.get('GCTGCACCGATGTACGCAAAG'))

print('count for "GATTACAGATTACAGATTACA" is',
      bloomfilter.get('GATTACAGATTACAGATTACA'))
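Following up on the note above about khmer's false positive rate vs. memory trade-off, here is a minimal sketch of checking how full a Nodetable is and estimating its false positive rate. It assumes khmer's calc_expected_collisions() helper, which recent khmer releases provide.

import khmer

ksize = 21
table = khmer.Nodetable(ksize, 5e8, 4)
table.consume('GCTGCACCGATGTACGCAAAGCTATTTAAAACCATAACTATTCTCACTTA')

# n_occupied() reports how many buckets are set; the expected false positive
# rate grows as the table fills up relative to its size.
print('occupied buckets:', table.n_occupied())
print('estimated false positive rate:', khmer.calc_expected_collisions(table))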
Example #24
def main():
    p = argparse.ArgumentParser()
    p.add_argument('catlas_prefix', help='catlas prefix')
    p.add_argument('query')
    p.add_argument('-k',
                   '--ksize',
                   default=31,
                   type=int,
                   help='k-mer size (default: 31)')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'))
    p.add_argument('-v', '--verbose', action='store_true')
    args = p.parse_args()

    contigs = os.path.join(args.catlas_prefix, 'contigs.fa.gz')

    # load k-mer MPHF index
    kmer_idx = search_utils.load_kmer_index(args.catlas_prefix)

    # build hashes for all the query k-mers
    print('loading query kmers...')
    bf = khmer.Nodetable(args.ksize, 1, 1)

    x = set()
    n = 0

    query_kmers = set()
    for record in screed.open(args.query):
        query_kmers.update(bf.get_kmer_hashes(record.sequence))

    # find the list of cDBG nodes that contain at least one query k-mer
    cdbg_match_counts = kmer_idx.get_match_counts(query_kmers)

    # calculate number of nodes found -
    cdbg_shadow = set(cdbg_match_counts.keys())

    # calculate the sum total k-mers across all of the matching nodes
    cdbg_node_sizes = {}
    for cdbg_id in cdbg_shadow:
        cdbg_node_sizes[cdbg_id] = kmer_idx.get_cdbg_size(cdbg_id)

    # output some stats
    total_found = sum(cdbg_match_counts.values())
    f_found = total_found / len(query_kmers)
    print('...done loading & counting query k-mers in cDBG.')
    print('containment: {:.1f}%'.format(f_found * 100))

    total_kmers_in_cdbg_nodes = sum(cdbg_node_sizes.values())
    sim = total_found / total_kmers_in_cdbg_nodes
    print('similarity: {:.1f}%'.format(sim * 100))

    if not args.output:
        sys.exit(0)

    # if output requested, extract unitigs.
    outfp = args.output
    outname = args.output.name

    total_bp = 0
    total_seqs = 0

    print('extracting contigs to {}.'.format(outname))
    for n, record in enumerate(screed.open(contigs)):
        if n % 10000 == 0:
            offset_f = total_seqs / len(cdbg_shadow)
            print('...at n {} ({:.1f}% of shadow)'.format(
                total_seqs, offset_f * 100),
                  end='\r')

        contig_id = int(record.name)
        if contig_id not in cdbg_shadow:
            continue

        outfp.write('>{}\n{}\n'.format(record.name, record.sequence))

        total_bp += len(record.sequence)
        total_seqs += 1

    print('')
    print('fetched {} contigs, {} bp matching node list.'.format(
        total_seqs, total_bp))

    sys.exit(0)
Example #25
def main(argv):
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument('catlas_prefix')
    p.add_argument('query')
    p.add_argument('cdbg_nodefile')

    p.add_argument('-o', '--output', type=argparse.FileType('wt'))
    p.add_argument('-k',
                   '--ksize',
                   default=31,
                   type=int,
                   help='k-mer size (default: 31)')
    p.add_argument('-v', '--verbose', action='store_true')
    args = p.parse_args(argv)

    contigs = os.path.join(args.catlas_prefix, 'contigs.fa.gz')

    assert args.output, 'must specify -o'
    outfp = args.output
    outname = args.output.name

    print('loading bf...', end=' ')
    bf = khmer.Nodetable(args.ksize, 3e8, 2)
    bf.consume_seqfile(args.query)
    print('done.')

    print('loading catlas...', end=' ')
    catlas = CAtlas(args.catlas_prefix)
    layer1_to_cdbg = catlas.layer1_to_cdbg
    print('done.')

    print('loading nodefile {}'.format(args.cdbg_nodefile))
    cdbg_nodes = set()
    with gzip.open(args.cdbg_nodefile, 'r') as fp:
        for line in fp:
            cdbg_nodes.add(int(line.strip()))

    print('loading contigs')
    total_bp = 0
    total_seqs = 0

    n_homogeneous = 0
    n_missing = 0
    bp_missing = 0
    for n, record in enumerate(screed.open(contigs)):
        if n % 10000 == 0:
            offset_f = total_seqs / len(cdbg_nodes)
            print('...at n {} ({:.1f}% of shadow)'.format(
                total_seqs, offset_f * 100),
                  end='\r')

        contig_id = int(record.name)
        if contig_id not in cdbg_nodes:
            continue

        counts = bf.get_kmer_counts(record.sequence)
        if min(counts) == max(counts):
            n_homogeneous += 1

        if max(counts) == 0:
            n_missing += 1
            bp_missing += len(record.sequence)

        outfp.write('{}\n'.format(len(record.sequence)))

        total_bp += len(record.sequence)
        total_seqs += 1

    print('')
    print('fetched {} contigs, {} bp matching node list.'.format(
        total_seqs, total_bp))
    print('n_homogeneous: {}'.format(n_homogeneous))
    print('pure overhead count: {} seqs / {} bp'.format(n_missing, bp_missing))

    return 0
Example #26
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('unitigs')
    parser.add_argument('transcriptomes', nargs='+')
    parser.add_argument('-k', '--ksize', type=int, default=31)
    parser.add_argument('-o', '--output')
    args = parser.parse_args()

    assert args.output

    kh = khmer.Nodetable(args.ksize, 1, 1)

    all_kmers = []
    for n, record in enumerate(screed.open(args.unitigs)):
        if n % 10000 == 0:
            print('... cdbg', n)
            if n > 20000 and 0:
                break

        all_kmers.extend(kh.get_kmer_hashes(record.sequence))

    print('building MPHF for {} k-mers in {} nodes.'.format(len(all_kmers), n))
    x = bbhash.PyMPHF(all_kmers, len(all_kmers), 4, 1.0)

    ###

    mphf_to_kmer = numpy.zeros(len(all_kmers), numpy.uint64)
    mphf_to_cdbg = numpy.zeros(len(all_kmers), numpy.uint32)

    for n, record in enumerate(screed.open(args.unitigs)):
        if n % 10000 == 0:
            print('... cdbg', n)
            if n > 20000 and 0:
                break

        cdbg_id = int(record.name.split(' ')[0])
        kmers = kh.get_kmer_hashes(record.sequence)

        for kmer in kmers:
            mphf = x.lookup(kmer)
            mphf_to_kmer[mphf] = kmer
            mphf_to_cdbg[mphf] = cdbg_id

    ###

    print('walking the transcriptome')

    family_ids = {}
    family_counter = 0

    cdbg_to_family_id = defaultdict(set)

    n = 0
    for tr_filename in args.transcriptomes:
        for record in screed.open(tr_filename):
            n += 1
            if n % 1000 == 0:
                print('...', tr_filename, n)
                if n > 5000 and 0:
                    break

            # get the family name
            family_name = record.name.split('|')[1]

            # convert to family ID, generating a new one if we need one
            family_id = family_ids.get(family_name)
            if family_id is None:
                family_id = family_counter
                family_counter += 1
                family_ids[family_name] = family_id

            # for all k-mers,
            hashvals = kh.get_kmer_hashes(record.sequence)
            for hashval in hashvals:

                # find cDBG ID
                mphf = x.lookup(hashval)
                if mphf is None:
                    continue

                assert mphf is not None
                cdbg_id = mphf_to_cdbg[mphf]

                # link cDBG ID to family ID
                cdbg_to_family_id[cdbg_id].add(family_id)

    mphf_filename = args.output + '.mphf'
    array_filename = args.output + '.arr'
    x.save(mphf_filename)

    with open(array_filename, 'wb') as fp:
        pickle.dump(
            (mphf_to_kmer, mphf_to_cdbg, family_ids, cdbg_to_family_id), fp)
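
Taken together, the saved MPHF and the pickled arrays form a k-mer -> cDBG -> gene-family index: x maps each unitig k-mer hash to a dense slot, mphf_to_kmer and mphf_to_cdbg recover the k-mer and its unitig ID from that slot, and cdbg_to_family_id links the unitig to transcript families. A minimal sketch of querying the in-memory structures built above for one sequence; the helper name is invented for illustration:

def families_for_sequence(seq):
    """Return the set of family IDs whose unitigs share a k-mer with seq."""
    families = set()
    for hashval in kh.get_kmer_hashes(seq):
        slot = x.lookup(hashval)
        # lookup may return None, and an MPHF can also map unknown keys onto a
        # valid slot, so double-check against the stored k-mer before trusting it
        if slot is None or mphf_to_kmer[slot] != hashval:
            continue
        families.update(cdbg_to_family_id[mphf_to_cdbg[slot]])
    return families

# e.g. families_for_sequence(record.sequence) might return {0, 3, 17}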
Example #27
0
def main(argv=sys.argv[1:]):
    p = argparse.ArgumentParser()
    p.add_argument('--query', nargs='+', action='append')
    p.add_argument('--subtract', nargs='+', action='append')
    p.add_argument('-o', '--output-suffix')
    p.add_argument('--threshold', type=float, default=DEFAULT_THRESHOLD)
    p.add_argument('-k', '--ksize', type=int, default=31)
    args = p.parse_args(argv)

    if not args.query:
        print('error, must specify at least one query with --query')
        sys.exit(-1)

    if not args.subtract:
        print('error, must specify at least one subtract with --subtract')
        sys.exit(-1)

    args.query = [item for sublist in args.query for item in sublist]
    args.subtract = [item for sublist in args.subtract for item in sublist]

    # construct output filename as {query}.suffix
    output_suffix = args.output_suffix
    if not output_suffix:
        output_suffix = '.donut.fa'

    # load k-mers to subtract
    all_kmers = list()
    kh = khmer.Nodetable(args.ksize, 1, 1)

    for subtract_fn in args.subtract:
        print('loading:', subtract_fn)
        for record in screed.open(subtract_fn):
            all_kmers.extend(kh.get_kmer_hashes(record.sequence))

    # now build a minimal perfect hash function for all those k-mers
    print('building bbhash table')
    table = BBHashTable(all_kmers, fill=1)
    del all_kmers

    # next, iterate over each input and do subtract
    for queryfile in args.query:
        output = os.path.basename(queryfile) + output_suffix
        print('subtracting from {} -> {}'.format(queryfile, output))
        outfp = open(output, 'wt')
        n = 0
        bp = 0
        n_kept = 0
        bp_kept = 0
        for n, record in enumerate(screed.open(queryfile), 1):  # n counts records
            if n % 100000 == 0:
                print('...', queryfile, n, n_kept)

            bp += len(record.sequence)

            if len(record.sequence) < args.ksize:
                continue

            kmers = kh.get_kmer_hashes(record.sequence)

            present = 0
            for k in kmers:
                if table[k]:
                    present += 1

            f = present / len(kmers)
            if f < args.threshold:  # keep?
                outfp.write('>{}\n{}\n'.format(record.name, record.sequence))
                n_kept += 1
                bp_kept += len(record.sequence)

        print('kept {} ({:.1g} Mbp) of {} ({:.1g} Mbp)'.format(
            n_kept, bp_kept / 1e6, n, bp / 1e6))

    return 0
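
The keep/drop rule in this subtract step is a simple containment threshold: a query record is written out only when fewer than --threshold of its k-mers occur in the subtract set. BBHashTable is a project-specific, memory-efficient wrapper around that set; a plain Python set works as a stand-in to make the rule concrete (this helper is illustrative and not part of the script):

def keep_record(kh, subtract_kmers, seq, threshold=0.1):
    """Keep seq if fewer than `threshold` of its k-mers are in subtract_kmers."""
    kmers = kh.get_kmer_hashes(seq)
    if not kmers:
        return False                 # shorter than k: nothing to judge, drop it
    present = sum(1 for k in kmers if k in subtract_kmers)
    return present / len(kmers) < threshold

# subtract_kmers could simply be set(all_kmers) from the loading loop above;
# the BBHashTable stands in for that set to keep memory usage down.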
Example #28
0
def main(argv):
    """\
    Query a catlas with a sequence (read, contig, or genome), and retrieve
    cDBG node IDs and MinHash signatures for the matching unitigs in the graph.
    """

    p = argparse.ArgumentParser(description=main.__doc__)
    p.add_argument('catlas_prefix', help='catlas prefix')
    p.add_argument('output')
    p.add_argument('--query', help='query sequences', nargs='+')
    p.add_argument('-k', '--ksize', default=31, type=int,
                   help='k-mer size (default: 31)')
    p.add_argument('--scaled', default=1000, type=float,
                   help="scaled value for contigs minhash output")
    p.add_argument('-v', '--verbose', action='store_true')

    args = p.parse_args(argv)
    outfile = args.output

    if not args.query:
        print('must specify at least one query file using --query.')
        sys.exit(-1)

    # make sure all of the query sequences exist.
    for filename in args.query:
        if not os.path.exists(filename):
            error('query seq file {} does not exist.', filename)
            sys.exit(-1)

    # load catlas DAG
    catlas = CAtlas(args.catlas_prefix)
    notify('loaded {} nodes from catlas {}', len(catlas), args.catlas_prefix)
    notify('loaded {} layer 1 catlas nodes', len(catlas.layer1_to_cdbg))

    # find the contigs filename
    contigs = os.path.join(args.catlas_prefix, 'contigs.fa.gz')

    # ...and kmer index.
    ki_start = time.time()
    kmer_idx = MPHF_KmerIndex.from_catlas_directory(args.catlas_prefix)
    notify('loaded {} k-mers in index ({:.1f}s)',
           len(kmer_idx.mphf_to_kmer), time.time() - ki_start)

    # calculate the k-mer sizes for each catlas node.
    catlas.decorate_with_index_sizes(kmer_idx)

    # get a single ksize & scaled
    ksize = int(args.ksize)
    scaled = int(args.scaled)

    records_to_cdbg = {}
    cdbg_to_records = defaultdict(set)
    bf = khmer.Nodetable(ksize, 1, 1)   # tiny table: used only to hash k-mers
    for filename in args.query:
        print(f"Reading from '{filename}'")
        for record in screed.open(filename):
            if len(record.sequence) < ksize:
                continue

            kmers = bf.get_kmer_hashes(record.sequence)
            cdbg_match_counts = kmer_idx.get_match_counts(kmers)

            print(f"got {len(cdbg_match_counts)} cdbg nodes for {record.name[:15]} ({len(kmers)} kmers)")

            dominators = set()
            for cdbg_node in cdbg_match_counts:
                dominators.add(catlas.cdbg_to_layer1[cdbg_node])

            print(f"got {len(dominators)} dominators for {record.name[:15]}")

            shadow = catlas.shadow(dominators)
            print(f"got {len(shadow)} cdbg_nodes under {len(dominators)} dominators")

            records_to_cdbg[(filename, record.name)] = shadow
            for cdbg_node in shadow:
                cdbg_to_records[cdbg_node].add((filename,record.name))
            

    with open(outfile, 'wb') as fp:
        print(f"saving pickled index to '{outfile}'")
        pickle.dump((args.catlas_prefix, records_to_cdbg, cdbg_to_records), fp)

    return 0
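
The pickle written at the end holds everything needed to answer both directions of the query -> cDBG mapping without re-running the search. A minimal sketch of reading it back; 'queries.idx' stands in for whatever filename was passed as the output argument:

import pickle

with open('queries.idx', 'rb') as fp:
    catlas_prefix, records_to_cdbg, cdbg_to_records = pickle.load(fp)

# which cDBG nodes did a particular query record reach?
some_key = next(iter(records_to_cdbg))            # (query filename, record name)
print(some_key, '->', len(records_to_cdbg[some_key]), 'cDBG nodes')

# ...and which query records touch cDBG node 42?
print(cdbg_to_records.get(42, set()))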
Example #29
0
def main():
    p = argparse.ArgumentParser()
    p.add_argument('readfilelist')
    args = p.parse_args()

    filelist = open(args.readfilelist).readlines()
    filelist = [x.strip() for x in filelist]

    inputfile = filelist.pop()
    while not os.path.exists(inputfile):
        inputfile = filelist.pop()
    print('starting with', inputfile)

    collected = 0
    for pos, filename in enumerate(filelist):
        if not os.path.exists(filename):
            print('skipping', filename)
            continue

        print('loading kh:', filename)
        kh = khmer.Nodetable(K, 2e8, 4)
        kh.consume_seqfile(filename)

        print('iterating over reads:', inputfile)

        outputfile = BASE + '.{}'.format(pos)
        fp = open(outputfile, 'w')

        m = 0
        n = 0
        for n, record in enumerate(clean_input_reads(screed.open(inputfile)), 1):
            if len(record.sequence) < K:
                continue

            if kh.median_at_least(record.cleaned_seq, 1):
                khmer.utils.write_record(record, fp)
                m += 1
        fp.close()
        print('read {}, wrote {}'.format(n, m))

        inputfile = outputfile
        collected += 1

        if collected > 5:
            break

    # second round: load results of first round into bloom filter,
    # use that to sweep reads out of all the files.

    kh = khmer.Nodetable(K, 2e7, 4)
    kh.consume_seqfile(inputfile)

    filelist = open(args.readfilelist).readlines()
    filelist = [x.strip() for x in filelist]

    total_read = 0
    total_written = 0
    for n, filename in enumerate(filelist):
        print('reading', n, filename)
        if not os.path.exists(filename):
            continue
        fp = open(os.path.basename(filename) + '.collected', 'w')

        m = 0
        n_reads = 0
        for n_reads, record in enumerate(clean_input_reads(screed.open(filename)), 1):
            if len(record.sequence) < K:
                continue
            if kh.median_at_least(record.cleaned_seq, 1):
                khmer.utils.write_record(record, fp)
                m += 1
        fp.close()
        print('read {}, wrote {}'.format(n_reads, m))
        total_read += n_reads
        total_written += m
        print('total so far:', total_read, total_written)

    print('Results are in *.collected.')
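
The filter in both passes is median_at_least(record.cleaned_seq, 1): on a presence/absence Nodetable the k-mer counts are 0 or 1, so a median of at least 1 means that roughly half or more of the read's k-mers are already in the table. An explicit (and slower) version of the same overlap test, shown only to make the criterion concrete:

def shares_enough_kmers(kh, seq, fraction=0.5):
    """Roughly what median_at_least(seq, 1) asks of a presence/absence table:
    are at least `fraction` of the read's k-mers already present?"""
    hashes = kh.get_kmer_hashes(seq)
    if not hashes:
        return False
    present = sum(1 for h in hashes if kh.get(h))
    return present / len(hashes) >= fraction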
Example #30
0
def main(args):
    # Input and output files
    outstream = kevlar.open(args.out, 'w')
    writer = kevlar.vcf.VCFWriter(
        outstream,
        source='kevlar::call',
        refr=args.refr,
    )
    writer.write_header()

    # Contigs = query sequences
    contigstream = kevlar.parse_partitioned_reads(
        kevlar.parse_augmented_fastx(kevlar.open(args.queryseq, 'r')))
    contigs_by_partition = load_contigs(contigstream)

    gdnastream = kevlar.parse_partitioned_reads(
        kevlar.reference.load_refr_cutouts(kevlar.open(args.targetseq, 'r')))
    mask = None
    if args.gen_mask:
        message = 'generating mask of variant-spanning k-mers'
        kevlar.plog('[kevlar::call]', message)
        ntables = 4
        buckets = args.mask_mem * _buckets_per_byte['nodegraph'] / ntables
        mask = khmer.Nodetable(args.ksize, buckets, ntables)
    progress_indicator = kevlar.ProgressIndicator(
        '[kevlar::call] processed contigs/gDNAs for {counter} partitions',
        interval=10,
        breaks=[100, 1000, 10000],
    )
    for partid, gdnas in gdnastream:
        progress_indicator.update()
        if partid not in contigs_by_partition:
            continue
        contigs = contigs_by_partition[partid]
        caller = call(
            gdnas,
            contigs,
            partid,
            match=args.match,
            mismatch=args.mismatch,
            gapopen=args.open,
            gapextend=args.extend,
            ksize=args.ksize,
            refrfile=args.refr,
            debug=args.debug,
            mindist=5,
            homopolyfilt=not args.no_homopoly_filter,
            maxtargetlen=args.max_target_length,
        )
        for varcall in caller:
            if args.gen_mask:
                window = varcall.attribute('ALTWINDOW')
                if window is not None and len(window) >= args.ksize:
                    mask.consume(window)
            writer.write(varcall)
    if args.gen_mask:
        fpr = khmer.calc_expected_collisions(mask, max_false_pos=1.0)
        if fpr > args.mask_max_fpr:
            message = 'WARNING: mask FPR is {:.4f}'.format(fpr)
            message += '; exceeds user-specified limit'
            message += ' of {:.4f}'.format(args.mask_max_fpr)
            kevlar.plog('[kevlar::call]', message)
        mask.save(args.gen_mask)
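
The mask is sized from a memory budget: mask_mem bytes times the number of buckets a nodegraph packs into each byte, split across ntables tables; once filled, calc_expected_collisions reports the resulting false-positive rate. A small sketch of that sizing arithmetic, assuming a nodegraph-style table stores one bit per bucket (about 8 buckets per byte); the sizes and sequence are illustrative:

import khmer

def make_mask(ksize, mem_bytes, ntables=4, buckets_per_byte=8):
    """Allocate a presence/absence mask from a memory budget.
    buckets_per_byte=8 assumes one bit per bucket (nodegraph-style storage)."""
    buckets = mem_bytes * buckets_per_byte / ntables
    return khmer.Nodetable(ksize, buckets, ntables)

mask = make_mask(31, 100e6)                       # ~100 MB budget
mask.consume('ACGTACGTAGCTAGCTAGCTACGATCGATCGATCG')
fpr = khmer.calc_expected_collisions(mask, max_false_pos=1.0)
print('expected false positive rate: {:.6f}'.format(fpr))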