def write(self, csv_writer, csvoutfp, outdir):
        containment = self.containment()
        similarity = self.similarity()
        q_name = self.query.filename
        bp = self.total_bp
        seqs = self.total_seq
        k = self.query.ksize
        num_q_kmers = len(self.query.kmers)
        (best_con,
         cdbg_min_oh,
         catlas_min_oh) = self.query.con_sim_upper_bounds(self.catlas,
                                                          self.kmer_idx)
        # output to results.csv!
        csv_writer.writerow([q_name, containment, similarity, bp,
                             seqs, k, num_q_kmers,
                             best_con, cdbg_min_oh,
                             catlas_min_oh])
        csvoutfp.flush()

        # write out signature from retrieved contigs.
        sig_filename = os.path.basename(q_name) + '.contigs.sig'
        with open(os.path.join(outdir, sig_filename), 'wt') as fp:
            ss = sourmash_lib.SourmashSignature(self.contigs_minhash,
                                                name='nbhd:'+self.query.name,
                                                filename=sig_filename)
            sourmash_lib.save_signatures([ss], fp)

        # write out cDBG IDs
        cdbg_listname = os.path.basename(q_name) + '.cdbg_ids.txt.gz'
        with gzip.open(os.path.join(outdir, cdbg_listname), 'wt') as fp:
            fp.write("\n".join([str(x) for x in sorted(self.shadow)]))

        # write out frontier nodes by seed
        frontier_listname = os.path.basename(q_name) + '.frontier.txt.gz'
        with gzip.open(os.path.join(outdir, frontier_listname), 'wt') as fp:
            for node, seedlist in sorted(self.frontier.items()):
                fp.write('{},{}\n'.format(node,
                                          " ".join([str(x) for x in
                                                    sorted(seedlist)])))

        # write response curve
        response_curve_filename = os.path.basename(q_name) + '.response.txt'
        response_curve_filename = os.path.join(outdir,
                                               response_curve_filename)
        cdbg_match_counts = self.query.cdbg_match_counts[self.catlas.name]
        search_utils.output_response_curve(response_curve_filename,
                                           cdbg_match_counts,
                                           self.kmer_idx,
                                           self.catlas.layer1_to_cdbg)
Example #3
def test_sourmash_signature_api():
    e = sourmash.MinHash(n=1, ksize=20)
    sig = sourmash.SourmashSignature(e)

    s = sourmash.save_signatures([sig])
    sig_x1 = sourmash.load_one_signature(s)
    sig_x2 = list(sourmash.load_signatures(s))[0]

    assert sig_x1 == sig
    assert sig_x2 == sig
def main(args=sys.argv[1:]):
    p = argparse.ArgumentParser()
    p.add_argument('catlas_prefix', help='catlas prefix')
    p.add_argument('output')
    p.add_argument('--minsize', type=float, default=100)
    p.add_argument('--maxsize', type=float, default=10000)
    p.add_argument('--keep-fraction', type=float, default=0.1)
    p.add_argument('-k',
                   '--ksize',
                   default=31,
                   type=int,
                   help='k-mer size (default: 31)')
    args = p.parse_args(args)

    print('minsize: {:g}'.format(args.minsize))
    print('maxsize: {:g}'.format(args.maxsize))

    basename = os.path.basename(args.catlas_prefix)
    catlas = os.path.join(args.catlas_prefix, 'catlas.csv')
    domfile = os.path.join(args.catlas_prefix, 'first_doms.txt')

    # load catlas DAG
    top_node_id, dag, dag_up, dag_levels, cdbg_to_catlas = search_utils.load_dag(
        catlas)
    print('loaded {} nodes from catlas {}'.format(len(dag), catlas))

    # load mapping between dom nodes and cDBG/graph nodes:
    layer1_to_cdbg = search_utils.load_layer1_to_cdbg(cdbg_to_catlas, domfile)
    print('loaded {} layer 1 catlas nodes'.format(len(layer1_to_cdbg)))

    # calculate the cDBG shadow sizes for each catlas node.
    print('decorating catlas with shadow size info.')
    node_shadow_sizes = search_utils.decorate_catlas_with_shadow_sizes(
        layer1_to_cdbg, dag, dag_levels)

    # ...and load cdbg node sizes
    print('loading contig size info')
    cdbg_kmer_sizes, cdbg_weighted_kmer_sizes = search_utils.load_cdbg_size_info(
        args.catlas_prefix)

    # decorate catlas with cdbg node sizes underneath them
    print('decorating catlas with contig size info.')
    node_kmer_sizes, node_weighted_kmer_sizes = search_utils.decorate_catlas_with_kmer_sizes(
        layer1_to_cdbg, dag, dag_levels, cdbg_kmer_sizes,
        cdbg_weighted_kmer_sizes)

    ### ok, the real work: look at articulation of cDBG graph.

    # find highest nodes with kmer size less than given max_size
    def find_terminal_nodes(node_id, max_size):
        node_list = set()
        for sub_id in dag[node_id]:
            # shadow size
            size = node_kmer_sizes[sub_id]

            if size < max_size:
                node_list.add(sub_id)
            else:
                children = find_terminal_nodes(sub_id, max_size)
                node_list.update(children)

        return node_list

    print('finding terminal nodes for {}.'.format(args.maxsize))

    terminal = find_terminal_nodes(top_node_id, args.maxsize)
    print('...got {}'.format(len(terminal)))
    terminal = {n for n in terminal if node_kmer_sizes[n] > args.minsize}
    print('...down to {} between {} and {} in size.'.format(
        len(terminal), args.minsize, args.maxsize))

    # now, go through and calculate ratios
    x = []
    for node_id in terminal:
        # calculate: how many k-mers per cDBG node?
        kmer_size = node_kmer_sizes[node_id]
        shadow_size = node_shadow_sizes[node_id]

        ratio = math.log(kmer_size, 2) - math.log(shadow_size, 2)

        # track basic info
        x.append((ratio, node_id, shadow_size, kmer_size))

    print('terminal node stats for maxsize: {:g}'.format(args.maxsize))
    print('n tnodes:', len(terminal))
    print('total k-mers:', node_kmer_sizes[top_node_id])

    x.sort(reverse=True)
    for (k, v, a, b) in x[:10]:
        print('ratio: {:.3f}'.format(2**k), '/ shadow size:', a, '/ kmers:', b)
    print('... eliding {} nodes'.format(len(x) - 20))
    for (k, v, a, b) in x[-10:]:
        print('ratio: {:.3f}'.format(2**k), '/ shadow size:', a, '/ kmers:', b)

    # keep the last keep-fraction (default 10%) for examination
    keep_sum_kmer = args.keep_fraction * node_kmer_sizes[top_node_id]
    sofar = 0
    keep_terminal = set()
    for (k, v, a, b) in reversed(x):
        sofar += b
        if sofar > keep_sum_kmer:
            break
        keep_terminal.add(v)

    print(
        'keeping last {} k-mers worth of nodes for examination.'.format(sofar))

    # build cDBG shadow ID list.
    cdbg_shadow = set()
    terminal_shadow = find_shadow(keep_terminal, dag)
    for x in terminal_shadow:
        cdbg_shadow.update(layer1_to_cdbg.get(x))

    #### extract contigs
    print('extracting contigs & building a sourmash signature')
    contigs = os.path.join(args.catlas_prefix, 'contigs.fa.gz')

    # track results as signature
    contigs_mh = sourmash_lib.MinHash(n=0, ksize=args.ksize, scaled=1000)

    total_bp = 0
    total_seqs = 0

    outfp = open(args.output, 'wt')
    for n, record in enumerate(screed.open(contigs)):
        if n and n % 10000 == 0:
            offset_f = total_seqs / len(cdbg_shadow)
            print('...at n {} ({:.1f}% of shadow)'.format(
                total_seqs, offset_f * 100),
                  end='\r')

        # contig names == cDBG IDs
        contig_id = int(record.name)
        if contig_id not in cdbg_shadow:
            continue

        outfp.write('>{}\n{}\n'.format(record.name, record.sequence))
        contigs_mh.add_sequence(record.sequence)

        # track retrieved sequences in a minhash
        total_bp += len(record.sequence)
        total_seqs += 1

    # done - got all contigs!
    print('')
    print('fetched {} contigs, {} bp.'.format(total_seqs, total_bp))

    print('wrote contigs to {}'.format(args.output))
    with open(args.output + '.sig', 'wt') as fp:
        ss = sourmash_lib.SourmashSignature(contigs_mh)
        sourmash_lib.save_signatures([ss], fp)
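
As a quick illustration of the find_terminal_nodes recursion above, here is a minimal, self-contained sketch on a toy DAG (the DAG, sizes, and cutoff are hypothetical, not taken from the catlas data):

# toy DAG: node -> list of children; node 0 is the root
toy_dag = {0: [1, 2], 1: [3, 4], 2: [], 3: [], 4: []}
toy_kmer_sizes = {0: 100, 1: 60, 2: 40, 3: 30, 4: 30}

def find_terminal_nodes_toy(node_id, max_size):
    node_list = set()
    for sub_id in toy_dag[node_id]:
        if toy_kmer_sizes[sub_id] < max_size:
            # highest node already under the cutoff: keep it, don't descend
            node_list.add(sub_id)
        else:
            node_list.update(find_terminal_nodes_toy(sub_id, max_size))
    return node_list

print(find_terminal_nodes_toy(0, 50))   # -> {2, 3, 4}
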
Example #5
def gather_main(args):
    """
    Do a greedy search for the hash components of a query against an LCA db.

    Here we don't actually do a least-common-ancestor search of any kind; we
    do essentially the same kind of search as we do in `sourmash gather`, with
    the main difference that we are implicitly combining different genomes of
    identical lineages.

    This takes advantage of the structure of the LCA db, where we store the
    full lineage information for each known hash, as opposed to storing only
    the least-common-ancestor information for it.
    """
    p = argparse.ArgumentParser(prog="sourmash lca gather")
    p.add_argument('query')
    p.add_argument('db', nargs='+')
    p.add_argument('-d', '--debug', action='store_true')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   help='output CSV containing matches to this file')
    p.add_argument('--output-unassigned', type=argparse.FileType('wt'),
                   help='output unassigned portions of the query as a signature to this file')
    p.add_argument('--ignore-abundance',  action='store_true',
                   help='do NOT use k-mer abundances if present')
    args = p.parse_args(args)

    if args.debug:
        set_debug(args.debug)

    # load all the databases
    dblist, ksize, scaled = lca_utils.load_databases(args.db, None)

    # for each query, gather all the matches across databases
    query_sig = sourmash_args.load_query_signature(args.query, ksize, 'DNA')
    debug('classifying', query_sig.name())

    # make sure we're looking at the same scaled value as database
    query_sig.minhash = query_sig.minhash.downsample_scaled(scaled)

    # do the classification, output results
    found = []
    for result, f_unassigned, est_bp, remaining_mins in gather_signature(query_sig, dblist, args.ignore_abundance):
        # is this our first time through the loop? print headers, if so.
        if not len(found):
            print_results("")
            print_results("overlap     p_query p_match ")
            print_results("---------   ------- --------")

        # output!
        pct_query = '{:.1f}%'.format(result.f_unique_to_query*100)
        pct_match = '{:.1f}%'.format(result.f_match*100)
        str_bp = format_bp(result.intersect_bp)
        name = format_lineage(result.lineage)

        equal_match_str = ""
        if result.n_equal_matches:
            equal_match_str = " (** {} equal matches)".format(result.n_equal_matches)

        print_results('{:9}   {:>6}  {:>6}      {}{}', str_bp, pct_query,
                      pct_match, name, equal_match_str)

        found.append(result)

    if found:
        print_results('')
        if f_unassigned:
            print_results('{:.1f}% ({}) of hashes have no assignment.', f_unassigned*100,
                          format_bp(est_bp))
        else:
            print_results('Query is completely assigned.')
            print_results('')
    # nothing found.
    else:
        est_bp = len(query_sig.minhash.get_mins()) * query_sig.minhash.scaled
        print_results('')
        print_results('No assignment for est {} of sequence.',
                      format_bp(est_bp))
        print_results('')

    if not found:
        sys.exit(0)

    if args.output:
        fieldnames = ['intersect_bp', 'f_match', 'f_unique_to_query', 'f_unique_weighted',
                      'average_abund', 'name', 'n_equal_matches'] + list(lca_utils.taxlist())

        w = csv.DictWriter(args.output, fieldnames=fieldnames)
        w.writeheader()
        for result in found:
            lineage = result.lineage
            d = dict(result._asdict())
            del d['lineage']

            for (rank, value) in lineage:
                d[rank] = value

            w.writerow(d)

    if args.output_unassigned:
        if not found:
            notify('nothing found - entire query signature unassigned.')
        elif not remaining_mins:
            notify('no unassigned hashes! not saving.')
        else:
            outname = args.output_unassigned.name
            notify('saving unassigned hashes to "{}"', outname)

            e = query_sig.minhash.copy_and_clear()
            e.add_many(remaining_mins)

            sourmash_lib.save_signatures([ sourmash_lib.SourmashSignature(e) ],
                                         args.output_unassigned)
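
The greedy strategy described in the docstring above amounts to repeatedly picking the lineage that explains the most remaining query hashes and subtracting those hashes. A minimal sketch, with plain dicts and sets standing in for the LCA database (all names and values here are hypothetical):

def greedy_gather_toy(query_hashes, lineage_hashes):
    remaining = set(query_hashes)
    results = []
    while remaining:
        # pick the lineage covering the most of the remaining query hashes
        name, hashes = max(lineage_hashes.items(),
                           key=lambda kv: len(kv[1] & remaining))
        covered = hashes & remaining
        if not covered:
            break                        # nothing left matches any lineage
        results.append((name, len(covered)))
        remaining -= covered             # greedy step: remove explained hashes
    return results, remaining            # remaining = unassigned hashes

toy_db = {'lineage_A': {1, 2, 3, 4}, 'lineage_B': {4, 5, 6}}
print(greedy_gather_toy({1, 2, 5, 6, 9}, toy_db))
# -> ([('lineage_A', 2), ('lineage_B', 2)], {9})
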
def main(args=sys.argv[1:]):
    p = argparse.ArgumentParser()
    p.add_argument('catlas_prefix', help='catlas prefix')
    p.add_argument('output')
    p.add_argument('--minsize', type=float, default=100)
    p.add_argument('--maxsize', type=float, default=10000)
    p.add_argument('--keep-fraction', type=float, default=0.1)
    p.add_argument('-k', '--ksize', default=31, type=int,
                   help='k-mer size (default: 31)')
    args = p.parse_args(args)

    print('minsize: {:g}'.format(args.minsize))
    print('maxsize: {:g}'.format(args.maxsize))

    catlas_file = os.path.join(args.catlas_prefix, 'catlas.csv')
    domfile = os.path.join(args.catlas_prefix, 'first_doms.txt')
    sizefile = os.path.join(args.catlas_prefix, 'contigs.fa.gz.info.csv')

    # load catlas DAG
    catlas = CAtlas(catlas_file, domfile=domfile, sizefile=sizefile)
    print('loaded {} nodes from catlas {}'.format(len(catlas), catlas_file))
    print('loaded {} layer 1 catlas nodes'.format(len(catlas.layer1_to_cdbg)))

    # calculate the cDBG shadow sizes for each catlas node.
    print('decorating catlas with shadow size info.')
    catlas.decorate_with_shadow_sizes()

    # ok, the real work: look at articulation of cDBG graph.

    # find highest nodes with kmer size less than given max_size
    def find_terminal_nodes(node_id, max_size):
        node_list = set()
        for sub_id in catlas.children[node_id]:
            # shadow size
            size = catlas.kmer_sizes[sub_id]

            if size < max_size:
                node_list.add(sub_id)
            else:
                children = find_terminal_nodes(sub_id, max_size)
                node_list.update(children)

        return node_list

    print('finding terminal nodes for {}.'.format(args.maxsize))

    terminal = find_terminal_nodes(catlas.root, args.maxsize)
    print('...got {}'.format(len(terminal)))
    terminal = {n for n in terminal if catlas.kmer_sizes[n] > args.minsize}
    print('...down to {} between {} and {} in size.'.format(len(terminal),
                                                            args.minsize,
                                                            args.maxsize))

    # now, go through and calculate ratios
    x = []
    for node_id in terminal:
        # calculate: how many k-mers per cDBG node?
        kmer_size = catlas.kmer_sizes[node_id]
        shadow_size = catlas.shadow_sizes[node_id]

        ratio = math.log(kmer_size, 2) - math.log(shadow_size, 2)

        # track basic info
        x.append((ratio, node_id, shadow_size, kmer_size))

    print('terminal node stats for maxsize: {:g}'.format(args.maxsize))
    print('n tnodes:', len(terminal))
    print('total k-mers:', catlas.kmer_sizes[catlas.root])

    x.sort(reverse=True)
    for (k, v, a, b) in x[:10]:
        print('ratio: {:.3f}'.format(2**k), '/ shadow size:', a, '/ kmers:', b)
    print('... eliding {} nodes'.format(len(x) - 20))
    for (k, v, a, b) in x[-10:]:
        print('ratio: {:.3f}'.format(2**k), '/ shadow size:', a, '/ kmers:', b)

    # keep the last keep-fraction (default 10%) for examination
    keep_sum_kmer = args.keep_fraction * catlas.kmer_sizes[catlas.root]
    sofar = 0
    keep_terminal = set()
    for (k, v, a, b) in reversed(x):
        sofar += b
        if sofar > keep_sum_kmer:
            break
        keep_terminal.add(v)

    print('keeping last {} k-mers worth of nodes for '
          'examination.'.format(sofar))

    # build cDBG shadow ID list.
    cdbg_shadow = catlas.shadow(keep_terminal)

    # extract contigs
    print('extracting contigs & building a sourmash signature')
    contigs = os.path.join(args.catlas_prefix, 'contigs.fa.gz')

    # track results as signature
    contigs_mh = sourmash_lib.MinHash(n=0, ksize=args.ksize, scaled=1000)

    total_bp = 0
    total_seqs = 0

    outfp = open(args.output, 'wt')
    for n, record in enumerate(screed.open(contigs)):
        if n and n % 10000 == 0:
            offset_f = total_seqs / len(cdbg_shadow)
            print('...at n {} ({:.1f}% of shadow)'.format(total_seqs,
                  offset_f * 100),
                  end='\r')

        # contig names == cDBG IDs
        contig_id = int(record.name)
        if contig_id not in cdbg_shadow:
            continue

        outfp.write('>{}\n{}\n'.format(record.name, record.sequence))
        contigs_mh.add_sequence(record.sequence)

        # track retrieved sequences in a minhash
        total_bp += len(record.sequence)
        total_seqs += 1

    # done - got all contigs!
    print('')
    print('fetched {} contigs, {} bp.'.format(total_seqs, total_bp))

    print('wrote contigs to {}'.format(args.output))
    with open(args.output + '.sig', 'wt') as fp:
        ss = sourmash_lib.SourmashSignature(contigs_mh)
        sourmash_lib.save_signatures([ss], fp)
def main(args=sys.argv[1:]):
    p = argparse.ArgumentParser()
    p.add_argument('catlas_prefix', help='catlas prefix')
    p.add_argument('query')
    p.add_argument('output')
    p.add_argument('--threshold', default=0.0, type=float)
    p.add_argument('--minsize', default=0, type=int)
    p.add_argument('-k', '--ksize', default=31, type=int,
                   help='k-mer size (default: 31)')
    args = p.parse_args(args)

    print('threshold: {:.3f}'.format(args.threshold))

    basename = os.path.basename(args.catlas_prefix)
    catlas = os.path.join(args.catlas_prefix, 'catlas.csv')
    domfile = os.path.join(args.catlas_prefix, 'first_doms.txt')

    # load catlas DAG
    top_node_id, dag, dag_up, dag_levels, cdbg_to_catlas = search_utils.load_dag(catlas)
    print('loaded {} nodes from catlas {}'.format(len(dag), catlas))

    # load mapping between dom nodes and cDBG/graph nodes:
    layer1_to_cdbg = search_utils.load_layer1_to_cdbg(cdbg_to_catlas, domfile)
    print('loaded {} layer 1 catlas nodes'.format(len(layer1_to_cdbg)))

    # calculate the cDBG shadow sizes for each catlas node.
    print('decorating catlas with shadow size info.')
    node_shadow_sizes = search_utils.decorate_catlas_with_shadow_sizes(layer1_to_cdbg, dag, dag_levels)

    # ...and load cdbg node sizes
    print('loading contig size info')
    cdbg_kmer_sizes, cdbg_weighted_kmer_sizes = search_utils.load_cdbg_size_info(args.catlas_prefix)

    # decorate catlas with cdbg node sizes underneath them
    print('decorating catlas with contig size info.')
    node_kmer_sizes, node_weighted_kmer_sizes = search_utils.decorate_catlas_with_kmer_sizes(layer1_to_cdbg, dag, dag_levels, cdbg_kmer_sizes, cdbg_weighted_kmer_sizes)

    # load k-mer index, query, etc. etc.
    kmer_idx = search_utils.load_kmer_index(args.catlas_prefix)

    bf = khmer.Nodetable(args.ksize, 1, 1)

    query_kmers = set()
    for record in screed.open(args.query):
        query_kmers.update(bf.get_kmer_hashes(record.sequence))

    print('got {} k-mers from {}'.format(len(query_kmers), args.query))

    # construct dict cdbg_id -> # of query k-mers
    cdbg_match_counts = kmer_idx.get_match_counts(query_kmers)

    total_match_kmers = sum(cdbg_match_counts.values())
    f_found = total_match_kmers / len(query_kmers)
    print('=> containment: {:.1f}%'.format(f_found * 100))
    print('done loading & counting query k-mers in cDBG.')

    if total_match_kmers == 0:
        print('no match k-mers!?')
        sys.exit(-1)

    # calculate the cDBG matching k-mers sizes for each catlas node.
    catlas_match_counts = kmer_idx.build_catlas_match_counts(cdbg_match_counts, dag, dag_levels, layer1_to_cdbg)

    ### ok, the real work: find nodes that have low # of k-mers in the query.
    def find_unassembled_nodes(node_id, threshold=0.0):
        node_list = set()
        for sub_id in dag[node_id]:
            n_matched = catlas_match_counts.get(sub_id, 0)
            size = node_kmer_sizes[sub_id]

            f_assembled = n_matched / size

            # if the fraction of assembled (query-matched) k-mers under this
            # node is at or below our threshold, KEEP the node. Otherwise,
            # descend into its children.
            if f_assembled <= threshold:
                node_list.add(sub_id)
            else:
                children = find_unassembled_nodes(sub_id, threshold)
                node_list.update(children)

        return node_list

    print('finding unassembled nodes for threshold {}.'.format(args.threshold))

    terminal = find_unassembled_nodes(top_node_id, args.threshold)
    sum_kmers = sum([ node_kmer_sizes[n] for n in terminal ])
    sum_match_kmers = sum([ catlas_match_counts.get(n, 0) for n in terminal ])
    print('...got {} nodes, representing {} k-mers'.format(len(terminal), sum_kmers))

    # now, go through all nodes and print out characteristics
    print('writing node info to {}'.format(args.output + '.csv'))
    with open(args.output + '.csv', 'wt') as fp:
        w = csv.writer(fp)

        w.writerow(['node_id', 'contained', 'n_kmers', 'n_weighted_kmers',
                    'average_weight', 'shadow_size'])
        for n in terminal:
            f_contained = catlas_match_counts.get(n, 0) / node_kmer_sizes[n]
            w.writerow([n,
                        '{:.3f}'.format(f_contained),
                        node_kmer_sizes[n],
                        '{:.1f}'.format(node_weighted_kmer_sizes[n]),
                        '{:.2f}'.format(node_weighted_kmer_sizes[n] / node_kmer_sizes[n]),
                        node_shadow_sizes[n]])

    if args.minsize:
        print('minsize set: {}. filtering.'.format(args.minsize))
        new_terminal = set()
        for n in terminal:
            if node_kmer_sizes[n] >= args.minsize:
                new_terminal.add(n)

        print('removed {} nodes => {}'.format(len(terminal)-len(new_terminal),
                                              len(new_terminal)))
        terminal = new_terminal

    # build cDBG shadow ID list, tagged by parent catlas node.
    cdbg_id_to_node = {}
    for n in terminal:
        this_shadow = find_shadow([n], dag)
        for x in this_shadow:
            v = layer1_to_cdbg[x]
            for vv in v:
                cdbg_id_to_node[vv] = n

    #### extract contigs
    print('extracting contigs & building a sourmash signature')
    contigs = os.path.join(args.catlas_prefix, 'contigs.fa.gz')

    # track results as signature
    contigs_mh = sourmash_lib.MinHash(n=0, ksize=args.ksize, scaled=1000)

    total_bp = 0
    total_seqs = 0

    print('writing contigs to {}'.format(args.output + '.fa'))
    outfp = open(args.output + '.fa', 'wt')
    for n, record in enumerate(screed.open(contigs)):
        if n and n % 10000 == 0:
            offset_f = total_seqs / len(cdbg_id_to_node)
            print('...at n {} ({:.1f}% of shadow)'.format(total_seqs,
                  offset_f * 100),
                  end='\r')

        # contig names == cDBG IDs
        contig_id = int(record.name)
        catlas_parent = cdbg_id_to_node.get(contig_id)
        if catlas_parent is None:
            continue

        outfp.write('>{} {}\n{}\n'.format(record.name, catlas_parent, record.sequence))
        contigs_mh.add_sequence(record.sequence)

        # track retrieved sequences in a minhash
        total_bp += len(record.sequence)
        total_seqs += 1

    # done - got all contigs!
    print('')
    print('fetched {} contigs, {} bp.'.format(total_seqs, total_bp))

    print('writing sig to {}'.format(args.output + '.sig'))
    with open(args.output + '.sig', 'wt') as fp:
        ss = sourmash_lib.SourmashSignature(contigs_mh)
        sourmash_lib.save_signatures([ss], fp)
Example #8
def main(argv):
    p = argparse.ArgumentParser()
    p.add_argument('catlas_prefix', help='catlas prefix')
    p.add_argument('--overhead',
                   help='%% of overhead',
                   type=float,
                   default=0.0)
    p.add_argument('output')
    p.add_argument('--min_containment',
                   help="minimum containment",
                   type=float,
                   default=1.0)
    p.add_argument('--max_overhead',
                   help="largest overhead allowed",
                   type=float,
                   default=1.0)
    p.add_argument('--query', help='query sequences', nargs='+')
    p.add_argument('--no-empty', action='store_true')
    p.add_argument('-k',
                   '--ksize',
                   default=31,
                   type=int,
                   help='k-mer size (default: 31)')
    p.add_argument('--scaled', default=1000, type=float)
    p.add_argument('-v', '--verbose', action='store_true')

    args = p.parse_args(argv)

    # make sure all of the query sequences exist.
    for filename in args.query:
        if not os.path.exists(filename):
            print('query seq file {} does not exist.'.format(filename))
            sys.exit(-1)

    # create output directory if it doesn't exist.
    try:
        os.mkdir(args.output)
    except OSError:
        pass
    if not os.path.isdir(args.output):
        print('output {} is not a directory'.format(args.output))
        sys.exit(-1)

    # figure out catlas and domfile information.
    basename = os.path.basename(args.catlas_prefix)
    catlas = os.path.join(args.catlas_prefix, 'catlas.csv')
    domfile = os.path.join(args.catlas_prefix, 'first_doms.txt')

    # load catlas DAG
    top_node_id, dag, dag_up, dag_levels, catlas_to_cdbg = load_dag(catlas)
    print('loaded {} nodes from catlas {}'.format(len(dag), catlas))

    # load mapping between dom nodes and cDBG/graph nodes:
    layer1_to_cdbg = load_layer1_to_cdbg(catlas_to_cdbg, domfile)
    print('loaded {} layer 1 catlas nodes'.format(len(layer1_to_cdbg)))

    # find the contigs filename
    contigs = os.path.join(args.catlas_prefix, 'contigs.fa.gz')

    # ...and kmer index.
    ki_start = time.time()
    kmer_idx = load_kmer_index(args.catlas_prefix)
    print('loaded {} k-mers in index ({:.1f}s)'.format(
        len(kmer_idx.mphf_to_kmer),
        time.time() - ki_start))

    # calculate the k-mer sizes for each catlas node.
    node_sizes = kmer_idx.build_catlas_node_sizes(dag, dag_levels,
                                                  layer1_to_cdbg)

    # get a single ksize & scaled
    ksize = int(args.ksize)
    scaled = int(args.scaled)

    # record command line
    with open(os.path.join(args.output, 'command.txt'), 'wt') as fp:
        fp.write(str(sys.argv))
        fp.write("\n")

    # output results.csv in the output directory:
    csvoutfp = open(os.path.join(args.output, 'results.csv'), 'wt')
    csv_writer = csv.writer(csvoutfp)
    csv_writer.writerow([
        'query', 'containment', 'similarity', 'bp', 'contigs', 'ksize',
        'num_query_kmers', 'best_containment', 'cdbg_min_overhead',
        'catlas_min_overhead'
    ])

    # iterate over each query, do the thing.
    for query in args.query:
        # ignore all the problems!
        try:
            print('----')
            print('QUERY FILE:', query)
            start_time = time.time()

            # build hashes for all the query k-mers
            print('loading query kmers...', end=' ')
            bf = khmer.Nodetable(ksize, 1, 1)

            query_kmers = set()
            query_name = None
            for record in screed.open(query):
                if query_name is None:
                    query_name = record.name
                query_kmers.update(bf.get_kmer_hashes(record.sequence))

            print('got {}'.format(len(query_kmers)))

            # construct dict cdbg_id -> # of query k-mers
            cdbg_match_counts = kmer_idx.get_match_counts(query_kmers)
            for k, v in cdbg_match_counts.items():
                assert v <= kmer_idx.get_cdbg_size(k), k

            total_match_kmers = sum(cdbg_match_counts.values())
            f_found = total_match_kmers / len(query_kmers)
            print('=> containment: {:.1f}%'.format(f_found * 100))
            print('done loading & counting query k-mers in cDBG.'
                  ' ({:.1f}s)'.format(time.time() - start_time))

            total_kmers_in_cdbg_matches = 0
            for cdbg_id in set(cdbg_match_counts.keys()):
                total_kmers_in_cdbg_matches += kmer_idx.get_cdbg_size(cdbg_id)

            cdbg_sim = total_match_kmers / total_kmers_in_cdbg_matches
            print('cdbg match node similarity: {:.1f}%'.format(cdbg_sim * 100))
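            # minimum possible overhead: the fraction of k-mers in the matched
            # cDBG nodes that are not query k-mers.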
            cdbg_min_overhead = (total_kmers_in_cdbg_matches -
                                 total_match_kmers) /\
                total_kmers_in_cdbg_matches
            print('min cdbg overhead: {}'.format(cdbg_min_overhead))

            # calculate the cDBG matching k-mers sizes for each catlas node.
            catlas_match_counts =\
                kmer_idx.build_catlas_match_counts(cdbg_match_counts, dag,
                                                   dag_levels, layer1_to_cdbg)

            # check a few things - we've propagated properly:
            assert sum(cdbg_match_counts.values()) == \
                catlas_match_counts[top_node_id]
            # ...and all nodes have no more matches than total k-mers.
            for k, v in catlas_match_counts.items():
                assert v <= node_sizes[k], k

            # calculate the minimum overhead of the search, based on level 1
            # nodes.
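            # (overhead here = k-mers in matched level-1 nodes that are not
            # query matches, as a fraction of all k-mers in those nodes.)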
            catlas_min_overhead = 0
            if catlas_match_counts[top_node_id]:
                all_query_kmers = catlas_match_counts[top_node_id]
                total_kmers_in_query_nodes = 0
                for node_id, level in dag_levels.items():
                    if level == 1 and catlas_match_counts.get(node_id):
                        total_kmers_in_query_nodes += node_sizes[node_id]

                catlas_min_overhead = (total_kmers_in_query_nodes -
                                       all_query_kmers) /\
                    total_kmers_in_query_nodes
                print(
                    'minimum catlas overhead: {}'.format(catlas_min_overhead))

            # gather results of all queries
            fuzzy = args.max_overhead != 1.0
            if fuzzy:
                max_oh = args.max_overhead
                min_con = args.min_containment
                total_frontier = collect_frontier(dag,
                                                  top_node_id,
                                                  node_sizes,
                                                  catlas_match_counts,
                                                  max_overhead=max_oh,
                                                  min_containment=min_con)
            else:
                total_frontier = collect_frontier_exact(dag,
                                                        top_node_id,
                                                        node_sizes,
                                                        catlas_match_counts,
                                                        overhead=args.overhead,
                                                        verbose=args.verbose)

            # calculate level 1 nodes for this frontier in the catlas
            total_shadow = find_shadow(total_frontier, dag)

            # calculate associated cDBG nodes
            cdbg_shadow = set()
            for x in total_shadow:
                cdbg_shadow.update(layer1_to_cdbg.get(x))

            # done with main loop! now extract contigs using cDBG shadow
            # node list.
            print('done searching! {} frontier, {} catlas shadow nodes, {}'
                  ' cdbg nodes.'.format(len(total_frontier), len(total_shadow),
                                        len(cdbg_shadow)))

            # track extracted info
            total_bp = 0
            total_seqs = 0

            # build check MinHash w/seed=42
            query_sig = build_query_mh_for_seed(42, ksize, scaled, query)

            # track minhash of retrieved contigs using original query minhash:
            contigs_minhash = query_sig.minhash.copy_and_clear()

            retrieve_start = time.time()

            # walk through the contigs, retrieving.
            print('extracting contigs...')
            for n, record in enumerate(
                    search_utils.get_contigs_by_cdbg(contigs, cdbg_shadow)):
                if n and n % 10000 == 0:
                    offset_f = total_seqs / len(cdbg_shadow)
                    print('...at n {} ({:.1f}% of shadow)'.format(
                        total_seqs, offset_f * 100),
                          end='\r')

                # track retrieved sequences in a minhash
                contigs_minhash.add_sequence(str(record.sequence), True)

                total_bp += len(record.sequence)
                total_seqs += 1

            # done - got all contigs!
            print('...fetched {} contigs, {} bp matching combined frontiers. '
                  ' ({:.1f}s)'.format(total_seqs, total_bp,
                                      time.time() - retrieve_start))

            # calculate summary values of extracted contigs
            containment = query_sig.minhash.contained_by(contigs_minhash)
            similarity = query_sig.minhash.similarity(contigs_minhash)
            print('query inclusion by retrieved contigs:'
                  ' {:.3f}%'.format(containment * 100))
            print('query similarity to retrieved contigs:'
                  ' {:.3f}%'.format(similarity * 100))

            # recover from above.
            best_containment = f_found

            # output to results.csv!
            csv_writer.writerow([
                query, containment, similarity, total_bp, total_seqs, ksize,
                len(query_kmers), best_containment, cdbg_min_overhead,
                catlas_min_overhead
            ])
            csvoutfp.flush()

            # write out signature from retrieved contigs.
            sig_filename = os.path.basename(query) + '.contigs.sig'
            with open(os.path.join(args.output, sig_filename), 'wt') as fp:
                ss = sourmash_lib.SourmashSignature(contigs_minhash,
                                                    name='nbhd:' + query_name,
                                                    filename=sig_filename)
                sourmash_lib.save_signatures([ss], fp)

            # write out cDBG IDs
            cdbg_listname = os.path.basename(query) + '.cdbg_ids.txt.gz'
            with gzip.open(os.path.join(args.output, cdbg_listname),
                           'wt') as fp:
                fp.write("\n".join([str(x) for x in cdbg_shadow]))

            # write out frontier nodes by seed
            frontier_listname = os.path.basename(query) + '.frontier.txt.gz'
            with gzip.open(os.path.join(args.output, frontier_listname),
                           'wt') as fp:
                for node, seedlist in total_frontier.items():
                    fp.write('{},{}\n'.format(
                        node, " ".join([str(x) for x in seedlist])))

            # write response curve
            response_curve_filename = os.path.basename(query) + '.response.txt'
            response_curve_filename = os.path.join(args.output,
                                                   response_curve_filename)
            search_utils.output_response_curve(response_curve_filename,
                                               cdbg_match_counts, kmer_idx,
                                               layer1_to_cdbg)
            print('total time: {:.1f}s'.format(time.time() - start_time))
        except KeyboardInterrupt:
            raise
        except:
            traceback.print_exc()

    # end main loop!

    sys.exit(0)
Example #9
def main():
    p = argparse.ArgumentParser()
    p.add_argument('lca_filename')
    p.add_argument('sigfiles', nargs='+')
    p.add_argument('-k', '--ksize', default=31, type=int)
    p.add_argument(
        '--output-unassigned',
        type=argparse.FileType('wt'),
        help=
        'output unassigned portions of the query as a signature to this file')
    args = p.parse_args()

    # load lca info
    lca_db = lca_json.LCA_Database(args.lca_filename)
    taxfoo, hashval_to_lca, scaled = lca_db.get_database(args.ksize, SCALED)

    # load signatures
    siglist = []
    print('loading signatures from {} signature files'.format(
        len(args.sigfiles)))
    for sigfile in args.sigfiles:
        sigs = sourmash_lib.load_signatures(sigfile, ksize=args.ksize)
        sigs = list(sigs)
        siglist.extend(sigs)

    print('loaded {} signatures total at k={}'.format(len(siglist),
                                                      args.ksize))

    # downsample
    print('downsampling to scaled value: {}'.format(scaled))
    for sig in siglist:
        if sig.minhash.scaled < scaled:
            sig.minhash = sig.minhash.downsample_scaled(scaled)

    # now, extract hash values!
    hashvals = collections.defaultdict(int)
    for sig in siglist:
        for hashval in sig.minhash.get_mins():
            hashvals[hashval] += 1

    found = 0
    total = 0
    by_taxid = collections.defaultdict(int)

    unassigned_hashvals = set()

    # for every hash, get LCA of labels
    for hashval, count in hashvals.items():
        lca = hashval_to_lca.get(hashval)
        total += count

        if lca is None:
            by_taxid[0] += count
            unassigned_hashvals.add(hashval)
            continue

        by_taxid[lca] += count
        found += count

    print('found LCA classifications for', found, 'of', total, 'hashes')
    not_found = total - found

    # now, propagate counts up the taxonomic tree.
    by_taxid_lca = collections.defaultdict(int)
    for taxid, count in by_taxid.items():
        by_taxid_lca[taxid] += count

        parent = taxfoo.child_to_parent.get(taxid)
        while parent is not None and parent != 1:
            by_taxid_lca[parent] += count
            parent = taxfoo.child_to_parent.get(parent)

    total_count = sum(by_taxid.values())

    # sort by lineage length
    x = []
    for taxid, count in by_taxid_lca.items():
        x.append((len(taxfoo.get_lineage(taxid)), taxid, count))

    x.sort()

    # ...aaaaaand output.
    print('{}\t{}\t{}\t{}\t{}\t{}'.format('percent', 'below', 'at node',
                                          'code', 'taxid', 'name'))
    for _, taxid, count_below in x:
        if taxid == 0:
            continue

        percent = round(100 * count_below / total_count, 2)
        count_at = by_taxid[taxid]

        rank = taxfoo.node_to_info.get(taxid)
        if rank:
            rank = rank[0]
            classify_code = kraken_rank_code.get(rank, '-')
        else:
            classify_code = '-'

        name = taxfoo.taxid_to_names.get(taxid)
        if name:
            name = name[0]
        else:
            name = '-'

        print('{}\t{}\t{}\t{}\t{}\t{}'.format(percent, count_below, count_at,
                                              classify_code, taxid, name))

    if not_found:
        classify_code = 'U'
        percent = round(100 * not_found / total_count, 2)
        count_below = not_found
        count_at = not_found
        taxid = 0
        name = 'not classified'

        print('{}\t{}\t{}\t{}\t{}\t{}'.format(percent, count_below, count_at,
                                              classify_code, taxid, name))

        if args.output_unassigned:
            outname = args.output_unassigned.name
            print('saving unassigned hashes to "{}"'.format(outname))

            e = sourmash_lib.MinHash(ksize=args.ksize, n=0, scaled=scaled)
            e.add_many(unassigned_hashvals)
            sourmash_lib.save_signatures(
                [sourmash_lib.SourmashSignature('', e)],
                args.output_unassigned)
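
A minimal sketch of the count-propagation step above, rolling each taxid's count up through its ancestors and stopping at the root; the toy taxonomy and counts are hypothetical:

import collections

# toy taxonomy: child -> parent; taxid 1 is the root and is excluded, as above
child_to_parent = {5: 3, 3: 2, 2: 1, 7: 2}
by_taxid = {5: 10, 7: 4}

by_taxid_lca = collections.defaultdict(int)
for taxid, count in by_taxid.items():
    by_taxid_lca[taxid] += count
    parent = child_to_parent.get(taxid)
    while parent is not None and parent != 1:
        by_taxid_lca[parent] += count
        parent = child_to_parent.get(parent)

print(dict(by_taxid_lca))   # -> {5: 10, 3: 10, 2: 14, 7: 4}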