Example #1
0
def sbt_combine(args):
    """Merge several saved SBTs into one tree and save the result.

    `args` is the list of command-line tokens to parse: the name to save the
    combined SBT under, followed by one or more SBTs to load.  The first SBT
    listed is loaded and every following one is folded into it with
    `combine()`; the result is saved under `sbt_name`.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('sbt_name', help='name to save SBT into')
    cli.add_argument('sbts', nargs='+', help='SBTs to combine to a new SBT')
    cli.add_argument('-x', '--bf-size', type=float, default=1e5)

    sourmash_args.add_moltype_args(cli)

    args = cli.parse_args(args)
    # The computed moltype is not used below; the call is kept in case it
    # validates the moltype flags -- NOTE(review): confirm and drop if not.
    sourmash_args.calculate_moltype(args)

    sbt_paths = list(args.sbts)
    notify('combining {} SBTs', len(sbt_paths))

    # The first tree is the accumulator that the remaining ones merge into.
    combined = sourmash_lib.load_sbt_index(sbt_paths[0])

    for path in sbt_paths[1:]:
        # TODO: check if parameters are the same for both trees!
        combined.combine(sourmash_lib.load_sbt_index(path))

    notify('saving SBT under "{}".', args.sbt_name)
    combined.save(args.sbt_name)
def main():
    """Write the name of every leaf signature in an SBT to a text file.

    Arguments are parsed from the process command line: the positional
    `sbt` is the index to load and `-o/--output` is the file that receives
    one signature name per line.  Progress and a final count are printed to
    stdout.
    """
    p = argparse.ArgumentParser()
    p.add_argument('sbt')
    # Every leaf name is written to this file, so the option is mandatory;
    # without it the first write would fail on a None file object.
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   required=True)
    args = p.parse_args()

    db = sourmash_lib.load_sbt_index(args.sbt)

    # Counted separately from the loop index so that the summary below is
    # well-defined even when the tree has no leaves at all.
    num_leaves = 0
    for n, leaf in enumerate(db.leaves()):
        if n % 1000 == 0:
            print('... at leaf', n)

        name = leaf.data.name()

        # & output!
        args.output.write('{}\n'.format(name))
        num_leaves = n + 1

    print('got accessions from {} signatures'.format(num_leaves))
def gather_main(args):
    """
    Sample identifiers that share a lineage with a species, then look them
    up in an SBT.

    `args` is the list of command-line tokens to parse.  The steps are:

    1. Load taxonomy assignments from the `spreadsheet` argument.
    2. Find the lineage of the first assignment whose species-rank name
       equals the `species` argument; exit with status -1 if none matches.
    3. For every assignment, walk the ranks of `lca_utils.taxlist()` in
       reverse order and file the identifier under the first rank at which
       its (non-empty) name equals the one in the found lineage.
    4. Randomly pick up to 10 identifiers per rank.
    5. If both `--sbt` and `-o/--output` were given, write one CSV row
       `[rank, rank index, filename, signature name]` for each SBT leaf
       whose shortened name is among the picked identifiers; otherwise exit
       with status -1 without extracting anything.
    """
    p = argparse.ArgumentParser()
    p.add_argument('--debug', action='store_true')
    p.add_argument('spreadsheet')
    p.add_argument('species')
    p.add_argument('--sbt')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'))
    args = p.parse_args(args)

    if args.debug:
        set_debug(args.debug)

    assignments, num_rows = load_taxonomy_assignments(args.spreadsheet)

    # Stop scanning as soon as the species is located; there is no need to
    # visit the remaining assignments once a lineage has been found.
    found_lineage = None
    for lineage in assignments.values():
        if any(vv.rank == 'species' and vv.name == args.species
               for vv in lineage):
            found_lineage = lineage
            break

    if found_lineage is None:
        print('nothing found for {}; quitting'.format(args.species))
        sys.exit(-1)

    print('found:', ", ".join(lca_utils.zip_lineage(found_lineage)))

    lineage_search = dict(found_lineage)

    rank_found = defaultdict(list)
    rank_idents = defaultdict(list)
    taxlist = list(reversed(list(lca_utils.taxlist())))

    for ident, lineage in assignments.items():
        dd = dict(lineage)
        for k in taxlist:
            # Only the first agreeing rank (in reversed taxlist order) is
            # recorded for each identifier.
            if dd.get(k) and dd.get(k) == lineage_search.get(k):
                rank_found[k].append(lineage)
                rank_idents[k].append(ident)
                break

    # Maps each sampled identifier to the rank it was sampled for.
    gimme_idents = {}
    for k in rank_found:
        print('at', k, 'found', len(rank_found.get(k)))

        num_to_extract = min(len(rank_idents[k]), 10)
        gimme = random.sample(rank_idents[k], num_to_extract)
        for g in gimme:
            gimme_idents[g] = k

    if not args.output or not args.sbt:
        print('no output arg or SBT arg given; quitting without extracting')
        sys.exit(-1)

    print('looking for:', len(gimme_idents))

    tree = sourmash_lib.load_sbt_index(args.sbt)

    w = csv.writer(args.output)
    for n, leaf in enumerate(tree.leaves()):
        if n % 1000 == 0:
            print('...', n)
        name = leaf.data.name()
        # hack for NCBI-style names, etc.: keep only the text before the
        # first space, and of that only the part before the first '.'.
        name = name.split(' ')[0].split('.')[0]

        if name in gimme_idents:
            level = gimme_idents[name]
            level_n = taxlist.index(level)
            filename = leaf.data.d['filename']

            w.writerow([level, level_n, filename, leaf.data.name()])
            print('FOUND!', leaf.data.name(), level)
Example #4
0
def categorize(args):
    """Report the best SBT match for each query signature.

    `args` is the list of command-line tokens to parse.  Each query is
    searched against the SBT named by `sbt_name` using `--threshold`; the
    most similar leaf (other than the query itself, judged by md5sum) is
    reported via `notify` and, when `--csv` is given, appended to that file
    as `[query file, best match name, similarity]`.  Queries with no match
    are written with an empty name and a similarity of 0.0.  Names found in
    the first column of the `--load-csv` file are skipped.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('sbt_name', help='name of SBT to load')
    cli.add_argument('queries', nargs='+',
                     help='list of signatures to categorize')
    cli.add_argument('-q', '--quiet', action='store_true',
                     help='suppress non-error output')
    cli.add_argument('-k', '--ksize', type=int, default=None)
    cli.add_argument('--threshold', default=0.08, type=float)
    cli.add_argument('--traverse-directory', action="store_true")

    sourmash_args.add_moltype_args(cli)

    cli.add_argument('--csv', type=argparse.FileType('at'))
    cli.add_argument('--load-csv', default=None)

    args = cli.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    # First-column entries of a previous results file are excluded below.
    seen_names = set()
    if args.load_csv:
        with open(args.load_csv, 'rt') as fp:
            seen_names.update(row[0] for row in csv.reader(fp))

    tree = sourmash_lib.load_sbt_index(args.sbt_name)

    if args.traverse_directory:
        candidates = sourmash_args.traverse_find_sigs(args.queries)
    else:
        candidates = args.queries
    inp_files = set(candidates) - seen_names

    notify('found {} files to query', len(inp_files))

    loader = sourmash_args.LoadSingleSignatures(inp_files, args.ksize, moltype)
    csv_out = csv.writer(args.csv) if args.csv else None

    for queryfile, query, query_moltype, query_ksize in loader:
        notify('loaded query: {}... (k={}, {})',
               query.name()[:30], query_ksize, query_moltype)

        # A new search object is built for every query.
        search_fn = sourmash_lib.sbtmh.SearchMinHashesFindBest().search

        hits = [(query.similarity(leaf.data), leaf.data)
                for leaf in tree.find(search_fn, query, args.threshold)
                if leaf.data.md5sum() != query.md5sum()]  # ignore self.

        if hits:
            # highest similarity first
            hits.sort(key=lambda hit: hit[0], reverse=True)
            top_sim, top_sig = hits[0]
            top_name = top_sig.name()
            notify('for {}, found: {:.2f} {}', query.name(), top_sim,
                   top_name)
        else:
            top_sim = 0.0
            top_name = ""
            notify('for {}, no match found', query.name())

        if csv_out is not None:
            csv_out.writerow([queryfile, top_name, top_sim])

    if loader.skipped_ignore:
        notify('skipped/ignore: {}', loader.skipped_ignore)
    if loader.skipped_nosig:
        notify('skipped/nosig: {}', loader.skipped_nosig)
Example #5
0
def index(args):
    """
    Build a Sequence Bloom Tree index of the given signatures.

    `args` is the list of command-line tokens to parse.  Every signature
    matching `--ksize` and the selected molecule type is loaded from the
    given files (or from the files found beneath them with
    `--traverse-directory`) and added as a leaf to a new tree, or to the
    existing tree named `sbt_name` when `--append` is given.  The tree is
    then saved under `sbt_name`.

    Exits with status -1 when `--sparseness` is outside [0.0, 1.0], when the
    loaded signatures disagree on k-mer size, molecule type or num/scaled
    parameters, or when no signature was loaded at all.
    """
    import sourmash_lib.sbtmh

    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name', help='name to save SBT into')
    parser.add_argument('signatures',
                        nargs='+',
                        help='signatures to load into SBT')
    parser.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        help='suppress non-error output')
    parser.add_argument('-k',
                        '--ksize',
                        type=int,
                        default=None,
                        help='k-mer size for which to build the SBT.')
    parser.add_argument('-d',
                        '--n_children',
                        type=int,
                        default=2,
                        help='Number of children for internal nodes')
    parser.add_argument('--traverse-directory',
                        action='store_true',
                        help='load all signatures underneath this directory.')
    parser.add_argument('--append',
                        action='store_true',
                        default=False,
                        help='add signatures to an existing SBT.')
    parser.add_argument('-x',
                        '--bf-size',
                        type=float,
                        default=1e5,
                        help='Bloom filter size used for internal nodes.')
    parser.add_argument(
        '-s',
        '--sparseness',
        type=float,
        default=.0,
        help='What percentage of internal nodes will not be saved. '
        'Ranges from 0.0 (save all nodes) to 1.0 (no nodes saved)')

    sourmash_args.add_moltype_args(parser)

    args = parser.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    # Validate before any tree is loaded or built, and actually stop: an
    # out-of-range value must not reach tree.save() below.
    if args.sparseness < 0 or args.sparseness > 1.0:
        error('sparseness must be in range [0.0, 1.0].')
        sys.exit(-1)

    if args.append:
        tree = sourmash_lib.load_sbt_index(args.sbt_name)
    else:
        tree = sourmash_lib.create_sbt_index(args.bf_size,
                                             n_children=args.n_children)

    if args.traverse_directory:
        inp_files = list(sourmash_args.traverse_find_sigs(args.signatures))
    else:
        inp_files = list(args.signatures)

    notify('loading {} files into SBT', len(inp_files))

    n = 0
    # Distinct parameter values seen so far across all loaded signatures;
    # used to detect incompatible inputs.
    ksizes = set()
    moltypes = set()
    nums = set()
    scaleds = set()
    for f in inp_files:
        siglist = sig.load_signatures(f,
                                      ksize=args.ksize,
                                      select_moltype=moltype)

        # load all matching signatures in this file
        for ss in siglist:
            ksizes.add(ss.minhash.ksize)
            moltypes.add(sourmash_args.get_moltype(ss))
            nums.add(ss.minhash.num)
            scaleds.add(ss.minhash.scaled)

            leaf = sourmash_lib.sbtmh.SigLeaf(ss.md5sum(), ss)
            tree.add_node(leaf)
            n += 1

        # Nothing loaded yet (this file and all before it had no matching
        # signatures): the sets are empty, so there is nothing to compare.
        # The "no signatures found" check after the loop handles the case
        # where every file comes up empty.
        if n == 0:
            continue

        # check to make sure we aren't loading incompatible signatures
        if len(ksizes) > 1 or len(moltypes) > 1:
            error('multiple k-mer sizes or molecule types present; fail.')
            error('specify --dna/--protein and --ksize as necessary')
            error('ksizes: {}; moltypes: {}', ", ".join(map(str, ksizes)),
                  ", ".join(moltypes))
            sys.exit(-1)

        # All signatures must share one setting: either a single `scaled`
        # value with num == 0, or a single `num` value with scaled == 0.
        if nums == {0} and len(scaleds) == 1:
            pass  # good
        elif scaleds == {0} and len(nums) == 1:
            pass  # also good
        else:
            error('trying to build an SBT with incompatible signatures.')
            error('nums = {}; scaleds = {}', repr(nums), repr(scaleds))
            sys.exit(-1)

    # did we load any!?
    if n == 0:
        error('no signatures found to load into tree!? failing.')
        sys.exit(-1)

    notify('loaded {} sigs; saving SBT under "{}"', n, args.sbt_name)
    tree.save(args.sbt_name, sparseness=args.sparseness)
Example #6
0
def watch(args):
    """Build a signature from raw FASTA/FASTQ coming in on stdin, search.

    `args` is the list of command-line tokens to parse.  Sequences are read
    from `inp_file` (default '/dev/stdin') and added to a single MinHash
    sketch.  Each time another WATERMARK_SIZE records have been read, the
    sketch built so far is searched against the SBT named by `sbt_name`;
    reading stops early at the first such search that returns a match at or
    above `--threshold`.  The best match (if any) is printed, and the
    signature is saved to `-o/--output` when given.

    Exits with status -1 when both DNA and protein hashing are requested.
    """
    import sys

    from sourmash_lib.sbtmh import SearchMinHashesFindBest

    parser = argparse.ArgumentParser()
    parser.add_argument('sbt_name', help='name of SBT to search')
    parser.add_argument('inp_file', nargs='?', default='/dev/stdin')
    parser.add_argument('-q',
                        '--quiet',
                        action='store_true',
                        help='suppress non-error output')
    parser.add_argument('-o',
                        '--output',
                        type=argparse.FileType('wt'),
                        help='save signature generated from data here')
    parser.add_argument('--threshold',
                        default=0.05,
                        type=float,
                        help='minimum threshold for matches')
    parser.add_argument(
        '--input-is-protein',
        action='store_true',
        help='Consume protein sequences - no translation needed')
    sourmash_args.add_construct_moltype_args(parser)
    parser.add_argument(
        '-n',
        '--num-hashes',
        type=int,
        default=DEFAULT_N,
        help='number of hashes to use in each sketch (default: %(default)i)')
    parser.add_argument('--name',
                        type=str,
                        default='stdin',
                        help='name to use for generated signature')
    sourmash_args.add_ksize_arg(parser, DEFAULT_LOAD_K)
    args = parser.parse_args(args)
    set_quiet(args.quiet)

    if args.input_is_protein and args.dna:
        notify('WARNING: input is protein, turning off DNA hashing.')
        args.dna = False
        args.protein = True

    # Only one sketch is built, so the two molecule types are mutually
    # exclusive; stop here instead of silently continuing as DNA.
    if args.dna and args.protein:
        notify('ERROR: cannot use "watch" with both DNA and protein.')
        sys.exit(-1)

    if args.dna:
        moltype = 'DNA'
        is_protein = False
    else:
        moltype = 'protein'
        is_protein = True

    tree = sourmash_lib.load_sbt_index(args.sbt_name)

    # check ksize from the SBT we are loading
    ksize = args.ksize
    if ksize is None:
        # No k-mer size given: take it from the first leaf of the tree.
        leaf = next(iter(tree.leaves()))
        tree_mh = leaf.data.minhash
        ksize = tree_mh.ksize

    E = sourmash_lib.MinHash(ksize=ksize,
                             n=args.num_hashes,
                             is_protein=is_protein)
    streamsig = sig.SourmashSignature(E, filename='stdin', name=args.name)

    notify('Computing signature for k={}, {} from stdin', ksize, moltype)

    def do_search():
        # Search the tree with the signature as built so far; a new search
        # object is created for every call.
        search_fn = SearchMinHashesFindBest().search

        results = []
        for leaf in tree.find(search_fn, streamsig, args.threshold):
            results.append((streamsig.similarity(leaf.data), leaf.data))

        return results

    notify('reading sequences from stdin')
    screed_iter = screed.open(args.inp_file)
    watermark = WATERMARK_SIZE

    # iterate over input records
    # num_read counts records already added to the sketch, so the messages
    # below report the true number read even after the input is exhausted.
    num_read = 0
    for record in screed_iter:
        # at each watermark, print status & check cardinality
        if num_read >= watermark:
            notify('\r... read {} sequences', num_read, end='')
            watermark += WATERMARK_SIZE

            if do_search():
                break

        if args.input_is_protein:
            E.add_protein(record.sequence)
        else:
            E.add_sequence(record.sequence, False)
        num_read += 1

    results = do_search()
    if not results:
        notify('... read {} sequences, no matches found.', num_read)
    else:
        results.sort(key=lambda x: -x[0])  # take best
        similarity, found_sig = results[0]
        print_results('FOUND: {}, at {:.3f}', found_sig.name(), similarity)

    if args.output:
        notify('saving signature to {}', args.output.name)
        sig.save_signatures([streamsig], args.output)