Exemple #1
0
def siglist():
    """Return every signature found in the ``*.sig`` files of the demo test data.

    Files are visited in sorted filename order; within a file, signatures
    keep the order in which the loader yields them.
    """
    pattern = os.path.join(utils.get_test_data("demo"), "*.sig")
    return [
        sig
        for sigfile in sorted(glob.glob(pattern))
        for sig in sourmash.load_signatures(sigfile)
    ]
Exemple #2
0
def subtract(args):
    """
    subtract one or more signatures from another

    Parses the command-line style ``args`` list, loads the signature named by
    ``signature_from``, removes every hash that occurs in any signature from
    the ``subtraction_sigs`` files, and writes the result to ``--output``
    (stdout by default).

    Exits with status 1 when a signature tracks abundance and ``--flatten``
    was not given, and with status -1 when no signature was loaded from the
    subtraction files.
    """
    parser = SourmashArgumentParser(prog='sourmash signature subtract')
    parser.add_argument('signature_from')
    parser.add_argument('subtraction_sigs', nargs='+')
    parser.add_argument('-q', '--quiet', action='store_true',
                        help='suppress non-error output')
    parser.add_argument('-o', '--output', type=argparse.FileType('wt'),
                        default=sys.stdout,
                        help='output signature to this file')
    parser.add_argument('--flatten', action='store_true',
                        help='remove abundance from signatures before subtracting')
    sourmash_args.add_ksize_arg(parser, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(parser)
    args = parser.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    def _refuse_abundance(minhash):
        # abundance-tracking sketches are only accepted together with --flatten
        if minhash.track_abundance and not args.flatten:
            error('Cannot use subtract on signatures with abundance tracking, sorry!')
            sys.exit(1)

    source_path = args.signature_from
    source_sig = sourmash.load_one_signature(source_path, ksize=args.ksize,
                                             select_moltype=moltype)
    _refuse_abundance(source_sig.minhash)

    # hashes that survive; shrinks as each subtraction signature is applied
    remaining = set(source_sig.minhash.get_mins())

    notify('loaded signature from {}...', source_path, end='\r')

    n_subtracted = 0
    for sigfile in args.subtraction_sigs:
        loaded = sourmash.load_signatures(sigfile, ksize=args.ksize,
                                          select_moltype=moltype,
                                          do_raise=True)
        for other in loaded:
            _refuse_abundance(other.minhash)

            remaining.difference_update(set(other.minhash.get_mins()))

            notify('loaded and subtracted signatures from {}...', sigfile, end='\r')
            n_subtracted += 1

    if n_subtracted == 0:
        error("no signatures to subtract!?")
        sys.exit(-1)

    # reuse the source sketch's parameters, then refill with surviving hashes
    result_mh = source_sig.minhash.copy_and_clear()
    result_mh.add_many(remaining)

    sourmash.save_signatures([sourmash.SourmashSignature(result_mh)],
                             fp=args.output)

    notify('loaded and subtracted {} signatures', n_subtracted)
Exemple #3
0
def test_linear_index_save():
    # round trip: signatures saved from a LinearIndex must load back unchanged
    ss2 = sourmash.load_one_signature(utils.get_test_data('2.fa.sig'), ksize=31)
    ss47 = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'))
    ss63 = sourmash.load_one_signature(utils.get_test_data('63.fa.sig'))

    linear = LinearIndex()
    for ss in (ss2, ss47, ss63):
        linear.insert(ss)

    with utils.TempDirectory() as location:
        filename = os.path.join(location, 'foo')
        linear.save(filename)

        from sourmash import load_signatures
        reloaded = set(load_signatures(filename))

    expected = {ss2, ss47, ss63}

    # diagnostics, visible in the captured output when the assertion fails
    print(len(reloaded))
    print(len(expected))

    print(reloaded)
    print(expected)

    assert reloaded == expected, reloaded
Exemple #4
0
def test_linear_index_moltype_select():
    # the test file holds two ksizes (21, 30) and two moltypes (DNA and protein)
    linear = LinearIndex()
    for ss in sourmash.load_signatures(utils.get_test_data('genome-s10+s11.sig')):
        linear.insert(ss)

    # (select() keyword arguments, expected number of matches)
    cases = [
        # most specific DNA / protein selections
        (dict(ksize=30, moltype='DNA'), 1),
        (dict(ksize=30, moltype='protein'), 1),
        # leaving off ksize selects all ksizes
        (dict(moltype='DNA'), 2),
        (dict(moltype='protein'), 2),
        # something impossible
        (dict(ksize=4), 0),
    ]
    for select_kwargs, n_expected in cases:
        assert len(linear.select(**select_kwargs)) == n_expected
Exemple #5
0
def flatten(args):
    """
    flatten a signature, removing abundances.

    Parses the command-line style ``args`` list, loads all signatures that
    match the ksize / molecule type, optionally narrows them by ``--md5`` and
    ``--name`` substring, rebuilds each selected signature's MinHash from its
    hash values only, and writes the results to ``--output`` (stdout by
    default).
    """
    parser = SourmashArgumentParser(prog='sourmash signature flatten')
    parser.add_argument('signatures', nargs='+')
    parser.add_argument('-q', '--quiet', action='store_true',
                        help='suppress non-error output')
    parser.add_argument('-o', '--output', type=argparse.FileType('wt'),
                        default=sys.stdout,
                        help='output signature to this file')
    parser.add_argument('--md5', default=None,
                        help='select signatures whose md5 contains this substring')
    parser.add_argument('--name', default=None,
                        help='select signatures whose name contains this substring')

    sourmash_args.add_ksize_arg(parser, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(parser)
    args = parser.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    def _selected(ss):
        # md5 is tested first; name is only consulted for md5 survivors
        if args.md5 is not None and args.md5 not in ss.md5sum():
            return False
        if args.name is not None and args.name not in ss.name():
            return False
        return True

    outlist = []
    total_loaded = 0
    for sigfile in args.signatures:
        loaded = list(sourmash.load_signatures(sigfile,
                                               ksize=args.ksize,
                                               select_moltype=moltype,
                                               do_raise=True))
        total_loaded += len(loaded)

        chosen = [ss for ss in loaded if _selected(ss)]
        for ss in chosen:
            # empty copy with the same parameters, then re-add bare hashes
            flat_mh = ss.minhash.copy_and_clear()
            _flatten(flat_mh)  # presumably disables abundance tracking - confirm
            flat_mh.add_many(ss.minhash.get_mins())
            ss.minhash = flat_mh

        outlist.extend(chosen)

    sourmash.save_signatures(outlist, fp=args.output)

    notify("loaded {} total that matched ksize & molecule type", total_loaded)
    notify("extracted {} signatures from {} file(s)", len(outlist),
           len(args.signatures))
Exemple #6
0
def intersect(args):
    """
    intersect one or more signatures by taking the intersection of hashes.

    Abundances are removed from the result unless ``args.abundances_from``
    names a signature file; in that case the intersection is additionally
    restricted to that signature's hashes and its abundances are kept.

    Exits with status -1 if no signatures were loaded, or if the
    ``abundances_from`` signature does not track abundance.
    """
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    first_sig = None
    mins = None
    total_loaded = 0

    for sigfile in args.signatures:
        for sigobj in sourmash.load_signatures(sigfile, ksize=args.ksize,
                                               select_moltype=moltype,
                                               do_raise=True):
            if first_sig is None:
                # the first signature seeds the running intersection
                first_sig = sigobj
                mins = set(sigobj.minhash.get_mins())
            else:
                mins.intersection_update(sigobj.minhash.get_mins())
            total_loaded += 1
        notify('loaded and intersected signatures from {}...', sigfile, end='\r')

    if total_loaded == 0:
        # message previously said "merge" (copied from the merge command)
        error("no signatures to intersect!?")
        sys.exit(-1)

    # forcibly turn off track_abundance, unless --abundances-from set.
    if not args.abundances_from:
        intersect_mh = first_sig.minhash.copy_and_clear()
        intersect_mh.track_abundance = False
        intersect_mh.add_many(mins)
    else:
        notify('loading signature from {}, keeping abundances',
               args.abundances_from)
        abund_sig = sourmash.load_one_signature(args.abundances_from,
                                                ksize=args.ksize,
                                                select_moltype=moltype)
        if not abund_sig.minhash.track_abundance:
            error("--track-abundance not set on loaded signature?! exiting.")
            sys.exit(-1)
        intersect_mh = abund_sig.minhash.copy_and_clear()
        abund_mins = abund_sig.minhash.get_mins(with_abundance=True)

        # do one last intersection, then keep only the surviving abundances
        mins.intersection_update(abund_mins)
        abund_mins = {k: abund_mins[k] for k in mins}

        intersect_mh.set_abundances(abund_mins)

    intersect_sigobj = sourmash.SourmashSignature(intersect_mh)

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures([intersect_sigobj], fp=fp)

    notify('loaded and intersected {} signatures', total_loaded)
Exemple #7
0
def downsample(args):
    """
    downsample signatures to a new --scaled or --num value.

    Exactly one of ``args.num`` / ``args.scaled`` must be set; otherwise an
    error is reported and the process exits with status -1. Each loaded
    signature's MinHash is downsampled in kind when possible, or converted
    between num and scaled when it holds enough hashes. All results are
    written to ``args.output``.

    Raises:
        ValueError: if a num MinHash does not hold enough hashes to become a
            scaled MinHash, or a scaled MinHash holds fewer than ``args.num``
            hashes.
    """
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    if not args.num and not args.scaled:
        error('must specify either --num or --scaled value')
        sys.exit(-1)

    if args.num and args.scaled:
        error('cannot specify both --num and --scaled')
        sys.exit(-1)

    output_list = []
    total_loaded = 0
    for sigfile in args.signatures:
        siglist = sourmash.load_signatures(sigfile, ksize=args.ksize,
                                           select_moltype=moltype,
                                           do_raise=True)

        for sigobj in siglist:
            mh = sigobj.minhash

            notify('loading and downsampling signature from {}...', sigfile, end='\r')
            total_loaded += 1
            if args.scaled:
                if mh.scaled:
                    mh_new = mh.downsample_scaled(args.scaled)
                else:                         # try to turn a num into a scaled
                    # first check: can we? The largest stored hash must reach
                    # max_hash for the conversion to be accepted. An empty
                    # sketch is rejected here too, instead of letting max()
                    # fail with a cryptic message.
                    max_hash = get_max_hash_for_scaled(args.scaled)
                    mins = mh.get_mins()
                    if len(mins) == 0 or max(mins) < max_hash:
                        raise ValueError("this num MinHash does not have enough hashes to convert it into a scaled MinHash.")

                    mh_new = copy.copy(mh)
                    _set_num_scaled(mh_new, 0, args.scaled)
            elif args.num:
                if mh.num:
                    mh_new = mh.downsample_n(args.num)
                else:                         # try to turn a scaled into a num
                    # first check: can we?
                    if len(mh) < args.num:
                        # the placeholder was previously left unfilled
                        raise ValueError("this scaled MinHash has only {} hashes".format(len(mh)))

                    mh_new = copy.copy(mh)
                    _set_num_scaled(mh_new, args.num, 0)

            sigobj.minhash = mh_new

            output_list.append(sigobj)

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures(output_list, fp=fp)

    notify("loaded and downsampled {} signatures", total_loaded)
Exemple #8
0
def extract(args):
    """
    extract signatures.

    Parses the command-line style ``args`` list, loads all signatures that
    match the ksize / molecule type, keeps those whose md5 and/or name
    contain the ``--md5`` / ``--name`` substrings (when given), and writes
    them to ``--output`` (stdout by default). Exits with status -1 when
    nothing matches.
    """
    parser = SourmashArgumentParser(prog='sourmash signature extract')
    parser.add_argument('signatures', nargs='+')
    parser.add_argument('-q', '--quiet', action='store_true',
                        help='suppress non-error output')
    parser.add_argument('-o', '--output', type=argparse.FileType('wt'),
                        default=sys.stdout,
                        help='output signature to this file')
    parser.add_argument('--md5', default=None,
                        help='select signatures whose md5 contains this substring')
    parser.add_argument('--name', default=None,
                        help='select signatures whose name contains this substring')

    sourmash_args.add_ksize_arg(parser, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(parser)
    args = parser.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    def _selected(ss):
        # md5 is tested first; name is only consulted for md5 survivors
        if args.md5 is not None and args.md5 not in ss.md5sum():
            return False
        if args.name is not None and args.name not in ss.name():
            return False
        return True

    matches = []
    n_loaded = 0
    for sigfile in args.signatures:
        loaded = list(sourmash.load_signatures(sigfile,
                                               ksize=args.ksize,
                                               select_moltype=moltype,
                                               do_raise=True))
        n_loaded += len(loaded)
        matches.extend(ss for ss in loaded if _selected(ss))

    notify("loaded {} total that matched ksize & molecule type", n_loaded)
    if len(matches) == 0:
        error("no matching signatures!")
        sys.exit(-1)

    sourmash.save_signatures(matches, fp=args.output)

    notify("extracted {} signatures from {} file(s)", len(matches),
           len(args.signatures))
Exemple #9
0
def test_sourmash_signature_api():
    # a signature serialized by save_signatures must survive both loaders
    original = sourmash.SourmashSignature(sourmash.MinHash(n=1, ksize=20))

    serialized = sourmash.save_signatures([original])
    via_load_one = sourmash.load_one_signature(serialized)
    via_load_many = list(sourmash.load_signatures(serialized))[0]

    assert via_load_one == original
    assert via_load_many == original
def test_sig_downsample_1_scaled_downsample_multisig(c):
    # downsample many scaled signatures in one file
    c.run_sourmash('sig', 'downsample', '--scaled', '10000',
                   utils.get_test_data('47+63-multisig.sig'))

    # stdout carries the new signatures; each must have the requested scaled
    downsampled = sourmash.load_signatures(c.last_result.out)
    assert all(sig.minhash.scaled == 10000 for sig in downsampled)
Exemple #11
0
def test_sourmash_signature_api():
    # save one signature, then check load_one_signature and load_signatures
    # both give back an equal object
    minhash = sourmash.MinHash(n=1, ksize=20)
    expected = sourmash.SourmashSignature(minhash)

    text = sourmash.save_signatures([expected])
    reloaded = (
        sourmash.load_one_signature(text),
        list(sourmash.load_signatures(text))[0],
    )

    for sig in reloaded:
        assert sig == expected
Exemple #12
0
def merge(args):
    """
    merge one or more signatures.

    Loads every signature matching the ksize / molecule type from
    ``args.signatures``, merges their MinHashes into an empty copy of the
    first one, and writes the single merged signature to ``args.output``.
    With ``args.flatten`` set, abundance tracking is switched off on the
    accumulator and on each input MinHash before merging; otherwise each
    input is checked against the first with
    ``_check_abundance_compatibility``.

    Exits with status -1 when no signatures were loaded. Any exception raised
    while merging is reported with the offending signature and re-raised.
    """
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    first_sig = None
    mh = None
    total_loaded = 0

    # iterate over all the sigs from all the files.
    for sigfile in args.signatures:
        notify('loading signatures from {}...', sigfile, end='\r')
        this_n = 0
        for sigobj in sourmash.load_signatures(sigfile, ksize=args.ksize,
                                               select_moltype=moltype,
                                               do_raise=True):
            # first signature? initialize a bunch of stuff
            if first_sig is None:
                first_sig = sigobj
                mh = first_sig.minhash.copy_and_clear()

                # forcibly remove abundance?
                if args.flatten:
                    mh.track_abundance = False

            try:
                sigobj_mh = sigobj.minhash
                if not args.flatten:
                    _check_abundance_compatibility(first_sig, sigobj)
                else:
                    sigobj_mh.track_abundance = False

                mh.merge(sigobj_mh)
            except Exception:
                # narrowed from a bare ``except:`` so KeyboardInterrupt and
                # SystemExit are not reported as merge failures
                error("ERROR when merging signature '{}' ({}) from file {}",
                      sigobj.name(), sigobj.md5sum()[:8], sigfile)
                raise

            this_n += 1
            total_loaded += 1
        if this_n:
            notify('loaded and merged {} signatures from {}...', this_n, sigfile, end='\r')

    if not total_loaded:
        error("no signatures to merge!?")
        sys.exit(-1)

    merged_sigobj = sourmash.SourmashSignature(mh)

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures([merged_sigobj], fp=fp)

    notify('loaded and merged {} signatures', total_loaded)
Exemple #13
0
def intersect(args):
    """
    intersect one or more signatures by taking the intersection of hashes.

    This function always removes abundances.

    Parses the command-line style ``args`` list, intersects the hashes of
    every signature matching the ksize / molecule type, and writes a single
    signature, built on an empty copy of the first signature's MinHash, to
    ``--output`` (stdout by default). Exits with status -1 when no
    signatures were loaded.
    """
    p = SourmashArgumentParser(prog='sourmash signature intersect')
    p.add_argument('signatures', nargs='+')
    p.add_argument('-q',
                   '--quiet',
                   action='store_true',
                   help='suppress non-error output')
    p.add_argument('-o',
                   '--output',
                   type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='output signature to this file')
    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    first_sig = None
    mins = None
    total_loaded = 0

    for sigfile in args.signatures:
        for sigobj in sourmash.load_signatures(sigfile,
                                               ksize=args.ksize,
                                               select_moltype=moltype,
                                               do_raise=True):
            if first_sig is None:
                # the first signature seeds the running intersection
                first_sig = sigobj
                mins = set(sigobj.minhash.get_mins())
            else:
                mins.intersection_update(sigobj.minhash.get_mins())
            total_loaded += 1
        notify('loaded and intersected signatures from {}...',
               sigfile,
               end='\r')

    if total_loaded == 0:
        # message previously said "merge" (copied from the merge command)
        error("no signatures to intersect!?")
        sys.exit(-1)

    # forcibly turn off track_abundance
    intersect_mh = first_sig.minhash.copy_and_clear()
    _flatten(intersect_mh)
    intersect_mh.add_many(mins)
    intersect_sigobj = sourmash.SourmashSignature(intersect_mh)

    sourmash.save_signatures([intersect_sigobj], fp=args.output)

    notify('loaded and intersected {} signatures', total_loaded)
Exemple #14
0
def flatten(args):
    """
    flatten a signature, removing abundances.

    Takes a command-line style ``args`` list. Signatures matching the
    ksize / molecule type are loaded from each file, optionally filtered by
    ``--md5`` / ``--name`` substring, given a MinHash rebuilt from their
    hash values alone, and saved to ``--output`` (stdout by default).
    """
    argparser = SourmashArgumentParser(prog='sourmash signature flatten')
    argparser.add_argument('signatures', nargs='+')
    argparser.add_argument(
        '-q', '--quiet', action='store_true',
        help='suppress non-error output')
    argparser.add_argument(
        '-o', '--output', type=argparse.FileType('wt'), default=sys.stdout,
        help='output signature to this file')
    argparser.add_argument(
        '--md5', default=None,
        help='select signatures whose md5 contains this substring')
    argparser.add_argument(
        '--name', default=None,
        help='select signatures whose name contains this substring')

    sourmash_args.add_ksize_arg(argparser, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(argparser)
    args = argparser.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    flattened = []
    n_loaded = 0
    for path in args.signatures:
        candidates = list(sourmash.load_signatures(path, ksize=args.ksize,
                                                   select_moltype=moltype,
                                                   do_raise=True))
        n_loaded += len(candidates)

        # select! (md5 first, then name, each only when requested)
        if args.md5 is not None:
            candidates = [sig for sig in candidates
                          if args.md5 in sig.md5sum()]
        if args.name is not None:
            candidates = [sig for sig in candidates
                          if args.name in sig.name()]

        for sig in candidates:
            hashes_only = sig.minhash.copy_and_clear()
            _flatten(hashes_only)  # presumably disables abundance tracking - confirm
            hashes_only.add_many(sig.minhash.get_mins())
            sig.minhash = hashes_only
            flattened.append(sig)

    sourmash.save_signatures(flattened, fp=args.output)

    notify("loaded {} total that matched ksize & molecule type",
           n_loaded)
    notify("extracted {} signatures from {} file(s)", len(flattened),
           len(args.signatures))
Exemple #15
0
def test_sig_flatten_2_ksize(c):
    # flatten only one signature selected using ksize
    c.run_sourmash('sig', 'flatten',
                   utils.get_test_data('lca/TARA_PSW_MAG_00136.sig'),
                   '-k', '31')

    # stdout should hold exactly the one new signature
    flattened = list(sourmash.load_signatures(c.last_result.out))

    assert len(flattened) == 1
Exemple #16
0
def test_sig_extract_7(c):
    # extract matches based on ksize
    c.run_sourmash('sig', 'extract', utils.get_test_data('2.fa.sig'),
                   '-k', '31')

    # stdout should hold exactly the one new signature
    extracted = list(sourmash.load_signatures(c.last_result.out))

    assert len(extracted) == 1
Exemple #17
0
def test_sig_extract_7_no_ksize(c):
    # extract all three matches when -k not specified
    c.run_sourmash('sig', 'extract', utils.get_test_data('2.fa.sig'))

    # stdout should hold all three signatures
    extracted = list(sourmash.load_signatures(c.last_result.out))

    assert len(extracted) == 3
Exemple #18
0
def filter(args):
    """
    filter hashes by abundance in all of the signatures

    Loads the signatures matching the ksize / molecule type, narrows them by
    ``args.md5`` / ``args.name`` substring when given, and, for each one
    that tracks abundance, keeps only hashes whose abundance lies between
    ``args.min_abundance`` and ``args.max_abundance`` (no upper bound when
    the latter is None). Signatures without abundance tracking are reported
    and passed through to the output unmodified.
    """
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    def _keep(count):
        # inclusive bounds; max_abundance of None means "no upper limit"
        return count >= args.min_abundance and (
            args.max_abundance is None or count <= args.max_abundance)

    results = []
    n_loaded = 0
    for sigfile in args.signatures:
        selected = list(sourmash.load_signatures(sigfile, ksize=args.ksize,
                                                 select_moltype=moltype,
                                                 do_raise=True))
        n_loaded += len(selected)

        # select!
        if args.md5 is not None:
            selected = [sig for sig in selected if args.md5 in sig.md5sum()]
        if args.name is not None:
            selected = [sig for sig in selected if args.name in sig.name()]

        for sig in selected:
            source_mh = sig.minhash
            if not source_mh.track_abundance:
                notify('ignoring signature {} - track_abundance not set.',
                       sig)
                continue

            abundances = source_mh.get_mins(with_abundance=True)
            surviving = {hashval: count
                         for hashval, count in abundances.items()
                         if _keep(count)}

            replacement = source_mh.copy_and_clear()
            replacement.set_abundances(surviving)
            sig.minhash = replacement

        results.extend(selected)

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures(results, fp=fp)

    notify("loaded {} total that matched ksize & molecule type",
           n_loaded)
    notify("extracted {} signatures from {} file(s)", len(results),
           len(args.signatures))
def test_sig_extract_6(c):
    # extract matches to several names from among several signatures
    inputs = [utils.get_test_data('47.fa.sig'),
              utils.get_test_data('63.fa.sig')]
    c.run_sourmash('sig', 'extract', *inputs, '--name', 'Shewanella')

    # stdout should hold both matching signatures
    extracted = list(sourmash.load_signatures(c.last_result.out))

    assert len(extracted) == 2
def test_sig_rename_1_multisig(c):
    # set new name for multiple signatures/files
    multisig = utils.get_test_data('47+63-multisig.sig')
    other_sig = utils.get_test_data('2.fa.sig')
    c.run_sourmash('sig', 'rename', multisig, other_sig, 'fiz bar')

    # stdout should hold the renamed signatures
    n_renamed = 0
    for n_renamed, sig in enumerate(sourmash.load_signatures(c.last_result.out), 1):
        assert sig.name() == 'fiz bar'

    assert n_renamed == 9, n_renamed
Exemple #21
0
def intersect(args):
    """
    intersect one or more signatures by taking the intersection of hashes.

    This function always removes abundances.

    ``args`` is a command-line style argument list. The hashes shared by all
    signatures matching the ksize / molecule type are stored in an empty copy
    of the first signature's MinHash and written to ``--output`` (stdout by
    default). Exits with status -1 when no signatures were loaded.
    """
    p = SourmashArgumentParser(prog='sourmash signature intersect')
    p.add_argument('signatures', nargs='+')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='output signature to this file')
    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    first_sig = None
    mins = None
    total_loaded = 0

    for sigfile in args.signatures:
        for sigobj in sourmash.load_signatures(sigfile, ksize=args.ksize,
                                               select_moltype=moltype,
                                               do_raise=True):
            if first_sig is None:
                # the first signature seeds the running intersection
                first_sig = sigobj
                mins = set(sigobj.minhash.get_mins())
            else:
                mins.intersection_update(sigobj.minhash.get_mins())
            total_loaded += 1
        notify('loaded and intersected signatures from {}...', sigfile, end='\r')

    if total_loaded == 0:
        # message previously said "merge" (copied from the merge command)
        error("no signatures to intersect!?")
        sys.exit(-1)

    # forcibly turn off track_abundance
    intersect_mh = first_sig.minhash.copy_and_clear()
    _flatten(intersect_mh)
    intersect_mh.add_many(mins)
    intersect_sigobj = sourmash.SourmashSignature(intersect_mh)

    sourmash.save_signatures([intersect_sigobj], fp=args.output)

    notify('loaded and intersected {} signatures', total_loaded)
Exemple #22
0
def subtract(args):
    """
    subtract one or more signatures from another

    Loads the signature in ``args.signature_from``, removes every hash found
    in any signature from the ``args.subtraction_sigs`` files, and writes the
    result to ``args.output``.

    Exits with status 1 when a signature tracks abundance and
    ``args.flatten`` is not set, and with status -1 when no signature was
    loaded from the subtraction files.
    """
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    def _refuse_abundance(minhash):
        # abundance-tracking sketches are only accepted together with flatten
        if minhash.track_abundance and not args.flatten:
            error('Cannot use subtract on signatures with abundance tracking, sorry!')
            sys.exit(1)

    source_path = args.signature_from
    source_sig = sourmash.load_one_signature(source_path, ksize=args.ksize,
                                             select_moltype=moltype)
    _refuse_abundance(source_sig.minhash)

    # hashes that survive; shrinks as each subtraction signature is applied
    remaining = set(source_sig.minhash.get_mins())

    notify('loaded signature from {}...', source_path, end='\r')

    n_subtracted = 0
    for sigfile in args.subtraction_sigs:
        loaded = sourmash.load_signatures(sigfile, ksize=args.ksize,
                                          select_moltype=moltype,
                                          do_raise=True)
        for other in loaded:
            _refuse_abundance(other.minhash)

            remaining.difference_update(set(other.minhash.get_mins()))

            notify('loaded and subtracted signatures from {}...', sigfile, end='\r')
            n_subtracted += 1

    if n_subtracted == 0:
        error("no signatures to subtract!?")
        sys.exit(-1)

    # reuse the source sketch's parameters, then refill with surviving hashes
    result_mh = source_sig.minhash.copy_and_clear()
    result_mh.add_many(remaining)
    result_sig = sourmash.SourmashSignature(result_mh)

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures([result_sig], fp=fp)

    notify('loaded and subtracted {} signatures', n_subtracted)
Exemple #23
0
def test_sig_cat_2_out(c):
    # cat several files into an output file given with -o
    inputs = [
        utils.get_test_data('47.fa.sig'),
        utils.get_test_data('track_abund/47.fa.sig'),
        utils.get_test_data('47+63-multisig.sig'),
    ]
    c.run_sourmash('sig', 'cat', *inputs, '-o', 'out.sig')

    # the output file should hold the same signatures, in input order
    loaded = list(sourmash.load_signatures(c.output('out.sig')))
    print(len(loaded))

    expected_repr = """[SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 57e2b22f), SourmashSignature('NC_009661.1 Shewanella baltica OS185 plasmid pS18501, complete sequence', bde81a41), SourmashSignature('NC_011663.1 Shewanella baltica OS223, complete genome', f033bbd8), SourmashSignature('NC_011664.1 Shewanella baltica OS223 plasmid pS22301, complete sequence', 87a9aec4), SourmashSignature('NC_011668.1 Shewanella baltica OS223 plasmid pS22302, complete sequence', 837bf2a7), SourmashSignature('NC_011665.1 Shewanella baltica OS223 plasmid pS22303, complete sequence', 485c3377)]"""
    assert repr(loaded) == expected_repr
Exemple #24
0
def test_sig_cat_1_no_unique(c):
    # cat 47 to 47... twice
    sig47 = utils.get_test_data('47.fa.sig')
    c.run_sourmash('sig', 'cat', sig47, sig47)

    # stdout should be same signature, emitted twice
    out = c.last_result.out

    test_cat_sig = sourmash.load_one_signature(sig47)
    actual_cat_sigs = list(sourmash.load_signatures(out))

    # count explicitly: a leftover enumerate() index would be unbound
    # (NameError instead of a clean assertion failure) if nothing was loaded
    assert len(actual_cat_sigs) == 2
    for sig in actual_cat_sigs:
        assert sig == test_cat_sig

    assert 'encountered 1 MinHashes multiple times' in c.last_result.err
def test_sig_flatten_1(c):
    # flatten the abundance-tracking 47 signature, selected by name, and
    # compare it against the plain (no abundance) 47 signature
    with_abund = utils.get_test_data('track_abund/47.fa.sig')
    without_abund = utils.get_test_data('47.fa.sig')
    c.run_sourmash('sig', 'flatten', with_abund, '--name', 'Shewanella')

    # stdout should hold the single new signature
    flattened = list(sourmash.load_signatures(c.last_result.out))

    assert len(flattened) == 1

    reference = sourmash.load_one_signature(without_abund)
    assert reference.minhash == flattened[0].minhash
Exemple #26
0
def test_linear_index_multik_select():
    # the test file holds three ksizes, 21/31/51
    linear = LinearIndex()
    for ss in sourmash.load_signatures(utils.get_test_data('2.fa.sig')):
        linear.insert(ss)

    # most specific selection: one ksize, one moltype
    most_specific = linear.select(ksize=31, moltype='DNA')
    assert len(most_specific) == 1

    # all three are DNA
    all_dna = linear.select(moltype='DNA')
    assert len(all_dna) == 3
def test_sig_filter_1(c):
    # test basic filtering: with default options both signatures should come
    # back with MinHashes equal to the inputs
    inputs = [utils.get_test_data('track_abund/47.fa.sig'),
              utils.get_test_data('track_abund/63.fa.sig')]
    c.run_sourmash('sig', 'filter', *inputs)

    # stdout should hold the new signatures
    filtered_sigs = list(sourmash.load_signatures(c.last_result.out))

    assert len(filtered_sigs) == 2

    expected_mhs = [sourmash.load_one_signature(path).minhash
                    for path in inputs]

    assert filtered_sigs[0].minhash == expected_mhs[0]
    assert filtered_sigs[1].minhash == expected_mhs[1]
Exemple #28
0
def test_sig_cat_1_unique(c):
    # cat 47 to 47... twice... and get unique
    sig47 = utils.get_test_data('47.fa.sig')
    c.run_sourmash('sig', 'cat', sig47, sig47, '--unique')

    # stdout should be same signature, emitted once
    out = c.last_result.out
    err = c.last_result.err

    test_cat_sig = sourmash.load_one_signature(sig47)
    actual_cat_sigs = list(sourmash.load_signatures(out))

    # count explicitly: a leftover enumerate() index would be unbound
    # (NameError instead of a clean assertion failure) if nothing was loaded
    assert len(actual_cat_sigs) == 1
    for sig in actual_cat_sigs:
        assert sig == test_cat_sig

    assert 'encountered 1 MinHashes multiple times' in err
    assert '...and removed the duplicates, because --unique was specified.' in err
Exemple #29
0
def rename(args):
    """
    rename one or more signatures.

    Parses the command-line style ``args`` list, loads every signature
    matching the ksize / molecule type from ``sigfiles``, sets each one's
    name to ``name``, and writes them all to ``--output`` (stdout when not
    given).
    """
    p = SourmashArgumentParser(prog='sourmash signature rename')
    p.add_argument('sigfiles', nargs='+')
    p.add_argument('name')
    p.add_argument('-q',
                   '--quiet',
                   action='store_true',
                   help='suppress non-error output')
    p.add_argument('-d',
                   '--debug',
                   action='store_true',
                   help='output debugging output')
    p.add_argument('-o', '--output', help='output to this file')
    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)
    # the second argument was args.quiet, which left --debug without effect
    set_quiet(args.quiet, args.debug)
    moltype = sourmash_args.calculate_moltype(args)

    outlist = []
    for filename in args.sigfiles:
        debug('loading {}', filename)
        siglist = sourmash.load_signatures(filename,
                                           ksize=args.ksize,
                                           select_moltype=moltype)

        for sigobj in siglist:
            sigobj.d['name'] = args.name
            outlist.append(sigobj)

    if args.output:
        # the context manager closes the file even if saving raises
        with open(args.output, 'wt') as fp:
            sourmash.save_signatures(outlist, fp=fp)
    else:
        sourmash.save_signatures(outlist, fp=sys.stdout)

    notify("set name to '{}' on {} signatures", args.name, len(outlist))
Exemple #30
0
def rename(args):
    """
    rename one or more signatures.

    `args` is an already-parsed argument namespace. This function reads
    `quiet`, `sigfiles`, `name`, `ksize` and `output` from it, plus `debug`
    when that attribute is present.

    Every signature loaded from `args.sigfiles` has its `_name` attribute set
    to `args.name`; all of them are then written through
    FileOutput(args.output, 'wt').
    """
    # The quiet flag used to be passed twice. Use the debug flag instead; it
    # is read with a False default because the parser is built elsewhere.
    # NOTE(review): assumes set_quiet's second parameter is the debug switch.
    set_quiet(args.quiet, getattr(args, 'debug', False))
    moltype = sourmash_args.calculate_moltype(args)

    outlist = []
    for filename in args.sigfiles:
        debug('loading {}', filename)
        siglist = sourmash.load_signatures(filename, ksize=args.ksize,
                                           select_moltype=moltype)

        for sigobj in siglist:
            sigobj._name = args.name
            outlist.append(sigobj)

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures(outlist, fp=fp)

    notify("set name to '{}' on {} signatures", args.name, len(outlist))
Exemple #31
0
def flatten(args):
    """
    flatten a signature, removing abundances.

    Reads `quiet`, `signatures`, `ksize`, `md5`, `name` and `output` from the
    parsed `args` namespace. Signatures are loaded from every file in
    `args.signatures`, optionally narrowed by `args.md5` / `args.name`
    (containment tests against md5sum() and name()), given a replacement
    MinHash with abundance tracking switched off, and saved through
    FileOutput(args.output, 'wt').
    """
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    def strip_abundance(sigobj):
        # build an emptied copy, turn abundance off, then refill the hashes
        flat_mh = sigobj.minhash.copy_and_clear()
        flat_mh.track_abundance = False
        flat_mh.add_many(sigobj.minhash.get_mins())
        sigobj.minhash = flat_mh

    flattened = []
    n_loaded = 0
    for sigfile in args.signatures:
        selected = list(sourmash.load_signatures(sigfile, ksize=args.ksize,
                                                 select_moltype=moltype,
                                                 do_raise=True))
        # counted before the md5/name selection below
        n_loaded += len(selected)

        if args.md5 is not None:
            selected = list(filter(lambda s: args.md5 in s.md5sum(), selected))
        if args.name is not None:
            selected = list(filter(lambda s: args.name in s.name(), selected))

        for sigobj in selected:
            strip_abundance(sigobj)

        flattened += selected

    with FileOutput(args.output, 'wt') as out_fp:
        sourmash.save_signatures(flattened, fp=out_fp)

    notify("loaded {} total that matched ksize & molecule type", n_loaded)
    notify("extracted {} signatures from {} file(s)",
           len(flattened), len(args.signatures))
Exemple #32
0
def rename(args):
    """
    rename one or more signatures.

    `args` is a list of command-line strings. It is parsed for one or more
    signature files, the new name, and the optional -q/--quiet, -d/--debug,
    -o/--output, ksize and molecule-type options.

    Every signature loaded from the given files has its 'name' entry set to
    the new name; all of them are then written to --output, or to stdout when
    --output is not given.
    """
    p = SourmashArgumentParser(prog='sourmash signature rename')
    p.add_argument('sigfiles', nargs='+')
    p.add_argument('name')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    p.add_argument('-d', '--debug', action='store_true',
                   help='output debugging output')
    p.add_argument('-o', '--output', help='output to this file')
    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)
    # Pass the --debug flag as the second argument; the quiet flag used to be
    # passed twice, so -d was parsed but never read.
    # NOTE(review): assumes set_quiet's second parameter is the debug switch.
    set_quiet(args.quiet, args.debug)
    moltype = sourmash_args.calculate_moltype(args)

    outlist = []
    for filename in args.sigfiles:
        debug('loading {}', filename)
        siglist = sourmash.load_signatures(filename, ksize=args.ksize,
                                           select_moltype=moltype)

        for sigobj in siglist:
            sigobj.d['name'] = args.name
            outlist.append(sigobj)

    if args.output:
        # the context manager closes the file even if saving raises
        with open(args.output, 'wt') as fp:
            sourmash.save_signatures(outlist, fp=fp)
    else:
        sourmash.save_signatures(outlist, fp=sys.stdout)

    notify("set name to '{}' on {} signatures", args.name, len(outlist))
Exemple #33
0
def process_sig(sigfile):
    """
    Collect per-file counters for one signature file.

    The path of the analysed file is `sigfile` with its first 5 and last 4
    characters removed, and the identifier is the fourth '/'-separated
    component of `sigfile`.

    Returns a dict with "id", "bp", one "unique_<k>" entry per key of the
    `unique` mapping returned by analyze_file, and one entry per loaded
    signature keyed by its MinHash ksize (value: len() of that MinHash).
    Returns None, after printing the error, if analyze_file raises.
    """
    source_path = sigfile[5:-4]
    ident = sigfile.split("/")[3]

    try:
        bps, _seqs, unique = analyze_file(source_path)
    except Exception as e:
        print(f"Error: {e}")
        return None

    counters = {"id": ident, "bp": bps}
    counters.update({f"unique_{k}": len(unique[k]) for k in unique})

    for sig in sourmash.load_signatures(sigfile):
        counters[sig.minhash.ksize] = len(sig.minhash)

    return counters
Exemple #34
0
def test_sbt_zipstorage(tmpdir):
    """
    Build an SBT, save it through a ZipStorage, load it back through a second
    ZipStorage on the same path, and check that the same search gives the same
    set of results before and after.
    """
    zip_path = str(tmpdir.join("tree.sbt.zip"))
    tree_path = str(tmpdir.join("tree"))

    def run_search(sbt, query):
        # stringify the hits so the two result sets can be compared directly
        return {str(hit) for hit in sbt.find(search_minhashes, query.data, 0.1)}

    original_tree = SBT(GraphFactory(31, 1e5, 4))

    for sig_name in utils.SIG_FILES:
        first_sig = next(load_signatures(utils.get_test_data(sig_name)))
        leaf = SigLeaf(os.path.basename(sig_name), first_sig)
        original_tree.add_node(leaf)
        # the last leaf added ends up being the query
        to_search = leaf

    print('*' * 60)
    print("{}:".format(to_search.metadata))
    old_result = run_search(original_tree, to_search)
    print(*old_result, sep='\n')

    with ZipStorage(zip_path) as storage:
        original_tree.save(tree_path, storage=storage)

    with ZipStorage(zip_path) as storage:
        reloaded_tree = SBT.load(tree_path,
                                 leaf_loader=SigLeaf.load,
                                 storage=storage)

        print('*' * 60)
        print("{}:".format(to_search.metadata))
        new_result = run_search(reloaded_tree, to_search)
        print(*new_result, sep='\n')

        assert old_result == new_result
Exemple #35
0
def extract(args):
    """
    extract signatures.

    Reads `quiet`, `signatures`, `ksize`, `md5`, `name` and `output` from the
    parsed `args` namespace. Signatures are loaded from every file in
    `args.signatures` and optionally narrowed by `args.md5` / `args.name`
    (containment tests against md5sum() and name()). The survivors are saved
    through FileOutput(args.output, 'wt'); if none survive, an error is
    reported and the process exits with status -1.
    """
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    extracted = []
    n_loaded = 0
    for sigfile in args.signatures:
        selected = list(sourmash.load_signatures(sigfile, ksize=args.ksize,
                                                 select_moltype=moltype,
                                                 do_raise=True))
        # counted before the md5/name selection below
        n_loaded += len(selected)

        if args.md5 is not None:
            selected = list(filter(lambda s: args.md5 in s.md5sum(), selected))
        if args.name is not None:
            selected = list(filter(lambda s: args.name in s.name(), selected))

        extracted += selected

    notify("loaded {} total that matched ksize & molecule type", n_loaded)

    if len(extracted) == 0:
        error("no matching signatures!")
        sys.exit(-1)

    with FileOutput(args.output, 'wt') as out_fp:
        sourmash.save_signatures(extracted, fp=out_fp)

    notify("extracted {} signatures from {} file(s)",
           len(extracted), len(args.signatures))
Exemple #36
0
def downsample(args):
    """
    downsample a scaled signature.

    `args` is a list of command-line strings. Exactly one of --scaled or
    --num must be given (non-zero); otherwise an error is reported and the
    process exits with status -1.

    Each loaded signature has its MinHash replaced by a downsampled one:
    - --scaled on a scaled MinHash, or --num on a num MinHash, uses the
      MinHash's own downsample method;
    - otherwise the MinHash is copied and converted between the two kinds
      with _set_num_scaled, after checking it holds enough hashes.

    All resulting signatures are written to --output (default: stdout).

    Raises ValueError when a num MinHash cannot be converted to scaled, or
    when a scaled MinHash has fewer hashes than the requested --num.
    """
    p = SourmashArgumentParser(prog='sourmash signature downsample')
    p.add_argument('signatures', nargs="+")
    p.add_argument('--scaled', type=int, default=0,
                   help='scaled value to downsample to')
    p.add_argument('--num', type=int, default=0,
                   help='num value to downsample to')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='output signature to this file')
    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    if not args.num and not args.scaled:
        error('must specify either --num or --scaled value')
        sys.exit(-1)

    if args.num and args.scaled:
        error('cannot specify both --num and --scaled')
        sys.exit(-1)

    output_list = []
    total_loaded = 0
    for sigfile in args.signatures:
        siglist = sourmash.load_signatures(sigfile, ksize=args.ksize, select_moltype=moltype, do_raise=True)

        for sigobj in siglist:
            mh = sigobj.minhash

            notify('loading and downsampling signature from {}...', sigfile, end='\r')
            total_loaded += 1
            if args.scaled:
                if mh.scaled:
                    mh_new = mh.downsample_scaled(args.scaled)
                else:                         # try to turn a num into a scaled
                    # first check: can we? the largest stored hash must reach
                    # the max hash implied by the requested scaled value.
                    max_hash = get_max_hash_for_scaled(args.scaled)
                    mins = mh.get_mins()
                    if max(mins) < max_hash:
                        raise ValueError("this num MinHash does not have enough hashes to convert it into a scaled MinHash.")

                    mh_new = copy.copy(mh)
                    _set_num_scaled(mh_new, 0, args.scaled)
            elif args.num:
                if mh.num:
                    mh_new = mh.downsample_n(args.num)
                else:                         # try to turn a scaled into a num
                    # first check: can we?
                    if len(mh) < args.num:
                        # fill in the count; the placeholder used to be
                        # emitted as a literal '{}'
                        raise ValueError("this scaled MinHash has only {} hashes".format(len(mh)))

                    mh_new = copy.copy(mh)
                    _set_num_scaled(mh_new, args.num, 0)

            sigobj.minhash = mh_new

            output_list.append(sigobj)

    sourmash.save_signatures(output_list, fp=args.output)

    notify("loaded and downsampled {} signatures", total_loaded)
Exemple #37
0
def merge(args):
    """
    merge one or more signatures.

    `args` is a list of command-line strings naming one or more signature
    files, plus optional -q/--quiet, -o/--output, --flatten, ksize and
    molecule-type options.

    The first signature loaded seeds the accumulator: copy_and_clear() of its
    MinHash, passed through _flatten when --flatten is given and it tracks
    abundance. Every signature (the first included) is then merged into that
    accumulator. Without --flatten, each signature is first checked against
    the first one with _check_abundance_compatibility.

    The merged MinHash is wrapped in a new SourmashSignature and written to
    --output (default: stdout). If no signatures were loaded, an error is
    reported and the process exits with status -1. Any exception raised while
    merging is reported with the offending signature and re-raised.
    """
    p = SourmashArgumentParser(prog='sourmash signature merge')
    p.add_argument('signatures', nargs='+')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='output signature to this file')
    p.add_argument('--flatten', action='store_true',
                   help='Remove abundances from all signatures.')
    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)

    args = p.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    first_sig = None
    mh = None
    total_loaded = 0

    # iterate over all the sigs from all the files.
    for sigfile in args.signatures:
        notify('loading signatures from {}...', sigfile, end='\r')
        this_n = 0
        for sigobj in sourmash.load_signatures(sigfile, ksize=args.ksize,
                                               select_moltype=moltype,
                                               do_raise=True):
            # first signature? initialize a bunch of stuff
            if first_sig is None:
                first_sig = sigobj
                mh = first_sig.minhash.copy_and_clear()

                # forcibly remove abundance?
                if mh.track_abundance and args.flatten:
                    _flatten(mh)

            try:
                if not args.flatten:
                    _check_abundance_compatibility(first_sig, sigobj)

                mh.merge(sigobj.minhash)
            except Exception:
                # Report which signature failed, then propagate. Not a bare
                # `except:`, so Ctrl-C / SystemExit are not reported as a
                # merge error.
                error("ERROR when merging signature '{}' ({}) from file {}",
                      sigobj.name(), sigobj.md5sum()[:8], sigfile)
                raise

            this_n += 1
            total_loaded += 1
        if this_n:
            notify('loaded and merged {} signatures from {}...', this_n, sigfile, end='\r')

    if not total_loaded:
        error("no signatures to merge!?")
        sys.exit(-1)

    merged_sigobj = sourmash.SourmashSignature(mh)

    sourmash.save_signatures([merged_sigobj], fp=args.output)

    notify('loaded and merged {} signatures', total_loaded)
Exemple #38
0
def describe(args):
    """
    provide basic info on signatures

    `args` is a list of command-line strings naming one or more signature
    files, plus optional -q/--quiet and --csv FILE.

    For every signature loaded, a text summary is emitted via print_results;
    when --csv is given, one row per signature is also written to that file.
    A file that fails to load is reported and the remaining files are still
    processed.
    """
    p = SourmashArgumentParser(prog='sourmash signature describe')
    p.add_argument('signatures', nargs='+')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    p.add_argument('--csv', type=argparse.FileType('wt'),
                   help='output information to a CSV file')

    args = p.parse_args(args)
    set_quiet(args.quiet)

    # collect (signature, source file) pairs across all input files
    siglist = []
    for sigfile in args.signatures:
        this_siglist = []
        try:
            this_siglist = sourmash.load_signatures(sigfile, quiet=True, do_raise=True)
            for k in this_siglist:
                siglist.append((k, sigfile))
        except Exception as exc:
            # best-effort: report the failure and move on to the next file;
            # signatures appended before the failure stay in siglist.
            error('\nError while reading signatures from {}:'.format(sigfile))
            error(str(exc))
            error('(continuing)')

        # note: len(siglist) is the running total across files so far,
        # not the number loaded from this particular file.
        notify('loaded {} signatures from {}...', len(siglist), sigfile,
               end='\r')

    notify('loaded {} signatures total.', len(siglist))

    # write CSV?
    w = None
    if args.csv:
        # extrasaction='ignore' lets writerow() accept a dict with extra keys;
        # that is what allows locals() to be used as the row below.
        w = csv.DictWriter(args.csv,
                           ['signature_file', 'md5', 'ksize', 'moltype', 'num',
                            'scaled', 'n_hashes', 'seed', 'with_abundance',
                            'name', 'filename', 'license'],
                           extrasaction='ignore')
        w.writeheader()

    # extract info, write as appropriate.
    # The local variable names assigned in this loop are significant: they
    # must match the CSV field names above and the {placeholders} in the
    # template below, because both are filled from locals().
    for (sig, signature_file) in siglist:
        mh = sig.minhash
        ksize = mh.ksize
        moltype = 'DNA'
        if mh.is_protein:
            moltype = 'protein'
        scaled = mh.scaled
        num = mh.num
        seed = mh.seed
        n_hashes = len(mh)
        with_abundance = 0
        if mh.track_abundance:
            with_abundance = 1
        md5 = sig.md5sum()
        name = sig.name()
        filename = sig.d.get('filename', '')
        # NOTE(review): unlike 'filename', no default is supplied here, so a
        # signature without a 'license' entry would presumably raise KeyError
        # - confirm against how sig.d is populated.
        license = sig.d['license']

        if w:
            # row values come from the locals above; other locals are ignored
            w.writerow(locals())

        # every local is passed as a keyword argument; presumably
        # print_results formats the template with them - confirm.
        print_results('''\
---
signature filename: {signature_file}
signature: {name}
source file: {filename}
md5: {md5}
k={ksize} molecule={moltype} num={num} scaled={scaled} seed={seed} track_abundance={with_abundance}
size: {n_hashes}
signature license: {license}
''', **locals())