Example #1
0
def sig_import(args):
    """
    import a signature into sourmash format.

    Each file named in ``args.filenames`` is read as JSON.  Its ``kmer`` and
    ``sketchSize`` fields give the k-mer size and the number of hashes of the
    new MinHash, and the hashes come from the first entry of ``sketches``
    (further entries are not read).  One signature per input file is saved
    to ``args.output``.
    """
    set_quiet(args.quiet)

    imported = []
    for path in args.filenames:
        with open(path) as infp:
            record = json.load(infp)

        ksize = record['kmer']
        num = record['sketchSize']

        # only sketches built with exactly this hash configuration are taken.
        assert record['hashType'] == "MurmurHash3_x64_128"
        assert record['hashBits'] == 64
        assert record['hashSeed'] == 42

        first_sketch = record['sketches'][0]
        hashes = first_sketch['hashes']

        minhash = sourmash.MinHash(n=num, ksize=ksize, is_protein=False)
        minhash.add_many(hashes)

        imported.append(sourmash.SourmashSignature(minhash, filename=path))

    with FileOutput(args.output, 'wt') as outfp:
        sourmash.save_signatures(imported, outfp)
    def write(self, csv_writer, csvoutfp, outdir):
        """
        Save the results gathered for this hashval query.

        A row of (hashval, total bp, total sequences) is appended to the CSV
        writer and the CSV file object is flushed.  Three files named after
        the hashval are then written into ``<outdir>/contigs``: the sorted
        cDBG IDs (gzipped text), the contigs (gzipped FASTA) and, only when
        ``self.mh`` is truthy, a signature built from ``self.mh``.
        """
        hashval = self.query_hashval
        row = [hashval, self.total_bp, self.total_seq]

        # output to results.csv!
        csv_writer.writerow(row)
        csvoutfp.flush()

        # TR add contigs folder
        q_name = str(hashval)
        prefix = os.path.basename(q_name)
        contigs_dir = os.path.join(outdir, "contigs")

        # write out cDBG IDs, one per line, in sorted order
        ids_path = os.path.join(contigs_dir, prefix + '.cdbg_ids.txt.gz')
        with gzip.open(ids_path, 'wt') as fp:
            fp.write("\n".join(map(str, sorted(self.cdbg_shadow))))

        # write out contigs as FASTA records
        fasta_path = os.path.join(contigs_dir, prefix + '.contigs.fa.gz')
        with gzip.open(fasta_path, 'wt') as fp:
            for name, sequence in self.contigs:
                fp.write('>{}\n{}\n'.format(name, sequence))

        # save minhash? nothing more to do without one.
        if not self.mh:
            return

        ss = sourmash.SourmashSignature(
            self.mh, name='hashval query:{}'.format(q_name))

        sigfile = os.path.join(contigs_dir, q_name + '.contigs.sig')
        with open(sigfile, 'wt') as fp:
            sourmash.save_signatures([ss], fp)
Example #3
0
def rename(args):
    """
    rename one or more signatures.

    Loads every signature from the files in ``args.sigfiles`` (restricted by
    ``args.ksize`` and the molecule type computed from ``args``), sets the
    name of each one to ``args.name`` and saves them all to ``args.output``.
    """
    # The second argument of set_quiet is the debug switch; passing
    # args.quiet there meant the debug() call below was never governed by a
    # debug option.  getattr() keeps namespaces without 'debug' working.
    set_quiet(args.quiet, getattr(args, 'debug', False))
    moltype = sourmash_args.calculate_moltype(args)

    progress = sourmash_args.SignatureLoadingProgress()

    outlist = []
    for filename in args.sigfiles:
        debug('loading {}', filename)
        siglist = sourmash_args.load_file_as_signatures(filename,
                                                        ksize=args.ksize,
                                                        select_moltype=moltype,
                                                        traverse=True,
                                                        progress=progress)

        for sigobj in siglist:
            # the name is stored on the signature's private _name attribute.
            sigobj._name = args.name
            outlist.append(sigobj)

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures(outlist, fp=fp)

    notify("set name to '{}' on {} signatures", args.name, len(outlist))
def main():
    """
    Save the read abundances of the k-mers found in an assembly.

    Command line: ``contigs`` (an assembly), ``read_sig`` (a file that must
    hold exactly one signature) and ``-o/--output``.  The assembly is
    sketched with the same parameters as the read signature; every hash in
    that sketch is given its abundance from the read signature, or 0 when
    the read signature does not contain it.  The result is saved as a new
    signature.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('contigs')  # this is an assembly
    parser.add_argument('read_sig')  # this contains sourmash sig with abunds
    parser.add_argument('-o', '--output', required=True)
    args = parser.parse_args()

    loaded = list(sourmash.load_file_as_signatures(args.read_sig))
    assert len(loaded) == 1
    sig = loaded[0]

    # sketch the assembly with the same parameters as the read signature.
    contigs_mh = sig.minhash.copy_and_clear()
    for record in screed.open(args.contigs):
        contigs_mh.add_sequence(record.sequence, force=True)

    # intersect the genome assembly with the read abundances
    # so now we get the abundances of only the k-mers that are in the
    # assembly.
    abunds = {
        hashval: sig.minhash.hashes.get(hashval, 0)
        for hashval in contigs_mh.hashes
    }

    output_mh = sig.minhash.copy_and_clear()
    output_mh.set_abundances(abunds)
    out_sig = sourmash.SourmashSignature(output_mh)

    with open(args.output, 'wt') as fp:
        print(f"Saving output to '{args.output}'")
        sourmash.save_signatures([out_sig], fp)
Example #5
0
def flatten(args):
    """
    flatten a signature, removing abundances.

    Parses its own command line from ``args``.  Signatures are loaded from
    every input file, optionally narrowed by --md5 / --name substring
    matches, rebuilt without abundance tracking and written to --output
    (stdout by default).
    """
    parser = SourmashArgumentParser(prog='sourmash signature flatten')
    parser.add_argument('signatures', nargs='+')
    parser.add_argument('-q', '--quiet', action='store_true',
                        help='suppress non-error output')
    parser.add_argument('-o', '--output', type=argparse.FileType('wt'),
                        default=sys.stdout,
                        help='output signature to this file')
    parser.add_argument(
        '--md5', default=None,
        help='select signatures whose md5 contains this substring')
    parser.add_argument(
        '--name', default=None,
        help='select signatures whose name contains this substring')

    sourmash_args.add_ksize_arg(parser, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(parser)
    args = parser.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    outlist = []
    total_loaded = 0
    for filename in args.signatures:
        selected = list(sourmash.load_signatures(filename,
                                                 ksize=args.ksize,
                                                 select_moltype=moltype,
                                                 do_raise=True))

        # counted before selection, so this is everything that was loaded.
        total_loaded += len(selected)

        # select!
        if args.md5 is not None:
            selected = [sig for sig in selected if args.md5 in sig.md5sum()]
        if args.name is not None:
            selected = [sig for sig in selected if args.name in sig.name()]

        for sig in selected:
            # rebuild the sketch from the bare hashes, abundance tracking off.
            flat_mh = sig.minhash.copy_and_clear()
            flat_mh.track_abundance = False
            flat_mh.add_many(sig.minhash.get_mins())

            sig.minhash = flat_mh

        outlist.extend(selected)

    sourmash.save_signatures(outlist, fp=args.output)

    notify("loaded {} total that matched ksize & molecule type", total_loaded)
    notify("extracted {} signatures from {} file(s)", len(outlist),
           len(args.signatures))
Example #6
0
def get_target_sig(sample_name):
    """
    Sketch the sequences in ``sample_name`` and save them as a signature.

    Builds a MinHash with n=1000 and ksize=31 from every record that screed
    reads from ``sample_name``, wraps it in a signature named after the
    input, and writes that signature to ``<sample_name>.sig``.  Nothing is
    returned.
    """
    sketch = sourmash.MinHash(n=1000, ksize=31)
    for record in screed.open(sample_name):
        # second positional argument kept as in the original call;
        # presumably the 'force' flag -- TODO confirm against MinHash API.
        sketch.add_sequence(record.sequence, True)

    target = SourmashSignature(sketch, name=sample_name)
    with open(sample_name + '.sig', 'wt') as outfp:
        save_signatures([target], outfp)
Example #7
0
def intersect(args):
    """
    intersect one or more signatures by taking the intersection of hashes.

    Abundances are removed from the output unless ``args.abundances_from``
    is set.  In that case the signature loaded from that file is intersected
    as well, and the output keeps that signature's abundances for the
    surviving hashes.

    Exits with status -1 when no signatures were loaded, or when the
    --abundances-from signature does not track abundance.
    """
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    first_sig = None
    mins = None
    total_loaded = 0

    for sigfile in args.signatures:
        for sigobj in sourmash.load_signatures(sigfile, ksize=args.ksize,
                                               select_moltype=moltype,
                                               do_raise=True):
            if first_sig is None:
                # the first signature seeds the running intersection and
                # later provides the MinHash parameters for the output.
                first_sig = sigobj
                mins = set(sigobj.minhash.get_mins())

            mins.intersection_update(sigobj.minhash.get_mins())
            total_loaded += 1
        notify('loaded and intersected signatures from {}...', sigfile, end='\r')

    if total_loaded == 0:
        # message previously said "merge", copied from the merge command.
        error("no signatures to intersect!?")
        sys.exit(-1)

    # forcibly turn off track_abundance, unless --abundances-from set.
    if not args.abundances_from:
        intersect_mh = first_sig.minhash.copy_and_clear()
        intersect_mh.track_abundance = False
        intersect_mh.add_many(mins)
        intersect_sigobj = sourmash.SourmashSignature(intersect_mh)
    else:
        notify('loading signature from {}, keeping abundances',
               args.abundances_from)
        abund_sig = sourmash.load_one_signature(args.abundances_from,
                                                ksize=args.ksize,
                                                select_moltype=moltype)
        if not abund_sig.minhash.track_abundance:
            error("--track-abundance not set on loaded signature?! exiting.")
            sys.exit(-1)
        intersect_mh = abund_sig.minhash.copy_and_clear()
        abund_mins = abund_sig.minhash.get_mins(with_abundance=True)

        # do one last intersection, against the hashes that have abundances
        mins.intersection_update(abund_mins)
        abund_mins = {k: abund_mins[k] for k in mins}

        intersect_mh.set_abundances(abund_mins)
        intersect_sigobj = sourmash.SourmashSignature(intersect_mh)

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures([intersect_sigobj], fp=fp)

    notify('loaded and intersected {} signatures', total_loaded)
Example #8
0
def downsample(args):
    """
    downsample a scaled signature.

    Exactly one of ``args.num`` / ``args.scaled`` must be given; otherwise
    an error is reported and the process exits with status -1.  Every
    signature loaded from ``args.signatures`` is converted to the requested
    num or scaled value and the results are saved to ``args.output``.

    Raises ValueError when a num MinHash cannot be converted to the
    requested scaled value, or when a scaled MinHash holds fewer hashes
    than the requested num.
    """
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    if not args.num and not args.scaled:
        error('must specify either --num or --scaled value')
        sys.exit(-1)

    if args.num and args.scaled:
        error('cannot specify both --num and --scaled')
        sys.exit(-1)

    output_list = []
    total_loaded = 0
    for sigfile in args.signatures:
        siglist = sourmash.load_signatures(sigfile, ksize=args.ksize, select_moltype=moltype, do_raise=True)

        for sigobj in siglist:
            mh = sigobj.minhash

            notify('loading and downsampling signature from {}...', sigfile, end='\r')
            total_loaded += 1
            if args.scaled:
                if mh.scaled:
                    mh_new = mh.downsample_scaled(args.scaled)
                else:                         # try to turn a num into a scaled
                    # first check: can we? the largest stored hash has to
                    # reach the scaled cutoff. An empty sketch is rejected
                    # here too, instead of failing inside max().
                    max_hash = get_max_hash_for_scaled(args.scaled)
                    mins = mh.get_mins()
                    if not mins or max(mins) < max_hash:
                        raise ValueError("this num MinHash does not have enough hashes to convert it into a scaled MinHash.")

                    mh_new = copy.copy(mh)
                    _set_num_scaled(mh_new, 0, args.scaled)
            elif args.num:
                if mh.num:
                    mh_new = mh.downsample_n(args.num)
                else:                         # try to turn a scaled into a num
                    # first check: can we?
                    if len(mh) < args.num:
                        # the placeholder was previously left unformatted.
                        raise ValueError("this scaled MinHash has only {} hashes".format(len(mh)))

                    mh_new = copy.copy(mh)
                    _set_num_scaled(mh_new, args.num, 0)

            sigobj.minhash = mh_new

            output_list.append(sigobj)

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures(output_list, fp=fp)

    notify("loaded and downsampled {} signatures", total_loaded)
Example #9
0
def extract(args):
    """
    extract signatures.

    Parses its own command line from ``args``, loads the signatures in every
    input file, keeps those matching the optional --md5 / --name substrings
    and writes them to --output (stdout by default).  Exits with status -1
    when nothing matches.
    """
    parser = SourmashArgumentParser(prog='sourmash signature extract')
    parser.add_argument('signatures', nargs='+')
    parser.add_argument('-q', '--quiet', action='store_true',
                        help='suppress non-error output')
    parser.add_argument('-o', '--output', type=argparse.FileType('wt'),
                        default=sys.stdout,
                        help='output signature to this file')
    parser.add_argument(
        '--md5', default=None,
        help='select signatures whose md5 contains this substring')
    parser.add_argument(
        '--name', default=None,
        help='select signatures whose name contains this substring')

    sourmash_args.add_ksize_arg(parser, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(parser)
    args = parser.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    outlist = []
    total_loaded = 0
    for filename in args.signatures:
        selected = list(sourmash.load_signatures(filename,
                                                 ksize=args.ksize,
                                                 select_moltype=moltype,
                                                 do_raise=True))

        # counted before selection, so this is everything that was loaded.
        total_loaded += len(selected)

        # select!
        if args.md5 is not None:
            selected = [sig for sig in selected if args.md5 in sig.md5sum()]
        if args.name is not None:
            selected = [sig for sig in selected if args.name in sig.name()]

        outlist.extend(selected)

    notify("loaded {} total that matched ksize & molecule type", total_loaded)
    if len(outlist) == 0:
        error("no matching signatures!")
        sys.exit(-1)

    sourmash.save_signatures(outlist, fp=args.output)

    notify("extracted {} signatures from {} file(s)", len(outlist),
           len(args.signatures))
Example #10
0
def merge(args):
    """
    merge one or more signatures.

    All signatures from all files in ``args.signatures`` are merged into one
    MinHash whose parameters are copied from the first signature loaded.
    With ``args.flatten`` abundance tracking is switched off on the merged
    MinHash and on each input; otherwise each input is checked against the
    first signature with _check_abundance_compatibility.  The merged
    signature is saved to ``args.output``.  Exits with status -1 when no
    signatures were loaded.
    """
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    first_sig = None
    merged_mh = None
    total_loaded = 0

    # iterate over all the sigs from all the files.
    for sigfile in args.signatures:
        notify('loading signatures from {}...', sigfile, end='\r')
        loaded_here = 0
        loader = sourmash.load_signatures(sigfile, ksize=args.ksize,
                                          select_moltype=moltype,
                                          do_raise=True)
        for sigobj in loader:
            # first signature? initialize a bunch of stuff
            if first_sig is None:
                first_sig = sigobj
                merged_mh = first_sig.minhash.copy_and_clear()

                # forcibly remove abundance?
                if args.flatten:
                    merged_mh.track_abundance = False

            try:
                current_mh = sigobj.minhash
                if args.flatten:
                    current_mh.track_abundance = False
                else:
                    _check_abundance_compatibility(first_sig, sigobj)

                merged_mh.merge(current_mh)
            except:
                # say which signature failed, then let the error propagate.
                error("ERROR when merging signature '{}' ({}) from file {}",
                      sigobj.name(), sigobj.md5sum()[:8], sigfile)
                raise

            loaded_here += 1
            total_loaded += 1

        if loaded_here:
            notify('loaded and merged {} signatures from {}...', loaded_here, sigfile, end='\r')

    if total_loaded == 0:
        error("no signatures to merge!?")
        sys.exit(-1)

    merged_sigobj = sourmash.SourmashSignature(merged_mh)

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures([merged_sigobj], fp=fp)

    notify('loaded and merged {} signatures', total_loaded)
Example #11
0
def filter(args):
    """
    filter hashes by abundance in all of the signatures

    Signatures are loaded from ``args.signatures`` and optionally narrowed
    by the ``args.md5`` / ``args.name`` substrings.  For each one that
    tracks abundance, only hashes whose abundance is at least
    ``args.min_abundance`` and, when ``args.max_abundance`` is set, at most
    that value are kept.  Signatures without abundance tracking are reported
    and passed through unchanged.  Everything is saved to ``args.output``.
    """
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    progress = sourmash_args.SignatureLoadingProgress()

    outlist = []
    total_loaded = 0
    for filename in args.signatures:
        selected = list(sourmash_args.load_file_as_signatures(
            filename,
            ksize=args.ksize,
            select_moltype=moltype,
            traverse=True,
            progress=progress))

        # counted before selection, so this is everything that was loaded.
        total_loaded += len(selected)

        # select!
        if args.md5 is not None:
            selected = [sig for sig in selected if args.md5 in sig.md5sum()]
        if args.name is not None:
            selected = [sig for sig in selected if args.name in sig.name()]

        for sig in selected:
            mh = sig.minhash
            if not mh.track_abundance:
                notify('ignoring signature {} - track_abundance not set.', sig)
                continue

            # keep only the hashes whose abundance falls inside the bounds.
            kept = {
                hashval: abund
                for hashval, abund in mh.get_mins(with_abundance=True).items()
                if abund >= args.min_abundance
                and (args.max_abundance is None
                     or abund <= args.max_abundance)
            }

            filtered_mh = mh.copy_and_clear()
            filtered_mh.set_abundances(kept)

            sig.minhash = filtered_mh

        outlist.extend(selected)

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures(outlist, fp=fp)

    notify("loaded {} total that matched ksize & molecule type", total_loaded)
    notify("extracted {} signatures from {} file(s)", len(outlist),
           len(args.signatures))
Example #12
0
def subtract(args):
    """
    subtract one or more signatures from another

    Starts from the hashes of the signature in ``args.signature_from`` and
    removes every hash found in the signatures of ``args.subtraction_sigs``.
    The remaining hashes are saved to ``args.output`` in a MinHash copied
    from the starting signature.  Exits with status 1 when a signature
    tracks abundance and ``args.flatten`` is not set, and with status -1
    when no subtraction signatures were loaded.
    """
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    from_sigfile = args.signature_from
    from_sigobj = sourmash.load_one_signature(from_sigfile,
                                              ksize=args.ksize,
                                              select_moltype=moltype)

    from_mh = from_sigobj.minhash
    if not args.flatten and from_mh.track_abundance:
        error('Cannot use subtract on signatures with abundance tracking, sorry!')
        sys.exit(1)

    remaining = set(from_mh.get_mins())

    notify('loaded signature from {}...', from_sigfile, end='\r')

    total_loaded = 0
    for sigfile in args.subtraction_sigs:
        loader = sourmash.load_signatures(sigfile, ksize=args.ksize,
                                          select_moltype=moltype,
                                          do_raise=True)
        for sigobj in loader:
            if not args.flatten and sigobj.minhash.track_abundance:
                error('Cannot use subtract on signatures with abundance tracking, sorry!')
                sys.exit(1)

            remaining.difference_update(sigobj.minhash.get_mins())

            notify('loaded and subtracted signatures from {}...', sigfile, end='\r')
            total_loaded += 1

    if total_loaded == 0:
        error("no signatures to subtract!?")
        sys.exit(-1)

    # the output reuses the parameters of the starting signature's MinHash.
    subtract_mh = from_sigobj.minhash.copy_and_clear()
    subtract_mh.add_many(remaining)

    subtract_sigobj = sourmash.SourmashSignature(subtract_mh)

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures([subtract_sigobj], fp=fp)

    notify('loaded and subtracted {} signatures', total_loaded)
Example #13
0
def create_signatures(file_list, ksize=21, verbose=False):
    """
    Create a ``<file>.sig`` signature next to each file in ``file_list``.

    A file is skipped when its ``.sig`` already exists and holds a MinHash
    with the requested ``ksize``.  Otherwise the genome is read with
    GenomeTools.read_fasta, sketched into a MinHash with n=1000, and saved
    as a signature named after the ``.sig`` path's stem.  With ``verbose``
    the iteration is wrapped in tqdm.
    """
    sig_paths = [Path(str(entry) + '.sig') for entry in file_list]
    reader = GenomeTools()

    if verbose:
        sig_paths = tqdm(sig_paths, total=len(sig_paths))

    for sig_path in sig_paths:
        if sig_path.is_file():
            existing = sourmash.load_one_signature(str(sig_path))
            if existing.minhash.ksize == ksize:
                continue

        sketch = sourmash.MinHash(n=1000, ksize=ksize)
        # with_suffix('') strips the '.sig' added above, giving the genome.
        genome = reader.read_fasta(sig_path.with_suffix(''))
        sketch.add_sequence(genome, True)

        new_sig = sourmash.SourmashSignature(sketch, name=sig_path.stem)
        with sig_path.open('wt') as handle:
            sourmash.save_signatures([new_sig], handle)
Example #14
0
def subtract(args):
    """
    subtract one or more signatures from another

    Parses its own command line from ``args``.  Starts from the hashes of
    the ``signature_from`` signature and removes every hash found in the
    ``subtraction_sigs`` signatures.  The remaining hashes are written to
    --output (stdout by default) in a MinHash copied from the starting
    signature.  Exits with status 1 when a signature tracks abundance and
    --flatten was not given, and with status -1 when no subtraction
    signatures were loaded.
    """
    p = SourmashArgumentParser(prog='sourmash signature subtract')
    p.add_argument('signature_from')
    p.add_argument('subtraction_sigs', nargs='+')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='output signature to this file')
    p.add_argument('--flatten', action='store_true',
                   help='remove abundance from signatures before subtracting')
    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    from_sigfile = args.signature_from
    from_sigobj = sourmash.load_one_signature(from_sigfile, ksize=args.ksize, select_moltype=moltype)

    from_mh = from_sigobj.minhash
    if from_mh.track_abundance and not args.flatten:
        error('Cannot use subtract on signatures with abundance tracking, sorry!')
        sys.exit(1)

    subtract_mins = set(from_mh.get_mins())

    notify('loaded signature from {}...', from_sigfile, end='\r')

    total_loaded = 0
    for sigfile in args.subtraction_sigs:
        for sigobj in sourmash.load_signatures(sigfile, ksize=args.ksize,
                                               select_moltype=moltype,
                                               do_raise=True):

            if sigobj.minhash.track_abundance and not args.flatten:
                error('Cannot use subtract on signatures with abundance tracking, sorry!')
                sys.exit(1)

            subtract_mins -= set(sigobj.minhash.get_mins())

            notify('loaded and subtracted signatures from {}...', sigfile, end='\r')
            total_loaded += 1

    if not total_loaded:
        error("no signatures to subtract!?")
        sys.exit(-1)

    # the output reuses the parameters of the starting signature's MinHash.
    subtract_mh = from_sigobj.minhash.copy_and_clear()
    subtract_mh.add_many(subtract_mins)

    subtract_sigobj = sourmash.SourmashSignature(subtract_mh)

    # the return value was bound to a never-read local; it is not needed.
    sourmash.save_signatures([subtract_sigobj], fp=args.output)

    notify('loaded and subtracted {} signatures', total_loaded)
    def write(self, csv_writer, csvoutfp, outdir, catlas_name):
        """
        Save the results of this query against a catlas.

        Appends one summary row to the CSV writer and flushes the CSV file
        object, then writes four files into ``outdir``, each named after the
        basename of the query's filename: a signature of the retrieved
        contigs, the sorted cDBG IDs (gzipped), the sorted frontier nodes
        (gzipped) and the response curve.
        """
        containment = self.containment()
        similarity = self.similarity()
        query = self.query
        q_name = query.filename
        bp = self.total_bp
        seqs = self.total_seq
        k = query.ksize
        num_q_kmers = len(query.kmers)
        bounds = query.con_sim_upper_bounds(self.catlas, self.kmer_idx)
        best_con, cdbg_min_oh, catlas_min_oh = bounds

        # output to results.csv!
        row = [q_name, containment, similarity, bp,
               seqs, k, num_q_kmers,
               best_con, cdbg_min_oh,
               catlas_min_oh, catlas_name]
        csv_writer.writerow(row)
        csvoutfp.flush()

        # every output file shares this prefix.
        prefix = os.path.basename(q_name)

        # write out signature from retrieved contigs.
        sig_filename = prefix + '.contigs.sig'
        with open(os.path.join(outdir, sig_filename), 'wt') as fp:
            ss = sourmash.SourmashSignature(self.contigs_minhash,
                                            name='nbhd:'+self.query.name,
                                            filename=sig_filename)
            sourmash.save_signatures([ss], fp)

        # write out cDBG IDs
        cdbg_path = os.path.join(outdir, prefix + '.cdbg_ids.txt.gz')
        with gzip.open(cdbg_path, 'wt') as fp:
            fp.write("\n".join(map(str, sorted(self.shadow))))

        # write out catlas nodes
        frontier_path = os.path.join(outdir, prefix + '.frontier.txt.gz')
        with gzip.open(frontier_path, 'wt') as fp:
            for node in sorted(self.leaves):
                fp.write('{}\n'.format(node))

        # write response curve
        response_curve_filename = os.path.join(outdir,
                                               prefix + '.response.txt')
        cdbg_match_counts = self.query.cdbg_match_counts[self.catlas.name]
        search_utils.output_response_curve(response_curve_filename,
                                           cdbg_match_counts,
                                           self.kmer_idx,
                                           self.catlas.layer1_to_cdbg)
Example #16
0
def cat(args):
    """
    concatenate all signatures into one file.

    Loads the signatures from every file in ``args.signatures`` and saves
    them all to ``args.output``.  With ``args.unique`` a signature whose
    md5sum has already been seen is skipped.  A file that fails to load is
    reported through error() and processing continues with the next file.
    """
    set_quiet(args.quiet)

    encountered_md5sums = defaultdict(int)  # used by --unique
    progress = sourmash_args.SignatureLoadingProgress()

    siglist = []
    for sigfile in args.signatures:
        # Reset the per-file counter before anything can fail.  It used to
        # be set inside the try block after the loader was created, so a
        # failing load left it unbound (or holding the previous file's
        # count) when notify() ran below.
        n_loaded = 0
        try:
            loader = sourmash_args.load_file_as_signatures(sigfile,
                                                           traverse=True,
                                                           progress=progress)
            for sig in loader:
                n_loaded += 1

                md5 = sig.md5sum()
                encountered_md5sums[md5] += 1
                if args.unique and encountered_md5sums[md5] > 1:
                    continue

                siglist.append(sig)
        except Exception as exc:
            # deliberate best effort: report and move on to the next file.
            error(str(exc))
            error('(continuing)')

        notify('loaded {} signatures from {}...', n_loaded, sigfile, end='\r')

    notify('loaded {} signatures total.', len(siglist))

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures(siglist, fp=fp)

    notify('output {} signatures', len(siglist))

    # number of distinct md5sums that were seen more than once
    n_repeated = sum(1 for cnt in encountered_md5sums.values() if cnt > 1)
    if n_repeated:
        notify('encountered {} MinHashes multiple times', n_repeated)
        if args.unique:
            notify(
                '...and removed the duplicates, because --unique was specified.'
            )
Example #17
0
def test_sourmash_signature_api():
    """Round-trip a signature through save_signatures and both loaders."""
    original = sourmash.SourmashSignature(sourmash.MinHash(n=1, ksize=20))

    # called without a file object; the return value is fed to the loaders.
    serialized = sourmash.save_signatures([original])

    via_load_one = sourmash.load_one_signature(serialized)
    via_load_all = list(sourmash.load_signatures(serialized))[0]

    assert via_load_one == original
    assert via_load_all == original
Example #18
0
def rename(args):
    """
    rename one or more signatures.

    Parses its own command line from ``args``.  Every signature loaded from
    ``sigfiles`` gets its name set to ``name``; the results are written to
    the --output file, or to stdout when no output file is given.
    """
    p = SourmashArgumentParser(prog='sourmash signature rename')
    p.add_argument('sigfiles', nargs='+')
    p.add_argument('name')
    p.add_argument('-q',
                   '--quiet',
                   action='store_true',
                   help='suppress non-error output')
    p.add_argument('-d',
                   '--debug',
                   action='store_true',
                   help='output debugging output')
    p.add_argument('-o', '--output', help='output to this file')
    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)
    # --debug is parsed above but args.quiet used to be passed in its place,
    # so the flag had no effect on the debug() call below.
    set_quiet(args.quiet, args.debug)
    moltype = sourmash_args.calculate_moltype(args)

    outlist = []
    for filename in args.sigfiles:
        debug('loading {}', filename)
        siglist = sourmash.load_signatures(filename,
                                           ksize=args.ksize,
                                           select_moltype=moltype)

        for sigobj in siglist:
            sigobj.d['name'] = args.name
            outlist.append(sigobj)

    if args.output:
        # the context manager closes the file even if saving raises.
        with open(args.output, 'wt') as fp:
            sourmash.save_signatures(outlist, fp=fp)
    else:
        sourmash.save_signatures(outlist, fp=sys.stdout)

    notify("set name to '{}' on {} signatures", args.name, len(outlist))
Example #19
0
def test_sourmash_signature_api():
    """Check that a saved signature loads back equal to the original."""
    minhash = sourmash.MinHash(n=1, ksize=20)
    expected = sourmash.SourmashSignature(minhash)

    # called without a file object; the return value is fed to the loaders.
    saved = sourmash.save_signatures([expected])

    single = sourmash.load_one_signature(saved)
    first_of_many = list(sourmash.load_signatures(saved))[0]

    assert single == expected
    assert first_of_many == expected
Example #20
0
def intersect(args):
    """
    intersect one or more signatures by taking the intersection of hashes.

    This function always removes abundances.

    Parses its own command line from ``args``.  The hashes common to all
    loaded signatures are stored in a MinHash copied from the first
    signature and written to --output (stdout by default).  Exits with
    status -1 when no signatures were loaded.
    """
    p = SourmashArgumentParser(prog='sourmash signature intersect')
    p.add_argument('signatures', nargs='+')
    p.add_argument('-q',
                   '--quiet',
                   action='store_true',
                   help='suppress non-error output')
    p.add_argument('-o',
                   '--output',
                   type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='output signature to this file')
    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    first_sig = None
    mins = None
    total_loaded = 0

    for sigfile in args.signatures:
        for sigobj in sourmash.load_signatures(sigfile,
                                               ksize=args.ksize,
                                               select_moltype=moltype,
                                               do_raise=True):
            if first_sig is None:
                # the first signature seeds the running intersection and
                # later provides the MinHash parameters for the output.
                first_sig = sigobj
                mins = set(sigobj.minhash.get_mins())

            mins.intersection_update(sigobj.minhash.get_mins())
            total_loaded += 1
        notify('loaded and intersected signatures from {}...',
               sigfile,
               end='\r')

    if total_loaded == 0:
        # message previously said "merge", copied from the merge command.
        error("no signatures to intersect!?")
        sys.exit(-1)

    # forcibly turn off track_abundance
    intersect_mh = first_sig.minhash.copy_and_clear()
    _flatten(intersect_mh)
    intersect_mh.add_many(mins)
    intersect_sigobj = sourmash.SourmashSignature(intersect_mh)

    # the return value was bound to a never-read local; it is not needed.
    sourmash.save_signatures([intersect_sigobj], fp=args.output)

    notify('loaded and intersected {} signatures', total_loaded)
Example #21
0
def flatten(args):
    """
    flatten a signature, removing abundances.

    Signatures are loaded from every file in ``args.signatures``, optionally
    narrowed by the ``args.md5`` / ``args.name`` substrings, rebuilt without
    abundance tracking and saved to ``args.output``.
    """
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    progress = sourmash_args.SignatureLoadingProgress()

    outlist = []
    total_loaded = 0
    for filename in args.signatures:
        selected = list(sourmash_args.load_file_as_signatures(
            filename,
            ksize=args.ksize,
            select_moltype=moltype,
            traverse=True,
            progress=progress))

        # counted before selection, so this is everything that was loaded.
        total_loaded += len(selected)

        # select!
        if args.md5 is not None:
            selected = [sig for sig in selected if args.md5 in sig.md5sum()]
        if args.name is not None:
            selected = [sig for sig in selected if args.name in sig.name()]

        for sig in selected:
            # rebuild the sketch from the bare hashes, abundance tracking off.
            flat_mh = sig.minhash.copy_and_clear()
            flat_mh.track_abundance = False
            flat_mh.add_many(sig.minhash.get_mins())

            sig.minhash = flat_mh

        outlist.extend(selected)

    with FileOutput(args.output, 'wt') as fp:
        sourmash.save_signatures(outlist, fp=fp)

    notify("loaded {} total that matched ksize & molecule type", total_loaded)
    notify("extracted {} signatures from {} file(s)", len(outlist),
           len(args.signatures))
Example #22
0
def test_linear_index_load():
    """
    Save three signatures to one file and load them back via LinearIndex.

    Checks that the loaded index yields exactly the saved signatures and
    remembers the filename it was loaded from.
    """
    sig2 = utils.get_test_data('2.fa.sig')
    sig47 = utils.get_test_data('47.fa.sig')
    sig63 = utils.get_test_data('63.fa.sig')

    ss2 = sourmash.load_one_signature(sig2, ksize=31)
    ss47 = sourmash.load_one_signature(sig47)
    ss63 = sourmash.load_one_signature(sig63)

    with utils.TempDirectory() as location:
        # the unused local 'from sourmash import save_signatures' is gone;
        # the call below goes through the sourmash module itself.
        filename = os.path.join(location, 'foo')
        with open(filename, 'wt') as fp:
            sourmash.save_signatures([ss2, ss47, ss63], fp)

        linear = LinearIndex.load(filename)

    x = {ss2, ss47, ss63}
    # on failure show the signatures, not the bound method object.
    assert set(linear.signatures()) == x, list(linear.signatures())
    assert linear.filename == filename
Example #23
0
def flatten(args):
    """
    flatten a signature, removing abundances.

    `args` is the list of command-line arguments for
    'sourmash signature flatten'.  Signatures are loaded from each input
    file (restricted by ksize and molecule type), optionally narrowed with
    --md5 / --name substring matches, flattened, and written to --output
    (stdout by default).
    """
    p = SourmashArgumentParser(prog='sourmash signature flatten')
    p.add_argument('signatures', nargs='+')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='output signature to this file')
    p.add_argument('--md5', default=None,
                   help='select signatures whose md5 contains this substring')
    p.add_argument('--name', default=None,
                   help='select signatures whose name contains this substring')

    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    outlist = []
    total_loaded = 0
    for filename in args.signatures:
        siglist = sourmash.load_signatures(filename, ksize=args.ksize,
                                           select_moltype=moltype,
                                           do_raise=True)
        siglist = list(siglist)

        # count everything loaded, before the --md5 / --name selection
        total_loaded += len(siglist)

        # select!
        if args.md5 is not None:
            siglist = [ss for ss in siglist if args.md5 in ss.md5sum()]
        if args.name is not None:
            siglist = [ss for ss in siglist if args.name in ss.name()]

        for ss in siglist:
            # start from an empty copy, flatten it (presumably _flatten turns
            # off abundance tracking -- confirm against its definition), and
            # re-add only the hash values.
            flattened_mh = ss.minhash.copy_and_clear()
            _flatten(flattened_mh)
            flattened_mh.add_many(ss.minhash.get_mins())

            ss.minhash = flattened_mh

        outlist.extend(siglist)

    sourmash.save_signatures(outlist, fp=args.output)

    notify("loaded {} total that matched ksize & molecule type",
           total_loaded)
    notify("extracted {} signatures from {} file(s)", len(outlist),
           len(args.signatures))
Example #24
0
def sig_import(args):
    """
    import a signature into sourmash format.

    Each input file is read as JSON; the first entry of its 'sketches'
    list supplies the hashes for a new signature.  All signatures are
    written to --output (stdout by default).
    """
    parser = SourmashArgumentParser(prog='sourmash signature import')
    parser.add_argument('filenames', nargs='+')
    parser.add_argument('-q', '--quiet', action='store_true',
                        help='suppress non-error output')
    parser.add_argument('-o', '--output', type=argparse.FileType('wt'),
                        default=sys.stdout,
                        help='output signature to this file')
    args = parser.parse_args(args)
    set_quiet(args.quiet)

    def _convert(path):
        # turn one JSON sketch file into a SourmashSignature
        with open(path) as infp:
            record = json.load(infp)

        ksize, num = record['kmer'], record['sketchSize']

        # only this hash configuration is accepted
        assert record['hashType'] == "MurmurHash3_x64_128"
        assert record['hashBits'] == 64
        assert record['hashSeed'] == 42

        first_sketch = record['sketches'][0]

        minhash = sourmash.MinHash(ksize=ksize, n=num, is_protein=False)
        minhash.add_many(first_sketch['hashes'])
        return sourmash.SourmashSignature(minhash, filename=path)

    imported = [_convert(path) for path in args.filenames]
    sourmash.save_signatures(imported, args.output)
Example #25
0
def extract(args):
    """
    extract signatures.

    Loads signatures from every file in args.signatures, keeps those whose
    md5sum / name contain the --md5 / --name substrings (when given), and
    writes the selection to args.output.  Exits with -1 if nothing matches.
    """
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)
    progress = sourmash_args.SignatureLoadingProgress()

    def _selected(sig):
        # a filter that was not requested (None) lets everything through
        if args.md5 is not None and args.md5 not in sig.md5sum():
            return False
        if args.name is not None and args.name not in sig.name():
            return False
        return True

    outlist = []
    total_loaded = 0
    for filename in args.signatures:
        loaded = list(
            sourmash_args.load_file_as_signatures(filename,
                                                  ksize=args.ksize,
                                                  select_moltype=moltype,
                                                  traverse=True,
                                                  progress=progress))

        # the count covers everything loaded, not just the selection
        total_loaded += len(loaded)
        outlist.extend(sig for sig in loaded if _selected(sig))

    notify("loaded {} total that matched ksize & molecule type", total_loaded)
    if len(outlist) == 0:
        error("no matching signatures!")
        sys.exit(-1)

    with FileOutput(args.output, 'wt') as outfp:
        sourmash.save_signatures(outlist, fp=outfp)

    notify("extracted {} signatures from {} file(s)", len(outlist),
           len(args.signatures))
Example #26
0
def sig_import(args):
    """
    import a signature into sourmash format.

    Reads each input file as JSON, builds a signature from the hashes of
    its first sketch, and saves all signatures to --output (stdout by
    default).
    """
    parser = SourmashArgumentParser(prog='sourmash signature import')
    parser.add_argument('filenames', nargs='+')
    parser.add_argument('-q', '--quiet', action='store_true',
                        help='suppress non-error output')
    parser.add_argument('-o', '--output', type=argparse.FileType('wt'),
                        default=sys.stdout,
                        help='output signature to this file')
    args = parser.parse_args(args)
    set_quiet(args.quiet)

    def _load_one(path):
        # parse one JSON sketch file and wrap its hashes in a signature
        with open(path) as infp:
            doc = json.load(infp)

        ksize = doc['kmer']
        num = doc['sketchSize']

        # reject any hash configuration other than the expected one
        assert doc['hashType'] == "MurmurHash3_x64_128"
        assert doc['hashBits'] == 64
        assert doc['hashSeed'] == 42

        sketch_hashes = doc['sketches'][0]['hashes']

        mh = sourmash.MinHash(ksize=ksize, n=num, is_protein=False)
        mh.add_many(sketch_hashes)
        return sourmash.SourmashSignature(mh, filename=path)

    signatures = []
    for path in args.filenames:
        signatures.append(_load_one(path))

    sourmash.save_signatures(signatures, args.output)
Example #27
0
def sketch(args):
    """Sketch every assembled genome in ./fasta into '<args.name>.sig'.

    Requires '<args.name>.db' and a 'fasta' folder in the current working
    directory; prints a message and exits if either is missing.  One
    signature (n=1000, ksize=31) is built per matching FASTA file, named
    after the file with its directory and suffix removed.
    """
    cwd = os.getcwd()
    db_path = os.path.join(cwd, args.name + '.db')
    # check for the existence of the database and tables
    if not os.path.exists(db_path):
        print(
            "Database file not found. Please make sure the name is correct or run mashpit build."
        )
        exit(0)

    fasta_folder = os.path.join(cwd, 'fasta')
    if not os.path.exists(fasta_folder):
        print("Fasta folder not found.")
        exit(0)

    sig_file_name = args.name + '.sig'

    # NOTE(review): the glob suffix is spelled '_skeasa.fasta' while the old
    # name-stripping code used '_skesa.fasta'; the glob spelling is kept so
    # the same files are selected -- confirm which spelling is intended.
    fasta_suffix = '_skeasa.fasta'
    genomes_list = glob.glob(os.path.join(fasta_folder, '*' + fasta_suffix))

    siglist = []
    for genome in genomes_list:
        mh = MinHash(n=1000, ksize=31)
        for record in screed.open(genome):
            mh.add_sequence(record.sequence, True)

        # str.strip() removes a *set of characters* from both ends, which
        # mangles names; drop the directory and the exact suffix instead.
        # Every globbed path ends with fasta_suffix, so slicing is safe.
        signame = os.path.basename(genome)[:-len(fasta_suffix)]
        siglist.append(SourmashSignature(mh, name=signame))

    with open(sig_file_name, 'w') as f:
        save_signatures(siglist, fp=f)
Example #28
0
def intersect(args):
    """
    intersect one or more signatures by taking the intersection of hashes.

    This function always removes abundances.

    `args` is the list of command-line arguments for
    'sourmash signature intersect'.  The output signature is built from an
    empty copy of the first loaded signature's MinHash and written to
    --output (stdout by default).  Exits with -1 if no signature loads.
    """
    p = SourmashArgumentParser(prog='sourmash signature intersect')
    p.add_argument('signatures', nargs='+')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='output signature to this file')
    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    first_sig = None
    mins = None
    total_loaded = 0

    for sigfile in args.signatures:
        for sigobj in sourmash.load_signatures(sigfile, ksize=args.ksize,
                                               select_moltype=moltype,
                                               do_raise=True):
            if first_sig is None:
                # the first signature seeds the running intersection
                first_sig = sigobj
                mins = set(sigobj.minhash.get_mins())

            mins.intersection_update(sigobj.minhash.get_mins())
            total_loaded += 1
        notify('loaded and intersected signatures from {}...', sigfile, end='\r')

    if total_loaded == 0:
        error("no signatures to merge!?")
        sys.exit(-1)

    # forcibly turn off track_abundance
    intersect_mh = first_sig.minhash.copy_and_clear()
    _flatten(intersect_mh)
    intersect_mh.add_many(mins)
    intersect_sigobj = sourmash.SourmashSignature(intersect_mh)

    sourmash.save_signatures([intersect_sigobj], fp=args.output)

    notify('loaded and intersected {} signatures', total_loaded)
Example #29
0
def rename(args):
    """
    rename one or more signatures.

    `args` is the list of command-line arguments for
    'sourmash signature rename'.  Every signature loaded from the given
    files has its name set to the `name` argument; the results are written
    to --output if given, otherwise to stdout.
    """
    p = SourmashArgumentParser(prog='sourmash signature rename')
    p.add_argument('sigfiles', nargs='+')
    p.add_argument('name')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    p.add_argument('-d', '--debug', action='store_true',
                   help='output debugging output')
    p.add_argument('-o', '--output', help='output to this file')
    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)
    # the second argument controls debug output; it must follow --debug,
    # not --quiet, otherwise -d is ignored and -q switches debugging on.
    set_quiet(args.quiet, args.debug)
    moltype = sourmash_args.calculate_moltype(args)

    outlist = []
    for filename in args.sigfiles:
        debug('loading {}', filename)
        siglist = sourmash.load_signatures(filename, ksize=args.ksize,
                                           select_moltype=moltype)

        for sigobj in siglist:
            sigobj.d['name'] = args.name
            outlist.append(sigobj)

    if args.output:
        # 'with' closes the file even if saving raises
        with open(args.output, 'wt') as fp:
            sourmash.save_signatures(outlist, fp=fp)
    else:
        sourmash.save_signatures(outlist, fp=sys.stdout)

    notify("set name to '{}' on {} signatures", args.name, len(outlist))
Example #30
0
        for i, dataset in enumerate(sbt.leaves()):
            dataset_mins = dataset.data.minhash.get_mins()
            del dataset._data
            query_mins -= set(dataset_mins)
            if not query_mins:
                break

            if i % 100 == 0:
                print(
                    f"Progress: {i} sigs processed, query has {len(query_mins)} hashes left"
                )

    new_mh = query.minhash.copy_and_clear()
    if new_mh.track_abundance:
        new_mh.set_abundances({
            k: v
            for k, v in query.minhash.get_mins(with_abundance=True).items()
            if k in query_mins
        })
    else:
        new_mh.add_many(query_mins)

    query.minhash = new_mh

    output = args.query + ".unassigned"
    if args.output:
        output = args.output

    with open(output, "w") as fp:
        sourmash.save_signatures([query], fp)
def main(args=sys.argv[1:]):
    """Extract contigs beneath the lowest k-mers-per-cDBG-node catlas nodes.

    Steps, as implemented below:
      1. load the catlas and decorate it with shadow sizes;
      2. collect the highest nodes whose k-mer size is below --maxsize,
         then drop those not above --minsize;
      3. rank them by log2(kmer_size) - log2(shadow_size);
      4. walking up from the lowest ratio, keep nodes until their summed
         k-mer sizes exceed --keep-fraction of the root's k-mers;
      5. write every contig whose ID is in the kept nodes' shadow to
         `output`, and a sourmash signature of them to `output` + '.sig'.

    `args` is the list of command-line arguments (default: sys.argv[1:],
    captured when the module is imported).
    """
    p = argparse.ArgumentParser()
    p.add_argument('catlas_prefix', help='catlas prefix')
    p.add_argument('output')
    p.add_argument('--minsize', type=float, default=100)
    p.add_argument('--maxsize', type=float, default=10000)
    p.add_argument('--keep-fraction', type=float, default=0.1)
    p.add_argument('-k',
                   '--ksize',
                   default=31,
                   type=int,
                   help='k-mer size (default: 31)')
    args = p.parse_args(args)

    print('minsize: {:g}'.format(args.minsize))
    print('maxsize: {:g}'.format(args.maxsize))

    # load catlas DAG
    catlas = CAtlas(args.catlas_prefix, load_sizefile=True)
    print('loaded {} nodes from catlas {}'.format(len(catlas), catlas))
    print('loaded {} layer 1 catlas nodes'.format(len(catlas.layer1_to_cdbg)))

    # calculate the cDBG shadow sizes for each catlas node.
    print('decorating catlas with shadow size info.')
    catlas.decorate_with_shadow_sizes()

    # ok, the real work: look at articulation of cDBG graph.

    # find highest nodes with kmer size less than given max_size
    def find_terminal_nodes(node_id, max_size):
        node_list = set()
        for sub_id in catlas.children[node_id]:
            # shadow size
            size = catlas.kmer_sizes[sub_id]

            if size < max_size:
                node_list.add(sub_id)
            else:
                # too big: descend until the children are small enough
                children = find_terminal_nodes(sub_id, max_size)
                node_list.update(children)

        return node_list

    print('finding terminal nodes for {}.'.format(args.maxsize))

    terminal = find_terminal_nodes(catlas.root, args.maxsize)
    print('...got {}'.format(len(terminal)))
    terminal = {n for n in terminal if catlas.kmer_sizes[n] > args.minsize}
    print('...down to {} between {} and {} in size.'.format(
        len(terminal), args.minsize, args.maxsize))

    # now, go through and calculate ratios
    x = []
    for node_id in terminal:
        # calculate: how many k-mers per cDBG node?
        kmer_size = catlas.kmer_sizes[node_id]
        shadow_size = catlas.shadow_sizes[node_id]

        ratio = math.log(kmer_size, 2) - math.log(shadow_size, 2)

        # track basic info
        x.append((ratio, node_id, shadow_size, kmer_size))

    print('terminal node stats for maxsize: {:g}'.format(args.maxsize))
    print('n tnodes:', len(terminal))
    print('total k-mers:', catlas.kmer_sizes[catlas.root])

    x.sort(reverse=True)
    for (k, v, a, b) in x[:10]:
        print('ratio: {:.3f}'.format(2**k), '/ shadow size:', a, '/ kmers:', b)
    # never report a negative count when there are fewer than 20 nodes
    print('... eliding {} nodes'.format(max(len(x) - 20, 0)))
    for (k, v, a, b) in x[-10:]:
        print('ratio: {:.3f}'.format(2**k), '/ shadow size:', a, '/ kmers:', b)

    # keep the last keep-fraction (default 10%) for examination
    keep_sum_kmer = args.keep_fraction * catlas.kmer_sizes[catlas.root]
    sofar = 0
    keep_terminal = set()
    for (k, v, a, b) in reversed(x):
        sofar += b
        if sofar > keep_sum_kmer:
            break
        keep_terminal.add(v)

    print('keeping last {} k-mers worth of nodes for '
          'examination.'.format(sofar))

    # build cDBG shadow ID list.
    cdbg_shadow = catlas.shadow(keep_terminal)
    n_shadow = len(cdbg_shadow)

    # extract contigs
    print('extracting contigs & building a sourmash signature')
    contigs = os.path.join(args.catlas_prefix, 'contigs.fa.gz')

    # track results as signature
    contigs_mh = sourmash.MinHash(n=0, ksize=args.ksize, scaled=1000)

    total_bp = 0
    total_seqs = 0

    # 'with' guarantees the contigs file is flushed and closed
    with open(args.output, 'wt') as outfp:
        for n, record in enumerate(screed.open(contigs)):
            # skip the progress line for an empty shadow (would divide by 0)
            if n and n % 10000 == 0 and n_shadow:
                offset_f = total_seqs / n_shadow
                print('...at n {} ({:.1f}% of shadow)'.format(
                    total_seqs, offset_f * 100),
                      end='\r')

            # contig names == cDBG IDs
            contig_id = int(record.name)
            if contig_id not in cdbg_shadow:
                continue

            outfp.write('>{}\n{}\n'.format(record.name, record.sequence))
            contigs_mh.add_sequence(record.sequence)

            # track retrieved sequences in a minhash
            total_bp += len(record.sequence)
            total_seqs += 1

    # done - got all contigs!
    print('')
    print('fetched {} contigs, {} bp.'.format(total_seqs, total_bp))

    print('wrote contigs to {}'.format(args.output))
    with open(args.output + '.sig', 'wt') as fp:
        ss = sourmash.SourmashSignature(contigs_mh)
        sourmash.save_signatures([ss], fp)
def main(argv):
    """Convert BCALM unitigs into a .gxt graph, a contigs file and metadata.

    Outputs written:
      * `gxt_out`: node count on the first line, then one 'from to' edge
        per line;
      * `contigs_out`: the sequences, one FASTA record per kept unitig
        (opened through bgzf);
      * `contigs_out` + '.info.csv': contig_id, offset, mean_abund, n_kmers;
      * `bcalm_unitigs` + '.sig' and `contigs_out` + '.sig': sourmash
        signatures of the input and output sequences.

    `argv` is the list of command-line arguments.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('bcalm_unitigs')
    parser.add_argument('gxt_out')
    parser.add_argument('contigs_out')
    parser.add_argument('-k', '--ksize', type=int, default=31)
    parser.add_argument('-d', '--debug', action='store_true')
    parser.add_argument('-P', '--pendants', action="store_true",
                        help="don't remove low abundance pendants")
    parser.add_argument('-a', '--abundance', nargs='?', type=float,
                        default=1.1)
    parser.add_argument('--randomize', help='randomize cDBG order')
    args = parser.parse_args(argv)

    k = args.ksize

    trim = not args.pendants
    trim_cutoff = args.abundance
    unitigs = args.bcalm_unitigs
    debug = args.debug

    if args.debug:
        logging.basicConfig(filename='bcalm_to_gxt.log', filemode='w',
                            level=logging.DEBUG)
    else:
        logging.basicConfig(filename='bcalm_to_gxt.log', filemode='w',
                            level=logging.WARNING)

    logging.debug("starting bcalm_to_gxt run.")

    # all output files are opened up front so an unwritable path fails
    # before any work is done; each is closed as soon as it is written.
    gxtfp = open(args.gxt_out, 'wt')
    contigsfp = bgzf.open(args.contigs_out, 'wb')
    info_filename = args.contigs_out + '.info.csv'
    info_fp = open(info_filename, 'wt')
    # sketch with the requested k-mer size rather than a fixed 31, so the
    # signatures agree with the graph when -k is not the default.
    in_mh = sourmash.MinHash(0, k, scaled=1000)
    out_mh = sourmash.MinHash(0, k, scaled=1000)

    # load in the basic graph structure from the BCALM output file
    neighbors, sequences, mean_abunds, sizes = read_bcalm(unitigs, debug, k)

    # record input k-mers in a minhash
    for seq in sequences.values():
        in_mh.add_sequence(seq)

    # make order deterministic by reordering around min value of first, last,
    # and reverse complementing sequences appropriately
    print('reordering...')

    # first, put sequences in specific orientation
    sequence_list = []
    for key in neighbors:
        v = sequences[key]

        # pick lexicographically smaller of forward & reverse complement.
        v2 = screed.rc(v)
        if v > v2:
            v = v2
        sequence_list.append((v, key))
        del sequences[key]

    # sort all sequences:
    # (descending, because the remap loop below pops from the end and so
    # hands out new keys in ascending sequence order)
    sequence_list.sort(reverse=True)
    if args.randomize:
        print('(!! randomizing order per --randomize !!)')
        random.shuffle(sequence_list)

    # ok, now remap all the things.
    remapping = {}
    new_sequences = {}

    # remap sequences
    new_key = 0
    while sequence_list:                  # consume while iterating
        sequence, old_key = sequence_list.pop()
        remapping[old_key] = new_key
        new_sequences[new_key] = sequence
        new_key += 1

    # remap other things
    new_neighbors = collections.defaultdict(set)
    for old_key, vv in neighbors.items():
        new_vv = [remapping[v] for v in vv]
        new_neighbors[remapping[old_key]] = set(new_vv)

    new_mean_abunds = {}
    for old_key, value in mean_abunds.items():
        new_mean_abunds[remapping[old_key]] = value

    new_sizes = {}
    for old_key, value in sizes.items():
        new_sizes[remapping[old_key]] = value

    # every sequence must have been moved into sequence_list above
    assert len(sequences) == 0
    print('...done')

    sequences = new_sequences
    mean_abunds = new_mean_abunds
    sizes = new_sizes
    neighbors = new_neighbors

    # if we are removing pendants, we need to relabel the contigs so they are
    # consecutive integers starting from 0.  If not, we create dummy data
    # structures to make the interface the same elsewhere in the data
    if trim:
        print('removing pendants...')
        non_pendants = set(v for v, N in neighbors.items() if len(N) > 1 or
                           mean_abunds[v] > trim_cutoff)
        # NOTE(review): presumably updates these structures in place; its
        # return value is not used -- confirm against its definition.
        contract_degree_two(non_pendants, neighbors, sequences, mean_abunds,
                            sizes, k)
    else:
        non_pendants = list(neighbors.keys())
    aliases = {x: i for i, x in enumerate(sorted(non_pendants))}
    n = len(aliases)

    # write out sequences & compute offsets
    offsets = {}
    kv_list = sorted(aliases.items(), key=lambda x: x[1])
    for x, i in kv_list:
        offsets[x] = contigsfp.tell()
        contigsfp.write('>{}\n{}\n'.format(i, sequences[x]))
        out_mh.add_sequence(sequences[x])
    contigsfp.close()

    print('... done! {} unitigs'.format(n))

    # start the gxt file by writing the number of nodes (unitigs))
    gxtfp.write('{}\n'.format(n))

    # write out all of the links, in 'from to' format.
    n_edges = 0
    for v, N in sorted(neighbors.items()):
        for u in sorted(N):
            gxtfp.write('{} {}\n'.format(aliases[v], aliases[u]))
            n_edges += 1
    gxtfp.close()

    print('{} vertices, {} edges'.format(n, n_edges))

    info_fp.write('contig_id,offset,mean_abund,n_kmers\n')
    for v, i in aliases.items():
        info_fp.write('{},{},{:.3f},{}\n'.format(i, offsets[v],
                                                 mean_abunds[v],
                                                 sizes[v]))
    info_fp.close()

    # output two sourmash signatures: one for input contigs, one for
    # output contigs.
    in_sig = sourmash.SourmashSignature(in_mh, filename=args.bcalm_unitigs)
    with open(args.bcalm_unitigs + '.sig', 'wt') as sigfp:
        sourmash.save_signatures([in_sig], sigfp)

    out_sig = sourmash.SourmashSignature(out_mh, filename=args.contigs_out)
    with open(args.contigs_out + '.sig', 'wt') as sigfp:
        sourmash.save_signatures([out_sig], sigfp)
def main():
    """Pack sourmash signatures into a zip file, one entry per signature.

    Signature files come from the positional `signatures` arguments and,
    optionally, from the file given by --sig-pathlist (one path per line).
    Each signature is stored as 'signatures/<md5>.sig'; a repeated md5 gets
    a '_<number>' suffix so entry names stay unique.  With --scaled, every
    signature is downsampled to that value; the run exits with -1 if the
    first signature's scaled value is larger than the one requested.

    Returns 0 on success.
    """
    p = argparse.ArgumentParser()
    p.add_argument('zipfile')
    p.add_argument('signatures', nargs='*')
    p.add_argument('--sig-pathlist')
    p.add_argument('--compression', type=int, default=9)
    p.add_argument('--ksize', type=int) # can we accept multiple and write mult sigfiles in one pass?
    p.add_argument('--scaled', type=int)
    p.add_argument('--alphabet')
    args = p.parse_args()

    # --sig-pathlist is optional; skip blank lines so they are not treated
    # as (empty) filenames.
    siglist = []
    if args.sig_pathlist:
        with open(args.sig_pathlist) as pathfp:
            siglist = [path for path in (line.rstrip() for line in pathfp)
                       if path]
    all_sigs = siglist + args.signatures

    # is this still needed? feel like we accept aliases now...
    if args.alphabet == "nucleotide":
        args.alphabet = "DNA"

    n = 0
    all_md5 = set()
    sig_scaled = None
    downsample = False

    # the archive must be closed for its central directory to be written;
    # 'with' guarantees that, including on the sys.exit() path below.
    with zipfile.ZipFile(args.zipfile, 'w') as zf:
        for filename in all_sigs:
            if n % 10000 == 0:
                print(f"... processing {n}th signature; currently reading signatures from '{filename}'")

            for sig in sourmash.load_file_as_signatures(filename, ksize=args.ksize, select_moltype=args.alphabet):
                # zip needs a unique name for each signature. Use sig md5sum.
                md5 = sig.md5sum()
                # if this is a duplicate md5sum, add _{number} to make it unique.
                if md5 in all_md5:
                    sys.stderr.write(f"{str(sig)} has an md5sum identical to one already in the zipfile ({md5})")
                    suffix = 0
                    full_md5 = f"{md5}_{suffix}"
                    while full_md5 in all_md5:
                        suffix += 1
                        full_md5 = f"{md5}_{suffix}"
                    md5 = full_md5
                    sys.stderr.write(f"...adding unique md5 {md5} instead")

                all_md5.add(md5)
                md5_name = 'signatures/' + md5 + '.sig'
                # once, check we can downsample
                if args.scaled and not sig_scaled:
                    sig_scaled = sig.minhash.scaled
                    if args.scaled < sig_scaled:
                        print(f"Can't downsample: desired scaled {args.scaled} is smaller than original scaled, {sig_scaled}. Exiting!")
                        sys.exit(-1)
                    else:
                        downsample = True
                # if need to downsample, do it
                if downsample:
                    sig.minhash = sig.minhash.downsample(scaled=args.scaled)

                sigstr = sourmash.save_signatures([sig], compression=args.compression)
                zf.writestr(md5_name, sigstr)
                n += 1

    print(f"wrote {n} signatures to '{args.zipfile}'")

    return 0
Example #34
0
def main(argv):
    """Convert BCALM unitigs into a .gxt graph, a contigs file and metadata.

    Outputs written:
      * `gxt_out`: node count on the first line, then one 'from to' edge
        per line;
      * `contigs_out`: the sequences, one FASTA record per kept unitig
        (opened through bgzf);
      * `contigs_out` + '.info.csv': contig_id, offset, mean_abund, n_kmers;
      * `bcalm_unitigs` + '.sig' and `contigs_out` + '.sig': sourmash
        signatures of the input and output sequences.

    `argv` is the list of command-line arguments.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('bcalm_unitigs')
    parser.add_argument('gxt_out')
    parser.add_argument('contigs_out')
    parser.add_argument('-k', '--ksize', type=int, default=31)
    parser.add_argument('-d', '--debug', action='store_true')
    parser.add_argument('-P',
                        '--pendants',
                        action="store_true",
                        help="don't remove low abundance pendants")
    parser.add_argument('-a',
                        '--abundance',
                        nargs='?',
                        type=float,
                        default=1.1)
    parser.add_argument('--randomize', help='randomize cDBG order')
    args = parser.parse_args(argv)

    k = args.ksize

    trim = not args.pendants
    trim_cutoff = args.abundance
    unitigs = args.bcalm_unitigs
    debug = args.debug

    if args.debug:
        logging.basicConfig(filename='bcalm_to_gxt.log',
                            filemode='w',
                            level=logging.DEBUG)
    else:
        logging.basicConfig(filename='bcalm_to_gxt.log',
                            filemode='w',
                            level=logging.WARNING)

    logging.debug("starting bcalm_to_gxt run.")

    # all output files are opened up front so an unwritable path fails
    # before any work is done; each is closed as soon as it is written.
    gxtfp = open(args.gxt_out, 'wt')
    contigsfp = bgzf.open(args.contigs_out, 'wb')
    info_filename = args.contigs_out + '.info.csv'
    info_fp = open(info_filename, 'wt')
    # sketch with the requested k-mer size rather than a fixed 31, so the
    # signatures agree with the graph when -k is not the default.
    in_mh = sourmash.MinHash(0, k, scaled=1000)
    out_mh = sourmash.MinHash(0, k, scaled=1000)

    # load in the basic graph structure from the BCALM output file
    neighbors, sequences, mean_abunds, sizes = read_bcalm(unitigs, debug, k)

    # record input k-mers in a minhash
    for seq in sequences.values():
        in_mh.add_sequence(seq)

    # make order deterministic by reordering around min value of first, last,
    # and reverse complementing sequences appropriately
    print('reordering...')

    # first, put sequences in specific orientation
    sequence_list = []
    for key in neighbors:
        v = sequences[key]

        # pick lexicographically smaller of forward & reverse complement.
        v2 = screed.rc(v)
        if v > v2:
            v = v2
        sequence_list.append((v, key))
        del sequences[key]

    # sort all sequences:
    # (descending, because the remap loop below pops from the end and so
    # hands out new keys in ascending sequence order)
    sequence_list.sort(reverse=True)
    if args.randomize:
        print('(!! randomizing order per --randomize !!)')
        random.shuffle(sequence_list)

    # ok, now remap all the things.
    remapping = {}
    new_sequences = {}

    # remap sequences
    new_key = 0
    while sequence_list:  # consume while iterating
        sequence, old_key = sequence_list.pop()
        remapping[old_key] = new_key
        new_sequences[new_key] = sequence
        new_key += 1

    # remap other things
    new_neighbors = collections.defaultdict(set)
    for old_key, vv in neighbors.items():
        new_vv = [remapping[v] for v in vv]
        new_neighbors[remapping[old_key]] = set(new_vv)

    new_mean_abunds = {}
    for old_key, value in mean_abunds.items():
        new_mean_abunds[remapping[old_key]] = value

    new_sizes = {}
    for old_key, value in sizes.items():
        new_sizes[remapping[old_key]] = value

    # every sequence must have been moved into sequence_list above
    assert len(sequences) == 0
    print('...done')

    sequences = new_sequences
    mean_abunds = new_mean_abunds
    sizes = new_sizes
    neighbors = new_neighbors

    # if we are removing pendants, we need to relabel the contigs so they are
    # consecutive integers starting from 0.  If not, we create dummy data
    # structures to make the interface the same elsewhere in the data
    if trim:
        print('removing pendants...')
        non_pendants = set(v for v, N in neighbors.items()
                           if len(N) > 1 or mean_abunds[v] > trim_cutoff)
        # NOTE(review): presumably updates these structures in place; its
        # return value is not used -- confirm against its definition.
        contract_degree_two(non_pendants, neighbors, sequences, mean_abunds,
                            sizes, k)
    else:
        non_pendants = list(neighbors.keys())
    aliases = {x: i for i, x in enumerate(sorted(non_pendants))}
    n = len(aliases)

    # write out sequences & compute offsets
    offsets = {}
    kv_list = sorted(aliases.items(), key=lambda x: x[1])
    for x, i in kv_list:
        offsets[x] = contigsfp.tell()
        contigsfp.write('>{}\n{}\n'.format(i, sequences[x]))
        out_mh.add_sequence(sequences[x])
    contigsfp.close()

    print('... done! {} unitigs'.format(n))

    # start the gxt file by writing the number of nodes (unitigs))
    gxtfp.write('{}\n'.format(n))

    # write out all of the links, in 'from to' format.
    n_edges = 0
    for v, N in sorted(neighbors.items()):
        for u in sorted(N):
            gxtfp.write('{} {}\n'.format(aliases[v], aliases[u]))
            n_edges += 1
    gxtfp.close()

    print('{} vertices, {} edges'.format(n, n_edges))

    info_fp.write('contig_id,offset,mean_abund,n_kmers\n')
    for v, i in aliases.items():
        info_fp.write('{},{},{:.3f},{}\n'.format(i, offsets[v], mean_abunds[v],
                                                 sizes[v]))
    info_fp.close()

    # output two sourmash signatures: one for input contigs, one for
    # output contigs.
    in_sig = sourmash.SourmashSignature(in_mh, filename=args.bcalm_unitigs)
    with open(args.bcalm_unitigs + '.sig', 'wt') as sigfp:
        sourmash.save_signatures([in_sig], sigfp)

    out_sig = sourmash.SourmashSignature(out_mh, filename=args.contigs_out)
    with open(args.contigs_out + '.sig', 'wt') as sigfp:
        sourmash.save_signatures([out_sig], sigfp)
Example #35
0
def main():
    """Build a sourmash signature from a text file of hash values.

    The hash file holds one integer per line; blank lines are ignored.
    --ksize and --output are required.  At most one of --scaled / --num
    may be given; with neither, num is set to the number of distinct
    hashes loaded.

    Returns -1 on an argument error or when no hashes are loaded;
    otherwise falls through after writing the signature to --output.
    """
    p = argparse.ArgumentParser()
    p.add_argument('hashfile') 					# file that contains hashes
    p.add_argument('-o', '--output', default=None,
                   help='file to output signature to')
    p.add_argument('-k', '--ksize', default=None, type=int)
    p.add_argument('--scaled', default=None, type=int)
    p.add_argument('--num', default=None, type=int)
    p.add_argument('--name', default='', help='signature name')
    p.add_argument('--filename', default='',
                   help='filename to add to signature')
    args = p.parse_args()

    # check arguments.
    if args.scaled and args.num:
        error('cannot specify both --num and --scaled! exiting.')
        return -1

    if not args.ksize:
        error('must specify --ksize')
        return -1

    if not args.output:
        error('must specify --output')
        return -1

    # first, load in all the hashes
    hashes = set()
    with open(args.hashfile, 'rt') as hashfp:
        for line in hashfp:
            line = line.strip()
            if not line:
                # tolerate blank lines (e.g. a trailing newline at EOF)
                continue
            hashes.add(int(line))

    if not hashes:
        error("ERROR, no hashes loaded from {}!", args.hashfile)
        return -1

    notify('loaded {} distinct hashes from {}', len(hashes), args.hashfile)

    # now, create the MinHash object that we'll use.
    scaled = 0
    num = 0
    if args.scaled:
        scaled = args.scaled
    elif args.num:
        num = args.num
    else:
        notify('setting --num automatically from the number of hashes.')
        num = len(hashes)

    # construct empty MinHash object according to args
    minhash = MinHash(n=num, ksize=args.ksize, scaled=scaled)

    # add hashes into!
    minhash.add_many(hashes)

    # warn when the MinHash kept fewer hashes than were supplied
    if len(minhash) < len(hashes):
        notify("WARNING: loaded {} hashes, but only {} made it into MinHash.",
               len(hashes), len(minhash))
        if scaled:
            notify("This is probably because of the scaled argument.")
        elif args.num:
            notify("This is probably because your --num is set to {}",
                   args.num)

    if num > len(minhash):
        notify("WARNING: --num set to {}, but only {} hashes in signature.",
               num, len(minhash))

    sigobj = sourmash.SourmashSignature(minhash, name=args.name,
                                        filename=args.filename)

    with open(args.output, 'wt') as fp:
        sourmash.save_signatures([sigobj], fp)
    notify('wrote signature to {}', args.output)
Example #36
0
def downsample(args):
    """
    downsample a scaled signature.

    Parses ``args`` as the command line of 'sourmash signature downsample',
    loads every matching signature from the given files and converts each
    MinHash to the requested ``--num`` or ``--scaled`` value. All resulting
    signatures are saved to ``--output`` (sys.stdout by default).

    Exactly one of --num / --scaled must be given; otherwise an error is
    reported and the process exits with status -1.

    Raises ValueError when a MinHash cannot be converted between the num
    and scaled representations.
    """
    p = SourmashArgumentParser(prog='sourmash signature downsample')
    p.add_argument('signatures', nargs="+")
    p.add_argument('--scaled', type=int, default=0,
                   help='scaled value to downsample to')
    p.add_argument('--num', type=int, default=0,
                   help='num value to downsample to')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='output signature to this file')
    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    if not args.num and not args.scaled:
        error('must specify either --num or --scaled value')
        sys.exit(-1)

    if args.num and args.scaled:
        error('cannot specify both --num and --scaled')
        sys.exit(-1)

    output_list = []
    total_loaded = 0
    for sigfile in args.signatures:
        siglist = sourmash.load_signatures(sigfile, ksize=args.ksize, select_moltype=moltype, do_raise=True)

        for sigobj in siglist:
            mh = sigobj.minhash

            notify('loading and downsampling signature from {}...', sigfile, end='\r')
            total_loaded += 1
            if args.scaled:
                if mh.scaled:
                    mh_new = mh.downsample_scaled(args.scaled)
                else:                         # try to turn a num into a scaled
                    # first check: can we? The largest stored hash must reach
                    # the max hash implied by the requested scaled value.
                    max_hash = get_max_hash_for_scaled(args.scaled)
                    mins = mh.get_mins()
                    if max(mins) < max_hash:
                        raise ValueError("this num MinHash does not have enough hashes to convert it into a scaled MinHash.")

                    mh_new = copy.copy(mh)
                    _set_num_scaled(mh_new, 0, args.scaled)
            elif args.num:
                if mh.num:
                    mh_new = mh.downsample_n(args.num)
                else:                         # try to turn a scaled into a num
                    # first check: can we? We need at least --num hashes.
                    if len(mh) < args.num:
                        # fill in the counts; the message used to be raised
                        # with a literal, unformatted '{}' placeholder.
                        raise ValueError(
                            "this scaled MinHash has only {} hashes; "
                            "cannot convert it to num={}".format(len(mh),
                                                                 args.num))

                    mh_new = copy.copy(mh)
                    _set_num_scaled(mh_new, args.num, 0)

            # replace the MinHash in place and keep the signature for output
            sigobj.minhash = mh_new

            output_list.append(sigobj)

    sourmash.save_signatures(output_list, fp=args.output)

    notify("loaded and downsampled {} signatures", total_loaded)
Example #37
0
def merge(args):
    """
    merge one or more signatures.

    Parses ``args`` as the command line of 'sourmash signature merge', loads
    every matching signature from the given files, merges their MinHashes
    into a single one and saves the merged signature to ``--output``
    (sys.stdout by default).

    With --flatten, the accumulating MinHash is passed through _flatten()
    if it tracks abundance, and the abundance compatibility check against
    the first signature is skipped.

    Exits with status -1 if no signatures were loaded. Any exception raised
    while checking or merging a signature is reported and then re-raised.
    """
    p = SourmashArgumentParser(prog='sourmash signature merge')
    p.add_argument('signatures', nargs='+')
    p.add_argument('-q', '--quiet', action='store_true',
                   help='suppress non-error output')
    p.add_argument('-o', '--output', type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='output signature to this file')
    p.add_argument('--flatten', action='store_true',
                   help='Remove abundances from all signatures.')
    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)

    args = p.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    first_sig = None
    mh = None
    total_loaded = 0

    # iterate over all the sigs from all the files.
    for sigfile in args.signatures:
        notify('loading signatures from {}...', sigfile, end='\r')
        this_n = 0
        for sigobj in sourmash.load_signatures(sigfile, ksize=args.ksize,
                                               select_moltype=moltype,
                                               do_raise=True):
            # first signature? initialize a bunch of stuff
            if first_sig is None:
                first_sig = sigobj
                # start from an empty MinHash with the first one's parameters
                mh = first_sig.minhash.copy_and_clear()

                # forcibly remove abundance?
                if mh.track_abundance and args.flatten:
                    _flatten(mh)

            try:
                if not args.flatten:
                    _check_abundance_compatibility(first_sig, sigobj)

                mh.merge(sigobj.minhash)
            except Exception:
                # Report which signature failed, then propagate the error.
                # Only Exception is caught so that KeyboardInterrupt and
                # SystemExit are not misreported as merge errors.
                error("ERROR when merging signature '{}' ({}) from file {}",
                      sigobj.name(), sigobj.md5sum()[:8], sigfile)
                raise

            this_n += 1
            total_loaded += 1
        if this_n:
            notify('loaded and merged {} signatures from {}...', this_n, sigfile, end='\r')

    if not total_loaded:
        error("no signatures to merge!?")
        sys.exit(-1)

    merged_sigobj = sourmash.SourmashSignature(mh)

    sourmash.save_signatures([merged_sigobj], fp=args.output)

    notify('loaded and merged {} signatures', total_loaded)
Example #38
0
                    out.write(str(hsh) + '\n')
            if len(new_mins) > 0:
                minhash = MinHash(
                    n=0, ksize=ksize, scaled=scaled
                )  # scaled=1 so we keep all (though these were previously at some other scaled val)
                minhash.add_many(set(counts.keys()))
                # write sig to file
                sigobj = sourmash.SourmashSignature(
                    minhash,
                    name=f"aggregated_hashvals_above_{min_count}",
                    filename=f"generated with drop_unique_hashes.py")
                sigobjs += [sigobj]

## Write every signature collected in `sigobjs` to the single file `outsig`.
## NOTE: this part only handles one output file -- it doesn't take care of the
## case with many ksizes/moltypes.
with open(outsig, 'wt') as sigout:
    sourmash.save_signatures(sigobjs, sigout)
    #notify('wrote signature to {}', args.output)

# write out hashes to a text file

# this part is from
# https://github.com/dib-lab/sourmash/blob/7661087aa0b0e81bfec82a58002463d7c699528a/utils/hashvals-to-signature.py

#ksize = int(snakemake.params.get("ksize", 7))
#do some checking here?
#if scaled==0:
#    num=int(snakemake.params.get("num_hashes", 0))
#    if num==0:
#        notify('setting --num automatically from the number of hashes.')
#        num = len(counts.keys()) # can you access keys this was from Counter object?
Example #39
0
def downsample(args):
    """
    downsample a scaled signature.

    Parses ``args`` as the command line of 'sourmash signature downsample',
    loads every matching signature from the given files and converts each
    MinHash to the requested ``--num`` or ``--scaled`` value. All resulting
    signatures are saved to ``--output`` (sys.stdout by default).

    Exactly one of --num / --scaled must be given; otherwise an error is
    reported and the process exits with status -1.

    Raises ValueError when a MinHash cannot be converted between the num
    and scaled representations.
    """
    p = SourmashArgumentParser(prog='sourmash signature downsample')
    p.add_argument('signatures', nargs="+")
    p.add_argument('--scaled',
                   type=int,
                   default=0,
                   help='scaled value to downsample to')
    p.add_argument('--num',
                   type=int,
                   default=0,
                   help='num value to downsample to')
    p.add_argument('-q',
                   '--quiet',
                   action='store_true',
                   help='suppress non-error output')
    p.add_argument('-o',
                   '--output',
                   type=argparse.FileType('wt'),
                   default=sys.stdout,
                   help='output signature to this file')
    sourmash_args.add_ksize_arg(p, DEFAULT_LOAD_K)
    sourmash_args.add_moltype_args(p)
    args = p.parse_args(args)
    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    if not args.num and not args.scaled:
        error('must specify either --num or --scaled value')
        sys.exit(-1)

    if args.num and args.scaled:
        error('cannot specify both --num and --scaled')
        sys.exit(-1)

    output_list = []
    total_loaded = 0
    for sigfile in args.signatures:
        siglist = sourmash.load_signatures(sigfile,
                                           ksize=args.ksize,
                                           select_moltype=moltype,
                                           do_raise=True)

        for sigobj in siglist:
            mh = sigobj.minhash

            notify('loading and downsampling signature from {}...',
                   sigfile,
                   end='\r')
            total_loaded += 1
            if args.scaled:
                if mh.scaled:
                    mh_new = mh.downsample_scaled(args.scaled)
                else:  # try to turn a num into a scaled
                    # first check: can we? The largest stored hash must reach
                    # the max hash implied by the requested scaled value.
                    max_hash = get_max_hash_for_scaled(args.scaled)
                    mins = mh.get_mins()
                    if max(mins) < max_hash:
                        raise ValueError(
                            "this num MinHash does not have enough hashes to convert it into a scaled MinHash."
                        )

                    mh_new = copy.copy(mh)
                    _set_num_scaled(mh_new, 0, args.scaled)
            elif args.num:
                if mh.num:
                    mh_new = mh.downsample_n(args.num)
                else:  # try to turn a scaled into a num
                    # first check: can we? We need at least --num hashes.
                    if len(mh) < args.num:
                        # fill in the counts; the message used to be raised
                        # with a literal, unformatted '{}' placeholder.
                        raise ValueError(
                            "this scaled MinHash has only {} hashes; "
                            "cannot convert it to num={}".format(
                                len(mh), args.num))

                    mh_new = copy.copy(mh)
                    _set_num_scaled(mh_new, args.num, 0)

            # replace the MinHash in place and keep the signature for output
            sigobj.minhash = mh_new

            output_list.append(sigobj)

    sourmash.save_signatures(output_list, fp=args.output)

    notify("loaded and downsampled {} signatures", total_loaded)