def test_build_hashCounter():
    """Check that build_hashCounter tallies hashes across two signatures.

    Two abundance-tracking sketches are filled with (1, 2, 3, 4) and
    (1, 2, 5); hashes 1 and 2 occur in both, so they are expected with a
    count of 2 while 3, 4 and 5 are expected with a count of 1.
    """
    hash_groups = ((1, 2, 3, 4), (1, 2, 5))

    signatures = []
    for hashvals in hash_groups:
        sketch = MinHash(0, 21, scaled=1, track_abundance=True)
        sketch.add_many(hashvals)
        signatures.append(SourmashSignature(sketch))

    expected = Counter({1: 2, 2: 2, 3: 1, 4: 1, 5: 1})

    # Start from an empty Counter, as a caller with no prior tallies would.
    observed = build_hashCounter(signatures, Counter())
    print("Hash Counter: ", observed)
    assert observed == expected
def test_drop_below_mincount_threshold():
    """Check that drop_below_mincount keeps only hashes meeting the threshold.

    The second sketch is given hash 1 twice, and the test expects hash 1 to
    reach a total of 3 across both signatures; with a threshold of 3 it is
    the only hash expected to survive.
    """
    hash_groups = ((1, 2, 3, 4), (1, 1, 2, 5))

    signatures = []
    for hashvals in hash_groups:
        sketch = MinHash(0, 21, scaled=1, track_abundance=True)
        sketch.add_many(hashvals)
        signatures.append(SourmashSignature(sketch))

    tallies = build_hashCounter(signatures, Counter())
    survivors = drop_below_mincount(tallies, 3)

    expected = Counter({1: 3})
    print("kept hashes: ", survivors)
    assert survivors == expected
print(f"{hashval}:{ct}") if ct < min_count: counts.pop(hashval) # write out hashes # let's try building a sig. we will use this sig later to intersect with sample-specific sigs new_mins = set(counts.keys()) print(len(new_mins)) with open(outhashes, "w") as out: for hsh in new_mins: out.write(str(hsh) + '\n') if len(new_mins) > 0: minhash = MinHash( n=0, ksize=ksize, scaled=scaled ) # scaled=1 so we keep all (though these were previously at some other scaled val) minhash.add_many(set(counts.keys())) # write sig to file sigobj = sourmash.SourmashSignature( minhash, name=f"aggregated_hashvals_above_{min_count}", filename=f"generated with drop_unique_hashes.py") sigobjs += [sigobj] ## this part only handles one output file -- doesn't take care of case with many ksizes/moltypes with open(outsig, 'wt') as sigout: sourmash.save_signatures(sigobjs, sigout) #notify('wrote signature to {}', args.output) # write out hashes to a text file # this part is from
def main():
    """Build a sourmash signature from a text file of hash values.

    Command-line arguments (parsed from ``sys.argv`` via argparse):
        hashfile    text file with one integer hash value per line
        -o/--output file to write the signature to (required)
        -k/--ksize  k-mer size passed to the MinHash (required)
        --scaled    scaled value for the MinHash (exclusive with --num)
        --num       num value for the MinHash (exclusive with --scaled)
        --name      name stored on the signature
        --filename  filename stored on the signature

    If neither --scaled nor --num is given, num is set to the number of
    distinct hashes loaded.

    Returns:
        -1 when argument validation fails or no hashes were loaded;
        otherwise None after the signature has been written.

    Raises:
        ValueError: if a non-blank line of the hash file is not an integer.
    """
    p = argparse.ArgumentParser()
    p.add_argument('hashfile')  # file that contains hashes
    p.add_argument('-o', '--output', default=None,
                   help='file to output signature to')
    p.add_argument('-k', '--ksize', default=None, type=int)
    p.add_argument('--scaled', default=None, type=int)
    p.add_argument('--num', default=None, type=int)
    p.add_argument('--name', default='', help='signature name')
    p.add_argument('--filename', default='',
                   help='filename to add to signature')
    args = p.parse_args()

    # check arguments.
    if args.scaled and args.num:
        error('cannot specify both --num and --scaled! exiting.')
        return -1

    if not args.ksize:
        error('must specify --ksize')
        return -1

    if not args.output:
        error('must specify --output')
        return -1

    # first, load in all the hashes
    hashes = set()
    # Use a context manager so the input handle is closed deterministically.
    with open(args.hashfile, 'rt') as hashfp:
        for line in hashfp:
            stripped = line.strip()
            if not stripped:
                # Tolerate blank lines (e.g. a trailing newline at EOF)
                # instead of crashing on int('').
                continue
            hashes.add(int(stripped))

    if not hashes:
        error("ERROR, no hashes loaded from {}!", args.hashfile)
        return -1

    notify('loaded {} distinct hashes from {}', len(hashes), args.hashfile)

    # now, create the MinHash object that we'll use.
    scaled = 0
    num = 0
    if args.scaled:
        scaled = args.scaled
    elif args.num:
        num = args.num
    else:
        notify('setting --num automatically from the number of hashes.')
        num = len(hashes)

    # construct empty MinHash object according to args
    minhash = MinHash(n=num, ksize=args.ksize, scaled=scaled)

    # add hashes into!
    minhash.add_many(hashes)

    # Warn when the sketch retained fewer hashes than were loaded.
    if len(minhash) < len(hashes):
        notify("WARNING: loaded {} hashes, but only {} made it into MinHash.",
               len(hashes), len(minhash))
        if scaled:
            notify("This is probably because of the scaled argument.")
        elif args.num:
            notify("This is probably because your --num is set to {}",
                   args.num)

    if num > len(minhash):
        notify("WARNING: --num set to {}, but only {} hashes in signature.",
               num, len(minhash))

    sigobj = sourmash.SourmashSignature(minhash, name=args.name,
                                        filename=args.filename)

    with open(args.output, 'wt') as fp:
        sourmash.save_signatures([sigobj], fp)
    notify('wrote signature to {}', args.output)