import argparse
import csv

import sourmash


def main():
    p = argparse.ArgumentParser()
    p.add_argument('query_sig')
    p.add_argument('known_sig')
    p.add_argument('unknown_sig')

    p.add_argument("-k", "--ksize", type=int, default=31, help="ksize for analysis")
    p.add_argument("--moltype", default="DNA", help="molecule type for analysis")
    p.add_argument("--scaled", default=None, help="sourmash scaled value for analysis")
    p.add_argument("--report", help="output signature breakdown information in CSV")
    args = p.parse_args()

    ksize = args.ksize
    moltype = args.moltype

    query_sig = sourmash.load_file_as_signatures(args.query_sig,
                                                 ksize=ksize,
                                                 select_moltype=moltype)
    query_sig = list(query_sig)[0]
    known_sig = sourmash.load_file_as_signatures(args.known_sig,
                                                 ksize=ksize,
                                                 select_moltype=moltype)
    known_sig = list(known_sig)[0]
    unknown_sig = sourmash.load_file_as_signatures(args.unknown_sig,
                                                   ksize=ksize,
                                                   select_moltype=moltype)
    unknown_sig = list(unknown_sig)[0]

    query_mh = query_sig.minhash
    known_mh = known_sig.minhash
    unknown_mh = unknown_sig.minhash

    assert query_mh.ksize == known_mh.ksize
    assert query_mh.moltype == known_mh.moltype
    assert known_mh.scaled == unknown_mh.scaled

    query_mh = query_mh.downsample(scaled=known_mh.scaled)

    assert len(query_mh) == len(known_mh) + len(unknown_mh)

    p_known = len(known_mh) / len(query_mh) * 100
    print(f"{len(known_mh)} known hashes of {len(query_mh)} total ({p_known:.1f}% known, {100-p_known:.1f}% unknown).")

    if args.report:
        print(f"reporting stats to '{args.report}'")
        with open(args.report, 'wt') as fp:
            w = csv.writer(fp)
            w.writerow(["total_hashes", "known_hashes", "unknown_hashes",
                        "scaled", "moltype", "ksize"])
            w.writerow([len(query_mh),
                        len(known_mh),
                        len(unknown_mh),
                        query_mh.scaled,
                        query_mh.moltype,
                        query_mh.ksize
                        ])

    return 0
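
The known/unknown arithmetic above assumes the two signatures exactly partition the query (that is what the length assert checks). A minimal sketch of the same check on toy MinHash objects, with invented hash values:

import sourmash

query_mh = sourmash.MinHash(0, 31, scaled=1)  # scaled=1 keeps every hash in this toy example
known_mh = query_mh.copy_and_clear()
unknown_mh = query_mh.copy_and_clear()

for hashval in (1, 2, 3, 4):                  # hypothetical hash values
    query_mh.add_hash(hashval)
for hashval in (1, 2, 3):
    known_mh.add_hash(hashval)
unknown_mh.add_hash(4)

assert len(query_mh) == len(known_mh) + len(unknown_mh)
print(f"{len(known_mh) / len(query_mh) * 100:.1f}% known")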
Example #2
    def test_script(self):
        # excerpted from a unittest.TestCase; needs `import subprocess` and
        # `from sourmash import load_file_as_signatures` at module level.
        subprocess.run('mashpit sketch test', shell=True, check=True)
        sig_dict_expected = {}
        sig_expected = load_file_as_signatures('expected_test.sig')
        for sig in sig_expected:
            sig_dict_expected[str(sig)] = str(sig.md5sum())

        sig_dict_generated = {}
        sig_generated = load_file_as_signatures('test.sig')
        for sig in sig_generated:
            sig_dict_generated[str(sig)] = str(sig.md5sum())

        self.assertDictEqual(dict(sorted(sig_dict_expected.items())),
                             dict(sorted(sig_dict_generated.items())))
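
The md5 comparison generalizes to a small helper; a sketch (the helper name is mine, not mashpit's):

from sourmash import load_file_as_signatures

def sig_fingerprints(path):
    # map each signature's display name to its md5sum
    return {str(sig): sig.md5sum() for sig in load_file_as_signatures(path)}

With it, the test body reduces to asserting sig_fingerprints('expected_test.sig') == sig_fingerprints('test.sig'); dict equality ignores ordering, so the sorted() calls above are not strictly required.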
Example #3
import argparse

import screed
import sourmash


def main():
    p = argparse.ArgumentParser()
    p.add_argument('contigs')   # a genome assembly
    p.add_argument('read_sig')  # a sourmash signature, with abundances, for the reads
    p.add_argument('-o', '--output', required=True)
    args = p.parse_args()

    siglist = sourmash.load_file_as_signatures(args.read_sig)
    siglist = list(siglist)
    assert len(siglist) == 1
    sig = siglist[0]

    contigs_mh = sig.minhash.copy_and_clear()
    for record in screed.open(args.contigs):
        contigs_mh.add_sequence(record.sequence, force=True)

    # intersect the genome assembly with the read abundances
    # so now we get the abundances of only the k-mers that are in the
    # assembly.
    abunds = {}
    for hashval in contigs_mh.hashes:
        abunds[hashval] = sig.minhash.hashes.get(hashval, 0)

    output_mh = sig.minhash.copy_and_clear()
    output_mh.set_abundances(abunds)

    out_sig = sourmash.SourmashSignature(output_mh)
    with open(args.output, 'wt') as fp:
        print(f"Saving output to '{args.output}'")
        sourmash.save_signatures([out_sig], fp)
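
copy_and_clear() keeps the ksize/scaled/abundance settings but drops the hashes, which is what makes the intersect-then-set_abundances pattern above work. A toy round trip, with invented hash values:

import sourmash

mh = sourmash.MinHash(0, 31, scaled=1, track_abundance=True)
mh.set_abundances({10: 5, 20: 3})   # hypothetical hashval -> abundance

subset = mh.copy_and_clear()        # same parameters, no hashes
subset.set_abundances({10: mh.hashes[10]})
assert dict(subset.hashes) == {10: 5}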
Example #4
def test_smash_sig():
    # run 'smash_reads'; needs `import os, shutil`, `import sourmash`, and the
    # project-local `utils` / `run_snakemake` test helpers.
    global _tempdir

    abundtrim_dir = os.path.join(_tempdir, "abundtrim")
    os.mkdir(abundtrim_dir)

    conf = utils.relative_file('tests/test-data/SRR5950647_subset.conf')
    src = utils.relative_file("tests/test-data/SRR5950647_subset.abundtrim.fq.gz")
    shutil.copy(src, abundtrim_dir)

    extra_args = ["smash_reads"]
    status = run_snakemake(
        conf,
        verbose=True,
        outdir=_tempdir,
        extra_args=extra_args,
    )
    assert status == 0

    output_sig = f"{_tempdir}/sigs/SRR5950647_subset.abundtrim.sig.zip"
    assert os.path.exists(output_sig)
    sigs = list(sourmash.load_file_as_signatures(output_sig))
    assert len(sigs) == 3
    for s in sigs:
        assert s.minhash.track_abundance
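
A .sig.zip collection loads the same way as a single .sig file; a minimal sketch, assuming a zip like the one produced above:

import sourmash

for sig in sourmash.load_file_as_signatures("SRR5950647_subset.abundtrim.sig.zip"):
    print(str(sig), sig.minhash.ksize, sig.minhash.track_abundance)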
Example #5
import argparse
import sys
import zipfile

import sourmash


def main():
    p = argparse.ArgumentParser()
    p.add_argument('zipfile')
    p.add_argument('signatures', nargs='*')
    p.add_argument('--sig-pathlist')
    p.add_argument('--compression', type=int, default=9)
    p.add_argument('--ksize', type=int) # can we accept multiple and write mult sigfiles in one pass?
    p.add_argument('--scaled', type=int)
    p.add_argument('--alphabet')
    args = p.parse_args()

    zf = zipfile.ZipFile(args.zipfile, 'w')

    siglist = []
    if args.sig_pathlist:
        siglist = [x.rstrip() for x in open(args.sig_pathlist)]
    all_sigs = siglist + args.signatures

    # is this still needed? feel like we accept aliases now...
    if args.alphabet == "nucleotide":
        args.alphabet = "DNA"

    n = 0
    all_md5 = set()
    sig_scaled = None
    downsample = False
    for filename in all_sigs:
        if n % 10000 == 0:
            print(f"... processing signature {n}; currently reading signatures from '{filename}'")

        for sig in sourmash.load_file_as_signatures(filename, ksize=args.ksize, select_moltype=args.alphabet):
            # zip needs a unique name for each signature. Use sig md5sum.
            md5 = sig.md5sum()
            # if this is a duplicate md5sum, add _{number} to make it unique.
            if md5 in all_md5:
                sys.stderr.write(f"{str(sig)} has an md5sum identical to one already in the zipfile ({md5})\n")
                suffix = 0
                full_md5 = f"{md5}_{suffix}"
                while full_md5 in all_md5:
                    suffix += 1
                    full_md5 = f"{md5}_{suffix}"
                md5 = full_md5
                sys.stderr.write(f"...adding unique md5 {md5} instead\n")

            all_md5.add(md5)
            md5_name = 'signatures/' + md5 + '.sig'
            # once, check we can downsample
            if args.scaled and not sig_scaled:
                sig_scaled = sig.minhash.scaled
                if args.scaled < sig_scaled:
                    print(f"Can't downsample: desired scaled {args.scaled} is smaller than original scaled, {sig_scaled}. Exiting!")
                    sys.exit(-1)
                else:
                    downsample=True
            # if need to downsample, do it
            if downsample:
                sig.minhash = sig.minhash.downsample(scaled=args.scaled)

            sigstr = sourmash.save_signatures([sig], compression=args.compression)
            zf.writestr(md5_name, sigstr)
            n += 1

    zf.close()
    print(f"wrote {n} signatures to '{args.zipfile}'")

    return 0
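
To spot-check the result, the zipfile can be loaded back as a collection with the same load_file_as_signatures call used throughout these examples; a sketch with a hypothetical output filename:

import sourmash

sigs = list(sourmash.load_file_as_signatures("all-sigs.zip"))
md5s = [s.md5sum() for s in sigs]
print(len(sigs), "signatures,", len(set(md5s)), "distinct md5sums")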
Example #6
import collections
import csv

import numpy
import termplotlib as tpl

import sourmash
from sourmash import sourmash_args
from sourmash.logging import notify, set_quiet
from sourmash.sourmash_args import FileOutput


def abundhist(args):
    """
    output abundance histogram and/or raw abundances.
    """

    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    outlist = []
    total_loaded = 0
    for filename in args.signatures:
        siglist = sourmash.load_file_as_signatures(filename,
                                                   ksize=args.ksize,
                                                   select_moltype=moltype)
        siglist = list(siglist)

        total_loaded += len(siglist)

        # select!
        if args.md5 is not None:
            siglist = [ss for ss in siglist if args.md5 in ss.md5sum()]
        if args.name is not None:
            siglist = [ss for ss in siglist if args.name in ss.name()]

        # accumulate the selected signatures across all input files
        outlist.extend(siglist)

    notify("loaded {} total that matched ksize & molecule type", total_loaded)
    if len(outlist) != total_loaded:
        notify("selected {} via name / md5 selectors".format(len(outlist)))
    notify('')

    counts_d = collections.defaultdict(int)
    for ss in outlist:
        for hashval, abund in ss.minhash.hashes.items():
            counts_d[hashval] += abund

    all_counts = list(counts_d.values())

    min_range = 1
    if args.min is not None:
        min_range = args.min
    max_range = max(all_counts)
    if args.max is not None:
        max_range = args.max

    n_bins = args.bins
    if max_range - min_range + 1 < n_bins:
        n_bins = max_range - min_range + 1

    # make hist
    counts, bin_edges = numpy.histogram(all_counts,
                                        range=(min_range, max_range),
                                        bins=n_bins)
    bin_edges = bin_edges.astype(int)

    # plot
    fig = tpl.figure()
    f = fig.barh(counts, [str(x) for x in bin_edges[1:]], force_ascii=True)
    fig.show()

    # output histogram in csv?
    if args.output:
        with FileOutput(args.output, 'wt') as fp:
            w = csv.writer(fp)
            w.writerow(['count', 'n_count'])
            for nc, c in zip(counts, bin_edges[1:]):
                w.writerow([c, nc])

    # output raw counts tagged with hashval?
    if args.abundances:
        with FileOutput(args.abundances, 'wt') as fp:
            w = csv.writer(fp)
            w.writerow(['hashval', 'count'])
            for hashval, count in counts_d.items():
                w.writerow([hashval, count])
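
At its core, abundhist just sums per-hash abundances and bins them; a toy version with invented hashes and abundances:

import collections
import numpy

abunds = {10: 1, 20: 1, 30: 4, 40: 9}   # hypothetical hashval -> abundance
counts_d = collections.defaultdict(int)
for hashval, abund in abunds.items():
    counts_d[hashval] += abund

all_counts = list(counts_d.values())
counts, bin_edges = numpy.histogram(all_counts, range=(1, max(all_counts)), bins=3)
print(counts, bin_edges.astype(int))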
Example #7
import multiprocessing
import ntpath
import os
from multiprocessing import Process
from operator import itemgetter

import pandas as pd
from sourmash import load_file_as_signatures, load_one_signature

# create_connection, get_target_sig, and calculate_similarity are project-local
# helpers (calculate_similarity is shown in Example #8).


def query(args):
    sample_path = args.sample
    sample_name = ntpath.basename(sample_path)
    cwd = os.getcwd()
    db_path = os.path.join(cwd, args.database + '.db')
    database_sig_path = os.path.join(cwd, args.database + '.sig')
    target_sig_path = os.path.join(cwd, sample_path + '.sig')

    # check that the database and its signature file exist
    if not os.path.exists(db_path):
        print("Database not found.")
        exit(1)
    if not os.path.exists(database_sig_path):
        print("Database signature file not found.")
        exit(1)

    conn = create_connection(db_path)
    c = conn.cursor()
    # sketch the query sample and load the signature
    get_target_sig(sample_path)
    target_sig = load_one_signature(target_sig_path)

    # manager dict: a shared variable for multiprocessing but slow in iteration
    manager = multiprocessing.Manager()
    srr_similarity_manager_dict = manager.dict()
    # check if the signature file has been split (need a more elegant way)
    if os.path.exists(args.database + '_1.sig'):
        proc_list = []
        for i in range(1, args.number):
            proc = Process(target=calculate_similarity,
                           args=(i, srr_similarity_manager_dict, target_sig,
                                 args.database))
            proc.start()
            proc_list.append(proc)
        for i in proc_list:
            i.join()
    else:
        database_sig = load_file_as_signatures(database_sig_path)
        for sig in database_sig:
            similarity = target_sig.jaccard(sig)
            srr_similarity_manager_dict[str(sig)] = similarity

    srr_similarity_dict = {}
    srr_similarity_dict.update(srr_similarity_manager_dict)

    # get the top 50 results
    res_srr_similarity_dict = dict(
        sorted(srr_similarity_dict.items(), key=itemgetter(1),
               reverse=True)[:50])
    c.execute('SELECT * FROM METADATA')
    output_df = pd.DataFrame([])
    names = [description[0] for description in c.description]
    for i in res_srr_similarity_dict:
        # parameterized query avoids SQL injection via the signature name
        sql_query = pd.read_sql_query(
            "select * from METADATA where srr = ?", conn, params=(str(i),))
        df_query = pd.DataFrame(sql_query, columns=names)
        df_query['similarity_score'] = res_srr_similarity_dict[i]
        output_df = pd.concat([output_df, df_query], ignore_index=True)

    # if it is a standard database, add the link of the snp cluster to the output
    c.execute("SELECT value FROM DESC where name = 'Type';")
    db_type = c.fetchone()[0]
    if db_type == 'Standard':
        pds_list = output_df['PDS_acc'].to_list()
        cluster_link = []
        for pds in pds_list:
            cluster_link.append(
                'https://www.ncbi.nlm.nih.gov/pathogens/isolates/#' + pds)
        output_df['link'] = cluster_link
    print(output_df)
    output_df.to_csv(sample_name + '_output.csv', index=True)
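
Sorting the whole similarity dict only to keep 50 entries can be done more directly with heapq.nlargest; a sketch of the equivalent selection, with made-up scores:

import heapq
from operator import itemgetter

srr_similarity_dict = {"SRR001": 0.91, "SRR002": 0.12}   # hypothetical scores
top50 = dict(heapq.nlargest(50, srr_similarity_dict.items(), key=itemgetter(1)))
print(top50)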
Example #8
def calculate_similarity(i, similarity_dict, target_sig, database):
    # needs: from sourmash import load_file_as_signatures
    database_sig = load_file_as_signatures(database + '_' + str(i) + '.sig')
    for sig in database_sig:
        similarity = target_sig.jaccard(sig)
        similarity_dict[str(sig)] = similarity
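
Because similarity_dict is only ever written to, the same function works with a plain dict for single-process debugging; a sketch with hypothetical paths:

from sourmash import load_one_signature

target_sig = load_one_signature('sample.fasta.sig')    # hypothetical query signature
scores = {}                                            # plain dict instead of a Manager dict
calculate_similarity(1, scores, target_sig, 'mydb')    # reads 'mydb_1.sig'
print(sorted(scores.items(), key=lambda kv: -kv[1])[:5])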