Example #1
def print_citation(cls):
    if cls._citation_printed:
        return
    from sourmash.logging import notify
    notify("\n== This is sourmash version {version}. ==",
           version=sourmash.VERSION)
    notify(
        "== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==\n"
    )
    cls._citation_printed = True
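Since print_citation takes cls, it is meant to live on a class that carries a class-level _citation_printed flag. A minimal, self-contained sketch of that context (the class name Citation and the False default are assumptions, not part of the original):

import sourmash
from sourmash.logging import notify

class Citation:
    # class-level flag so the banner is printed at most once (assumed default)
    _citation_printed = False

    @classmethod
    def print_citation(cls):
        if cls._citation_printed:
            return
        notify("\n== This is sourmash version {version}. ==",
               version=sourmash.VERSION)
        notify("== Please cite Brown and Irber (2016), doi:10.21105/joss.00027. ==\n")
        cls._citation_printed = True

Citation.print_citation()   # prints the banner
Citation.print_citation()   # no-op on the second call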
Example #2
def load_databases(filenames, scaled=None, verbose=True):
    "Load multiple LCA databases; return (dblist, ksize, scaled)"
    ksize_vals = set()
    scaled_vals = set()
    dblist = []

    # load all the databases
    for db_name in filenames:
        if verbose:
            notify(u'\r\033[K', end=u'')
            notify('... loading database {}'.format(db_name), end='\r')

        lca_db = LCA_Database.load(db_name)

        ksize_vals.add(lca_db.ksize)
        if len(ksize_vals) > 1:
            raise Exception('multiple ksizes, quitting')

        if scaled and scaled > lca_db.scaled:
            lca_db.downsample_scaled(scaled)
        scaled_vals.add(lca_db.scaled)

        dblist.append(lca_db)

    ksize = ksize_vals.pop()
    scaled = scaled_vals.pop()

    if verbose:
        notify(u'\r\033[K', end=u'')
        notify('loaded {} LCA databases. ksize={}, scaled={}', len(dblist),
               ksize, scaled)

    return dblist, ksize, scaled
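A minimal usage sketch for load_databases; the database file names and the scaled value are placeholders:

dblist, ksize, scaled = load_databases(['db1.lca.json', 'db2.lca.json'],
                                       scaled=10000)
notify('working with {} LCA databases at ksize={}, scaled={}',
       len(dblist), ksize, scaled)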
Example #3
def info(verbose=False):
    notify('sourmash version {}', sourmash.VERSION)
    notify('- loaded from path: {}', os.path.dirname(__file__))
    notify('')

    if verbose:
        import khmer
        notify('khmer version {}', khmer.__version__)
        notify('- loaded from path: {}', os.path.dirname(khmer.__file__))
        notify('')

        notify('screed version {}', screed.__version__)
        notify('- loaded from path: {}', os.path.dirname(screed.__file__))
Example #4
def load_taxonomy_assignments(filename, delimiter=',', start_column=2,
                              use_headers=True, force=False):
    """
    Load a taxonomy assignment spreadsheet into a dictionary.

    The 'assignments' dictionary that's returned maps identifiers to
    lineage tuples.
    """
    mode = 'rt'
    if sys.version_info < (3, ):
        mode = 'rtU'

    # parse spreadsheet!
    fp = open(filename, mode)
    r = csv.reader(fp, delimiter=delimiter)
    row_headers = ['identifiers']
    row_headers += ['_skip_']*(start_column - 2)
    row_headers += list(lca_utils.taxlist())

    # first check that headers are interpretable.
    if use_headers:
        notify('examining spreadsheet headers...')
        first_row = next(iter(r))

        n_disagree = 0
        for (column, value) in zip(row_headers, first_row):
            if column == '_skip_':
                continue

            if column.lower() != value.lower():
                notify("** assuming column '{}' is {} in spreadsheet",
                       value, column)
                n_disagree += 1
                if n_disagree > 2:
                    error('whoa, too many assumptions. are the headers right?')
                    error('expecting {}', ",".join(row_headers))
                    if not force:
                        sys.exit(-1)
                    notify('...continue, because --force was specified.')

    # convert into a lineage pair
    assignments = {}
    num_rows = 0
    n_species = 0
    n_strains = 0
    for row in r:
        if row and row[0].strip():        # want non-empty row
            num_rows += 1
            lineage = list(zip(row_headers, row))
            lineage = [ x for x in lineage if x[0] != '_skip_' ]

            ident = lineage[0][1]
            lineage = lineage[1:]

            # clean lineage of null names, replace with 'unassigned'
            lineage = [ (a, lca_utils.filter_null(b)) for (a,b) in lineage ]
            lineage = [ LineagePair(a, b) for (a, b) in lineage ]

            # remove end nulls
            while lineage and lineage[-1].name == 'unassigned':
                lineage = lineage[:-1]

            # store lineage tuple
            if lineage:
                # check duplicates
                if ident in assignments:
                    if assignments[ident] != tuple(lineage):
                        if not force:
                            raise Exception("multiple lineages for identifier {}".format(ident))
                else:
                    assignments[ident] = tuple(lineage)

                    if lineage[-1].rank == 'species':
                        n_species += 1
                    elif lineage[-1].rank == 'strain':
                        n_species += 1
                        n_strains += 1

    fp.close()

    # this is to guard against a bug that happened once and I can't find
    # any more, when building a large GTDB-based database :) --CTB
    if len(assignments) * 0.2 > n_species and len(assignments) > 50:
        if not force:
            error('')
            error("ERROR: fewer than 20% of lineages have species-level resolution!?")
            error("({} species assignments found, of {} assignments total)",
                  n_species, len(assignments))
            error("** If this is intentional, re-run the command with -f.")
            sys.exit(-1)

    return assignments, num_rows
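A minimal usage sketch; 'taxonomy.csv' is a placeholder for a spreadsheet whose first column holds identifiers and whose remaining columns follow the ranks from lca_utils.taxlist():

assignments, num_rows = load_taxonomy_assignments('taxonomy.csv')
notify('{} distinct identifiers loaded from {} rows', len(assignments), num_rows)

# each value is a tuple of LineagePair(rank, name) objects
ident, lineage = next(iter(assignments.items()))
print(ident, ';'.join(lp.name for lp in lineage))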
Example #5
def index(args):
    """
    main function for building an LCA database.
    """
    if args.start_column < 2:
        error('error, --start-column cannot be less than 2')
        sys.exit(-1)

    set_quiet(args.quiet, args.debug)

    args.scaled = int(args.scaled)

    if args.ksize is None:
        args.ksize = DEFAULT_LOAD_K

    moltype = sourmash_args.calculate_moltype(args, default='DNA')

    notify('Building LCA database with ksize={} scaled={} moltype={}.',
           args.ksize, args.scaled, moltype)

    # first, load taxonomy spreadsheet
    delimiter = ','
    if args.tabs:
        delimiter = '\t'
    assignments, num_rows = load_taxonomy_assignments(args.csv,
                                               delimiter=delimiter,
                                               start_column=args.start_column,
                                               use_headers=not args.no_headers,
                                               force=args.force)

    notify('{} distinct identities in spreadsheet out of {} rows.',
           len(assignments), num_rows)
    notify('{} distinct lineages in spreadsheet out of {} rows.',
           len(set(assignments.values())), num_rows)

    db = LCA_Database(args.ksize, args.scaled, moltype)

#    notify('finding signatures...')
    if args.traverse_directory:
        yield_all_files = False           # only pick up *.sig files?
        if args.force:
            yield_all_files = True
        inp_files = list(sourmash_args.traverse_find_sigs(args.signatures,
                                                          yield_all_files=yield_all_files))
    else:
        inp_files = list(args.signatures)

    # track duplicates
    md5_to_name = {}

    #
    # main loop, connecting lineage ID to signature.
    #

    n = 0
    total_n = len(inp_files)
    record_duplicates = set()
    record_no_lineage = set()
    record_remnants = set(assignments)
    record_used_lineages = set()
    record_used_idents = set()
    n_skipped = 0
    for filename in inp_files:
        n += 1
        for sig in load_signatures(filename, ksize=args.ksize,
                                   select_moltype=moltype):
            notify(u'\r\033[K', end=u'')
            notify('\r... loading signature {} ({} of {}); skipped {} so far', sig.name()[:30], n, total_n, n_skipped, end='')
            debug(filename, sig.name())

            # block off duplicates.
            if sig.md5sum() in md5_to_name:
                debug('WARNING: in file {}, duplicate md5sum: {}; skipping', filename, sig.md5sum())
                record_duplicates.add(filename)
                continue

            md5_to_name[sig.md5sum()] = sig.name()

            # parse identifier, potentially with splitting
            ident = sig.name()
            if args.split_identifiers: # hack for NCBI-style names, etc.
                # split on space...
                ident = ident.split(' ')[0]
                # ...and on period.
                ident = ident.split('.')[0]

            lineage = assignments.get(ident)

            # punt if no lineage and --require-taxonomy
            if lineage is None and args.require_taxonomy:
                debug('(skipping, because --require-taxonomy was specified)')
                n_skipped += 1
                continue

            # add the signature into the database.
            db.insert(sig, ident=ident, lineage=lineage)

            if lineage:
                # remove from our list of remaining ident -> lineage
                record_remnants.remove(ident)

                # track ident as used
                record_used_idents.add(ident)
                record_used_lineages.add(lineage)

            # track lineage info - either no lineage, or this lineage used.
            else:
                debug('WARNING: no lineage assignment for {}.', ident)
                record_no_lineage.add(ident)

    # end main add signatures loop

    if n_skipped:
        notify('... loaded {} signatures; skipped {} because of --require-taxonomy.', total_n, n_skipped)
    else:
        notify('... loaded {} signatures.', total_n)

    # check -- did we find any signatures?
    if n == 0:
        error('ERROR: no signatures found. ??')
        if args.traverse_directory and not args.force:
            error('(note, with --traverse-directory, you may want to use -f)')
        sys.exit(1)

    # check -- did the signatures we found have any hashes?
    if not db.hashval_to_idx:
        error('ERROR: no hash values found - are there any signatures?')
        sys.exit(1)
    notify('loaded {} hashes at ksize={} scaled={}', len(db.hashval_to_idx),
           args.ksize, args.scaled)

    # summarize:
    notify('{} assigned lineages out of {} distinct lineages in spreadsheet.',
           len(record_used_lineages), len(set(assignments.values())))
    unused_lineages = set(assignments.values()) - record_used_lineages

    notify('{} identifiers used out of {} distinct identifiers in spreadsheet.',
           len(record_used_idents), len(set(assignments)))

    assert record_used_idents.issubset(set(assignments))
    unused_identifiers = set(assignments) - record_used_idents

    # now, save!
    db_outfile = args.lca_db_out
    if not (db_outfile.endswith('.lca.json') or \
                db_outfile.endswith('.lca.json.gz')):   # logic -> db.save
        db_outfile += '.lca.json'
    notify('saving to LCA DB: {}'.format(db_outfile))

    db.save(db_outfile)

    ## done!

    # output a record of stuff if requested/available:
    if record_duplicates or record_no_lineage or record_remnants or unused_lineages:
        if record_duplicates:
            notify('WARNING: {} duplicate signatures.', len(record_duplicates))
        if record_no_lineage:
            notify('WARNING: no lineage provided for {} signatures.',
                   len(record_no_lineage))
        if record_remnants:
            notify('WARNING: no signatures for {} spreadsheet rows.',
                   len(record_remnants))
        if unused_lineages:
            notify('WARNING: {} unused lineages.', len(unused_lineages))

        if unused_identifiers:
            notify('WARNING: {} unused identifiers.', len(unused_identifiers))

        if args.report:
            notify("generating a report and saving in '{}'", args.report)
            generate_report(record_duplicates, record_no_lineage,
                            record_remnants, unused_lineages,
                            unused_identifiers, args.report)
        else:
            notify('(You can use --report to generate a detailed report.)')
Example #6
def main():
    p = argparse.ArgumentParser()
    p.add_argument('hashfile') 					# file that contains hashes
    p.add_argument('-o', '--output', default=None,
                   help='file to output signature to')
    p.add_argument('-k', '--ksize', default=None, type=int)
    p.add_argument('--scaled', default=None, type=int)
    p.add_argument('--num', default=None, type=int)
    p.add_argument('--name', default='', help='signature name')
    p.add_argument('--filename', default='',
                   help='filename to add to signature')
    args = p.parse_args()

    # check arguments.
    if args.scaled and args.num:
        error('cannot specify both --num and --scaled! exiting.')
        return -1

    if not args.ksize:
        error('must specify --ksize')
        return -1

    if not args.output:
        error('must specify --output')
        return -1

    # first, load in all the hashes
    hashes = set()
    for line in open(args.hashfile, 'rt'):
        hashval = int(line.strip())
        hashes.add(hashval)

    if not hashes:
        error("ERROR, no hashes loaded from {}!", args.hashfile)
        return -1

    notify('loaded {} distinct hashes from {}', len(hashes), args.hashfile)

    # now, create the MinHash object that we'll use.
    scaled = 0
    num = 0
    if args.scaled:
        scaled = args.scaled
    elif args.num:
        num = args.num
    else:
        notify('setting --num automatically from the number of hashes.')
        num = len(hashes)

    # construct empty MinHash object according to args
    minhash = MinHash(n=num, ksize=args.ksize, scaled=scaled)

    # add hashes into!
    minhash.add_many(hashes)

    if len(minhash) < len(hashes):
        notify("WARNING: loaded {} hashes, but only {} made it into MinHash.",
               len(hashes), len(minhash))
        if scaled:
            notify("This is probably because of the scaled argument.")
        elif args.num:
            notify("This is probably because your --num is set to {}",
                   args.num)

    if num > len(minhash):
        notify("WARNING: --num set to {}, but only {} hashes in signature.",
               num, len(minhash))

    sigobj = sourmash.SourmashSignature(minhash, name=args.name,
                                        filename=args.filename)

    with open(args.output, 'wt') as fp:
        sourmash.save_signatures([sigobj], fp)
    notify('wrote signature to {}', args.output)
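The same signature can also be built directly with the sourmash API used above (MinHash, SourmashSignature, save_signatures); a minimal sketch with placeholder hash values and parameters. Note that with a scaled MinHash, hashes above the scaled cutoff are dropped, which is what the WARNING branch above reports:

import sourmash
from sourmash import MinHash

hashes = {12345678, 98765432, 424242}            # placeholder hash values
minhash = MinHash(n=0, ksize=31, scaled=1000)    # n must be 0 when scaled is used
minhash.add_many(hashes)

sigobj = sourmash.SourmashSignature(minhash, name='from-hashes')
with open('from_hashes.sig', 'wt') as fp:
    sourmash.save_signatures([sigobj], fp)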
Example #7
def info(verbose=False):
    "Report sourmash version + version of installed dependencies."
    notify('sourmash version {}', sourmash.VERSION)
    notify('- loaded from path: {}', os.path.dirname(__file__))
    notify('')

    if verbose:
        notify('khmer version: None (internal Nodegraph)')
        notify('')

        notify('screed version {}', screed.__version__)
        notify('- loaded from path: {}', os.path.dirname(screed.__file__))
Example #8
def compare_all_seqs(
    seqlist1,
    seqlist2=None,
    n_jobs=4,
    ksizes=KSIZES,
    moltype="protein",
    n_background=100,
    paired_seqlists=True,
    intermediate_csv=False,
    intermediate_parquet=False,
    no_final_concatenation=False,
):
    """Compare k-mer content of sequences across k-mer sizes and alphabets

    Parameters
    ----------
    seqlist1 : list
        List of (id, seq) tuples
    seqlist2 : list, optional
        List of (id, seq) tuples. If None, then an all-by-all comparison of
        sequences in seqlist1 is performed, as if seqlist1 was provided as
        seqlist2.
    ksizes : iterable of int
        K-mer sizes to extract and compare the sequences on
    moltype : str
        One of "protein" or "dna" -- for knowing which alphabets to use
    n_background : int
        When paired_seqlists is True, how many random background sequences to
        choose from seqlist2
    n_jobs : int
        Number of jobs for multiprocessing
    paired_seqlists : bool
        If True, then seqlist1 and seqlist2 have sequences at the same index
        that need to be compared, i.e. index 0 across the two. Best used when
        seqlist1 and seqlist2 are lists of homologous protein sequences across
        two different species
    intermediate_parquet : bool
        Write an intermediate file of all comparisons at index i in the
        IO-efficient Parquet format
    intermediate_csv : bool
        Write an intermediate file of all comparisons at index i in
        CSV format

    Returns
    -------
    kmer_comparisons : pandas.DataFrame
        A table of seq1_id, seq2_id, ksize, alphabet encoding, jaccard
        similarity

    Raises
    ------
    ValueError:
        If paired_seqlist=True and seqlist1 and seqlist2 are of different
        lengths, as the comparison is done pairwise across both, as if the
        'zip' operator was used.

    """
    if seqlist2 is not None:
        if paired_seqlists and len(seqlist1) != len(seqlist2):
            raise ValueError(
                "When comparing pairs of sequences, can only "
                "compare two sequences of equal length"
            )
        elif not paired_seqlists:
            # Want seqlist1 to be shorter so that there are fewer, bigger jobs
            # to minimize thread spawning costs
            if len(seqlist2) > len(seqlist1):
                # Swap the seqlist orders so seqlist1 is the shorter one
                old_seqlist1 = seqlist1
                old_seqlist2 = seqlist2
                seqlist2 = old_seqlist1
                seqlist1 = old_seqlist2
    else:
        seqlist2 = seqlist1

    n = len(seqlist1)
    m = len(seqlist2)

    n_comparisons = n * m
    t0 = time.time()
    len_seqlist1 = len(seqlist1)
    notify(f"Number of comparisons: {n} * {m} = {n_comparisons:,}")

    # Initialize the function using func.partial with the common arguments like
    # siglist, ignore_abundance, downsample, for computing all the signatures
    # The only changing parameter that will be mapped from the pool is the
    # index
    func = partial(
        get_comparison_at_index,
        seqlist1=seqlist1,
        seqlist2=seqlist2,
        n_background=n_background,
        ksizes=ksizes,
        moltype=moltype,
        paired_seqlists=paired_seqlists,
        intermediate_csv=intermediate_csv,
        intermediate_parquet=intermediate_parquet,
        no_final_concatenation=no_final_concatenation,
    )
    notify("Created similarity func")

    # Initialize multiprocess.pool
    pool = multiprocessing.Pool(processes=n_jobs)

    # Calculate chunk size, by default pool.imap chunk size is 1
    chunksize, extra = divmod(len_seqlist1, n_jobs)
    if extra:
        chunksize += 1
    notify("Calculated chunk size for multiprocessing")

    # This will not generate the results yet, since pool.imap returns a
    # generator
    result = pool.imap(func, range(len_seqlist1), chunksize=chunksize)
    notify("Initialized multiprocessing pool.imap")

    peptide_kmer_comparisons = pd.concat(itertools.chain(*result), ignore_index=True)

    notify(f"Total time: {time.time() - t0}")
    return peptide_kmer_comparisons
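A minimal usage sketch with tiny made-up protein sequence lists; IDs, sequences, and k-mer sizes are placeholders:

seqlist1 = [('protA_speciesA', 'MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ'),
            ('protB_speciesA', 'MASNTVSAQGGSNRPVRDFSNIQDVAQFLLFDP')]
seqlist2 = [('protA_speciesB', 'MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVA'),
            ('protB_speciesB', 'MASNTVSAQGGSNRPVRDFSNIQDVAQFLLFDK')]

comparisons = compare_all_seqs(seqlist1, seqlist2,
                               n_jobs=2,
                               ksizes=[7, 9],
                               moltype='protein',
                               paired_seqlists=True,   # compare index i of seqlist1 with index i of seqlist2
                               n_background=1)
print(comparisons.head())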
Example #9
def get_comparison_at_index(
    index,
    seqlist1,
    seqlist2=None,
    ksizes=KSIZES,
    n_background=100,
    moltype="protein",
    verbose=False,
    paired_seqlists=True,
    intermediate_csv=False,
    intermediate_parquet=False,
    no_final_concatenation=False,
):
    """Returns similarities of all combinations of seqlist1 seqlist2 at index

    Parameters
    ----------
    index : int
        Index into seqlist1; the sequence at this position is compared
        against seqlist2
    seqlist1 : list
        List of (id, seq) tuples
    seqlist2 : list, optional (default None)
        List of (id, seq) tuples. If None, then an all-by-all comparison of
        sequences in seqlist1 is performed, as if seqlist1 was provided as
        seqlist2.
    ksizes : iterable of int
        K-mer sizes to extract and compare the sequences on
    moltype : str, optional (default "protein")
        One of "protein" or "dna" -- for knowing which alphabets to use
    verbose : boolean, default False
    n_background : int, optional (default 100)
        When paired_seqlists is True, how many random background sequences to
        choose from seqlist2
    paired_seqlists : bool, optional (default True)
        If True, then seqlist1 and seqlist2 have sequences at the same index
        that need to be compared, i.e. index 0 across the two. Best used when
        seqlist1 and seqlist2 are lists of homologous protein sequences across
        two different species
    intermediate_parquet : bool
        Write an intermediate file of all comparisons at index i in the
        IO-efficient Parquet format
    intermediate_csv : bool
        Write an intermediate file of all comparisons at index i in
        CSV format

    Returns
    -------
    comparison_df_list : list
        list of pandas.DataFrame tables for the combinations of seqlist1 at
        index, compared to seqlist2
    """
    startt = time.time()
    id1 = seqlist1[index][0]
    id1_sanitized = sanitize_id(id1)
    csv = id1_sanitized + ".csv"
    parquet = id1_sanitized + ".parquet"
    if os.path.exists(parquet):
        notify(f"Found {parquet} already exists for {id1}, skipping", end="\r")
        return []
    if os.path.exists(csv):
        notify(f"Found {csv} already exists for {id1}, skipping", end="\r")
        return []

    if seqlist2 is not None:
        if paired_seqlists:
            seq_iterator = get_paired_seq_iterator(
                index, n_background, seqlist1, seqlist2, verbose
            )
        else:
            seq_iterator = itertools.product([seqlist1[index]], seqlist2)
    else:
        seq_iterator = itertools.product([seqlist1[index]], seqlist1[index + 1 :])

    func = partial(compare_args_unpack, ksizes=ksizes, moltype=moltype)
    comparison_df_list = list(map(func, seq_iterator))
    notify(
        "comparison for index {} (id: {}) done in {:.5f} seconds",
        index,
        id1,
        time.time() - startt,
        end="\n",
    )

    if intermediate_csv or intermediate_parquet:
        df = pd.concat(comparison_df_list)
        if intermediate_csv:
            df.to_csv(csv)
        if intermediate_parquet:
            df.to_parquet(parquet)
        del df
    if no_final_concatenation:
        del comparison_df_list
        return []
    else:
        return comparison_df_list
Example #10
def abundhist(args):
    """
    output abundance histogram and/or raw abundances.
    """

    set_quiet(args.quiet)
    moltype = sourmash_args.calculate_moltype(args)

    outlist = []
    total_loaded = 0
    for filename in args.signatures:
        siglist = sourmash.load_file_as_signatures(filename,
                                                   ksize=args.ksize,
                                                   select_moltype=moltype)
        siglist = list(siglist)

        total_loaded += len(siglist)

        # select!
        if args.md5 is not None:
            siglist = [ss for ss in siglist if args.md5 in ss.md5sum()]
        if args.name is not None:
            siglist = [ss for ss in siglist if args.name in ss.name()]

        # accumulate the selected signatures across all input files
        outlist.extend(siglist)

    notify("loaded {} total that matched ksize & molecule type", total_loaded)
    if len(outlist) != total_loaded:
        notify("selected {} via name / md5 selectors".format(len(outlist)))
    notify('')

    counts_d = collections.defaultdict(int)
    for ss in outlist:
        for hashval, abund in ss.minhash.hashes.items():
            counts_d[hashval] += abund

    all_counts = list(counts_d.values())

    min_range = 1
    if args.min is not None:
        min_range = args.min
    max_range = max(all_counts)
    if args.max is not None:
        max_range = args.max

    n_bins = args.bins
    if max_range - min_range + 1 < n_bins:
        n_bins = max_range - min_range + 1

    # make hist
    counts, bin_edges = numpy.histogram(all_counts,
                                        range=(min_range, max_range),
                                        bins=n_bins)
    bin_edges = bin_edges.astype(int)

    # plot
    fig = tpl.figure()
    f = fig.barh(counts, [str(x) for x in bin_edges[1:]], force_ascii=True)
    fig.show()

    # output histogram in csv?
    if args.output:
        with FileOutput(args.output, 'wt') as fp:
            w = csv.writer(fp)
            w.writerow(['count', 'n_count'])
            for nc, c in zip(counts, bin_edges[1:]):
                w.writerow([c, nc])

    # output raw counts tagged with hashval?
    if args.abundances:
        with FileOutput(args.abundances, 'wt') as fp:
            w = csv.writer(fp)
            w.writerow(['hashval', 'count'])
            for hashval, count in counts_d.items():
                w.writerow([hashval, count])