Exemple #1
0
def addRandom(oPrefix, sequence_names, klist,
              strand_preserved = False, overwrite = False, threads = 1):
    """Add chance of random match to a HDF5 sketch DB

    Nothing is calculated (a message is written to stderr instead) when
    fewer than three sequence names are given, or when the DB already holds
    a 'random' entry and ``overwrite`` is not set.

    Args:
        oPrefix (str)
            Sketch database prefix
        sequence_names (list)
            Names of sequences to include in calculation
        klist (list)
            List of k-mer sizes to sketch
        strand_preserved (bool)
            Set true to ignore rc k-mers
        overwrite (bool)
            Set true to overwrite existing random match chances
        threads (int)
            Number of threads to use (default = 1)
    """
    if len(sequence_names) <= 2:
        sys.stderr.write("Cannot add random match chances with this few genomes\n")
        return

    dbname = oPrefix + "/" + os.path.basename(oPrefix)

    # The context manager closes the handle on every path (including the
    # early return), and before pp_sketchlib.addRandom is called on the
    # same database below.
    with h5py.File(dbname + ".h5", 'r+') as hdf_in:
        if 'random' in hdf_in:
            if not overwrite:
                sys.stderr.write("Using existing random match chances in DB\n")
                return
            del hdf_in['random']

    pp_sketchlib.addRandom(dbname,
                           sequence_names,
                           klist,
                           not strand_preserved,
                           threads)
Exemple #2
0
def constructDatabase(assemblyList,
                      klist,
                      sketch_size,
                      oPrefix,
                      threads,
                      overwrite,
                      strand_preserved,
                      min_count,
                      use_exact,
                      qc_dict,
                      calc_random=True,
                      codon_phased=False,
                      use_gpu=False,
                      deviceid=0):
    """Sketch the input assemblies at the requested k-mer lengths

    Reads sample names and sequence files from ``assemblyList``, sketches
    them into ``<oPrefix>/<basename of oPrefix>.h5`` with
    ``pp_sketchlib.constructDatabase``, optionally runs QC on the result and
    optionally adds random match chances for the samples that remain.

    Args:
        assemblyList (str)
            File with locations of assembly files to be sketched
        klist (list)
            List of k-mer sizes to sketch
        sketch_size (int)
            Size of sketch
        oPrefix (str)
            Output prefix for resulting sketch files
        threads (int)
            Number of threads to use
        overwrite (bool)
            Whether to overwrite the sketch DB, if it already exists.
        strand_preserved (bool)
            Ignore reverse complement k-mers
        min_count (int)
            Minimum count of k-mer in reads to include
        use_exact (bool)
            Use exact count of k-mer appearance in reads
        qc_dict (dict)
            Dict containing QC settings. QC is run only if
            ``qc_dict['run_qc']`` is true.
        calc_random (bool)
            Add random match chances to DB (turn off for queries)
            (default = True)
        codon_phased (bool)
            Use codon phased seeds
            (default = False)
        use_gpu (bool)
            Use GPU for read sketching
            (default = False)
        deviceid (int)
            GPU device id
            (default = 0)

    Returns:
        filtered_names (list)
            Sample names returned by QC, or all names read from
            ``assemblyList`` if QC was not run
    """
    # read file names
    names, sequences = readRfile(assemblyList)

    # remove an existing database file if overwriting was requested
    dbname = oPrefix + "/" + os.path.basename(oPrefix)
    dbfilename = dbname + ".h5"
    if overwrite and os.path.isfile(dbfilename):
        sys.stderr.write("Overwriting db: " + dbfilename + "\n")
        os.remove(dbfilename)

    # generate sketches
    # NOTE(review): the literal False sits in the positional slot after
    # codon_phased; presumably it is sketchlib's own random-match switch,
    # disabled here so that random chances are only added after QC below
    # -- confirm against the pp_sketchlib signature.
    pp_sketchlib.constructDatabase(dbname, names, sequences, klist,
                                   sketch_size, codon_phased, False,
                                   not strand_preserved, min_count, use_exact,
                                   threads, use_gpu, deviceid)

    # QC sequences
    if qc_dict['run_qc']:
        filtered_names = sketchlib_assembly_qc(oPrefix, klist, qc_dict,
                                               strand_preserved, threads)
    else:
        filtered_names = names

    # Add random matches if required
    # (typically on for reference, off for query)
    if calc_random:
        # Same minimum genome count as the other callers of
        # pp_sketchlib.addRandom in this code base (more than two genomes)
        if len(filtered_names) > 2:
            pp_sketchlib.addRandom(dbname, filtered_names, klist,
                                   not strand_preserved, threads)
        else:
            sys.stderr.write("Cannot add random match chances with this few genomes\n")

    # return filtered file names
    return filtered_names
Exemple #3
0
def joinDBs(db1, db2, output, update_random = None):
    """Join two sketch databases with the low-level HDF5 copy interface

    The joined database is first written to ``<output prefix>.tmp.h5`` and
    then moved over ``<output prefix>.h5``. If a ``RuntimeError`` is raised
    while joining, an error is written to stderr and the process exits with
    status 1.

    Args:
        db1 (str)
            Prefix for db1
        db2 (str)
            Prefix for db2
        output (str)
            Prefix for joined output
        update_random (dict)
            Whether to re-calculate the random object. If None (default),
            the 'random' entry of db1 is copied when present. Any other
            value triggers a re-calculation, which is only run if the joined
            DB holds more than two sketches. A dict may contain the control
            arguments strand_preserved and threads (see :func:`addRandom`)
    """
    join_prefix = output + "/" + os.path.basename(output)
    tmp_prefix = join_prefix + ".tmp" # add .tmp in case the joined file exists
    db1_name = db1 + "/" + os.path.basename(db1) + ".h5"
    db2_name = db2 + "/" + os.path.basename(db2) + ".h5"

    # Settings for re-calculating random matches (only used if requested)
    threads = 1
    strand_preserved = False
    if isinstance(update_random, dict):
        threads = update_random.get("threads", threads)
        strand_preserved = update_random.get("strand_preserved",
                                             strand_preserved)

    sequence_names = []
    kmer_size = None
    try:
        # All three handles are closed when this block is left, whether
        # normally or through an exception
        with h5py.File(db1_name, 'r') as hdf1, \
             h5py.File(db2_name, 'r') as hdf2, \
             h5py.File(tmp_prefix + ".h5", 'w') as hdf_join:
            # Can only copy into new group, so for second file these are
            # appended one at a time
            hdf1.copy('sketches', hdf_join)

            join_grp = hdf_join['sketches']
            read_grp = hdf2['sketches']
            for dataset in read_grp:
                join_grp.copy(read_grp[dataset], dataset)

            # Copy or update random matches
            if update_random is not None:
                sequence_names = list(join_grp.keys())
                if len(sequence_names) > 2:
                    # Read while the file is still open; used after closing
                    kmer_size = join_grp[sequence_names[0]].attrs['kmers']
            elif 'random' in hdf1:
                hdf1.copy('random', hdf_join)

        # Joined file needs to be closed before adding random
        if update_random is not None and len(sequence_names) > 2:
            sys.stderr.write("Updating random match chances\n")
            pp_sketchlib.addRandom(tmp_prefix,
                                   sequence_names,
                                   kmer_size,
                                   not strand_preserved,
                                   threads)

    except RuntimeError as e:
        sys.stderr.write("ERROR: " + str(e) + "\n")
        sys.stderr.write("Joining sketches failed, try running without --update-db\n")
        sys.exit(1)

    # Move results to correct location; os.replace overwrites an existing
    # target on every platform
    os.replace(tmp_prefix + ".h5", join_prefix + ".h5")
def _run_sketch(args, kmers):
    """Create a database by sketching the sequences listed in args.rfile."""
    names = []
    sequences = []

    # Plain text mode: the 'U' flag was removed in Python 3.11 and universal
    # newlines are already the default
    with open(args.rfile, 'r') as refFile:
        for refLine in refFile:
            refLine = refLine.rstrip()
            if not refLine:
                # A blank line would otherwise become a sample named ""
                continue
            refFields = refLine.split("\t")
            names.append(refFields[0])
            sequences.append(list(refFields[1:]))

    if len(set(names)) != len(names):
        sys.stderr.write(
            "Input contains duplicate names! All names must be unique\n")
        sys.exit(1)

    # NOTE(review): the sketch size is passed on divided by 64 -- presumably
    # a count of 64-bit words; confirm against pp_sketchlib
    pp_sketchlib.constructDatabase(args.ref_db, names, sequences, kmers,
                                   int(round(args.sketch_size / 64)),
                                   args.codon_phased, not args.no_random,
                                   args.strand, args.min_count,
                                   args.exact_counter, args.cpus,
                                   args.use_gpu, args.gpu_id)


def _run_join(args):
    """Join the sketches of args.ref_db and args.query_db into args.output."""
    join_name = args.output + ".h5"
    tmp_name = join_name + ".tmp"  # add .tmp in case join_name exists

    # Input handles are closed on every exit from this block
    with h5py.File(args.ref_db + ".h5", 'r') as hdf1, \
         h5py.File(args.query_db + ".h5", 'r') as hdf2:
        try:
            v1 = hdf1['sketches'].attrs['sketch_version']
            v2 = hdf2['sketches'].attrs['sketch_version']
            if v1 != v2:
                sys.stderr.write(
                    "Databases have been written with different sketch versions, "
                    "joining not recommended (but proceeding anyway)\n")
            p1 = hdf1['sketches'].attrs['codon_phased']
            p2 = hdf2['sketches'].attrs['codon_phased']
            if p1 != p2:
                sys.stderr.write(
                    "One database uses codon-phased seeds - cannot join "
                    "with a standard seed database\n")
                # The message says the join cannot be done, so stop here
                sys.exit(1)
        except (KeyError, RuntimeError):
            # A missing attribute is expected to surface as KeyError (h5py)
            sys.stderr.write("Unable to check sketch version\n")

        # Can only copy into new group, so for second file these are
        # appended one at a time
        try:
            with h5py.File(tmp_name, 'w') as hdf_join:
                hdf1.copy('sketches', hdf_join)
                join_grp = hdf_join['sketches']
                read_grp = hdf2['sketches']
                for dataset in read_grp:
                    join_grp.copy(read_grp[dataset], dataset)

                if 'random' in hdf1 or 'random' in hdf2:
                    sys.stderr.write(
                        "Random matches found in one database, which will not be copied\n"
                        "Use --add-random to recalculate for the joined DB\n")
        except RuntimeError as e:
            sys.stderr.write("ERROR: " + str(e) + "\n")
            sys.stderr.write("Joining sketches failed\n")
            sys.exit(1)

    # os.replace overwrites an existing target on every platform
    os.replace(tmp_name, join_name)


def _run_query(args, kmers):
    """Calculate distances between args.ref_db and args.query_db."""
    rList = getSampleNames(args.ref_db)
    qList = getSampleNames(args.query_db)

    if args.subset is not None:
        with open(args.subset, 'r') as subset_file:
            subset = {line.rstrip().split("\t")[0] for line in subset_file}
        # Filter in place of a set intersection so that the order of the
        # samples in the database is kept
        rList = [name for name in rList if name in subset]
        qList = [name for name in qList if name in subset]
        if not rList or not qList:
            sys.stderr.write("Subset has removed all samples\n")
            sys.exit(1)

    # Check inputs overlap: k-mer lengths shared by the first sample of each
    # database
    with h5py.File(args.ref_db + ".h5", 'r') as ref, \
         h5py.File(args.query_db + ".h5", 'r') as query:
        db_kmers = set(ref['sketches/' + rList[0]].attrs['kmers']).intersection(
            query['sketches/' + qList[0]].attrs['kmers'])

    if args.read_k:
        query_kmers = sorted(db_kmers)
    else:
        query_kmers = sorted(set(kmers).intersection(db_kmers))
        if len(query_kmers) == 0:
            sys.stderr.write("No requested k-mer lengths found in DB\n")
            sys.exit(1)
        elif len(query_kmers) < len(kmers):
            # Compare against the requested k-mers, not the list itself
            sys.stderr.write(
                "Some requested k-mer lengths not found in DB\n")

    if args.sparse:
        sparseIdx = pp_sketchlib.queryDatabaseSparse(
            args.ref_db, args.query_db, rList, qList, query_kmers,
            not args.no_correction, args.threshold, args.kNN,
            not args.accessory, args.cpus, args.use_gpu, args.gpu_id)
        if args.print:
            distName = 'Accessory' if args.accessory else 'Core'
            sys.stdout.write("\t".join(['Query', 'Reference', distName]) +
                             "\n")

            # NOTE(review): the header says Query, Reference but each row is
            # written reference-first, unlike the dense output below --
            # confirm the intended column order before changing it
            (i_vec, j_vec, dist_vec) = sparseIdx
            for (i, j, dist) in zip(i_vec, j_vec, dist_vec):
                sys.stdout.write("\t".join([rList[i], qList[j],
                                            str(dist)]) + "\n")
        else:
            coo_matrix = ijv_to_coo(sparseIdx, (len(rList), len(qList)),
                                    np.float32)
            storePickle(rList, qList, rList == qList, coo_matrix,
                        args.output)
        return

    distMat = pp_sketchlib.queryDatabase(args.ref_db, args.query_db,
                                         rList, qList, query_kmers,
                                         not args.no_correction,
                                         args.jaccard, args.cpus,
                                         args.use_gpu, args.gpu_id)

    if not args.print:
        storePickle(rList, qList, rList == qList, distMat, args.output)
        return

    # get names order
    names = iterDistRows(rList, qList, rList == qList)
    if not args.jaccard:
        sys.stdout.write("\t".join(
            ['Query', 'Reference', 'Core', 'Accessory']) + "\n")
        for i, (ref_name, query_name) in enumerate(names):
            sys.stdout.write("\t".join([
                query_name, ref_name,
                str(distMat[i, 0]),
                str(distMat[i, 1])
            ]) + "\n")
    else:
        # One column per k-mer length
        sys.stdout.write("\t".join(['Query', 'Reference'] +
                                   [str(k) for k in query_kmers]) + "\n")
        for i, (ref_name, query_name) in enumerate(names):
            sys.stdout.write("\t".join(
                [query_name, ref_name] +
                [str(d) for d in distMat[i]]) + "\n")


def _run_add_random(args):
    """Add random match chances to the existing database args.ref_db."""
    rList = getSampleNames(args.ref_db)
    # k-mer lengths are taken from the first sample in the database
    with h5py.File(args.ref_db + ".h5", 'r') as ref:
        db_kmers = ref['sketches/' + rList[0]].attrs['kmers']

    pp_sketchlib.addRandom(args.ref_db, rList, db_kmers, args.strand,
                           args.cpus)


def main():
    """Command line entry point.

    Parses the options, validates the k-mer range and runs the first
    selected mode out of sketch, join, query and add_random. Exits with
    status 1 on invalid input or a failed join, otherwise with status 0.
    """
    args = get_options()

    if args.min_k >= args.max_k or args.min_k < 3 or args.max_k > 101 or args.k_step < 1:
        sys.stderr.write(
            "Minimum kmer size " + str(args.min_k) +
            " must be smaller than maximum kmer size " + str(args.max_k) +
            "; range must be between 3 and 101, step must be at least one\n")
        sys.exit(1)
    kmers = np.arange(args.min_k, args.max_k + 1, args.k_step)

    if args.sketch:
        # Create a database (sketch input)
        _run_sketch(args, kmers)
    elif args.join:
        # Join two databases
        _run_join(args)
    elif args.query:
        # Query a database (calculate distances)
        _run_query(args, kmers)
    elif args.add_random:
        # Add random match chances to an older database
        _run_add_random(args)

    sys.exit(0)