Exemple #1
0
def check_distances(ref_fasta, fastq_folder, work_dir, max_distance=0.15):
    """
    Sketches the forward reads found in a folder with mash and flags the ones that are too distant from a reference.
    :param ref_fasta: Path to the reference FASTA file the sketch gets compared against.
    :param fastq_folder: Folder searched for read files matching the pattern '*R1*'.
    :param work_dir: Folder where the intermediate 'sketch.msh' and 'distances.tab' files are written.
    :param max_distance: Largest mash distance still considered acceptable. Defaults to 0.15; an earlier
    cutoff of 0.06 turned out to be too conservative.
    :return: bad_fastqs: A list with the reference field of every mash result whose distance is above max_distance.
    """
    sketch_file = os.path.join(work_dir, 'sketch.msh')
    distance_file = os.path.join(work_dir, 'distances.tab')
    # The wildcard pattern is passed on as-is; expanding it is left to mash.sketch.
    mash.sketch(os.path.join(fastq_folder, '*R1*'), output_sketch=sketch_file, threads=5)
    mash.dist(sketch_file, ref_fasta, threads=48, output_file=distance_file)
    bad_fastqs = list()
    for item in mash.read_mash_output(distance_file):
        # Every comparison is echoed so the distances can be inspected by eye.
        print(item.reference, item.query, str(item.distance))
        if item.distance > max_distance:
            bad_fastqs.append(item.reference)
    return bad_fastqs
Exemple #2
0
def test_mash_sketch_call_multithreaded():
    """
    Checks the command string reported by mash.sketch when four threads are requested.
    The sketch file produced by the call is deleted once the command has been verified.
    """
    sketch_path = 'tests/sketch.msh'
    expected_cmd = 'mash sketch tests/dummy_fastq/*fastq -o tests/sketch.msh -p 4 '
    # With returncmd=True the call hands back three values; only the last one (the command) matters here.
    _stdout, _stderr, actual_cmd = mash.sketch('tests/dummy_fastq/*fastq',
                                               output_sketch=sketch_path,
                                               returncmd=True,
                                               threads=4)
    assert actual_cmd == expected_cmd
    os.remove(sketch_path)
def check_distances(ref_fasta, fasta_folder, max_distance=0.06, threads=56):
    """
    Sketches every '*.fasta' file in a folder with mash and flags the ones that are too distant from a reference.
    The intermediate 'sketch.msh' and 'distances.tab' files are written into fasta_folder itself.
    :param ref_fasta: Path to the reference FASTA file the sketch gets compared against.
    :param fasta_folder: Folder searched for files matching '*.fasta'; also receives the intermediate files.
    :param max_distance: Largest mash distance still considered acceptable. Defaults to 0.06.
    :param threads: Number of threads handed to both the mash sketch and the mash dist call. Defaults to 56.
    :return: bad_fastqs: A list with the reference field of every mash result whose distance is above max_distance.
    """
    sketch_file = os.path.join(fasta_folder, 'sketch.msh')
    distance_file = os.path.join(fasta_folder, 'distances.tab')
    # The wildcard pattern is passed on as-is; expanding it is left to mash.sketch.
    mash.sketch(os.path.join(fasta_folder, '*.fasta'),
                output_sketch=sketch_file,
                threads=threads)
    mash.dist(sketch_file,
              ref_fasta,
              threads=threads,
              output_file=distance_file)
    bad_fastqs = list()
    for item in mash.read_mash_output(distance_file):
        # Every comparison is echoed so the distances can be inspected by eye.
        print(item.reference, item.query, str(item.distance))
        if item.distance > max_distance:
            bad_fastqs.append(item.reference)
    return bad_fastqs
Exemple #4
0
def test_sketch_no_input_files():
    """
    Calling mash.sketch without any arguments has to raise a ValueError.
    """
    expect_value_error = pytest.raises(ValueError)
    with expect_value_error:
        mash.sketch()
Exemple #5
0
def test_mash_sketch_call():
    """
    Checks the command string reported by mash.sketch when no thread count is given.
    The expected command carries '-p 1'. The sketch file produced by the call is deleted afterwards.
    """
    sketch_path = 'tests/sketch.msh'
    expected_cmd = 'mash sketch tests/dummy_fastq/*fastq -o tests/sketch.msh -p 1 '
    # With returncmd=True the call hands back three values; only the last one (the command) matters here.
    _stdout, _stderr, actual_cmd = mash.sketch('tests/dummy_fastq/*fastq',
                                               output_sketch=sketch_path,
                                               returncmd=True)
    assert actual_cmd == expected_cmd
    os.remove(sketch_path)
def mash_for_potential_plasmids(plasmid_db,
                                forward_reads,
                                output_dir,
                                reverse_reads=None,
                                threads=1,
                                logfile=None,
                                identity_cutoff=0.95):
    """
    Uses mash to find a list of potential plasmids in a set of forward (and optionally reverse) reads.
    The plasmid database is sketched into 'plasmid_sketch.msh' and the reads are screened against that sketch,
    with the screen results written to 'screen_results.tsv'. Both files end up in output_dir.
    :param plasmid_db: Path to a multi-Fasta-formatted file that has plasmid sequences of interest.
    :param forward_reads: Path to forward reads.
    :param output_dir: Path to output directory where mash sketch/screen result file will be stored.
    It gets created if it does not exist yet.
    :param reverse_reads: Path to reverse reads. If not specified, things will work in unpaired mode.
    :param threads: Number of threads to run mash analyses on.
    :param logfile: Path to logfile you want to use. If not specified, the output of the mash calls is not logged.
    :param identity_cutoff: Mash screen identity cutoff. Values lower than this won't be reported.
    :return: potential_plasmids: A list where each entry is a putatively present plasmid, identified by
    the fasta header.
    """
    # exist_ok makes the creation safe even if something else creates the directory at the same time.
    os.makedirs(output_dir, exist_ok=True)
    sketch_file = os.path.join(output_dir, 'plasmid_sketch.msh')
    screen_file = os.path.join(output_dir, 'screen_results.tsv')

    # Make a sketch of the plasmid db.
    # NOTE(review): i='' presumably becomes a bare '-i' flag (sketch each sequence on its own) - confirm
    # against the mash wrapper.
    out, err = mash.sketch(plasmid_db,
                           output_sketch=sketch_file,
                           threads=threads,
                           i='')
    if logfile:
        accessoryFunctions.write_to_logfile(out, err, logfile)

    # Paired and unpaired mode only differ in how many read files get screened against the sketch.
    if reverse_reads:
        read_files = [forward_reads, reverse_reads]
    else:
        read_files = [forward_reads]
    out, err = mash.screen(sketch_file,
                           *read_files,
                           output_file=screen_file,
                           threads=threads,
                           i=identity_cutoff)
    if logfile:
        accessoryFunctions.write_to_logfile(out, err, logfile)

    # Every entry left in the screen results is a potential plasmid.
    results = mash.read_mash_screen(screen_result=screen_file)
    potential_plasmids = [item.query_id for item in results]
    return potential_plasmids