def check_distances(ref_fasta, fastq_folder, work_dir, max_distance=0.15,
                    sketch_threads=5, dist_threads=48):
    """
    Runs mash sketch/dist on the forward reads in a folder and reports the entries that are
    too distant from a reference.
    :param ref_fasta: Path to the reference FASTA file handed to mash dist.
    :param fastq_folder: Folder searched with the pattern '*R1*' for files to sketch.
    :param work_dir: Folder where 'sketch.msh' and 'distances.tab' get written.
    :param max_distance: Entries with a mash distance strictly greater than this are reported.
    The default of 0.15 replaced an earlier 0.06, which was too conservative.
    :param sketch_threads: Number of threads passed to mash sketch.
    :param dist_threads: Number of threads passed to mash dist.
    :return: bad_fastqs: List holding the reference field of every mash result whose distance
    is above max_distance.
    """
    # Both intermediate files are used by more than one step, so build their paths once.
    sketch_file = os.path.join(work_dir, 'sketch.msh')
    distance_file = os.path.join(work_dir, 'distances.tab')
    # The glob pattern is passed through as-is; expanding it is left to the mash wrapper.
    mash.sketch(os.path.join(fastq_folder, '*R1*'),
                output_sketch=sketch_file,
                threads=sketch_threads)
    mash.dist(sketch_file, ref_fasta,
              threads=dist_threads,
              output_file=distance_file)
    bad_fastqs = list()
    for item in mash.read_mash_output(distance_file):
        # Echo every comparison so the distances can be inspected when tuning the cutoff.
        print(item.reference, item.query, str(item.distance))
        if item.distance > max_distance:
            bad_fastqs.append(item.reference)
    return bad_fastqs
def test_mash_sketch_call_multithreaded():
    """Check the command string mash.sketch reports when threads=4 is requested."""
    sketch_path = 'tests/sketch.msh'
    expected_cmd = 'mash sketch tests/dummy_fastq/*fastq -o tests/sketch.msh -p 4 '
    _stdout, _stderr, actual_cmd = mash.sketch(
        'tests/dummy_fastq/*fastq',
        output_sketch=sketch_path,
        returncmd=True,
        threads=4,
    )
    assert actual_cmd == expected_cmd
    # Remove the sketch the call produced so it does not linger between test runs.
    os.remove(sketch_path)
def check_distances(ref_fasta, fasta_folder, max_distance=0.06, threads=56):
    """
    Runs mash sketch/dist on the FASTA files in a folder and reports the entries that are
    too distant from a reference.
    :param ref_fasta: Path to the reference FASTA file handed to mash dist.
    :param fasta_folder: Folder searched with the pattern '*.fasta' for files to sketch. The
    intermediate 'sketch.msh' and 'distances.tab' files are written here as well.
    :param max_distance: Entries with a mash distance strictly greater than this are reported.
    :param threads: Number of threads passed to both mash sketch and mash dist.
    :return: List holding the reference field of every mash result whose distance is above
    max_distance.
    """
    # Both intermediate files are used by more than one step, so build their paths once.
    sketch_file = os.path.join(fasta_folder, 'sketch.msh')
    distance_file = os.path.join(fasta_folder, 'distances.tab')
    # The glob pattern is passed through as-is; expanding it is left to the mash wrapper.
    mash.sketch(os.path.join(fasta_folder, '*.fasta'),
                output_sketch=sketch_file,
                threads=threads)
    mash.dist(sketch_file, ref_fasta,
              threads=threads,
              output_file=distance_file)
    bad_fastas = list()
    for item in mash.read_mash_output(distance_file):
        # Echo every comparison so the distances can be inspected when tuning the cutoff.
        print(item.reference, item.query, str(item.distance))
        if item.distance > max_distance:
            bad_fastas.append(item.reference)
    return bad_fastas
def test_sketch_no_input_files():
    """Calling mash.sketch with no arguments at all must raise ValueError."""
    expected_error = ValueError
    with pytest.raises(expected_error):
        mash.sketch()
def test_mash_sketch_call():
    """Check the command string mash.sketch reports when no thread count is given."""
    sketch_path = 'tests/sketch.msh'
    expected_cmd = 'mash sketch tests/dummy_fastq/*fastq -o tests/sketch.msh -p 1 '
    _stdout, _stderr, actual_cmd = mash.sketch(
        'tests/dummy_fastq/*fastq',
        output_sketch=sketch_path,
        returncmd=True,
    )
    assert actual_cmd == expected_cmd
    # Remove the sketch the call produced so it does not linger between test runs.
    os.remove(sketch_path)
def mash_for_potential_plasmids(plasmid_db, forward_reads, output_dir, reverse_reads=None, threads=1, logfile=None,
                                identity_cutoff=0.95):
    """
    Uses mash to find a list of potential plasmids in a set of forward (and optionally reverse) reads.
    :param plasmid_db: Path to a multi-Fasta-formatted file that has plasmid sequences of interest.
    :param forward_reads: Path to forward reads.
    :param output_dir: Path to output directory where mash sketch/screen result file will be stored.
    Created if it does not exist yet.
    :param reverse_reads: Path to reverse reads. If not specified, things will work in unpaired mode.
    :param threads: Number of threads to run mash analyses on.
    :param logfile: Path to logfile you want to use. If not specified, mash output is not logged.
    :param identity_cutoff: Mash screen identity cutoff. Values lower than this won't be reported.
    :return: potential_plasmids: A list where each entry is a putatively present plasmid, identified by the
    fasta header.
    """
    # exist_ok avoids the check-then-create race if something else makes the directory first.
    os.makedirs(output_dir, exist_ok=True)
    sketch_file = os.path.join(output_dir, 'plasmid_sketch.msh')
    screen_file = os.path.join(output_dir, 'screen_results.tsv')

    # Make a sketch of the plasmid db.
    # NOTE(review): i='' presumably becomes a bare -i flag (sketch each sequence separately) - confirm
    # against the mash wrapper.
    out, err = mash.sketch(plasmid_db,
                           output_sketch=sketch_file,
                           threads=threads,
                           i='')
    if logfile:
        accessoryFunctions.write_to_logfile(out, err, logfile)

    # Now use mash screen to figure out what plasmids might be present in our sample. Paired and unpaired
    # modes differ only in whether the reverse reads are handed over as an extra positional argument.
    read_files = [forward_reads]
    if reverse_reads:
        read_files.append(reverse_reads)
    out, err = mash.screen(sketch_file,
                           *read_files,
                           output_file=screen_file,
                           threads=threads,
                           i=identity_cutoff)
    if logfile:
        accessoryFunctions.write_to_logfile(out, err, logfile)

    # Read through the hits mash screen reported and keep the identifier of each one.
    results = mash.read_mash_screen(screen_result=screen_file)
    potential_plasmids = [item.query_id for item in results]
    return potential_plasmids