def test_run_racon():
    """Check racon.run_racon output and its handling of intermediate files.

    The first run uses debug=True and expects the .sam and .to_polish.fa
    files to stay on disk. The second run polishes the already-polished
    sequence with debug=False and expects an unchanged sequence and no
    intermediate files.
    """
    # This has a SNP and two indels to fix. Also one position where
    # about 2/3 of the reads say A and the rest say T. Expect this to get
    # corrected to T.
    fa_to_polish = os.path.join(data_dir, "run_racon.to_polish.fa")
    seq_to_polish = utils.load_single_seq_fasta(fa_to_polish)
    reads = os.path.join(data_dir, "run_racon.reads.fa")
    pre_out = "tmp.run_racon"
    utils.rm_rf(f"{pre_out}.sam")
    utils.rm_rf(f"{pre_out}.to_polish.fa")
    polished1 = racon.run_racon(seq_to_polish, reads, pre_out, debug=True)
    # Compare with the unpolished sequence itself. Comparing with the path of
    # the input FASTA would be vacuous: a sequence never equals a filename.
    assert polished1 != seq_to_polish.seq
    assert (
        polished1
        == "CGTTAATCCTAGGGCAGTTAAAAGCCCCATTTTGTACAGCTTTTTCTAGAACAGTCAGGGCGCGCTCCCAGGAGTTGCTTCGCTTCCAGCTAGAAATGATCATCGAACCTGGGTAAGGGCATAATACGAGAATGCTGCCCTATTGCCAGTGCTTAGAAATGGACTGGTGTTACGTCCACGAAATCTGCAACAAGCCCGGT"
    )
    # we used debug mode, so intermediate files should be left on disk
    assert os.path.exists(f"{pre_out}.sam")
    assert os.path.exists(f"{pre_out}.to_polish.fa")
    os.unlink(f"{pre_out}.sam")
    os.unlink(f"{pre_out}.to_polish.fa")

    # Another round of polishing shouldn't do anything
    polished2 = racon.run_racon(polished1, reads, pre_out, debug=False)
    assert polished1 == polished2
    # we didn't use debug mode so intermediate files should be deleted
    assert not os.path.exists(f"{pre_out}.sam")
    assert not os.path.exists(f"{pre_out}.to_polish.fa")
def test_run_racon_iterations():
    """Check that iterative polishing stops early and leaves debug files.

    With max_iterations=3 and debug=True, the output directory is expected
    to contain the files of exactly two iterations (racon.0 and racon.1).
    """
    # It is hard to come up with small artificial test data for this one,
    # so the data from test_run_racon is reused. Should stop after
    # 2 iterations because only the first run corrects anything.
    expected_polished = "CGTTAATCCTAGGGCAGTTAAAAGCCCCATTTTGTACAGCTTTTTCTAGAACAGTCAGGGCGCGCTCCCAGGAGTTGCTTCGCTTCCAGCTAGAAATGATCATCGAACCTGGGTAAGGGCATAATACGAGAATGCTGCCCTATTGCCAGTGCTTAGAAATGGACTGGTGTTACGTCCACGAAATCTGCAACAAGCCCGGT"
    unpolished_fa = os.path.join(data_dir, "run_racon.to_polish.fa")
    unpolished = utils.load_single_seq_fasta(unpolished_fa)
    reads_fa = os.path.join(data_dir, "run_racon.reads.fa")
    outdir = "tmp.run_racon_iterations"
    utils.rm_rf(outdir)

    got_polished = racon.run_racon_iterations(
        unpolished, reads_fa, outdir, max_iterations=3, debug=True
    )

    # Three files per iteration, two iterations, nothing else.
    for iteration in (0, 1):
        prefix = os.path.join(outdir, f"racon.{iteration}")
        for suffix in ("sam", "polished.fa", "to_polish.fa"):
            assert os.path.exists(f"{prefix}.{suffix}")
    assert len(os.listdir(outdir)) == 6

    assert got_polished == expected_polished
    utils.rm_rf(outdir)
def test_mask_low_coverage():
    """Check utils.mask_low_coverage at two depth cutoffs.

    Run once with min_depth=4 and debug=True (the .fa/.sam/.bam files must
    remain) and once with min_depth=1 and debug=False (they must not exist).
    """
    outprefix = "tmp.mask_low_coverage"
    debug_files = [f"{outprefix}.{ext}" for ext in ("fa", "sam", "bam")]
    for path in debug_files:
        utils.rm_rf(path)

    ref_seq = "CGTTAATCCTAGGGCAGTTAAAAGCCCCATTTTGTACAGCTTTTTCTAGAACAGTCAGGGCGCGCTCCCAGGAGTTGCTTCGCTTCCAGCTAGAAATGATCATCGAACCTGGGTAAGGGCATAATACGAGAATGCTGCCCTATTGCCAGTGCTTAGAAATGGACTGGTGTTACGTCCACGAAATCTGCAACAAGCCCGGT"
    expect_depth_4 = "NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNTACAGCTTTTTCTAGAACAGTCAGGGCGCGCTCCCAGGAGTTGCTTCGCTTCCAGCTAGAAATGATCATCGAACCTGGGTAAGGGCATAATACGAGAATGCTGCCCTATTGCCAGTGCTTAGAAATGGACTGGTGTTACGTCCACGAAATCTGCAACAAGCCCGGT"
    expect_depth_1 = "NNNNNNNNNNNNNNNNGTTAAAAGCCCCATTTTGTACAGCTTTTTCTAGAACAGTCAGGGCGCGCTCCCAGGAGTTGCTTCGCTTCCAGCTAGAAATGATCATCGAACCTGGGTAAGGGCATAATACGAGAATGCTGCCCTATTGCCAGTGCTTAGAAATGGACTGGTGTTACGTCCACGAAATCTGCAACAAGCCCGGT"
    reads_file = os.path.join(data_dir, "mask_low_coverage.reads.fa")

    # Debug run: check the sequence, then that every debug file was kept.
    masked = utils.mask_low_coverage(
        ref_seq, reads_file, outprefix, min_depth=4, debug=True
    )
    assert masked == expect_depth_4
    for path in debug_files:
        assert os.path.exists(path)
        os.unlink(path)

    # Non-debug run with a lower cutoff: no files left, fewer masked bases.
    masked = utils.mask_low_coverage(
        ref_seq, reads_file, outprefix, min_depth=1, debug=False
    )
    for path in debug_files:
        assert not os.path.exists(path)
    assert masked == expect_depth_1
def test_run_racon_iterations_bad_data():
    """run_racon_iterations is expected to return None for the bad reads file."""
    outdir = "tmp.run_racon_iterations"
    unpolished = utils.load_single_seq_fasta(
        os.path.join(data_dir, "run_racon.to_polish.fa")
    )
    bad_reads = os.path.join(data_dir, "run_racon_bad_reads.fa")
    utils.rm_rf(outdir)

    result = racon.run_racon_iterations(
        unpolished, bad_reads, outdir, max_iterations=3, debug=True
    )

    assert result is None
    utils.rm_rf(outdir)
def run(options):
    """Validate the command line options and run the assembly pipeline.

    Raises Exception if a required binary is missing, if the number of read
    sources given (--bam, --reads_to_map, --reads_per_amp_dir) is not exactly
    one, or if --mates_to_map is given without --reads_to_map.

    Returns whatever assemble.run_assembly_pipeline returns.
    """
    if not utils.look_for_required_binaries_in_path():
        raise Exception(
            "At least one required program was not found in $PATH. Cannot continue"
        )

    read_sources = (options.bam, options.reads_to_map, options.reads_per_amp_dir)
    if sum(1 for source in read_sources if source is not None) != 1:
        raise Exception(
            "Must provide exactly one of: --bam, --reads_to_map, --reads_per_amp_dir"
        )

    if options.reads_to_map is None and options.mates_to_map is not None:
        raise Exception(
            "--mates_to_map was used, but --reads_to_map was not. --reads_to_map is required by --mates_to_map"
        )

    if options.force:
        utils.rm_rf(options.outdir)

    # One amplicon name per line; only trailing whitespace is removed.
    amplicons_to_fail = None
    if options.amplicons_to_fail_file is not None:
        with open(options.amplicons_to_fail_file) as f:
            amplicons_to_fail = {line.rstrip() for line in f}

    pipeline_kwargs = {
        "sorted_bam": options.bam,
        "reads_per_amp_dir": options.reads_per_amp_dir,
        "reads_fastaq": options.reads_to_map,
        "mates_fastaq": options.mates_to_map,
        "minimap_opts": options.minimap_opts,
        "min_mean_coverage": options.min_mean_coverage,
        "target_coverage": options.target_coverage,
        "read_end_trim": options.read_end_trim,
        "read_map_tolerance": options.read_map_tolerance,
        "min_read_length": options.min_read_length,
        "racon_iterations": options.racon_iterations,
        "min_depth_for_not_N": options.min_depth_for_not_N,
        "min_amp_overlap_len": options.min_amp_overlap_len,
        "contig_map_end_allowance": options.contig_map_end_allowance,
        "amplicons_to_fail": amplicons_to_fail,
        "wgs": options.wgs,
        "debug": options.debug,
        "command_line_args": options,
    }
    return assemble.run_assembly_pipeline(
        options.ref_fasta,
        options.amplicons_json,
        options.outdir,
        **pipeline_kwargs,
    )
def test_run_racon_bad_data():
    """run_racon is expected to return None for the bad reads file."""
    pre_out = "tmp.run_racon"
    intermediate_files = (f"{pre_out}.sam", f"{pre_out}.to_polish.fa")
    unpolished = utils.load_single_seq_fasta(
        os.path.join(data_dir, "run_racon.to_polish.fa")
    )
    bad_reads = os.path.join(data_dir, "run_racon_bad_reads.fa")
    for path in intermediate_files:
        utils.rm_rf(path)

    polished = racon.run_racon(unpolished, bad_reads, pre_out, debug=True)

    # Tidy up before asserting so a failure does not leave files behind.
    for path in intermediate_files:
        utils.rm_rf(path)
    assert polished is None
def test_get_reads_for_polishing():
    """Check Amplicon.get_reads_for_polishing on a small BAM file.

    The first amplicon (on ref1) should yield reads, written to a FASTA that
    is compared with the expected file. The second amplicon (on ref2) should
    yield no reads and no output file.
    """
    reads_bam = os.path.join(data_dir, "get_reads_for_polishing.bam")
    bam = pysam.AlignmentFile(reads_bam, "rb")
    reads_out = "tmp.get_reads_for_polishing.reads.fa"
    utils.rm_rf(reads_out)
    amplicon = amplicons.Amplicon("amp1", 59, 419, 1, 1)
    got_reads, got_used, got_cov = amplicon.get_reads_for_polishing(
        "ref1",
        bam,
        reads_out,
        min_coverage=1,
        trim_ends=5,
        tolerance=1,
        min_output_length=300,
        target_depth=3,
    )
    assert got_reads == 6
    assert got_used == 4
    # The second positional argument of pytest.approx is the *relative*
    # tolerance, so approx(4.49, 4.50) would accept almost any value. Use an
    # explicit absolute tolerance instead.
    assert got_cov == pytest.approx(4.49, abs=0.01)
    expect_reads = os.path.join(data_dir, "get_reads_for_polishing.expect.fa")
    assert filecmp.cmp(reads_out, expect_reads, shallow=False)
    os.unlink(reads_out)

    amplicon = amplicons.Amplicon("amp1", 50, 100, 1, 1)
    got_reads, got_used, got_cov = amplicon.get_reads_for_polishing(
        "ref2",
        bam,
        reads_out,
        min_coverage=1,
        trim_ends=5,
        tolerance=1,
        min_output_length=30,
        target_depth=1,
    )
    assert got_reads == 0
    assert got_used == 0
    assert got_cov == 0
    assert not os.path.exists(reads_out)
def test_load_and_check_reads_amp_dir():
    """Check assemble.load_and_check_reads_amp_dir against a manifest.json.

    The manifest lists reads files for amplicons a1 and a2. The loader must
    raise until the amplicon list and the files on disk agree with it, and
    then return a dict of amplicon name -> path inside the directory.
    """
    tmp_dir = "tmp.load_and_check_reads_amp_dir"
    utils.rm_rf(tmp_dir)
    os.mkdir(tmp_dir)
    manifest = {"a1": "a1.fasta", "a2": "a2.fasta"}
    with open(os.path.join(tmp_dir, "manifest.json"), "w") as f:
        json.dump(manifest, f)

    def fake_amplicon(name):
        # Assign .name after construction: Mock(name=...) means something else.
        amp = mock.Mock()
        amp.name = name
        return amp

    amp1 = fake_amplicon("a1")
    amp2 = fake_amplicon("a2")
    amp3 = fake_amplicon("a3")

    # The reads files named in the manifest have not been created yet, so
    # every one of these calls is expected to fail.
    for bad_amplicons in (set(), [amp1], [amp1, amp2]):
        with pytest.raises(Exception):
            assemble.load_and_check_reads_amp_dir(tmp_dir, bad_amplicons)

    # Create an empty file for each manifest entry.
    for reads_name in manifest.values():
        with open(os.path.join(tmp_dir, reads_name), "w"):
            pass

    expected = {}
    for amp_name, reads_name in manifest.items():
        expected[amp_name] = os.path.join(tmp_dir, reads_name)
    assert assemble.load_and_check_reads_amp_dir(tmp_dir, [amp1, amp2]) == expected

    # a3 is not in the manifest
    with pytest.raises(Exception):
        assemble.load_and_check_reads_amp_dir(tmp_dir, [amp1, amp2, amp3])
    utils.rm_rf(tmp_dir)
def test_consensus_contigs_to_consensus():
    """Check amplicon_overlapper.consensus_contigs_to_consensus.

    Covers: no contigs, one contig, two ordered contigs with a gap, contigs
    in the wrong order, and a contig too short to be kept.
    """
    ref_fasta = os.path.join(data_dir, "consensus_contigs_to_consensus.fa")
    outprefix = "tmp.consensus_contigs_to_consensus"
    out_glob = f"{outprefix}.*"

    def make_consensus(contig_list):
        return amplicon_overlapper.consensus_contigs_to_consensus(
            contig_list, ref_fasta, outprefix
        )

    utils.rm_rf(out_glob)
    # Nothing to join, whether given as None or as an empty list
    assert make_consensus(None) is None
    assert make_consensus([]) is None

    # contig is in ref from 1-120
    contig1 = "GGGTCCTCGGCCTACGACTATATCGCATGGCACGGTGCGGCTGTAGGGACACAAGATAATGTTCCGAGCAATTACGCACTTATTTGGTTCAGGAATCAGACTTCCGGTTTCGAACTTTCG"
    # contig2 is in ref from 181-300
    contig2 = "CTATTTGCACCGTTGTAAATGCGCAGTTTGAGCTGTTGTTTCGCGGCACCGTAAGAAAAAAGATGTACTGCCGAACTCGGGGCGTAGTGAGGGGTTCATAGCGAGAAACGTCTTGTACGC"

    result = make_consensus([contig1])
    assert result == contig1
    utils.rm_rf(out_glob)

    # The 60 ref positions between the two contigs come back as Ns
    result = make_consensus([contig1, contig2])
    assert result == contig1 + "N" * 60 + contig2
    utils.rm_rf(out_glob)

    # contigs in wrong order, should result in aborted assembly
    result = make_consensus([contig2, contig1])
    assert result is None
    utils.rm_rf(out_glob)

    # Add one short contig that should get removed because won't map well
    # enough to the ref
    result = make_consensus([contig1, contig2[:40]])
    assert result == contig1
    utils.rm_rf(out_glob)
def test_assemble():
    """Run the assemble task end to end using a Mock in place of argparse options.

    Runs once with default behaviour and once with amplicons_to_fail_file
    naming amplicon a1, checking the returned consensus, the consensus FASTA
    on disk and the presence of run_info.json each time.
    """
    data_dir = os.path.join(data_root, "assemble")
    outdir = "tmp.test_task_assemble"
    expect_fa = os.path.join(data_dir, "run_assembly_pipeline.expect.fa")
    utils.rm_rf(outdir)

    option_values = {
        "bam": None,
        "ref_fasta": os.path.join(data_dir, "run_assembly_pipeline.ref.fa"),
        "amplicons_json": os.path.join(
            data_dir, "run_assembly_pipeline.amplicons.json"
        ),
        "outdir": outdir,
        "reads_to_map": os.path.join(data_dir, "run_assembly_pipeline.reads.fa"),
        "reads_per_amp_dir": None,
        "mates_to_map": None,
        "minimap_opts": "-t 1 -x map-ont",
        "min_mean_coverage": 5,
        "target_coverage": 500,
        "read_end_trim": 1,
        "read_map_tolerance": 20,
        "min_read_length": 200,
        "racon_iterations": 3,
        "min_depth_for_not_N": 1,
        "min_amp_overlap_len": 20,
        "contig_map_end_allowance": 20,
        "amplicons_to_fail_file": None,
        "wgs": False,
        "debug": True,
    }
    options = mock.Mock()
    for option_name, option_value in option_values.items():
        setattr(options, option_name, option_value)

    def run_and_check(expected_start):
        got = tasks.assemble.run(options)
        expect_seq = utils.load_single_seq_fasta(expect_fa)
        assert got == expect_seq[expected_start:988]
        consensus_from_file = utils.load_single_seq_fasta(
            os.path.join(outdir, "consensus.final_assembly.fa")
        )
        assert got == consensus_from_file.seq
        assert os.path.exists(os.path.join(options.outdir, "run_info.json"))
        utils.rm_rf(outdir)

    # expected fasta is the fasta used to generate the reads. But the amplicons
    # don't cover the whole genome, so we expect to miss the ends
    run_and_check(11)

    # Test the option amplicons_to_fail_file
    options.amplicons_to_fail_file = "tmp.amplicons_to_fail.txt"
    with open(options.amplicons_to_fail_file, "w") as f:
        f.write("a1\n")
    run_and_check(356)
    os.unlink(options.amplicons_to_fail_file)
def test_polish():
    """Check Amplicon.polish with reads from a BAM, from a FASTA, and a failing case."""
    ref_genome = utils.load_single_seq_fasta(os.path.join(data_dir, "polish.ref.fa"))
    bam = pysam.AlignmentFile(os.path.join(data_dir, "polish.bam"), "rb")
    outdir = "tmp.polish.out"
    # Settings that are the same for every polish() call below
    shared = {"min_mean_coverage": 3, "racon_iterations": 3, "debug": True}
    lenient = {
        "min_depth_for_not_N": 3,
        "min_read_length": 100,
        "max_polished_N_prop": 0.5,
    }

    amplicon = amplicons.Amplicon("amplicon1", 60, 259, 1, 1)
    utils.rm_rf(outdir)
    amplicon.polish(ref_genome, outdir, bam_to_slice_reads=bam, **shared, **lenient)
    expect_trimmed = "NNNNNNNNNNNNNNNNNNNNAAAGCCCCATTTTGTACAGCTTTTTCTAGAACAGTCAGGGCGCGCTCCCAGGAGTTGCTTCGCTTCCAGCTAGAAATGATCATCGAACCTGGGTAAGGGCATAATACGAGAATGCTGCCCTATTGCCAGTGCTTAGAAATGGACTGGTGTTACGTCCACGNNNNNNNNNNNNNNNNNNNNN"
    assert amplicon.masked_seq == expect_trimmed
    assert amplicon.assemble_success
    assert amplicon.polish_data["Polish success"]
    utils.rm_rf(outdir)

    # Same again, but this time use the fasta of reads instead of the BAM file.
    # Plus, this is giving untrimmed reads, so we get less masking. In the
    # previous run 20bp trimmed off all the reads
    reads_file = os.path.join(data_dir, "polish.reads.fa")
    amplicon.polish(ref_genome, outdir, reads_file=reads_file, **shared, **lenient)
    expect_untrimmed = "CGTTAATCCTAGGGCAGTTAAAAGCCCCATTTTGTACAGCTTTTTCTAGAACAGTCAGGGCGCGCTCCCAGGAGTTGCTTCGCTTCCAGCTAGAAATGATCATCGAACCTGGGTAAGGGCATAATACGAGAATGCTGCCCTATTGCCAGTGCTTAGAAATGGACTGGTGTTACGTCCACGAAATCTGCAACAAGCCCGGT"
    assert amplicon.masked_seq == expect_untrimmed
    assert amplicon.assemble_success
    assert amplicon.polish_data["Polish success"]
    utils.rm_rf(outdir)

    # The reads are such that there's a dip in coverage in the middle of the
    # amplicon. Setting min_depth_for_not_N higher makes this region get
    # masked, and then the amplicon should get failed
    amplicon = amplicons.Amplicon("amplicon1", 60, 259, 1, 1)
    amplicon.polish(
        ref_genome,
        outdir,
        bam_to_slice_reads=bam,
        min_depth_for_not_N=18,
        min_read_length=50,
        max_polished_N_prop=0.1,
        **shared,
    )
    expect_dip_masked = "NNNNNNNNNNNNNNNNNNNNAAAGCCCCATTTTGTACAGCTTTTTCTAGAACAGTCAGGGCGCGCTCCCAGGAGTTGCTTCGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGAATGCTGCCCTATTGCCAGTGCTTAGAAATGGACTGGTGTTACGTCCACGNNNNNNNNNNNNNNNNNNNNN"
    assert amplicon.masked_seq == expect_dip_masked
    assert not amplicon.assemble_success
    assert not amplicon.polish_data["Polish success"]
    utils.rm_rf(outdir)
def polish(
    self,
    ref_genome,
    outdir,
    bam_to_slice_reads=None,
    reads_file=None,
    min_mean_coverage=25,
    target_coverage=500,
    read_end_trim=20,
    read_map_tolerance=20,
    min_read_length=200,
    racon_iterations=3,
    min_depth_for_not_N=5,
    max_polished_N_prop=0.1,
    wgs=False,
    debug=False,
    minimap_opts=None,
):
    """Polish this amplicon with racon, then mask low-coverage positions.

    Reads come from ``reads_file`` when given, otherwise they are extracted
    from ``bam_to_slice_reads`` with ``self.get_reads_for_polishing``. Results
    are stored on the instance: ``polished_seq``, ``masked_seq``,
    ``polish_data["Comments"]``, and on success ``polish_data["Polish success"]``
    and ``assemble_success`` are set to True. Nothing is returned.

    Args:
        ref_genome: reference sequence; must have an ``id`` attribute and
            support slicing.
        outdir: working directory. It is created here and must not exist.
        bam_to_slice_reads: BAM handle passed to get_reads_for_polishing.
            Only used when ``reads_file`` is None.
        reads_file: user-supplied reads file. When given, no read statistics
            are calculated and the coverage check is skipped.
        min_mean_coverage: the amplicon is failed when the coverage reported
            by get_reads_for_polishing is below this.
        target_coverage, read_end_trim, read_map_tolerance, min_read_length,
        wgs: forwarded to get_reads_for_polishing.
        racon_iterations: forwarded to racon as ``max_iterations``.
        min_depth_for_not_N: forwarded to utils.mask_low_coverage as
            ``min_depth``.
        max_polished_N_prop: maximum allowed proportion of Ns in the masked
            sequence, ignoring Ns at the start and end.
        debug: when False, ``outdir`` is deleted before returning.
        minimap_opts: forwarded to racon.run_racon_iterations.

    Raises:
        OSError: from os.mkdir, e.g. when ``outdir`` already exists.
    """
    os.mkdir(outdir)
    try:
        if reads_file is None:
            if bam_to_slice_reads is None:
                self.polish_data["Comments"].append(
                    "No reads provided. Calling this amplicon failed"
                )
                return

            reads_file = os.path.join(outdir, "reads.fa")
            total_reads, used_reads, coverage = self.get_reads_for_polishing(
                ref_genome.id,
                bam_to_slice_reads,
                reads_file,
                min_coverage=min_mean_coverage,
                trim_ends=read_end_trim,
                tolerance=read_map_tolerance,
                min_output_length=min_read_length,
                target_depth=target_coverage,
                wgs=wgs,
            )
            logging.debug(
                f"Extracted {total_reads} reads for amplicon {self.name}. Using {used_reads} for polishing, at mean depth of {coverage}"
            )
            if coverage < min_mean_coverage:
                logging.warning(
                    f"Mean coverage for amplicon {self.name} is too low: {coverage}. Considering this a failed amplicon"
                )
                self.polish_data["Comments"].append(f"Coverage {coverage} too low")
                return
        else:
            # Needs the f prefix, otherwise the placeholders are logged
            # literally instead of the file and amplicon names.
            logging.debug(
                f"Using user-supplied reads {reads_file} for amplicon {self.name}"
            )
            self.polish_data["Comments"].append(
                f"Read stats not calculated because user supplied file {reads_file}"
            )

        # self.end is inclusive, hence the +1 in the slice
        amplicon_seq = ref_genome[self.start : self.end + 1]
        racon_dir = os.path.join(outdir, "Racon")
        self.polished_seq = racon.run_racon_iterations(
            amplicon_seq,
            reads_file,
            racon_dir,
            debug=debug,
            max_iterations=racon_iterations,
            minimap_opts=minimap_opts,
        )
        logging.debug(f"polished_seq: {self.polished_seq}")
        if self.polished_seq is None:
            self.polish_data["Comments"].append("No sequenced returned from racon")
            return

        mask_outprefix = os.path.join(outdir, "masked")
        self.masked_seq = utils.mask_low_coverage(
            self.polished_seq,
            reads_file,
            mask_outprefix,
            min_depth=min_depth_for_not_N,
            debug=debug,
        )
        logging.debug(f"masked: {self.masked_seq}")

        # Ns at either end are not held against the amplicon; only internal
        # Ns count towards the masked proportion.
        masked_strip_ns = self.masked_seq.strip("N")
        if len(masked_strip_ns) == 0:
            # NOTE(review): a sequence that is entirely N gets proportion 0
            # and so counts as a success below - confirm this is intended.
            proportion_masked = 0
        else:
            proportion_masked = round(
                masked_strip_ns.count("N") / len(masked_strip_ns), 2
            )

        if proportion_masked > max_polished_N_prop:
            percent_N = 100 * proportion_masked
            self.polish_data["Comments"].append(
                f"Too many Ns ({percent_N}%) after masking polished sequence (not including Ns at the start/end)"
            )
        else:
            self.polish_data["Polish success"] = True
            self.assemble_success = True
    finally:
        # Clean up on every exit path, including the early "failed amplicon"
        # returns, so a non-debug run never leaves the directory behind.
        if not debug:
            utils.rm_rf(outdir)
def polish_each_amplicon(
    ref_genome,
    amplicons,
    outdir,
    bam_to_slice_reads=None,
    amplicon_to_reads_file=None,
    min_mean_coverage=25,
    target_coverage=500,
    read_end_trim=20,
    read_map_tolerance=20,
    min_read_length=200,
    racon_iterations=3,
    min_depth_for_not_N=5,
    amplicons_to_fail=None,
    wgs=False,
    debug=False,
    minimap_opts=None,
):
    """Polish every amplicon in turn, each in its own numbered subdirectory.

    Amplicons whose name is in ``amplicons_to_fail`` are not polished;
    ``force_polish_fail()`` is called on them instead. All others are
    polished in ``outdir/<1-based index>``, using the reads file listed for
    them in ``amplicon_to_reads_file`` if there is one. When ``debug`` is
    False the per-amplicon directory is removed afterwards. The amplicons
    are updated in place and nothing is returned.
    """
    reads_file_lookup = {} if amplicon_to_reads_file is None else amplicon_to_reads_file
    forced_failures = set() if amplicons_to_fail is None else amplicons_to_fail

    # Options handed unchanged to every amplicon.polish() call
    polish_options = dict(
        bam_to_slice_reads=bam_to_slice_reads,
        min_mean_coverage=min_mean_coverage,
        target_coverage=target_coverage,
        read_end_trim=read_end_trim,
        read_map_tolerance=read_map_tolerance,
        min_read_length=min_read_length,
        racon_iterations=racon_iterations,
        min_depth_for_not_N=min_depth_for_not_N,
        wgs=wgs,
        debug=debug,
        minimap_opts=minimap_opts,
    )

    for number, amplicon in enumerate(amplicons, start=1):
        total = len(amplicons)
        progress = f"({number}/{total})"
        logging.debug(f"Start processing amplicon {amplicon.name} {progress}")

        if amplicon.name in forced_failures:
            logging.debug(f"User chose to fail amplicon {amplicon.name}. Moving on")
            amplicon.force_polish_fail()
            continue

        logging.debug(f"Extracting reads and polishing amplicon {amplicon.name}")
        amplicon_dir = os.path.join(outdir, str(number))
        amplicon.polish(
            ref_genome,
            amplicon_dir,
            reads_file=reads_file_lookup.get(amplicon.name),
            **polish_options,
        )
        if amplicon.assemble_success:
            ok = "yes"
        else:
            ok = "no"
        logging.debug(f"Finish polishing amplicon {amplicon.name}. Success: {ok}")

        if not debug:
            utils.rm_rf(amplicon_dir)
        logging.debug(f"Finish processing amplicon {amplicon.name} {progress}")

        # Progress report for the 1st, 11th, 21st, ... amplicon
        if (number - 1) % 10 == 0:
            logging.info(f"Processed {number} of {total} amplicons")
def run_assembly_pipeline(
    ref_fasta,
    amplicons_json,
    outdir,
    sorted_bam=None,
    reads_per_amp_dir=None,
    reads_fastaq=None,
    mates_fastaq=None,
    minimap_opts=None,
    min_mean_coverage=25,
    target_coverage=500,
    read_end_trim=20,
    read_map_tolerance=20,
    min_read_length=200,
    racon_iterations=3,
    min_depth_for_not_N=5,
    min_amp_overlap_len=20,
    contig_map_end_allowance=20,
    amplicons_to_fail=None,
    wgs=False,
    debug=False,
    command_line_args=None,
):
    """Polish each amplicon and join the results into a consensus sequence.

    Reads come from exactly one of ``reads_per_amp_dir`` (one reads file per
    amplicon), ``reads_fastaq`` (mapped here into ``outdir/map_reads.bam``,
    optionally with ``mates_fastaq``) or ``sorted_bam``.

    ``outdir`` is created here. ``outdir/run_info.json`` is written twice:
    once at the start with ``finished_running`` False, and again at the end
    with the full summary, so an interrupted run can be recognised.

    Returns:
        The consensus returned by amplicon_overlapper.assemble_amplicons, or
        None when no amplicon was polished successfully.

    Raises:
        OSError: from os.mkdir, e.g. when ``outdir`` already exists.
        AssertionError: when the read source arguments are inconsistent.
    """
    # Make a dict of the command line options to go in the JSON output file.
    # The tests don't use argparse (they use Mock), which means convert to dict
    # doesn't work. Don't care about that case anyway in the final output, so
    # just set to None
    if isinstance(command_line_args, argparse.Namespace):
        options_dict = {
            k: v for k, v in vars(command_line_args).items() if k != "func"
        }
    else:
        options_dict = None

    start_time = datetime.datetime.now()
    os.mkdir(outdir)
    json_out = os.path.join(outdir, "run_info.json")
    json_data = {
        "run_summary": {
            "total_amplicons": None,
            "successful_amplicons": None,
            "command": " ".join(sys.argv),
            "options": options_dict,
            "cwd": os.getcwd(),
            "version": viridian_version,
            "finished_running": False,
            "made_consensus": False,
            "consensus": None,
            "start_time": start_time.replace(microsecond=0).isoformat(),
            "end_time": None,
            "hostname": socket.gethostname(),
        },
        "amplicons": None,
    }
    # Initial snapshot, overwritten by the complete summary at the end
    with open(json_out, "w") as f:
        json.dump(json_data, f, indent=2, sort_keys=True)

    ref_genome = utils.load_single_seq_fasta(ref_fasta)
    logging.info(f"Loaded ref genome {ref_genome.id}")
    amplicons = amps.load_amplicons_json_file(amplicons_json)
    json_data["run_summary"]["total_amplicons"] = len(amplicons)
    logging.info(f"Loaded amplicons file {amplicons_json}")

    amplicon_to_reads_file = None
    if reads_per_amp_dir is not None:
        assert reads_fastaq is None
        assert sorted_bam is None
        amplicon_to_reads_file = load_and_check_reads_amp_dir(
            reads_per_amp_dir, amplicons
        )
        bam = None
    elif reads_fastaq is not None:
        assert sorted_bam is None
        assert reads_per_amp_dir is None
        logging.info("Reads in FASTA/FASTQ format provided. Mapping reads")
        sorted_bam = os.path.join(outdir, "map_reads.bam")
        map_reads(
            ref_fasta,
            reads_fastaq,
            sorted_bam,
            minimap_opts=minimap_opts,
            mates_file=mates_fastaq,
        )
        logging.info("Finished mapping reads")
        bam = pysam.AlignmentFile(sorted_bam, "rb")
    else:
        assert sorted_bam is not None
        bam = pysam.AlignmentFile(sorted_bam, "rb")

    # Debug runs keep the polishing files inside outdir; otherwise a
    # temporary directory is used and removed afterwards.
    if debug:
        polish_root_dir = os.path.join(outdir, "Amplicon_polish")
        os.mkdir(polish_root_dir)
    else:
        polish_root_dir = tempfile.mkdtemp(prefix="viridian_polish_")

    logging.info(f"Start polishing each amplicon. Directory: {polish_root_dir}")
    try:
        polish_each_amplicon(
            ref_genome,
            amplicons,
            polish_root_dir,
            bam_to_slice_reads=bam,
            amplicon_to_reads_file=amplicon_to_reads_file,
            min_mean_coverage=min_mean_coverage,
            target_coverage=target_coverage,
            read_end_trim=read_end_trim,
            read_map_tolerance=read_map_tolerance,
            min_read_length=min_read_length,
            racon_iterations=racon_iterations,
            min_depth_for_not_N=min_depth_for_not_N,
            amplicons_to_fail=amplicons_to_fail,
            wgs=wgs,
            debug=debug,
            minimap_opts=minimap_opts,
        )
    finally:
        # Polishing is the only consumer of the BAM, so release the file
        # handle here instead of leaving it open for the rest of the run.
        if bam is not None:
            bam.close()
        if not debug:
            utils.rm_rf(polish_root_dir)
    logging.info("Finished polishing each amplicon")

    add_successful_amplicons_to_json_data(json_data, amplicons)

    if json_data["run_summary"]["successful_amplicons"] == 0:
        logging.warning("No amplicons successfully polished!")
        consensus = None
    else:
        logging.info("Start making consensus from polished amplicons")
        overlap_out = os.path.join(outdir, "consensus")
        consensus = amplicon_overlapper.assemble_amplicons(
            amplicons,
            ref_fasta,
            overlap_out,
            min_match_length=min_amp_overlap_len,
            ref_map_end_allowance=contig_map_end_allowance,
            debug=debug,
        )

    json_data["run_summary"]["consensus"] = consensus
    # Need to recalculate successful amplicons because they can get failed
    # during overlapping. If two adjacent amplicons have no overlap, then they
    # both get failed.
    add_successful_amplicons_to_json_data(json_data, amplicons)

    if consensus is None:
        logging.warning(
            "Did not make consensus sequence. Please see previous warnings"
        )
    else:
        logging.info("Finished making consensus sequence.")
        json_data["run_summary"]["made_consensus"] = True

    add_consensus_length_N_count_to_json_data(json_data)
    json_data["amplicons"] = amps.amplicons_to_list_of_dicts(amplicons)
    json_data["run_summary"]["finished_running"] = True
    end_time = datetime.datetime.now()
    json_data["run_summary"]["end_time"] = end_time.replace(
        microsecond=0
    ).isoformat()
    json_data["run_summary"]["run_time"] = str(end_time - start_time)
    with open(json_out, "w") as f:
        json.dump(json_data, f, indent=2, sort_keys=True)
    return consensus
def test_run_assembly_pipeline():
    """Run the whole assembly pipeline on three kinds of input.

    1. Reads in one FASTA file, mapped by the pipeline.
    2. A directory with one reads file per amplicon.
    3. As 1, but with amplicon a1 forced to fail; also checks run_info.json.
    """
    ref_fa = os.path.join(data_dir, "run_assembly_pipeline.ref.fa")
    reads_fa = os.path.join(data_dir, "run_assembly_pipeline.reads.fa")
    amplicon_json = os.path.join(data_dir, "run_assembly_pipeline.amplicons.json")
    expect_fa = os.path.join(data_dir, "run_assembly_pipeline.expect.fa")
    outdir = "tmp.run_assembly_pipeline"
    # Pipeline settings used by all three runs
    settings = {
        "debug": True,
        "min_mean_coverage": 5,
        "min_depth_for_not_N": 1,
        "read_end_trim": 1,
    }

    def assert_matches_consensus_file(consensus):
        from_file = utils.load_single_seq_fasta(
            os.path.join(outdir, "consensus.final_assembly.fa")
        )
        assert consensus == from_file.seq

    utils.rm_rf(outdir)
    got = assemble.run_assembly_pipeline(
        ref_fa, amplicon_json, outdir, reads_fastaq=reads_fa, **settings
    )
    expect_seq = utils.load_single_seq_fasta(expect_fa)
    # expected fasta is the fasta used to generate the reads. But the amplicons
    # don't cover the whole genome, so we expect to miss the ends
    assert got == expect_seq[11:988]
    assert_matches_consensus_file(got)
    utils.rm_rf(outdir)

    # rerun, but using a directory of one file of reads per amplicon. This is
    # what viridian workflow will be making as input
    reads_per_amp_dir = os.path.join(data_dir, "run_assembly_pipeline.reads_per_amp")
    got = assemble.run_assembly_pipeline(
        ref_fa, amplicon_json, outdir, reads_per_amp_dir=reads_per_amp_dir, **settings
    )
    assert got == expect_seq[10:988]
    assert_matches_consensus_file(got)
    utils.rm_rf(outdir)

    # Rerun, but test force failing the first amplicon
    got = assemble.run_assembly_pipeline(
        ref_fa,
        amplicon_json,
        outdir,
        reads_fastaq=reads_fa,
        amplicons_to_fail={"a1"},
        **settings,
    )
    expect_seq = utils.load_single_seq_fasta(expect_fa)
    # This time, we should not have the first amplicon, and the returned
    # sequence should start with the second amplicon
    assert got == expect_seq[356:988]
    assert_matches_consensus_file(got)

    # some checks of the contents of the json summary
    with open(os.path.join(outdir, "run_info.json")) as f:
        summary = json.load(f)["run_summary"]
    assert summary["made_consensus"] is True
    assert summary["amplicon_success"] == {"a1": False, "a2": True, "a3": True}
    assert summary["successful_amplicons"] == 2
    assert summary["total_amplicons"] == 3
    assert summary["consensus_length"] == 632
    assert summary["consensus_N_count"] == 0
    utils.rm_rf(outdir)
def test_assemble_amplicons():
    """Check amplicon_overlapper.assemble_amplicons as amplicons gain sequences.

    Starts with four amplicons that have no sequence, then marks them as
    assembled one at a time and checks the consensus after each change.
    """
    ref_fasta = os.path.join(data_dir, "assemble_amplicons.ref.fa")
    ref_seq = utils.load_single_seq_fasta(ref_fasta)
    outprefix = "tmp.assemble_amplicons"
    out_glob = f"{outprefix}.*"
    amplicons = [
        amps.Amplicon("a1", 20, 300, 1, 2),
        amps.Amplicon("a2", 240, 550, 3, 4),
        amps.Amplicon("a3", 500, 850, 5, 6),
        amps.Amplicon("a4", 790, 970, 7, 8),
    ]

    def assemble_now():
        return amplicon_overlapper.assemble_amplicons(
            amplicons, ref_fasta, outprefix, debug=True
        )

    def mark_assembled(index, sequence):
        amplicons[index].masked_seq = sequence
        amplicons[index].assemble_success = True

    # No amplicon has a sequence yet
    utils.rm_rf(out_glob)
    consensus = assemble_now()
    assert consensus is None
    utils.rm_rf(out_glob)

    mark_assembled(0, ref_seq[20:301])
    consensus = assemble_now()
    assert consensus == amplicons[0].masked_seq[0:-2]
    utils.rm_rf(out_glob)

    mark_assembled(1, ref_seq[250:545])
    consensus = assemble_now()
    assert consensus == ref_seq[20:541]
    utils.rm_rf(out_glob)

    # The third amplicon is still missing, so a run of Ns is expected
    # between the second and fourth.
    mark_assembled(3, ref_seq[790:952])
    consensus = assemble_now()
    assert consensus == ref_seq[20:541] + "N" * 256 + ref_seq[797:951]
    utils.rm_rf(out_glob)

    # putting in junk for amplicon 2 means it won't overlap amplicons 1 or 3,
    # and we should only get amplicon 0 back
    junk = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTT"
    mark_assembled(2, junk)
    consensus = assemble_now()
    assert consensus == ref_seq[20:299]
    utils.rm_rf(out_glob)