Example #1
0
def test_load_amplicons_json_file():
    """Loading the JSON fixture must give back the two hand-built amplicons."""
    expected = [
        amplicons.Amplicon(*fields)
        for fields in (
            ("name1", 42, 99, 5, 3),
            ("name2", 90, 150, 2, 9),
        )
    ]
    json_path = os.path.join(data_dir, "load_amplicons_json_file.json")
    loaded = amplicons.load_amplicons_json_file(json_path)
    assert expected == loaded
Example #2
0
def test_assemble_amplicons():
    """Check assemble_amplicons as amplicons are given sequences one by one.

    Starts with four amplicons that have no masked sequence (expecting None),
    then sets masked_seq/assemble_success on them step by step and compares
    the returned consensus against slices of the reference fixture.
    """
    ref_fasta = os.path.join(data_dir, "assemble_amplicons.ref.fa")
    ref_seq = utils.load_single_seq_fasta(ref_fasta)
    amplicons = [
        amps.Amplicon(*fields)
        for fields in (
            ("a1", 20, 300, 1, 2),
            ("a2", 240, 550, 3, 4),
            ("a3", 500, 850, 5, 6),
            ("a4", 790, 970, 7, 8),
        )
    ]
    outprefix = "tmp.assemble_amplicons"
    tmp_files = f"{outprefix}.*"

    def run_assembly():
        # Same call every time; only the amplicon state changes between runs.
        return amplicon_overlapper.assemble_amplicons(
            amplicons, ref_fasta, outprefix, debug=True
        )

    def mark_assembled(index, seq):
        amplicons[index].masked_seq = seq
        amplicons[index].assemble_success = True

    # Nothing has a sequence yet
    utils.rm_rf(tmp_files)
    consensus = run_assembly()
    assert consensus is None
    utils.rm_rf(tmp_files)

    # Only the first amplicon has a sequence
    mark_assembled(0, ref_seq[20:301])
    consensus = run_assembly()
    assert consensus == amplicons[0].masked_seq[0:-2]
    utils.rm_rf(tmp_files)

    # First two amplicons have sequences
    mark_assembled(1, ref_seq[250:545])
    consensus = run_assembly()
    assert consensus == ref_seq[20:541]
    utils.rm_rf(tmp_files)

    # Fourth amplicon added while the third is still missing
    mark_assembled(3, ref_seq[790:952])
    consensus = run_assembly()
    assert consensus == ref_seq[20:541] + "N" * 256 + ref_seq[797:951]
    utils.rm_rf(tmp_files)

    # putting in junk for amplicon 2 means it won't overlap amplicons 1 or 3,
    # and we should only get amplicon 0 back
    mark_assembled(
        2,
        "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTT",
    )
    consensus = run_assembly()
    assert consensus == ref_seq[20:299]
    utils.rm_rf(tmp_files)
Example #3
0
def test_get_amplicon_overlaps():
    """get_amplicon_overlaps returns one entry per consecutive amplicon pair.

    The first amplicon has no masked_seq, and the expected first entry is
    None. The other two sequences share an 8-long match, which is reported
    with a second argument of 8 but not with 9.
    """
    amplicons = [
        amps.Amplicon(name, start, end, 1, 1)
        for name, start, end in (
            ("a1", 10, 100),
            ("a2", 10, 100),
            ("a3", 110, 142),
        )
    ]
    amplicons[1].masked_seq = "AAAAAAAAAAAAAAAAAATGCTGAACAGTCCCCCCC"
    amplicons[2].masked_seq = (
        "CCTGCTGAACGGTTGATGCATCTCATGCTGACNNAGGTGTGGCCAAAAA"
    )

    # NOTE(review): Match is not defined in this function; presumably a
    # module-level namedtuple - confirm against the rest of the file.
    expect_with_8 = [None, Match(18, 2, 8)]
    overlaps_with_8 = amplicon_overlapper.get_amplicon_overlaps(amplicons, 8)
    assert overlaps_with_8 == expect_with_8

    overlaps_with_9 = amplicon_overlapper.get_amplicon_overlaps(amplicons, 9)
    assert overlaps_with_9 == [None, None]
Example #4
0
def test_use_read_for_polishing():
    """Table-driven check of Amplicon.use_read_for_polishing.

    Each scenario places a mock read at given reference coordinates and then
    tries one or more thresholds. With wgs=False the threshold is passed as
    the second argument (third is None); with wgs=True it is passed as the
    third argument (second is None).
    """
    amplicon = amplicons.Amplicon("name", 50, 100, 1, 1)
    read = mock.Mock()

    scenarios = [
        # (read start, read end, wgs, [(threshold, read should be used), ...])
        # Test start of read is within X bp of start of amplicon
        (48, 75, False, [(2, True), (1, False)]),
        # Test end of read is within X bp of end of amplicon
        (75, 103, False, [(3, True), (2, False)]),
        # Test overlapping read when wgs is True
        (1, 49, True, [(10, False)]),
        (1, 60, True, [(11, True), (12, False)]),
        (60, 69, True, [(10, True), (11, False)]),
        (20, 110, True, [(51, True), (52, False)]),
    ]

    for ref_start, ref_end, wgs, checks in scenarios:
        read.reference_start = ref_start
        read.reference_end = ref_end
        for threshold, expect_used in checks:
            positional = (None, threshold) if wgs else (threshold, None)
            used = amplicon.use_read_for_polishing(read, *positional, wgs=wgs)
            assert bool(used) == expect_used
Example #5
0
def test_masked_seq_centre_coord():
    """masked_seq_centre_coord is None without a sequence, else its midpoint."""
    amplicon = amplicons.Amplicon("name", 0, 10, 1, 1)
    assert amplicon.masked_seq_centre_coord() is None

    # (masked sequence, expected centre coordinate)
    cases = (
        ("ACT", 1),
        ("ACTG", 2),
        ("ACTGA", 2),
        ("ACTGAT", 3),
    )
    for seq, centre in cases:
        amplicon.masked_seq = seq
        assert amplicon.masked_seq_centre_coord() == centre
Example #6
0
def test_ref_centre_coord():
    """ref_centre_coord for a range of amplicon start/end coordinates."""
    # (amplicon start, amplicon end, expected centre)
    cases = (
        (0, 9, 5),
        (0, 10, 5),
        (0, 11, 6),
        (0, 12, 6),
        (0, 13, 7),
        (10, 19, 15),
        (10, 20, 15),
        (10, 21, 16),
        (10, 22, 16),
        (10, 23, 17),
    )
    for start, end, centre in cases:
        amplicon = amplicons.Amplicon("name", start, end, 1, 1)
        assert amplicon.ref_centre_coord() == centre
Example #7
0
def test_amplicons_to_consensus_contigs_2():
    """Regression test for amplicons_to_consensus_contigs with failed overlaps.

    With no masked sequences the function is expected to return None. Once all
    six amplicons have sequences (the third and fourth carrying junk so they
    do not match their neighbours), two separate contigs are expected.
    """
    # This hits case not seen in previous test. Need a combination of amplicons
    # that pass fail pass fail pass. Was a bug where new contig was not being
    # started when we had two amplicons that didn't overlap and got removed
    # ref is 130bp of random sequence
    #                10        20        30        40        50        60        70        80        90        100       110       120
    #      0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789
    ref = "GGCAACAAGCCCCGTAACCCAGCTCACCAGCGAATCACAAGTGTTAAGAGACAAAGAAGCGGCAGAACACTTTGGCTCCTATGCACAGCGCGGACCAACAGTCGATCGTTATCCTAACTCTACATACTAA"
    amplicons = [
        amps.Amplicon("amp1", 0, 30, 1, 1),
        amps.Amplicon("amp2", 20, 50, 1, 2),
        amps.Amplicon("amp3", 40, 70, 1, 1),
        amps.Amplicon("amp4", 60, 90, 1, 1),
        amps.Amplicon("amp5", 80, 110, 2, 1),
        amps.Amplicon("amp6", 100, 130, 1, 3),
    ]
    got_contigs = amplicon_overlapper.amplicons_to_consensus_contigs(
        amplicons, 7)
    # Identity comparison is the correct way to test for the None singleton.
    assert got_contigs is None

    # One masked sequence per amplicon, in order. amp3 ends in junk and amp4
    # starts with junk.
    masked_seqs = [
        ref[0:30],
        ref[20:50],
        ref[40:55] + "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAA",
        "GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG" + ref[75:90],
        ref[80:110],
        ref[100:130],
    ]
    for amplicon, seq in zip(amplicons, masked_seqs):
        amplicon.masked_seq = seq
        amplicon.assemble_success = True

    got_contigs = amplicon_overlapper.amplicons_to_consensus_contigs(
        amplicons, 7)
    assert got_contigs == [ref[0:48], ref[82:129]]
Example #8
0
def test_get_reads_for_polishing():
    """Check Amplicon.get_reads_for_polishing against the fixture BAM.

    First case: an amplicon on "ref1" is expected to report 6 reads, 4 used,
    a coverage between 4.49 and 4.50, and an output FASTA identical to the
    expected fixture. Second case: an amplicon on "ref2" is expected to report
    zeros and to leave no output file behind.
    """
    reads_bam = os.path.join(data_dir, "get_reads_for_polishing.bam")
    reads_out = "tmp.get_reads_for_polishing.reads.fa"
    # Use the BAM as a context manager so the handle is closed even when an
    # assertion fails.
    with pysam.AlignmentFile(reads_bam, "rb") as bam:
        utils.rm_rf(reads_out)
        amplicon = amplicons.Amplicon("amp1", 59, 419, 1, 1)

        got_reads, got_used, got_cov = amplicon.get_reads_for_polishing(
            "ref1",
            bam,
            reads_out,
            min_coverage=1,
            trim_ends=5,
            tolerance=1,
            min_output_length=300,
            target_depth=3,
        )
        assert got_reads == 6
        assert got_used == 4
        # The second positional parameter of pytest.approx is the *relative*
        # tolerance, so approx(4.49, 4.50) accepted anything within 450% of
        # 4.49. Presumably the intent was "between 4.49 and 4.50", so assert
        # that window directly.
        assert 4.49 <= got_cov <= 4.50
        expect_reads = os.path.join(data_dir,
                                    "get_reads_for_polishing.expect.fa")
        assert filecmp.cmp(reads_out, expect_reads, shallow=False)
        os.unlink(reads_out)

        amplicon = amplicons.Amplicon("amp1", 50, 100, 1, 1)
        got_reads, got_used, got_cov = amplicon.get_reads_for_polishing(
            "ref2",
            bam,
            reads_out,
            min_coverage=1,
            trim_ends=5,
            tolerance=1,
            min_output_length=30,
            target_depth=1,
        )
        assert got_reads == 0
        assert got_used == 0
        assert got_cov == 0
        assert not os.path.exists(reads_out)
Example #9
0
def test_masked_overlap():
    """Check Amplicon.masked_overlap for missing, matching and short overlaps.

    None is expected whenever either amplicon lacks a masked sequence, or when
    the match found is shorter than the requested minimum. Otherwise a
    Match(a, b, size) tuple is expected.
    """
    Match = collections.namedtuple("Match", ("a", "b", "size"))
    amp1 = amplicons.Amplicon("a1", 50, 100, 1, 1)
    amp2 = amplicons.Amplicon("a2", 90, 150, 1, 1)

    # Neither, then only one, of the amplicons has a masked sequence
    assert amp1.masked_overlap(amp2, 10) is None
    amp2.masked_seq = "AAAAAAAAAAAAAAAAAATGCTGAACAGTCCCCCCC"
    assert amp1.masked_overlap(amp2, 10) is None
    amp1.masked_seq = "AAAAAAAAAAAAAAAAAATGCTGAACAGTCCCCCCC"
    amp2.masked_seq = None
    assert amp1.masked_overlap(amp2, 10) is None

    # Both have sequences; the expected match has size 8
    amp2.masked_seq = "CCTGCTGAACGGTTGATGCATCTCATGCTGACNNAGGTGTGGCCAAAAA"
    assert amp1.masked_overlap(amp2, 7) == Match(18, 2, 8)
    assert amp1.masked_overlap(amp2, 8) == Match(18, 2, 8)
    # `is None` throughout: identity is the correct test for the None
    # singleton and matches the assertions at the top of this test.
    assert amp1.masked_overlap(amp2, 9) is None

    amp1.masked_seq = "CCTGCTGAACGGTTGATGCATCTCATGCTGACNNAGGTGTGGCCAAAAA"
    amp2.masked_seq = "NNAGGTGTGGCCTTTTTTTTTTTTTTTTTTTTTTTTTT"
    assert amp1.masked_overlap(amp2, 10) == Match(34, 2, 10)
    assert amp1.masked_overlap(amp2, 11) is None

    amp1.masked_seq = "NNAGGTGTGGCCTTTTTTTTTTTTTTTTTTTTTTTTTT"
    amp2.masked_seq = "AAAAAAAAAAAAAAAAANNNNNNNNNNN"
    assert amp1.masked_overlap(amp2, 0) == Match(2, 0, 1)
    assert amp1.masked_overlap(amp2, 2) is None
Example #10
0
def test_expected_overlap_length():
    """Amplicon 0-10 overlaps 8-20 by 3, and has no overlap (None) with 11-20."""
    first = amplicons.Amplicon("name", 0, 10, 1, 1)
    overlapping = amplicons.Amplicon("name", 8, 20, 1, 1)
    not_overlapping = amplicons.Amplicon("name", 11, 20, 1, 1)

    overlap_len = first.expected_overlap_length(overlapping)
    assert overlap_len == 3

    no_overlap = first.expected_overlap_length(not_overlapping)
    assert no_overlap is None
Example #11
0
def test_polish():
    """Run Amplicon.polish three ways and check the resulting masked sequence.

    1. Reads sliced from the fixture BAM: expect success with masked ends.
    2. Reads supplied as a FASTA file: expect success with no masking.
    3. BAM again with a higher min_depth_for_not_N and lower
       max_polished_N_prop: expect the middle to be masked and the amplicon
       to be marked as failed.
    """
    ref_fasta = os.path.join(data_dir, "polish.ref.fa")
    ref_genome = utils.load_single_seq_fasta(ref_fasta)
    amplicon = amplicons.Amplicon("amplicon1", 60, 259, 1, 1)
    reads_bam = os.path.join(data_dir, "polish.bam")
    outdir = "tmp.polish.out"
    # Open the BAM as a context manager so the handle is always closed; the
    # original left it open for the lifetime of the test process.
    with pysam.AlignmentFile(reads_bam, "rb") as bam:
        utils.rm_rf(outdir)
        amplicon.polish(
            ref_genome,
            outdir,
            bam_to_slice_reads=bam,
            min_mean_coverage=3,
            racon_iterations=3,
            min_depth_for_not_N=3,
            min_read_length=100,
            max_polished_N_prop=0.5,
            debug=True,
        )
        assert (
            amplicon.masked_seq ==
            "NNNNNNNNNNNNNNNNNNNNAAAGCCCCATTTTGTACAGCTTTTTCTAGAACAGTCAGGGCGCGCTCCCAGGAGTTGCTTCGCTTCCAGCTAGAAATGATCATCGAACCTGGGTAAGGGCATAATACGAGAATGCTGCCCTATTGCCAGTGCTTAGAAATGGACTGGTGTTACGTCCACGNNNNNNNNNNNNNNNNNNNNN"
        )
        assert amplicon.assemble_success
        assert amplicon.polish_data["Polish success"]
        utils.rm_rf(outdir)

        # Same again, but this time use the fasta of reads instead of the BAM
        # file. Plus, this is giving untrimmed reads, so we get less masking.
        # In the previous run 20bp trimmed off all the reads
        reads_file = os.path.join(data_dir, "polish.reads.fa")
        amplicon.polish(
            ref_genome,
            outdir,
            reads_file=reads_file,
            min_mean_coverage=3,
            racon_iterations=3,
            min_depth_for_not_N=3,
            min_read_length=100,
            max_polished_N_prop=0.5,
            debug=True,
        )
        assert (
            amplicon.masked_seq ==
            "CGTTAATCCTAGGGCAGTTAAAAGCCCCATTTTGTACAGCTTTTTCTAGAACAGTCAGGGCGCGCTCCCAGGAGTTGCTTCGCTTCCAGCTAGAAATGATCATCGAACCTGGGTAAGGGCATAATACGAGAATGCTGCCCTATTGCCAGTGCTTAGAAATGGACTGGTGTTACGTCCACGAAATCTGCAACAAGCCCGGT"
        )
        assert amplicon.assemble_success
        assert amplicon.polish_data["Polish success"]
        utils.rm_rf(outdir)

        # The reads are such that there's a dip in coverage in the middle of
        # the amplicon. Setting min_depth_for_not_N higher makes this region
        # get masked, and then the amplicon should get failed
        amplicon = amplicons.Amplicon("amplicon1", 60, 259, 1, 1)
        amplicon.polish(
            ref_genome,
            outdir,
            bam_to_slice_reads=bam,
            min_mean_coverage=3,
            racon_iterations=3,
            min_depth_for_not_N=18,
            min_read_length=50,
            max_polished_N_prop=0.1,
            debug=True,
        )
        assert (
            amplicon.masked_seq ==
            "NNNNNNNNNNNNNNNNNNNNAAAGCCCCATTTTGTACAGCTTTTTCTAGAACAGTCAGGGCGCGCTCCCAGGAGTTGCTTCGCTNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNGAATGCTGCCCTATTGCCAGTGCTTAGAAATGGACTGGTGTTACGTCCACGNNNNNNNNNNNNNNNNNNNNN"
        )
        assert not amplicon.assemble_success
        assert not amplicon.polish_data["Polish success"]
        utils.rm_rf(outdir)
Example #12
0
def test_amplicons_to_consensus_contigs():
    """Step through amplicons_to_consensus_contigs with changing amplicon state.

    Starts with three amplicons lacking sequences (None expected), then adds,
    removes and replaces masked sequences and checks the list of contigs
    returned after each change.
    """
    # ref is 100bp of random sequence
    #                10        20        30        40        50        60        70        80        90
    #      0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789
    # ref = "GGCAACAAGCCCCGTAACCCAGCTCACCAGCGAATCACAAGTGTTAAGAGACAAAGAAGCGGCAGAACACTTTGGCTCCTATGCACAGCGCGGACCAACA"
    amplicons = [
        amps.Amplicon("amp1", 9, 39, 1, 2),
        amps.Amplicon("amp2", 24, 75, 3, 4),
        amps.Amplicon("amp3", 63, 99, 5, 6),
    ]
    got_contigs = amplicon_overlapper.amplicons_to_consensus_contigs(
        amplicons, 20)
    # Identity comparison is the correct way to test for the None singleton.
    assert got_contigs is None

    # Only the first amplicon has a sequence
    amplicons[0].masked_seq = "CCCCGTAACCGAGCTCACCAGCGAATCACAA"
    amplicons[0].assemble_success = True
    got_contigs = amplicon_overlapper.amplicons_to_consensus_contigs(
        amplicons, 10)
    assert got_contigs == [amplicons[0].masked_seq[0:-2]]

    # First two amplicons have sequences: expect one merged contig
    amplicons[
        1].masked_seq = "TCACCAGCGAATCACAAGTGTTAAGAGACAAAGAAGCGGCAGAACACTTTGG"
    amplicons[1].assemble_success = True
    got_contigs = amplicon_overlapper.amplicons_to_consensus_contigs(
        amplicons, 10)
    expect = ["CCCCGTAACCGAGCTCACCAGCGAATCACAAGTGTTAAGAGACAAAGAAGCGGCAGAACACT"]
    assert got_contigs == expect

    # All three amplicons have sequences: still one contig
    amplicons[2].masked_seq = "NNCAGAACACTTTGGCTCCTATGCACAGCGCGGACCAANN"
    amplicons[2].assemble_success = True
    got_contigs = amplicon_overlapper.amplicons_to_consensus_contigs(
        amplicons, 10)
    expect = [
        "CCCCGTAACCGAGCTCACCAGCGAATCACAAGTGTTAAGAGACAAAGAAGCGGCAGAACACTTTGGCTCCTATGCACAGCGCGGACCAA"
    ]
    assert got_contigs == expect

    # Middle amplicon removed: expect two separate contigs
    amplicons[1].masked_seq = None
    amplicons[1].assemble_success = False
    got_contigs = amplicon_overlapper.amplicons_to_consensus_contigs(
        amplicons, 10)
    expect = [
        amplicons[0].masked_seq[0:-2], amplicons[2].masked_seq[5:].rstrip("N")
    ]
    assert got_contigs == expect

    # First amplicon removed as well: only the last one remains
    amplicons[0].masked_seq = None
    amplicons[0].assemble_success = False
    got_contigs = amplicon_overlapper.amplicons_to_consensus_contigs(
        amplicons, 10)
    expect = [amplicons[2].masked_seq[5:].rstrip("N")]
    assert got_contigs == expect

    # First two have sequences again but are flagged as not assembled
    amplicons[0].masked_seq = "AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
    amplicons[
        1].masked_seq = "TCACCAGCGAATCACAAGTGTTAAGAGACAAAGAAGCGGCAGAACACTTTGG"
    amplicons[2].masked_seq = "NNNNNNACACTTTGGCTCCTATGCACAGCGCGGANNNNNN"
    amplicons[0].assemble_success = False
    amplicons[1].assemble_success = False
    got_contigs = amplicon_overlapper.amplicons_to_consensus_contigs(
        amplicons, 10)
    expect = [amplicons[2].masked_seq[5:].strip("N")]
    assert got_contigs == expect
            "start": 350,
            "end": 799,
            "left_primer_end": 355,
            "right_primer_start": 790,
        },
        "a3": {
            "start": 740,
            "end": 989,
            "left_primer_end": 745,
            "right_primer_start": 980,
        },
    }
}

# Amplicon objects used by the rest of this script.
amplicons = [
    amps.Amplicon(*fields)
    for fields in (
        ("a1", 10, 399, 0, 10),
        ("a2", 350, 799, 6, 10),
        ("a3", 740, 989, 6, 10),
    )
]

# Sequence of each amplicon, taken from ref_for_amplicons with both the start
# and end coordinates included.
amp_seqs = [
    "".join(ref_for_amplicons[amp.start:amp.end + 1]) for amp in amplicons
]

# Start from an empty per-amplicon reads directory.
amp_to_seq_files_dir = "run_assembly_pipeline.reads_per_amp"
subprocess.check_output(f"rm -rf {amp_to_seq_files_dir}", shell=True)
os.mkdir(amp_to_seq_files_dir)

# Manifest mapping each amplicon name to the name of its reads file.
amp_to_seq_files = {
    "a1": "reads.1.fa",
    "a2": "reads.2.fa",
    "a3": "reads.3.fa",
}
with open(os.path.join(amp_to_seq_files_dir, "manifest.json"), "w") as f:
    json.dump(amp_to_seq_files, f)

with open("run_assembly_pipeline.reads.fa", "w") as f_all:
    for i, seq in enumerate(amp_seqs):
        amp_name = amplicons[i].name