Beispiel #1
0
def test_assemble_transcript_fragments_snv():
    """Assemble the reads found for a chr12 G>C SNV and check the output.

    At least one sequence must be produced, every sequence must carry the
    alternate allele, and every sequence with more than one read name must
    be longer than the longest input read.
    """
    contig = "chr12"
    position = 65857041
    ref_nucleotide = "G"
    alt_nucleotide = "C"

    bam = load_bam("data/cancer-wgs-primary.chr12.bam")
    snv = Variant(
        contig=contig,
        start=position,
        ref=ref_nucleotide,
        alt=alt_nucleotide,
        ensembl=ensembl_grch38)
    supporting_reads = reads_supporting_variant(
        variant=snv, samfile=bam, chromosome=contig)

    initial_sequences = initial_variant_sequences_from_reads(supporting_reads)
    assembled = iterative_overlap_assembly(
        initial_sequences, min_overlap_size=30)

    assert len(assembled) > 0
    longest_read = max(len(read) for read in supporting_reads)
    for seq in assembled:
        summary = "%s%s%s weight=%d length=%d" % (
            seq.prefix, seq.alt, seq.suffix, len(seq.reads), len(seq.sequence))
        print(summary)
        eq_(seq.alt, alt_nucleotide)
        if len(seq.read_names) <= 1:
            # the length expectation only applies to sequences supported
            # by more than one read
            continue
        assert len(seq) > longest_read, \
            "Expected assembled sequences to be longer than read length (%d)" % (
                longest_read,)
Beispiel #2
0
def test_assemble_transcript_fragments_snv():
    """Assemble the reads found for a chr12 G>C SNV and check the output.

    At least one sequence must be produced and every sequence must carry
    the alternate allele. Sequences supported by more than one read must
    also be longer than the longest input read.
    """
    samfile = load_bam("data/cancer-wgs-primary.chr12.bam")
    chromosome = "chr12"
    base1_location = 65857041
    ref = "G"
    alt = "C"
    variant = Variant(
        contig=chromosome,
        start=base1_location,
        ref=ref,
        alt=alt,
        ensembl=ensembl_grch38)
    variant_reads = reads_supporting_variant(
        variant=variant,
        samfile=samfile,
        chromosome=chromosome,)

    sequences = iterative_overlap_assembly(
        initial_variant_sequences_from_reads(variant_reads),
        min_overlap_size=30)

    assert len(sequences) > 0
    max_read_length = max(len(r) for r in variant_reads)
    for s in sequences:
        print("%s%s%s weight=%d length=%d" % (
            s.prefix,
            s.alt,
            s.suffix,
            len(s.reads),
            len(s.sequence)))
        eq_(s.alt, alt)
        # Only sequences supported by more than one read are expected to be
        # longer than a single read; a sequence left with a single
        # supporting read would otherwise fail this check spuriously.
        if len(s.reads) > 1:
            assert len(s) > max_read_length, \
                "Expected assembled sequences to be longer than read length (%d)" % (
                    max_read_length,)
Beispiel #3
0
def test_assembly_time():
    """Check that 400 trimmed copies of one sequence assemble quickly.

    The inputs are every combination of 0-19 characters trimmed from the
    start of the prefix and 0-19 characters trimmed from the end of the
    suffix. Assembly must return a single sequence with the untrimmed
    prefix and suffix, in under 100ms of wall-clock time.
    """
    full_prefix = "ACTGAACCTTGGAAACCCTTTGGG"
    allele = "CCCTTT"
    full_suffix = "GGAAGGAAGGAATTTTTTTTGGCC"

    fragments = []
    for n_trim_start in range(20):
        for n_trim_end in range(20):
            # slicing up to (length - n) keeps the whole suffix when n == 0
            fragments.append(
                VariantSequence(
                    prefix=full_prefix[n_trim_start:],
                    alt=allele,
                    suffix=full_suffix[:len(full_suffix) - n_trim_end],
                    reads={"_".join((str(n_trim_start), str(n_trim_end)))}))
    eq_(len(fragments), 400)

    started = time()
    assembled = iterative_overlap_assembly(
        fragments, min_overlap_size=len(allele))
    elapsed = time() - started

    eq_(len(assembled), 1)
    merged = assembled[0]
    eq_(merged.prefix, full_prefix)
    eq_(merged.suffix, full_suffix)
    assert elapsed < 0.1, \
        "Expected assembly of 400 sequences to take less than 100ms: %0.4fms" % (
            elapsed * 1000,)
Beispiel #4
0
def test_assembly_of_many_subsequences():
    """Assemble 100 trimmed copies of one sequence plus one decoy.

    The 100 inputs are every combination of 0-9 characters trimmed from the
    start of the prefix and 0-9 characters trimmed from the end of the
    suffix. The decoy differs from the full sequence in its first prefix
    character. Two results are expected: first the full sequence carrying
    all 100 reads, then the decoy.
    """
    full_prefix = "ACTGAACCTTGGAAACCCTTTGGG"
    allele = "CCCTTT"
    full_suffix = "GGAAGGAAGGAATTTTTTTT"

    fragments = []
    for n_trim_start in range(10):
        for n_trim_end in range(10):
            # slicing up to (length - n) keeps the whole suffix when n == 0
            fragments.append(
                VariantSequence(
                    prefix=full_prefix[n_trim_start:],
                    alt=allele,
                    suffix=full_suffix[:len(full_suffix) - n_trim_end],
                    reads={"_".join((str(n_trim_start), str(n_trim_end)))}))
    eq_(100, len(fragments))

    # one extra sequence whose first prefix base doesn't match
    mismatched = VariantSequence(
        prefix="G" + full_prefix[1:],
        alt=allele,
        suffix=full_suffix,
        reads={"decoy"})

    assembled = iterative_overlap_assembly(
        fragments + [mismatched],
        min_overlap_size=len(allele))
    eq_(len(assembled), 2)

    merged = assembled[0]
    eq_(merged.prefix, full_prefix)
    eq_(merged.alt, allele)
    eq_(merged.suffix, full_suffix)
    eq_(len(merged.reads), len(fragments))

    leftover = assembled[1]
    eq_(leftover.sequence, mismatched.sequence)
Beispiel #5
0
def test_assembly_of_simple_sequence_from_mock_reads():
    """Assemble four compatible mock reads into a single variant sequence.

    Reads, aligned on the CC allele:
         AAAAA|CC|TTTTT     (dup1)
         AAAAA|CC|TTTTT     (dup2)
        GAAAAA|CC|TTTTTG    (longer)
          AAAA|CC|TTTT      (shorter)
    """
    duplicate_one = AlleleRead(
        prefix="A" * 5, allele="CC", suffix="T" * 5, name="dup1")
    duplicate_two = AlleleRead(
        prefix="A" * 5, allele="CC", suffix="T" * 5, name="dup2")
    longer_read = AlleleRead(
        prefix="G" + "A" * 5, allele="CC", suffix="T" * 5 + "G", name="longer")
    shorter_read = AlleleRead(
        prefix="A" * 4, allele="CC", suffix="T" * 4, name="shorter")
    mock_reads = [duplicate_one, duplicate_two, longer_read, shorter_read]

    expected = VariantSequence(
        prefix="G" + "A" * 5,
        alt="CC",
        suffix="T" * 5 + "G",
        reads=mock_reads)

    starting_sequences = initial_variant_sequences_from_reads(mock_reads)
    # the two duplicate reads should produce one sequence between them
    eq_(len(starting_sequences), len(mock_reads) - 1)

    assembled = iterative_overlap_assembly(
        starting_sequences, min_overlap_size=1)

    # none of the reads contradict each other, so exactly one assembled
    # sequence is expected
    eq_(len(assembled),
        1,
        "Unexpected number of variant sequences: %s" % (assembled,))
    merged = assembled[0]
    eq_(merged, expected)
    eq_(len(merged.reads), len(mock_reads))
    eq_(merged.min_coverage(), 1)

    # (number of bases, number of reads covering them):
    # 2 bases with 1/4 reads, 2 bases with 3/4 reads, 10 bases with 4/4 reads
    coverage_profile = [(2, 1), (2, 3), (10, 4)]
    total_coverage = sum(
        n_bases * n_reads for n_bases, n_reads in coverage_profile)
    expected_mean = total_coverage / 14
    eq_(merged.mean_coverage(), expected_mean)
Beispiel #6
0
def test_assembly_unrelated_sequences():
    """Assemble five sequences of which only the first two overlap.

    Four results are expected: one sequence carrying reads "1" and "2", and
    three results that each carry no more than one read.
    """
    overlapping_pair = [
        VariantSequence(prefix="CCC", alt="T", suffix="GGG", reads={"1"}),
        VariantSequence(prefix="TCCC", alt="T", suffix="G", reads={"2"}),
    ]
    # same prefix as the first sequence but a different suffix
    different_suffix = VariantSequence(
        prefix="CCC", alt="T", suffix="AAA", reads={"3"})
    unrelated = [
        VariantSequence(prefix="AGG", alt="T", suffix="CGG", reads={"4"}),
        VariantSequence(prefix="CAC", alt="T", suffix="TTT", reads={"5"}),
    ]
    inputs = overlapping_pair + [different_suffix] + unrelated

    assembled = iterative_overlap_assembly(inputs, min_overlap_size=1)
    eq_(len(assembled), 4)

    multi_read = [seq for seq in assembled if len(seq.reads) > 1]
    single_read = [seq for seq in assembled if not len(seq.reads) > 1]
    # any multi-read result must be the merge of the first two inputs
    for seq in multi_read:
        eq_(seq.reads, {"1", "2"})
    eq_(3, len(single_read))
    eq_(1, len(multi_read))
Beispiel #7
0
def test_assembly_1_sequence():
    """Assembling a single sequence returns a list holding just that sequence."""
    only_sequence = VariantSequence(
        prefix="CCC", alt="T", suffix="GGG", reads={"1"})
    assembled = iterative_overlap_assembly([only_sequence])
    eq_(assembled, [only_sequence])
Beispiel #8
0
def test_assembly_no_sequences():
    """Assembling an empty list of sequences returns an empty list."""
    no_sequences = []
    assembled = iterative_overlap_assembly(no_sequences)
    eq_(assembled, [])