Example #1
0
def test_assemble_transcript_fragments_snv():
    samfile = load_bam("data/cancer-wgs-primary.chr12.bam")
    chromosome = "chr12"
    base1_location = 65857041
    ref = "G"
    alt = "C"
    variant = Variant(
        contig=chromosome,
        start=base1_location,
        ref=ref,
        alt=alt,
        ensembl=ensembl_grch38)
    variant_reads = reads_supporting_variant(
        variant=variant,
        samfile=samfile,
        chromosome=chromosome,)

    sequences = iterative_overlap_assembly(
        initial_variant_sequences_from_reads(variant_reads),
        min_overlap_size=30)

    assert len(sequences) > 0
    max_read_length = max(len(r) for r in variant_reads)
    for s in sequences:
        print("%s%s%s weight=%d length=%d" % (
            s.prefix,
            s.alt,
            s.suffix,
            len(s.reads),
            len(s.sequence)))
        eq_(s.alt, alt)
        assert len(s) > max_read_length, \
            "Expected assembled sequences to be longer than read length (%d)" % (
                max_read_length,)
Example #2
0
def test_assemble_transcript_fragments_snv():
    samfile = load_bam("data/cancer-wgs-primary.chr12.bam")
    chromosome = "chr12"
    base1_location = 65857041
    ref = "G"
    alt = "C"
    variant = Variant(contig=chromosome,
                      start=base1_location,
                      ref=ref,
                      alt=alt,
                      ensembl=ensembl_grch38)
    variant_reads = reads_supporting_variant(
        variant=variant,
        samfile=samfile,
        chromosome=chromosome,
    )

    sequences = iterative_overlap_assembly(
        initial_variant_sequences_from_reads(variant_reads),
        min_overlap_size=30)

    assert len(sequences) > 0
    max_read_length = max(len(r) for r in variant_reads)
    for s in sequences:
        print("%s%s%s weight=%d length=%d" %
              (s.prefix, s.alt, s.suffix, len(s.reads), len(s.sequence)))
        eq_(s.alt, alt)
        if len(s.read_names) > 1:
            # expect sequences supported by more than one read to be greater
            # than the read length
            assert len(s) > max_read_length, \
                "Expected assembled sequences to be longer than read length (%d)" % (
                    max_read_length,)
Example #3
0
def test_assembly_of_simple_sequence_from_mock_reads():
    # Read sequences:
    #    AAAAA|CC|TTTTT
    #    AAAAA|CC|TTTTT
    #   GAAAAA|CC|TTTTTG
    #     AAAA|CC|TTTT
    reads = [
        # two identical reads with sequence AAAAA|CC|TTTTT
        AlleleRead(prefix="A" * 5, allele="CC", suffix="T" * 5, name="dup1"),
        AlleleRead(prefix="A" * 5, allele="CC", suffix="T" * 5, name="dup2"),
        # longer sequence GAAAAA|CC|TTTTTG
        AlleleRead(prefix="G" + "A" * 5,
                   allele="CC",
                   suffix="T" * 5 + "G",
                   name="longer"),
        # shorter sequence AAAA|CC|TTTT
        AlleleRead(prefix="A" * 4, allele="CC", suffix="T" * 4,
                   name="shorter"),
    ]
    expected_variant_sequence = VariantSequence(prefix="G" + "A" * 5,
                                                alt="CC",
                                                suffix="T" * 5 + "G",
                                                reads=reads)
    initial_variant_sequences = initial_variant_sequences_from_reads(reads)
    # expecting one fewer sequence than reads since two of the reads are
    # duplicates
    eq_(len(initial_variant_sequences), len(reads) - 1)

    # calling into either iterative_overlap_assembly or greedy_merge should
    # give same results
    for fn in [greedy_merge, iterative_overlap_assembly]:

        assembled_variant_sequences = fn(initial_variant_sequences,
                                         min_overlap_size=1)

        # since no reads contradict each other then we should get back a single
        # assembled sequence
        eq_(
            len(assembled_variant_sequences), 1,
            "Unexpected number of variant sequences: %s" %
            (assembled_variant_sequences, ))
        assembled_variant_sequence = assembled_variant_sequences[0]
        eq_(assembled_variant_sequence, expected_variant_sequence)

        eq_(len(assembled_variant_sequence.reads), len(reads))

        eq_(assembled_variant_sequence.min_coverage(), 1)
        # 2 bases with 1/4 reads, 2 bases with 3/4 reads, remaining 10 bases with
        # all 4/4 reads
        expected_mean_coverage = (2 * 1 + 2 * 3 + 10 * 4) / 14
        eq_(assembled_variant_sequence.mean_coverage(), expected_mean_coverage)
Example #4
0
def test_assembly_of_simple_sequence_from_mock_reads():
    #
    # Read sequences:
    #    AAAAA|CC|TTTTT
    #    AAAAA|CC|TTTTT
    #   GAAAAA|CC|TTTTTG
    #     AAAA|CC|TTTT
    reads = [
        # two identical reads with sequence AAAAA|CC|TTTTT
        AlleleRead(prefix="A" * 5, allele="CC", suffix="T" * 5, name="dup1"),
        AlleleRead(prefix="A" * 5, allele="CC", suffix="T" * 5, name="dup2"),
        # longer sequence GAAAAA|CC|TTTTTG
        AlleleRead(prefix="G" + "A" * 5, allele="CC", suffix="T" * 5 + "G", name="longer"),
        # shorter sequence AAAA|CC|TTTT
        AlleleRead(prefix="A" * 4, allele="CC", suffix="T" * 4, name="shorter"),
    ]
    expected_variant_sequence = VariantSequence(
        prefix="G" + "A" * 5, alt="CC", suffix="T" * 5 + "G", reads=reads)
    initial_variant_sequences = initial_variant_sequences_from_reads(reads)
    # expecting one fewer sequence than reads since two of the reads are
    # duplicates
    eq_(len(initial_variant_sequences), len(reads) - 1)

    assembled_variant_sequences = iterative_overlap_assembly(
        initial_variant_sequences,
        min_overlap_size=1)

    # since no reads contradict each other then we should get back a single
    # assembled sequence
    eq_(len(assembled_variant_sequences),
        1,
        "Unexpected number of variant sequences: %s" % (assembled_variant_sequences,))
    assembled_variant_sequence = assembled_variant_sequences[0]
    eq_(assembled_variant_sequence, expected_variant_sequence)

    eq_(len(assembled_variant_sequence.reads), len(reads))

    eq_(assembled_variant_sequence.min_coverage(), 1)
    # 2 bases with 1/4 reads, 2 bases with 3/4 reads, remaining 10 bases with
    # all 4/4 reads
    expected_mean_coverage = (2 * 1 + 2 * 3 + 10 * 4) / 14
    eq_(assembled_variant_sequence.mean_coverage(), expected_mean_coverage)