Ejemplo n.º 1
0
def test_somatic_variant_with_2_supporting_rna_reads():
    """
    Check alt-allele read support for the 14:105849746 G>A variant in the
    normal, tumor and RNA BAMs: 0, 8 and 2 supporting reads respectively.

    Every supporting RNA read must carry one of the expected read names.
    """
    variant = Variant("14", 105849746, "G", "A", grch38)
    base_dir = "data/somatic-variant-with-2-supporting-rna-reads/"
    normal_reads = load_bam(
        base_dir + "normal.14.105849746.G.A.no-alt.sorted.bam")
    tumor_reads = load_bam(
        base_dir + "tumor.14.105849746.G.A.many-alt.sorted.bam")
    rna_reads = load_bam(base_dir + "rna.14.105849746.G.A.2-alt.sorted.bam")
    read_collector = ReadCollector()

    # Each sample's reads are printed *before* the count is asserted so the
    # diagnostic output is still emitted when the assertion fails.
    normal_sample_variant_reads = \
        read_collector.allele_reads_supporting_variant(
            variant=variant, alignment_file=normal_reads)
    print(normal_sample_variant_reads)
    eq_(len(normal_sample_variant_reads), 0)

    tumor_sample_variant_reads = \
        read_collector.allele_reads_supporting_variant(
            variant=variant, alignment_file=tumor_reads)
    print(tumor_sample_variant_reads)
    eq_(len(tumor_sample_variant_reads), 8)

    rna_sample_variant_reads = \
        read_collector.allele_reads_supporting_variant(
            variant=variant, alignment_file=rna_reads)
    print(rna_sample_variant_reads)
    eq_(len(rna_sample_variant_reads), 2)

    # Arun went through the hassle of pulling out the exact read names
    # in IGV
    expected_variant_rna_read_names = {
        "K00193:50:H5NKVBBXX:5:2202:6421:24964",
        "K00193:50:H5NKVBBXX:5:2119:30908:1138",
    }
    for variant_read in rna_sample_variant_reads:
        assert variant_read.name in expected_variant_rna_read_names
def test_somatic_variant_with_2_supporting_rna_reads():
    """
    Check alt-allele read support for the 14:105849746 G>A variant in the
    normal, tumor and RNA BAMs: 0, 8 and 2 supporting reads respectively.

    Every supporting RNA read must carry one of the expected read names.
    """
    variant = Variant("14", 105849746, "G", "A")
    base_dir = "data/somatic-variant-with-2-supporting-rna-reads/"
    normal_reads = load_bam(
        base_dir + "normal.14.105849746.G.A.no-alt.sorted.bam")
    tumor_reads = load_bam(
        base_dir + "tumor.14.105849746.G.A.many-alt.sorted.bam")
    rna_reads = load_bam(base_dir + "rna.14.105849746.G.A.2-alt.sorted.bam")

    # Each sample's reads are printed *before* the count is asserted so the
    # diagnostic output is still emitted when the assertion fails.
    normal_sample_variant_reads = reads_supporting_variant(
        variant=variant,
        samfile=normal_reads)
    print(normal_sample_variant_reads)
    eq_(len(normal_sample_variant_reads), 0)

    tumor_sample_variant_reads = reads_supporting_variant(
        variant=variant,
        samfile=tumor_reads)
    print(tumor_sample_variant_reads)
    eq_(len(tumor_sample_variant_reads), 8)

    rna_sample_variant_reads = reads_supporting_variant(
        variant=variant,
        samfile=rna_reads)
    print(rna_sample_variant_reads)
    eq_(len(rna_sample_variant_reads), 2)

    # Arun went through the hassle of pulling out the exact read names
    # in IGV
    expected_variant_rna_read_names = {
        "K00193:50:H5NKVBBXX:5:2202:6421:24964",
        "K00193:50:H5NKVBBXX:5:2119:30908:1138",
    }
    for variant_read in rna_sample_variant_reads:
        assert variant_read.name in expected_variant_rna_read_names
Ejemplo n.º 3
0
def test_most_common_nucleotides_for_chr12_deletion():
    """
    Build a consensus from the reads supporting a 19bp deletion on chr12
    and check that the per-position counts line up with the consensus and
    that more than half of the reads are contained in it.
    """
    samfile = load_bam("data/cancer-wgs-primary.chr12.bam")
    chromosome = "chr12"
    deletion = Variant(
        chromosome,
        70091490,
        "TTGTAGATGCTGCCTCTCC",
        "",
        ensembl=ensembl_grch38)
    supporting_reads = reads_supporting_variant(
        samfile=samfile, chromosome=chromosome, variant=deletion)

    consensus_sequence, chosen_counts, other_counts = \
        most_common_nucleotides(supporting_reads)
    print(chosen_counts)
    print(other_counts)

    # both count vectors must have one entry per consensus position
    eq_(len(chosen_counts), len(consensus_sequence))
    eq_(len(other_counts), len(consensus_sequence))
    assert other_counts.sum() < chosen_counts.sum(), \
        "Counts for alternate nucleotides should not exceed the chosen sequence"

    # a read "matches" when its full sequence is a substring of the consensus
    n_matching = sum(
        1 for read in supporting_reads
        if (read.prefix + read.allele + read.suffix) in consensus_sequence)
    fraction_matching = n_matching / float(len(supporting_reads))
    print("Fraction matching reads is %d/%d = %f" % (
        n_matching, len(supporting_reads), fraction_matching))
    assert fraction_matching > 0.5, \
        "Expected majority of reads to match consensus sequence"
Ejemplo n.º 4
0
def test_sequence_counts_snv():
    """
    Reads supporting the chr12:65857041 G>C SNV, with a preferred sequence
    length of 61, should give exactly one variant sequence: 30 nucleotides
    of prefix, the alt allele, and 30 nucleotides of suffix.
    """
    samfile = load_bam("data/cancer-wgs-primary.chr12.bam")
    chromosome = "chr12"
    alt = "C"
    snv = Variant(chromosome, 65857041, "G", alt)

    supporting_reads = reads_supporting_variant(
        samfile=samfile, chromosome=chromosome, variant=snv)
    sequences = reads_to_variant_sequences(
        variant=snv, reads=supporting_reads, preferred_sequence_length=61)

    assert len(sequences) == 1
    for seq in sequences:
        print(seq)
        eq_(seq.alt, alt)
        eq_(len(seq.prefix), 30)
        eq_(len(seq.suffix), 30)
        # the three parts must concatenate back to the full sequence
        eq_(seq.prefix + seq.alt + seq.suffix, seq.sequence)
Ejemplo n.º 5
0
def test_assemble_transcript_fragments_snv():
    """
    Assemble the reads supporting the chr12:65857041 G>C SNV and check that
    every assembled sequence keeps the alt allele and is longer than the
    longest input read.
    """
    samfile = load_bam("data/cancer-wgs-primary.chr12.bam")
    chromosome = "chr12"
    alt = "C"
    snv = Variant(
        contig=chromosome,
        start=65857041,
        ref="G",
        alt=alt,
        ensembl=ensembl_grch38)
    supporting_reads = reads_supporting_variant(
        variant=snv, samfile=samfile, chromosome=chromosome)

    assembled = iterative_overlap_assembly(
        initial_variant_sequences_from_reads(supporting_reads),
        min_overlap_size=30)
    assert len(assembled) > 0

    max_read_length = max(len(read) for read in supporting_reads)
    for s in assembled:
        summary_fields = (
            s.prefix, s.alt, s.suffix, len(s.reads), len(s.sequence))
        print("%s%s%s weight=%d length=%d" % summary_fields)
        eq_(s.alt, alt)
        assert len(s) > max_read_length, \
            "Expected assembled sequences to be longer than read length (%d)" % (
                max_read_length,)
Ejemplo n.º 6
0
def test_group_unique_sequences():
    """
    Group the reads supporting the chr12:65857041 G>C SNV (prefix and suffix
    sizes capped at 30) and check that there are fewer groups than reads.
    """
    samfile = load_bam("data/cancer-wgs-primary.chr12.bam")
    chromosome = "chr12"
    snv = Variant(
        contig=chromosome,
        start=65857041,
        ref="G",
        alt="C",
        ensembl=ensembl_grch38)
    supporting_reads = reads_supporting_variant(
        samfile=samfile, chromosome=chromosome, variant=snv)
    print("%d variant reads: %s" % (len(supporting_reads), supporting_reads))

    unique_groups = group_unique_sequences(
        supporting_reads, max_prefix_size=30, max_suffix_size=30)
    print("%d unique sequences: %s" % (len(unique_groups), unique_groups))

    # there are some redundant reads, so we expect that the number of
    # unique entries should be less than the total read partitions
    assert len(supporting_reads) > len(unique_groups)
Ejemplo n.º 7
0
def variants_to_protein_sequences_dataframe(
        expressed_vcf="data/b16.f10/b16.expressed.vcf",
        not_expressed_vcf="data/b16.f10/b16.not-expressed.vcf",
        tumor_rna_bam="data/b16.f10/b16.combined.sorted.bam",
        min_mapping_quality=0,
        max_protein_sequences_per_variant=1,
        variant_sequence_assembly=False):
    """
    Load two VCFs and a tumor RNA BAM, then build a DataFrame of protein
    sequences from the read evidence for the combined set of variants.

    Returns a tuple of (DataFrame, variants loaded from `expressed_vcf`,
    combined VariantCollection of both VCFs).
    """
    expressed = load_vcf(expressed_vcf)
    not_expressed = load_vcf(not_expressed_vcf)
    # variants from both files are scanned together against the same BAM
    combined = VariantCollection(list(expressed) + list(not_expressed))

    bam = load_bam(tumor_rna_bam)
    collector = ReadCollector(min_mapping_quality=min_mapping_quality)
    evidence = collector.read_evidence_generator(
        variants=combined,
        alignment_file=bam)

    protein_sequence_creator = ProteinSequenceCreator(
        max_protein_sequences_per_variant=max_protein_sequences_per_variant,
        variant_sequence_assembly=variant_sequence_assembly)
    protein_sequences = \
        protein_sequence_creator.protein_sequences_from_read_evidence_generator(
            evidence)
    return (
        protein_sequences_generator_to_dataframe(protein_sequences),
        expressed,
        combined)
Ejemplo n.º 8
0
def variants_to_protein_sequences_dataframe(
        expressed_vcf="data/b16.f10/b16.expressed.vcf",
        not_expressed_vcf="data/b16.f10/b16.not-expressed.vcf",
        tumor_rna_bam="data/b16.f10/b16.combined.sorted.bam",
        min_mapping_quality=0,
        max_protein_sequences_per_variant=1,
        variant_sequence_assembly=False):
    """
    Load two VCFs and a tumor RNA BAM, then build a DataFrame of protein
    sequences from the reads overlapping the combined set of variants.

    Returns a tuple of (DataFrame, variants loaded from `expressed_vcf`,
    combined VariantCollection of both VCFs).
    """
    expressed = load_vcf(expressed_vcf)
    not_expressed = load_vcf(not_expressed_vcf)
    # variants from both files are scanned together against the same BAM
    combined = VariantCollection(list(expressed) + list(not_expressed))

    bam = load_bam(tumor_rna_bam)
    overlapping_reads = reads_overlapping_variants(
        variants=combined,
        samfile=bam,
        min_mapping_quality=min_mapping_quality)
    protein_sequences = reads_generator_to_protein_sequences_generator(
        overlapping_reads,
        max_protein_sequences_per_variant=max_protein_sequences_per_variant,
        variant_sequence_assembly=variant_sequence_assembly)
    return (
        protein_sequences_generator_to_dataframe(protein_sequences),
        expressed,
        combined)
Ejemplo n.º 9
0
def test_assemble_transcript_fragments_snv():
    """
    Assemble the reads supporting the chr12:65857041 G>C SNV and check that
    every assembled sequence keeps the alt allele; sequences backed by more
    than one read name must also be longer than the longest input read.
    """
    samfile = load_bam("data/cancer-wgs-primary.chr12.bam")
    chromosome = "chr12"
    alt = "C"
    snv = Variant(
        contig=chromosome,
        start=65857041,
        ref="G",
        alt=alt,
        ensembl=ensembl_grch38)
    supporting_reads = reads_supporting_variant(
        variant=snv, samfile=samfile, chromosome=chromosome)

    assembled = iterative_overlap_assembly(
        initial_variant_sequences_from_reads(supporting_reads),
        min_overlap_size=30)
    assert len(assembled) > 0

    max_read_length = max(len(read) for read in supporting_reads)
    for s in assembled:
        summary_fields = (
            s.prefix, s.alt, s.suffix, len(s.reads), len(s.sequence))
        print("%s%s%s weight=%d length=%d" % summary_fields)
        eq_(s.alt, alt)
        if len(s.read_names) <= 1:
            # a sequence backed by a single read is not expected to be any
            # longer than that read, so skip the length check
            continue
        assert len(s) > max_read_length, \
            "Expected assembled sequences to be longer than read length (%d)" % (
                max_read_length,)
Ejemplo n.º 10
0
def test_locus_reads_dataframe():
    """
    The locus-reads DataFrame for chr4:45802539 in the combined B16 BAM
    should have one row per read of the single-variant SAM file, minus one
    read that is known to be dropped.
    """
    sam_all_variants = load_bam("data/b16.f10/b16.combined.bam")

    sam_path_single_variant = data_path(
        "data/b16.f10/b16.f10.127a.aldh1b1.chr4.45802539.refG.altC.sam")
    # alignment records are the lines starting with "HWI"
    with open(sam_path_single_variant) as sam_lines:
        n_reads_in_sam = sum(
            1 for line in sam_lines if line.startswith("HWI"))

    # we know from inspecting the file that *one* of the reads overlapping this
    # variant has a CIGAR string of N at the location before and thus we'll
    # be missing that read.
    #
    # TODO: figure out what to do when the variant nucleotide is at the start or
    # end of an exon, since that won't have mapping positions on both its left
    # and right
    n_reads_expected = n_reads_in_sam - 1

    print("Found %d sequences in %s" %
          (n_reads_expected, sam_path_single_variant))
    locus_df = locus_reads_dataframe(
        samfile=sam_all_variants,
        chromosome="chr4",
        base1_position_before_variant=45802538,
        base1_position_after_variant=45802540)
    print(locus_df)
    eq_(len(locus_df), n_reads_expected)
Ejemplo n.º 11
0
def test_locus_reads_dataframe():
    """
    The locus-reads DataFrame for chr4:45802539 in the combined B16 BAM
    should have one row per read of the single-variant SAM file, minus one
    read that is known to be dropped.
    """
    sam_all_variants = load_bam("data/b16.f10/b16.combined.bam")

    sam_path_single_variant = data_path(
        "data/b16.f10/b16.f10.127a.aldh1b1.chr4.45802539.refG.altC.sam")
    # alignment records are the lines starting with "HWI"
    with open(sam_path_single_variant) as sam_lines:
        n_reads_in_sam = sum(
            1 for line in sam_lines if line.startswith("HWI"))

    # we know from inspecting the file that *one* of the reads overlapping this
    # variant has a CIGAR string of N at the location before and thus we'll
    # be missing that read.
    #
    # TODO: figure out what to do when the variant nucleotide is at the start or
    # end of an exon, since that won't have mapping positions on both its left
    # and right
    n_reads_expected = n_reads_in_sam - 1

    print("Found %d sequences in %s" % (
        n_reads_expected, sam_path_single_variant))
    locus_df = locus_reads_dataframe(
        samfile=sam_all_variants,
        chromosome="chr4",
        base1_position_before_variant=45802538,
        base1_position_after_variant=45802540)
    print(locus_df)
    eq_(len(locus_df), n_reads_expected)
def test_partition_variant_reads_deletion():
    """
    More than one read should support the 19bp deletion at chr12:70091490,
    and each supporting read should report the (empty) alt allele.
    """
    samfile = load_bam("data/cancer-wgs-primary.chr12.bam")
    chromosome = "chr12"
    alt = ""
    deletion = Variant(
        contig=chromosome,
        start=70091490,
        ref="TTGTAGATGCTGCCTCTCC",
        alt=alt,
        ensembl=ensembl_grch38)
    supporting_reads = reads_supporting_variant(
        samfile=samfile,
        chromosome=chromosome,
        variant=deletion)
    assert len(supporting_reads) > 1
    for read in supporting_reads:
        eq_(read.allele, alt)
def test_somatic_variant_with_0_supporting_rna_reads():
    """
    Check alt-allele read support for the 6:90411765 G>A variant in the
    normal, tumor and RNA BAMs: 0, 5 and 0 supporting reads respectively.
    """
    variant = Variant("6", 90411765, "G", "A")
    base_dir = "data/somatic-variant-with-0-supporting-rna-reads/"
    normal_reads = load_bam(base_dir + "normal.6.90411765.G.A.sorted.bam")
    tumor_reads = load_bam(base_dir + "tumor.6.90411765.G.A.sorted.bam")
    rna_reads = load_bam(base_dir + "rna.6.90411765.G.A.sorted.bam")

    # Each sample's reads are printed *before* the count is asserted so the
    # diagnostic output is still emitted when the assertion fails.
    normal_sample_variant_reads = reads_supporting_variant(
        variant=variant, samfile=normal_reads)
    print(normal_sample_variant_reads)
    eq_(len(normal_sample_variant_reads), 0)

    tumor_sample_variant_reads = reads_supporting_variant(
        variant=variant, samfile=tumor_reads)
    print(tumor_sample_variant_reads)
    eq_(len(tumor_sample_variant_reads), 5)

    rna_sample_variant_reads = reads_supporting_variant(
        variant=variant, samfile=rna_reads)
    print(rna_sample_variant_reads)
    eq_(len(rna_sample_variant_reads), 0)
Ejemplo n.º 14
0
def test_translate_variant_collection():
    """
    Translating the reads that support the B16 variants should produce
    exactly `n_expected` results.
    """
    variants = load_vcf("data/b16.f10/b16.vcf")
    samfile = load_bam("data/b16.f10/b16.combined.sorted.bam")

    # The count actually asserted; the failure message must report this same
    # number rather than len(variants), which is not what is compared.
    n_expected = 4

    result = list(
        translate_variants(reads_supporting_variants(variants, samfile)))
    eq_(
        len(result),
        n_expected,
        "Expected %d translated variants but got %d: %s" % (
            n_expected,
            len(result),
            result))
Ejemplo n.º 15
0
def test_translate_variant_collection():
    """
    Translating the read evidence collected for the B16 variants should
    produce exactly `n_expected` results.
    """
    variants = load_vcf("data/b16.f10/b16.vcf")
    samfile = load_bam("data/b16.f10/b16.combined.sorted.bam")

    # The count actually asserted; the failure message must report this same
    # number rather than len(variants), which is not what is compared.
    n_expected = 4

    read_evidence_gen = ReadCollector().read_evidence_generator(
        variants,
        samfile)
    translation_gen = ProteinSequenceCreator().translate_variants(
        read_evidence_gen)
    translations = list(translation_gen)
    eq_(
        len(translations),
        n_expected,
        "Expected %d translated variants but got %d: %s" % (
            n_expected,
            len(translations),
            translations))
Ejemplo n.º 16
0
def test_variants_to_protein_sequences_dataframe_filtered_all_reads_by_mapping_quality():
    """
    With a minimum mapping quality of 256 the protein sequence DataFrame
    built from the B16 data should be empty.
    """
    # since the B16 BAM has all MAPQ=255 values then all the reads should get dropped
    # if we set the minimum quality to 256
    variants = load_vcf("data/b16.f10/b16.vcf")
    alignment_file = load_bam("data/b16.f10/b16.combined.sorted.bam")
    strict_collector = ReadCollector(min_mapping_quality=256)
    evidence = strict_collector.read_evidence_generator(
        variants=variants,
        alignment_file=alignment_file)

    protein_sequence_creator = ProteinSequenceCreator(
        max_protein_sequences_per_variant=1)
    protein_sequences = \
        protein_sequence_creator.protein_sequences_from_read_evidence_generator(
            evidence)
    df = protein_sequences_generator_to_dataframe(protein_sequences)
    print(df)
    eq_(
        len(df),
        0,
        "Expected 0 entries, got %d: %s" % (len(df), df))
def test_partition_variant_reads_deletion():
    """
    The read evidence for the 19bp deletion at chr12:70091490 should contain
    more than one alt read, each reporting the (empty) alt allele.
    """
    alignment_file = load_bam("data/cancer-wgs-primary.chr12.bam")
    alt = ""
    deletion = Variant(
        contig="chr12",
        start=70091490,
        ref="TTGTAGATGCTGCCTCTCC",
        alt=alt,
        ensembl=ensembl_grch38)
    collector = ReadCollector()
    evidence = collector.read_evidence_for_variant(
        alignment_file=alignment_file,
        variant=deletion)
    assert len(evidence.alt_reads) > 1
    for read in evidence.alt_reads:
        eq_(read.allele, alt)
def test_partition_variant_reads_snv():
    """
    The read evidence for the chr12:65857041 G>C SNV should contain more
    than one alt read, each reporting the alt allele "C".
    """
    alignment_file = load_bam("data/cancer-wgs-primary.chr12.bam")
    alt = "C"
    snv = Variant(
        contig="chr12",
        start=65857041,
        ref="G",
        alt=alt,
        ensembl=ensembl_grch38)
    collector = ReadCollector()
    evidence = collector.read_evidence_for_variant(
        alignment_file=alignment_file,
        variant=snv)
    supporting_reads = evidence.alt_reads
    assert len(supporting_reads) > 1
    for read in supporting_reads:
        eq_(read.allele, alt)
Ejemplo n.º 19
0
def test_variants_to_protein_sequences_dataframe_filtered_all_reads_by_mapping_quality():
    """
    With a minimum mapping quality of 256 the protein sequence DataFrame
    built from the B16 data should be empty.
    """
    # since the B16 BAM has all MAPQ=255 values then all the reads should get dropped
    # if we set the minimum quality to 256
    variants = load_vcf("data/b16.f10/b16.vcf")
    samfile = load_bam("data/b16.f10/b16.combined.sorted.bam")
    overlapping_reads = reads_overlapping_variants(
        variants=variants, samfile=samfile, min_mapping_quality=256)
    protein_sequences = reads_generator_to_protein_sequences_generator(
        overlapping_reads, max_protein_sequences_per_variant=1)
    df = protein_sequences_generator_to_dataframe(protein_sequences)
    print(df)
    eq_(len(df), 0, "Expected 0 entries, got %d: %s" % (len(df), df))
def test_partition_variant_reads_deletion():
    """
    More than one read should support the 19bp deletion at chr12:70091490,
    and each supporting read should report the (empty) alt allele.
    """
    samfile = load_bam("data/cancer-wgs-primary.chr12.bam")
    chromosome = "chr12"
    alt = ""
    deletion = Variant(
        contig=chromosome,
        start=70091490,
        ref="TTGTAGATGCTGCCTCTCC",
        alt=alt,
        ensembl=ensembl_grch38)
    supporting_reads = reads_supporting_variant(
        samfile=samfile, chromosome=chromosome, variant=deletion)
    assert len(supporting_reads) > 1
    for read in supporting_reads:
        eq_(read.allele, alt)
Ejemplo n.º 21
0
def test_protein_sequence_creator_protein_length():
    """
    For each requested protein sequence length (21, 15, 10), every entry in
    the "amino_acids" column of the resulting DataFrame should have exactly
    that length.
    """
    variants = load_vcf("data/b16.f10/b16.vcf")
    alignment_file = load_bam("data/b16.f10/b16.combined.sorted.bam")
    collector = ReadCollector()

    for desired_length in [21, 15, 10]:
        protein_sequence_creator = ProteinSequenceCreator(
            max_protein_sequences_per_variant=1,
            protein_sequence_length=desired_length)
        # a new read evidence generator is requested for every length
        evidence = collector.read_evidence_generator(
            variants=variants,
            alignment_file=alignment_file)
        protein_sequences = \
            protein_sequence_creator.protein_sequences_from_read_evidence_generator(
                evidence)
        df = protein_sequences_generator_to_dataframe(protein_sequences)
        print(df)
        amino_acids = df["amino_acids"]
        print(amino_acids)
        observed_lengths = amino_acids.str.len()
        assert (observed_lengths == desired_length).all(), (
            observed_lengths, )
def test_sequence_counts_snv():
    """
    Reads supporting the chr12:65857041 G>C SNV, with a preferred sequence
    length of 61, should give exactly one variant sequence: 30 nucleotides
    of prefix, the alt allele, and 30 nucleotides of suffix.
    """
    samfile = load_bam("data/cancer-wgs-primary.chr12.bam")
    alt = "C"
    snv = Variant("chr12", 65857041, "G", alt, grch38)
    collector = ReadCollector()
    supporting_reads = collector.allele_reads_supporting_variant(
        alignment_file=samfile,
        variant=snv)
    sequence_creator = VariantSequenceCreator(preferred_sequence_length=61)
    sequences = sequence_creator.reads_to_variant_sequences(
        variant=snv,
        reads=supporting_reads)

    assert len(sequences) == 1
    for seq in sequences:
        print(seq)
        eq_(seq.alt, alt)
        eq_(len(seq.prefix), 30)
        eq_(len(seq.suffix), 30)
        # the three parts must concatenate back to the full sequence
        eq_(seq.prefix + seq.alt + seq.suffix, seq.sequence)
Ejemplo n.º 23
0
def test_sequence_counts_snv():
    """
    Reads supporting the chr12:65857041 G>C SNV, with a preferred sequence
    length of 61, should give exactly one variant sequence: 30 nucleotides
    of prefix, the alt allele, and 30 nucleotides of suffix.
    """
    samfile = load_bam("data/cancer-wgs-primary.chr12.bam")
    chromosome = "chr12"
    alt = "C"
    snv = Variant(chromosome, 65857041, "G", alt)

    supporting_reads = reads_supporting_variant(
        samfile=samfile,
        chromosome=chromosome,
        variant=snv)
    sequences = reads_to_variant_sequences(
        variant=snv,
        reads=supporting_reads,
        preferred_sequence_length=61)

    assert len(sequences) == 1
    for seq in sequences:
        print(seq)
        eq_(seq.alt, alt)
        eq_(len(seq.prefix), 30)
        eq_(len(seq.suffix), 30)
        # the three parts must concatenate back to the full sequence
        eq_(seq.prefix + seq.alt + seq.suffix, seq.sequence)