コード例 #1
0
def test_partitioned_read_sequences_deletion():
    """
    Check that a read supporting a deletion is split into the expected
    prefix / allele / suffix.

    The variant is TT>T at position 4 on a contig named "chromosome"; the
    mock read "ACCTG" is aligned with CIGAR 4M1D1M and MD tag 4^T1. The
    original author's note assumes the reference sequence is "ACCTTG".
    """
    contig = "chromosome"
    deletion = Variant(
        contig, 4, "TT", "T", grch38, normalize_contig_name=False)

    pysam_read = make_pysam_read(seq="ACCTG", cigar="4M1D1M", mdtag="4^T1")
    mock_bam = MockAlignmentFile(references=(contig,), reads=[pysam_read])

    supporting_reads = ReadCollector().allele_reads_supporting_variant(
        alignment_file=mock_bam,
        variant=deletion)
    print(supporting_reads)
    assert len(supporting_reads) == 1

    actual = supporting_reads[0]
    # the expected allele is empty, flanked by "ACCT" and "G"
    expected = AlleleRead(
        name=pysam_read.qname, prefix="ACCT", allele="", suffix="G")
    eq_(actual, expected)
コード例 #2
0
def test_locus_reads_insertion():
    """
    Check the LocusRead produced for a read carrying a one-base insertion.

    The variant is T>TG at position 4 of contig "1". Per the original
    author's note the reference is assumed to be "ACCTTG", which makes the
    variant sequence "ACCTGTG" (the sequence of the mock read).
    """
    insertion = Variant("1", 4, ref="T", alt="TG")
    aligned = make_pysam_read(seq="ACCTGTG", cigar="4M1I2M", mdtag="6")
    mock_bam = MockAlignmentFile(references={"chromosome"}, reads=[aligned])

    locus_reads = ReadCollector().get_locus_reads(
        mock_bam, "chromosome", insertion.start, insertion.start)
    print(locus_reads)
    assert len(locus_reads) == 1, \
        "Expected to get back one read but instead got %d" % (
            len(locus_reads),)

    actual = locus_reads[0]
    expected = LocusRead(
        name=aligned.qname,
        sequence=aligned.query_sequence,
        # the inserted nucleotide has no reference position, hence the None
        reference_positions=[0, 1, 2, 3, None, 4, 5],
        quality_scores=aligned.query_qualities,
        read_base0_start_inclusive=4,
        read_base0_end_exclusive=5,
        reference_base0_start_inclusive=4,
        reference_base0_end_exclusive=4)
    print("Actual: %s" % (actual, ))
    print("Expected: %s" % (expected, ))
    assert_equal_fields(actual, expected)
コード例 #3
0
def test_locus_reads_deletion():
    """
    Check the LocusRead produced for a read carrying a one-base deletion.

    The variant is TT>T at position 4 of contig "1"; the original author's
    note assumes the reference sequence is "ACCTTG".
    """
    # per the original author's note, normalization turns this variant into
    # the deletion of "T" at base-1 position 5
    deletion = Variant("1", 4, ref="TT", alt="T")
    aligned = make_pysam_read(seq="ACCTG", cigar="4M1D1M", mdtag="4^T1")
    mock_bam = MockAlignmentFile(references={"chromosome"}, reads=[aligned])

    locus_reads = ReadCollector().get_locus_reads(
        mock_bam, "chromosome", deletion.start - 1, deletion.start)
    print(locus_reads)
    assert len(locus_reads) == 1, \
        "Expected to get back one read but instead got %d" % (
            len(locus_reads),)

    actual = locus_reads[0]
    expected = LocusRead(
        name=aligned.qname,
        sequence=aligned.query_sequence,
        reference_positions=[0, 1, 2, 3, 5],
        quality_scores=aligned.query_qualities,
        # the missing base would have followed the 4th nucleotide of the
        # read, so the read interval is empty
        read_base0_start_inclusive=4,
        read_base0_end_exclusive=4,
        reference_base0_start_inclusive=4,
        reference_base0_end_exclusive=5)
    assert_equal_fields(actual, expected)
コード例 #4
0
def test_locus_reads_snv():
    """
    Check the LocusRead produced for a read carrying a single-nucleotide
    variant.

    The variant is T>G at position 4 of contig "1"; the original author's
    note assumes the reference sequence is "ACCTTG".
    """
    snv = Variant("1", 4, ref="T", alt="G")
    aligned = make_pysam_read(seq="ACCGTG", cigar="6M", mdtag="3G2")
    mock_bam = MockAlignmentFile(
        references=("chromosome", ),
        reads=[aligned])

    locus_reads = ReadCollector().get_locus_reads(
        mock_bam, "chromosome", snv.start - 1, snv.start)
    print(locus_reads)
    assert len(locus_reads) == 1, \
        "Expected to get back one read but instead got %d" % (
            len(locus_reads),)

    actual = locus_reads[0]
    # read and reference intervals coincide in the expected value
    expected = LocusRead(
        name=aligned.qname,
        sequence=aligned.query_sequence,
        reference_positions=[0, 1, 2, 3, 4, 5],
        quality_scores=aligned.query_qualities,
        reference_base0_start_inclusive=3,
        reference_base0_end_exclusive=4,
        read_base0_start_inclusive=3,
        read_base0_end_exclusive=4)
    assert_equal_fields(actual, expected)
コード例 #5
0
def test_locus_reads_substitution_shorter():
    """
    Check the LocusRead produced for a substitution that shortens the read.

    The variant is CC>G at the 2nd and 3rd nucleotides of the reference
    sequence, which the original author's note gives as "ACCTTG". The mock
    alignment (CIGAR 2M1D3M, MD tag 1C^C4) expresses it as a C>G mismatch
    followed by the deletion of a C.
    """
    substitution = Variant("1", 2, ref="CC", alt="G")
    print(substitution)
    aligned = make_pysam_read(seq="AGTTG", cigar="2M1D3M", mdtag="1C^C4")
    mock_bam = MockAlignmentFile(references={"chromosome"}, reads=[aligned])

    locus_reads = ReadCollector().get_locus_reads(
        mock_bam, "chromosome", 1, 3)
    assert len(locus_reads) == 1, \
        "Expected to get back one read but instead got %d" % (
            len(locus_reads),)
    print(locus_reads)

    actual = locus_reads[0]
    expected = LocusRead(
        name=aligned.qname,
        sequence=aligned.query_sequence,
        # reference position 2 is absent from the expected positions
        reference_positions=[0, 1, 3, 4, 5],
        quality_scores=aligned.query_qualities,
        read_base0_start_inclusive=1,
        read_base0_end_exclusive=2,
        reference_base0_start_inclusive=1,
        reference_base0_end_exclusive=3)
    assert_equal_fields(actual, expected)
コード例 #6
0
def variants_to_protein_sequences_dataframe(
        expressed_vcf="data/b16.f10/b16.expressed.vcf",
        not_expressed_vcf="data/b16.f10/b16.not-expressed.vcf",
        tumor_rna_bam="data/b16.f10/b16.combined.sorted.bam",
        min_mapping_quality=0,
        max_protein_sequences_per_variant=1,
        variant_sequence_assembly=False):
    """
    Load two VCFs and a tumor RNA BAM, then build a DataFrame of variant
    protein sequences from the read evidence for the combined variants.

    Parameters
    ----------
    expressed_vcf : path passed to load_vcf for the expressed variants
    not_expressed_vcf : path passed to load_vcf for the not-expressed variants
    tumor_rna_bam : path passed to load_bam
    min_mapping_quality : forwarded to ReadCollector
    max_protein_sequences_per_variant : forwarded to ProteinSequenceCreator
    variant_sequence_assembly : forwarded to ProteinSequenceCreator

    Returns a tuple of (DataFrame, expressed variants, combined variants).
    """
    expressed_variants = load_vcf(expressed_vcf)
    not_expressed_variants = load_vcf(not_expressed_vcf)

    # expressed variants come first in the combined collection
    all_variant_records = list(expressed_variants) + list(not_expressed_variants)
    combined_variants = VariantCollection(all_variant_records)

    bam = load_bam(tumor_rna_bam)
    collector = ReadCollector(min_mapping_quality=min_mapping_quality)
    evidence = collector.read_evidence_generator(
        variants=combined_variants,
        alignment_file=bam)

    sequence_creator = ProteinSequenceCreator(
        max_protein_sequences_per_variant=max_protein_sequences_per_variant,
        variant_sequence_assembly=variant_sequence_assembly)
    protein_sequences = \
        sequence_creator.protein_sequences_from_read_evidence_generator(
            evidence)
    return (
        protein_sequences_generator_to_dataframe(protein_sequences),
        expressed_variants,
        combined_variants,
    )
コード例 #7
0
def test_locus_reads_substitution_longer():
    """
    Check the LocusRead produced for a substitution that lengthens the read.

    The variant is C>GG at the second nucleotide of the reference sequence,
    which the original author's note gives as "ACCTTG". The mock alignment
    (CIGAR 2M1I4M, MD tag 1C4) expresses it as a C>G mismatch followed by
    the insertion of another G.
    """
    substitution = Variant("1", 2, ref="C", alt="GG")
    print(substitution)
    aligned = make_pysam_read(seq="AGGCTTG", cigar="2M1I4M", mdtag="1C4")
    mock_bam = MockAlignmentFile(references={"chromosome"}, reads=[aligned])

    locus_reads = ReadCollector().get_locus_reads(
        mock_bam, "chromosome", 1, 2)
    print(locus_reads)
    assert len(locus_reads) == 1, \
        "Expected to get back one read but instead got %d" % (
            len(locus_reads),)

    actual = locus_reads[0]
    expected = LocusRead(
        name=aligned.qname,
        sequence=aligned.query_sequence,
        # the inserted G has no reference position, hence the None
        reference_positions=[0, 1, None, 2, 3, 4, 5],
        quality_scores=aligned.query_qualities,
        read_base0_start_inclusive=1,
        read_base0_end_exclusive=3,
        reference_base0_start_inclusive=1,
        reference_base0_end_exclusive=2)
    assert_equal_fields(actual, expected)
コード例 #8
0
def test_assemble_transcript_fragments_snv():
    """
    Assemble the reads supporting a G>C SNV at chr12:65857041 and check the
    resulting sequences.

    Every assembled sequence must carry the alt allele, and any sequence
    backed by more than one read must be longer than the longest read.
    """
    alt = "C"
    snv = Variant(
        contig="chr12",
        start=65857041,
        ref="G",
        alt=alt,
        ensembl=ensembl_grch38)
    bam = load_bam("data/cancer-wgs-primary.chr12.bam")

    supporting_reads = ReadCollector().allele_reads_supporting_variant(
        variant=snv,
        alignment_file=bam)

    assembled = iterative_overlap_assembly(
        initial_variant_sequences_from_reads(supporting_reads),
        min_overlap_size=30)

    assert len(assembled) > 0
    longest_read = max(len(r) for r in supporting_reads)
    for seq in assembled:
        print("%s%s%s weight=%d length=%d" %
              (seq.prefix, seq.alt, seq.suffix,
               len(seq.reads), len(seq.sequence)))
        eq_(seq.alt, alt)
        if len(seq.read_names) <= 1:
            # single-read sequences are not expected to grow
            continue
        assert len(seq) > longest_read, \
            "Expected assembled sequences to be longer than read length (%d)" % (
                longest_read,)
コード例 #9
0
def test_somatic_variant_with_2_supporting_rna_reads():
    """
    Count the reads supporting 14:105849746 G>A in normal, tumor and RNA
    BAMs: 0, 8 and 2 are expected, and both RNA reads must have known names.
    """
    variant = Variant("14", 105849746, "G", "A", grch38)
    data_dir = "data/somatic-variant-with-2-supporting-rna-reads/"
    normal_bam = load_bam(
        data_dir + "normal.14.105849746.G.A.no-alt.sorted.bam")
    tumor_bam = load_bam(
        data_dir + "tumor.14.105849746.G.A.many-alt.sorted.bam")
    rna_bam = load_bam(data_dir + "rna.14.105849746.G.A.2-alt.sorted.bam")
    collector = ReadCollector()

    normal_supporting = collector.allele_reads_supporting_variant(
        variant=variant,
        alignment_file=normal_bam)
    eq_(len(normal_supporting), 0)
    print(normal_supporting)

    tumor_supporting = collector.allele_reads_supporting_variant(
        variant=variant,
        alignment_file=tumor_bam)
    print(tumor_supporting)
    eq_(len(tumor_supporting), 8)

    rna_supporting = collector.allele_reads_supporting_variant(
        variant=variant,
        alignment_file=rna_bam)
    print(rna_supporting)
    eq_(len(rna_supporting), 2)

    # read names that, per the original author's note, were pulled out by
    # hand (by Arun) in IGV
    known_rna_read_names = {
        "K00193:50:H5NKVBBXX:5:2202:6421:24964",
        "K00193:50:H5NKVBBXX:5:2119:30908:1138",
    }
    for supporting_read in rna_supporting:
        assert supporting_read.name in known_rna_read_names
コード例 #10
0
def test_variants_to_protein_sequences_dataframe_filtered_all_reads_by_mapping_quality():
    """
    Expect an empty protein-sequence DataFrame when min_mapping_quality is
    256.

    Per the original author's note, every read in the B16 BAM has MAPQ=255,
    so a threshold of 256 should drop all of them.
    """
    b16_variants = load_vcf("data/b16.f10/b16.vcf")
    b16_bam = load_bam("data/b16.f10/b16.combined.sorted.bam")
    strict_collector = ReadCollector(min_mapping_quality=256)
    evidence = strict_collector.read_evidence_generator(
        variants=b16_variants,
        alignment_file=b16_bam)

    sequence_creator = ProteinSequenceCreator(
        max_protein_sequences_per_variant=1)
    protein_sequences = \
        sequence_creator.protein_sequences_from_read_evidence_generator(
            evidence)
    df = protein_sequences_generator_to_dataframe(protein_sequences)
    print(df)
    eq_(len(df), 0, "Expected 0 entries, got %d: %s" % (len(df), df))
コード例 #11
0
def test_protein_sequence_creator_protein_length():
    """
    For each requested protein_sequence_length (21, 15, 10), check that
    every entry of the "amino_acids" column has exactly that length.
    """
    b16_variants = load_vcf("data/b16.f10/b16.vcf")
    b16_bam = load_bam("data/b16.f10/b16.combined.sorted.bam")
    collector = ReadCollector()

    for target_length in (21, 15, 10):
        sequence_creator = ProteinSequenceCreator(
            max_protein_sequences_per_variant=1,
            protein_sequence_length=target_length)
        # a fresh evidence generator is requested on every iteration
        evidence = collector.read_evidence_generator(
            variants=b16_variants,
            alignment_file=b16_bam)
        df = protein_sequences_generator_to_dataframe(
            sequence_creator.protein_sequences_from_read_evidence_generator(
                evidence))
        print(df)
        amino_acids = df["amino_acids"]
        print(amino_acids)
        observed_lengths = amino_acids.str.len()
        assert (observed_lengths == target_length).all(), (
            observed_lengths, )
コード例 #12
0
def test_sequence_counts_snv():
    """
    Build variant sequences for a G>C SNV at chr12:65857041 with a
    preferred length of 61 and check that exactly one sequence results,
    with the alt allele flanked by 30 bases on either side.
    """
    alt = "C"
    snv = Variant("chr12", 65857041, "G", alt, grch38)
    bam = load_bam("data/cancer-wgs-primary.chr12.bam")

    supporting_reads = ReadCollector().allele_reads_supporting_variant(
        alignment_file=bam,
        variant=snv)
    sequence_creator = VariantSequenceCreator(preferred_sequence_length=61)
    variant_sequences = sequence_creator.reads_to_variant_sequences(
        variant=snv,
        reads=supporting_reads)

    assert len(variant_sequences) == 1
    for vs in variant_sequences:
        print(vs)
        eq_(vs.alt, alt)
        eq_(len(vs.prefix), 30)
        eq_(len(vs.suffix), 30)
        # the three parts must concatenate back to the full sequence
        reassembled = vs.prefix + vs.alt + vs.suffix
        eq_(reassembled, vs.sequence)
def test_somatic_variant_with_0_supporting_rna_reads():
    """
    Count the reads supporting 6:90411765 G>A in normal, tumor and RNA
    BAMs: 0, 5 and 0 are expected respectively.
    """
    variant = Variant("6", 90411765, "G", "A", grch38)
    data_dir = "data/somatic-variant-with-0-supporting-rna-reads/"
    normal_bam = load_bam(data_dir + "normal.6.90411765.G.A.sorted.bam")
    tumor_bam = load_bam(data_dir + "tumor.6.90411765.G.A.sorted.bam")
    rna_bam = load_bam(data_dir + "rna.6.90411765.G.A.sorted.bam")
    collector = ReadCollector()

    normal_supporting = collector.allele_reads_supporting_variant(
        variant=variant, alignment_file=normal_bam)
    eq_(len(normal_supporting), 0)
    print(normal_supporting)

    tumor_supporting = collector.allele_reads_supporting_variant(
        variant=variant, alignment_file=tumor_bam)
    print(tumor_supporting)
    eq_(len(tumor_supporting), 5)

    rna_supporting = collector.allele_reads_supporting_variant(
        variant=variant, alignment_file=rna_bam)
    print(rna_supporting)
    eq_(len(rna_supporting), 0)