Beispiel #1
0
def test_del_annotation():
    """Check the annotation produced when six bases are deleted.

    Removing "GCTATT" at transcript offset 3 is expected to be reported
    as ``AI2del``.
    """
    transcript = Seq("ACTGCTATTCGTAGT")
    expected_protein = Seq("TAIRS")

    # Sanity check: the unmodified transcript translates to the
    # reference protein.
    assert str(transcript.translate()) == str(expected_protein)

    mutated = mutate.mutate_protein_from_transcript(
        transcript, 3, "GCTATT", "", padding=8)
    print(str(mutated))
    assert mutated.annot == 'AI2del', mutated
Beispiel #2
0
def test_mutate_protein_from_transcript_snp():
    """Check a single-base substitution (C -> A at transcript offset 1).

    The resulting protein is expected to equal the translation of the
    hand-mutated transcript, start with ``N``, and be annotated ``T1N``.
    """
    transcript = Seq("ACTGCTATTCGTAGT")
    expected_protein = Seq("TAIRS")
    # Sanity check on the reference translation.
    assert str(transcript.translate()) == str(expected_protein)

    # Same transcript with the substitution applied by hand.
    expected_mutant = Seq("AATGCTATTCGTAGT").translate()

    mutated = mutate.mutate_protein_from_transcript(
        transcript, 1, 'C', 'A', padding=8)
    print(str(mutated))
    assert mutated.seq[0] == 'N'
    assert str(expected_mutant) == str(mutated.seq)
    assert len(mutated.seq) == 5
    assert mutated.annot == 'T1N'
Beispiel #3
0
def test_stop_codon_annotation():
    """Check the annotation when a substitution introduces a stop codon.

    Replacing "GCT" at offset 3 with "TAGCCC" is expected to leave only
    the residue ``T`` in the region and be annotated ``A2*``.
    """
    transcript = Seq("ACTGCTATTCGTAGT")
    expected_protein = Seq("TAIRS")
    assert str(transcript.translate()) == str(expected_protein)

    # Hand-built mutated transcript: its translation contains the stop
    # marker right after the first residue.
    transcript_with_stop = Seq("ACTTAGCCCATTCGTAGT")
    assert str(transcript_with_stop.translate()) == str(Seq("T*PIRS"))

    # Replace the codon GCT (bases 4-6) with TAGCCC, which begins with
    # the stop codon TAG.
    mutated = mutate.mutate_protein_from_transcript(
        transcript, 3, "GCT", "TAGCCC", padding=8)
    print(str(mutated))
    assert mutated.seq == 'T', mutated.seq
    assert mutated.annot == 'A2*', mutated
Beispiel #4
0
def test_mutate_protein_prefix_stop_codon():
    """Check a substitution in a transcript whose first codon is a stop.

    The reference translates to ``*AIRS``. After C -> A at offset 4 the
    region is expected to equal the mutated translation without its
    first element, and be annotated ``A2D``.
    """
    transcript = Seq("TAGGCTATTCGTAGT")
    expected_protein = Seq("*AIRS")

    assert str(transcript.translate()) == str(expected_protein)

    # Same transcript with the substitution applied by hand.
    expected_mutant = Seq("TAGGATATTCGTAGT").translate()
    print(str(expected_mutant))

    mutated = mutate.mutate_protein_from_transcript(
        transcript, 4, 'C', 'A', padding=8)
    print(str(mutated))
    assert mutated.seq[0] == 'D'
    assert mutated.seq[1] == 'I'
    # The leading stop marker is not part of the returned region.
    assert str(mutated.seq) == str(expected_mutant[1:])
    assert mutated.annot == 'A2D'
Beispiel #5
0
def test_mutate_protein_from_transcript_indel():
    """Check a length-changing substitution ('C' -> 'AA' at offset 1).

    The region is expected to start with ``KCY``, to equal the mutated
    translation without its last element, and to be annotated ``T1fs``.
    """
    transcript = Seq("ACTGCTATTCGTAGT")
    expected_protein = Seq("TAIRS")

    assert str(transcript.translate()) == str(expected_protein)

    # Same transcript with the extra base inserted by hand.
    expected_mutant = Seq("AAATGCTATTCGTAGT").translate()
    print(str(expected_mutant))

    mutated = mutate.mutate_protein_from_transcript(
        transcript, 1, 'C', 'AA', padding=8)
    print(str(mutated))
    assert mutated.seq[0] == 'K'
    assert mutated.seq[1] == 'C'
    assert mutated.seq[2] == 'Y'
    assert str(mutated.seq) == str(expected_mutant[:-1])
    assert mutated.annot == 'T1fs'
Beispiel #6
0
def test_stop_codon_after_subst_annotation():
    """Check a substitution that is followed directly by a new stop codon.

    Replacing "GCT" at offset 3 with "CCCTAG" is expected to give the
    region ``TP`` with one residue removed, one inserted, and the
    annotation ``A2P*``.
    """
    transcript = Seq("ACTGCTATTCGTAGT")
    expected_protein = Seq("TAIRS")
    assert str(transcript.translate()) == str(expected_protein)

    # Hand-built mutated transcript: the stop marker follows the
    # substituted residue.
    transcript_with_stop = Seq("ACTCCCTAGATTCGTAGT")
    assert str(transcript_with_stop.translate()) == str(Seq("TP*IRS"))

    # Replace the codon GCT (bases 4-6) with CCC followed by the stop
    # codon TAG.
    mutated = mutate.mutate_protein_from_transcript(
        transcript, 3, "GCT", "CCCTAG", padding=8)
    print(str(mutated))
    assert mutated.n_removed == 1
    assert mutated.n_inserted == 1
    assert mutated.seq == 'TP', mutated.seq
    assert mutated.annot == 'A2P*', mutated
Beispiel #7
0
def test_mutate_protein_from_transcript_snp_coordinates():
    """Check the coordinates reported for a single-base substitution.

    For C -> A at transcript offset 1 the region is expected to report
    ``mutation_start == 0`` with one residue removed and one inserted,
    in addition to the sequence and ``T1N`` annotation checks.
    """
    transcript = Seq("ACTGCTATTCGTAGT")
    expected_protein = Seq("TAIRS")
    assert str(transcript.translate()) == str(expected_protein)

    # Same transcript with the substitution applied by hand.
    expected_mutant = Seq("AATGCTATTCGTAGT").translate()

    mutated = mutate.mutate_protein_from_transcript(
        transcript, 1, 'C', 'A', padding=8)
    print(str(mutated))

    # Sequence content.
    assert mutated.seq[0] == 'N'
    assert str(expected_mutant) == str(mutated.seq)
    assert len(mutated.seq) == 5

    # Mutation coordinates and annotation.
    assert mutated.mutation_start == 0
    assert mutated.n_removed == 1
    assert mutated.n_inserted == 1
    assert mutated.annot == 'T1N'
Beispiel #8
0
def test_get_transcript_and_mutate_vcf():
    """End-to-end check: annotate a variant, locate it, and mutate the CDS.

    A single T -> C variant on chromosome 10 is annotated with transcript
    IDs, mapped to an index in both the cDNA and the CDS of one
    transcript, and then applied to the CDS to obtain a padded peptide.
    """
    variant = {
        'chr': '10',
        'pos': 43617416,
        'ref': 'T',
        'alt': 'C',
    }

    vcf = pd.DataFrame.from_records([variant])
    annotated = ensembl.annotate_vcf_transcripts(vcf)

    # Both transcripts must be among the annotated IDs.
    found_ids = set(annotated['stable_id_transcript'])
    assert "ENST00000355710" in found_ids
    assert "ENST00000340058" in found_ids

    transcript_id = "ENST00000355710"
    genomic_pos = variant['pos']

    # Index into the cDNA (untranslated regions included).
    cdna_idx = ensembl.get_transcript_index_from_pos(
        genomic_pos, transcript_id, skip_untranslated_region=False)
    assert cdna_idx is not None
    assert cdna_idx < 5569
    cdna = ref_data.get_cdna(transcript_id)
    assert cdna[cdna_idx] == variant['ref']

    # Index into the CDS (untranslated regions skipped).
    cds_idx = ensembl.get_transcript_index_from_pos(
        genomic_pos, transcript_id, skip_untranslated_region=True)
    assert cds_idx is not None
    cds = ref_data.get_cds(transcript_id)
    assert cds[cds_idx] == variant['ref']

    region = mutate_protein_from_transcript(
        cds, cds_idx, variant['ref'], variant['alt'], padding=10)
    assert region is not None
    assert len(region.seq) == 21, (region.seq, len(region.seq))
    assert region.seq == 'RSQGRIPVKWTAIESLFDHIY'
Beispiel #9
0
def peptide_from_transcript_variant(transcript_id,
                                    pos,
                                    ref,
                                    alt,
                                    padding=None,
                                    max_length=None):
    """Apply a variant to a transcript's coding sequence and return the peptide.

    Parameters
    ----------
    transcript_id : transcript identifier passed to the annotation helpers.
    pos : genomic position of the variant.
    ref : reference bases; ``"."`` is treated as the empty string.
    alt : alternate bases; ``"."`` is treated as the empty string.
    padding : forwarded unchanged to ``mutate_protein_from_transcript``.
    max_length : if truthy, the returned sequence is cut to at most this
        many elements.

    Returns
    -------
    A 4-tuple ``(seq, start, stop, annot)``.

    On success ``seq`` is the (possibly truncated) mutated region,
    ``start`` is ``region.mutation_start``, ``stop`` is
    ``start + region.n_inserted`` (capped at ``max_length`` when the
    sequence was truncated) and ``annot`` is ``region.annot``.

    On failure the tuple is ``(None, -1, -1, message)`` where ``message``
    is the same text that was logged as a warning.
    """
    # sometimes empty strings get represented with a '.'
    if ref == ".":
        ref = ""
    if alt == ".":
        alt = ""

    forward = annotation.is_forward_strand(transcript_id)
    if not forward:
        ref = annotation.reverse_complement(ref)
        alt = annotation.reverse_complement(alt)
    transcript = _ensembl.get_cds(transcript_id)

    def error_result(msg, *args):
        # Always called with a format string plus separate args, so the
        # logged warning and the returned message are built the same way.
        logging.warning(msg, *args)
        return None, -1, -1, msg % args

    if not transcript:
        return error_result("Couldn't find transcript for ID %s",
                            transcript_id)

    idx = annotation.get_transcript_index_from_pos(
        pos, transcript_id, skip_untranslated_region=True)
    if idx is None:
        return error_result(
            "Couldn't translate gene position %s into transcript index for %s",
            pos, transcript_id)
    elif idx >= len(transcript):
        return error_result(
            "Index %d longer than sequence (len %d) for transcript %s (%s)",
            idx, len(transcript), transcript_id,
            gene_mutation_description(pos, ref, alt))

    # When not on the forward strand, shift the index back by
    # len(ref) - 1 so that it points at the first base of `ref`.
    if not forward:
        idx = idx - len(ref) + 1

    # 'ref' represents what the VCF file thought were the reference bases
    # at this position, now we actually check to make sure the transcript
    # agrees
    transcript_ref = str(transcript[idx:idx + len(ref)])
    if transcript_ref != ref:
        # Pass the arguments separately instead of pre-formatting:
        # error_result applies `msg % args` itself, and a pre-formatted
        # message containing a literal '%' would make that step raise.
        return error_result(
            "VCF/MAF expected %s at idx %d of transcript %s, found %s (%s)",
            ref, idx, transcript_id, transcript_ref,
            gene_mutation_description(pos, ref, alt))

    region = mutate_protein_from_transcript(transcript,
                                            idx,
                                            ref,
                                            alt,
                                            padding=padding)
    start = region.mutation_start
    stop = start + region.n_inserted
    if max_length and len(region.seq) > max_length:
        seq = region.seq[:max_length]
        # NOTE(review): only `stop` is capped here; `start` is returned
        # as-is even if it lies beyond max_length -- confirm with callers.
        stop = min(stop, max_length)
    else:
        seq = region.seq
    return seq, start, stop, region.annot
Beispiel #10
0
def peptide_from_transcript_variant(
        transcript_id, pos, ref, alt,
        padding = None,
        max_length = None):
    """Mutate the coding sequence of a transcript and return the peptide.

    ``ref`` and ``alt`` given as ``"."`` are treated as empty strings.
    If ``annotation.is_forward_strand(transcript_id)`` is false, both are
    replaced by their ``annotation.reverse_complement`` and the transcript
    index is shifted back by ``len(ref) - 1``.

    Returns a 4-tuple ``(seq, start, stop, annot)``:

    * on success, ``seq`` is the mutated region (cut to ``max_length``
      elements when ``max_length`` is truthy and the region is longer),
      ``start`` is ``region.mutation_start``, ``stop`` is
      ``start + region.n_inserted`` (capped at ``max_length`` when the
      sequence was cut) and ``annot`` is ``region.annot``;
    * on failure, ``(None, -1, -1, message)`` where ``message`` is the
      text that was also logged as a warning.

    ``padding`` is forwarded unchanged to
    ``mutate_protein_from_transcript``.
    """

    # sometimes empty strings get represented with a '.'
    if ref == ".":
        ref = ""
    if alt == ".":
        alt = ""

    forward = annotation.is_forward_strand(transcript_id)
    ref = ref if forward else annotation.reverse_complement(ref)
    alt = alt if forward else annotation.reverse_complement(alt)
    transcript = _ensembl.get_cds(transcript_id)

    def error_result(msg, *args):
        # Callers pass a format string plus separate args; the warning is
        # logged and the same formatted text is returned to the caller.
        logging.warning(msg, *args)
        return None, -1, -1, msg % args

    if not transcript:
        return error_result("Couldn't find transcript for ID %s", transcript_id)

    idx = annotation.get_transcript_index_from_pos(
        pos,
        transcript_id,
        skip_untranslated_region = True)
    if idx is None:
        return error_result(
            "Couldn't translate gene position %s into transcript index for %s",
            pos,
            transcript_id)
    elif idx >= len(transcript):
        return error_result(
            "Index %d longer than sequence (len %d) for transcript %s (%s)",
            idx,
            len(transcript),
            transcript_id,
            gene_mutation_description(pos, ref, alt))

    idx = idx if forward else idx - len(ref) + 1

    # 'ref' represents what the VCF file thought were the reference bases
    # at this position, now we actually check to make sure the transcript
    # agrees
    transcript_ref = str(transcript[idx:idx+len(ref)])
    if transcript_ref != ref:
        mutation_description = gene_mutation_description(pos, ref, alt)
        # Hand over format string and arguments separately, like the
        # other error paths: error_result formats with `msg % args`, so a
        # pre-formatted message containing a literal '%' would raise.
        return error_result(
            "VCF/MAF expected %s at idx %d of transcript %s, found %s (%s)",
            ref,
            idx,
            transcript_id,
            transcript_ref,
            mutation_description)
    region = mutate_protein_from_transcript(
        transcript,
        idx,
        ref,
        alt,
        padding = padding)
    start = region.mutation_start
    stop = start + region.n_inserted
    if max_length and len(region.seq) > max_length:
        seq = region.seq[:max_length]
        # NOTE(review): `start` is not capped alongside `stop`; confirm
        # callers tolerate start > max_length.
        stop = min(stop, max_length)
    else:
        seq = region.seq
    return seq, start, stop, region.annot