Esempio n. 1
0
    def setUp(self):
        """Create the Transcript fixtures shared by the tests.

        Sets up several empty-sequence transcripts that differ only in the ID
        arguments passed, plus one transcript built from ``self.gcg_ts`` that
        has the ``rs5650`` SNP attached under key 343.
        """
        transcript_id = "NM_002054.4"

        # Empty-sequence transcripts with different ID arguments.
        self.simple = Transcript("")
        self.simple_new = Transcript("")
        self.w_gid = Transcript("", gene_id="123")
        self.w_tid = Transcript("", transcript_id="tid")
        self.w_id = Transcript("", "gid", "tid")

        # Internal indexing is 0-based, whereas mutation syntax coming e.g.
        # from ANNOVAR is 1-based -- hence 343/114 next to "c.344C>A"/"p.A115D".
        syntax = MutationSyntax(transcript_id, 343, 114, "c.344C>A", "p.A115D")
        self.gcg_v1 = Variant("rs5650", VariationType.SNP, 2, 162145588, 'G',
                              'T', {transcript_id: syntax}, False, False)

        self.gcg_ts = "gcatagaatgcagatgagcaaagtgagtgggagagggaagtcatttgtaacaaaaactcattatttacagatgagaaatttatattgtcagcgtaatatctgtgaggctaaacagagctggagagtatataaaagcagtgcgccttggtgcagaagtacagagcttaggacacagagcacatcaaaagttcccaaagagggcttgctctctcttcacctgctctgttctacagcacactaccagaagacagcagaaatgaaaagcatttactttgtggctggattatttgtaatgctggtacaaggcagctggcaacgttcccttcaagacacagaggagaaatccagatcattctcagcttcccaggcagacccactcagtgatcctgatcagatgaacgaggacaagcgccattcacagggcacattcaccagtgactacagcaagtatctggactccaggcgtgcccaagattttgtgcagtggttgatgaataccaagaggaacaggaataacattgccaaacgtcacgatgaatttgagagacatgctgaagggacctttaccagtgatgtaagttcttatttggaaggccaagctgccaaggaattcattgcttggctggtgaaaggccgaggaaggcgagatttcccagaagaggtcgccattgttgaagaacttggccgcagacatgctgatggttctttctctgatgagatgaacaccattcttgataatcttgccgccagggactttataaactggttgattcagaccaaaatcactgacaggaaataactatatcactattcaagatcatcttcacaacatcacctgctagccacgtgggatgtttgaaatgttaagtcctgtaaatttaagaggtgtattctgaggccacattgctttgcatgccaataaataaattttcttttagtgttgtgtagccaaaaattacaaatggaataaagttttatcaaaatattgctaaaatatcagctttaaaatatgaaagtgctagattctgttattttcttcttattttggatgaagtaccccaacctgtttacatttagcgataaaattatttttctatgatataatttgtaaatgtaaattattccgatctgacatatctgcattataataataggagaatagaagaactggtagccacagtggtgaaattggaaagagaactttcttcctgaaacctttgtcttaaaaatactcagctttcaatgtatcaaagatacaattaaataaaattttcaagcttctttaccattgtct"
        attached_variants = {343: self.gcg_v1}
        self.w_v = Transcript(self.gcg_ts, 'GLUC_HUMAN', transcript_id,
                              attached_variants)
Esempio n. 2
0
    def setUp(self):
        """Create the Peptide fixtures shared by the tests.

        Builds a stand-alone peptide, a peptide mapped to a protein, and a
        peptide mapped to a deep copy of that protein whose ``vars`` holds the
        ``rs5650`` SNP.
        """
        source_id = "GLUC_HUMAN"

        self.simple = Peptide("SYFPEITHI")

        self.gcg_ps = "MKSIYFVAGLFVMLVQGSWQRSLQDTEEKSRSFSASQADPLSDPDQMNEDKRHSQGTFTSDYSKYLDSRRAQDFVQWLMNTKRNRNNIAKRHDEFERHAEGTFTSDVSSYLEGQAAKEFIAWLVKGRGRRDFPEEVAIVEELGRRHADGSFSDEMNTILDNLAARDFINWLIQTKITDRK"
        self.gcg_t1 = Transcript("", transcript_id=source_id)
        protein = Protein(self.gcg_ps,
                          transcript_id=source_id,
                          orig_transcript=self.gcg_t1)
        self.w_p = Peptide("PROTEIN", {protein: [0]})
        self.gcg_p1 = protein

        syntax = MutationSyntax(source_id, 344, 115, "c.344C>A", "p.A115D")
        self.gcg_v1 = Variant("rs5650", VariationType.SNP, 2, 162145588, 'G',
                              'T', {source_id: syntax}, False, False)

        # The variant is attached to a deep copy, so the protein stored in
        # self.gcg_p1 is not modified here.
        protein_with_variant = copy.deepcopy(protein)
        protein_with_variant.vars = {0: [self.gcg_v1]}
        self.w_v = Peptide("VARIANT", {protein_with_variant: [0]})
Esempio n. 3
0
def read_variant_effect_predictor(file, gene_filter=None):
    """
    Reads a VCF (v4.1) file generated by variant effect predictor and generates variant objects

    :param str file: Path to vcf file
    :param list gene_filter: List of proteins (in HGNC) of interest. Variants are filtered according
                             to this list; ``None`` or an empty list disables the filtering
    :return: list(Variant) - a list of Fred2.Core.Variant objects
    :raises ValueError: if a data line has fewer than 8 tab-separated columns or a position
                        cannot be converted to an integer
    """
    variants = []

    def get_type(ref, alt):
        """
            returns the variant type derived from the lengths of the ref and alt alleles
        """
        if len(ref) == 1 and len(alt) == 1:
            return VariationType.SNP
        if ref and not alt:
            # a multiple of three keeps the reading frame intact
            return VariationType.DEL if len(ref) % 3 == 0 else VariationType.FSDEL
        if alt and not ref:
            return VariationType.INS if len(alt) % 3 == 0 else VariationType.FSINS
        return VariationType.UNKNOWN

    coding_types = {
        "3_prime_UTR_variant", "5_prime_UTR_variant", "start_lost",
        "stop_gained", "frameshift_variant", "inframe_insertion",
        "inframe_deletion", "missense_variant", "protein_altering_variant",
        "splice_region_variant", "incomplete_terminal_codon_variant",
        "stop_retained_variant", "synonymous_variant",
        "coding_sequence_variant",
    }

    # Highest annotation index read below (HGNC_ID); shorter annotations are skipped.
    hgnc_idx = 22

    with open(file, "r") as f:
        for i, l in enumerate(f):
            record = l.strip()

            # skip comments and blank lines
            if l.startswith("#") or not record:
                continue

            chrom, gene_pos, var_id, ref, alt, _, _, info = record.split("\t")[:8]
            coding = {}
            isSynonymous = False

            for co in info.split(","):
                # Annotation layout:
                # Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|DISTANCE|STRAND|FLAGS|SYMBOL_SOURCE|HGNC_ID|...
                fields = co.strip().split("|")

                # skip additional info fields without (complete) annotation;
                # this also covers annotations too short to carry HGNC_ID
                if len(fields) <= hgnc_idx:
                    logging.warning(
                        "INFO field in different format in line: {}, skipping..."
                        .format(str(i)))
                    continue

                var_type = fields[1]
                transcript_type = fields[5]
                transcript_id = fields[6]
                transcript_pos = fields[13]
                prot_pos = fields[14]
                HGNC_ID = fields[hgnc_idx]

                # skip every feature type except Transcript (RegulatoryFeature, MotifFeature, ...)
                # and genes that are not of interest; the filter is tested for
                # truthiness first so that gene_filter=None does not raise
                if transcript_type != "Transcript" or (
                        gene_filter and HGNC_ID not in gene_filter):
                    continue

                consequences = var_type.split("&")

                # skip all intronic and other mutations that do not directly influence the
                # protein sequence, as well as unknown ('?') or missing positions
                if (any(t in coding_types for t in consequences)
                        and transcript_pos != "" and '?' not in transcript_pos):
                    # positioning in Fred2 is 0-based!!!
                    coding[transcript_id] = MutationSyntax(
                        transcript_id,
                        int(transcript_pos.split("-")[0]) - 1,
                        -1 if prot_pos == "" else
                        int(prot_pos.split("-")[0]) - 1,
                        co,
                        "",
                        geneID=HGNC_ID)

                # is variant synonymous?
                # NOTE(review): the flag is overwritten per annotation, so the last
                # accepted transcript annotation decides it -- confirm this is intended
                isSynonymous = "synonymous_variant" in consequences

            if coding:
                variants.append(
                    Variant(var_id, get_type(ref, alt), chrom, int(gene_pos),
                            ref.upper(), alt.upper(), coding, False,
                            isSynonymous))
    return variants
Esempio n. 4
0
 def setUp(self):
     """Create the ``rs5650`` SNP variant fixture stored in ``self.simple``."""
     transcript_id = "NM_002054.4"
     syntax = MutationSyntax(transcript_id, 344, 115, "c.344C>A", "p.A115D")
     self.simple = Variant("rs5650", VariationType.SNP, 2, 162145588, 'G',
                           'T', {transcript_id: syntax}, False, False)
def read_variant_effect_predictor(file, gene_filter=None):
    """
    Reads a VCF (v4.1) file generated by variant effect predictor and generates variant objects

    :param str file: Path to vcf file
    :param list gene_filter: List of proteins (in HGNC) of interest. Variants are filtered according
                             to this list; ``None`` or an empty list disables the filtering
    :return: list(Variant) - a list of Fred2.Core.Variant objects
    :raises ValueError: if a data line has fewer than 8 tab-separated columns, an annotation has
                        fewer than 14 '|'-separated fields, or a position is not an integer
    """
    variants = []

    def get_type(ref, alt):
        """
            returns the variant type derived from the lengths of the ref and alt alleles
        """
        if len(ref) == 1 and len(alt) == 1:
            return VariationType.SNP
        if ref and not alt:
            # a multiple of three keeps the reading frame intact
            return VariationType.DEL if len(ref) % 3 == 0 else VariationType.FSDEL
        if alt and not ref:
            return VariationType.INS if len(alt) % 3 == 0 else VariationType.FSINS
        return VariationType.UNKNOWN

    coding_types = {
        "3_prime_UTR_variant", "5_prime_UTR_variant", "start_lost", "stop_gained",
        "frameshift_variant", "inframe_insertion", "inframe_deletion", "missense_variant",
        "protein_altering_variant", "splice_region_variant", "incomplete_terminal_codon_variant",
        "stop_retained_variant", "synonymous_variant", "coding_sequence_variant",
    }

    with open(file, "r") as f:
        for l in f:
            record = l.strip()

            # skip comments and blank lines
            if l.startswith("#") or not record:
                continue

            chrom, gene_pos, var_id, ref, alt, _, _, info = record.split("\t")[:8]
            coding = {}
            isSynonymous = False

            for co in info.split(","):
                # Annotation layout:
                # Allele|Gene|Feature|Feature_type|Consequence|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|DISTANCE|STRAND|SYMBOL|SYMBOL_SOURCE|HGNC_ID|CCDS
                # NOTE(review): the 14th field (index 13) is bound to HGNC_ID although the
                # layout above lists SYMBOL at that index -- confirm the intended column
                (_, _, transcript_id, transcript_type, var_type, _, transcript_pos,
                 prot_pos, _, _, _, _, _, HGNC_ID) = co.strip().split("|")[:14]

                # skip every feature type except Transcript (RegulatoryFeature, MotifFeature, ...)
                # and genes that are not of interest; the filter is tested for
                # truthiness first so that gene_filter=None does not raise
                if transcript_type != "Transcript" or (gene_filter and HGNC_ID not in gene_filter):
                    continue

                consequences = var_type.split("&")

                # skip all intronic and other mutations that do not directly influence the
                # protein sequence, as well as unknown ('?') or missing positions
                if (any(t in coding_types for t in consequences)
                        and transcript_pos != "" and '?' not in transcript_pos):
                    # positioning in Fred2 is 0-based!!!
                    coding[transcript_id] = MutationSyntax(
                        transcript_id,
                        int(transcript_pos.split("-")[0]) - 1,
                        -1 if prot_pos == "" else int(prot_pos.split("-")[0]) - 1,
                        co, "", geneID=HGNC_ID)

                # is variant synonymous?
                # NOTE(review): the flag is overwritten per annotation, so the last
                # accepted transcript annotation decides it -- confirm this is intended
                isSynonymous = "synonymous_variant" in consequences

            if coding:
                variants.append(Variant(var_id, get_type(ref, alt), chrom, int(gene_pos),
                                        ref.upper(), alt.upper(), coding, False, isSynonymous))
    return variants