def setUp(self):
    """Create the Transcript fixtures used by the test methods."""
    # Transcripts built from an empty string; they differ only in the
    # identifiers handed to the constructor.
    self.simple = Transcript("")
    self.simple_new = Transcript("")
    self.w_gid = Transcript("", gene_id="123")
    self.w_tid = Transcript("", transcript_id="tid")
    self.w_id = Transcript("", "gid", "tid")

    # Internal indexing starts at 0! MutationSyntax coming e.g. from ANNOVAR
    # starts at 1!
    transcript_id = "NM_002054.4"
    mutation_syntax = MutationSyntax(transcript_id, 343, 114, "c.344C>A",
                                     "p.A115D")
    self.gcg_v1 = Variant("rs5650", VariationType.SNP, 2, 162145588, 'G',
                          'T', {transcript_id: mutation_syntax}, False, False)

    self.gcg_ts = "gcatagaatgcagatgagcaaagtgagtgggagagggaagtcatttgtaacaaaaactcattatttacagatgagaaatttatattgtcagcgtaatatctgtgaggctaaacagagctggagagtatataaaagcagtgcgccttggtgcagaagtacagagcttaggacacagagcacatcaaaagttcccaaagagggcttgctctctcttcacctgctctgttctacagcacactaccagaagacagcagaaatgaaaagcatttactttgtggctggattatttgtaatgctggtacaaggcagctggcaacgttcccttcaagacacagaggagaaatccagatcattctcagcttcccaggcagacccactcagtgatcctgatcagatgaacgaggacaagcgccattcacagggcacattcaccagtgactacagcaagtatctggactccaggcgtgcccaagattttgtgcagtggttgatgaataccaagaggaacaggaataacattgccaaacgtcacgatgaatttgagagacatgctgaagggacctttaccagtgatgtaagttcttatttggaaggccaagctgccaaggaattcattgcttggctggtgaaaggccgaggaaggcgagatttcccagaagaggtcgccattgttgaagaacttggccgcagacatgctgatggttctttctctgatgagatgaacaccattcttgataatcttgccgccagggactttataaactggttgattcagaccaaaatcactgacaggaaataactatatcactattcaagatcatcttcacaacatcacctgctagccacgtgggatgtttgaaatgttaagtcctgtaaatttaagaggtgtattctgaggccacattgctttgcatgccaataaataaattttcttttagtgttgtgtagccaaaaattacaaatggaataaagttttatcaaaatattgctaaaatatcagctttaaaatatgaaagtgctagattctgttattttcttcttattttggatgaagtaccccaacctgtttacatttagcgataaaattatttttctatgatataatttgtaaatgtaaattattccgatctgacatatctgcattataataataggagaatagaagaactggtagccacagtggtgaaattggaaagagaactttcttcctgaaacctttgtcttaaaaatactcagctttcaatgtatcaaagatacaattaaataaaattttcaagcttctttaccattgtct"

    # Transcript built from gcg_ts that carries gcg_v1 under key 343, the same
    # position given to the MutationSyntax above.
    variants_by_position = {343: self.gcg_v1}
    self.w_v = Transcript(self.gcg_ts, 'GLUC_HUMAN', "NM_002054.4",
                          variants_by_position)
def setUp(self):
    """Create the Peptide fixtures used by the test methods."""
    # Peptide constructed from a sequence string only.
    self.simple = Peptide("SYFPEITHI")

    # Protein built from gcg_ps and linked to an otherwise empty transcript.
    self.gcg_ps = "MKSIYFVAGLFVMLVQGSWQRSLQDTEEKSRSFSASQADPLSDPDQMNEDKRHSQGTFTSDYSKYLDSRRAQDFVQWLMNTKRNRNNIAKRHDEFERHAEGTFTSDVSSYLEGQAAKEFIAWLVKGRGRRDFPEEVAIVEELGRRHADGSFSDEMNTILDNLAARDFINWLIQTKITDRK"
    self.gcg_t1 = Transcript("", transcript_id="GLUC_HUMAN")
    protein = Protein(self.gcg_ps,
                      transcript_id='GLUC_HUMAN',
                      orig_transcript=self.gcg_t1)

    # Peptide mapped onto the protein with the position list [0].
    self.w_p = Peptide("PROTEIN", {protein: [0]})
    self.gcg_p1 = protein

    syntax_by_transcript = {
        "GLUC_HUMAN":
        MutationSyntax("GLUC_HUMAN", 344, 115, "c.344C>A", "p.A115D")
    }
    self.gcg_v1 = Variant("rs5650", VariationType.SNP, 2, 162145588, 'G',
                          'T', syntax_by_transcript, False, False)

    # Deep copy of the protein that additionally carries the variant under
    # key 0; the peptide below is mapped onto this copy instead.
    mutated_protein = copy.deepcopy(protein)
    mutated_protein.vars = {0: [self.gcg_v1]}
    self.w_v = Peptide("VARIANT", {mutated_protein: [0]})
def read_variant_effect_predictor(file, gene_filter=None):
    """
    Reads a VCF (v4.1) file generated by variant effect predictor and generates variant objects

    Lines starting with ``#`` and blank lines are skipped. The eighth column of every remaining line is
    split on "," and each piece is parsed as one "|"-separated annotation. Pieces with fewer than 23
    columns are logged and skipped. Only annotations whose feature type is "Transcript", whose HGNC_ID
    column passes ``gene_filter`` and whose consequence is one of the coding consequence types contribute
    a MutationSyntax. A Variant is emitted for a line only if at least one MutationSyntax was built.

    :param str file: Path to vcf file
    :param list gene_filter: List of HGNC identifiers of interest, compared against the HGNC_ID column of
                             each annotation. Variants are filtered according to this list; None or an
                             empty list disables the filtering
    :return: list(Variant) - a list of Fred2.Core.Variant objects
    """
    # Column layout of a single annotation:
    # Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|DISTANCE|STRAND|FLAGS|SYMBOL_SOURCE|HGNC_ID|TSL|APPRIS|SIFT|PolyPhen|AF|AFR_AF|AMR_AF|EAS_AF|EUR_AF|SAS_AF|AA_AF|EA_AF|gnomAD_AF|gnomAD_AFR_AF|gnomAD_AMR_AF|gnomAD_ASJ_AF|gnomAD_EAS_AF|gnomAD_FIN_AF|gnomAD_NFE_AF|gnomAD_OTH_AF|gnomAD_SAS_AF|CLIN_SIG|SOMATIC|PHENO|PUBMED|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE
    consequence_col = 1
    feature_type_col = 5
    feature_col = 6
    cds_position_col = 13
    protein_position_col = 14
    hgnc_id_col = 22

    def get_type(ref, alt):
        """
        returns the variant type
        """
        # NOTE(review): VCF indel records normally keep an anchor base, so
        # neither allele is empty and they end up as UNKNOWN here - confirm
        # whether the input is expected to be normalised beforehand.
        if len(ref) == 1 and len(alt) == 1:
            return VariationType.SNP
        if len(ref) > 0 and len(alt) == 0:
            if len(ref) % 3 == 0:
                return VariationType.DEL
            return VariationType.FSDEL
        if len(ref) == 0 and len(alt) > 0:
            if len(alt) % 3 == 0:
                return VariationType.INS
            return VariationType.FSINS
        return VariationType.UNKNOWN

    coding_types = frozenset((
        "3_prime_UTR_variant", "5_prime_UTR_variant", "start_lost",
        "stop_gained", "frameshift_variant", "inframe_insertion",
        "inframe_deletion", "missense_variant", "protein_altering_variant",
        "splice_region_variant", "incomplete_terminal_codon_variant",
        "stop_retained_variant", "synonymous_variant",
        "coding_sequence_variant",
    ))

    variants = []
    with open(file, "r") as f:
        for i, l in enumerate(f):
            #skip comments
            if l.startswith("#") or not l.strip():
                continue

            chrom, gene_pos, var_id, ref, alt, _, _, info = l.strip().split(
                "\t")[:8]
            coding = {}
            is_synonymous = False

            for co in info.split(","):
                fields = co.strip().split("|")
                #skip additional info fields without (complete) annotation;
                #the HGNC_ID column is the right-most one that is read
                if len(fields) <= hgnc_id_col:
                    logging.warning(
                        "INFO field in different format in line: {}, skipping..."
                        .format(str(i)))
                    continue

                transcript_id = fields[feature_col]
                transcript_pos = fields[cds_position_col]
                prot_pos = fields[protein_position_col]
                hgnc_id = fields[hgnc_id_col]
                consequences = fields[consequence_col].split("&")

                #pass every other feature type except Transcript (RegulatoryFeature, MotifFeature.)
                if fields[feature_type_col] != "Transcript":
                    continue
                #pass genes that are uninteresting for us; the filter is
                #tested for emptiness first so that gene_filter=None works
                if gene_filter and hgnc_id not in gene_filter:
                    continue

                #pass all intronic and other mutations that do not directly influence the protein sequence
                if any(t in coding_types for t in consequences):
                    #generate mutation syntax
                    #positioning in Fred2 is 0-based!!!
                    if transcript_pos != "" and '?' not in transcript_pos:
                        coding[transcript_id] = MutationSyntax(
                            transcript_id,
                            int(transcript_pos.split("-")[0]) - 1,
                            -1 if prot_pos == "" else
                            int(prot_pos.split("-")[0]) - 1,
                            co,
                            "",
                            geneID=hgnc_id)

                #is variant synonymous? (the last annotation that passed the
                #filters above decides)
                is_synonymous = "synonymous_variant" in consequences

            if coding:
                variants.append(
                    Variant(var_id, get_type(ref, alt), chrom, int(gene_pos),
                            ref.upper(), alt.upper(), coding, False,
                            is_synonymous))
    return variants
def setUp(self):
    """Create the single Variant fixture used by the test methods."""
    transcript_id = "NM_002054.4"
    # One MutationSyntax entry, keyed by the transcript id it is built with.
    syntax_by_transcript = {
        transcript_id:
        MutationSyntax(transcript_id, 344, 115, "c.344C>A", "p.A115D")
    }
    self.simple = Variant("rs5650", VariationType.SNP, 2, 162145588, 'G',
                          'T', syntax_by_transcript, False, False)
def read_variant_effect_predictor(file, gene_filter=None):
    """
    Reads a VCF (v4.1) file generated by variant effect predictor and generates variant objects

    Lines starting with ``#`` and blank lines are skipped. The eighth column of every remaining line is
    split on "," and each piece is parsed as one "|"-separated annotation. Pieces with fewer than 14
    columns are logged and skipped. Only annotations whose feature type is "Transcript", whose gene
    column passes ``gene_filter`` and whose consequence is one of the coding consequence types contribute
    a MutationSyntax. A Variant is emitted for a line only if at least one MutationSyntax was built.

    :param str file: Path to vcf file
    :param list gene_filter: List of proteins (in HGNC) of interest. Variants are filtered according to
                             this list; None or an empty list disables the filtering
    :return: list(Variant) - a list of Fred2.Core.Variant objects
    """
    # Function-scope import: the warning below is the only use of logging in
    # this function.
    import logging

    # Column layout of a single annotation:
    # Allele|Gene|Feature|Feature_type|Consequence|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|DISTANCE|STRAND|SYMBOL|SYMBOL_SOURCE|HGNC_ID|CCDS
    feature_col = 2
    feature_type_col = 3
    consequence_col = 4
    cds_position_col = 6
    protein_position_col = 7
    # Column matched against gene_filter; per the layout above this is SYMBOL.
    gene_col = 13

    def get_type(ref, alt):
        """
        returns the variant type
        """
        # NOTE(review): VCF indel records normally keep an anchor base, so
        # neither allele is empty and they end up as UNKNOWN here - confirm
        # whether the input is expected to be normalised beforehand.
        if len(ref) == 1 and len(alt) == 1:
            return VariationType.SNP
        if len(ref) > 0 and len(alt) == 0:
            if len(ref) % 3 == 0:
                return VariationType.DEL
            return VariationType.FSDEL
        if len(ref) == 0 and len(alt) > 0:
            if len(alt) % 3 == 0:
                return VariationType.INS
            return VariationType.FSINS
        return VariationType.UNKNOWN

    coding_types = frozenset((
        "3_prime_UTR_variant", "5_prime_UTR_variant", "start_lost",
        "stop_gained", "frameshift_variant", "inframe_insertion",
        "inframe_deletion", "missense_variant", "protein_altering_variant",
        "splice_region_variant", "incomplete_terminal_codon_variant",
        "stop_retained_variant", "synonymous_variant",
        "coding_sequence_variant",
    ))

    variants = []
    with open(file, "r") as f:
        for i, l in enumerate(f):
            #skip comments
            if l.startswith("#") or not l.strip():
                continue

            chrom, gene_pos, var_id, ref, alt, _, _, info = l.strip().split(
                "\t")[:8]
            coding = {}
            is_synonymous = False

            for co in info.split(","):
                fields = co.strip().split("|")
                #skip additional info fields without (complete) annotation;
                #the gene column is the right-most one that is read
                if len(fields) <= gene_col:
                    logging.warning(
                        "INFO field in different format in line: {}, skipping..."
                        .format(str(i)))
                    continue

                transcript_id = fields[feature_col]
                transcript_pos = fields[cds_position_col]
                prot_pos = fields[protein_position_col]
                hgnc_id = fields[gene_col]
                consequences = fields[consequence_col].split("&")

                #pass every other feature type except Transcript (RegulatoryFeature, MotifFeature.)
                if fields[feature_type_col] != "Transcript":
                    continue
                #pass genes that are uninteresting for us; the filter is
                #tested for emptiness first so that gene_filter=None works
                if gene_filter and hgnc_id not in gene_filter:
                    continue

                #pass all intronic and other mutations that do not directly influence the protein sequence
                if any(t in coding_types for t in consequences):
                    #generate mutation syntax
                    #positioning in Fred2 is 0-based!!!
                    #positions containing '?' cannot be converted and are left out
                    if transcript_pos != "" and '?' not in transcript_pos:
                        coding[transcript_id] = MutationSyntax(
                            transcript_id,
                            int(transcript_pos.split("-")[0]) - 1,
                            -1 if prot_pos == "" else
                            int(prot_pos.split("-")[0]) - 1,
                            co,
                            "",
                            geneID=hgnc_id)

                #is variant synonymous? (the last annotation that passed the
                #filters above decides)
                is_synonymous = "synonymous_variant" in consequences

            if coding:
                variants.append(
                    Variant(var_id, get_type(ref, alt), chrom, int(gene_pos),
                            ref.upper(), alt.upper(), coding, False,
                            is_synonymous))
    return variants