def main(args):
    '''Annotate each variant with hg19 liftover coordinates and write a
    bgzipped/tabixed output VCF.

    Adds up to three INFO tags: hg19_chr, hg19_pos and, when the variant
    carries an END field (SV/CNV), hg19_end. All variants are written,
    annotated or not.

    args keys used: inputfile, outputfile, chainfile.
    '''
    # Open input vcf
    vcf = vcf_parser.Vcf(args['inputfile'])
    # Add 3 new tag definitions - for hg19 liftover: chr, pos, and end
    hg19CHROM_definition = '##INFO=<ID=hg19_chr,Number=1,Type=String,Description="CHROM in hg19 using LiftOver from pyliftover">'
    hg19POS_definition = '##INFO=<ID=hg19_pos,Number=1,Type=Integer,Description="POS in hg19 using LiftOver from pyliftover (converted back to 1-based)">'
    hg19END_definition = '##INFO=<ID=hg19_end,Number=1,Type=Integer,Description="END in hg19 using LiftOver from pyliftover (converted back to 1-based)">'
    vcf.header.add_tag_definition(hg19END_definition)
    vcf.header.add_tag_definition(hg19POS_definition)
    vcf.header.add_tag_definition(hg19CHROM_definition)
    # Get chain file for liftover
    lo = LiftOver(args['chainfile'])

    def _hg19_chrom(chrom):
        # Drop a leading 'chr' prefix if present.
        # More robust than chrom.split('chr')[1], which raised IndexError
        # for contig names without the prefix.
        return chrom[3:] if chrom.startswith('chr') else chrom

    # Write header, then loop variants adding liftover coordinates to INFO
    # fields when a liftover hit exists. Write all variants.
    with open(args['outputfile'], 'w') as fo:
        vcf.write_header(fo)
        for vnt_obj in vcf.parse_variants():
            # Generate hg19 LO coordinates based on CHROM and POS
            # (pyliftover is 0-based, VCF POS is 1-based)
            hits = lo.convert_coordinate(vnt_obj.CHROM, vnt_obj.POS - 1)
            if hits:
                # add hg19_chr and hg19_pos
                vnt_obj.add_tag_info('hg19_chr=' + _hg19_chrom(hits[0][0]))
                vnt_obj.add_tag_info('hg19_pos=' + str(hits[0][1] + 1))
            #end if
            # Also want to incorporate END position for SV and CNV.
            # Parse END from INFO anchored at a field boundary so tags
            # such as CIEND= are not matched by mistake.
            try:
                END = int((';' + vnt_obj.INFO).split(';END=')[1].split(';')[0])
            except (IndexError, ValueError):  # END missing or not an integer
                END = ''
            #end try
            if END != '':
                hits_end = lo.convert_coordinate(vnt_obj.CHROM, END - 1)
                if hits_end:
                    try:
                        # if hg19_chr is already defined, don't add it again
                        vnt_obj.get_tag_value('hg19_chr')
                    except Exception:  # hg19_chr not defined yet, add it
                        vnt_obj.add_tag_info('hg19_chr=' + _hg19_chrom(hits_end[0][0]))
                    #end try
                    # add hg19_end (single code path replaces the duplicated
                    # assembly in both branches of the original)
                    vnt_obj.add_tag_info('hg19_end=' + str(hits_end[0][1] + 1))
                #end if
            #end if
            vcf.write_variant(fo, vnt_obj)
        #end for
    #end with
    # Compress and index the output
    subprocess.run(["bgzip", args['outputfile']])
    subprocess.run(["tabix", args['outputfile'] + ".gz"])
def main(args):
    '''Append a per-sample allele count (AC) to the SAMPLEGENO annotation.

    First pass counts non-reference alleles per most severe gene per sample;
    second pass rewrites SAMPLEGENO so its format becomes
    NUMGT|GT|AD|SAMPLEID|AC.

    args keys used: inputfile, outputfile.
    '''
    # Variables
    VEPtag = 'CSQ'
    samplegeno_def = '##INFO=<ID=SAMPLEGENO,Number=.,Type=String,Description="Sample genotype information. Subembedded:\'samplegeno\':Format:\'NUMGT|GT|AD|SAMPLEID|AC\'">'
    counts_dict = {}  # {ENSG: {sample: alt_count, ...}, ...}
    # Creating Vcf object
    vcf_obj = vcf_parser.Vcf(args['inputfile'])
    # Indexes
    ENSG_idx = vcf_obj.header.get_tag_field_idx(VEPtag, 'Gene')
    most_severe_idx = vcf_obj.header.get_tag_field_idx(VEPtag, 'most_severe')
    # Counting alleles per most severe gene (first pass)
    for vnt_obj in vcf_obj.parse_variants():
        ENSG = get_most_severe(vnt_obj, VEPtag, ENSG_idx, most_severe_idx)
        if not ENSG:
            continue  # no most severe gene for this variant
        #end if
        counts_dict.setdefault(ENSG, {})
        for ID_genotype in vnt_obj.IDs_genotypes:
            counts_dict[ENSG].setdefault(ID_genotype, 0)
            GT_0, GT_1 = vnt_obj.get_genotype_value(
                ID_genotype, 'GT').replace('|', '/').split('/')
            # count each non-reference, non-missing allele
            if GT_0 not in ['0', '.']:
                counts_dict[ENSG][ID_genotype] += 1
            #end if
            if GT_1 not in ['0', '.']:
                counts_dict[ENSG][ID_genotype] += 1
            #end if
        #end for
    #end for
    # Buffers
    # open() with explicit encoding replaces the dated codecs.open()
    fo = open(args['outputfile'], 'w', encoding='utf-8')
    # Update and write header
    vcf_obj.header.remove_tag_definition('SAMPLEGENO')
    vcf_obj.header.add_tag_definition(samplegeno_def, 'INFO')
    vcf_obj.write_header(fo)
    # Reading variants and adding samplegeno (second pass)
    for vnt_obj in vcf_obj.parse_variants():
        ENSG = get_most_severe(vnt_obj, VEPtag, ENSG_idx, most_severe_idx)
        # Update samplegeno
        samplegeno = []
        samplegeno_ = vnt_obj.get_tag_value('SAMPLEGENO').split(',')
        for sample_ in samplegeno_:
            _, _, _, SAMPLEID = sample_.split('|')
            # Default to 0 when the variant has no most severe gene or the
            # sample was never counted; the original raised KeyError here
            # when get_most_severe returned a falsy ENSG
            AC = counts_dict.get(ENSG, {}).get(SAMPLEID, 0)
            samplegeno.append(sample_ + '|' + str(AC))
        #end for
        # Add samplegeno to variant INFO
        vnt_obj.remove_tag_info('SAMPLEGENO')
        vnt_obj.add_tag_info('SAMPLEGENO={0}'.format(','.join(samplegeno)))
        # Write variant
        vcf_obj.write_variant(fo, vnt_obj)
    #end for
    fo.close()
def main(args):
    '''Build the SAMPLEGENO INFO annotation for every variant.

    For each genotyped sample records NUMGT|GT|AD|SAMPLEID, where NUMGT is
    the numeric genotype, GT the same genotype spelled with the actual
    allele sequences, and AD the allele depths ('/'-separated).

    args keys used: inputfile, outputfile.
    '''
    # Header definitions for the new annotation
    granite_def = '##GRANITE=<ID=SAMPLEGENO>'
    samplegeno_def = '##INFO=<ID=SAMPLEGENO,Number=.,Type=String,Description="Sample genotype information. Subembedded:\'samplegeno\':Format:\'NUMGT|GT|AD|SAMPLEID\'">'
    # Output buffer
    fo = open(args['outputfile'], 'w')
    # Input VCF
    vcf_obj = vcf_parser.Vcf(args['inputfile'])
    # Register definitions and emit the header
    vcf_obj.header.add_tag_definition(granite_def + '\n' + samplegeno_def, 'INFO')
    vcf_obj.write_header(fo)

    def spell_allele(code, pool):
        # Translate a numeric allele code into its sequence,
        # leaving missing alleles ('.') untouched
        return pool[int(code)] if code != '.' else code

    # Annotate every variant and write it out
    for vnt_obj in vcf_obj.parse_variants():
        # REF sits at index 0 so ALT indexes line up with numeric GT codes
        allele_pool = [vnt_obj.REF] + vnt_obj.ALT.split(',')
        entries = []
        for sample_id in vnt_obj.IDs_genotypes:
            numgt = vnt_obj.get_genotype_value(sample_id, 'GT').replace('|', '/')
            code_a, code_b = numgt.split('/')
            spelled = spell_allele(code_a, allele_pool) + '/' + spell_allele(code_b, allele_pool)
            depths = vnt_obj.get_genotype_value(sample_id, 'AD').replace(',', '/')
            entries.append('|'.join([numgt, spelled, depths, sample_id]))
        #end for
        # Add samplegeno to variant INFO and write
        vnt_obj.add_tag_info('SAMPLEGENO={0}'.format(','.join(entries)))
        vcf_obj.write_variant(fo, vnt_obj)
    #end for
    fo.close()
def main(args):
    '''Keep variants where at least one sample reaches the minimum depth.

    A variant is written as soon as one sample has a numeric DP genotype
    value >= min_depth; the output is then bgzipped and tabix-indexed.

    args keys used: inputSampleVCF, min_depth, outputfile.
    '''
    in_vcf = vcf_parser.Vcf(args['inputSampleVCF'])
    min_depth_to_keep = int(args['min_depth'])
    with open(args['outputfile'], 'w') as fo:
        in_vcf.write_header(fo)
        for vnt_obj in in_vcf.parse_variants():
            for sample in vnt_obj.IDs_genotypes:
                # fetch DP once per sample (the original called
                # get_genotype_value twice)
                DP = vnt_obj.get_genotype_value(sample, "DP")
                if DP != "." and int(DP) >= min_depth_to_keep:
                    in_vcf.write_variant(fo, vnt_obj)
                    break  # one qualifying sample is enough
    # Compress and index the output
    subprocess.run(["bgzip", args['outputfile']])
    subprocess.run(["tabix", args['outputfile'] + ".gz"])
def main(args):
    '''Collect de novo / heterozygous calling error statistics per family.

    For each anchor sample and its family (derived from the pedigree(s)),
    accumulates stats via get_stats_novo / get_stats_het, writes the
    aggregated stats as JSON to outputfile, and emits summary plots and,
    for the novo anchor, the passing variants.

    args keys used: verbose, novo, het, anchor, pedigree, inputfile,
    outputfile.
    '''
    # Variables
    NA_chroms = real_NA_chroms
    is_verbose = True if args['verbose'] else False
    sample_novo = args['novo'] if args['novo'] else ''
    sample_het_list = args['het'] if args['het'] else []
    anchor_list = args['anchor']
    # Check pedigree and anchor args
    # (a single pedigree may serve multiple anchors)
    if len(anchor_list) != len(args['pedigree']) and \
       len(args['pedigree']) != 1:
        sys.exit(
            '\nERROR in parsing arguments: number of pedigrees and anchors is different\n'
        )
    #end if
    # Buffers
    fo = open(args['outputfile'], 'w')
    # Creating Vcf object
    vcf_obj = vcf_parser.Vcf(args['inputfile'])
    # Get pedigree / pedigrees information
    pedigree_list = []
    for pedigree in args['pedigree']:
        # Loading pedigree: each entry is either a json file path
        # or a json string
        if os.path.isfile(pedigree):
            with open(pedigree) as fi:
                pedigree_list.append(json.load(fi))
            #end with
        else:
            try:
                pedigree_list.append(json.loads(pedigree))
            except Exception:
                sys.exit(
                    '\nERROR in parsing arguments: {0} must be either a json file or a string representing a json\n'
                    .format(pedigree))
            #end try
        #end if
    #end for
    # Check novoPP
    if sample_novo:
        try:
            novotag, _ = vcf_obj.header.check_tag_definition('novoPP')
        except Exception:
            sys.exit(
                '\nERROR in parsing arguments: novoCaller information missing in VCF file\n'
            )
        #end try
    #end if
    # Creating Pedigree object / objects
    pedigree_obj_list = []
    for pedigree in pedigree_list:
        pedigree_obj_list.append(pedigree_parser.Pedigree(pedigree))
    #end for
    # Initializing stat_dict for each anchor
    stat_dict_list = []
    for anchor in anchor_list:
        stat_dict_list.append({
            'error_het_family': {},
            'error_het': {},
            'error_novo_family': {}
        })
    #end for
    # Building family / families
    # (one family per anchor; a single pedigree is reused for all anchors)
    family_list = []
    if len(pedigree_obj_list) == 1:
        for anchor in anchor_list:
            family_list.append(pedigree_obj_list[0].get_family(anchor))
        #end for
    else:
        for i, pedigree_obj in enumerate(pedigree_obj_list):
            family_list.append(pedigree_obj.get_family(anchor_list[i]))
        #end for
    #end if
    # Reading variants
    # NOTE(review): if the VCF has no variants, `i` below is never bound and
    # the final report line raises NameError — confirm inputs are non-empty
    analyzed = 0
    vnt_obj_ = None
    for i, vnt_obj in enumerate(vcf_obj.parse_variants()):
        if is_verbose:
            sys.stderr.write('\rAnalyzing variant... ' + str(i + 1))
            sys.stderr.flush()
        #end if
        # # Check if chromosome is canonical and in valid format
        # if not check_chrom(vnt_obj.CHROM):
        #     continue
        # #end if
        analyzed += 1
        # Skip MAV that are redundant
        # (multi-allelic variants split onto consecutive rows share
        # CHROM and POS; only the first row is analyzed)
        if vnt_obj_:
            if vnt_obj_.CHROM == vnt_obj.CHROM and vnt_obj_.POS == vnt_obj.POS:
                continue
            #end if
        #end if
        vnt_obj_ = vnt_obj
        # Getting and updating stats for each stat_dict / family
        for l, family in enumerate(family_list):
            if sample_novo:
                if anchor_list[l] == sample_novo:
                    get_stats_novo(vnt_obj, stat_dict_list[l], family,
                                   NA_chroms, sample_novo, novotag)
                #end if
            #end if
            if sample_het_list:
                if anchor_list[l] in sample_het_list:
                    get_stats_het(vnt_obj, stat_dict_list[l], family,
                                  NA_chroms)
                #end if
            #end if
        #end for
    #end for
    # Writing output
    sys.stderr.write('\n\n...Writing results for ' + str(analyzed) +
                     ' analyzed variants out of ' + str(i + 1) +
                     ' total variants\n')
    sys.stderr.flush()
    # Create json
    stat_json = to_json(stat_dict_list, sample_het_list, sample_novo)
    # Write variants
    if sample_novo:
        novo_variants(stat_dict_list, args['outputfile'])
        for l, family in enumerate(family_list):
            if anchor_list[l] == sample_novo:
                try:
                    plot_AD_DP_ratio(stat_dict_list[l], sample_novo)
                except Exception:  # no variants with novoPP >= 0.9, skip
                    pass
                #end try
            #end if
        #end for
    #end if
    # Plots
    if len(sample_het_list) == 2:
        plot_error_het_family(stat_dict_list, sample_het_list)
        plot_distr_het_family(stat_dict_list, sample_het_list)
        plot_distr_het(stat_dict_list, sample_het_list)
    #end if
    # Write json to file
    json.dump(stat_json, fo, indent=2, sort_keys=False)
    # Closing buffers
    fo.close()
def main(args, test=False):
    '''Detect putative compound heterozygous variant pairs in a trio.

    Groups proband-heterozygous variants by VEP gene, pairs variants within
    each gene (optionally confirming phase against parental genotypes), and
    writes the annotated VCF plus a .summary and a .json stats file.

    args keys used: impact, SpliceAItag, VEPtag, sep, allow_undef,
    filter_cmpHet, verbose, inputfile, outputfile, trio.
    '''
    # Definitions
    CLNSIG_encode = [
        # high impact
        ('Pathogenic', 'C'),
        ('Likely_pathogenic', 'C'),
        # moderate/low impact
        ('Conflicting_interpretations', 'c'),
        ('Uncertain_significance', 'c'),
        ('risk_factor', 'c')
    ]
    # VEP_encode = {...} -> import from shared_vars
    # DStags = {...} -> import from shared_vars
    IMPCT_encode = {'HIGH': 1, 'MODERATE': 2, 'LOW': 3, 'MODIFIER': 4}
    IMPCT_decode = {1: 'H', 2: 'M', 3: 'L', 4: 'm'}
    # Variables
    is_impct = True if args['impact'] else False
    is_IMPACT = False  # True if IMPACT field is in VEP
    CLNSIGtag, CLNSIG_idx, is_CLNSIG = '', 0, False
    SpAItag_list, SpAI_idx_list, is_SpAI = [], [], False
    ENSG_idx, ENST_idx, IMPCT_idx = 0, 0, 0
    SpliceAItag = args['SpliceAItag']  # default None
    VEPtag = args['VEPtag'] if args['VEPtag'] else 'CSQ'
    sep = args['sep'] if args['sep'] else '&'
    allow_undef = True if args['allow_undef'] else False
    filter_cmpHet = True if args['filter_cmpHet'] else False
    granite_def = '##GRANITE=<ID=comHet>'
    comHet_def = '##INFO=<ID=comHet,Number=.,Type=String,Description="Putative compound heterozygous pairs. Subembedded:\'cmpHet\':Format:\'phase|gene|transcript|mate_variant\'">'
    comHet_impct_def = '##INFO=<ID=comHet,Number=.,Type=String,Description="Putative compound heterozygous pairs. Subembedded:\'cmpHet\':Format:\'phase|gene|transcript|impact_gene|impact_transcript|mate_variant\'">'
    is_verbose = True if args['verbose'] else False
    # Buffers
    fo = open(args['outputfile'], 'w')
    # Creating Vcf object
    vcf_obj = vcf_parser.Vcf(args['inputfile'])
    # Add definition to header
    # (the impact variant of the definition carries two extra fields)
    if is_impct:
        vcf_obj.header.add_tag_definition(
            granite_def + '\n' + comHet_impct_def, 'INFO')
    else:
        vcf_obj.header.add_tag_definition(granite_def + '\n' + comHet_def,
                                          'INFO')
    #end if
    # Writing header
    vcf_obj.write_header(fo)
    # Data structures
    stat_dict = {
        'genes': {},
        'trscrpts': {},
        'pairs': {
            'pairs_set': set(),
            'Phased': 0,
            'Unphased': 0
        },
        'vnts': {
            'Phased': 0,
            'Unphased': 0
        },
        'impact': {}
    }
    ENSG_dict = {}  # {ENSG: [vntHet_obj1, vntHet_obj2], ...}
    ENST_dict_tmp = {}  # {ENSG: ENST_set, ...}
    vntHet_set = set()  # variants to write in output -> {(i, vntHet_obj), ...}
                        # i is to track and keep variants order as they are read from input
    # Get idx for ENST and ENSG
    ENSG_idx = vcf_obj.header.get_tag_field_idx(VEPtag, 'Gene')
    ENST_idx = vcf_obj.header.get_tag_field_idx(VEPtag, 'Feature')
    # Get idx for SpliceAI, CLINVAR and VEP IMPACT
    if is_impct:
        try:
            IMPCT_idx = vcf_obj.header.get_tag_field_idx(VEPtag, 'IMPACT')
            is_IMPACT = True
        except Exception:  # missing IMPACT, IMPCT_idx will point to Consequence
            try:
                IMPCT_idx = vcf_obj.header.get_tag_field_idx(
                    VEPtag, 'Consequence')
            except Exception:
                sys.exit(
                    '\nERROR in VCF structure: either IMPACT or Consequence field in VEP is necessary to assign "--impact"\n'
                )
            #end try
        #end try
        try:
            if SpliceAItag:  # single tag has been specified
                tag, idx = vcf_obj.header.check_tag_definition(SpliceAItag)
                SpAItag_list.append(tag)
                SpAI_idx_list.append(idx)
            else:  # search for delta scores as default
                for DStag in DStags:
                    tag, idx = vcf_obj.header.check_tag_definition(DStag)
                    SpAItag_list.append(tag)
                    SpAI_idx_list.append(idx)
                #end for
            #end if
            is_SpAI = True
        except Exception:
            is_SpAI = False
        #end try
        try:
            CLNSIGtag, CLNSIG_idx = vcf_obj.header.check_tag_definition(
                'CLNSIG')
            is_CLNSIG = True
        except Exception:
            is_CLNSIG = False
        #end try
    #end if
    # Get trio IDs
    if len(args['trio']) > 3:
        sys.exit(
            '\nERROR in parsing arguments: too many sample IDs provided for trio\n'
        )
    #end if
    ID_list = args['trio']  # [proband_ID, parent_ID, parent_ID]
    # Reading variants
    analyzed = 0
    for c, vnt_obj in enumerate(vcf_obj.parse_variants()):
        if is_verbose:
            sys.stderr.write('\rAnalyzing variant... ' + str(c + 1))
            sys.stderr.flush()
        #end if
        # # Check if chromosome is canonical and in valid format
        # if not check_chrom(vnt_obj.CHROM):
        #     continue
        # #end if
        analyzed += 1
        # Reset data structures
        ENST_dict_tmp = {}
        IMPCT_dict_tmp = {}
        # Creating VariantHet object
        vntHet_obj = VariantHet(vnt_obj, c)
        if not filter_cmpHet:
            # if not filter, all variants are added to vntHet_set here
            # if filter, no variant is added here to vntHet_set,
            # compound heterozygous variants will be added after pairing
            vntHet_set.add((vntHet_obj.i, vntHet_obj))
        #end if
        # Check proband_ID genotype
        if vnt_obj.get_genotype_value(ID_list[0], 'GT').replace(
                '|', '/') not in ['0/1', '1/0']:
            continue  # go next if is not 0/1
        #end if
        # Get transcripts and genes information from VEP
        ENSG_list = VEP_field(vnt_obj, ENSG_idx, VEPtag)
        ENST_list = VEP_field(vnt_obj, ENST_idx, VEPtag)
        # Assign transcripts to genes
        for ENSG, ENST in zip(ENSG_list, ENST_list):
            if ENSG and ENST:
                ENST_dict_tmp.setdefault(ENSG, set())
                ENST_dict_tmp[ENSG].add(ENST)
            #end if
        #end for
        # Assign variant to genes if VEP
        if ENST_dict_tmp:
            # Assign variant to genes and update transcripts for variant
            for ENSG, ENST_set in ENST_dict_tmp.items():
                ENSG_dict.setdefault(ENSG, [])
                ENSG_dict[ENSG].append(vntHet_obj)
                vntHet_obj.add_ENST(ENSG, ENST_set)
            #end for
        #end if
        # Add impact information if is_impct
        if is_impct:
            # VEP
            IMPCT_list = VEP_field(vnt_obj, IMPCT_idx, VEPtag)
            IMPCT_encoded = encode_IMPACT(IMPCT_list, VEP_encode,
                                          IMPCT_encode, is_IMPACT, sep)
            for i, (ENSG, IMPCT) in enumerate(zip(ENSG_list, IMPCT_encoded)):
                if ENSG and IMPCT:
                    IMPCT_dict_tmp.setdefault(ENSG, set())
                    IMPCT_dict_tmp[ENSG].add(IMPCT)
                    vntHet_obj.add_ENST_IMPCT(ENST_list[i], IMPCT)
                #end if
            #end for
            if IMPCT_dict_tmp:
                for ENSG, IMPCT_set in IMPCT_dict_tmp.items():
                    vntHet_obj.add_ENSG_IMPCT(ENSG, IMPCT_set)
                #end for
            #end if
            # SpliceAI
            if is_SpAI:
                # if SpliceAI is within VEP
                # fetching only the first transcript
                # expected the same scores for all transcripts
                SpAI_vals = []
                for i, SpAItag in enumerate(SpAItag_list):
                    SpAI_val = get_tag_idx(vnt_obj, SpAItag, SpAI_idx_list[i])
                    # if SpliceAI is with VEP and is at the end of Format
                    # need to remove , that separate next transcript
                    try:
                        SpAI_vals.append(float(SpAI_val.split(',')[0]))
                    except Exception:
                        break
                    #end try
                #end for
                if SpAI_vals:
                    max_SpAI_vals = max(SpAI_vals)
                    if max_SpAI_vals >= 0.2:
                        vntHet_obj.add_SpAI(max_SpAI_vals)
                    #end if
                #end if
            #end if
            # CLINVAR
            if is_CLNSIG:
                # if CLNSIG is within VEP
                # fetching only the first transcript
                # expected the same CLNSIG for all transcripts
                CLNSIG_val = get_tag_idx(vnt_obj, CLNSIGtag, CLNSIG_idx)
                if CLNSIG_val:
                    vntHet_obj.add_CLINVAR(CLNSIG_val, CLNSIG_encode)
                #end if
            #end if
        #end if
    #end for
    # Pairing variants
    # NOTE(review): if the VCF has no variants, `c` is never bound and the
    # report line below raises NameError — confirm inputs are non-empty
    sys.stderr.write('\n')
    n = len(ENSG_dict)
    for n_i, (ENSG, vntHet_list) in enumerate(ENSG_dict.items()):
        if is_verbose:
            sys.stderr.write('\rPairing variants... {:.0f}%'.format(
                float(n_i) / n * 100))
            sys.stderr.flush()
        #end if
        # pair every variant in the gene with every other variant (p != i)
        p, l = 0, len(vntHet_list)
        while p < l:
            vntHet_obj = vntHet_list[p]
            for i, vntHet_obj_i in enumerate(vntHet_list):
                if i != p:
                    # if parents information,
                    # check genotypes to confirm is compound het or not
                    if is_comHet(vntHet_obj, vntHet_obj_i, ID_list,
                                 allow_undef):
                        vntHet_obj.add_pair(
                            vntHet_obj_i, ENSG,
                            phase(vntHet_obj, vntHet_obj_i, ID_list), sep,
                            is_impct, IMPCT_decode, test)
                        # Add vntHet to set to write since there is at least one pair
                        vntHet_set.add((vntHet_obj.i, vntHet_obj))
                    #end if
                #end if
            #end for
            p += 1
        #end while
    #end for
    if is_verbose:
        sys.stderr.write('\rPairing variants... {0}%'.format(100))
        sys.stderr.flush()
    #end if
    # Writing output
    sys.stderr.write('\n\n...Writing results for ' + str(analyzed) +
                     ' analyzed variants out of ' + str(c + 1) +
                     ' total variants\n')
    sys.stderr.flush()
    # Order and write variants to output file
    # (sorted by input order index i)
    for _, vntHet_obj in sorted(vntHet_set, key=lambda x: x[0]):
        fo.write(vntHet_obj.to_string())
        update_stats(vntHet_obj, stat_dict, sep, is_impct)
    #end for
    # Print summary
    fs = open(args['outputfile'] + '.summary', 'w')
    fj = open(args['outputfile'] + '.json', 'w')
    # Get stats as json
    stat_json = to_json(stat_dict, is_impct)
    # Write to file
    print_stats(stat_json, fs, is_impct)
    json.dump(stat_json, fj, indent=2, sort_keys=True)
    # Close buffers
    fo.close()
    fs.close()
    fj.close()
def runner(args):
    '''Annotate a VEP-annotated VCF with most-severe-transcript information.

    Adds three INFO tags — spliceaiMaxds (max SpliceAI delta score),
    variantClass (variant type) and GENES (fields extracted from the most
    severe transcript) — appends a most_severe flag field to each VEP
    transcript annotation, and rewrites the header definitions accordingly.

    args keys used: inputfile, outputfile, verbose.
    '''
    # Variables
    is_verbose = args['verbose']
    VEPtag = 'CSQ'
    VEP_order = {
        # HIGH
        'transcript_ablation': 1,
        'splice_acceptor_variant': 2,
        'splice_donor_variant': 3,
        'stop_gained': 4,
        'frameshift_variant': 5,
        'stop_lost': 6,
        'start_lost': 7,
        'transcript_amplification': 8,
        # MODERATE
        'inframe_insertion': 9,
        'inframe_deletion': 10,
        'missense_variant': 11,
        'protein_altering_variant': 12,
        # LOW
        'splice_region_variant': 13,
        'incomplete_terminal_codon_variant': 14,
        'start_retained_variant': 15,
        'stop_retained_variant': 16,
        'synonymous_variant': 17,
        # MODIFIER
        'coding_sequence_variant': 18,
        'mature_miRNA_variant': 19,
        '5_prime_UTR_variant': 20,
        '3_prime_UTR_variant': 21,
        'intron_variant': 22,
        'MODIFIER': 23
    }
    dbNSFP_fields = {
        # dbNSFP fields that may be a list
        # and need to be assigned to transcripts
        'Polyphen2_HVAR_pred': 0,
        'Polyphen2_HVAR_score': 0,
        'SIFT_pred': 0,
        'SIFT_score': 0
    }
    # Definitions
    vep_init = '##VEP=<ID={0}>'.format(VEPtag)
    genes_init = '##CGAP=<ID=GENES>'
    spliceai_def = '##INFO=<ID=spliceaiMaxds,Number=1,Type=Float,Description="SpliceAI max delta score">'
    genes_def = '##INFO=<ID=GENES,Number=.,Type=String,Description=". Subembedded:\'genes\':Format:\'most_severe_gene|most_severe_transcript|most_severe_feature_ncbi|most_severe_hgvsc|most_severe_hgvsp|most_severe_amino_acids|most_severe_sift_score|most_severe_polyphen_score|most_severe_maxentscan_diff|most_severe_consequence\'">'
    variant_def = '##INFO=<ID=variantClass,Number=1,Type=String,Description="Variant type">'
    # Buffers
    fo = io.open(args['outputfile'], 'w', encoding='utf-8')
    # Creating Vcf object
    vcf_obj = vcf_parser.Vcf(args['inputfile'])
    # Modify VEP definition
    # NOTE(review): assumes the header contains a VEP (CSQ) INFO definition;
    # otherwise vep_def keeps its unfilled placeholders — confirm upstream
    vep_def = '##INFO=<ID={0},Number=.,Type=String,Description="Consequence annotations from Ensembl VEP. Subembedded:\'transcript\':Format:\'{1}\'">'
    for line in vcf_obj.header.definitions.split('\n')[:-1]:
        if line.startswith('##INFO=<ID=' + VEPtag + ','):  ##<tag_type>=<ID=<tag>,...
            # renamed from `format` to avoid shadowing the builtin
            vep_format = line.split('Format:')[1]
            # Cleaning format
            for junk in (' ', '\'', '\"', '>'):
                vep_format = vep_format.replace(junk, '')
            #end for
            # Update definition with the extra most_severe field
            vep_field_list = vep_format.split('|')
            vep_field_list.append('most_severe')
            vep_def = vep_def.format(VEPtag, '|'.join(vep_field_list))
            break
        #end if
    #end for
    # Remove older VEP definition
    vcf_obj.header.remove_tag_definition(VEPtag)
    # Update and write custom definitions
    vcf_obj.header.add_tag_definition(vep_init + '\n' + genes_init, 'INFO')
    vcf_obj.header.add_tag_definition(spliceai_def, 'INFO')
    vcf_obj.header.add_tag_definition(genes_def, 'INFO')
    vcf_obj.header.add_tag_definition(variant_def, 'INFO')
    vcf_obj.header.add_tag_definition(vep_def, 'INFO')
    # Write header
    vcf_obj.write_header(fo)
    # Get SpliceAI ds indexes
    # DStags import from granite.shared_vars
    SpAItag_list, SpAI_idx_list = [], []
    for DStag in DStags:
        tag, idx = vcf_obj.header.check_tag_definition(DStag)
        SpAItag_list.append(tag)
        SpAI_idx_list.append(idx)
    #end for
    # Get VEP indexes
    # Indexes to resolve dbNSFP values by transcript
    dbnsfp_ENST_idx = vcf_obj.header.get_tag_field_idx(VEPtag,
                                                       'Ensembl_transcriptid')
    for field in dbNSFP_fields:
        dbNSFP_fields[field] = vcf_obj.header.get_tag_field_idx(VEPtag, field)
    #end for
    # Indexes for worst transcript (GENES)
    CNONICL_idx = vcf_obj.header.get_tag_field_idx(VEPtag, 'CANONICAL')
    ENSG_idx = vcf_obj.header.get_tag_field_idx(VEPtag, 'Gene')
    ENST_idx = vcf_obj.header.get_tag_field_idx(VEPtag, 'Feature')
    MANE_idx = vcf_obj.header.get_tag_field_idx(VEPtag, 'MANE')  #feature_ncbi
    HGVSC_idx = vcf_obj.header.get_tag_field_idx(VEPtag, 'HGVSc')
    HGVSP_idx = vcf_obj.header.get_tag_field_idx(VEPtag, 'HGVSp')
    AACIDS_idx = vcf_obj.header.get_tag_field_idx(VEPtag, 'Amino_acids')
    SIFT_idx = dbNSFP_fields['SIFT_score']
    PPHEN_idx = dbNSFP_fields['Polyphen2_HVAR_score']
    MAXENTDIFF_idx = vcf_obj.header.get_tag_field_idx(VEPtag, 'MaxEntScan_diff')
    CONSEQUENCE_idx = vcf_obj.header.get_tag_field_idx(VEPtag, 'Consequence')
    # Reading variants and adding new tags
    for i, vnt_obj in enumerate(vcf_obj.parse_variants()):
        if is_verbose:
            sys.stderr.write('\r' + str(i + 1))
            sys.stderr.flush()
        #end if
        # Clean dbNSFP by resolving values by transcript
        VEP_clean = clean_dbnsfp(vnt_obj, VEPtag, dbNSFP_fields,
                                 dbnsfp_ENST_idx, ENST_idx)
        if not VEP_clean:
            continue  # variant has no usable VEP annotation
        #end if
        # Get max SpliceAI max_ds
        maxds = get_maxds(vnt_obj, SpAItag_list, SpAI_idx_list)
        # Get most severe transcript
        worst_trscrpt = get_worst_trscrpt(VEP_clean, VEP_order, CNONICL_idx,
                                          CONSEQUENCE_idx)
        # Get variant class
        # import from granite.shared_functions
        clss = variant_type_ext(vnt_obj.REF, vnt_obj.ALT)
        # Add MAXDS to variant INFO
        if maxds:
            vnt_obj.add_tag_info('spliceaiMaxds={0}'.format(maxds))
        #end if
        # Add CLASS to variant INFO
        vnt_obj.add_tag_info('variantClass={0}'.format(clss.upper()))
        # Update and replace VEP tag in variant INFO
        # Adding field most_severe (0|1) to transcripts
        VEP_update = update_worst(VEP_clean, worst_trscrpt)
        # Replace VEP
        vnt_obj.remove_tag_info(VEPtag)
        vnt_obj.add_tag_info('{0}={1}'.format(VEPtag, VEP_update))
        # Add GENES to variant INFO
        worst_trscrpt_ = worst_trscrpt.split('|')
        worst_consequence_ = get_worst_consequence(
            worst_trscrpt_[CONSEQUENCE_idx], VEP_order)
        genes = '{0}|{1}|{2}|{3}|{4}|{5}|{6}|{7}|{8}|{9}'.format(
            worst_trscrpt_[ENSG_idx], worst_trscrpt_[ENST_idx],
            worst_trscrpt_[MANE_idx], worst_trscrpt_[HGVSC_idx],
            worst_trscrpt_[HGVSP_idx], worst_trscrpt_[AACIDS_idx],
            worst_trscrpt_[SIFT_idx], worst_trscrpt_[PPHEN_idx],
            worst_trscrpt_[MAXENTDIFF_idx], worst_consequence_)
        vnt_obj.add_tag_info('GENES={0}'.format(genes))
        # Write variant
        vcf_obj.write_variant(fo, vnt_obj)
    #end for
    # Close buffers
    sys.stderr.write('\n')
    fo.close()
def main(args):
    '''Restrict VEP annotations to genes in a provided list.

    Reads a genes list (one gene ID per line), then for each variant keeps
    only the VEP transcript entries whose gene is in the list. All variants
    are written; only the VEP tag content changes.

    args keys used: geneslist, VEPtag, verbose, inputfile, outputfile.
    '''
    # Variables
    ENSG_set = set()
    VEPtag = args['VEPtag'] if args['VEPtag'] else 'CSQ'
    is_verbose = True if args['verbose'] else False
    # Read genes list, skipping empty lines
    with open(args['geneslist']) as fi:
        for line in fi:
            line = line.rstrip()
            if line:
                ENSG_set.add(line)
            #end if
        #end for
    #end with
    # Buffers
    fo = open(args['outputfile'], 'w')
    # Creating Vcf object
    vcf_obj = vcf_parser.Vcf(args['inputfile'])
    # Writing header
    vcf_obj.write_header(fo)
    # Get ENSG (Gene) index in VEP
    ENSG_idx = vcf_obj.header.get_tag_field_idx(VEPtag, 'Gene')
    # Reading variants and writing passed
    analyzed = 0
    i = -1  # keeps the final report correct when the VCF has no variants
            # (the original raised NameError on `i` in that case)
    for i, vnt_obj in enumerate(vcf_obj.parse_variants()):
        if is_verbose:
            sys.stderr.write('\rAnalyzing variant... ' + str(i + 1))
            sys.stderr.flush()
        #end if
        analyzed += 1
        # Apply genes list and clean VEP
        VEP_clean = clean_VEP_byfield(vnt_obj, ENSG_idx, ENSG_set, VEPtag)
        # Remove old VEP
        vnt_obj.remove_tag_info(VEPtag)
        # Add cleaned VEP if any
        if VEP_clean:
            vnt_obj.add_tag_info('{0}={1}'.format(VEPtag, VEP_clean))
        #end if
        # Write variant
        vcf_obj.write_variant(fo, vnt_obj)
    #end for
    sys.stderr.write('\n\n...Wrote results for ' + str(analyzed) +
                     ' analyzed variants out of ' + str(i + 1) +
                     ' total variants\n')
    sys.stderr.flush()
    # Closing buffers
    fo.close()
def main(args):
    '''Drop variants by population allele frequency and/or a BIG blacklist.

    Variants with allele frequency above afthr (read from the aftag INFO
    tag), or flagged at their position in the BIG file, are filtered out;
    everything else is written to outputfile.

    args keys used: afthr, aftag, bigfile, verbose, inputfile, outputfile.
    '''
    # Variables
    afthr, aftag = 0., ''
    big_dict = {}
    is_afthr = True if args['afthr'] else False
    is_bigfile = True if args['bigfile'] else False
    is_verbose = True if args['verbose'] else False
    # Creating Vcf object
    vcf_obj = vcf_parser.Vcf(args['inputfile'])
    # Check arguments
    if is_afthr:
        afthr = float(args['afthr'])
        if args['aftag']:
            aftag, aftag_idx = vcf_obj.header.check_tag_definition(
                args['aftag'])
        else:
            sys.exit(
                '\nERROR in parsing arguments: to filter by population allele frequency please specify the TAG to use\n'
            )
        #end if
    else:
        if not is_bigfile:
            sys.exit(
                '\nERROR in parsing arguments: to blacklist specify a BIG file and/or a threshold for population allele frequency and the TAG to use\n'
            )
        #end if
    #end if
    # Buffers
    fo = open(args['outputfile'], 'w')
    # Loading big if specified
    if is_bigfile:
        big_dict = load_big(args['bigfile'])
    #end if
    # Writing header
    vcf_obj.write_header(fo)
    # Reading variants and writing passed
    analyzed = 0
    i = -1  # keeps the final report correct when the VCF has no variants
            # (the original raised NameError on `i` in that case)
    for i, vnt_obj in enumerate(vcf_obj.parse_variants()):
        if is_verbose:
            sys.stderr.write('\rAnalyzing variant... ' + str(i + 1))
            sys.stderr.flush()
        #end if
        analyzed += 1
        # Get allele frequency from aftag tag if requested
        if is_afthr:
            af = allele_frequency(vnt_obj, aftag, aftag_idx)
            # Check allele frequency
            if af > afthr:
                continue  # too common, drop
            #end if
        #end if
        if is_bigfile:
            vtype = variant_type(vnt_obj.REF, vnt_obj.ALT)
            key = vnt_obj.CHROM + '_' + vtype
            try:
                is_blacklist = big_dict[key][vnt_obj.POS]
            except (KeyError, IndexError):  # narrowed from bare except
                sys.exit(
                    '\nERROR in blacklist check: {0}:{1} missing in BIG file'.
                    format(key, vnt_obj.POS))
            #end try
            if is_blacklist:
                continue
            #end if
        #end if
        # All good, pass and write variant
        vcf_obj.write_variant(fo, vnt_obj)
    #end for
    sys.stderr.write('\n\n...Wrote results for ' + str(analyzed) +
                     ' analyzed variants out of ' + str(i + 1) +
                     ' total variants\n')
    sys.stderr.flush()
    # Closing buffers
    fo.close()
def main(args, test=False):
    '''novoCaller: score variants for being de novo in a trio.

    For variants under the allele frequency threshold, computes a posterior
    probability (novoPP) from trio and unrelated BAM/RCK pileups, then
    writes passing variants sorted by descending PP, adding novoPP to INFO
    and per-sample strand read counts (RSTR) to genotypes.

    args keys used: bam, afthr, aftag, ppthr, afthr_unrelated, MQthr, BQthr,
    ADthr, verbose, inputfile, outputfile, unrelatedfiles, triofiles.
    test=True switches to the test set of NA chromosomes.
    '''
    # Variables
    is_bam = True if args['bam'] else False
    is_afthr = True if args['afthr'] else False
    afthr, aftag, aftag_idx = 1., 'novoAF', 0  # novoAF as aftag placeholder if not is_afthr
    ppthr = float(args['ppthr']) if args['ppthr'] else 0.
    afthr_unrelated = float(args['afthr_unrelated']) if args['afthr_unrelated'] else 1.
    MQthr = int(args['MQthr']) if args['MQthr'] else 0
    BQthr = int(args['BQthr']) if args['BQthr'] else 0
    ADthr = int(args['ADthr']) if args['ADthr'] else 0
    RSTR_def = '##FORMAT=<ID=RSTR,Number=4,Type=Integer,Description="Read counts by strand for ref and alt alleles (Rf,Af,Rr,Ar)">'
    novoCaller_def = '##INFO=<ID=novoPP,Number=1,Type=Float,Description="Posterior probability from novoCaller">'
    # NA chromosomes set -> import from shared_vars
    if test:
        NA_chroms = test_NA_chroms
    else:
        NA_chroms = real_NA_chroms
    #end if
    is_NA = False
    is_verbose = True if args['verbose'] else False
    # Buffers
    fo = open(args['outputfile'], 'w')
    # Creating Vcf object
    vcf_obj = vcf_parser.Vcf(args['inputfile'])
    # Check arguments
    if is_afthr:
        afthr = float(args['afthr'])
        if args['aftag']:
            aftag, aftag_idx = vcf_obj.header.check_tag_definition(args['aftag'])
        else:
            sys.exit('\nERROR in parsing arguments: to filter by population allele frequency please specify the TAG to use\n')
        #end if
    #end if
    # Data structures
    variants_passed = []
    # Getting files and associated IDs
    sys.stderr.write('Getting unrelated and trio files...\n')
    sys.stderr.flush()
    if is_bam:  # if bam files
        unrelated_files, IDs_unrelated = buffering_bams(args['unrelatedfiles'])
        trio_files, IDs_trio = buffering_bams(args['triofiles'])  # [parent, parent, child]
    else:
        unrelated_files, IDs_unrelated = buffering_rcks(args['unrelatedfiles'])
        trio_files, IDs_trio = buffering_rcks(args['triofiles'])  # [parent, parent, child]
    #end if
    # Checking info files for trio is complete
    if len(trio_files) != 3:
        sys.exit('\nERROR in BAMs info file for trio: missing information for some family member\n')
    #end if
    # Checking information for trio is complete in the vcf
    for ID in IDs_trio:
        if ID not in vcf_obj.header.IDs_genotypes:
            sys.exit('\nERROR in VCF file: missing information for some family member\n')
        #end if
    #end for
    # Reading variants
    # NOTE(review): if the VCF has no variants, `i` is never bound and the
    # report line after the loop raises NameError — confirm inputs are
    # non-empty
    analyzed = 0
    for i, vnt_obj in enumerate(vcf_obj.parse_variants()):
        if is_verbose:
            sys.stderr.write('\rAnalyzing variant... ' + str(i + 1))
            sys.stderr.flush()
        #end if
        # # Check if chromosome is canonical and in valid format
        # if not check_chrom(vnt_obj.CHROM): # skip variant if not
        #     continue
        # #end if
        # Getting allele frequency from novoAF tag
        af = allele_frequency(vnt_obj, aftag, aftag_idx)
        # is_NA reset
        is_NA = False
        # Calculate statistics
        if af <= afthr:  # hard filter on allele frequency
            analyzed += 1
            PP, ADfs, ADrs, ADfs_U, ADrs_U, _, _, _, AF_unrel = \
                PP_calc(trio_files, unrelated_files,
                        vnt_obj.CHROM, vnt_obj.POS, vnt_obj.REF, vnt_obj.ALT,
                        af, MQthr, BQthr, is_bam)
            if vnt_obj.CHROM.replace('chr', '') in NA_chroms:
                # model assumptions does not apply to sex and mithocondrial chromosomes, PP -> NA
                PP = 0.
                is_NA = True
            #end if
            if ADthr and ALT_count_check_parents(ADfs, ADrs, ADthr):
                # AD in parents over ADthr, PP -> 0
                PP = 0.
                is_NA = False
            #end if
            if AF_unrel <= afthr_unrelated and PP >= ppthr:  # hard filter on AF_unrel, PP
                variants_passed.append([PP, ADfs, ADrs, ADfs_U, ADrs_U, AF_unrel, is_NA, vnt_obj])
            #end if
        #end if
    #end for
    # Writing output
    sys.stderr.write('\n\n...Writing results for ' + str(analyzed) + ' analyzed variants out of ' + str(i + 1) + ' total variants\n')
    sys.stderr.flush()
    # Header definitions
    # (substring checks; avoid re-adding definitions that already exist)
    is_RSTR = 'RSTR' in vcf_obj.header.definitions
    is_novoCaller = 'novoPP' in vcf_obj.header.definitions
    if not is_RSTR:
        vcf_obj.header.add_tag_definition(RSTR_def, 'FORMAT')
    #end if
    if not is_novoCaller:
        vcf_obj.header.add_tag_definition(novoCaller_def, 'INFO')
    #end if
    vcf_obj.write_definitions(fo)
    # Adding to header columns unrelated samples missing IDs
    fo.write(vcf_obj.header.columns.rstrip())
    for ID in IDs_unrelated:
        if ID not in vcf_obj.header.IDs_genotypes:
            fo.write('\t' + ID)
        #end if
    #end for
    fo.write('\n')
    # Variants passed, sorted by descending posterior probability
    for variant in sorted(variants_passed, key=lambda x: x[0], reverse=True):
        PP, ADfs, ADrs, ADfs_U, ADrs_U, AF_unrel, is_NA, vnt_obj = variant
        # Removing older tags fields if present
        if is_RSTR:
            vnt_obj.remove_tag_genotype('RSTR')
        #end if
        if is_novoCaller:
            vnt_obj.remove_tag_info('novoPP')
        #end if
        # Adding new tag
        # (NA variants carry no novoPP tag at all)
        if not is_NA:
            vnt_obj.add_tag_info('novoPP={0}'.format(PP))
        #end if
        # Fill the trailing fields dropped in genotypes
        vnt_obj.complete_genotype()
        # Updating genotypes trio
        for i, ID in enumerate(IDs_trio):
            values = '{0},{1},{2},{3}'.format(int(ADfs[i][0]), int(ADfs[i][1]), int(ADrs[i][0]), int(ADrs[i][1]))
            vnt_obj.add_values_genotype(ID, values)
        #end for
        # Updating genotypes unrelated
        # (samples absent from the VCF get an empty genotype + counts,
        # appended as extra columns)
        unrelated_genotypes = []
        for i, ID in enumerate(IDs_unrelated):
            values = '{0},{1},{2},{3}'.format(int(ADfs_U[i][0]), int(ADfs_U[i][1]), int(ADrs_U[i][0]), int(ADrs_U[i][1]))
            if ID in vnt_obj.GENOTYPES:
                vnt_obj.add_values_genotype(ID, values)
            else:
                unrelated_genotypes.append(vnt_obj.empty_genotype() + ':' + values)
            #end if
        #end for
        # Updating FORMAT
        vnt_obj.add_tag_format('RSTR')
        # Writing output
        if unrelated_genotypes:
            fo.write(vnt_obj.to_string().rstrip() + '\t' + '\t'.join(unrelated_genotypes) + '\n')
        else:
            vcf_obj.write_variant(fo, vnt_obj)
        #end if
    #end for
    # Closing files buffers
    fo.close()
    if is_bam:
        for buffer in unrelated_files:
            buffer.close()
        #end for
        for buffer in trio_files:
            buffer.close()
def main(args):
    '''Clean VEP (Consequence) annotations in a VCF file.

    Reads args['inputfile'], removes unwanted VEP consequence terms and
    applies rescue terms per the --VEP* arguments. Variants passing the
    SpliceAI delta-score threshold (args['SpliceAI']) additionally rescue
    the SpliceAI-related terms (VEPSpliceAI). Result is written to
    args['outputfile']; with args['filter_VEP'], variants left with no
    VEP annotation are dropped from the output.

    Expected keys in args: inputfile, outputfile, VEP, filter_VEP,
    VEPtag, VEPsep, VEPrescue, VEPremove, SpliceAI, SpliceAItag, tag,
    verbose.
    '''
    # Variables
    VEPrescue, consequence_idx = set(), 0
    # VEPremove = {...} -> import from shared_vars
    # VEPSpliceAI = {...} -> import from shared_vars
    # DStags = {...} -> import from shared_vars
    # Work on a local copy so repeated calls to main() do not keep
    # accumulating terms in the shared_vars VEPremove set
    VEPremove_set = set(VEPremove)
    SpAItag_list, SpAI_idx_list = [], []
    is_VEP = bool(args['VEP'])
    is_filter_VEP = bool(args['filter_VEP'])
    VEPtag = args['VEPtag'] if args['VEPtag'] else 'CSQ'
    VEPsep = args['VEPsep'] if args['VEPsep'] else '&'
    SpliceAI_thr = float(args['SpliceAI']) if args['SpliceAI'] else 0.
    SpliceAItag = args['SpliceAItag']  # default None
    is_SpAI = False
    is_verbose = bool(args['verbose'])
    # Buffers - context manager guarantees fo is closed
    # also on the sys.exit error path below
    with open(args['outputfile'], 'w') as fo:
        # Creating Vcf object
        vcf_obj = vcf_parser.Vcf(args['inputfile'])
        # Clean header definitions in INFO block for specified tags
        if args['tag']:
            for tag in args['tag']:
                vcf_obj.header.remove_tag_definition(tag, 'INFO')
            #end for
        #end if
        # Writing header
        vcf_obj.write_header(fo)
        # VEP
        if is_VEP:
            consequence_idx = vcf_obj.header.get_tag_field_idx(
                                VEPtag, 'Consequence')
            if args['VEPrescue']:
                VEPrescue = set(args['VEPrescue'])
            #end if
            if args['VEPremove']:
                VEPremove_set.update(args['VEPremove'])
            #end if
        elif args['VEPrescue'] or args['VEPremove']:
            sys.exit(
                '\nERROR in parsing arguments: specify the flag "--VEP" to filter by VEP annotations to apply rescue terms or remove additional terms\n'
                )
        #end if
        # SpliceAI
        if SpliceAI_thr:
            if SpliceAItag:  # single tag has been specified
                tag, idx = vcf_obj.header.check_tag_definition(SpliceAItag)
                SpAItag_list.append(tag)
                SpAI_idx_list.append(idx)
            else:  # search for delta scores as default
                for DStag in DStags:
                    tag, idx = vcf_obj.header.check_tag_definition(DStag)
                    SpAItag_list.append(tag)
                    SpAI_idx_list.append(idx)
                #end for
            #end if
        #end if
        # Reading variants and writing passed
        analyzed = 0
        # i = -1 makes the summary report "0 total variants" on an empty
        # VCF (original code raised NameError after an empty loop)
        i = -1
        for i, vnt_obj in enumerate(vcf_obj.parse_variants()):
            if is_verbose:
                sys.stderr.write('\rAnalyzing variant... ' + str(i + 1))
                sys.stderr.flush()
            #end if
            analyzed += 1
            # Clean tags if specified
            if args['tag']:
                for tag in args['tag']:
                    vnt_obj.remove_tag_info(tag)
                #end for
            #end if
            # is_SpAI reset, then check SpliceAI threshold
            is_SpAI = False
            if SpliceAI_thr:
                if check_spliceAI(vnt_obj, SpAI_idx_list, SpAItag_list,
                                  SpliceAI_thr):
                    is_SpAI = True
                #end if
            #end if
            # Clean VEP
            if is_VEP:
                # Variants that pass the SpliceAI threshold also rescue
                # the SpliceAI-related consequence terms
                if is_SpAI:
                    VEP_clean = clean_VEP(vnt_obj, consequence_idx,
                                          VEPremove_set,
                                          VEPrescue.union(VEPSpliceAI),
                                          VEPtag, VEPsep)
                else:
                    VEP_clean = clean_VEP(vnt_obj, consequence_idx,
                                          VEPremove_set, VEPrescue,
                                          VEPtag, VEPsep)
                #end if
                # Remove old VEP
                vnt_obj.remove_tag_info(VEPtag)
                # Add cleaned VEP if any
                if VEP_clean:
                    vnt_obj.add_tag_info('{0}={1}'.format(VEPtag, VEP_clean))
                elif is_filter_VEP:
                    # no consequence left -> drop variant when filtering
                    continue
                #end if
            #end if
            # Write variant
            vcf_obj.write_variant(fo, vnt_obj)
        #end for
        sys.stderr.write('\n\n...Wrote results for ' + str(analyzed) +
                         ' analyzed variants out of ' + str(i + 1) +
                         ' total variants\n')
        sys.stderr.flush()
def main(args):
    '''Whitelist variants from a VCF file.

    A variant is written to args['outputfile'] if it passes ANY of the
    enabled checks, evaluated in order: BED-region membership, VEP
    consequence terms, SpliceAI delta-score threshold, CLINVAR
    annotation. Variants that pass none of the enabled checks are
    dropped.

    Expected keys in args: inputfile, outputfile, VEP, VEPtag, VEPsep,
    VEPrescue, VEPremove, CLINVAR, CLINVARtag, CLINVARonly, SpliceAI,
    SpliceAItag, BEDfile, verbose.
    '''
    # Variables
    # sets (not {} dicts as before) for consistency with the VEP main
    VEPrescue, consequence_idx = set(), 0
    # VEPremove = {...} -> import from shared_vars
    # DStags = {...} -> import from shared_vars
    # Work on a local copy so repeated calls to main() do not keep
    # accumulating terms in the shared_vars VEPremove set
    VEPremove_set = set(VEPremove)
    CLINVARonly = set()
    CLNtag, CLN_idx = '', 0
    CLNSIGtag, CLNSIG_idx = '', 0
    SpAItag_list, SpAI_idx_list = [], []
    BED_bitarrays = {}
    is_VEP = bool(args['VEP'])
    is_CLINVAR = bool(args['CLINVAR'])
    SpliceAI_thr = float(args['SpliceAI']) if args['SpliceAI'] else 0.
    is_BEDfile = bool(args['BEDfile'])
    VEPtag = args['VEPtag'] if args['VEPtag'] else 'CSQ'
    CLINVARtag = args['CLINVARtag'] if args['CLINVARtag'] else 'ALLELEID'
    SpliceAItag = args['SpliceAItag']  # default None
    VEPsep = args['VEPsep'] if args['VEPsep'] else '&'
    is_verbose = bool(args['verbose'])
    # Buffers - context manager guarantees fo is closed
    # also on the sys.exit error paths below
    with open(args['outputfile'], 'w') as fo:
        # Creating Vcf object
        vcf_obj = vcf_parser.Vcf(args['inputfile'])
        # Writing header
        vcf_obj.write_header(fo)
        # VEP
        if is_VEP:
            consequence_idx = vcf_obj.header.get_tag_field_idx(
                                VEPtag, 'Consequence')
            if args['VEPrescue']:
                VEPrescue = set(args['VEPrescue'])
            #end if
            if args['VEPremove']:
                VEPremove_set.update(args['VEPremove'])
            #end if
        elif args['VEPrescue'] or args['VEPremove']:
            sys.exit(
                '\nERROR in parsing arguments: specify the flag "--VEP" to filter by VEP annotations to apply rescue terms or remove additional terms\n'
                )
        #end if
        # CLINVAR
        if is_CLINVAR:
            CLNtag, CLN_idx = vcf_obj.header.check_tag_definition(CLINVARtag)
            if args['CLINVARonly']:
                CLINVARonly = set(args['CLINVARonly'])
                CLNSIGtag, CLNSIG_idx = vcf_obj.header.check_tag_definition(
                                            'CLNSIG')
            #end if
        elif args['CLINVARonly']:
            sys.exit(
                '\nERROR in parsing arguments: specify the flag "--CLINVAR" to filter by CLINVAR annotations to specify tags or keywords to whitelist\n'
                )
        #end if
        # SpliceAI
        if SpliceAI_thr:
            if SpliceAItag:  # single tag has been specified
                tag, idx = vcf_obj.header.check_tag_definition(SpliceAItag)
                SpAItag_list.append(tag)
                SpAI_idx_list.append(idx)
            else:  # search for delta scores as default
                for DStag in DStags:
                    tag, idx = vcf_obj.header.check_tag_definition(DStag)
                    SpAItag_list.append(tag)
                    SpAI_idx_list.append(idx)
                #end for
            #end if
        #end if
        # BED
        if is_BEDfile:
            BED_bitarrays = bed_to_bitarray(args['BEDfile'])
        #end if
        # Reading variants and writing passed
        analyzed = 0
        # i = -1 makes the summary report "0 total variants" on an empty
        # VCF (original code raised NameError after an empty loop)
        i = -1
        for i, vnt_obj in enumerate(vcf_obj.parse_variants()):
            if is_verbose:
                sys.stderr.write('\rAnalyzing variant... ' + str(i + 1))
                sys.stderr.flush()
            #end if
            analyzed += 1
            # Check BED
            if is_BEDfile:
                try:
                    # CHROM and POS can miss in the BED file,
                    # if that just pass to next checks
                    if BED_bitarrays[vnt_obj.CHROM][vnt_obj.POS]:
                        vcf_obj.write_variant(fo, vnt_obj)
                        continue
                    #end if
                except Exception:
                    pass
                #end try
            #end if
            # Check VEP
            if is_VEP:
                if check_VEP(vnt_obj, consequence_idx, VEPremove_set,
                             VEPrescue, VEPtag, VEPsep):
                    vcf_obj.write_variant(fo, vnt_obj)
                    continue
                #end if
            #end if
            # Check SpliceAI
            if SpliceAI_thr:
                if check_spliceAI(vnt_obj, SpAI_idx_list, SpAItag_list,
                                  SpliceAI_thr):
                    vcf_obj.write_variant(fo, vnt_obj)
                    continue
                #end if
            #end if
            # Check CLINVAR
            if is_CLINVAR:
                if check_CLINVAR(vnt_obj, CLN_idx, CLNtag, CLNSIG_idx,
                                 CLNSIGtag, CLINVARonly):
                    vcf_obj.write_variant(fo, vnt_obj)
                    continue
                #end if
            #end if
        #end for
        sys.stderr.write('\n\n...Wrote results for ' + str(analyzed) +
                         ' analyzed variants out of ' + str(i + 1) +
                         ' total variants\n')
        sys.stderr.flush()