def filter_ann_txt_files_just_maf(infile, cols_to_filter, freq_req, outfile_prefix, protein_changing_definitions, exonic_definitions, genename):
    """Write one frequency- and quality-filtered table per cut-off in freq_req.

    For every value in freq_req two filtering_annotated.filter passes are
    chained through the scratch file "1.temp":
      1. column cols_to_filter[0] is tested with '<=' against the cut-off
      2. the columns in cols_to_filter[1] are tested with '>=' against
         30 and 5 (quality and coverage per the original comments)
    The result is written to
    outfile_prefix + '.' + str(freq) + '.' + genename + ".xls".

    protein_changing_definitions and exonic_definitions are accepted but not
    used by this function. working_dir and filtering_annotated come from the
    enclosing module.
    """
    scratch = "1.temp"
    for freq in freq_req:
        ##keep rows at or below the frequency cut-off
        filtering_annotated.filter(working_dir, "and", infile, scratch, [cols_to_filter[0]], ['<='], [freq])
        ##q>=30 and coverage >=5
        result_xls = outfile_prefix + '.' + str(freq) + '.' + genename + ".xls"
        filtering_annotated.filter(working_dir, "and", scratch, result_xls, cols_to_filter[1], ['>=', '>='], [30, 5])
def filter_rpt_c3h(file_prefix):
    """Keep non-repeat variants that carry a c3h annotation.

    Reads file_prefix + '.annotated.txt', writes the intermediate
    file_prefix + "11.temp" and the final file_prefix + ".c3h.xls".
    Column meanings (rmsk, segdup, c3h) are taken from the original
    comments; '.' presumably marks "no annotation" -- confirm against the
    annotated file layout.
    """
    annotated = file_prefix + '.annotated.txt'
    no_repeats = file_prefix + "11.temp"
    c3h_xls = file_prefix + ".c3h.xls"
    ##remove if in rmsk, segdup: columns 11 and 12 must both be '.'
    filtering_annotated.filter(working_dir, "and", annotated, no_repeats, [11, 12], ['=='] * 2, ['.'] * 2)
    ##in c3h mouse: column 25 must not be '.'
    filtering_annotated.filter(working_dir, "and", no_repeats, c3h_xls, [25], ['!='], ['.'])
def filter_exonic_variants(infile, outfile):
    """Keep exonic/splicing rows of infile that are not synonymous SNVs.

    Pass 1 ("or") keeps rows whose column 6 equals 'exonic' or 'splicing'
    and writes them to 'temp1.txt'; pass 2 drops rows whose column 9
    equals 'synonymous SNV' and writes the remainder to outfile.
    """
    region_col, region_values = 6, ['exonic', 'splicing']
    effect_col, synonymous = 9, 'synonymous SNV'
    scratch = 'temp1.txt'
    ##exonic or splicing
    filtering_annotated.filter(working_dir, "or", infile, scratch, [region_col] * 2, ['=='] * 2, region_values)
    ##remove synonymous
    filtering_annotated.filter(working_dir, "and", scratch, outfile, [effect_col], ['!='], [synonymous])
def filter_ann_file_3(file_prefix):
    """Keep non-repeat variants that are annotated in any of columns 16-18.

    Reads file_prefix + '.annotated.txt', writes the intermediate
    file_prefix + "21.temp" and the final
    file_prefix + '.non_rpt.in_any.xls'.
    """
    sample_cols = [16, 17, 18]
    no_repeats = file_prefix + "21.temp"
    ##remove if in rmsk, segdup: columns 11 and 12 must both be '.'
    filtering_annotated.filter(working_dir, "and", file_prefix + '.annotated.txt', no_repeats, [11, 12], ['=='] * 2, ['.'] * 2)
    ##keep if at least one of the three columns is not '.' ("or" combiner)
    filtering_annotated.filter(working_dir, "or", no_repeats, file_prefix + '.non_rpt.in_any.xls', sample_cols, ['!='] * len(sample_cols), ['.'] * len(sample_cols))
def filter_ann_file_2(file_prefix):
    """Keep variants outside repeats/b6 that are 'hom' in columns 16-18.

    Reads file_prefix + '.annotated.txt', writes the intermediate
    file_prefix + "11.temp" and the final
    file_prefix + '.non_rpt_b6.hom_in_all.xls'. Column meanings (rmsk,
    segdup, b6) are taken from the original comments.
    """
    exclusion_cols = [11, 12, 15]
    mouse_cols = [16, 17, 18]
    scratch = file_prefix + "11.temp"
    ##remove if in rmsk, segdup, or b6: all three columns must be '.'
    filtering_annotated.filter(working_dir, "and", file_prefix + '.annotated.txt', scratch, exclusion_cols, ['=='] * 3, ['.'] * 3)
    ##homozygous in all three mice
    filtering_annotated.filter(working_dir, "and", scratch, file_prefix + '.non_rpt_b6.hom_in_all.xls', mouse_cols, ['=='] * 3, ['hom'] * 3)
def filter_for_enu(file_prefix):
    """Produce two candidate lists from file_prefix + '.annotated.txt'.

    Both passes read the same annotated file and require every listed
    column to equal '.':
      * ".enu_rpts.xls" -- columns 11, 12, 13, 14, 25
        (rmsk, segdup, dbsnp, mgp, c3h per the original comments)
      * ".enu.xls"      -- columns 13, 14, 25 only (dbsnp, mgp, c3h)
    """
    annotated = file_prefix + '.annotated.txt'
    passes = (
        (".enu_rpts.xls", [11, 12, 13, 14, 25]),
        (".enu.xls", [13, 14, 25]),
    )
    for suffix, cols in passes:
        filtering_annotated.filter(working_dir, "and", annotated, file_prefix + suffix, cols, ['=='] * len(cols), ['.'] * len(cols))
def filter_ann_txt_files(samples, cols_to_filter, freq_req, outfile_prefix):
    """Write a protein-changing variant table per sample and frequency cut-off.

    For each sample/cut-off pair four filtering_annotated.filter passes are
    chained:
      1. cols_to_filter[1] '<=' cut-off              -> "1.temp"
      2. cols_to_filter[4] '>=' 30 and '>=' 5        -> "2.temp"
      3. cols_to_filter[0] equals either of the first two
         exonic_definitions ("or")                   -> <prefix>.<freq>3.temp
      4. cols_to_filter[2] differs from every entry of
         non_protein_changing_definitions ("and")
         -> <prefix>.<sample>.<freq>.protein_changing.xls

    exonic_definitions, non_protein_changing_definitions, make_list and
    working_dir are module-level names defined outside this function.
    "1.temp" and "2.temp" are shared scratch names and are overwritten on
    every iteration.
    """
    for sample in samples:
        for freq in freq_req:
            annotated = outfile_prefix + '.' + sample + '.annotated.txt'
            ##keep rows at or below the frequency cut-off
            filtering_annotated.filter(working_dir, "and", annotated, "1.temp", [cols_to_filter[1]], ['<='], [freq])
            ##q>=30 and coverage >=5
            filtering_annotated.filter(working_dir, "and", "1.temp", "2.temp", cols_to_filter[4], ['>=', '>='], [30, 5])
            ##exonic_variants in refGene
            exonic_tmp = outfile_prefix + '.' + str(freq) + "3.temp"
            region_col = cols_to_filter[0]
            filtering_annotated.filter(working_dir, "or", "2.temp", exonic_tmp, [region_col, region_col], ['==', '=='], [exonic_definitions[0], exonic_definitions[1]])
            ##get all protein changing: drop every non-protein-changing class
            result_xls = outfile_prefix + '.' + sample + '.' + str(freq) + ".protein_changing.xls"
            effect_cols = make_list(cols_to_filter[2], non_protein_changing_definitions)
            effect_ops = make_list('!=', non_protein_changing_definitions)
            filtering_annotated.filter(working_dir, "and", exonic_tmp, result_xls, effect_cols, effect_ops, non_protein_changing_definitions)
def filter_ann_file_mt2(file_prefix):
    """Filter a mutect2 annotated table into PASS and exonic/rare subsets.

    Outputs, all derived from file_prefix + '.mt2.annotated.txt':
      * '.mt2.passed.xls'      -- rows whose column 137 equals 'PASS'
      * '.mt2.exonic.rare.xls' -- rows that are exonic/splicing (column 6),
        not 'synonymous SNV' (column 9) and '<=' freq_req in columns
        83, 100 and 117 (gnomad frequencies per the original comment)

    NOTE(review): the exonic chain starts from the unfiltered annotated
    file, not from '.mt2.passed.xls' -- confirm that is intended.
    exon_definition, freq_req and working_dir are module-level names.
    """
    annotated = file_prefix + '.mt2.annotated.txt'
    exonic_tmp = file_prefix + "_1.temp"
    nonsyn_tmp = file_prefix + "_2.temp"
    af_columns = [83, 100, 117]
    ##passed by mt2
    filtering_annotated.filter(working_dir, "and", annotated, file_prefix + '.mt2.passed.xls', [137], ['=='], ['PASS'])
    ##exonic_variants
    filtering_annotated.filter(working_dir, "or", annotated, exonic_tmp, [6, 6], ['==', '=='], [exon_definition[0], exon_definition[1]])
    ##remove synonymous
    filtering_annotated.filter(working_dir, "and", exonic_tmp, nonsyn_tmp, [9], ['!='], ['synonymous SNV'])
    ##at or below freq_req in all three frequency columns
    filtering_annotated.filter(working_dir, "and", nonsyn_tmp, file_prefix + ".mt2.exonic.rare.xls", af_columns, ['<='] * 3, [freq_req] * 3)
def filter_ann_txt_files(infile, cols_to_filter, freq_req, outfile_prefix, protein_changing_definitions, exonic_definitions, genename):
    """Write exonic and protein-changing tables for each cut-off in freq_req.

    Per cut-off, four filtering_annotated.filter passes are chained:
      1. cols_to_filter[0] '<=' cut-off             -> "1.temp"
      2. cols_to_filter[1] '>=' 30 and '>=' 5       -> "2.temp"
      3. cols_to_filter[2] equals either of the first two
         exonic_definitions ("or")
         -> <prefix>.<freq>.exonic_temp.xls
      4. cols_to_filter[3] equals any entry of
         protein_changing_definitions ("or")
         -> <prefix>.<freq>.protein_changing.<genename>.xls

    make_list and working_dir are module-level names; "1.temp"/"2.temp"
    are overwritten on every iteration.
    """
    for freq in freq_req:
        stem = outfile_prefix + '.' + str(freq)
        exonic_xls = stem + ".exonic_temp.xls"
        ##keep rows at or below the frequency cut-off
        filtering_annotated.filter(working_dir, "and", infile, "1.temp", [cols_to_filter[0]], ['<='], [freq])
        ##q>=30 and coverage >=5
        filtering_annotated.filter(working_dir, "and", "1.temp", "2.temp", cols_to_filter[1], ['>=', '>='], [30, 5])
        ##exonic_variants in refGene
        region_col = cols_to_filter[2]
        filtering_annotated.filter(working_dir, "or", "2.temp", exonic_xls, [region_col, region_col], ['==', '=='], [exonic_definitions[0], exonic_definitions[1]])
        ##get all protein changing
        effect_cols = make_list(cols_to_filter[3], protein_changing_definitions)
        effect_ops = make_list('==', protein_changing_definitions)
        filtering_annotated.filter(working_dir, "or", exonic_xls, stem + ".protein_changing." + genename + ".xls", effect_cols, effect_ops, protein_changing_definitions)
def filter_ann_file(file_prefix):
    """Keep ko-only chr17 variants that are annotated in any of columns 17-19.

    Reads file_prefix + '.chr17.ko_only.xls' and writes
    file_prefix + '.chr17.ko_only.129.xls'.

    The steps that used to build the input are currently disabled in this
    function, so the input file must already exist:
      * repeat removal (columns 11 and 12 == '.')        -> "11.temp"
      * chr17 restriction (whole chromosome, or 20-30 / 20-40 / 20-60 Mb
        windows on columns 1 and 3)                      -> '.chr17*.xls'
      * ko-only selection (column 15 == '.')             -> '*.ko_only.xls'
    """
    ko_only_xls = file_prefix + '.chr17.ko_only.xls'
    in_129_xls = file_prefix + '.chr17.ko_only.129.xls'
    strain_cols = [17, 18, 19]
    ##in 129: at least one of the three columns is not '.'
    filtering_annotated.filter(working_dir, "or", ko_only_xls, in_129_xls, strain_cols, ['!='] * 3, ['.'] * 3)
def filter_ann_file_somatic_only(file_prefix, sample):
    """Drop variants that are also present in the sample's matched control.

    The input is <file_prefix>.<sample>.annotated.txt; a row is kept when
    the control's column equals '.'. Sample-to-control pairing:
      TZ001, TZ002        -> TZ008 (column 153)
      TZ003               -> TZ009 (column 154)
      TZ004, TZ005, TZ006 -> TZ007 (column 152)
    Any other sample name only prints a message; nothing is filtered.
    """
    ##sample -> (output suffix, column holding the matched control)
    control_lookup = {
        'TZ001': (".not_in_TZ008.xls", 153),
        'TZ002': (".not_in_TZ008.xls", 153),
        'TZ003': (".not_in_TZ009.xls", 154),
        'TZ004': (".not_in_TZ007.xls", 152),
        'TZ005': (".not_in_TZ007.xls", 152),
        'TZ006': (".not_in_TZ007.xls", 152),
    }
    file_prefix = file_prefix + '.' + sample
    if sample not in control_lookup:
        print(sample, 'sample name not recognized')
        return
    out_suffix, control_col = control_lookup[sample]
    ##remove if in control
    filtering_annotated.filter(working_dir, "and", file_prefix + '.annotated.txt', file_prefix + out_suffix, [control_col], ['=='], ['.'])
def filter_rpt(file_prefix):
    """Write the variants of file_prefix + '.annotated.txt' that lie outside repeats.

    A row is kept when columns 11 and 12 (rmsk and segdup per the original
    comment) both equal '.'; output goes to file_prefix + ".c15_vars.xls".
    """
    repeat_cols = [11, 12]
    source = file_prefix + '.annotated.txt'
    target = file_prefix + ".c15_vars.xls"
    ##remove if in rmsk, segdup
    filtering_annotated.filter(working_dir, "and", source, target, repeat_cols, ['=='] * len(repeat_cols), ['.'] * len(repeat_cols))
genome_and_window = homozygosity_mapping_ub26.make_windows(working_dir, genome_fai, ws, step_size).split('.')[0] ##make bed file from variants homozygosity_mapping_ub26.make_bed_from_ann(working_dir, 'samtools', sample + '.hom_temp.txt', zygosity_col, info_col) ##hom and het count and hom percentage homozygosity_mapping_ub26.count_and_percentage(working_dir, genome_and_window, sample + '.bed') ##naf homozygosity_mapping_ub26.naf_in_window(working_dir, genome_and_window, sample + '.bed') ##total snp number homozygosity_mapping_ub26.total_snp_in_window(working_dir, genome_and_window, sample + '.bed') ##combine bedgraphs for graphing in r homozygosity_mapping_ub26.combine_bedgraphs_for_r(working_dir, sample, genome_and_window) ''' # ''' ##filter variants for candidates snps affected_samples = ['M75', 'M77'] for sample in affected_samples: ##auts2 filtering_annotated.filter(working_dir, "and", sample + '.annotated.txt', sample + "_1.temp", [1, 2, 3], ['==', '>=', '<='], ['chr5', 131437306, 132542649]) ##keep if het in all sample filtering_annotated.filter(working_dir, "and", sample + "_1.temp", sample + ".auts2_het_in_all.xls", [14, 15, 16, 17], ['==', '==', '==', '=='], ['het', 'het', 'het', 'het']) # '''
# combine_fq_file(r1_files_to_combine, r2_files_to_combine, K541_combined_r1, K541_combined_r2) # align_with_bwa(fq_dict) variant_calling_samtools(fq_dict, mkdup_bam, st_vcf_suffix) convert_to_annovar(fq_dict, st_vcf_suffix + '.gz') run_table_annovar(fq_dict) multianno_to_annotated(fq_dict) ##filter variants for variants, homozygsity mapping then counts # ''' ##filter variants for candidates snps for sample in samples: ##exonic_variants filtering_annotated.filter(working_dir, "or", sample + '.annotated.txt' , sample + "_1.temp", [col_exon, col_exon], ['==','=='], [exon_definition[0],exon_definition[1]]) ##remove synonymous filtering_annotated.filter(working_dir, "and", sample + "_1.temp", sample + "_2.temp", [col_function], ['!='], [syn_definition]) ##remove if in dbsnp, sanger, or other mouse line filtering_annotated.filter(working_dir, "and", sample + "_2.temp", sample + "_3.temp", [13,14,15,16,17,18,19,20,21,22,23], ['==','==','==','==','==','==','==','==','==','==','=='], ['','','','','','','','','','','']) ##keep if hom filtering_annotated.filter(working_dir, "and", sample + "_3.temp", sample + '.hom_exonic_rare.xls', [zygosity_col], ['=='], ['hom']) ##filter variants by coverage and quality filtering_annotated.filter(working_dir, "and", sample + '.hom_exonic_rare.xls', sample + '.hom_exonic_rare_qual_filtered.xls', [cov_col,qual_col], ['>=','>='], [cov_definition,qual_definition]) # ''' # ''' ##homozygosity mapping # window_size = [100000,500000,1000000,2000000] window_size = [10000000]
##naf homozygosity_mapping_cybertron.naf_in_window(working_dir, genome_and_window, sample + '.bed') ##total snp number homozygosity_mapping_cybertron.total_snp_in_window(working_dir, genome_and_window, sample + '.bed') ##combine bedgraphs for graphing in r homozygosity_mapping_cybertron.combine_bedgraphs_for_r(working_dir, sample, genome_and_window) ''' ## add not in affected mouse to c3Hh analysis # ''' samples = ['mut_combined'] for ws in window_size: for sample in samples: ##remove if in rmsk, segdup, not hom in unaffected mouse and keep hom if in c3h mouse filtering_annotated.filter(working_dir, "and", sample + '.annotated.txt', sample + "21.temp", [11, 12], ['==', '=='], ['', '']) filtering_annotated.filter(working_dir, "and", sample + '21.temp', sample + "31.temp", [22], ['!='], ['hom']) filtering_annotated.filter(working_dir, "and", sample + '31.temp', sample + "41.temp", [26], ['=='], ['hom']) ##filter variants by coverage and quality filtering_annotated.filter(working_dir, "and", sample + "41.temp", sample + '.hom_temp.txt', [cov_col, qual_col], ['>=', '>='], [cov_definition, qual_definition]) ##get hom vars filtering_annotated.filter(working_dir, "and", sample + ".hom_temp.txt", sample + '.no_rpts_c3h_notunaff_hom.xls', [zygosity_col], ['=='], ['hom'])
##hom and het count and hom percentage homozygosity_mapping_cybertron.count_and_percentage(working_dir, genome_and_window, sample + '.bed') ##naf homozygosity_mapping_cybertron.naf_in_window(working_dir, genome_and_window, sample + '.bed') ##total snp number homozygosity_mapping_cybertron.total_snp_in_window(working_dir, genome_and_window, sample + '.bed') ##combine bedgraphs for graphing in r homozygosity_mapping_cybertron.combine_bedgraphs_for_r(working_dir, sample, genome_and_window) # ''' ##filter variants for candidates snps for sample in samples: ##remove if in dbsnp, sanger, or other mouse line filtering_annotated.filter(working_dir, "and", sample + '.annotated.txt', sample + "_23.temp", [11, 12, 13, 14, 15, 16], ['==', '==', '==', '==', '==', '=='], ['', '', '', '', '', '']) ##keep if hom filtering_annotated.filter(working_dir, "and", sample + "_23.temp", sample + "_24.temp", [zygosity_col], ['=='], ['hom']) ##filter variants by coverage and quality filtering_annotated.filter(working_dir, "and", sample + "_24.temp", sample + '.rare_qual_filtered.xls', [cov_col, qual_col], ['>=', '>='], [cov_definition, qual_definition]) ##find C3H windows ##filter variants for counts c3h_snp_suffix = '.c3h_filtered.xls' c3h_snp_bed_suffix = '.c3h_filtered.bed'
filtering_annotated.filter(working_dir, "and", sample + '.hom_exonic_rare.xls', sample + '.hom_exonic_rare_qual_filtered.xls', [cov_col,qual_col], ['>=','>='], [cov_definition,qual_definition]) ''' # ''' ##homozygosity mapping ##for 0917 analysis window_size = [100000, 500000, 1000000, 2000000] # window_size = [10000000] step_size = 100000 fq_dict = ['timon_comb'] for ws in window_size: for sample in fq_dict: ##remove if in dbsnp, sanger, other ped or rmsk filtering_annotated.filter( working_dir, "and", sample + '.annotated.txt', sample + "11.temp", [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], [ '==', '==', '==', '==', '==', '==', '==', '==', '==', '==', '==', '==', '==' ], ['', '', '', '', '', '', '', '', '', '', '', '', '']) # filtering_annotated.filter(working_dir, "and", sample + '.annotated.txt', sample + "11.temp", [11,12,23], ['==','==','=='], ['','','']) ##filter variants by coverage and quality filtering_annotated.filter(working_dir, "and", sample + "11.temp", sample + '.hom_temp.txt', [cov_col, qual_col], ['>=', '>='], [cov_definition, qual_definition]) #make bed file with windows and returns genome name and window size variable genome_and_window = homozygosity_mapping_ub26.make_windows( working_dir, genome_fai, ws, step_size).split('.')[0] ##make bed file from variants
def filter_ann_file_causal(file_prefix, sample):
    """Reduce a sample's annotated calls to rare, exonic, non-control variants.

    Works on <file_prefix>.<sample>.annotated.txt:
      0. keep rows whose column 9 equals 'PASS'          -> "_0.temp"
         (this pass always runs, even for an unknown sample name)
      1. keep rows whose matched-control column is '.'   -> ".not_in_<ctl>.xls"
      2. keep exonic/splicing rows (col_exon, "or")      -> "_1.temp"
      3. drop rows whose col_function is syn_definition  -> "_2.temp"
      4. keep rows '<=' freq_req in all af_cols
         -> ".not_in_<ctl>.exonic.rare.xls"

    Sample-to-control pairing:
      TZ001, TZ002        -> TZ008 (column 153)
      TZ003               -> TZ009 (column 154)
      TZ004, TZ005, TZ006 -> TZ007 (column 152)
    Any other sample name prints a message after step 0 and stops.

    col_exon, exon_definition, col_function, syn_definition, af_cols,
    freq_req and working_dir are module-level names.
    """
    tz008 = (153, ".not_in_TZ008.xls", ".not_in_TZ008.exonic.rare.xls")
    tz009 = (154, ".not_in_TZ009.xls", ".not_in_TZ009.exonic.rare.xls")
    tz007 = (152, ".not_in_TZ007.xls", ".not_in_TZ007.exonic.rare.xls")
    control_lookup = {
        'TZ001': tz008, 'TZ002': tz008,
        'TZ003': tz009,
        'TZ004': tz007, 'TZ005': tz007, 'TZ006': tz007,
    }

    file_prefix = file_prefix + '.' + sample
    ##passed by gatk
    filtering_annotated.filter(working_dir, "and", file_prefix + '.annotated.txt', file_prefix + "_0.temp", [9], ['=='], ['PASS'])

    if sample not in control_lookup:
        print(sample, 'sample name not recognized')
        return

    control_col, not_in_control, rare_suffix = control_lookup[sample]
    ##remove if in control
    filtering_annotated.filter(working_dir, "and", file_prefix + "_0.temp", file_prefix + not_in_control, [control_col], ['=='], ['.'])
    ##exonic_variants
    filtering_annotated.filter(working_dir, "or", file_prefix + not_in_control, file_prefix + "_1.temp", [col_exon, col_exon], ['==', '=='], [exon_definition[0], exon_definition[1]])
    ##remove synonymous
    filtering_annotated.filter(working_dir, "and", file_prefix + "_1.temp", file_prefix + "_2.temp", [col_function], ['!='], [syn_definition])
    ##at or below freq_req in all frequency columns
    filtering_annotated.filter(working_dir, "and", file_prefix + "_2.temp", file_prefix + rare_suffix, af_cols, ['<=', '<=', '<='], [freq_req, freq_req, freq_req])
window_bed = genome_and_window + '.bed' ##all shared snps out_bed = bed_to_graph.rsplit('.', 1)[0] + '.' + genome_and_window + '.bed' ##bedtools intersect with open('temp.bed', "w") as naf_fh: hom_bt_intersect = subprocess.Popen([ bedtools, 'intersect', '-a', window_bed, '-b', bed_to_graph, '-c' ], stdout=naf_fh) hom_bt_intersect.wait() ##filter for region of interest if '30mb' in bed_to_graph: filtering_annotated.filter(working_dir, "and", 'temp.bed', 'temp2.bed', [1, 3, 3], ['==', '>', '<='], ['chr17', 20000000, 30000000]) elif '40mb' in bed_to_graph: filtering_annotated.filter(working_dir, "and", 'temp.bed', 'temp2.bed', [1, 3, 3], ['==', '>', '<='], ['chr17', 20000000, 40000000]) elif '60mb' in bed_to_graph: filtering_annotated.filter(working_dir, "and", 'temp.bed', 'temp2.bed', [1, 3, 3], ['==', '>', '<='], ['chr17', 20000000, 60000000]) else: filtering_annotated.filter(working_dir, "and", 'temp.bed', 'temp2.bed', [1], ['=='], ['chr17'])
def _filter_functional_classes(stem, rare_file, cols):
    """Run the shared exonic -> disruptive / protein-changing / damaging passes.

    stem      -- output prefix, group + '.' + str(freq)
    rare_file -- already frequency-filtered input table
    cols      -- column spec: cols[0] region column, cols[2] function
                 column, cols[3] list of score columns
    The definition lists, score thresholds, make_list,
    remove_rows_with_no_data and working_dir are module-level names.
    """
    exonic_xls = stem + ".exonic.xls"
    nonsyn_tmp = stem + ".3.temp"
    ##exonic_variants in refGene
    filtering_annotated.filter(working_dir, "or", rare_file, exonic_xls, [cols[0], cols[0]], ['==', '=='], [exonic_definitions[0], exonic_definitions[1]])
    ##get disruptive
    filtering_annotated.filter(working_dir, "or", exonic_xls, stem + ".disruptive.xls", make_list(cols[2], disruptive_definitions), make_list('==', disruptive_definitions), disruptive_definitions)
    ##get all protein changing
    filtering_annotated.filter(working_dir, "or", exonic_xls, stem + ".protein_changing.xls", make_list(cols[2], protein_changing_definitions), make_list('==', protein_changing_definitions), protein_changing_definitions)
    ##get damaging - pp2_hdiv, pp2_hvar, cadd_phred, gerp - in all or in any
    #get non synonymous snps
    filtering_annotated.filter(working_dir, "or", exonic_xls, nonsyn_tmp, [cols[2], cols[2]], ['==', '=='], nosyn_definitions)
    #get if all are positive: either of the first two scores ("or"), then every
    #remaining score ("and"); an older single "and" pass over all scores is retired
    filtering_annotated.filter(working_dir, "or", nonsyn_tmp, stem + ".4a.temp", cols[3][:2], make_list('>=', cols[3][:2]), pp2_score)
    filtering_annotated.filter(working_dir, "and", stem + ".4a.temp", stem + ".4b.temp", cols[3][2:], make_list('>=', cols[3][2:]), cadd_gerp_score)
    remove_rows_with_no_data(stem + ".4b.temp", stem + ".damaging_all.xls", cols[3])
    #get if any are positive
    filtering_annotated.filter(working_dir, "or", nonsyn_tmp, stem + ".5.temp", cols[3], make_list('>=', cols[3]), damaging_definitions)
    remove_rows_with_no_data(stem + ".5.temp", stem + ".damaging_any.xls", cols[3])


def filter_ann_txt_files(exac_txt, cases_txt, control_txt, esp6500_cols, case_controls_col, freq_req):
    """Build rare-variant class tables for cases, controls and ExAC.

    First de-duplicates the case and control tables into
    'cases.nodup_temp.txt' / 'controls.nodup_temp.txt'. Then, for every
    cut-off in freq_req:
      * cases and controls: frequency filter on case_controls_col[1],
        quality/coverage filter on case_ctl_qual_cov ('>=' 30 and '>=' 5),
        then the shared exonic / disruptive / protein-changing / damaging
        passes
      * ExAC (group 'exac3_1115'): frequency filter on exac_txt, then the
        same shared passes (no quality/coverage step)

    NOTE(review): esp6500_cols is never read; the ExAC passes use the
    module-level esp6500_rvis_col instead -- confirm whether callers expect
    the argument to be honoured.
    NOTE(review): the ExAC pass is placed inside the per-frequency loop
    because it uses `freq`; the flattened source does not show indentation.
    """
    ##remove duplicate vars from cases and controls and add count in first col
    remove_dup_vars_and_add_count(cases_txt, 'cases.nodup_temp.txt')
    remove_dup_vars_and_add_count(control_txt, 'controls.nodup_temp.txt')
    for freq in freq_req:
        for group in ['cases', 'controls']:
            stem = group + '.' + str(freq)
            freq_cols = case_controls_col[1]
            ##keep rows at or below the frequency cut-off in every listed column
            filtering_annotated.filter(working_dir, "and", group + '.nodup_temp.txt', stem + ".1.temp", freq_cols, make_list('<=', freq_cols), make_list(freq, freq_cols))
            ##q>=30 and coverage >=5
            filtering_annotated.filter(working_dir, "and", stem + ".1.temp", stem + ".2.temp", case_ctl_qual_cov, ['>=', '>='], [30, 5])
            _filter_functional_classes(stem, stem + ".2.temp", case_controls_col)
        ##and exac
        for group in ['exac3_1115']:
            stem = group + '.' + str(freq)
            freq_cols = esp6500_rvis_col[1]
            ##keep rows at or below the frequency cut-off in every listed column
            filtering_annotated.filter(working_dir, "and", exac_txt, stem + ".1.temp", freq_cols, make_list('<=', freq_cols), make_list(freq, freq_cols))
            _filter_functional_classes(stem, stem + ".1.temp", esp6500_rvis_col)
def filter_ann_txt_files_just_exac(exac_txt, esp6500_cols, freq_req):
    """Build rare-variant class tables for the ExAC table only.

    For every cut-off in freq_req, with output prefix
    exac_unfiltered_prefix + '.' + str(freq):
      ".1.temp"               frequency filter on esp6500_rvis_col[1]
      ".exonic.xls"           region column equals one of the first two
                              exonic_definitions
      ".disruptive.xls"       function column in disruptive_definitions
      ".protein_changing.xls" function column in protein_changing_definitions
      ".damaging_all.xls"     non-synonymous, passes the pp2 ("or") then
                              cadd/gerp ("and") score thresholds
      ".damaging_any.xls"     non-synonymous, any score passes its threshold

    NOTE(review): esp6500_cols is never read; the module-level
    esp6500_rvis_col is used instead -- confirm whether callers expect the
    argument to be honoured. The definition lists, thresholds, make_list,
    remove_rows_with_no_data, exac_unfiltered_prefix and working_dir are
    module-level names.
    """
    run_filter = filtering_annotated.filter
    for freq in freq_req:
        for group in [exac_unfiltered_prefix]:
            stem = group + '.' + str(freq)
            rare_tmp = stem + ".1.temp"
            exonic_xls = stem + ".exonic.xls"
            nonsyn_tmp = stem + ".3.temp"
            ##keep rows at or below the frequency cut-off in every listed column
            run_filter(working_dir, "and", exac_txt, rare_tmp, esp6500_rvis_col[1], make_list('<=', esp6500_rvis_col[1]), make_list(freq, esp6500_rvis_col[1]))
            ##exonic_variants in refGene
            run_filter(working_dir, "or", rare_tmp, exonic_xls, [esp6500_rvis_col[0], esp6500_rvis_col[0]], ['==', '=='], [exonic_definitions[0], exonic_definitions[1]])
            ##get disruptive
            run_filter(working_dir, "or", exonic_xls, stem + ".disruptive.xls", make_list(esp6500_rvis_col[2], disruptive_definitions), make_list('==', disruptive_definitions), disruptive_definitions)
            ##get all protein changing
            run_filter(working_dir, "or", exonic_xls, stem + ".protein_changing.xls", make_list(esp6500_rvis_col[2], protein_changing_definitions), make_list('==', protein_changing_definitions), protein_changing_definitions)
            ##get damaging - pp2_hdiv, pp2_hvar, cadd_phred, gerp - in all or in any
            #get non synonymous snps
            run_filter(working_dir, "or", exonic_xls, nonsyn_tmp, [esp6500_rvis_col[2], esp6500_rvis_col[2]], ['==', '=='], nosyn_definitions)
            #get if all are positive: either of the first two scores, then every remaining score
            run_filter(working_dir, "or", nonsyn_tmp, stem + ".4a.temp", esp6500_rvis_col[3][:2], make_list('>=', esp6500_rvis_col[3][:2]), pp2_score)
            run_filter(working_dir, "and", stem + ".4a.temp", stem + ".4b.temp", esp6500_rvis_col[3][2:], make_list('>=', esp6500_rvis_col[3][2:]), cadd_gerp_score)
            remove_rows_with_no_data(stem + ".4b.temp", stem + ".damaging_all.xls", esp6500_rvis_col[3])
            #get if any are positive
            run_filter(working_dir, "or", nonsyn_tmp, stem + ".5.temp", esp6500_rvis_col[3], make_list('>=', esp6500_rvis_col[3]), damaging_definitions)
            remove_rows_with_no_data(stem + ".5.temp", stem + ".damaging_any.xls", esp6500_rvis_col[3])
hom_bed_suffix = '.resc_hom.bed' # window_size = [1000000, 500000, 100000] # step_size = 100000 window_size = [100000] step_size = 10000 zygosity_col = 27 cov_col = 29 cov_definition = 20 qual_col = 28 qual_definition = 30 working_dir = work_dir genome_fai = fasta_fai for sample in samples_to_annotate: ##remove if in repeat region or indel and cov/qual filtering_annotated.filter(working_dir, "and", sample + '.annotated.xls', sample + "21.temp", [16], ['=='], ['']) filtering_annotated.filter(working_dir, "and", sample + "21.temp", sample + "22.temp", [4, 5], ['!=', '!='], ['-', '-']) filtering_annotated.filter(working_dir, "and", sample + "22.temp", sample + "23.temp", [cov_col, qual_col], ['>=', '>='], [cov_definition, qual_definition]) ##keep if hom not hom in unrescued filtering_annotated.filter(working_dir, "and", sample + "23.temp", sample + "24.temp", [zygosity_col], ['=='], ['hom']) ##keep if not hom in unrescued filtering_annotated.filter(working_dir, "and", sample + "24.temp", sample + hom_snp_suffix, [20, 21, 22], ['!=', '!=', '!='], ['hom', 'hom', 'hom'])