def main(ARGS=None):
    """
    Screen a VCF for newly homozygous variants in trio probands and append
    every qualifying call to a tab-delimited output file.

    When both a proband and a parent cnds file are supplied, a proband is
    kept for a variant if it is hom alt (gt_types == 2), both parents are
    het (gt_types == 1), and all three samples pass their cnds tests.
    Otherwise the decision is delegated to screens.hom_screen using the
    coverage / percent-alt / phred thresholds from the command line.

    Args:
        ARGS: list of command line argument strings. When None,
              sys.argv[1:] is used.

    Returns:
        None. Results are written to args.out_tsv and per-sample "hom"
        counts are printed at the end.
    """
    if ARGS is None:
        ARGS = sys.argv[1:]
    args = parse_args(ARGS)

    # convert certain comma delim str args to lists
    args.qual_impacts = misc.str_none_split(args.qual_impacts, ",")
    args.max_impact_csqs = misc.str_none_split(args.max_impact_csqs, ",")
    args.max_csq_scores = misc.str_none_split(args.max_csq_scores, ",")
    args.min_csq_scores = misc.str_none_split(args.min_csq_scores, ",")

    # these three stay plain strings on args; split them once here rather
    # than once per variant inside the loop
    filter_include = None
    if args.filter_include is not None:
        filter_include = set(args.filter_include.split(","))
    info_flags_exclude = None
    if args.vcf_info_flags_exclude is not None:
        info_flags_exclude = args.vcf_info_flags_exclude.split(",")
    af_max_fields = None
    if args.af_max_fields is not None and args.af_max is not None:
        af_max_fields = args.af_max_fields.split(",")

    # read samples from fam file, send basic stats to stdout
    samples_i = Samples(args.in_fam)
    samples_i.print_stats()

    # read cnds files
    var_cnds = None
    pro_cnds = None
    par_cnds = None
    if args.variant_cnds is not None:
        var_cnds = VcfCnds(args.variant_cnds)
    if args.pro_cnds is not None:
        pro_cnds = VcfCnds(args.pro_cnds)
    if args.par_cnds is not None:
        par_cnds = VcfCnds(args.par_cnds)

    # init cyvcf2 VCF obj, get info subfields, header for output
    vcf = cyvcf2.VCF(args.in_vcf, strict_gt=True, gts012=True)
    cyvcf2_vcf = Cyvcf2Vcf(vcf)
    cyvcf2_vcf.get_info_subfields()
    cyvcf2_vcf.get_csq_keys(spliton="Format: ", delim="|")
    vcf_header_str = cyvcf2_vcf.header_to_list(
        gt_varnames=GT_VARNAMES,
        max_impact=args.max_impact,
        max_impact_csqs=args.max_impact_csqs,
        max_csq_scores=args.max_csq_scores,
        min_csq_scores=args.min_csq_scores,
        delim="\t")

    # create sample idx, then collect control idxs for the ctrl-only AF filter
    samples_i.get_vcf_idx(vcf.samples)
    ctrl_idxs = [samples_i.samples[x].idx for x in samples_i.ctrls]

    # if intervals provided (file with one interval per line, or a single
    # interval string), parse over those, else whole vcf
    if args.intervals is not None:
        if os.path.isfile(args.intervals):
            with open(args.intervals, "r") as intervals_fh:
                intervals = [line.rstrip() for line in intervals_fh]
        else:
            intervals = [args.intervals]
    else:
        intervals = [""]

    # init output file
    misc.init_out_file(args.out_tsv,
                       force_overwrite=args.force_overwrite,
                       init_line=vcf_header_str)

    # parse VCF file looking for newly homozygous variant calls
    prev_chrom = None
    for vcf_variant in cyvcf2_vcf.iterator(intervals):

        if vcf_variant.CHROM != prev_chrom:
            print("Extracting newly homozygous vars from chrom " + vcf_variant.CHROM)
            prev_chrom = vcf_variant.CHROM

        # create new Cyvcf2Variant instance
        cyvcf2_variant = Cyvcf2Variant(vcf_variant)

        # assume single alternate allele per row, exclude sites with call
        # as '*' since these are by-product of multi-sample calling and
        # don't really represent a real SNV/indel
        if vcf_variant.ALT[0] == '*':
            continue

        # if no qualifying impact str found in CSQ, skip
        # (compared by value: the helper's return type is not visible here)
        if args.qual_impacts is not None:
            res = cyvcf2_variant.qual_impacts_screen(args.qual_impacts,
                                                     csq_subfield="CSQ")
            if res == False:
                continue

        # if desired, derive max impact annots from var, along with other
        # user defined max or min scores in CSQ for variant
        csqs_maximpact_list = []
        max_csq_scores = []
        min_csq_scores = []
        if args.max_impact == True:
            cyvcf2_variant.get_annot_txs(cyvcf2_vcf.csq_keys,
                                         csq_subfield="CSQ")
            res = cyvcf2_variant.maxmin_csqs(
                max_impact_csqs=args.max_impact_csqs,
                max_csq_scores=args.max_csq_scores,
                min_csq_scores=args.min_csq_scores,
                impact_subfield="IMPACT")
            (csqs_maximpact_list, max_csq_scores, min_csq_scores) = res

        # filter on internal ctrl-only maf, can't do in var cnds file
        if args.internal_ctrl_af_max is not None:
            ctrl_af = cyvcf2_variant.compute_maf(ctrl_idxs)
            if ctrl_af > args.internal_ctrl_af_max:
                continue

        if var_cnds is not None:
            # variant cnds file provided, filter exclusively on that
            if var_cnds.test_variant(vcf_variant) == False:
                continue
        else:
            # otherwise, filter on user args

            # filter on FILTER column: default is PASS only (FILTER is
            # None), optionally allow user defined FILTER classifs too.
            # The FILTER string is matched as a whole against the set.
            vcf_filter = vcf_variant.FILTER
            if vcf_filter is not None:
                if filter_include is None:
                    continue
                if vcf_filter not in filter_include:
                    continue

            # filter on user-defined VCF INFO flags
            if info_flags_exclude is not None:
                if any(vcf_variant.INFO.get(flag) == True
                       for flag in info_flags_exclude):
                    continue

            # filter on user-defined maximum internal MAF; a variant
            # without an AF annotation cannot exceed the cap, so keep it
            if args.internal_af_max is not None:
                internal_af = vcf_variant.INFO.get("AF")
                if internal_af is not None and internal_af > args.internal_af_max:
                    continue

            # filter on user-defined maximum external MAF; fields missing
            # from the record are skipped instead of compared to a float
            if af_max_fields is not None:
                af_max_fail = False
                for af_max_field in af_max_fields:
                    af = vcf_variant.INFO.get(af_max_field)
                    if af is not None and af > args.af_max:
                        af_max_fail = True
                        break
                if af_max_fail:
                    continue

        hom_carriers = set()
        if pro_cnds is not None and par_cnds is not None:
            # gt_types is read once per variant and reused for every trio
            gt_types = vcf_variant.gt_types
            for iid in samples_i.trios:
                pid = samples_i.trios[iid].pid
                mid = samples_i.trios[iid].mid
                iid_idx = samples_i.samples[iid].idx
                pid_idx = samples_i.samples[pid].idx
                mid_idx = samples_i.samples[mid].idx

                # is iid hom alt at site?
                if gt_types[iid_idx] != 2:
                    continue

                # are both parents het at site?
                if gt_types[pid_idx] != 1 or gt_types[mid_idx] != 1:
                    continue

                # test if proband and parents pass conditions in proband
                # and parent cnds files, respectively
                if pro_cnds.test_gt(vcf_variant, iid_idx) == False:
                    continue
                if par_cnds.test_gt(vcf_variant, pid_idx) == False:
                    continue
                if par_cnds.test_gt(vcf_variant, mid_idx) == False:
                    continue

                hom_carriers.add(iid)
        else:
            hom_carriers = screens.hom_screen(
                vcf_variant,
                samples_i,
                min_coverage=args.min_coverage,
                pro_min_perc_alt=args.pro_min_perc_alt,
                par_max_perc_alt=args.par_max_perc_alt,
                pro_homref_phredmin=args.pro_homref_phredmin,
                pro_het_phredmin=args.pro_het_phredmin,
                pro_homalt_phredmax=args.pro_homalt_phredmax,
                par_homref_phredmin=args.par_homref_phredmin,
                par_het_phredmax=args.par_het_phredmax,
                par_homalt_phredmin=args.par_homalt_phredmin)

        if len(hom_carriers) > 0:
            for iid in hom_carriers:
                samples_i.samples[iid].varcounts["hom"] += 1
            outs = cyvcf2_variant.variant_to_list(
                samples_i,
                hom_carriers,
                cyvcf2_vcf.info_subfields,
                GT_VARNAMES,
                csqs_maximpact=csqs_maximpact_list,
                max_csq_scores=max_csq_scores,
                min_csq_scores=min_csq_scores,
                delim="\t")
            for out in outs:
                misc.append_out_file(args.out_tsv, out)

    vcf.close()

    samples_i.print_varcount_stats(var_types=["hom"])

    return
def main(ARGS=None):
    """
    Screen the X chromosome of a VCF for newly hemizygous variants in male
    trio probands and append every qualifying call to a tab-delimited file.

    When proband (iid), father (pid) and mother (mid) cnds files are all
    supplied, a male proband is kept for a variant if its genotype is het
    or hom alt (gt_types in {1, 2}), the father is hom ref (0), the mother
    is het (1), and all three samples pass their cnds tests. Otherwise the
    decision is delegated to screens.hemi_screen using the command line
    thresholds.

    Args:
        ARGS: list of command line argument strings. When None,
              sys.argv[1:] is used.

    Returns:
        None. Results are written to args.out_tsv. Exits with status 1 if
        the cohort has no trio with a male proband.
    """
    if ARGS is None:
        ARGS = sys.argv[1:]
    args = parse_args(ARGS)

    # convert certain comma delim str args to lists
    args.qual_impacts = str_none_split(args.qual_impacts, ",")
    args.max_impact_csqs = str_none_split(args.max_impact_csqs, ",")
    args.max_csq_scores = str_none_split(args.max_csq_scores, ",")
    args.min_csq_scores = str_none_split(args.min_csq_scores, ",")

    # convert all comma delim args to list, or leave as none.
    for arg in ("filter_include", "af_max_fields", "vcf_info_flags_exclude"):
        setattr(args, arg, misc.str_none_split(getattr(args, arg), ","))

    # read samples from fam file, send basic stats to stdout
    samples_i = samples.Samples(args.in_fam)
    samples_i.print_stats()

    # get all trios from samples with male proband
    male_trios = {
        iid: samples_i.samples[iid]
        for iid in samples_i.trios
        if samples_i.samples[iid].gender == "M"
    }
    if len(male_trios) == 0:
        print("ERROR: "
              "Can't do hemizygous screen if no "
              "trios with male proband in cohort.")
        sys.exit(1)

    # read cnds files
    var_cnds = None
    iid_cnds = None
    pid_cnds = None
    mid_cnds = None
    if args.variant_cnds is not None:
        var_cnds = VcfCnds(args.variant_cnds)
    if args.iid_cnds is not None:
        iid_cnds = VcfCnds(args.iid_cnds)
    if args.pid_cnds is not None:
        pid_cnds = VcfCnds(args.pid_cnds)
    if args.mid_cnds is not None:
        mid_cnds = VcfCnds(args.mid_cnds)

    # init cyvcf2 VCF obj, get info subfields, header for output
    vcf = cyvcf2.VCF(args.in_vcf, strict_gt=True, gts012=True)
    cyvcf2_vcf = Cyvcf2Vcf(vcf)
    cyvcf2_vcf.get_info_subfields()
    cyvcf2_vcf.get_csq_keys(spliton="Format: ", delim="|")
    vcf_header_str = cyvcf2_vcf.header_to_list(
        gt_varnames=GT_VARNAMES,
        max_impact=args.max_impact,
        max_impact_csqs=args.max_impact_csqs,
        max_csq_scores=args.max_csq_scores,
        min_csq_scores=args.min_csq_scores,
        delim="\t")

    # create sample idx, then collect control idxs for the ctrl-only AF filter
    samples_i.get_vcf_idx(vcf.samples)
    ctrl_idxs = [samples_i.samples[x].idx for x in samples_i.ctrls]

    # init output file
    init_out_file(args.out_tsv,
                  force_overwrite=args.force_overwrite,
                  init_line=vcf_header_str + "\n")

    # only parse X chromosome, since this is only place where
    # hemi vars can happen
    for vcf_variant in cyvcf2_vcf.cyvcf2_vcf(args.x_chrom_interval):

        # assume single allele per site, exclude sites with call as '*'
        if vcf_variant.ALT[0] == '*':
            continue

        # create new Cyvcf2Variant instance
        cyvcf2_variant = Cyvcf2Variant(vcf_variant)

        # if no qualifying impact str found in CSQ, skip
        # (compared by value: the helper's return type is not visible here)
        if args.qual_impacts is not None:
            res = cyvcf2_variant.qual_impacts_screen(args.qual_impacts,
                                                     csq_subfield="CSQ")
            if res == False:
                continue

        # if desired, derive max impact annots from var, along with other
        # user defined max or min scores in CSQ for variant
        csqs_maximpact_list = []
        max_csq_scores = []
        min_csq_scores = []
        if args.max_impact == True:
            cyvcf2_variant.get_annot_txs(cyvcf2_vcf.csq_keys,
                                         csq_subfield="CSQ")
            res = cyvcf2_variant.maxmin_csqs(
                max_impact_csqs=args.max_impact_csqs,
                max_csq_scores=args.max_csq_scores,
                min_csq_scores=args.min_csq_scores,
                impact_subfield="IMPACT")
            (csqs_maximpact_list, max_csq_scores, min_csq_scores) = res

        # filter on internal ctrl-only maf, can't do in var cnds file
        if args.internal_ctrl_af_max is not None:
            ctrl_af = cyvcf2_variant.compute_maf(ctrl_idxs)
            if ctrl_af > args.internal_ctrl_af_max:
                continue

        if var_cnds is not None:
            # variant cnds file provided, filter exclusively on that
            if var_cnds.test_variant(vcf_variant) == False:
                continue
        else:
            # otherwise, filter on user args
            var_pass = screens.variant_screen(
                vcf_variant,
                min_qual=args.min_qual,
                filter_include=args.filter_include,
                vcf_info_flags_exclude=args.vcf_info_flags_exclude,
                internal_af_max=args.internal_af_max,
                af_max=args.af_max,
                af_max_fields=args.af_max_fields)
            if var_pass == False:
                continue

        hemi_carriers = set()
        if (iid_cnds is not None
                and pid_cnds is not None
                and mid_cnds is not None):
            # gt_types is read once per variant and reused for every trio
            gt_types = vcf_variant.gt_types
            for iid in male_trios:
                pid = samples_i.samples[iid].pid
                mid = samples_i.samples[iid].mid
                iid_idx = samples_i.samples[iid].idx
                pid_idx = samples_i.samples[pid].idx
                mid_idx = samples_i.samples[mid].idx

                # The father's index must be part of the lookup: the
                # previous two-element (iid, mid) tuple made the "father"
                # check read the mother's genotype and the "mother" check
                # index past the end of the list.
                iid_gt = gt_types[iid_idx]
                pid_gt = gt_types[pid_idx]
                mid_gt = gt_types[mid_idx]

                # is iid hemi at site? (called as het or hom alt)
                if iid_gt not in (1, 2):
                    continue

                # is father hom ref at site?
                if pid_gt != 0:
                    continue

                # is mother het at site?
                if mid_gt != 1:
                    continue

                # test if proband and parents pass conditions in proband
                # and parent cnds files, respectively
                if iid_cnds.test_gt(vcf_variant, iid_idx) == False:
                    continue
                if pid_cnds.test_gt(vcf_variant, pid_idx) == False:
                    continue
                if mid_cnds.test_gt(vcf_variant, mid_idx) == False:
                    continue

                hemi_carriers.add(iid)
        else:
            hemi_carriers = screens.hemi_screen(
                vcf_variant,
                samples_i,
                male_trios,
                min_coverage=args.min_coverage,
                iid_min_perc_alt=args.iid_min_perc_alt,
                pid_max_perc_alt=args.pid_max_perc_alt,
                mid_min_perc_alt=args.mid_min_perc_alt,
                iid_het_phredmax=args.iid_het_phredmax,
                pid_hom_phredmax=args.pid_hom_phredmax,
                mid_het_phredmax=args.mid_het_phredmax,
                iid_hom_phredmin=args.iid_hom_phredmin,
                pid_het_phredmin=args.pid_het_phredmin,
                mid_hom_phredmin=args.mid_hom_phredmin)

        if len(hemi_carriers) > 0:
            for iid in hemi_carriers:
                samples_i.samples[iid].varcounts["newlyhemi"] += 1
            outs = cyvcf2_variant.variant_to_list(
                vcf_variant,
                samples_i,
                hemi_carriers,
                cyvcf2_vcf.info_subfields,
                GT_VARNAMES,
                csqs_maximpact=csqs_maximpact_list,
                max_csq_scores=max_csq_scores,
                min_csq_scores=min_csq_scores,
                delim="\t")
            for out in outs:
                append_out_file(args.out_tsv, out)

    vcf.close()

    print("Screening of VCF for newly hemizygous variants complete.")

    samples_i.print_varcount_stats(var_types=["newlyhemi"])

    return
def main(ARGS=None):
    """
    Screen a VCF for het variants that a parent transmitted ("het_trans")
    or, with args.ntrans set, did not transmit ("het_ntrans") to a trio
    proband, and append every qualifying (parent, proband) call to a
    tab-delimited output file.

    When proband, transmitting-parent and non-transmitting-parent cnds
    files are all supplied, the parent with the higher fraction of alt
    reads is treated as the candidate transmitting parent. The call is
    kept if that parent is het (gt_types == 1), the other parent is hom
    ref (0), the proband is het (trans mode) or hom ref (ntrans mode), and
    all three samples pass their cnds tests. Otherwise the decision is
    delegated to screens.trans_screen.

    Args:
        ARGS: list of command line argument strings. When None,
              sys.argv[1:] is used.

    Returns:
        None. Results are written to args.out_tsv and per-sample counts
        for the selected genotype class are printed at the end.
    """
    if ARGS is None:
        ARGS = sys.argv[1:]
    args = parse_args(ARGS)

    # set name of gt class
    gt_class = "het_trans"
    if args.ntrans == True:
        gt_class = "het_ntrans"

    # convert certain comma delim str args to lists
    args.qual_impacts = misc.str_none_split(args.qual_impacts, ",")
    args.max_impact_csqs = misc.str_none_split(args.max_impact_csqs, ",")
    args.max_csq_scores = misc.str_none_split(args.max_csq_scores, ",")
    args.min_csq_scores = misc.str_none_split(args.min_csq_scores, ",")

    # these three stay plain strings on args; split them once here rather
    # than once per variant inside the loop
    filter_include = None
    if args.filter_include is not None:
        filter_include = set(args.filter_include.split(","))
    info_flags_exclude = None
    if args.vcf_info_flags_exclude is not None:
        info_flags_exclude = args.vcf_info_flags_exclude.split(",")
    af_max_fields = None
    if args.af_max_fields is not None and args.af_max is not None:
        af_max_fields = args.af_max_fields.split(",")

    # read samples from fam file, send basic stats to stdout
    samples_i = Samples(args.in_fam)
    samples_i.print_stats()

    # read cnds files
    var_cnds = None
    pro_cnds = None
    transpar_cnds = None
    ntranspar_cnds = None
    if args.variant_cnds is not None:
        var_cnds = VcfCnds(args.variant_cnds)
    if args.pro_cnds is not None:
        pro_cnds = VcfCnds(args.pro_cnds)
    if args.transpar_cnds is not None:
        transpar_cnds = VcfCnds(args.transpar_cnds)
    if args.ntranspar_cnds is not None:
        ntranspar_cnds = VcfCnds(args.ntranspar_cnds)

    # init cyvcf2 VCF obj, get info subfields, header for output
    vcf = cyvcf2.VCF(args.in_vcf, strict_gt=True, gts012=True)
    cyvcf2_vcf = Cyvcf2Vcf(vcf)
    cyvcf2_vcf.get_info_subfields()
    cyvcf2_vcf.get_csq_keys(spliton="Format: ", delim="|")
    vcf_header_str = cyvcf2_vcf.header_to_list(
        main_fields=["PAR_IID", "PRO_IID", "CHROM",
                     "POS", "ID", "REF", "ALT",
                     "QUAL", "FILTER"],
        gt_varnames=GT_VARNAMES,
        max_impact=args.max_impact,
        max_impact_csqs=args.max_impact_csqs,
        max_csq_scores=args.max_csq_scores,
        min_csq_scores=args.min_csq_scores,
        delim="\t")

    # create sample idx, then collect control idxs for the ctrl-only AF filter
    samples_i.get_vcf_idx(vcf.samples)
    ctrl_idxs = [samples_i.samples[x].idx for x in samples_i.ctrls]

    # if intervals provided (file with one interval per line, or a single
    # interval string), parse over those, else whole vcf
    if args.intervals is not None:
        if os.path.isfile(args.intervals):
            with open(args.intervals, "r") as intervals_fh:
                intervals = [line.rstrip() for line in intervals_fh]
        else:
            intervals = [args.intervals]
    else:
        intervals = [""]

    # init output file
    misc.init_out_file(args.out_tsv,
                       force_overwrite=args.force_overwrite,
                       init_line=vcf_header_str)

    # parse VCF file looking for (non-)transmitted het variant calls
    prev_chrom = None
    for vcf_variant in cyvcf2_vcf.iterator(intervals):

        if vcf_variant.CHROM != prev_chrom:
            print("Extracting " + gt_class + " from chrom " + vcf_variant.CHROM)
            prev_chrom = vcf_variant.CHROM

        # assume single allele per site, exclude sites with call as '*'
        if vcf_variant.ALT[0] == '*':
            continue

        # create new Cyvcf2Variant instance
        cyvcf2_variant = Cyvcf2Variant(vcf_variant)

        # if no qualifying impact str found in CSQ, skip
        # (compared by value: the helper's return type is not visible here)
        if args.qual_impacts is not None:
            res = cyvcf2_variant.qual_impacts_screen(args.qual_impacts,
                                                     csq_subfield="CSQ")
            if res == False:
                continue

        # if desired, derive max impact annots from var, along with other
        # user defined max or min scores in CSQ for variant
        csqs_maximpact_list = []
        max_csq_scores = []
        min_csq_scores = []
        if args.max_impact == True:
            cyvcf2_variant.get_annot_txs(cyvcf2_vcf.csq_keys,
                                         csq_subfield="CSQ")
            res = cyvcf2_variant.maxmin_csqs(
                max_impact_csqs=args.max_impact_csqs,
                max_csq_scores=args.max_csq_scores,
                min_csq_scores=args.min_csq_scores,
                impact_subfield="IMPACT")
            (csqs_maximpact_list, max_csq_scores, min_csq_scores) = res

        # filter on internal ctrl-only maf, can't do in var cnds file
        if args.internal_ctrl_af_max is not None:
            ctrl_af = cyvcf2_variant.compute_maf(ctrl_idxs)
            if ctrl_af > args.internal_ctrl_af_max:
                continue

        if var_cnds is not None:
            # variant cnds file provided, filter exclusively on that
            if var_cnds.test_variant(vcf_variant) == False:
                continue
        else:
            # otherwise, filter on user args

            # filter on FILTER column: default is PASS only (FILTER is
            # None), optionally allow user defined FILTER classifs too.
            # The FILTER string is matched as a whole against the set.
            vcf_filter = vcf_variant.FILTER
            if vcf_filter is not None:
                if filter_include is None:
                    continue
                if vcf_filter not in filter_include:
                    continue

            # filter on user-defined VCF INFO flags
            if info_flags_exclude is not None:
                if any(vcf_variant.INFO.get(flag) == True
                       for flag in info_flags_exclude):
                    continue

            # filter on user-defined maximum internal MAF; a variant
            # without an AF annotation cannot exceed the cap, so keep it
            if args.internal_af_max is not None:
                internal_af = vcf_variant.INFO.get("AF")
                if internal_af is not None and internal_af > args.internal_af_max:
                    continue

            # filter on user-defined maximum external MAF; fields missing
            # from the record are skipped instead of compared to a float
            if af_max_fields is not None:
                af_max_fail = False
                for af_max_field in af_max_fields:
                    af = vcf_variant.INFO.get(af_max_field)
                    if af is not None and af > args.af_max:
                        af_max_fail = True
                        break
                if af_max_fail:
                    continue

        trans_carriers = set()
        if (pro_cnds is not None
                and transpar_cnds is not None
                and ntranspar_cnds is not None):
            # per-sample arrays are read once per variant and reused
            gt_types = vcf_variant.gt_types
            gt_alt_freqs = vcf_variant.gt_alt_freqs
            for iid in samples_i.trios:
                pid = samples_i.trios[iid].pid
                mid = samples_i.trios[iid].mid
                iid_idx = samples_i.samples[iid].idx
                pid_idx = samples_i.samples[pid].idx
                mid_idx = samples_i.samples[mid].idx

                # parent with higher % alt reads is transpar, other parent
                # is ntranspar (ties go to the mother as transpar)
                if gt_alt_freqs[pid_idx] > gt_alt_freqs[mid_idx]:
                    transpar, transpar_idx = pid, pid_idx
                    ntranspar_idx = mid_idx
                else:
                    transpar, transpar_idx = mid, mid_idx
                    ntranspar_idx = pid_idx

                # is more likely transmitting parent het?
                if gt_types[transpar_idx] != 1:
                    continue

                # is less likely transmitting parent hom ref?
                if gt_types[ntranspar_idx] != 0:
                    continue

                # if looking at trans variants, keep if pro is het at site
                # if looking at ntrans variants, keep if pro is homref at site
                pro_gt_required = 1 if args.ntrans == False else 0
                if gt_types[iid_idx] != pro_gt_required:
                    continue

                # test if proband passes conditions in proband cnds file
                if pro_cnds.test_gt(vcf_variant, iid_idx) == False:
                    continue

                # test if more likely transmitting parent passes conditions
                # in transpar cnds file
                if transpar_cnds.test_gt(vcf_variant, transpar_idx) == False:
                    continue

                # test if less likely transmitting parent passes conditions
                # in ntranspar cnds file
                if ntranspar_cnds.test_gt(vcf_variant, ntranspar_idx) == False:
                    continue

                trans_carriers.add((transpar, iid))
        else:
            trans_carriers = screens.trans_screen(
                vcf_variant,
                samples_i,
                ntrans=args.ntrans,
                transpar_gts=set([1]),
                ntranspar_gts=set([0]),
                min_coverage=args.min_coverage,
                pro_min_perc_alt=args.pro_min_perc_alt,
                pro_max_perc_alt=args.pro_max_perc_alt,
                transpar_min_perc_alt=args.transpar_min_perc_alt,
                transpar_max_perc_alt=args.transpar_max_perc_alt,
                ntranspar_max_perc_alt=args.ntranspar_max_perc_alt,
                pro_homref_phredmin=args.pro_homref_phredmin,
                pro_het_phredmax=args.pro_het_phredmax,
                pro_homalt_phredmin=args.pro_homalt_phredmin)

        if len(trans_carriers) > 0:
            # each carrier is a (transmitting parent, proband) pair; the
            # count is attributed to the proband
            for pair in trans_carriers:
                samples_i.samples[pair[1]].varcounts[gt_class] += 1
            outs = cyvcf2_variant.variant_to_list(
                samples_i,
                trans_carriers,
                cyvcf2_vcf.info_subfields,
                GT_VARNAMES,
                csqs_maximpact=csqs_maximpact_list,
                max_csq_scores=max_csq_scores,
                min_csq_scores=min_csq_scores,
                delim="\t")
            for out in outs:
                misc.append_out_file(args.out_tsv, out)

    vcf.close()

    samples_i.print_varcount_stats(var_types=[gt_class])

    return
def _cpht_row_positions(het_trans, gene_col, qual_impacts):
    """
    Return positions of het_trans rows that belong to a compound het genotype.

    Args:
        het_trans: pandas DataFrame of transmitted het calls with columns
                   CHROM, POS, REF, ALT, PAR_IID, PRO_IID,
                   IMPACT_maximpact and gene_col.
        gene_col: name of the column holding the gene each call maps to.
        qual_impacts: set of qualifying IMPACT_maximpact strings, or None
                      to accept every impact.

    Returns:
        List of 0-based row positions, in table order, each listed once.
    """
    columns = (gene_col, "PRO_IID", "PAR_IID", "IMPACT_maximpact",
               "CHROM", "POS", "REF", "ALT")
    # every cell is compared as a string, so stringify once up front
    rows = []
    for values in zip(*(het_trans[col] for col in columns)):
        gene, pro, par, impact, chrom, pos, ref, alt = (str(v) for v in values)
        rows.append((gene, pro, par, impact, "-".join([chrom, pos, ref, alt])))

    # first pass of transmitted het calls, get qualifying var calls,
    # carrying proband, and parent of origin: gene -> pro -> par -> var ids
    cpht_gts = {}
    for gene, pro, par, impact, var_id in rows:
        if qual_impacts is not None and impact not in qual_impacts:
            continue
        pars = cpht_gts.setdefault(gene, {}).setdefault(pro, {})
        pars.setdefault(par, set()).add(var_id)

    # identify all gene-based instances of compound heterozygosity:
    # proband has qualifying calls from exactly two parents in the gene
    cpht_pro_gene = set()
    for gene, pros in cpht_gts.items():
        for pro, pars in pros.items():
            if len(pars) == 2:
                cpht_pro_gene.add((pro, gene))

    # second pass, select all variants that are part of cpht genotypes.
    # any() keeps a row once even if the same variant id was recorded
    # under both parents, which previously duplicated the output row.
    keep = []
    for position, (gene, pro, _par, _impact, var_id) in enumerate(rows):
        if (pro, gene) not in cpht_pro_gene:
            continue
        if any(var_id in var_ids for var_ids in cpht_gts[gene][pro].values()):
            keep.append(position)
    return keep


def main(ARGS=None):
    """
    workflow
    1. read fam file, tsv of transmitted het variants
    2. for each proband, cluster variants by max annot gene
    3. keep variants in (proband, gene) pairs that have qualifying
       transmitted het calls from both parents, write them to out_tsv

    Args:
        ARGS: list of command line argument strings. When None,
              sys.argv[1:] is used.

    Returns:
        None. The selected rows are written to args.out_tsv.
    """
    if ARGS is None:
        ARGS = sys.argv[1:]
    args = parse_args(ARGS)

    # convert comma delim qualifying impacts to a set. When no impacts are
    # given, leave it as None (no impact filtering) instead of calling
    # set(None). Presumably str_none_split passes None through; confirm.
    qual_impacts = misc.str_none_split(args.qual_impacts, ",")
    if qual_impacts is not None:
        qual_impacts = set(qual_impacts)
    args.qual_impacts = qual_impacts

    # read samples from fam file, send basic stats to stdout
    samples_i = Samples(args.in_fam)
    samples_i.print_stats()

    # read het transmitted table into pandas DataFrame
    het_trans = pandas.read_csv(args.het_trans_tsv, sep="\t", header=0)

    # select rows by position so the result does not depend on the
    # DataFrame's index labels
    i_keep = _cpht_row_positions(het_trans, args.gene_col, args.qual_impacts)

    # write vars that are part of cpht genotypes to out_tsv
    het_trans = het_trans.iloc[i_keep, :]
    het_trans.to_csv(path_or_buf=args.out_tsv,
                     sep="\t",
                     header=True,
                     index=False)

    return
def main(ARGS=None):
    """
    Filter a VCF on annotation impact and an optional variant cnds file,
    writing surviving records (optionally annotated with max-impact and
    max/min annotation scores as new INFO fields) to a new VCF.

    Args:
        ARGS: list of command line argument strings. When None,
              sys.argv[1:] is used.

    Returns:
        None. Retained records are written to args.out_vcf and the number
        of input and retained variants is printed.
    """
    if ARGS is None:
        ARGS = sys.argv[1:]
    args = parse_args(ARGS)

    # convert certain comma delim str args to lists
    args.qual_impacts = misc.str_none_split(args.qual_impacts, ",")
    args.max_impact_csqs = misc.str_none_split(args.max_impact_csqs, ",")
    args.max_csq_scores = misc.str_none_split(args.max_csq_scores, ",")
    args.min_csq_scores = misc.str_none_split(args.min_csq_scores, ",")

    # read cnds files
    var_cnds = None
    if args.variant_cnds is not None:
        var_cnds = VcfCnds(args.variant_cnds)

    # init cyvcf2 VCF obj, get info subfields, header for output
    vcf = cyvcf2.VCF(args.in_vcf, strict_gt=True)
    cyvcf2_vcf = Cyvcf2Vcf(vcf)
    cyvcf2_vcf.get_info_subfields()
    if args.annotation_subfield == "ANN":
        cyvcf2_vcf.get_csq_keys(spliton="Functional annotations: ",
                                delim="|",
                                chars_del=[" ", "'", '"'],
                                ann_id=args.annotation_subfield)
    else:
        cyvcf2_vcf.get_csq_keys(spliton="Format: ",
                                delim="|",
                                ann_id=args.annotation_subfield)
    # NOTE(review): the result is not used when writing a VCF; the call is
    # kept in case header_to_list has side effects on cyvcf2_vcf.
    vcf_header_str = cyvcf2_vcf.header_to_list(
        gt_varnames=GT_VARNAMES,
        max_impact=args.max_impact,
        max_impact_csqs=args.max_impact_csqs,
        max_csq_scores=args.max_csq_scores,
        min_csq_scores=args.min_csq_scores,
        delim="\t")

    # since we're writing to a VCF, if any new INFO items written, need to
    # add to header to reflect this.
    if args.max_impact_csqs is not None:
        for csq_name in args.max_impact_csqs:
            # values are multi-character annotation strings, so the VCF
            # type is String (Character denotes a single character)
            vcf.add_info_to_header({
                'ID': csq_name + "_maximpact",
                'Description': 'max ' + csq_name + ' to go along '
                               'with transcripts with max IMPACT',
                'Type': 'String',
                'Number': '1'})
    if args.max_csq_scores is not None:
        for csq_name in args.max_csq_scores:
            vcf.add_info_to_header({
                'ID': csq_name + "_max",
                'Description': 'max value for ' + csq_name +
                               ' along assessed transcripts '
                               'in CSQ field.',
                'Type': 'Float',
                'Number': '1'})
    if args.min_csq_scores is not None:
        for csq_name in args.min_csq_scores:
            vcf.add_info_to_header({
                'ID': csq_name + "_min",
                'Description': 'min value for ' + csq_name +
                               ' along assessed transcripts '
                               'in CSQ field.',
                'Type': 'Float',
                'Number': '1'})

    # init VCF writer object
    w = cyvcf2.Writer(args.out_vcf, vcf)
    # to write variant record, for v in vcf: w.write_record(v)

    # if intervals provided (file with one interval per line, or a single
    # interval string), parse over those, else whole vcf
    if args.intervals is not None:
        if os.path.isfile(args.intervals):
            with open(args.intervals, "r") as intervals_fh:
                intervals = [line.rstrip() for line in intervals_fh]
        else:
            intervals = [args.intervals]
    else:
        intervals = [""]

    # iterate through all variants, applying the filters to each one
    prev_chrom = None
    n_var = 0
    n_var_keep = 0
    for vcf_variant in cyvcf2_vcf.iterator(intervals):
        n_var += 1

        # create new Cyvcf2Variant instance
        cyvcf2_variant = Cyvcf2Variant(vcf_variant)

        if vcf_variant.CHROM != prev_chrom:
            print("Extracting variants from chrom " + vcf_variant.CHROM)
            prev_chrom = vcf_variant.CHROM

        # assume single allele per site, exclude sites with call as '*'
        if vcf_variant.ALT[0] == '*':
            continue

        # if no qualifying impact str found in annotation, skip
        # (compared by value: the helper's return type is not visible here)
        if args.qual_impacts is not None:
            res = cyvcf2_variant.qual_impacts_screen(
                args.qual_impacts,
                csq_subfield=args.annotation_subfield)
            if res == False:
                continue

        # if desired, derive max impact annots from var, along with other
        # user defined max or min scores in annotation for variant
        csqs_maximpact_list = []
        max_csq_scores = []
        min_csq_scores = []
        if args.max_impact == True:
            cyvcf2_variant.get_annot_txs(
                cyvcf2_vcf.csq_keys,
                csq_subfield=args.annotation_subfield)
            if args.annotation_subfield == "ANN":
                impact_subfield = "Annotation_Impact"
            else:
                impact_subfield = "IMPACT"
            res = cyvcf2_variant.maxmin_csqs(
                csq_subfield=args.annotation_subfield,
                impact_subfield=impact_subfield,
                max_impact_csqs=args.max_impact_csqs,
                max_csq_scores=args.max_csq_scores,
                min_csq_scores=args.min_csq_scores)
            (csqs_maximpact_list, max_csq_scores, min_csq_scores) = res

        # if corresponding values defined, add to vcf record. zip pairs
        # each requested name with its derived value and writes nothing
        # when the value lists are empty (max_impact not requested),
        # where indexing by position would raise IndexError.
        if args.max_impact_csqs is not None:
            for csq_name, max_impact_csq in zip(args.max_impact_csqs,
                                                csqs_maximpact_list):
                vcf_variant.INFO[csq_name + "_maximpact"] = max_impact_csq
        if args.min_csq_scores is not None:
            for csq_name, min_csq_score in zip(args.min_csq_scores,
                                               min_csq_scores):
                vcf_variant.INFO[csq_name + "_min"] = float(min_csq_score)
        if args.max_csq_scores is not None:
            for csq_name, max_csq_score in zip(args.max_csq_scores,
                                               max_csq_scores):
                vcf_variant.INFO[csq_name + "_max"] = float(max_csq_score)

        # filter on variant cnds file, only when one was provided
        if var_cnds is not None:
            if var_cnds.test_variant(vcf_variant) == False:
                continue

        # if variant survives filters, retain record
        w.write_record(vcf_variant)
        n_var_keep += 1

    w.close()
    vcf.close()

    # print basic stats on number of input variants, number of
    # variants to keep
    print("Number of variants in parent VCF : " + str(n_var))
    print("Number of variants retained post-filtration : " + str(n_var_keep))

    return