Exemple #1
0
def main(ARGS = None):
    """
    Screen a VCF for newly homozygous variants in trio probands.

    workflow
    1. parse args, read samples from fam file, read optional cnds files
    2. iterate over VCF records (restricted to intervals if provided),
       dropping records that fail annotation or variant-level filters
    3. for each remaining record, collect probands that are hom alt while
       both parents are het (cnds file mode), or delegate carrier
       detection to screens.hom_screen
    4. append rows for every record with at least one carrier to out_tsv

    ARGS : list of command line arg strs, sys.argv[1:] used if None
    """
    if ARGS is None:
        ARGS = sys.argv[1:]
    args = parse_args(ARGS)

    ## convert certain comma delim str args to lists
    args.qual_impacts = misc.str_none_split(args.qual_impacts, ",")
    args.max_impact_csqs = misc.str_none_split(args.max_impact_csqs, ",")
    args.max_csq_scores = misc.str_none_split(args.max_csq_scores, ",")
    args.min_csq_scores = misc.str_none_split(args.min_csq_scores, ",")

    ## the remaining comma delim filter args do not change between
    ## records, so split them once here rather than once per variant
    filter_include = None
    if args.filter_include is not None:
        filter_include = set(args.filter_include.split(","))
    info_flags_exclude = []
    if args.vcf_info_flags_exclude is not None:
        info_flags_exclude = args.vcf_info_flags_exclude.split(",")
    af_max_fields = []
    if args.af_max_fields is not None and args.af_max is not None:
        af_max_fields = args.af_max_fields.split(",")

    ## read samples from fam file, send basic stats to stdout
    samples_i = Samples(args.in_fam)
    samples_i.print_stats()

    ## read cnds files
    var_cnds = None
    pro_cnds = None
    par_cnds = None
    if args.variant_cnds is not None: var_cnds = VcfCnds(args.variant_cnds)
    if args.pro_cnds is not None: pro_cnds = VcfCnds(args.pro_cnds)
    if args.par_cnds is not None: par_cnds = VcfCnds(args.par_cnds)

    ## init cyvcf2 VCF obj, get info subfields, header for output.
    ## gts012=True makes gt_types report 0=hom ref, 1=het, 2=hom alt
    vcf = cyvcf2.VCF(args.in_vcf, strict_gt=True, gts012=True)
    cyvcf2_vcf = Cyvcf2Vcf(vcf)
    cyvcf2_vcf.get_info_subfields()
    cyvcf2_vcf.get_csq_keys(spliton="Format: ", delim="|")
    vcf_header_str = cyvcf2_vcf.header_to_list(gt_varnames=GT_VARNAMES,
                                               max_impact=args.max_impact,
                                               max_impact_csqs=args.max_impact_csqs,
                                               max_csq_scores=args.max_csq_scores,
                                               min_csq_scores=args.min_csq_scores,
                                               delim="\t")

    ## create sample idx
    samples_i.get_vcf_idx(vcf.samples)

    ## get control idxs, needed for the internal ctrl-only maf filter
    ctrl_idxs = [samples_i.samples[x].idx for x in samples_i.ctrls]

    ## if intervals provided, make sure to parse over those, else whole vcf.
    ## intervals arg is either a file with one interval per line or a
    ## single interval str
    if args.intervals is not None:
        if os.path.isfile(args.intervals):
            with open(args.intervals, "r") as intervals_fh:
                intervals = [line.rstrip() for line in intervals_fh]
        else:
            intervals = [args.intervals]
    else:
        intervals = [""]

    ## init output file
    misc.init_out_file(args.out_tsv,
                       force_overwrite = args.force_overwrite,
                       init_line = vcf_header_str)

    ## parse VCF file looking for newly homozygous variant calls
    prev_chrom = None
    for vcf_variant in cyvcf2_vcf.iterator(intervals):
        if vcf_variant.CHROM != prev_chrom:
            print("Extracting newly homozygous vars from chrom " + vcf_variant.CHROM)
            prev_chrom = vcf_variant.CHROM

        ## create new Cyvcf2Variant instance
        cyvcf2_variant = Cyvcf2Variant(vcf_variant)

        ## assume single alternate allele per row, exclude sites with call
        ## as '*' since these are by-product of multi-sample calling and
        ## don't really represent a real SNV/indel
        alt = vcf_variant.ALT[0]
        if alt == '*': continue

        ## if no qualifying impact str found in CSQ, skip
        if args.qual_impacts is not None:
            res = cyvcf2_variant.qual_impacts_screen(args.qual_impacts,
                                                     csq_subfield="CSQ")
            if res == False: continue

        ## if desired, derive max impact annots from var, along with other
        ## user defined max or min scores in CSQ for variant
        csqs_maximpact_list = []
        max_csq_scores = []
        min_csq_scores = []
        if args.max_impact == True:
            cyvcf2_variant.get_annot_txs(cyvcf2_vcf.csq_keys,
                                         csq_subfield="CSQ")
            res = cyvcf2_variant.maxmin_csqs(max_impact_csqs=args.max_impact_csqs,
                                             max_csq_scores=args.max_csq_scores,
                                             min_csq_scores=args.min_csq_scores,
                                             impact_subfield="IMPACT")
            (csqs_maximpact_list, max_csq_scores, min_csq_scores) = res

        ## filter on internal ctrl-only maf, can't do in var cnds file
        if args.internal_ctrl_af_max is not None:
            ctrl_af = cyvcf2_variant.compute_maf(ctrl_idxs)
            if ctrl_af > args.internal_ctrl_af_max:
                continue

        ## variant cnds file provided, filter exclusively on that
        if var_cnds is not None:
            if var_cnds.test_variant(vcf_variant) == False: continue
        ## otherwise, filter on user args
        else:

            ## filter on FILTER column, default is PASS only (cyvcf2 gives
            ## FILTER as None for PASS), allow for user defined FILTER
            ## classifs too. The whole FILTER str of the record has to
            ## match one of the user defined values.
            variant_filter = vcf_variant.FILTER
            if variant_filter is not None:
                if filter_include is None: continue
                if variant_filter not in filter_include: continue

            ## filter on user-defined VCF INFO flags
            if any(vcf_variant.INFO.get(flag) == True
                   for flag in info_flags_exclude):
                continue

            ## filter on user-defined maximum internal MAF, a record
            ## without an AF annotation cannot exceed the max so is kept
            if args.internal_af_max is not None:
                internal_af = vcf_variant.INFO.get("AF")
                if internal_af is not None and internal_af > args.internal_af_max:
                    continue

            ## filter on user-defined maximum external MAF, fields missing
            ## from the record are treated as not exceeding the max
            af_max_fail = False
            for af_max_field in af_max_fields:
                af = vcf_variant.INFO.get(af_max_field)
                if af is not None and af > args.af_max:
                    af_max_fail = True
                    break
            if af_max_fail == True: continue

        hom_carriers = set()
        if pro_cnds is not None and par_cnds is not None:
            for iid in samples_i.trios:
                pid = samples_i.trios[iid].pid
                mid = samples_i.trios[iid].mid
                iid_idx = samples_i.samples[iid].idx
                pid_idx = samples_i.samples[pid].idx
                mid_idx = samples_i.samples[mid].idx
                trio_idxs = (iid_idx, pid_idx, mid_idx)
                trio_gts = [vcf_variant.gt_types[id_x] for id_x in trio_idxs]

                ## is iid hom alt at site?
                if trio_gts[0] != 2:
                    continue

                ## are both parents het at site?
                if trio_gts[1] != 1 or trio_gts[2] != 1:
                    continue

                ## test if proband and parents pass conditions in proband
                ## and parent cnds files, respectively
                if pro_cnds.test_gt(vcf_variant, iid_idx) == False: continue
                if par_cnds.test_gt(vcf_variant, pid_idx) == False: continue
                if par_cnds.test_gt(vcf_variant, mid_idx) == False: continue
                hom_carriers.add(iid)

        else:
            hom_screen = screens.hom_screen
            hom_carriers = hom_screen(vcf_variant, samples_i,
                                      min_coverage = args.min_coverage,
                                      pro_min_perc_alt=args.pro_min_perc_alt,
                                      par_max_perc_alt=args.par_max_perc_alt,
                                      pro_homref_phredmin=args.pro_homref_phredmin,
                                      pro_het_phredmin=args.pro_het_phredmin,
                                      pro_homalt_phredmax=args.pro_homalt_phredmax,
                                      par_homref_phredmin=args.par_homref_phredmin,
                                      par_het_phredmax=args.par_het_phredmax,
                                      par_homalt_phredmin=args.par_homalt_phredmin)

        ## tally per-sample counts and write rows for carried variants
        if len(hom_carriers) > 0:
            for iid in hom_carriers:
                samples_i.samples[iid].varcounts["hom"] += 1
            outs = cyvcf2_variant.variant_to_list(samples_i,
                                                  hom_carriers,
                                                  cyvcf2_vcf.info_subfields,
                                                  GT_VARNAMES,
                                                  csqs_maximpact=csqs_maximpact_list,
                                                  max_csq_scores=max_csq_scores,
                                                  min_csq_scores=min_csq_scores,
                                                  delim="\t")
            for out in outs:
                misc.append_out_file(args.out_tsv, out)

    vcf.close()
    samples_i.print_varcount_stats(var_types=["hom"])
    return
Exemple #2
0
def main(ARGS=None):
    """
    Screen a VCF for newly hemizygous variants in male trio probands.

    workflow
    1. parse args, read samples from fam file, keep trios with a male
       proband (exit with status 1 if there are none)
    2. iterate over VCF records in args.x_chrom_interval, dropping records
       that fail annotation or variant-level filters
    3. for each remaining record, collect male probands that carry the
       alt allele while the father is hom ref and the mother is het
       (cnds file mode), or delegate carrier detection to
       screens.hemi_screen
    4. append rows for every record with at least one carrier to out_tsv

    ARGS : list of command line arg strs, sys.argv[1:] used if None
    """
    if ARGS is None:
        ARGS = sys.argv[1:]
    args = parse_args(ARGS)

    ## convert certain comma delim str args to lists
    args.qual_impacts = str_none_split(args.qual_impacts, ",")
    args.max_impact_csqs = str_none_split(args.max_impact_csqs, ",")
    args.max_csq_scores = str_none_split(args.max_csq_scores, ",")
    args.min_csq_scores = str_none_split(args.min_csq_scores, ",")

    ## convert all comma delim args to list, or leave as none.
    args_str_none = ("filter_include", "af_max_fields",
                     "vcf_info_flags_exclude")
    for arg in args_str_none:
        vars(args)[arg] = misc.str_none_split(vars(args)[arg], ",")

    ## read samples from fam file, send basic stats to stdout
    samples_i = samples.Samples(args.in_fam)
    samples_i.print_stats()

    ## get all trios from samples with male proband
    male_trios = {iid: samples_i.samples[iid]
                  for iid in samples_i.trios
                  if samples_i.samples[iid].gender == "M"}
    if len(male_trios) == 0:
        print("ERROR: " + \
              "Can't do hemizygous screen if no " + \
              "trios with male proband in cohort.")
        sys.exit(1)

    ## read cnds files
    var_cnds = None
    iid_cnds = None
    pid_cnds = None
    mid_cnds = None
    if args.variant_cnds is not None: var_cnds = VcfCnds(args.variant_cnds)
    if args.iid_cnds is not None: iid_cnds = VcfCnds(args.iid_cnds)
    if args.pid_cnds is not None: pid_cnds = VcfCnds(args.pid_cnds)
    if args.mid_cnds is not None: mid_cnds = VcfCnds(args.mid_cnds)

    ## init cyvcf2 VCF obj, get info subfields, header for output.
    ## gts012=True makes gt_types report 0=hom ref, 1=het, 2=hom alt
    vcf = cyvcf2.VCF(args.in_vcf, strict_gt=True, gts012=True)
    cyvcf2_vcf = Cyvcf2Vcf(vcf)
    cyvcf2_vcf.get_info_subfields()
    cyvcf2_vcf.get_csq_keys(spliton="Format: ", delim="|")
    vcf_header_str = cyvcf2_vcf.header_to_list(
        gt_varnames=GT_VARNAMES,
        max_impact=args.max_impact,
        max_impact_csqs=args.max_impact_csqs,
        max_csq_scores=args.max_csq_scores,
        min_csq_scores=args.min_csq_scores,
        delim="\t")

    ## create sample idx
    samples_i.get_vcf_idx(vcf.samples)

    ## get control idxs, needed for the internal ctrl-only maf filter
    ctrl_idxs = [samples_i.samples[x].idx for x in samples_i.ctrls]

    ## init output file
    init_out_file(args.out_tsv,
                  force_overwrite=args.force_overwrite,
                  init_line=vcf_header_str + "\n")

    ## only parse X chromosome, since this is only place where hemi vars
    ## can happen
    for vcf_variant in cyvcf2_vcf.cyvcf2_vcf(args.x_chrom_interval):

        ## assume single allele per site, exclude sites with call as '*'
        alt = vcf_variant.ALT[0]
        if alt == '*': continue

        ## create new Cyvcf2Variant instance
        cyvcf2_variant = Cyvcf2Variant(vcf_variant)

        ## if no qualifying impact str found in CSQ, skip
        if args.qual_impacts is not None:
            res = cyvcf2_variant.qual_impacts_screen(args.qual_impacts,
                                                     csq_subfield="CSQ")
            if res == False: continue

        ## if desired, derive max impact annots from var, along with other
        ## user defined max or min scores in CSQ for variant
        csqs_maximpact_list = []
        max_csq_scores = []
        min_csq_scores = []
        if args.max_impact == True:
            cyvcf2_variant.get_annot_txs(cyvcf2_vcf.csq_keys,
                                         csq_subfield="CSQ")
            res = cyvcf2_variant.maxmin_csqs(
                max_impact_csqs=args.max_impact_csqs,
                max_csq_scores=args.max_csq_scores,
                min_csq_scores=args.min_csq_scores,
                impact_subfield="IMPACT")
            (csqs_maximpact_list, max_csq_scores, min_csq_scores) = res

        ## filter on internal ctrl-only maf, can't do in var cnds file
        if args.internal_ctrl_af_max is not None:
            ctrl_af = cyvcf2_variant.compute_maf(ctrl_idxs)
            if ctrl_af > args.internal_ctrl_af_max:
                continue

        ## variant cnds file provided, filter exclusively on that
        if var_cnds is not None:
            if var_cnds.test_variant(vcf_variant) == False: continue
        ## otherwise, filter on user args
        else:
            var_pass = screens.variant_screen(
                vcf_variant,
                min_qual=args.min_qual,
                filter_include=args.filter_include,
                vcf_info_flags_exclude=args.vcf_info_flags_exclude,
                internal_af_max=args.internal_af_max,
                af_max=args.af_max,
                af_max_fields=args.af_max_fields)
            if var_pass == False: continue

        hemi_carriers = set()
        if iid_cnds is not None and pid_cnds is not None and mid_cnds is not None:
            for iid in male_trios:
                pid = samples_i.samples[iid].pid
                mid = samples_i.samples[iid].mid
                iid_idx = samples_i.samples[iid].idx
                pid_idx = samples_i.samples[pid].idx
                mid_idx = samples_i.samples[mid].idx

                ## order is (proband, father, mother), the father idx has
                ## to be included so that the positional gt checks below
                ## look at the right sample
                trio_idxs = (iid_idx, pid_idx, mid_idx)
                trio_gts = [vcf_variant.gt_types[id_x] for id_x in trio_idxs]

                ## is iid hemi at site? (called as either het or hom alt)
                if trio_gts[0] not in (1, 2): continue

                ## is father hom ref at site?
                if trio_gts[1] != 0: continue

                ## is mother het at site?
                if trio_gts[2] != 1: continue

                ## test if proband and parents pass conditions in proband
                ## and parent cnds files, respectively
                if iid_cnds.test_gt(vcf_variant, iid_idx) == False: continue
                if pid_cnds.test_gt(vcf_variant, pid_idx) == False: continue
                if mid_cnds.test_gt(vcf_variant, mid_idx) == False: continue
                hemi_carriers.add(iid)

        else:
            hemi_screen = screens.hemi_screen
            hemi_carriers = hemi_screen(vcf_variant,
                                        samples_i,
                                        male_trios,
                                        min_coverage=args.min_coverage,
                                        iid_min_perc_alt=args.iid_min_perc_alt,
                                        pid_max_perc_alt=args.pid_max_perc_alt,
                                        mid_min_perc_alt=args.mid_min_perc_alt,
                                        iid_het_phredmax=args.iid_het_phredmax,
                                        pid_hom_phredmax=args.pid_hom_phredmax,
                                        mid_het_phredmax=args.mid_het_phredmax,
                                        iid_hom_phredmin=args.iid_hom_phredmin,
                                        pid_het_phredmin=args.pid_het_phredmin,
                                        mid_hom_phredmin=args.mid_hom_phredmin)

        ## tally per-sample counts and write rows for carried variants
        if len(hemi_carriers) > 0:
            for iid in hemi_carriers:
                samples_i.samples[iid].varcounts["newlyhemi"] += 1
            outs = cyvcf2_variant.variant_to_list(
                vcf_variant,
                samples_i,
                hemi_carriers,
                cyvcf2_vcf.info_subfields,
                GT_VARNAMES,
                csqs_maximpact=csqs_maximpact_list,
                max_csq_scores=max_csq_scores,
                min_csq_scores=min_csq_scores,
                delim="\t")
            for out in outs:
                append_out_file(args.out_tsv, out)

    vcf.close()
    print("Screening of VCF for newly hemizygous variants complete.")
    samples_i.print_varcount_stats(var_types=["newlyhemi"])
    return
def main(ARGS = None):
    """
    Screen a VCF for het variants that a parent did (or, with ntrans, did
    not) transmit to a trio proband.

    workflow
    1. parse args, read samples from fam file, read optional cnds files
    2. iterate over VCF records (restricted to intervals if provided),
       dropping records that fail annotation or variant-level filters
    3. for each remaining record and trio, take the parent with the higher
       alt read frequency as the candidate transmitting parent; keep the
       (parent, proband) pair if that parent is het, the other parent is
       hom ref and the proband is het (hom ref when ntrans is set), or
       delegate the whole test to screens.trans_screen
    4. append rows for every record with at least one pair to out_tsv

    ARGS : list of command line arg strs, sys.argv[1:] used if None
    """
    if ARGS is None:
        ARGS = sys.argv[1:]
    args = parse_args(ARGS)

    ## set name of gt class
    gt_class = "het_trans"
    if args.ntrans == True: gt_class = "het_ntrans"

    ## convert certain comma delim str args to lists
    args.qual_impacts = misc.str_none_split(args.qual_impacts, ",")
    args.max_impact_csqs = misc.str_none_split(args.max_impact_csqs, ",")
    args.max_csq_scores = misc.str_none_split(args.max_csq_scores, ",")
    args.min_csq_scores = misc.str_none_split(args.min_csq_scores, ",")

    ## the remaining comma delim filter args do not change between
    ## records, so split them once here rather than once per variant
    filter_include = None
    if args.filter_include is not None:
        filter_include = set(args.filter_include.split(","))
    info_flags_exclude = []
    if args.vcf_info_flags_exclude is not None:
        info_flags_exclude = args.vcf_info_flags_exclude.split(",")
    af_max_fields = []
    if args.af_max_fields is not None and args.af_max is not None:
        af_max_fields = args.af_max_fields.split(",")

    ## read samples from fam file, send basic stats to stdout
    samples_i = Samples(args.in_fam)
    samples_i.print_stats()

    ## read cnds files
    var_cnds = None
    pro_cnds = None
    transpar_cnds = None
    ntranspar_cnds = None
    if args.variant_cnds is not None: var_cnds = VcfCnds(args.variant_cnds)
    if args.pro_cnds is not None: pro_cnds = VcfCnds(args.pro_cnds)
    if args.transpar_cnds is not None: transpar_cnds = VcfCnds(args.transpar_cnds)
    if args.ntranspar_cnds is not None: ntranspar_cnds = VcfCnds(args.ntranspar_cnds)

    ## init cyvcf2 VCF obj, get info subfields, header for output.
    ## gts012=True makes gt_types report 0=hom ref, 1=het, 2=hom alt
    vcf = cyvcf2.VCF(args.in_vcf, strict_gt=True, gts012=True)
    cyvcf2_vcf = Cyvcf2Vcf(vcf)
    cyvcf2_vcf.get_info_subfields()
    cyvcf2_vcf.get_csq_keys(spliton="Format: ", delim="|")
    vcf_header_str = cyvcf2_vcf.header_to_list(main_fields=["PAR_IID","PRO_IID","CHROM",
                                                            "POS","ID","REF","ALT",
                                                            "QUAL","FILTER"],
                                               gt_varnames=GT_VARNAMES,
                                               max_impact=args.max_impact,
                                               max_impact_csqs=args.max_impact_csqs,
                                               max_csq_scores=args.max_csq_scores,
                                               min_csq_scores=args.min_csq_scores,
                                               delim="\t")

    ## create sample idx
    samples_i.get_vcf_idx(vcf.samples)

    ## get control idxs, needed for the internal ctrl-only maf filter
    ctrl_idxs = [samples_i.samples[x].idx for x in samples_i.ctrls]

    ## if intervals provided, make sure to parse over those, else whole vcf.
    ## intervals arg is either a file with one interval per line or a
    ## single interval str
    if args.intervals is not None:
        if os.path.isfile(args.intervals):
            with open(args.intervals, "r") as intervals_fh:
                intervals = [line.rstrip() for line in intervals_fh]
        else:
            intervals = [args.intervals]
    else:
        intervals = [""]

    ## init output file
    misc.init_out_file(args.out_tsv,
                       force_overwrite = args.force_overwrite,
                       init_line = vcf_header_str)

    ## parse VCF file looking for (non)transmitted het variant calls
    prev_chrom = None
    for vcf_variant in cyvcf2_vcf.iterator(intervals):
        if vcf_variant.CHROM != prev_chrom:
            print("Extracting " + gt_class + " from chrom " + vcf_variant.CHROM)
            prev_chrom = vcf_variant.CHROM

        ## assume single allele per site, exclude sites with call as '*'
        alt = vcf_variant.ALT[0]
        if alt == '*': continue

        ## create new Cyvcf2Variant instance
        cyvcf2_variant = Cyvcf2Variant(vcf_variant)

        ## if no qualifying impact str found in CSQ, skip
        if args.qual_impacts is not None:
            res = cyvcf2_variant.qual_impacts_screen(args.qual_impacts,
                                                     csq_subfield="CSQ")
            if res == False: continue

        ## if desired, derive max impact annots from var, along with other
        ## user defined max or min scores in CSQ for variant
        csqs_maximpact_list = []
        max_csq_scores = []
        min_csq_scores = []
        if args.max_impact == True:
            cyvcf2_variant.get_annot_txs(cyvcf2_vcf.csq_keys,
                                         csq_subfield="CSQ")
            res = cyvcf2_variant.maxmin_csqs(max_impact_csqs=args.max_impact_csqs,
                                             max_csq_scores=args.max_csq_scores,
                                             min_csq_scores=args.min_csq_scores,
                                             impact_subfield="IMPACT")
            (csqs_maximpact_list, max_csq_scores, min_csq_scores) = res

        ## filter on internal ctrl-only maf, can't do in var cnds file
        if args.internal_ctrl_af_max is not None:
            ctrl_af = cyvcf2_variant.compute_maf(ctrl_idxs)
            if ctrl_af > args.internal_ctrl_af_max:
                continue

        ## variant cnds file provided, filter exclusively on that
        if var_cnds is not None:
            if var_cnds.test_variant(vcf_variant) == False: continue
        ## otherwise, filter on user args
        else:

            ## filter on FILTER column, default is PASS only (cyvcf2 gives
            ## FILTER as None for PASS), allow for user defined FILTER
            ## classifs too. The whole FILTER str of the record has to
            ## match one of the user defined values.
            variant_filter = vcf_variant.FILTER
            if variant_filter is not None:
                if filter_include is None: continue
                if variant_filter not in filter_include: continue

            ## filter on user-defined VCF INFO flags
            if any(vcf_variant.INFO.get(flag) == True
                   for flag in info_flags_exclude):
                continue

            ## filter on user-defined maximum internal MAF, a record
            ## without an AF annotation cannot exceed the max so is kept
            if args.internal_af_max is not None:
                internal_af = vcf_variant.INFO.get("AF")
                if internal_af is not None and internal_af > args.internal_af_max:
                    continue

            ## filter on user-defined maximum external MAF, fields missing
            ## from the record are treated as not exceeding the max
            af_max_fail = False
            for af_max_field in af_max_fields:
                af = vcf_variant.INFO.get(af_max_field)
                if af is not None and af > args.af_max:
                    af_max_fail = True
                    break
            if af_max_fail == True: continue

        trans_carriers = set()
        if (pro_cnds is not None and transpar_cnds is not None
                and ntranspar_cnds is not None):
            for iid in samples_i.trios:
                pid = samples_i.trios[iid].pid
                mid = samples_i.trios[iid].mid
                iid_idx = samples_i.samples[iid].idx
                pid_idx = samples_i.samples[pid].idx
                mid_idx = samples_i.samples[mid].idx
                trio_idxs = (iid_idx, pid_idx, mid_idx)

                ## get trio gts, order is (proband, father, mother)
                trio_gts = [vcf_variant.gt_types[id_x] for id_x in trio_idxs]

                ## parent with higher % alt reads is transpar,
                ## other parent is ntranspar (mother on ties)
                pid_alt_freq = vcf_variant.gt_alt_freqs[pid_idx]
                mid_alt_freq = vcf_variant.gt_alt_freqs[mid_idx]
                if pid_alt_freq > mid_alt_freq:
                    transpar = pid
                    transpar_idx = pid_idx
                    transpar_gt = trio_gts[1]
                    ntranspar_idx = mid_idx
                    ntranspar_gt = trio_gts[2]
                else:
                    transpar = mid
                    transpar_idx = mid_idx
                    transpar_gt = trio_gts[2]
                    ntranspar_idx = pid_idx
                    ntranspar_gt = trio_gts[1]

                ## is more likely transmitting parent het at site?
                if transpar_gt != 1: continue

                ## is less likely transmitting parent hom ref at site?
                if ntranspar_gt != 0: continue

                ## if looking at trans variants, keep if pro is het at site
                ## if looking at ntrans variants, keep if pro is homref at site
                if args.ntrans == False:
                    if trio_gts[0] != 1: continue
                else:
                    if trio_gts[0] != 0: continue

                ## test if proband passes conditions in proband cnds file
                if pro_cnds.test_gt(vcf_variant, iid_idx) == False: continue

                ## test if more likely transmitting parent passes conditions
                ## in transpar cnds file
                if transpar_cnds.test_gt(vcf_variant, transpar_idx) == False:
                    continue

                ## test if less likely transmitting parent passes conditions
                ## in ntranspar cnds file
                if ntranspar_cnds.test_gt(vcf_variant, ntranspar_idx) == False:
                    continue

                trans_carriers.add((transpar, iid))

        else:
            trans_screen = screens.trans_screen
            trans_carriers = trans_screen(
                vcf_variant, samples_i, ntrans=args.ntrans,
                transpar_gts=set([1]), ntranspar_gts=set([0]),
                min_coverage=args.min_coverage,
                pro_min_perc_alt=args.pro_min_perc_alt,
                pro_max_perc_alt=args.pro_max_perc_alt,
                transpar_min_perc_alt=args.transpar_min_perc_alt,
                transpar_max_perc_alt=args.transpar_max_perc_alt,
                ntranspar_max_perc_alt=args.ntranspar_max_perc_alt,
                pro_homref_phredmin=args.pro_homref_phredmin,
                pro_het_phredmax=args.pro_het_phredmax,
                pro_homalt_phredmin=args.pro_homalt_phredmin)

        ## tally per-proband counts and write rows for carried variants,
        ## each carrier is a (parent iid, proband iid) pair
        if len(trans_carriers) > 0:
            for pair in trans_carriers:
                iid = pair[1]
                samples_i.samples[iid].varcounts[gt_class] += 1
            outs = cyvcf2_variant.variant_to_list(samples_i,
                                                  trans_carriers,
                                                  cyvcf2_vcf.info_subfields,
                                                  GT_VARNAMES,
                                                  csqs_maximpact=csqs_maximpact_list,
                                                  max_csq_scores=max_csq_scores,
                                                  min_csq_scores=min_csq_scores,
                                                  delim="\t")
            for out in outs:
                misc.append_out_file(args.out_tsv, out)

    vcf.close()
    samples_i.print_varcount_stats(var_types=[gt_class])
    return
def main(ARGS = None):
    """
    Extract compound het genotypes from a table of transmitted het calls.

    workflow
    1. read fam file, tsv of transmitted het variants
    2. for each proband, cluster qualifying variants by max annot gene
       and by parent of origin
    3. keep proband/gene pairs that have qualifying transmitted hets from
       exactly two distinct parents, and write the rows of the variants
       making up those genotypes to out_tsv

    ARGS : list of command line arg strs, sys.argv[1:] used if None
    """
    if ARGS is None:
        ARGS = sys.argv[1:]
    args = parse_args(ARGS)

    ## convert comma delim qualifying impacts to a set.
    ## NOTE(review): str_none_split presumably hands back None when the
    ## arg is None (confirm against misc); in that case no impact filter
    ## is applied instead of crashing inside set(None)
    qual_impacts = misc.str_none_split(args.qual_impacts, ",")
    if qual_impacts is not None:
        qual_impacts = set(qual_impacts)
    args.qual_impacts = qual_impacts

    ## read samples from fam file, send basic stats to stdout
    samples_i = Samples(args.in_fam)
    samples_i.print_stats()

    ## read het transmitted table into pandas DataFrame
    het_trans = pandas.read_csv(args.het_trans_tsv,
                                sep="\t", header=0)

    ## first pass of transmitted het calls, get qualifying var calls,
    ## carrying proband, and parent of origin.
    ## cpht_gts maps gene -> proband -> parent -> set of variant ids.
    ## row_keys remembers (proband, gene, variant id) of every row, in row
    ## order, so that the second pass does not have to rebuild them.
    cpht_gts = {}
    row_keys = []
    for i in range(het_trans.shape[0]):
        gene = str(het_trans.loc[i, args.gene_col])
        par = str(het_trans.loc[i, "PAR_IID"])
        pro = str(het_trans.loc[i, "PRO_IID"])
        impact = str(het_trans.loc[i, "IMPACT_maximpact"])
        var_id = "-".join([str(het_trans.loc[i, col])
                           for col in ("CHROM", "POS", "REF", "ALT")])
        row_keys.append((pro, gene, var_id))
        if qual_impacts is not None and impact not in qual_impacts:
            continue
        pro_pars = cpht_gts.setdefault(gene, {}).setdefault(pro, {})
        pro_pars.setdefault(par, set()).add(var_id)

    ## identify all gene-based instances of compound heterozygousity,
    ## i.e. qualifying variants in the gene coming from two parents
    cpht_pro_gene = set()
    for gene in cpht_gts:
        for pro in cpht_gts[gene]:
            if len(cpht_gts[gene][pro]) == 2:
                cpht_pro_gene.add((pro, gene))

    ## second pass, select all variants that are part of cpht genotypes
    ## as variants to keep and to write to output tsv. Each row is kept
    ## at most once, even if its variant id is listed under both parents.
    i_keep = []
    for i, (pro, gene, var_id) in enumerate(row_keys):
        if (pro, gene) not in cpht_pro_gene:
            continue
        par_vars = cpht_gts[gene][pro]
        if any(var_id in par_vars[par] for par in par_vars):
            i_keep.append(i)

    ## write vars that are part of cpht genotypes to out_tsv
    het_trans = het_trans.loc[i_keep, :]
    het_trans.to_csv(path_or_buf=args.out_tsv,
                     sep="\t", header=True, index=False)
    return
Exemple #5
0
def _read_intervals(intervals_arg):
    """Return the list of interval strings the VCF iterator should visit.

    Args:
        intervals_arg: None, a path to an existing file holding one interval
            per line, or a single interval string.

    Returns:
        A list of interval strings. When `intervals_arg` is None the list is
        `[""]`, which main() uses to request a pass over the whole VCF.
    """
    if intervals_arg is None:
        return [""]
    if os.path.isfile(intervals_arg):
        # context manager so the handle is released as soon as we are done
        with open(intervals_arg, "r") as intervals_fh:
            return [line.rstrip() for line in intervals_fh]
    return [intervals_arg]


def _add_derived_info_headers(vcf, args):
    """Declare in the VCF header every INFO field that main() may write.

    The output is a VCF, so any INFO key added to the records must also be
    present in the header.
    """
    if args.max_impact_csqs is not None:
        for csq_name in args.max_impact_csqs:
            # NOTE(review): 'Character' denotes a single character in the VCF
            # spec; these values look like multi-character strings, so
            # 'String' may be the better type -- confirm before changing.
            vcf.add_info_to_header({'ID': csq_name + "_maximpact",
                                    'Description': 'max ' + csq_name +
                                                   ' to go along ' +
                                                   'with transcripts with max IMPACT',
                                    'Type': 'Character',
                                    'Number': '1'})
    if args.max_csq_scores is not None:
        for csq_name in args.max_csq_scores:
            vcf.add_info_to_header({'ID': csq_name + "_max",
                                    'Description': 'max value for ' + csq_name +
                                                   ' along assessed transcripts ' +
                                                   'in CSQ field.',
                                    'Type': 'Float',
                                    'Number': '1'})
    if args.min_csq_scores is not None:
        for csq_name in args.min_csq_scores:
            vcf.add_info_to_header({'ID': csq_name + "_min",
                                    'Description': 'min value for ' + csq_name +
                                                   ' along assessed transcripts ' +
                                                   'in CSQ field.',
                                    'Type': 'Float',
                                    'Number': '1'})


def _annotate_max_min_csqs(vcf_variant, cyvcf2_variant, csq_keys, args):
    """Write the max-impact / min / max CSQ values of one variant to its INFO.

    Args:
        vcf_variant: the cyvcf2 variant record whose INFO is updated in place.
        cyvcf2_variant: the Cyvcf2Variant wrapper built around `vcf_variant`.
        csq_keys: annotation keys parsed from the VCF header.
        args: parsed command line arguments.
    """
    cyvcf2_variant.get_annot_txs(csq_keys,
                                 csq_subfield=args.annotation_subfield)
    if args.annotation_subfield == "ANN":
        impact_subfield = "Annotation_Impact"
    else:
        impact_subfield = "IMPACT"
    res = cyvcf2_variant.maxmin_csqs(
        csq_subfield=args.annotation_subfield,
        impact_subfield=impact_subfield,
        max_impact_csqs=args.max_impact_csqs,
        max_csq_scores=args.max_csq_scores,
        min_csq_scores=args.min_csq_scores)
    (csqs_maximpact_list, max_csq_scores, min_csq_scores) = res

    # Indexing (rather than zip) is deliberate: a result list shorter than
    # the requested names should fail loudly instead of being truncated.
    if args.max_impact_csqs is not None:
        for i, csq_name in enumerate(args.max_impact_csqs):
            vcf_variant.INFO[csq_name + "_maximpact"] = csqs_maximpact_list[i]
    if args.min_csq_scores is not None:
        for i, csq_name in enumerate(args.min_csq_scores):
            vcf_variant.INFO[csq_name + "_min"] = float(min_csq_scores[i])
    if args.max_csq_scores is not None:
        for i, csq_name in enumerate(args.max_csq_scores):
            vcf_variant.INFO[csq_name + "_max"] = float(max_csq_scores[i])


def main(ARGS=None):
    """Filter the variants of an input VCF and write the survivors to a VCF.

    Each variant is optionally screened for qualifying impacts, optionally
    annotated with max-impact / max / min CSQ values, and optionally tested
    against a variant conditions file. Counts of parsed and retained
    variants are printed at the end.

    Args:
        ARGS: list of command line arguments; defaults to `sys.argv[1:]`.
    """
    if ARGS is None:
        ARGS = sys.argv[1:]
    args = parse_args(ARGS)

    # convert certain comma delim str args to lists
    args.qual_impacts = misc.str_none_split(args.qual_impacts, ",")
    args.max_impact_csqs = misc.str_none_split(args.max_impact_csqs, ",")
    args.max_csq_scores = misc.str_none_split(args.max_csq_scores, ",")
    args.min_csq_scores = misc.str_none_split(args.min_csq_scores, ",")

    # read cnds file (optional)
    var_cnds = None
    if args.variant_cnds is not None:
        var_cnds = VcfCnds(args.variant_cnds)

    # init cyvcf2 VCF obj, get info subfields, header for output
    vcf = cyvcf2.VCF(args.in_vcf, strict_gt=True)
    cyvcf2_vcf = Cyvcf2Vcf(vcf)
    cyvcf2_vcf.get_info_subfields()
    if args.annotation_subfield == "ANN":
        cyvcf2_vcf.get_csq_keys(spliton="Functional annotations: ",
                                delim="|",
                                chars_del=[" ", "'", '"'],
                                ann_id=args.annotation_subfield)
    else:
        cyvcf2_vcf.get_csq_keys(spliton="Format: ",
                                delim="|",
                                ann_id=args.annotation_subfield)
    # The returned header is not used here; the call is retained because
    # any side effects it has on cyvcf2_vcf are not visible from this block.
    cyvcf2_vcf.header_to_list(
        gt_varnames=GT_VARNAMES,
        max_impact=args.max_impact,
        max_impact_csqs=args.max_impact_csqs,
        max_csq_scores=args.max_csq_scores,
        min_csq_scores=args.min_csq_scores,
        delim="\t")

    # new INFO items must be declared before the writer copies the header
    _add_derived_info_headers(vcf, args)

    # init VCF writer object
    w = cyvcf2.Writer(args.out_vcf, vcf)

    prev_chrom = None
    n_var = 0
    n_var_keep = 0
    try:
        # if intervals provided, parse over those, else whole vcf
        intervals = _read_intervals(args.intervals)

        # iterate through all variants, applying each filter in turn
        for vcf_variant in cyvcf2_vcf.iterator(intervals):
            n_var += 1
            cyvcf2_variant = Cyvcf2Variant(vcf_variant)

            if vcf_variant.CHROM != prev_chrom:
                print("Extracting variants from chrom " + vcf_variant.CHROM)
                prev_chrom = vcf_variant.CHROM

            # assume single allele per site, exclude sites with call as '*'
            alt = vcf_variant.ALT[0]
            if alt == '*':
                continue

            # Explicit `== False` / `== True` comparisons are kept on
            # purpose: the helpers' return types are not visible here and a
            # truthiness test could treat None differently.

            # if no qualifying impact str found in CSQ, skip
            if args.qual_impacts is not None:
                res = cyvcf2_variant.qual_impacts_screen(
                    args.qual_impacts, csq_subfield=args.annotation_subfield)
                if res == False:
                    continue

            # if desired, derive max impact annots from var, along with
            # other user defined max or min scores in CSQ for variant
            if args.max_impact == True:
                _annotate_max_min_csqs(vcf_variant, cyvcf2_variant,
                                       cyvcf2_vcf.csq_keys, args)

            # filter on variant cnds file, only when one was provided
            if var_cnds is not None:
                if var_cnds.test_variant(vcf_variant) == False:
                    continue

            # if variant survives filters, retain record
            w.write_record(vcf_variant)
            n_var_keep += 1
    finally:
        # always release the writer and reader, even if a variant fails
        w.close()
        vcf.close()

    # print basic stats on number of input variants, number of
    # variants to keep
    print("Number of variants in parent VCF : " + str(n_var))
    print("Number of variants retained post-filtration : " + str(n_var_keep))

    return