예제 #1
0
def main():
    """Print one FORMAT value per sample for every record of a VCF file.

    The first output line holds the tab-separated sample names; each
    following line holds, for one record, the value of the FORMAT tag
    selected with --formatTag (default GT) for every sample, in the same
    order.

    Command line: [options] file.vcf
    """
    usage = "usage: %prog [options] file.vcf \n output format values from  genotype data field  in a VCF  for suitabale plotting/dataviz"
    parser = OptionParser(usage)
    # --includeRef and --includeFilter are accepted but not consulted below;
    # they stay so that existing command lines keep parsing.
    parser.add_option("--includeRef", action="store_true", dest="includeRef", help="include sites in the set ReferenceInAll", default=False)
    parser.add_option("--includeFilter", action="store_true", dest="includeFilter", help="include site filtered or not!", default=False)
    parser.add_option("--formatTag", dest="format", default="GT", help="format tag to compare (default GT)")
    (options, args) = parser.parse_args()
    if not args:
        # Fail with the usage text rather than a bare IndexError.
        parser.error("missing VCF file argument")
    vcfilename = args[0]

    vcfobj = VcfFile(vcfilename)
    # The context manager closes the input even if parsing fails midway.
    with open(vcfilename, 'r') as vcfh:
        vcfobj.parseMetaAndHeaderLines(vcfh)

        samples = vcfobj.getSampleList()
        print("\t".join(samples))
        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            values = [geno_obj.getFormatVal(options.format)
                      for (_sample, geno_obj) in vrec.zipGenotypes(samples)]
            print("\t".join(values))
예제 #2
0
def main():
    """Split discordant records of an NRD log VCF by discordance count.

    Samples are taken two at a time from each record (comparison genotype
    first, evaluation genotype second).  Every pair whose genotype types
    differ adds one to the record's NRD count and its details to the
    output line.  The line is then written to
    <basename>.allgenos.nrd.txt (count 3), <basename>.twogenos.nrd.txt
    (count 2) or <basename>.onegenos.nrd.txt (count 1); records with any
    other count are not written.

    Command line: [options] nrd.log.vcf
    """
    usage = "usage: %prog [options]  nrd.log.vcf\n"
    parser = OptionParser(usage)

    (options, args) = parser.parse_args()
    if not args:
        # Fail with the usage text rather than a bare IndexError.
        parser.error("missing VCF file argument")
    vcfilename = args[0]
    basename = os.path.splitext(vcfilename)[0]

    vcfobj = VcfFile(vcfilename)
    # One with-statement owns all four handles so that every file is
    # flushed and closed, including when a record fails to parse.
    with open(vcfilename, "r") as vcfh, \
            open(basename + ".allgenos.nrd.txt", "w") as nrdallfh, \
            open(basename + ".twogenos.nrd.txt", "w") as nrdtwofh, \
            open(basename + ".onegenos.nrd.txt", "w") as nrdonefh:
        # NRD count -> destination; the input is a trio, so 1, 2 or 3
        # discordant pairs are the counts of interest.
        destinations = {3: nrdallfh, 2: nrdtwofh, 1: nrdonefh}

        vcfobj.parseMetaAndHeaderLines(vcfh)
        samples = vcfobj.getSampleList()

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            # Which callset does the site belong to?
            setname = vrec.returnInfoDict()["set"]
            outputline = [[vrec.getPos()], [setname]]

            nrd_count = 0
            for (compare, evaluated) in grouper(2, vrec.zipGenotypes(samples)):
                (comp_allele1, comp_allele2) = compare[1].getAlleles()
                (eval_allele1, eval_allele2) = evaluated[1].getAlleles()
                eval_alleletype = typeofGenotype(eval_allele1, eval_allele2)
                comp_alleletype = typeofGenotype(comp_allele1, comp_allele2)
                if eval_alleletype == comp_alleletype:
                    continue
                outputline.append([evaluated[0], str(eval_alleletype), compare[0], str(comp_alleletype)])
                nrd_count += 1

            output = "\t".join(melt_lol(outputline))
            destination = destinations.get(nrd_count)
            if destination is not None:
                destination.write(output + "\n")
예제 #3
0
def main():
    """Report per-sample genotype call rates for a VCF file.

    Prints the meta lines (with an added CR INFO header) and the header
    line, then one line per sample: sample name, number of called
    genotypes, number of records analysed, and the call rate.  With
    --filter, only records whose FILTER value equals the given string are
    analysed.  When no record is analysed the call rate is undefined and
    is printed as nan.

    Command line: [options] file.vcf
    """
    usage = "usage: %prog [options] file.vcf"
    parser = OptionParser(usage)
    parser.add_option("--filter", type="string", dest="filter", help="analyze only those  records matching filter (default is None)", default=None)
    # --info and --type are accepted but not consulted below; they stay so
    # that existing command lines keep parsing.
    parser.add_option("--info", type="string", dest="infotag", help="INFO tag id that annotates what type of variant the VCF record is", default="TYPE")
    parser.add_option("--type", type="string", dest="variantype", help="type of variant (SNP INS DEL)", default=None)
    (options, args) = parser.parse_args()
    if not args:
        # Fail with the usage text rather than a bare IndexError.
        parser.error("missing VCF file argument")

    vcfilename = args[0]
    vcfobj = VcfFile(vcfilename)

    # The context manager closes the input even if parsing fails midway.
    with open(vcfilename, 'r') as vcfh:
        # Parse and echo the meta-information lines (the ones starting with ##).
        vcfobj.parseMetaLines(vcfh)
        vcfobj.addMetaInfoHeader("CR", "D", 1, "site call rate")
        vcfobj.printMetaLines()

        # Rewind so the header line can be parsed from the top of the file.
        vcfh.seek(0)
        vcfobj.parseHeaderLine(vcfh)
        vcfobj.printHeaderLine()

        samplelist = vcfobj.getSampleList()
        # sample name -> number of called genotypes
        sampleCalls = dict.fromkeys(samplelist, 0)

        totalrecords = 0
        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            if options.filter is not None and vrec.getFilter() != options.filter:
                continue
            totalrecords += 1
            sitecallrate = vrec.siteCallrate()
            vrec.appendInfoString("CR=" + str(sitecallrate))
            vrec.sampleCallrate(samplelist, sampleCalls)

    for s in samplelist:
        if totalrecords:
            callrate = float(sampleCalls[s]) / float(totalrecords)
        else:
            # Nothing was analysed (empty file or no record passed the
            # filter): the rate is undefined, so report nan instead of
            # dying with ZeroDivisionError.
            callrate = float("nan")
        print(" ".join(str(value) for value in (s, sampleCalls[s], totalrecords, callrate)))
예제 #4
0
def main():
    """Write a VCF restricted to the samples listed in a pedigree file.

    Prints the list of retained sample names, the header rewritten for
    those samples, and then every record with only the retained samples'
    genotypes.

    Command line: -ped file.ped file.vcf.gz
    """
    parser = argparse.ArgumentParser(description='Given a gzipped vcf file and pedigree file, generate a new vcf with only those samples present in the pedigree (ped file) ')
    parser.add_argument('-ped', dest='pedfile', type=str, help="*.ped file")
    parser.add_argument('vcfile', type=str, help='*.vcf.gz file')

    args = parser.parse_args()
    if args.pedfile is None:
        # Without this the failure surfaces inside Pedfile with no hint
        # that the option was simply omitted.
        parser.error("a pedigree file must be given with -ped")

    # Parse the ped file and collect the individual ids to keep.
    pedobj = Pedfile(args.pedfile)
    pedobj.parsePedfile()
    # A set makes the per-genotype membership test constant time.
    # NOTE(review): assumes the ids returned are hashable (e.g. strings).
    keep = set(pedobj.returnIndivids())

    vcfobj = VcfFile(args.vcfile)
    # The context manager closes the gzip stream even if parsing fails midway.
    with gzip.open(args.vcfile, 'r') as vcfh:
        vcfobj.parseMetaAndHeaderLines(vcfh)
        samples = vcfobj.getSampleList()
        newsamples = [s for s in samples if s in keep]

        print(newsamples)

        vcfobj.setSampleList(newsamples)
        print(vcfobj.returnHeader())

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            # Genotypes are zipped against the ORIGINAL sample order, then
            # filtered down to the retained samples.
            keepGenotypes = [genObj
                             for (s, genObj) in vrec.zipGenotypes(samples)
                             if s in keep]
            vrec.addGenotypeList(keepGenotypes)
            print(vrec.toStringwithGenotypes())
예제 #5
0
def main():
    """Remove the named samples from a gzipped VCF file.

    Prints the header rewritten without the removed samples, followed by
    every record with the removed samples' genotypes dropped.

    Command line: sample [sample ...] -vcf file.vcf.gz
    """
    parser = argparse.ArgumentParser(description="remove samples from vcf file")
    parser.add_argument("removesamples", metavar="sample", type=str, nargs="+", help="sample names to remove")
    parser.add_argument("-vcf", dest="vcfile", type=str, help="vcf file to remove samples from")
    args = parser.parse_args()
    if args.vcfile is None:
        # Otherwise None reaches gzip.open and fails with an unrelated error.
        parser.error("a VCF file must be given with -vcf")

    # A set makes the per-genotype membership test constant time.
    unwanted = set(args.removesamples)

    vcfobj = VcfFile(args.vcfile)
    # The context manager closes the gzip stream even if parsing fails midway.
    with gzip.open(args.vcfile, "r") as vcfh:
        vcfobj.parseMetaAndHeaderLines(vcfh)

        samples = vcfobj.getSampleList()
        newsamples = [s for s in samples if s not in unwanted]
        vcfobj.setSampleList(newsamples)
        print(vcfobj.returnHeader())

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            # Genotypes are zipped against the ORIGINAL sample order, then
            # the unwanted samples are filtered out.
            keepGenotypes = [genObj
                             for (s, genObj) in vrec.zipGenotypes(samples)
                             if s not in unwanted]
            vrec.addGenotypeList(keepGenotypes)
            print(vrec.toStringwithGenotypes())
def main():
    """Compute genotype concordance, NRS and NRD for a CombineVariants VCF.

    Samples are read two at a time: the first of each pair is the
    comparison genotype, the second the evaluation genotype.  Genotype
    types index a 4x4 concordance table (rows are evaluation, columns are
    comparison) labelled AA, AB, BB and ./. for indices 0 to 3.

    Files written, all named after the VCF basename:
      .variantEval.txt   concordance table, called/no-call table, NRD, NRS
      .nrs.log           records contributing to the NRS penalty
      .nrd.log           records contributing to the NRD penalty
      .concordance.log   records with identical called genotypes
      .filtered.log      records skipped because they are filtered
      .multiallelic.log  records with more than one ALT allele
      .fields.log        the 'set' value of every analysed record

    NRD and NRS are written as NA when their denominator is zero.

    Command line: [options] file.vcf.gz
    """
    usage = "usage: %prog [options] file.vcf.gz \n calcuate NRS and NRD on a vcf generated from CombineVariants --genotypemergeoption UNIQUIFY\n"
    parser = OptionParser(usage)

    parser.add_option("--matrixonly", action="store_true", dest="matrixonly", help="only print concordance matrixe", default=False)
    parser.add_option("--includeRef", action="store_true", dest="includeRef", help="include sites in the set ReferenceInAll", default=False)
    parser.add_option("--includeFilter", action="store_true", dest="includeFilter", help="include site filtered or not!", default=False)
    (options, args) = parser.parse_args()
    if not args:
        # Fail with the usage text rather than a bare IndexError.
        parser.error("missing VCF file argument")

    vcfilename = args[0]
    # Two splitext calls strip the last two extensions (e.g. .gz then .vcf).
    basename = os.path.splitext(os.path.splitext(vcfilename)[0])[0]

    # Row is eval, column is comparison.
    concordancetable = np.matrix([[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]])
    calledtable = np.matrix([[0, 0], [0, 0]])

    called_types = (0, 1, 2)    # AA, AB, BB
    variant_types = (1, 2)      # AB, BB
    set_pattern = re.compile(r';set=(\S+)')

    handles = []  # every handle opened here; all are closed in the finally below

    def open_log(*suffix):
        """Open <basename>.<suffix...> for writing and remember the handle."""
        fh = open(".".join((basename,) + suffix), 'w')
        handles.append(fh)
        return fh

    def write_table(fh, table, labels):
        """Write one labelled, tab-separated line per row of an np.matrix."""
        for (i, label) in enumerate(labels):
            # Slicing an np.matrix row gives a nested list, hence the inner loop.
            for r in table[i, :].tolist():
                fh.write(label + "\t" + "\t".join(map(str, r)) + "\n")

    def percent(numerator, denominator):
        """Return the ratio as a percentage rounded to 2 places, or 'NA'."""
        if not denominator:
            return 'NA'
        return round(float(numerator) / float(denominator) * 100, 2)

    try:
        # The summary file is the basename with a variantEval.txt suffix.
        outputfh = open_log('variantEval', 'txt')
        nrsfh = open_log('nrs', 'log')
        nrdfh = open_log('nrd', 'log')
        filteredfh = open_log('filtered', 'log')
        multifh = open_log('multiallelic', 'log')
        concordancefh = open_log('concordance', 'log')
        fieldsfh = open_log('fields', 'log')
        fieldsfh.write('set' + "\n")

        vcfobj = VcfFile(vcfilename)
        vcfh = gzip.open(vcfilename, 'r')
        handles.append(vcfh)

        vcfobj.parseMetaAndHeaderLines(vcfh)
        header = vcfobj.returnHeader() + "\n"
        for fh in (nrsfh, nrdfh, filteredfh, concordancefh, multifh):
            fh.write(header)

        samples = vcfobj.getSampleList()

        # One evaluator per (comparison, evaluation) sample pair.
        vcf_sample_eval_objects = [VcfSampleEval(compare, evaluated, basename)
                                   for (compare, evaluated) in grouper(2, samples)]
        for evalObj in vcf_sample_eval_objects:
            evalObj.writeHeaders(header)

        totalrecords = 0
        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            info = vrec.getInfo()
            record_line = vrec.toStringwithGenotypes() + "\n"

            # Multi-allelic records are logged but still analysed.  The old
            # test was written `',' in alt > 1`, a chained comparison that
            # only behaved because Python 2 orders str above int.
            if ',' in vrec.getAlt():
                multifh.write(record_line)

            # Skip homozygous-reference sites unless asked to include them.
            if 'ReferenceInAll' in info and not options.includeRef:
                continue

            # Filtered sites are logged and skipped.
            if ('filterIn' in info and not options.includeFilter) or 'FilteredInAll' in info:
                filteredfh.write(record_line)
                continue

            # [(samplename, genotype), ...] regrouped into
            # (comparison, evaluation) pairs, parallel to the evaluators.
            genotype_pairs = list(grouper(2, vrec.zipGenotypes(samples)))

            # Which set is the record in?  Every record reaching this
            # point is expected to carry a ';set=' annotation.
            field = set_pattern.search(info).groups()[0]
            fieldsfh.write(field + "\n")
            totalrecords += 1

            # Both sequences are short finite lists, so builtin zip pairs
            # them exactly as izip did.
            for ((compare, evaluated), evalObj) in zip(genotype_pairs, vcf_sample_eval_objects):
                (comp_allele1, comp_allele2) = compare[1].getAlleles()
                (eval_allele1, eval_allele2) = evaluated[1].getAlleles()

                eval_alleletype = typeofGenotype(eval_allele1, eval_allele2)
                comp_alleletype = typeofGenotype(comp_allele1, comp_allele2)

                # Increment the cell count.
                concordancetable[eval_alleletype, comp_alleletype] += 1
                evalObj.incrementcellcount(eval_alleletype, comp_alleletype)

                # NRS penalty: comparison is variant, evaluation is
                # hom-ref (0) or no-call (3).
                if eval_alleletype in (0, 3) and comp_alleletype in variant_types:
                    nrsfh.write(record_line)
                    evalObj.writeNrs(record_line)

                # Both called: identical types are concordant, any other
                # called combination contributes to the NRD penalty.
                if eval_alleletype in called_types:
                    if comp_alleletype == eval_alleletype:
                        concordancefh.write(record_line)
                        evalObj.writeConcordance(record_line)
                    elif comp_alleletype in called_types:
                        nrdfh.write(record_line)
                        evalObj.writeNrd(record_line)

        for evalObj in vcf_sample_eval_objects:
            evalObj.writeEvalOutput()

        outputfh.write("total records analyzed: " + str(totalrecords) + "\n")

        outputfh.write("rows are eval genotypes columns comparison genotypes\n")
        outputfh.write("\t".join(['', 'AA', 'AB', 'BB', './.']) + "\n")
        write_table(outputfh, concordancetable, ['AA', 'AB', 'BB', './.'])
        outputfh.write("matrix sum: \n")
        outputfh.write(str(np.sum(concordancetable)) + "\n")

        # Collapse to called vs. not called.
        calledtable[0, 0] = concordancetable[0:3, 0:3].sum()
        calledtable[0, 1] = concordancetable[0:3, 3].sum()
        calledtable[1, 0] = concordancetable[3, 0:3].sum()
        calledtable[1, 1] = concordancetable[3, 3]
        outputfh.write("\n")
        outputfh.write("rows are eval genotypes columns comparison genotypes\n")
        outputfh.write("\t".join(['', 'called', './.']) + "\n")
        write_table(outputfh, calledtable, ['called', './.'])
        outputfh.write("matrix sum: \n")
        outputfh.write(str(np.sum(calledtable)) + "\n")

        outputfh.write("\n")

        if not options.matrixonly:
            called_sum = concordancetable[0:3, 0:3].sum()
            # Off-diagonal cells of the called 3x3 block.
            discordance = called_sum - (concordancetable[0, 0] + concordancetable[1, 1] + concordancetable[2, 2])
            # NRD denominator: every called cell except hom-ref/hom-ref.
            total = called_sum - concordancetable[0, 0]
            nrd = percent(discordance, total)

            # Variant in both call sets.
            variant_count_evaluation = concordancetable[1:3, 1:3].sum()
            # Variant in the comparison set, whatever the evaluation call.
            variant_count_comparison = concordancetable[:, 1:3].sum()
            nrs = percent(variant_count_evaluation, variant_count_comparison)

            outputfh.write("NRD: " + str(nrd) + " \n")
            outputfh.write("NRS " + str(nrs) + " \n")
    finally:
        # Flush and close everything, including after a failure midway.
        for fh in handles:
            fh.close()
예제 #7
0
def main():
    """Summarise genotype counts and substitution statistics of a VCF.

    Prints, for every sample, the number of homozygous-reference,
    heterozygous, homozygous-non-reference and no-call genotypes plus
    their total; the transition and transversion counts and their ratio;
    the share of each unordered single-nucleotide REF/ALT pair; and the
    number of records analysed.  Files ending in .gz are read through
    gzip.  With --filter, only records whose FILTER value equals the
    given string are analysed.

    Command line: [options] file.vcf
    """
    usage = "usage: %prog [options] file.vcf"
    parser = OptionParser(usage)
    parser.add_option("--filter", type="string", dest="filter", help="analyze only those records with matching filter")

    (options, args) = parser.parse_args()
    if not args:
        # Fail with the usage text rather than a bare IndexError.
        parser.error("missing VCF file argument")

    vcfilename = args[0]
    opener = gzip.open if vcfilename.endswith(".gz") else open

    vcfobj = VcfFile(vcfilename)

    TsTv_counter = collections.Counter()
    RefAlt_counter = collections.Counter()
    genotype_dict = {}  # sample -> [homoz_ref, het, homoz_nonref, nocall]
    counter = 0

    vcfh = opener(vcfilename, 'r')
    try:
        vcfobj.parseMetaAndHeaderLines(vcfh)

        samples = vcfobj.getSampleList()
        for s in samples:
            genotype_dict[s] = [0, 0, 0, 0]

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            if options.filter is not None and vrec.getFilter() != options.filter:
                sys.stderr.write("skipped filter..\n")
                continue
            counter += 1

            if vrec.getAlt() == ".":
                continue
            ref = vrec.getRef()
            alt_alleles = vrec.getAlt().split(',')

            if len(alt_alleles) > 1:
                sys.stderr.write("multi alleleic record\n")

            for alt in alt_alleles:
                # Indel alleles are unbounded in number, so only single
                # nucleotide substitutions are tallied.
                if len(alt) == 1 and len(ref) == 1:
                    if isTransition(ref, alt):
                        TsTv_counter['transition'] += 1
                    else:
                        TsTv_counter['transversion'] += 1
                    RefAlt_counter[" ".join([ref, alt])] += 1

            for (g, sample) in get_genotype_counts(vrec.zipGenotypes(samples)):
                if g is None:
                    sys.stderr.write("skipped genotype\n")
                    continue
                genotype_dict[sample][g] += 1
    finally:
        vcfh.close()

    print("")
    print(" ".join(['sample', 'homoz_ref', 'het', 'homoz_nonref', 'nocall', 'total']))
    for sample in genotype_dict:
        counts = genotype_dict[sample]
        print(" ".join([sample, " ".join(map(str, counts)), str(sum(counts))]))

    print("")

    for (kind, count) in TsTv_counter.items():
        print(" ".join([str(kind), str(count)]))

    transversions = TsTv_counter['transversion']
    if transversions:
        TsTvratio = float(TsTv_counter['transition']) / float(transversions)
        print("TsTv:  " + str(round(TsTvratio, 2)))
    else:
        # Without transversions the ratio is undefined.  Report that and
        # carry on, instead of losing the rest of the report to a
        # ZeroDivisionError.
        sys.stderr.write("no transversions observed; Ts/Tv ratio is undefined\n")
        print("TsTv:  NA")

    refalt_total = sum(RefAlt_counter.values())  # loop-invariant denominator
    totalpercent = 0
    for a1, a2 in combinations('ACGT', 2):
        # Both directions of the substitution count toward the pair.
        pair_total = RefAlt_counter[' '.join([a1, a2])] + RefAlt_counter[' '.join([a2, a1])]
        if refalt_total == 0:
            sys.stderr.write(" integer division or modulo by zero\n")
            continue
        percent = round(float(pair_total) / float(refalt_total), 4)
        print(' '.join([' '.join([a1, a2]), str(pair_total), str(percent)]))
        totalpercent += percent
    print(" ".join([str(refalt_total), str(totalpercent)]))

    print("Total vcf records: " + str(counter) + "\n")
예제 #8
0
def main():
    """Print the records of a gzipped VCF that match per-sample genotypes.

    Each -gt value is a "sample genotype" pair in one argument, for
    example -gt "NA12878 0/1".  Several genotypes given for the same
    sample are alternatives (logical or); conditions on different samples
    must all hold (logical and).  With no -gt option every record is
    printed.  The header is printed first unless --no-header is given.

    Command line: file.vcf.gz [--no-header] [-gt "sample genotype" ...]
    """
    parser = argparse.ArgumentParser(description='filter records  based on genotypes')

    parser.add_argument('vcf', metavar='vcf', type=str,
                        help='vcf.gz file')
    # http://stackoverflow.com/a/15008806/1735942
    parser.add_argument('--no-header', dest='header', action='store_false')
    parser.add_argument('-gt', metavar='gt', type=str, nargs='*', action='append',
                        help='sample 0/0')

    args = parser.parse_args()

    # sample -> list of accepted GT strings.  args.gt is None when -gt was
    # never given, and otherwise a list of lists (append + nargs='*').
    gt_dict = defaultdict(list)
    for group in (args.gt or []):
        for spec in group:
            fields = spec.split(' ')
            if len(fields) != 2:
                parser.error("-gt expects 'sample genotype' in one argument, got: %r" % spec)
            (sample, genotype) = fields
            gt_dict[sample].append(genotype)

    vcfobj = VcfFile(args.vcf)
    # The context manager closes the gzip stream on every exit path,
    # including the sys.exit below.
    with gzip.open(args.vcf, 'r') as vcfh:
        vcfobj.parseMetaAndHeaderLines(vcfh)
        header = vcfobj.returnHeader()
        if args.header:
            print(header)
        samplelist = vcfobj.getSampleList()
        for s in gt_dict:
            if s not in samplelist:
                print(s + "  not in samples!\n")
                sys.exit(1)

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            # One boolean per filtered sample: does its GT equal any of
            # the genotypes requested for it?
            matches = [g.getFormatVal('GT') in gt_dict[s]
                       for (s, g) in vrec.zipGenotypes(samplelist)
                       if s in gt_dict]
            # Every sample filter must hold for the record to print.
            if all(matches):
                print(vrec.toStringwithGenotypes())
def main():
    """Annotate a VCF with allele counts observed in a BAM pileup.

    Given a VCF file and a BAM file containing the sample(s) in the VCF,
    adds RA/AA INFO tags (total reference/alternate allele observations)
    and RA/AA FORMAT values (per-sample observations) and prints the new
    VCF.  Reads are assigned to samples through the SM field of their
    read group.  Alleles that do not occur among the piled-up bases
    (including multi-base or multi-allelic REF/ALT strings) count as 0.

    Duplicate-marked reads are excluded unless --includeDuplicates is
    given.  A record is printed only when the pileup yields the column it
    is matched against, so records with no such column are absent from
    the output.

    Command line: [option] file.vcf.gz --bam file.bam
    """
    usage = "usage: %prog [option] file.vcf.gz"
    parser = OptionParser(usage)
    parser.add_option("--bam", type="string", dest="bam", default=None, help="bam file to perform pileup on")
    parser.add_option(
        "--mapq",
        type="float",
        dest="mapq",
        default=0.0,
        help="Exclude alignments from analysis if they have a mapping less than mapq (default is 0)",
    )
    parser.add_option(
        "--bq",
        type="float",
        dest="bq",
        default=0.0,
        help="Exclude bases from analysis if their supporting base quality is less that --bq (default is 0)",
    )
    # This was store_false with no default, tested with `== False`: the
    # unset value None kept duplicates and passing the flag dropped them,
    # the opposite of the help text.
    parser.add_option(
        "--includeDuplicates",
        action="store_true",
        dest="duplicate",
        default=False,
        help="include duplicate marked reads in analysis (turned off by default) ",
    )
    (options, args) = parser.parse_args()
    if options.bam is None:
        sys.stderr.write("please provide a value to --bam option\n")
        sys.exit(1)
    if not args:
        # Fail with the usage text rather than a bare IndexError.
        parser.error("missing VCF file argument")

    vcfilename = args[0]
    bamfilename = options.bam

    ra_formatline = FormatLine("RA", number="1", type="Integer", description="number of reference alleles observed")
    aa_formatline = FormatLine("AA", number="1", type="Integer", description="number of alternate alleles observed")

    if not os.path.exists(bamfilename + ".bai"):
        sys.stderr.write("please check for existence of bam index file (*.bai)\n")
        sys.exit(1)

    vcfobj = VcfFile(vcfilename)
    vcfh = gzip.open(vcfilename, "r")
    pybamfile = None
    try:
        vcfobj.parseMetaAndHeaderLines(vcfh)
        vcfobj.addMetaFormatHeader(ra_formatline)
        vcfobj.addMetaFormatHeader(aa_formatline)
        vcfobj.addMetaInfoHeader("RA", "Integer", "1", "total number of reference alleles observed")
        vcfobj.addMetaInfoHeader("AA", "Integer", "1", "total number of alternate alleles observed")
        print(vcfobj.returnHeader())

        pybamfile = pysam.Samfile(bamfilename, "rb")
        # read group ID -> sample name
        readgroupdict = dict((rg["ID"], rg["SM"]) for rg in pybamfile.header["RG"])

        samples = vcfobj.getSampleList()

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            chrom = vrec.getChrom()
            end = int(vrec.getPos())
            start = end - 1
            ref = vrec.getRef()
            alt = vrec.getAlt()

            for pileupcolumn in pybamfile.pileup(chrom, start, end):
                # NOTE(review): the column is matched on `end` and the base
                # read at `qpos - 1`.  pysam documents column positions as
                # 0-based, which would make `start` the variant column, and
                # `qpos - 1` wraps to the last base when qpos is 0 — confirm
                # against the pysam version in use before changing.
                if pileupcolumn.pos != end:
                    continue

                # Counters return 0 for bases never seen, so lower-case or
                # ambiguity codes in reads, and REF/ALT strings that are not
                # a single observed base, no longer raise KeyError.
                seqdict = Counter()
                sampledict = dict((s, Counter()) for s in samples)

                for pileupread in pileupcolumn.pileups:
                    alignment = pileupread.alignment
                    if alignment.is_duplicate and not options.duplicate:
                        continue
                    if alignment.mapq < options.mapq:
                        continue
                    # Qualities are decoded with an offset of 33.
                    if (ord(alignment.qual[pileupread.qpos - 1]) - 33) < options.bq:
                        continue
                    base = alignment.seq[pileupread.qpos - 1]
                    seqdict[base] += 1
                    sample = readgroupdict[dict(alignment.tags)["RG"]]
                    sampledict[sample][base] += 1

                vrec.addInfo("RA=" + str(seqdict[ref]))
                if alt != ".":
                    vrec.addInfo("AA=" + str(seqdict[alt]))

                for (sample, vcfgenobj) in vrec.zipGenotypes(samples):
                    sample_counts = sampledict[sample]
                    if not sample_counts:
                        # No usable read for this sample at the site.
                        vcfgenobj.addFormat("RA")
                        vcfgenobj.addFormat("AA")
                        continue
                    vcfgenobj.addFormatVal("RA", str(sample_counts[ref]))
                    vcfgenobj.addFormatVal("AA", str(sample_counts[alt]))

                print(vrec.toStringwithGenotypes())
    finally:
        # Release both inputs even when a record or read fails to process.
        if pybamfile is not None:
            pybamfile.close()
        vcfh.close()
def main():
    """Print VCF records whose genotypes fit an inheritance model.

    Affected (phenotype 2) and unaffected (phenotype 1) samples are read
    from the ped file given with --ped.

    dominant:  every affected sample must be segregating for a non-ref
               allele and every unaffected sample must not be.
    recessive: every affected sample must be homozygous non-ref and every
               unaffected sample must not be.

    The counts are compared with the number of affected/unaffected
    samples in the ped file, so a ped sample that is absent from the VCF
    prevents any record from matching.  Only records whose FILTER equals
    --filter (default PASS) are considered.  An unsupported --model is
    rejected once, before any input is read.

    Command line: [options] file.vcf.gz
    """
    usage = "usage: %prog [options] file.vcf.gz"
    parser = OptionParser(usage)
    parser.add_option("--model", type="string", dest="model", default = "dominant", help=" inheritance model [dominant|recessive], default is dominant ")
    parser.add_option("--ped", type="string", dest="pedfile", default=None, help="ped file of samples with phenotype (disease) status")
    parser.add_option("--filter", type="string", dest="filter", help="analyze only those  records matching filter (default is PASS)", default='PASS')

    (options, args) = parser.parse_args()
    if options.pedfile is None:
        sys.stderr.write("please provide a value to --ped parameter!\n")
        sys.exit(1)
    if options.model not in ('dominant', 'recessive'):
        # Previously this was reported once per genotype of every record
        # and the run still ended without selecting anything.
        sys.stderr.write(options.model + " not supported for genotype discrete filtering ...\n")
        sys.exit(1)
    if not args:
        # Fail with the usage text rather than a bare IndexError.
        parser.error("missing VCF file argument")

    def fits_model(genotype):
        """Return the genotype test that the selected model is based on."""
        if options.model == 'dominant':
            return genotype.isSegregating()
        return genotype.isNonRefHomz()

    pedobjects = []  # one Ped object per line of the ped file
    with open(options.pedfile, 'r') as pedfh:
        for line in pedfh:
            if not line.strip():
                continue  # tolerate blank lines
            fields = line.strip().split('\t')
            (fid, iid, pid, mid, sex, phenotype) = fields[0:6]
            pedobjects.append(Ped(fid, iid, pid, mid, sex, int(phenotype)))

    # The phenotype is 2 for affected samples and 1 for unaffected ones:
    # http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml#ped
    affecteds = [pedobj.getid() for pedobj in pedobjects if pedobj.getpheno() == 2]
    unaffecteds = [pedobj.getid() for pedobj in pedobjects if pedobj.getpheno() == 1]
    # Sets give constant-time membership tests inside the record loop; the
    # lists are kept because their lengths are the pass thresholds.
    affected_set = set(affecteds)
    unaffected_set = set(unaffecteds)

    # A sample must not be listed as both affected and unaffected.
    if affected_set & unaffected_set:
        sys.stderr.write("check list of affected and unaffecteds for overlapping samples!\n")
        sys.exit(1)

    vcfilename = args[0]
    vcfobj = VcfFile(vcfilename)
    # The context manager closes the gzip stream even if parsing fails midway.
    with gzip.open(vcfilename, 'r') as vcfh:
        vcfobj.parseMetaAndHeaderLines(vcfh)
        header = vcfobj.returnHeader()
        samplelist = vcfobj.getSampleList()

        print(header)

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            if options.filter is not None and vrec.getFilter() != options.filter:
                continue

            affected_hits = 0      # affected samples that fit the model
            unaffected_clear = 0   # unaffected samples that do not
            for (sample, genotype) in vrec.zipGenotypes(samplelist):
                # The explicit comparisons with True/False are kept on
                # purpose: a result that is neither (e.g. None) counts for
                # neither group.
                # NOTE(review): confirm what the genotype tests return for
                # no-calls.
                if sample in affected_set and fits_model(genotype) == True:
                    affected_hits += 1
                if sample in unaffected_set and fits_model(genotype) == False:
                    unaffected_clear += 1

            # Candidate site: all affecteds fit and no unaffected does.
            if affected_hits == len(affecteds) and unaffected_clear == len(unaffecteds):
                print(vrec.toStringwithGenotypes())