def main():
    """Print one FORMAT value per sample for every record of a VCF file.

    The first output line holds the tab-separated sample names.  Each
    following line holds, for one VCF record, the value of the FORMAT tag
    selected with ``--formatTag`` (default ``GT``) for every sample, in the
    same order as the sample line.  Everything is written to standard output.

    Exits through ``parser.error`` (status 2) when no VCF file is given.
    """
    usage = "usage: %prog [options] file.vcf \n output format values from  genotype data field  in a VCF  for suitabale plotting/dataviz"
    parser = OptionParser(usage)
    # NOTE(review): --includeRef and --includeFilter are accepted but never read below.
    parser.add_option("--includeRef", action="store_true", dest="includeRef", help="include sites in the set ReferenceInAll", default=False)
    parser.add_option("--includeFilter", action="store_true", dest="includeFilter", help="include site filtered or not!", default=False)
    parser.add_option("--formatTag", dest="format", default="GT", help="format tag to compare (default GT)")
    (options, args) = parser.parse_args()
    if not args:
        # previously an IndexError with a bare traceback
        parser.error("please provide a VCF file")
    vcfilename = args[0]

    vcfobj = VcfFile(vcfilename)
    with open(vcfilename, 'r') as vcfh:
        vcfobj.parseMetaAndHeaderLines(vcfh)

        samples = vcfobj.getSampleList()
        print("\t".join(samples))
        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            # zipGenotypes pairs every sample name with its genotype object
            values = [geno_obj.getFormatVal(options.format)
                      for (_sample, geno_obj) in vrec.zipGenotypes(samples)]
            print("\t".join(values))
def main():
    """Add non-reference (non-variant) positions to a VCF file of variant positions.

    Given a bed file of non-variant intervals (``--bed``) and a 2bit file of
    the reference genome (``--tbf``), the program retrieves the reference
    allele of every base in an interval and prints a VCF data line with
    ref/ref genotypes for it.  After each interval it prints a single record
    from the segregating VCF file, then starts the loop again with the next
    interval.  It assumes that the input VCF is position sorted.

    To generate the non-segregating bed interval file, run the following
    program from bx-python:
        bed_subtract_basewise.py   reference_genome.bed  segregating.sites.bed
        bed_subtract_basewise.py   ~/software/Pgmsnp/PythonNotebook/simref.1.bed  Simulation1.segregating.bed

    NOTE(review): every generated record carries ``NS=3`` and exactly three
    ``0/0`` genotypes, regardless of how many samples the input VCF has.

    Exits with status 2 on missing arguments and status 1 when the 2bit file
    cannot be opened.
    """
    usage = "usage: %prog [options] file.vcf"
    parser = OptionParser(usage)
    parser.add_option("--bed", type="string", dest="bed", help="bed file with non-variant intervals")
    parser.add_option("--tbf", type="string", dest="tbf", help="2bit file of reference genome", default='/Users/amit/data/MySimulations/Simulation1/Reference/simref.1.2bit')

    (options, args) = parser.parse_args()
    if not args:
        parser.error("please provide the segregating VCF file")
    if options.bed is None:
        parser.error("please provide a value to the --bed option")
    segregatingVcf = args[0]

    sys.stderr.write("opening twobitfile...\n")
    try:
        twobit = bx.seq.twobit.TwoBitFile(open(options.tbf))
    except Exception as err:
        # The original swallowed this and later died with a NameError on
        # `twobit`; stop here with the real cause instead.
        sys.stderr.write("unable to open twobit file!\n")
        sys.stderr.write(str(err) + "\n")
        sys.exit(1)

    formatstring = "GT"
    with open(options.bed, 'r') as bedfh, open(segregatingVcf, 'r') as vcfh:
        vcfobj = VcfFile(segregatingVcf)
        vcfobj.parseMetaAndHeaderLines(vcfh)
        print(vcfobj.returnHeader())

        # One generator for the whole run; it keeps its place in vcfh.
        vcf_records = vcfobj.yieldVcfRecordwithGenotypes(vcfh)
        for (chrom, start, end) in yield_bedcoordinate(bedfh):
            for site_start in range(int(start), int(end)):
                site_end = site_start + 1
                refseq = twobit[chrom][site_start:site_end]
                # bed coordinates are 0-based, VCF POS is 1-based, hence site_end
                vrec = VcfRecord(chrom, str(site_end), '.', refseq, '.', '.', '.', 'NS=3')
                for _ in range(3):
                    vrec.addGenotype(VcfGenotype(formatstring, '0/0'))
                print(vrec.toStringwithGenotypes())

            # An interval that trails the last variant has no record left to
            # print; the original raised StopIteration at this point.
            segregating = next(vcf_records, None)
            if segregating is not None:
                print(segregating.toStringwithGenotypes())
def main():
    """Extract the VCF records whose INFO ``set=`` value equals ``-set``.

    Typically the VCF is derived from GATK CombineVariants, but any VCF with
    a ``set=`` INFO key can be examined.  The input may be plain or gzipped
    (detected by a ``.gz`` suffix).  Matching records are written, together
    with the header, to ``<basename>.<set>.vcf`` in the current directory,
    where ``<basename>`` is the input file name passed twice through
    ``return_file_basename``.

    Exits with status 1 when ``-set`` is not given.
    """
    parser = argparse.ArgumentParser(description=r' extract records with matching set=(\S+) tag')

    parser.add_argument('vcfile', metavar='vcfile', type=str, help='file.vcf.gz')
    parser.add_argument('-set', dest='set', type=str, default=None, help="name of set to extract")

    args = parser.parse_args()
    if args.set is None:
        sys.stderr.write("please provide value to -set option!\n")
        sys.exit(1)

    vcfile = os.path.basename(args.vcfile)
    basename = return_file_basename(return_file_basename(vcfile))
    sys.stderr.write(basename + "\n")

    outvcf = ".".join([basename, args.set, 'vcf'])
    sys.stderr.write(outvcf + "\n")

    # INFO is a ';'-separated key=value list, so the value stops at the next
    # ';'.  The old pattern ';set=(\S+)' missed a leading set= key and
    # swallowed every key that followed it.
    set_pattern = re.compile(r'(?:^|;)set=([^;\s]+)')

    if args.vcfile.endswith(".gz"):
        vcfh = gzip.open(args.vcfile, 'r')
    else:
        vcfh = open(args.vcfile, 'r')
    try:
        vcfobj = VcfFile(args.vcfile)
        vcfobj.parseMetaAndHeaderLines(vcfh)
        with open(outvcf, 'w') as outfh:
            outfh.write(vcfobj.returnHeader() + "\n")
            for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
                # search once per record (it used to run three times)
                match = set_pattern.search(vrec.getInfo())
                if match is not None and match.group(1) == args.set:
                    outfh.write(vrec.toStringwithGenotypes() + "\n")
    finally:
        vcfh.close()
def main():
    """Write a VCF restricted to the samples named in a pedigree file.

    Reads the individual ids from the ``-ped`` file, keeps the samples of the
    gzipped input VCF that appear among them (in VCF column order), and prints
    the header and every record with only those samples' genotypes to
    standard output.  The list of kept samples is reported on standard error.

    Exits through ``parser.error`` (status 2) when ``-ped`` is not given.
    """
    import sys  # local import: this block did not use sys before

    parser = argparse.ArgumentParser(description='Given a gzipped vcf file and pedigree file, generate a new vcf with only those samples present in the pedigree (ped file) ')
    parser.add_argument('-ped', dest='pedfile', type=str, help="*.ped file")
    parser.add_argument('vcfile', type=str, help='*.vcf.gz file')

    args = parser.parse_args()
    if args.pedfile is None:
        parser.error("please provide a value to the -ped option")

    # parse the pedfile and collect the individual ids to keep from the VCF file
    pedobj = Pedfile(args.pedfile)
    pedobj.parsePedfile()
    keep = set(pedobj.returnIndivids())  # set: O(1) membership per genotype

    vcfobj = VcfFile(args.vcfile)
    vcfh = gzip.open(args.vcfile, 'r')
    try:
        vcfobj.parseMetaAndHeaderLines(vcfh)
        samples = vcfobj.getSampleList()
        newsamples = [s for s in samples if s in keep]

        # Diagnostic only: it used to go to stdout, where it became the
        # first line of the generated VCF.
        sys.stderr.write(str(newsamples) + "\n")

        vcfobj.setSampleList(newsamples)
        print(vcfobj.returnHeader())

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            # genotypes are still in the order of the original sample list
            keepGenotypes = [genObj for (s, genObj) in vrec.zipGenotypes(samples)
                             if s in keep]
            vrec.addGenotypeList(keepGenotypes)
            print(vrec.toStringwithGenotypes())
    finally:
        vcfh.close()
def main():
    """Remove the named samples from a gzipped VCF file.

    The positional arguments are the sample names to drop; ``-vcf`` names the
    gzipped VCF.  The header and every record are printed to standard output
    with the remaining samples' genotypes only.

    Exits through ``parser.error`` (status 2) when ``-vcf`` is not given.
    """
    parser = argparse.ArgumentParser(description="remove samples from vcf file")
    parser.add_argument("removesamples", metavar="sample", type=str, nargs="+", help="sample names to remove")
    parser.add_argument("-vcf", dest="vcfile", type=str, help="vcf file to remove samples from")
    args = parser.parse_args()
    if args.vcfile is None:
        # previously gzip.open(None) failed with an unhelpful traceback
        parser.error("please provide a value to the -vcf option")

    dropped = set(args.removesamples)  # set: O(1) membership per genotype

    vcfobj = VcfFile(args.vcfile)
    vcfh = gzip.open(args.vcfile, "r")
    try:
        vcfobj.parseMetaAndHeaderLines(vcfh)

        samples = vcfobj.getSampleList()
        newsamples = [s for s in samples if s not in dropped]
        vcfobj.setSampleList(newsamples)
        print(vcfobj.returnHeader())

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            # genotypes are still in the order of the original sample list
            keepGenotypes = [genObj for (s, genObj) in vrec.zipGenotypes(samples)
                             if s not in dropped]
            vrec.addGenotypeList(keepGenotypes)
            print(vrec.toStringwithGenotypes())
    finally:
        vcfh.close()
def _write_count_matrix(outfh, table, labels):
    """Write *table* as a labelled tab-separated matrix followed by its sum."""
    outfh.write("rows are eval genotypes columns comparison genotypes\n")
    outfh.write("\t".join([''] + labels) + "\n")
    for (i, label) in enumerate(labels):
        # a row of an np.matrix converts to a nested list, hence the inner loop
        for row in table[i, :].tolist():
            outfh.write(label + "\t" + "\t".join(map(str, row)) + "\n")
    outfh.write("matrix sum: \n")
    outfh.write(str(np.sum(table)) + "\n")


def _percentage(numerator, denominator):
    """Return numerator/denominator as a percentage rounded to 2 places, or None if undefined."""
    if denominator == 0:
        return None
    return round(float(numerator) / float(denominator) * 100, 2)


def main():
    """Calculate NRS and NRD on a VCF produced by CombineVariants (UNIQUIFY).

    Samples are taken two at a time: the first of each pair is the comparison
    genotype, the second the evaluation genotype.  Every analysed record
    increments a 4x4 concordance matrix (rows: evaluation, columns:
    comparison) indexed by the value ``typeofGenotype`` returns (labelled
    AA, AB, BB, ./. in the report).

    Files written next to the input, named from its basename with two
    extensions stripped:
      * ``.variantEval.txt``   -- record count, concordance matrix,
                                  called/no-call matrix, NRD and NRS
      * ``.nrs.log``           -- records contributing to the NRS penalty
      * ``.nrd.log``           -- records contributing to the NRD penalty
      * ``.concordance.log``   -- concordant called genotypes
      * ``.filtered.log``      -- records skipped because they are filtered
      * ``.multiallelic.log``  -- records with more than one ALT allele
      * ``.fields.log``        -- the set= value of every analysed record

    Exits through ``parser.error`` (status 2) when no VCF is given or the
    sample count is odd.  NRD/NRS are reported as ``NA`` when their
    denominator is zero.
    """
    usage = "usage: %prog [options] file.vcf.gz \n calcuate NRS and NRD on a vcf generated from CombineVariants --genotypemergeoption UNIQUIFY\n"
    parser = OptionParser(usage)

    parser.add_option("--matrixonly", action="store_true", dest="matrixonly", help="only print concordance matrixe", default=False)
    parser.add_option("--includeRef", action="store_true", dest="includeRef", help="include sites in the set ReferenceInAll", default=False)
    parser.add_option("--includeFilter", action="store_true", dest="includeFilter", help="include site filtered or not!", default=False)
    (options, args) = parser.parse_args()
    if not args:
        parser.error("please provide a vcf.gz file")

    vcfilename = args[0]
    basename = os.path.splitext(os.path.splitext(vcfilename)[0])[0]

    # row is eval, column is comparison
    concordancetable = np.matrix(np.zeros((4, 4), dtype=int))
    calledtable = np.matrix(np.zeros((2, 2), dtype=int))

    # INFO is a ';'-separated key=value list; stop the value at the next ';'
    set_pattern = re.compile(r'(?:^|;)set=([^;\s]+)')

    opened = []  # every handle opened below, closed in the finally clause

    def open_output(*name_parts):
        fh = open(".".join((basename,) + name_parts), 'w')
        opened.append(fh)
        return fh

    try:
        outputfh = open_output('variantEval', 'txt')
        nrsfh = open_output('nrs', 'log')
        nrdfh = open_output('nrd', 'log')
        filteredfh = open_output('filtered', 'log')
        multifh = open_output('multiallelic', 'log')
        concordancefh = open_output('concordance', 'log')
        fieldsfh = open_output('fields', 'log')
        fieldsfh.write('set' + "\n")

        vcfobj = VcfFile(vcfilename)
        vcfh = gzip.open(vcfilename, 'r')
        opened.append(vcfh)

        vcfobj.parseMetaAndHeaderLines(vcfh)
        header = vcfobj.returnHeader() + "\n"
        for logfh in (nrsfh, nrdfh, filteredfh, concordancefh, multifh):
            logfh.write(header)

        samples = vcfobj.getSampleList()
        if len(samples) % 2 != 0:
            # samples are consumed as (comparison, evaluation) pairs
            parser.error("expected an even number of samples (comparison/evaluation pairs)")

        vcf_sample_eval_objects = [VcfSampleEval(comparename, evalname, basename)
                                   for (comparename, evalname) in grouper(2, samples)]
        for evalObj in vcf_sample_eval_objects:
            evalObj.writeHeaders(header)

        totalrecords = 0
        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            info = vrec.getInfo()
            outstring = vrec.toStringwithGenotypes() + "\n"

            # Multi-allelic records are logged but still analysed.  The old
            # test `',' in alt > 1` was a chained comparison that only
            # worked through Python 2's str/int ordering.
            if ',' in vrec.getAlt():
                multifh.write(outstring)

            # skip homoz reference calls unless you want to include them
            if 'ReferenceInAll' in info and not options.includeRef:
                continue

            # if the variant is filtered, log it and skip it
            if 'filterIn' in info and not options.includeFilter:
                filteredfh.write(outstring)
                continue
            if 'FilteredInAll' in info:
                filteredfh.write(outstring)
                continue

            # list of (samplename, vcfgenotype) tuples
            vrec_ziptuple = vrec.zipGenotypes(samples)

            # what set are you in?  '.' when the record has no set= tag
            # (the old code raised AttributeError on None here)
            set_match = set_pattern.search(info)
            fieldsfh.write((set_match.group(1) if set_match else '.') + "\n")
            totalrecords += 1

            # records are taken two at a time: the first is the comparison
            # genotype, the second the evaluation genotype
            for ((compare, evalu), evalObj) in zip(grouper(2, vrec_ziptuple), vcf_sample_eval_objects):
                (comp_allele1, comp_allele2) = compare[1].getAlleles()
                (eval_allele1, eval_allele2) = evalu[1].getAlleles()

                eval_alleletype = typeofGenotype(eval_allele1, eval_allele2)
                comp_alleletype = typeofGenotype(comp_allele1, comp_allele2)

                # increment the cell count
                concordancetable[eval_alleletype, comp_alleletype] += 1
                evalObj.incrementcellcount(eval_alleletype, comp_alleletype)

                # NRS penalty: eval is hom-ref (0) or no-call (3) where the
                # comparison has a variant genotype (1 or 2)
                if eval_alleletype in (0, 3) and comp_alleletype in (1, 2):
                    nrsfh.write(outstring)
                    evalObj.writeNrs(outstring)

                # among called genotypes: equal -> concordant, different -> NRD
                if eval_alleletype in (0, 1, 2) and comp_alleletype in (0, 1, 2):
                    if eval_alleletype == comp_alleletype:
                        concordancefh.write(outstring)
                        evalObj.writeConcordance(outstring)
                    else:
                        nrdfh.write(outstring)
                        evalObj.writeNrd(outstring)

        for evalObj in vcf_sample_eval_objects:
            evalObj.writeEvalOutput()

        outputfh.write("total records analyzed: " + str(totalrecords) + "\n")
        _write_count_matrix(outputfh, concordancetable, ['AA', 'AB', 'BB', './.'])

        # now we figure out how many sites were called or not called
        calledtable[0, 0] = concordancetable[0:3, 0:3].sum()
        calledtable[0, 1] = concordancetable[0:3, 3].sum()
        calledtable[1, 0] = concordancetable[3, 0:3].sum()
        calledtable[1, 1] = concordancetable[3, 3]
        outputfh.write("\n")
        _write_count_matrix(outputfh, calledtable, ['called', './.'])
        outputfh.write("\n")

        if not options.matrixonly:
            discordance = (concordancetable[0, 1] + concordancetable[0, 2]
                           + concordancetable[1, 0] + concordancetable[1, 2]
                           + concordancetable[2, 0] + concordancetable[2, 1])
            # hom-ref/hom-ref agreement (cell 0,0) is left out of the NRD denominator
            total = discordance + concordancetable[1, 1] + concordancetable[2, 2]
            nrd = _percentage(discordance, total)

            variant_count_evaluation = (concordancetable[1, 1] + concordancetable[1, 2]
                                        + concordancetable[2, 1] + concordancetable[2, 2])
            variant_count_comparison = (concordancetable[0, 1] + concordancetable[0, 2]
                                        + concordancetable[1, 1] + concordancetable[1, 2]
                                        + concordancetable[2, 1] + concordancetable[2, 2]
                                        + concordancetable[3, 1] + concordancetable[3, 2])
            nrs = _percentage(variant_count_evaluation, variant_count_comparison)

            outputfh.write("NRD: " + ("NA" if nrd is None else str(nrd)) + " \n")
            outputfh.write("NRS " + ("NA" if nrs is None else str(nrs)) + " \n")
    finally:
        for fh in opened:
            fh.close()
# Example #7
# 0
def main():
    """Print the records of a VCF file that overlap a second VCF or bed file.

    The second file's type is taken from its extension (``.bed`` or ``.vcf``)
    and loaded into per-chromosome bitsets.  Each data line of the first VCF
    is treated as the single-base interval ``[POS-1, POS)`` and printed to
    standard output when the bitset covers at least ``--minCols`` bases of
    it, or, with ``--v``, when it does not.  Records can first be restricted
    by FILTER value (``--filter``) and by variant type (``--info``/``--type``,
    matched as the regular expression ``<info>=(<type>)`` against INFO).

    Unless ``--nochrprefix`` is given, "chr" is prepended to the VCF
    chromosome name before the lookup.

    Exits with status 2 when fewer than two files are given and status 1
    when the second file is neither ``.bed`` nor ``.vcf``.
    """
    usage = "usage: %prog [options] vcf_file_one vcf|bed_file_two\n\nFind regions in the first vcf file that overlap regions of the second vcf or bed file\n"
    parser = OptionParser(usage)
    parser.add_option("--minCols", type="int", dest="mincols", default=1, help="mininum basepair overlap (default is one)")
    parser.add_option("--v", action="store_true", dest="reverse",  help="Print regions in first vcf  that DO NOT overlap second vcf|bed file")
    parser.add_option("--filter", type="string", dest="filter", default=None, help="intersect records only set with filter (default is None")
    parser.add_option("--info", type="string", dest="infotag", help="INFO tag id that annotates what type of variant the VCF record is", default="TYPE")
    parser.add_option("--type", type="string", dest="variantype", help="type of variant (SNP INS DEL)", default=None)
    parser.add_option("--noheader", action="store_true", dest="noheader", help="VCF file one  has no header line", default=False)
    parser.add_option("--nochrprefix", action="store_false", dest="chrprefix", help="use if the bed  doesn't  have chr prefix in chrom column", default=True)

    (options, args) = parser.parse_args()
    if len(args) < 2:
        parser.error("please provide a vcf file and a second vcf or bed file")

    sys.stderr.write("intersecting two files ...\n")

    vcf_file_one = args[0]
    in2_fname = args[1]
    in2_fname_ext = os.path.splitext(in2_fname)[1][1:]

    if in2_fname_ext == "bed":
        with open(in2_fname) as bedfh:
            bitsets = binned_bitsets_from_file(bedfh)
    elif in2_fname_ext == "vcf":
        bitsets = binned_bitsets_from_vcffile(in2_fname, options.filter)
    else:
        # previously fell through and failed later with a NameError on `bitsets`
        sys.stderr.write("second file must have a .bed or .vcf extension: " + in2_fname + "\n")
        sys.exit(1)

    # the variant-type pattern does not change between records: build it once
    type_pattern = None
    if options.variantype is not None:
        type_pattern = re.compile(options.infotag + '=(' + options.variantype + ')')

    vcfobj = VcfFile(vcf_file_one)
    with open(vcf_file_one, 'r') as vcfh:
        if not options.noheader:
            vcfobj.parseMetaAndHeaderLines(vcfh)
            print(vcfobj.returnHeader())

        for dataline in vcfobj.yieldVcfDataLine(vcfh):
            fields = dataline.strip().split('\t')
            (chrom, pos, _id, _ref, _alt, _qual, filtercode, info) = fields[0:8]
            # VCF POS is 1-based; the bitsets use 0-based half-open intervals
            (start, end) = (int(pos) - 1, int(pos))

            # pass the filter code
            if options.filter is not None and filtercode != options.filter:
                continue

            # check to see if record is the correct variant TYPE
            if type_pattern is not None and type_pattern.search(info) is None:
                continue

            if options.chrprefix:
                chrom = "chr" + chrom

            overlaps = (chrom in bitsets
                        and bitsets[chrom].count_range(start, end - start) >= options.mincols)
            # print overlapping records normally, non-overlapping ones with --v
            if bool(overlaps) != bool(options.reverse):
                print(dataline)
# Example #8
# 0
def main():
    """Print the records of a gzipped VCF whose genotypes match ``-gt`` filters.

    Each ``-gt`` filter is a sample name followed by a GT value, e.g.
    ``-gt "NA12878 0/1"``; quoted pairs and separate tokens
    (``-gt NA12878 0/1``) are both accepted, and several pairs may follow
    one ``-gt``.  Multiple values for the same sample are alternatives
    (logical OR); filters on different samples must all hold (logical AND).
    Without any ``-gt`` filter every record is printed.

    The header is printed first unless ``--no-header`` is given.  Output
    goes to standard output.

    Exits with status 2 on a malformed ``-gt`` value and status 1 when a
    filtered sample is not in the VCF.
    """
    parser = argparse.ArgumentParser(description='filter records  based on genotypes')

    parser.add_argument('vcf', metavar='vcf', type=str,
                        help='vcf.gz file')
    # http://stackoverflow.com/a/15008806/1735942
    parser.add_argument('--no-header', dest='header', action='store_false')
    parser.add_argument('-gt', metavar='gt', type=str, nargs='*', action='append',
                        help='sample 0/0')

    args = parser.parse_args()

    # args.gt is None without -gt, else one token list per -gt occurrence.
    gt_dict = defaultdict(list)  # sample name -> accepted GT strings
    for group in (args.gt or []):
        tokens = " ".join(group).split()
        if len(tokens) % 2 != 0:
            parser.error("-gt expects 'sample genotype' pairs, got: " + " ".join(group))
        for (sample, genotype) in zip(tokens[0::2], tokens[1::2]):
            gt_dict[sample].append(genotype)

    vcfobj = VcfFile(args.vcf)
    vcfh = gzip.open(args.vcf, 'r')
    try:
        vcfobj.parseMetaAndHeaderLines(vcfh)
        header = vcfobj.returnHeader()
        samplelist = vcfobj.getSampleList()

        # validate before anything is printed, and report on stderr so the
        # message cannot end up inside redirected VCF output
        for s in gt_dict:
            if s not in samplelist:
                sys.stderr.write(s + " not in samples!\n")
                sys.exit(1)

        if args.header:
            print(header)

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            # every filtered sample must carry one of its accepted genotypes
            record_passes = all(
                g.getFormatVal('GT') in gt_dict[s]
                for (s, g) in vrec.zipGenotypes(samplelist)
                if s in gt_dict
            )
            if record_passes:
                print(vrec.toStringwithGenotypes())
    finally:
        vcfh.close()
def _count_bases_at_site(bamfile, chrom, start, end, options, readgroup_to_sample):
    """Count the read bases aligned to the 0-based reference position *start*.

    Returns ``(total, per_sample)``: a Counter of base -> read count over all
    reads, and a dict mapping sample name -> Counter for reads whose read
    group maps to a sample.  Both are empty when nothing covers the site.
    """
    total = Counter()
    per_sample = {}
    for pileupcolumn in bamfile.pileup(chrom, start, end):
        # pileup() yields every column touched by the overlapping reads;
        # only the column of the site itself is wanted
        if pileupcolumn.pos != start:
            continue
        for pileupread in pileupcolumn.pileups:
            if pileupread.is_del:
                # the read has a deletion here, so there is no base to count
                continue
            alignment = pileupread.alignment
            if alignment.is_duplicate and not options.duplicate:
                continue
            if alignment.mapq < options.mapq:
                continue
            # qualities are stored as Phred+33 characters
            if (ord(alignment.qual[pileupread.qpos]) - 33) < options.bq:
                continue
            base = alignment.seq[pileupread.qpos]
            total[base] += 1
            sample = readgroup_to_sample.get(dict(alignment.tags).get("RG"))
            if sample is not None:
                per_sample.setdefault(sample, Counter())[base] += 1
        break
    return total, per_sample


def main():
    """Annotate a gzipped VCF with reference/alternate allele counts from a BAM.

    Given a VCF file and a BAM file containing the sample(s) in the VCF, this
    adds INFO tags (RA, AA: total reference / alternate alleles observed) and
    per-sample FORMAT tags (RA, AA) and prints the new VCF to standard output.
    Reads are assigned to samples through the SM field of their read group.

    Reads are excluded when their mapping quality is below ``--mapq``, when
    the base quality at the site is below ``--bq``, and when they are marked
    duplicate unless ``--includeDuplicates`` is given.

    Behavior corrected relative to the earlier version:
      * ``--includeDuplicates`` now does what its help says (duplicates are
        excluded by default); the flag was inverted before.
      * Bases are read from the pileup column of the site itself instead of
        the following column with ``qpos - 1``, which picked the wrong base
        for reads starting right after the site and missed reads ending on it.
      * Alleles that are not a single counted base (indels, multi-allelic
        ALT) get a count of 0 instead of raising KeyError, as do reads
        without a usable read group.
      * Records with no coverage are still printed (with zero counts)
        rather than silently dropped.

    Exits with status 1 when ``--bam`` or the BAM index is missing and with
    status 2 when no VCF file is given.
    """
    usage = "usage: %prog [option] file.vcf.gz"
    parser = OptionParser(usage)
    parser.add_option("--bam", type="string", dest="bam", default=None, help="bam file to perform pileup on")
    parser.add_option(
        "--mapq",
        type="float",
        dest="mapq",
        default=0.0,
        help="Exclude alignments from analysis if they have a mapping less than mapq (default is 0)",
    )
    parser.add_option(
        "--bq",
        type="float",
        dest="bq",
        default=0.0,
        help="Exclude bases from analysis if their supporting base quality is less that --bq (default is 0)",
    )
    parser.add_option(
        "--includeDuplicates",
        action="store_true",
        dest="duplicate",
        default=False,
        help="include duplicate marked reads in analysis (turned off by default) ",
    )
    (options, args) = parser.parse_args()
    if options.bam is None:
        sys.stderr.write("please provide a value to --bam option\n")
        sys.exit(1)
    if not args:
        parser.error("please provide a vcf.gz file")

    vcfilename = args[0]
    bamfilename = options.bam

    # the index may be named file.bam.bai or file.bai
    index_candidates = (bamfilename + ".bai", os.path.splitext(bamfilename)[0] + ".bai")
    if not any(os.path.exists(candidate) for candidate in index_candidates):
        sys.stderr.write("please check for existence of bam index file (*.bai)\n")
        sys.exit(1)

    ra_formatline = FormatLine("RA", number="1", type="Integer", description="number of reference alleles observed")
    aa_formatline = FormatLine("AA", number="1", type="Integer", description="number of alternate alleles observed")

    vcfobj = VcfFile(vcfilename)
    vcfh = gzip.open(vcfilename, "r")
    pybamfile = None
    try:
        vcfobj.parseMetaAndHeaderLines(vcfh)
        vcfobj.addMetaFormatHeader(ra_formatline)
        vcfobj.addMetaFormatHeader(aa_formatline)
        vcfobj.addMetaInfoHeader("RA", "Integer", "1", "total number of reference alleles observed")
        vcfobj.addMetaInfoHeader("AA", "Integer", "1", "total number of alternate alleles observed")
        print(vcfobj.returnHeader())

        pybamfile = pysam.Samfile(bamfilename, "rb")
        try:
            rgdictlist = pybamfile.header["RG"]
        except KeyError:
            rgdictlist = []  # BAM without @RG lines: no per-sample counts
        # read group id -> sample name
        readgroupdict = {}
        for rg in rgdictlist:
            if "ID" in rg and "SM" in rg:
                readgroupdict[rg["ID"]] = rg["SM"]

        samples = vcfobj.getSampleList()

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            # VCF POS is 1-based; pysam coordinates are 0-based half-open
            (chrom, start, end) = vrec.getChrom(), int(vrec.getPos()) - 1, int(vrec.getPos())
            (total, per_sample) = _count_bases_at_site(
                pybamfile, chrom, start, end, options, readgroupdict)

            ref = vrec.getRef()
            alt = vrec.getAlt()
            # a Counter yields 0 for alleles that were never observed
            vrec.addInfo("RA=" + str(total[ref]))
            if alt != ".":
                vrec.addInfo("AA=" + str(total[alt]))

            for (sample, vcfgenobj) in vrec.zipGenotypes(samples):
                counts = per_sample.get(sample)
                if not counts:
                    # no usable reads for this sample: add the keys without values
                    vcfgenobj.addFormat("RA")
                    vcfgenobj.addFormat("AA")
                else:
                    vcfgenobj.addFormatVal("RA", str(counts[ref]))
                    vcfgenobj.addFormatVal("AA", str(counts[alt]))

            print(vrec.toStringwithGenotypes())
    finally:
        if pybamfile is not None:
            pybamfile.close()
        vcfh.close()
# Example #10
# 0
def main():
    """Bin the records of a VCF file by QUAL into cumulative threshold files.

    ``-start``, ``-end`` and ``-num`` define ``num`` evenly spaced thresholds
    from start to end inclusive (``numpy.linspace``).  For example
    ``-start 10 -end 100 -num 10`` gives the bins
    10, 20, 30, 40, 50, 60, 70, 80, 90, 100.

    Each record whose FILTER column equals ``-filter`` (default ".") is
    written to ``<basename>.qual_<x>.vcf`` for every threshold x with
    QUAL >= x.  Thresholds are compared as floats while the file names use
    their integer truncation.  Records without a numeric QUAL are skipped.
    The input may be plain or gzipped (detected by a ``.gz`` suffix).

    Exits with status 1 when start/end/num are missing or when two bins
    would map to the same output file name.
    """
    parser = argparse.ArgumentParser(description=' bin vcf records according to QUAL')

    parser.add_argument('vcfile', metavar='vcfile', type=str, help='file.vcf.gz')
    parser.add_argument('-filter', dest='filter', type=str, default=".", help='filter value')
    parser.add_argument('-start', dest='start', type=int, help="starting point for QUAL range")
    parser.add_argument('-end', dest='end', type=int, help="ending pint for QUAL range")
    parser.add_argument('-num', dest='num', type=int, help='number of bins')

    args = parser.parse_args()
    if args.start is None or args.end is None or args.num is None:
        sys.stderr.write("please give start stop and number of bins for QUAL")
        sys.exit(1)

    vcfile = os.path.basename(args.vcfile)
    basename = return_file_basename(return_file_basename(vcfile))
    print(basename)

    bins = np.linspace(args.start, args.end, args.num)
    binstring = bins.astype(int).tolist()
    print(binstring)
    binned_vcfilenames = [".".join([basename, "qual_" + str(s), "vcf"]) for s in binstring]
    print(binned_vcfilenames)
    if len(set(binned_vcfilenames)) != len(binned_vcfilenames):
        # truncation to int can collapse neighbouring thresholds; opening the
        # same path twice for writing would interleave and corrupt the output
        sys.stderr.write("QUAL bins are too close together: several bins map to the same file name\n")
        sys.exit(1)

    binned_fh = []  # one output handle per bin, same order as `bins`
    vcfh = None
    try:
        for filename in binned_vcfilenames:
            binned_fh.append(open(filename, 'w'))

        if args.vcfile.endswith(".gz"):
            vcfh = gzip.open(args.vcfile, 'r')
        else:
            vcfh = open(args.vcfile, 'r')
        vcfobj = VcfFile(args.vcfile)

        vcfobj.parseMetaAndHeaderLines(vcfh)
        header = vcfobj.returnHeader()
        for fh in binned_fh:
            fh.write(header + "\n")

        sys.stderr.write("binning vcf records based on quality ....\n")
        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            # skip records whose FILTER column differs from the requested value
            if vrec.getFilter() != args.filter:
                continue
            try:
                qual = float(vrec.getQual())
            except (TypeError, ValueError):
                # missing QUAL ('.') cannot be binned
                continue
            vcfstring = vrec.toStringwithGenotypes()
            # thresholds ascend, so stop at the first one the record misses
            for (threshold, fh) in zip(bins, binned_fh):
                if qual < threshold:
                    break
                fh.write(vcfstring + "\n")
    finally:
        for fh in binned_fh:
            fh.close()
        if vcfh is not None:
            vcfh.close()
def main():
    """Print VCF records whose genotypes fit a simple inheritance model.

    Command line: ``[options] file.vcf[.gz]``

    Options:
        --model   'dominant' (default) or 'recessive'.
        --ped     tab-delimited ped file; the first six columns are read as
                  family id, individual id, paternal id, maternal id, sex and
                  phenotype.  Phenotype 2 marks a sample as affected and 1 as
                  unaffected; samples with any other value are ignored.
        --filter  only records whose FILTER value equals this string are
                  analyzed (default 'PASS').

    The VCF header is printed to stdout, followed by every candidate record.
    A record is a candidate when the model's genotype test is True for all
    affected samples and False for all unaffected samples:

        dominant  -> VcfGenotype.isSegregating()
        recessive -> VcfGenotype.isNonRefHomz()

    The hit counts are compared against the number of affected/unaffected
    samples listed in the ped file, so a ped sample that is absent from the
    VCF prevents any record from being reported.

    Exits with status 1 when --ped is missing, when --model is not supported,
    or when a sample is listed as both affected and unaffected.
    """
    usage = "usage: %prog [options] file.vcf.gz"
    parser = OptionParser(usage)
    parser.add_option("--model", type="string", dest="model", default = "dominant", help=" inheritance model [dominant|recessive], default is dominant ")
    parser.add_option("--ped", type="string", dest="pedfile", default=None, help="ped file of samples with phenotype (disease) status")
    parser.add_option("--filter", type="string", dest="filter", help="analyze only those  records matching filter (default is PASS)", default='PASS')

    (options, args) = parser.parse_args()
    if options.pedfile is None:
        sys.stderr.write("please provide a value to --ped parameter!\n")
        sys.exit(1)
    if not args:
        # fail with a usage message instead of an IndexError on args[0]
        parser.error("please provide a VCF file to analyze")

    # Name of the VcfGenotype method that decides, per model, whether a sample
    # carries the genotype expected of an affected individual.
    model_tests = {'dominant': 'isSegregating', 'recessive': 'isNonRefHomz'}
    if options.model not in model_tests:
        # reject an unknown model once, up front, rather than complaining for
        # every genotype of every record and producing no results
        sys.stderr.write(options.model + " not supported for genotype discrete filtering ...\n")
        sys.exit(1)
    genotype_test = model_tests[options.model]

    pedobjects = []  # one Ped object per non-blank line of the ped file
    with open(options.pedfile, 'r') as pedfh:
        for line in pedfh:
            stripped = line.strip()
            if not stripped:
                continue  # tolerate blank lines (e.g. a trailing newline)
            (fid, iid, pid, mid, sex, phenotype) = stripped.split('\t')[0:6]
            pedobjects.append(Ped(fid, iid, pid, mid, sex, int(phenotype)))

    # the phenotype status is set to 2 if the sample is affected: http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml#ped
    affecteds = [pedobj.getid() for pedobj in pedobjects if pedobj.getpheno() == 2]
    unaffecteds = [pedobj.getid() for pedobj in pedobjects if pedobj.getpheno() == 1]

    # sets give constant-time membership tests inside the per-genotype loop
    affected_ids = set(affecteds)
    unaffected_ids = set(unaffecteds)

    # check if any overlapping samples between unaffected and affected
    if unaffected_ids.intersection(affected_ids):
        sys.stderr.write("check list of affected and unaffecteds for overlapping samples!\n")
        sys.exit(1)

    vcfilename = args[0]
    # accept both gzip-compressed and plain-text VCF files
    if vcfilename.endswith(".gz"):
        vcfh = gzip.open(vcfilename, 'r')
    else:
        vcfh = open(vcfilename, 'r')

    try:
        # instantiate a VcfFile object
        vcfobj = VcfFile(vcfilename)
        vcfobj.parseMetaAndHeaderLines(vcfh)
        header = vcfobj.returnHeader()
        samplelist = vcfobj.getSampleList()

        print(header)

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            if options.filter is not None and vrec.getFilter() != options.filter:
                continue

            affected_hits = 0     # affected samples for which the test is True
            unaffected_clear = 0  # unaffected samples for which the test is False

            # zipGenotypes yields (sample, VcfGenotype object) pairs
            for (sample, genotype) in vrec.zipGenotypes(samplelist):
                is_affected = sample in affected_ids
                if not is_affected and sample not in unaffected_ids:
                    continue  # sample has no usable phenotype in the ped file

                result = getattr(genotype, genotype_test)()
                # Compare against True/False explicitly instead of relying on
                # truthiness: a result that is neither (presumably possible for
                # a missing genotype -- TODO confirm) must not count either way.
                if is_affected:
                    if result == True:
                        affected_hits += 1
                elif result == False:
                    unaffected_clear += 1

            # candidate: every affected sample passes the test and every
            # unaffected sample fails it
            if affected_hits == len(affecteds) and unaffected_clear == len(unaffecteds):
                print(vrec.toStringwithGenotypes())
    finally:
        vcfh.close()