def main():

    """ prints the description of ##INFO metalines in a VCF  """
    
    usage = "usage: %prog [options] file.vcf"
    parser = OptionParser(usage)
    parser.add_option("--infotag", type="string", dest="infotag", help="prints the  description for the INFO id infotag")
    parser.add_option("--all", action="store_true", dest="all",  help="prints  the  description for  *every* INFO  tag in VCF")
    parser.add_option
    (options, args)=parser.parse_args()
    
    vcfilename=args[0]
    vcfh=open(vcfilename, 'r')
    #instantiate a VcfFile object
    vcfobj=VcfFile(vcfilename)

    #parse its metainfo lines (ones that begin with ##)
    vcfobj.parseMetaLines(vcfh)

    descriptors = vcfobj.getMetaInfoDescription()
    found_tag=0
    for (id, description) in descriptors:
        if options.all==True:
            print id, description
            found_tag=1
            continue
        if id == options.infotag:
            print id, "\t", description
            found_tag=1
    if found_tag  ==0  : sys.stderr.write(options.infotag + " not in ##INFO headers\n")
def main():
    usage = "usage: %prog [options] arg"
    parser = OptionParser(usage)
    parser.add_option("--info", type="string", dest="infotag", help="INFO tag id that annotates what type of variant the VCF record is", default="TYPE")
    parser.add_option("--filter", type="string", dest="filter", help="only analyze records with matching filter (default is None)", default=None)

    (options, args)=parser.parse_args()
    if options.infotag == "":
        sys.stderr.write("provide a value for --info parameter!\n")
        exit(1)


    variant_dict={} #key variant type value VcfRecord object

    vcfilename=args[0]
    vcfh=open(vcfilename,'r')

    #instantiate a VcfFile object
    vcfobj=VcfFile(vcfilename)
    #parse its metainfo lines (ones that begin with ##)
    vcfobj.parseMetaAndHeaderLines(vcfh)
    
    descriptors = vcfobj.getMetaInfoDescription()
    infoids=[]
    for (tag, description) in descriptors:
        tag
        infoids.append(tag)

    if options.infotag  not in infoids and options.infotag != 'QUAL':
        sys.stderr.write(options.infotag + " tag not in ##INFO headers!\n")
        exit(1)

    

    pattern=options.infotag+'=(\S+)'
    
    for vrec in vcfobj.yieldVcfRecord(vcfh):
        if vrec.getFilter() != options.filter and options.filter != None: continue
        
        searchresult=re.search(pattern, vrec.getInfo() )
        if re.search(pattern, vrec.getInfo() ) == None:
            continue
        else:
            value=re.search(pattern, vrec.getInfo() ).groups()[0]
            #rint value
            if value not in variant_dict.keys():
                variant_dict[value]=[]
                variant_dict[value].append( vrec )
            else:
                variant_dict[value].append( vrec )


    
    sum=0
    sys.stderr.write("types and count of different variant classes in " + vcfilename + "\n")
    for k in variant_dict.keys():
        print k, len( variant_dict[k] )
        sum+=len( variant_dict[k] )
    print "TOTAL:", sum
Beispiel #3
0
def binned_bitsets_from_vcffile( vcfilename, chrom_col=0, start_col=1,  upstream_pad=0, downstream_pad=0, lens=None ):
    """
    Read a VCF file into a dictionary of bitsets keyed by chromosome name.

    - 'vcfilename' is the path of the VCF file.
    - 'chrom_col' and 'start_col' are accepted for interface compatibility
      but are not read by this function; chromosome and position come from
      the parsed VCF record.
    - 'upstream_pad' / 'downstream_pad' widen each interval; the start is
      clamped at 0 and the end at the size of the chromosome's bitset.
    - if 'lens' contains the chromosome, the bitset size is looked up from
      it, otherwise the maximum size (2147483647) is used.

    Only records whose FILTER is 'PASS' or '.' are used. "chr" is prepended
    to the record's chromosome name. Each record at position POS sets the
    zero-based, half-open interval [POS-1, POS) before padding.

    Returns the dict mapping chromosome name to BinnedBitSet.
    """
    if lens is None:
        lens = {}
    MAX = 2147483647

    bitsets = dict()
    sizes = {}  # chromosome name -> size its bitset was created with
    last_chrom = None
    last_bitset = None
    size = MAX

    vcfobj = VcfFile(vcfilename)
    fh = open(vcfilename, 'r')
    try:
        for vrec in vcfobj.yieldVcfRecord(fh):
            filtercode = vrec.getFilter()
            chrom = vrec.getChrom()
            pos = int( vrec.getPos() )

            # keep only passing ('PASS') or unfiltered ('.') records
            if filtercode != 'PASS' and filtercode != '.':
                continue

            chrom = "chr" + chrom
            if chrom != last_chrom:
                if chrom not in bitsets:
                    if chrom in lens:
                        sizes[chrom] = lens[chrom]
                    else:
                        sizes[chrom] = MAX
                    bitsets[chrom] = BinnedBitSet( sizes[chrom] )
                last_chrom = chrom
                last_bitset = bitsets[chrom]
                # Refresh the clamp size on every chromosome switch; before,
                # revisiting a known chromosome reused the previous one's size.
                size = sizes[chrom]
            start, end = (pos - 1, pos)

            if upstream_pad: start = max( 0, start - upstream_pad )
            if downstream_pad: end = min( size, end + downstream_pad )
            if start > end: warn( "Interval start after end!" )
            last_bitset.set_range( start, end - start )
    finally:
        fh.close()
    return bitsets
def main():
    """Split discordant sites of a VCF into files by number of discordances.

    Command line: ``%prog [options] nrd.log.vcf``.

    For each record the zipped (sample, genotype) tuples are consumed in
    pairs (compare, eval). A pair is discordant when ``typeofGenotype`` of
    the two genotypes differs. The record's position, its ``set`` INFO value
    and the discordant pairs are written as one tab-separated line to
    ``<basename>.allgenos.nrd.txt`` (3 discordant pairs),
    ``<basename>.twogenos.nrd.txt`` (2) or ``<basename>.onegenos.nrd.txt``
    (1). Records with any other count are not written.
    """
    usage = "usage: %prog [options]  nrd.log.vcf\n"
    parser = OptionParser(usage)

    (options, args) = parser.parse_args()
    if not args:
        parser.error("please provide a VCF file")
    vcfilename = args[0]
    basename = os.path.splitext(vcfilename)[0]

    vcfobj = VcfFile(vcfilename)
    vcfh = open(vcfilename, "r")
    nrd_files = {}  # discordance count -> output file handle
    try:
        for (count, label) in ((3, "allgenos"), (2, "twogenos"), (1, "onegenos")):
            nrd_files[count] = open(basename + "." + label + ".nrd.txt", "w")

        vcfobj.parseMetaAndHeaderLines(vcfh)
        samples = vcfobj.getSampleList()

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            outputline = [[vrec.getPos()]]

            # which callset does the site belong to?
            setname = vrec.returnInfoDict()["set"]
            outputline.append([setname])

            # Since the comparison is against a trio, the count can be 1, 2
            # or 3; it selects the output file below.
            nrd_count = 0
            for (compare, evaluated) in grouper(2, vrec.zipGenotypes(samples)):
                (comp_allele1, comp_allele2) = compare[1].getAlleles()
                (eval_allele1, eval_allele2) = evaluated[1].getAlleles()
                eval_alleletype = typeofGenotype(eval_allele1, eval_allele2)
                comp_alleletype = typeofGenotype(comp_allele1, comp_allele2)
                if eval_alleletype == comp_alleletype:
                    continue
                outputline.append([evaluated[0], str(eval_alleletype), compare[0], str(comp_alleletype)])
                nrd_count += 1

            output = "\t".join(melt_lol(outputline))
            if nrd_count in nrd_files:
                nrd_files[nrd_count].write(output + "\n")
    finally:
        vcfh.close()
        for fh in nrd_files.values():
            fh.close()
def main():

    """ This program adds non-reference positions to a VCF file with variant positions.
        It does this by the following. Given a bed file of non-variant intervals and a 2bit file of the reference genome,
        it retrieves the refernce alllele, and prints out the VCF data line with the ref/ref genotypes. Then 
        it prints a single line from the segregating VCF file, and then start the loop again.
        It assumes that the input vcf is position sorted.
        
        To generate the non-segrgating bed interval file, run the following program from bx-python:
        bed_subtract_basewise.py   reference_genome.bed  segregating.sites.bed
        bed_subtract_basewise.py   ~/software/Pgmsnp/PythonNotebook/simref.1.bed  Simulation1.segregating.bed """

    usage = "usage: %prog [options] file.vcf"
    parser = OptionParser(usage)
    parser.add_option("--bed", type="string", dest="bed", help="bed file with non-variant intervals")
    parser.add_option("--tbf", type="string", dest="tbf", help="2bit file of reference genome", default='/Users/amit/data/MySimulations/Simulation1/Reference/simref.1.2bit')
    
    (options, args)=parser.parse_args()

    try:
        sys.stderr.write("opening twobitfile...\n")
        twobit=bx.seq.twobit.TwoBitFile( open( options.tbf ) )
    except:
        sys.stderr.write("unable to open twobit file!\n")

    segregatingVcf=args[0]
    bedfh=open(options.bed,'r')

    vcfh=open(segregatingVcf,'r')
    vcfobj=VcfFile(segregatingVcf)
    vcfobj.parseMetaAndHeaderLines(vcfh)
    header=vcfobj.returnHeader()
    formatstring="GT"
    print header
    for (chrom,start,end) in yield_bedcoordinate(bedfh):
        start=int(start)
        end=int(end)
        for i in range(start,end):
            begin=i
            end=i+1
            refseq=twobit[chrom][begin:end]
            vrec=VcfRecord(chrom,str(end),'.',refseq,'.','.','.','NS=3')

            vrec.addGenotype( VcfGenotype(formatstring,'0/0') )
            vrec.addGenotype( VcfGenotype(formatstring,'0/0') )
            vrec.addGenotype( VcfGenotype(formatstring,'0/0') )
            print vrec.toStringwithGenotypes()
        vcf_gen=vcfobj.yieldVcfRecordwithGenotypes(vcfh)
        print vcf_gen.next().toStringwithGenotypes()
def main():
    """Extract the records of a VCF whose INFO carries a given set= value.

    Typically the VCF is derived from GATK CombineVariants, but any VCF with
    a ``;set=<name>`` INFO entry can be examined. The positional argument is
    the VCF (gzip-compressed when the name ends in ``.gz``); ``-set`` names
    the set to extract. The header and the matching records are written to
    ``<basename>.<set>.vcf`` in the current directory, where basename is the
    input file name passed twice through ``return_file_basename``.
    """
    parser = argparse.ArgumentParser(description=' extract records with matching set=(\S+) tag')
    parser.add_argument('vcfile', metavar='vcfile', type=str, help='file.vcf.gz')
    parser.add_argument('-set', dest='set', type=str, default=None, help="name of set to extract")

    args = parser.parse_args()
    if args.set is None:
        sys.stderr.write("please provide value to -set option!\n")
        sys.exit(1)

    vcfile = os.path.split(args.vcfile)[1]
    basename = return_file_basename( return_file_basename(vcfile) )
    sys.stderr.write( basename + "\n")

    outvcf = ".".join([basename, args.set, 'vcf'])
    sys.stderr.write( outvcf + "\n")

    # Open the input first so a bad input path does not leave an empty
    # output file behind.
    if args.vcfile.endswith(".gz"):
        vcfh = gzip.open(args.vcfile, 'r')
    else:
        vcfh = open(args.vcfile, 'r')

    outfh = None
    try:
        outfh = open(outvcf, 'w')
        vcfobj = VcfFile(args.vcfile)

        # NOTE(review): requires a ';' before 'set=' and \S+ is greedy, so a
        # set tag that is first or not last in INFO never compares equal to
        # -set; confirm against real CombineVariants output.
        pattern = re.compile(r';set=(\S+)')

        vcfobj.parseMetaAndHeaderLines(vcfh)
        outfh.write( vcfobj.returnHeader() + "\n")

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            match = pattern.search(vrec.getInfo())
            if match is None:
                continue
            if match.group(1) == args.set:
                outfh.write( vrec.toStringwithGenotypes() + "\n" )
    finally:
        vcfh.close()
        if outfh is not None:
            outfh.close()
Beispiel #7
0
def main():
    usage = "usage: %prog [options] arg"
    parser = OptionParser(usage)
    parser.add_option("--filter", type="string", dest="filter", help="extract records matching filter (default is None)", default=None)
    parser.add_option("--addchr", action="store_true", dest="addchr",  help="pre-pend 'chr' to chrom column ", default=False)
    parser.add_option("--siteinfo", action="store_true", dest="siteinfo", help="use if vcf only has site information and lacks FORMAT column")
    parser.add_option("--dump", action="store_true", dest="dump", help="dump everything after teh ID column in the 4th bed column")
    parser.add_option("--chr", type="string", dest="chr", default=None, help="restrct to chromosome number specified by --chr")
    (options, args)=parser.parse_args()

    vcfilename=args[0]
    #basename, extension = os.path.splitext(vcfilename)
    #bedfile=basename+".bed"
    #bedfh=open(bedfile,'w')
    vcfh=open(vcfilename,'r')
    #instantiate a VcfFile object
    vcfobj=VcfFile(vcfilename)
    #parse its metainfo lines (ones that begin with ##)
    vcfobj.parseMetaAndHeaderLines(vcfh)

    for dataline in vcfobj.yieldVcfDataLine(vcfh):
        fields=dataline.strip().split('\t')
        if options.siteinfo == True:
            (chrom,pos,id,ref,alt,qual,filtercode,info)=fields[0:8]
        else:
            (chrom,pos,id,ref,alt,qual,filtercode,info,format)=fields[0:9]
        if options.chr != None and chrom != options.chr: continue
        if options.addchr ==True:
            chrom='chr'+chrom
        if filtercode != options.filter and options.filter != None : continue
        (start,end) = (int(pos)-1, int(pos))
        if options.dump == True:
            # @type options
            if options.siteinfo == True:
                gstrings=",".join(fields[8::])
            else:
                gstrings=",".join(fields[9::])
            dumpstring="".join([ref,alt,qual,filtercode,info,gstrings])
            bedstring= "\t".join( [ chrom, str(start), str(end), id ,dumpstring] )
        else:
            bedstring= "\t".join( [ chrom, str(start), str(end), id] )

        print bedstring
def main():
    usage = "usage: %prog [options] file.vcf \n output format values from  genotype data field  in a VCF  for suitabale plotting/dataviz"
    parser = OptionParser(usage)
    parser.add_option("--includeRef", action="store_true", dest="includeRef", help="include sites in the set ReferenceInAll", default=False)
    parser.add_option("--includeFilter", action="store_true", dest="includeFilter", help="include site filtered or not!", default=False)
    parser.add_option("--formatTag", dest="format", default="GT", help="format tag to compare (default GT)")
    (options, args)=parser.parse_args()
    vcfilename=args[0]
    #vcfilename='/Users/indapa/software/Pgmsnp/PythonNotebook/child5x.nrs.sites.calledWith20x_bam.child5x.nrs.sites.calledWith5x_bam.combineVariants.vcf'
    
    basename=os.path.splitext(vcfilename)[0]

    vcfobj=VcfFile(vcfilename)
    vcfh=open(vcfilename,'r')

    vcfobj.parseMetaAndHeaderLines(vcfh)
    header=vcfobj.returnHeader() +"\n"

    samples=vcfobj.getSampleList()
    print "\t".join(samples)
    for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
        
        vrec_ziptuple=vrec.zipGenotypes(samples)
        outputs=[]
        for (sample, geno_obj) in vrec_ziptuple:
            outputs.append( "\t".join( [geno_obj.getFormatVal(options.format) ] ) )
        print "\t".join(outputs)
def main():
    usage = "usage: %prog [options] arg"
    parser = OptionParser(usage)
    usage = "usage: %prog [options] file.vcf \n print summary information about site depth in records of a VCF file\n"
    parser = OptionParser(usage)
    parser.add_option("--max", type="int", dest="max", help="skip records that are greater than or equal to max (default sys.maxint)", default=sys.maxint)
    #parser.add_option("--v", action="store_true", dest="snp",  help="restrict analysis to SNPs (must have INFO ID SNP in header")

    (options, args)=parser.parse_args()

    vcfilename=args[0]
    fileName, fileExtension = os.path.splitext(vcfilename)
    #nuller.12:80717441..80717681.vcf
    regionpattern='nuller.(\d+):(\d+)..(\d+)'
    results=re.search(regionpattern,fileName ).groups()
    regionstr="\t".join(list(results))
    vcfh=open(vcfilename,'r')

    #instantiate a VcfFile object
    vcfobj=VcfFile(vcfilename)
    #parse its metainfo lines (ones that begin with ##)
    vcfobj.parseMetaLines(vcfh)
    descriptors = vcfobj.getMetaInfoDescription()
    infoids=[]
    for (tag, description) in descriptors:
        infoids.append(tag)

    if 'DP' not in infoids:
        sys.stderr.write("DP tag not in ##INFO headers!")
        exit(1)

    vcfh.seek(0)
    vcfobj.parseHeaderLine(vcfh)

    pattern='DP=(\d+)'
    depth_list=[]
    for vrec in vcfobj.yieldVcfRecord(vcfh):

        dp=re.search(pattern, vrec.getInfo() ).groups()[0]
        if dp == None:
            sys.stderr.write("unable to parse DP value from INFO field\n")
            continue
        else:
            if int(dp) >= options.max: continue
            depth_list.append(int(dp))

    maxDP=max( array (depth_list))
    minDP= min (array (depth_list))
    medianDP=median (array (depth_list))
    meanDP=mean( array(depth_list))
    length=len(depth_list)

    outstr="\t".join([regionstr, str(maxDP), str(minDP), str(medianDP), str(meanDP), str(length)])
    print outstr
def main():
    usage = "usage: %prog [options]  "
    parser = argparse.ArgumentParser(description='Given a gzipped vcf file and pedigree file, generate a new vcf with only those samples present in the pedigree (ped file) ')
    parser.add_argument('-ped', dest='pedfile', type=str, help="*.ped file")
    parser.add_argument('vcfile',  type=str,help='*.vcf.gz file')

    args=parser.parse_args()

    """ parse the pedfile and return the list of iids to keep from the VCF file """
    pedobj=Pedfile(args.pedfile)
    pedobj.parsePedfile()

    keeplist=  pedobj.returnIndivids()

    #open the VCFfile
    vcfh=gzip.open(args.vcfile,'r')
    vcfobj=VcfFile(args.vcfile)

    vcfobj.parseMetaAndHeaderLines(vcfh)
    samples=vcfobj.getSampleList()
    newsamples= [ s for s in samples if s in keeplist]

    print newsamples

    vcfobj.setSampleList(newsamples)
    header=vcfobj.returnHeader()
    print header

    for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
        keepGenotypes=[]
        vrec_ziptuple=vrec.zipGenotypes(samples)
        for (s, genObj) in vrec_ziptuple:
            if s in keeplist:
                keepGenotypes.append( genObj )
    
        vrec.addGenotypeList(  keepGenotypes )
        print vrec.toStringwithGenotypes()
def main():

    """  remove samples from a vcf file """
    usage = "usage: %prog [options] file.vcf.gz "
    # parser = OptionParser(usage)
    parser = argparse.ArgumentParser(description="remove samples from vcf file")
    parser.add_argument("removesamples", metavar="sample", type=str, nargs="+", help="sample names to remove")
    parser.add_argument("-vcf", dest="vcfile", type=str, help="vcf file to remove samples from")
    # parser.add_argument("vcf", help="vcf file to analyze")
    args = parser.parse_args()
    # print 'remove these samples: ', args.samples
    # print args.vcfile

    vcfh = gzip.open(args.vcfile, "r")
    vcfobj = VcfFile(args.vcfile)

    vcfobj.parseMetaAndHeaderLines(vcfh)

    # print header
    samples = vcfobj.getSampleList()
    newsamples = [s for s in samples if s not in args.removesamples]
    # print 'keep these samples: ',  newsamples
    vcfobj.setSampleList(newsamples)
    header = vcfobj.returnHeader()
    print header

    for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
        keepGenotypes = []
        vrec_ziptuple = vrec.zipGenotypes(samples)
        for (s, genObj) in vrec_ziptuple:
            if s not in args.removesamples:
                # print s
                keepGenotypes.append(genObj)
        # print keepGenotypes
        vrec.addGenotypeList(keepGenotypes)
        print vrec.toStringwithGenotypes()
def main():
    usage = "usage: %prog [options] file.vcf"
    parser = OptionParser(usage)
    parser.add_option("--filter", type="string", dest="filter", help="analyze only those records with matching filter")

    (options, args)=parser.parse_args()

    vcfilename=args[0]
    if vcfilename.endswith(".gz"):
        vcfh=gzip.open(vcfilename,'r')
    else:
        vcfh=open(vcfilename,'r')
    #vcfh=open(vcfilename,'r')
    
    #instantiate a VcfFile object
    vcfobj=VcfFile(vcfilename)
    #parse its metainfo lines (ones that begin with ##)
    vcfobj.parseMetaAndHeaderLines(vcfh)

    TsTv_counter=collections.Counter()
    RefAlt_counter=collections.Counter()
   


    samples=vcfobj.getSampleList()


    genotype_dict={}
    for s in samples:
        genotype_dict[s]=[0,0,0,0]
    counter=0
    for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
        if vrec.getFilter() != options.filter and options.filter != None:
            sys.stderr.write("skipped filter..\n")
            continue
        #print vrec.toString()
        counter+=1
        
        if vrec.getAlt() == ".": continue
        ref=vrec.getRef()
        numAlleles=vrec.getAlt().split(',')
        
        if len(numAlleles) > 1:
            sys.stderr.write("multi alleleic record\n")
         
       
        for alt in numAlleles:
            if len(alt) ==1 and len(ref) ==1:
                if isTransition(ref,alt) == True:
                    TsTv_counter['transition']+=1
                else:
                    TsTv_counter['transversion']+=1
                refalt_string=" ".join( [ ref, alt])
            #since the number of alleles on indels is unbounded, we only keep track of single nucleotide substitutions
                RefAlt_counter[ refalt_string ]+=1
                
        
        vrec_ziptuple=vrec.zipGenotypes(samples)
        genotype_typecounts=get_genotype_counts(vrec_ziptuple)
        for (g, sample) in genotype_typecounts:
            #print g,sample
            if g == None:
                sys.stderr.write("skipped genotype\n")
                continue
            genotype_dict[sample][g]+=1

    
    print
    print " ".join( ['sample', 'homoz_ref', 'het', 'homoz_nonref', 'nocall', 'total'])
    for sample in genotype_dict.keys():
        """ http://docs.python.org/library/functions.html#reduce """
        tota=reduce(lambda x, y: x+y,genotype_dict[sample])
        
        outstring = " ".join( map(str,genotype_dict[sample]) )
        print " ".join ( [sample, outstring,str(tota)])

    print

    for (type,count) in TsTv_counter.items():
        print type, count
    TsTvratio=float(TsTv_counter['transition'])/float(TsTv_counter['transversion'])
    print "TsTv: ",  round( TsTvratio,2)
    

    totalpercent=0
    for a1,a2  in combinations('ACGT',2):
        count1 = RefAlt_counter[ ' '.join ( [ a1, a2] ) ]
        count2 = RefAlt_counter[ ' '.join ( [ a2, a1] ) ]
        total=count1 + count2
        try:
            percent= round ( float(total) / float(sum(RefAlt_counter.values()) ), 4)
            print ' '.join ( [ a1, a2] ), str(total), str(percent)
            totalpercent+=percent
        except ZeroDivisionError:
            sys.stderr.write( " integer division or modulo by zero\n")
    #for (type, count) in RefAlt_counter.items():
    #    print type, count
    print sum(RefAlt_counter.values()), str(totalpercent)


    print "Total vcf records: " + str(counter) + "\n"
Beispiel #13
0
def main():
    
    """  This program bins the records of a VCF file according to a user defined range and number of bins.
        For example if -start 10 and -end 100 and -num of 10 it would make 10 bins:
        10, 20, 30, 40,50,60,70,80,90,100
        
        Then for each record if the QUAL is >=x, then that record is written to *.qual_x.vcf file
    """
    usage = "usage: %prog [options] file.vcf.gz "
    #parser = OptionParser(usage)
    parser = argparse.ArgumentParser(description=' bin vcf records according to QUAL')
    
    parser.add_argument('vcfile', metavar='vcfile', type=str, help='file.vcf.gz')
    parser.add_argument('-filter', dest='filter', type=str, default=".", help='filter value')
    parser.add_argument('-start', dest='start', type=int, help="starting point for QUAL range")
    parser.add_argument('-end', dest='end', type=int, help="ending pint for QUAL range")
    parser.add_argument('-num', dest='num', type=int, help='number of bins')
    
    #parser.add_argument("vcf", help="vcf file to analyze")
    args = parser.parse_args()
    #print args
    (path, vcfile)=os.path.split(args.vcfile )
    basename=return_file_basename( return_file_basename(vcfile) )
    print basename
    if args.start == None or  args.end  == None or  args.num == None:
        sys.stderr.write("please give start stop and number of bins for QUAL")
        sys.exit(1)
        
    bins=np.linspace(args.start, args.end, args.num)
    binstring=bins
    binstring=binstring.astype(int).tolist()
    print binstring
    binned_vcfilenames=[ ".".join( [ basename, "qual_"+ str(s), "vcf"]) for s in binstring ]
    print binned_vcfilenames
    #binned_fh = itertools.chain(*(open(f, "w") for f in binned_vcfilenames))   
    """ we create a list of filehandles for the binned VCFs """                     
    binned_fh=list(itertools.imap(lambda x:open(x,'w'), binned_vcfilenames))                               
    
    if args.vcfile.endswith(".gz"):
        vcfh=gzip.open(args.vcfile,'r')
    else:
        vcfh=open(args.vcfile,'r')
    vcfobj=VcfFile(args.vcfile)
    
    
    #vcf_reader = vcf.Reader(open(args.vcfile, 'r'))
    #print vcf_reader.metadata
    vcfobj.parseMetaAndHeaderLines(vcfh)
    header=vcfobj.returnHeader() 
    
    map(lambda x: x.write(header+"\n"),binned_fh)
    
    
    #vcfrecord_bins= [ [] for  i in xrange(len(bins)) ]
    sys.stderr.write("binning vcf records based on quality ....\n")
    #for vrec in vcf_reader:
    for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
        """ skip records that do not have PASS in filter column """
        if vrec.getFilter() != args.filter:
            continue
        QUAL=float(vrec.getQual())
        vcfstring=vrec.toStringwithGenotypes()
        for i in xrange(len(bins)):
            if QUAL >= bins[i]:
                binned_fh[i].write(vcfstring+"\n")
                #vcfrecord_bins[i].append(vrec)
            else: break
        

                
    map(lambda x: x.close(),binned_fh)
def main():
    usage = "usage: %prog [options] file.vcf"
    parser = OptionParser(usage)
    parser.add_option("--tbf", type="string", dest="tbf", help="2bit file")
    parser.add_option(
        "--pad",
        type="int",
        dest="pad",
        default=0,
        help="extract sequence  upstream and downstream of position by pad value",
    )

    parser.add_option(
        "--info",
        type="string",
        dest="infotag",
        help="INFO tag id that annotates what type of variant the VCF record is",
        default="TYPE",
    )
    parser.add_option("--type", type="string", dest="variantype", help="type of variant (SNP INS DEL)", default="snp")

    (options, args) = parser.parse_args()
    # open 2bitfile
    try:
        sys.stderr.write("opening twobitfile...\n")
        twobit = bx.seq.twobit.TwoBitFile(open(options.tbf))
    except:
        sys.stderr.write("unable to open twobit file!\n")
        exit(1)

    # open the vcf file
    vcfile = args[0]
    vcfh = open(vcfile, "r")
    vcfobj = VcfFile(vcfh)
    vcfobj.parseMetaAndHeaderLines(vcfh)
    pattern = options.infotag + "=(" + options.variantype + ")"

    sequence = ""
    downstream_seq = ""
    upstream_seq = ""

    for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
        pos = vrec.getPos()
        start = int(pos) - 1
        end = int(pos)

        info = vrec.getInfo()
        if re.search(pattern, info) == None:
            continue
        else:
            value = re.search(pattern, info).groups()[0]
            # print vrec.toString()
            assert end > start, "end greater than start!"
            try:
                sequence = twobit["chr" + vrec.getChrom()][start:end]
                sequence = sequence.upper()
            except:
                sys.stderr.write("unable to fetch sequence from 2bit file!\n")

            if options.pad != 0:
                downstream_start = int(pos)
                upstream_end = int(pos) - 1
                downstream_seq = twobit["chr" + vrec.getChrom()][downstream_start : downstream_start + options.pad]
                upstream_seq = twobit["chr" + vrec.getChrom()][upstream_end - options.pad : upstream_end]
                outstr = "\t".join(
                    [
                        "chr" + vrec.getChrom(),
                        str(start),
                        str(end),
                        sequence,
                        str(upstream_end - options.pad),
                        str(upstream_end),
                        upstream_seq,
                        str(downstream_start),
                        str(downstream_start + options.pad),
                        downstream_seq,
                    ]
                )
            else:
                outstr = "\t".join(["chr" + vrec.getChrom(), str(start), str(end), sequence])
            print outstr
def main():

    """ given a VCF file and bam file containing the sample(s) in the VCF this will add INFO and FORMAT tags 
    to indicate the count of reference and alt alleles observed in total and per-sample and print out a new VCF"""

    usage = "usage: %prog [option] file.vcf.gz"
    parser = OptionParser(usage)
    parser.add_option("--bam", type="string", dest="bam", default=None, help="bam file to perform pileup on")
    parser.add_option(
        "--mapq",
        type="float",
        dest="mapq",
        default=0.0,
        help="Exclude alignments from analysis if they have a mapping less than mapq (default is 0)",
    )
    parser.add_option(
        "--bq",
        type="float",
        dest="bq",
        default=0.0,
        help="Exclude bases from analysis if their supporting base quality is less that --bq (default is 0)",
    )
    parser.add_option(
        "--includeDuplicates",
        action="store_false",
        dest="duplicate",
        help="include duplicate marked reads in analysis (turned off by default) ",
    )
    (options, args) = parser.parse_args()
    if options.bam == None:
        sys.stderr.write("please provide a value to --bam option\n")
        sys.exit(1)

    vcfilename = args[0]

    bamfilename = options.bam

    ra_formatline = FormatLine("RA", number="1", type="Integer", description="number of reference alleles observed")
    aa_formatline = FormatLine("AA", number="1", type="Integer", description="number of alternate alleles observed")

    if os.path.exists(bamfilename + ".bai") == False:
        sys.stderr.write("please check for existence of bam index file (*.bai)\n")
        exit(1)

    vcfobj = VcfFile(vcfilename)

    vcfh = gzip.open(vcfilename, "r")

    vcfobj.parseMetaAndHeaderLines(vcfh)
    vcfobj.addMetaFormatHeader(ra_formatline)
    vcfobj.addMetaFormatHeader(aa_formatline)
    vcfobj.addMetaInfoHeader("RA", "Integer", "1", "total number of reference alleles observed")
    vcfobj.addMetaInfoHeader("AA", "Integer", "1", "total number of alternate alleles observed")
    header = vcfobj.returnHeader()

    print header
    readgroupdict = {}
    pybamfile = pysam.Samfile(bamfilename, "rb")
    rgdictlist = pybamfile.header["RG"]
    for dictionary in rgdictlist:
        readgroupdict[dictionary["ID"]] = dictionary["SM"]
    # print readgroupdict

    samples = vcfobj.getSampleList()

    # print samples

    for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
        (chrom, start, end) = vrec.getChrom(), int(vrec.getPos()) - 1, int(vrec.getPos())
        # print chrom, str(start), str(end)
        # print vrec.getRef()
        # print vrec.toStringwithGenotypes()

        for pileupcolumn in pybamfile.pileup(chrom, start, end):
            if pileupcolumn.pos != end:
                continue
            # sys.stdout.write('chr'+chrom+ " " + str(start) +  " " + str(end) + " " + str(pileupcolumn.pos) + " ")
            # print 'coverage at base %s = %s' % (pileupcolumn.pos , pileupcolumn.n)

            seqdict = {}
            sampledict = {}
            for s in samples:
                sampledict[s] = []
            # print sampledict
            for (base, count) in (("A", 0), ("C", 0), ("G", 0), ("T", 0), ("N", 0)):
                seqdict[base] = count

            for pileupread in pileupcolumn.pileups:

                if pileupread.alignment.is_duplicate == True and options.duplicate == False:
                    continue
                if pileupread.alignment.mapq < options.mapq:
                    continue
                if (ord(pileupread.alignment.qual[pileupread.qpos - 1]) - 33) < options.bq:
                    continue
                seqdict[pileupread.alignment.seq[pileupread.qpos - 1]] += 1
                readgroup = dict(pileupread.alignment.tags)["RG"]

                sample = readgroupdict[readgroup]
                # print readgroup,sample, pileupread.alignment.seq[pileupread.qpos-1]
                sampledict[sample].append(pileupread.alignment.seq[pileupread.qpos - 1])
                # print pileupread.alignment.seq, len(pileupread.alignment.seq), pileupread.qpos

            vrec.addInfo("RA=" + str(seqdict[vrec.getRef()]))
            if vrec.getAlt() != ".":
                vrec.addInfo("AA=" + str(seqdict[vrec.getAlt()]))
            zip_genos = vrec.zipGenotypes(samples)
            for (sample, vcfgenobj) in zip_genos:

                if len(sampledict[sample]) == 0:
                    vcfgenobj.addFormat("RA")
                    vcfgenobj.addFormat("AA")
                    continue
                else:
                    ra = 0
                    aa = 0
                    c = dict(Counter(sampledict[sample]))
                    if vrec.getRef() in c.keys():
                        ra = c[vrec.getRef()]
                    if vrec.getAlt() in c.keys():
                        aa = c[vrec.getAlt()]
                    vcfgenobj.addFormatVal("RA", str(ra))
                    vcfgenobj.addFormatVal("AA", str(aa))

            # for nt in ('A', 'C', 'G', 'T', 'N'):
            #    sys.stdout.write( str(seqdict[nt]) + " ")
            # sys.stdout.write("\n")
            print vrec.toStringwithGenotypes()

    pybamfile.close()
Beispiel #16
0
def main():
    """Report per-sample call rates for a VCF file.

    The first positional argument names the VCF.  Its meta lines (with an
    extra ``CR`` INFO header registered through ``addMetaInfoHeader``) and its
    header line are printed via the VcfFile object, every record that passes
    ``--filter`` is tallied, and finally one line per sample is printed:
    sample name, number of called genotypes, number of records analysed and
    the resulting call rate.

    Exits with an error message instead of a traceback when no file is given
    or when no record passes the filter (the call rate would be 0/0).
    """
    import sys  # local import: this script did not previously reference sys

    usage = "usage: %prog [options] file.vcf"
    parser = OptionParser(usage)
    parser.add_option("--filter", type="string", dest="filter", help="analyze only those  records matching filter (default is None)", default=None)
    # --info and --type are accepted for command-line compatibility but are
    # not consulted anywhere below.
    parser.add_option("--info", type="string", dest="infotag", help="INFO tag id that annotates what type of variant the VCF record is", default="TYPE")
    parser.add_option("--type", type="string", dest="variantype", help="type of variant (SNP INS DEL)", default=None)
    (options, args) = parser.parse_args()

    if not args:
        parser.error("missing file.vcf argument")

    vcfilename = args[0]
    vcfh = open(vcfilename, 'r')
    try:
        # instantiate a VcfFile object
        vcfobj = VcfFile(vcfilename)
        # parse its metainfo lines (ones that begin with ##)
        vcfobj.parseMetaLines(vcfh)
        vcfobj.addMetaInfoHeader("CR", "D", 1, "site call rate")
        vcfobj.printMetaLines()

        # rewind so the header line can be located from the top of the file
        vcfh.seek(0)
        vcfobj.parseHeaderLine(vcfh)
        vcfobj.printHeaderLine()

        samplelist = vcfobj.getSampleList()
        # key: sample name, value: number of called genotypes
        sampleCalls = dict((s, 0) for s in samplelist)

        totalrecords = 0
        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            if options.filter is not None and vrec.getFilter() != options.filter:
                continue
            totalrecords += 1
            vrec.appendInfoString("CR=" + str(vrec.siteCallrate()))
            # updates sampleCalls in place
            vrec.sampleCallrate(samplelist, sampleCalls)
    finally:
        vcfh.close()

    if totalrecords == 0:
        sys.stderr.write("no records to analyze; cannot compute sample call rates\n")
        sys.exit(1)

    for s in samplelist:
        callrate = float(sampleCalls[s]) / float(totalrecords)
        print("%s %s %s %s" % (s, sampleCalls[s], totalrecords, callrate))
Beispiel #17
0
def main():
    """Print VCF data lines whose allele-frequency INFO value is within range.

    A data line is kept when its INFO column carries the variant-type tag
    (``--variantag``, optionally restricted to ``--variantype``) and the
    frequency tag (``--maftag``) with a value between ``--geq`` and ``--leq``
    inclusive.  Kept lines are printed unless ``--quiet`` is given, and a
    tab-separated summary of each kept line is always written to ``freq.log``
    in the current directory.

    Only frequency values written as ``0.<digits>`` are recognised; lines
    whose value has another form are skipped, as before.
    """
    usage = "usage: %prog [options] file.vcf"
    parser = OptionParser(usage)

    parser.add_option("--maftag", type="string", dest="maftag", help="INFO tag id that annotates the allele freq of the record", default="AF")
    parser.add_option("--variantag", type="string", dest="vtag", help="INFO tag that annotates the type of variant type", default="VT")
    parser.add_option("--variantype", type="string", dest="variantype", help="type of variant (SNP INS DEL)", default=None)
    # --filter is accepted but currently not applied to the records.
    parser.add_option("--filter", type="string", dest="filter", help="extract records matching filter (default is None)", default=None)
    parser.add_option("--noheader", action="store_true", dest="noheader", help="VCF file  has no header file", default=False)
    parser.add_option("--quiet", action="store_true", dest="quiet", help="don't print vcf output to stdout", default=False)
    parser.add_option("--leq", type="float", dest="leq", default=1.0, help="keep variants with AF <= (default 1)")
    parser.add_option("--geq", type="float", dest="geq", default=0.0, help="keep variants with AF >= (default 0)")
    (options, args) = parser.parse_args()

    if len(args) != 1:
        sys.stderr.write(usage + "\n")
        sys.exit(1)
    vcfilename = args[0]

    # The type tag may be the last INFO entry, in which case no ';' follows it.
    if options.variantype is None:
        variant_re = re.compile(options.vtag + r'=(\w+)(?:;|$)')
    else:
        variant_re = re.compile(options.vtag + '=(' + options.variantype + ')(?:;|$)')
    # The dot is escaped so only real decimals reach float() below.
    maf_re = re.compile(options.maftag + r'=(0\.\d+)')

    freqfh = open('freq.log', 'w')
    vcfh = open(vcfilename, 'r')
    try:
        # instantiate a VcfFile object
        vcfobj = VcfFile(vcfilename)
        # parse its metainfo lines (ones that begin with ##)
        if not options.noheader:
            vcfobj.parseMetaLines(vcfh)
        infoids = [tag for (tag, description) in vcfobj.getMetaInfoDescription()]

        # Both tags must be declared in the ##INFO headers (when there are any).
        for tagname in (options.maftag, options.vtag):
            if tagname not in infoids and tagname != 'QUAL' and not options.noheader:
                sys.stderr.write(tagname + " tag not in ##INFO headers!\n")
                sys.exit(1)

        if not options.noheader:
            vcfobj.parseHeaderLine(vcfh)

        for dataline in vcfobj.yieldVcfDataLine(vcfh):
            fields = dataline.strip().split('\t')
            (chrom, pos, varid, ref, alt, qual, filtercode, info) = fields[0:8]

            type_match = variant_re.search(info)
            if type_match is None:
                continue
            maf_match = maf_re.search(info)
            if maf_match is None:
                continue

            variant_type = type_match.group(1)
            maf_value = maf_match.group(1)

            if options.geq <= float(maf_value) <= options.leq:
                if not options.quiet:
                    print(dataline)
                logstring = "\t".join([chrom, pos, varid, ref, alt, variant_type, options.maftag, maf_value])
                freqfh.write(logstring + '\n')
    finally:
        vcfh.close()
        freqfh.close()
def main():
    """Print VCF records whose genotypes fit a simple inheritance model.

    Samples are split into affected (phenotype 2) and unaffected (phenotype 1)
    using the tab-separated ped file given with ``--ped``.  For every record
    passing ``--filter``:

    * ``--model dominant``: printed when ``isSegregating()`` is true for every
      affected sample and false for every unaffected sample;
    * ``--model recessive``: the same test using ``isNonRefHomz()``.

    The VCF header is printed first.  An unsupported model is rejected once,
    before any file is read.
    """
    usage = "usage: %prog [options] file.vcf.gz"
    parser = OptionParser(usage)
    parser.add_option("--model", type="string", dest="model", default = "dominant", help=" inheritance model [dominant|recessive], default is dominant ")
    parser.add_option("--ped", type="string", dest="pedfile", default=None, help="ped file of samples with phenotype (disease) status")
    parser.add_option("--filter", type="string", dest="filter", help="analyze only those  records matching filter (default is PASS)", default='PASS')

    (options, args) = parser.parse_args()
    if options.pedfile is None:
        sys.stderr.write("please provide a value to --ped parameter!\n")
        sys.exit(1)
    if options.model not in ('dominant', 'recessive'):
        sys.stderr.write(options.model + " not supported for genotype discrete filtering ...\n")
        sys.exit(1)
    if not args:
        parser.error("missing file.vcf.gz argument")

    # one Ped object per line of the ped file
    pedobjects = []
    pedfh = open(options.pedfile, 'r')
    try:
        for line in pedfh:
            fields = line.strip().split('\t')
            (fid, iid, pid, mid, sex, phenotype) = fields[0:6]
            pedobjects.append(Ped(fid, iid, pid, mid, sex, int(phenotype)))
    finally:
        pedfh.close()

    # the phenotype status is set to 2 if the sample is affected:
    # http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml#ped
    affecteds = [pedobj.getid() for pedobj in pedobjects if pedobj.getpheno() == 2]
    unaffecteds = [pedobj.getid() for pedobj in pedobjects if pedobj.getpheno() == 1]

    # sets give constant-time membership tests inside the per-genotype loop
    affected_set = set(affecteds)
    unaffected_set = set(unaffecteds)

    # check if any overlapping samples between unaffected and affected
    if affected_set & unaffected_set:
        sys.stderr.write("check list of affected and unaffecteds for overlapping samples!\n")
        sys.exit(1)

    # choose the genotype test for the requested model once
    if options.model == 'dominant':
        def carries_variant(genotype):
            return genotype.isSegregating()
    else:
        def carries_variant(genotype):
            return genotype.isNonRefHomz()

    vcfilename = args[0]
    vcfh = gzip.open(vcfilename, 'r')
    try:
        # instantiate a VcfFile object
        vcfobj = VcfFile(vcfilename)
        vcfobj.parseMetaAndHeaderLines(vcfh)
        header = vcfobj.returnHeader()
        samplelist = vcfobj.getSampleList()

        print(header)

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            if options.filter is not None and vrec.getFilter() != options.filter:
                continue

            affected_hits = 0    # affected samples for which the test is true
            unaffected_hits = 0  # unaffected samples for which the test is false
            for (sample, genotype) in vrec.zipGenotypes(samplelist):
                # Explicit comparisons with True/False are kept because the
                # predicates are defined elsewhere and may not return a bool.
                if sample in affected_set and carries_variant(genotype) == True:
                    affected_hits += 1
                if sample in unaffected_set and carries_variant(genotype) == False:
                    unaffected_hits += 1

            # Candidate site: every listed affected sample passes and every
            # listed unaffected sample fails.  Samples named in the ped file
            # but absent from the VCF therefore prevent any match.
            if affected_hits == len(affecteds) and unaffected_hits == len(unaffecteds):
                print(vrec.toStringwithGenotypes())
    finally:
        vcfh.close()
Beispiel #19
0
def main():
    """Print records of a VCF that overlap (or, with --v, do not overlap) a second file.

    The second positional argument is a ``.bed`` or ``.vcf`` file from which a
    per-chromosome bitset of covered positions is built.  Each data line of
    the first VCF is treated as the one-base interval ``[pos-1, pos)`` and is
    printed when at least ``--minCols`` of its bases are covered; ``--v``
    prints the non-overlapping lines instead.  Lines can additionally be
    restricted by FILTER value (``--filter``) and by variant type
    (``--info`` / ``--type``).

    Exits with a message when an argument is missing or the second file has an
    extension other than ``bed`` or ``vcf``.
    """
    usage = "usage: %prog [options] vcf_file_one vcf|bed_file_two\n\nFind regions in the first vcf file that overlap regions of the second vcf or bed file\n"
    parser = OptionParser(usage)
    parser.add_option("--minCols", type="int", dest="mincols", default=1, help="mininum basepair overlap (default is one)")
    parser.add_option("--v", action="store_true", dest="reverse",  help="Print regions in first vcf  that DO NOT overlap second vcf|bed file")
    parser.add_option("--filter", type="string", dest="filter", default=None, help="intersect records only set with filter (default is None")
    parser.add_option("--info", type="string", dest="infotag", help="INFO tag id that annotates what type of variant the VCF record is", default="TYPE")
    parser.add_option("--type", type="string", dest="variantype", help="type of variant (SNP INS DEL)", default=None)
    parser.add_option("--noheader", action="store_true", dest="noheader", help="VCF file one  has no header line", default=False)
    parser.add_option("--nochrprefix", action="store_false", dest="chrprefix", help="use if the bed  doesn't  have chr prefix in chrom column", default=True)

    (options, args) = parser.parse_args()
    if len(args) < 2:
        parser.error("expected two files: vcf_file_one and vcf|bed_file_two")

    sys.stderr.write("intersecting two files ...\n")

    vcf_file_one = args[0]
    in2_fname = args[1]
    in2_fname_ext = os.path.splitext(in2_fname)[1][1:]

    if in2_fname_ext == "bed":
        bitsets = binned_bitsets_from_file(open(in2_fname))
    elif in2_fname_ext == "vcf":
        bitsets = binned_bitsets_from_vcffile(in2_fname, options.filter)
    else:
        # Previously this fell through and failed later with a NameError.
        sys.stderr.write("second file must have a .bed or .vcf extension: " + in2_fname + "\n")
        sys.exit(1)

    # build the variant-type pattern once rather than once per data line
    type_re = None
    if options.variantype is not None:
        type_re = re.compile(options.infotag + '=(' + options.variantype + ')')

    vcfobj = VcfFile(vcf_file_one)
    vcfh = open(vcf_file_one, 'r')
    try:
        if not options.noheader:
            vcfobj.parseMetaAndHeaderLines(vcfh)
            print(vcfobj.returnHeader())

        for dataline in vcfobj.yieldVcfDataLine(vcfh):
            fields = dataline.strip().split('\t')
            (chrom, pos, varid, ref, alt, qual, filtercode, info) = fields[0:8]
            end = int(pos)
            start = end - 1

            # pass the filter code
            if options.filter is not None and filtercode != options.filter:
                continue

            # check to see if record is the correct variant TYPE
            if type_re is not None and type_re.search(info) is None:
                continue

            if options.chrprefix:
                chrom = "chr" + chrom

            overlaps = chrom in bitsets and bitsets[chrom].count_range(start, end - start) >= options.mincols
            if overlaps:
                if not options.reverse:
                    print(dataline)
            elif options.reverse:
                print(dataline)
    finally:
        vcfh.close()
Beispiel #20
0
def main():
    """Print VCF records whose sample genotypes match every ``-gt`` filter.

    Each ``-gt`` value is a quoted ``"sample genotype"`` pair, e.g.
    ``-gt "NA12878 0/1"``.  Several genotypes given for the same sample are
    alternatives (logical OR); filters on different samples must all hold
    (logical AND).  With no ``-gt`` option every record is printed.  The
    header is printed first unless ``--no-header`` is given.
    """
    parser = argparse.ArgumentParser(description='filter records  based on genotypes')

    parser.add_argument('vcf', metavar='vcf', type=str,
                   help='vcf.gz file')
    # http://stackoverflow.com/a/15008806/1735942
    parser.add_argument('--no-header', dest='header', action='store_false')
    parser.add_argument('-gt', metavar='gt', type=str, nargs='*', action='append',
                   help='sample 0/0')

    args = parser.parse_args()

    # nargs='*' with action='append' yields a list of lists, or None when the
    # option was never given (http://stackoverflow.com/q/12460989/1735942).
    gt_dict = defaultdict(list)  # key: sample name, value: accepted GT strings
    for group in (args.gt or []):
        for spec in group:
            parts = spec.split(' ')
            if len(parts) != 2:
                parser.error("-gt expects a quoted 'sample genotype' pair, got: %r" % spec)
            gt_dict[parts[0]].append(parts[1])

    vcfh = gzip.open(args.vcf, 'r')
    try:
        vcfobj = VcfFile(args.vcf)
        vcfobj.parseMetaAndHeaderLines(vcfh)
        header = vcfobj.returnHeader()
        if args.header:
            print(header)

        samplelist = vcfobj.getSampleList()
        for s in gt_dict:
            if s not in samplelist:
                # errors go to stderr so they do not mix with the VCF output
                sys.stderr.write(s + " not in samples!\n")
                sys.exit(1)

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            # all gt filters need to evaluate to True in order for record to print
            keep = True
            for (s, g) in vrec.zipGenotypes(samplelist):
                if s in gt_dict and g.getFormatVal('GT') not in gt_dict[s]:
                    keep = False
                    break
            if keep:
                print(vrec.toStringwithGenotypes())
    finally:
        vcfh.close()
def main():
    """Compute genotype concordance, NRS and NRD from a UNIQUIFY-merged VCF.

    Sample columns are taken two at a time: the first of each pair is the
    comparison call set, the second the evaluation call set.  Every analysed
    genotype pair increments a 4x4 concordance table indexed by the value
    ``typeofGenotype`` returns (rows = evaluation, columns = comparison;
    the table labels 0..3 as AA, AB, BB and ./.).

    Files written next to the input, all prefixed with its basename:
    ``variantEval.txt`` (tables, NRD, NRS), ``nrs.log``, ``nrd.log``,
    ``concordance.log``, ``filtered.log``, ``multiallelic.log`` and
    ``fields.log`` (the ``set=`` value of every analysed record).  The
    per-pair ``VcfSampleEval`` objects receive the same records.

    NRD and NRS are reported as ``NA`` when their denominator is zero.
    """
    usage = "usage: %prog [options] file.vcf.gz \n calcuate NRS and NRD on a vcf generated from CombineVariants --genotypemergeoption UNIQUIFY\n"
    parser = OptionParser(usage)

    parser.add_option("--matrixonly", action="store_true", dest="matrixonly", help="only print concordance matrixe", default=False)
    parser.add_option("--includeRef", action="store_true", dest="includeRef", help="include sites in the set ReferenceInAll", default=False)
    parser.add_option("--includeFilter", action="store_true", dest="includeFilter", help="include site filtered or not!", default=False)
    (options, args) = parser.parse_args()

    if not args:
        parser.error("missing file.vcf.gz argument")

    vcfilename = args[0]
    # strip both extensions, e.g. foo.vcf.gz -> foo
    basename = os.path.splitext(os.path.splitext(vcfilename)[0])[0]

    # row is eval, column is comparison
    concordancetable = np.matrix([[0, 0, 0, 0] for _ in range(4)])
    calledtable = np.matrix([[0, 0], [0, 0]])

    handles = []  # every handle opened here, closed in the finally below

    def open_log(*suffix):
        """Open <basename>.<suffix...> for writing and remember the handle."""
        fh = open(".".join((basename,) + suffix), 'w')
        handles.append(fh)
        return fh

    try:
        # summary output: the basename of the VCF with a variantEval.txt suffix
        outputfh = open_log('variantEval', 'txt')
        # log of sites that contribute to the NRS penalty: hom-ref and
        # no-calls at variant sites in the comparison set
        nrsfh = open_log('nrs', 'log')
        nrdfh = open_log('nrd', 'log')
        filteredfh = open_log('filtered', 'log')
        multifh = open_log('multiallelic', 'log')
        concordancefh = open_log('concordance', 'log')
        fieldsfh = open_log('fields', 'log')
        fieldsfh.write('set' + "\n")

        vcfobj = VcfFile(vcfilename)
        vcfh = gzip.open(vcfilename, 'r')
        handles.append(vcfh)

        vcfobj.parseMetaAndHeaderLines(vcfh)
        header = vcfobj.returnHeader() + "\n"

        for fh in (nrsfh, nrdfh, filteredfh, concordancefh, multifh):
            fh.write(header)

        samples = vcfobj.getSampleList()

        # one evaluator per (comparison, evaluation) sample pair
        vcf_sample_eval_objects = [VcfSampleEval(comparename, evalname, basename)
                                   for (comparename, evalname) in grouper(2, samples)]
        for evalObj in vcf_sample_eval_objects:
            evalObj.writeHeaders(header)

        totalrecords = 0
        set_re = re.compile(r';set=(\S+)')

        for vrec in vcfobj.yieldVcfRecordwithGenotypes(vcfh):
            # Multi-allelic records are logged but still analysed.  The old
            # test `',' in alt > 1` was a chained comparison that only
            # worked because Python 2 orders str above int.
            if ',' in vrec.getAlt():
                multifh.write(vrec.toStringwithGenotypes() + "\n")

            info = vrec.getInfo()

            # skip homoz reference calls unless you want to include them!
            if 'ReferenceInAll' in info and not options.includeRef:
                continue

            # if variant is filtered, skip it!
            if 'filterIn' in info and not options.includeFilter:
                filteredfh.write(vrec.toStringwithGenotypes() + "\n")
                continue
            if 'FilteredInAll' in info:
                filteredfh.write(vrec.toStringwithGenotypes() + "\n")
                continue

            # zipGenotypes returns [(samplename, vcfgenotype), ...]; adjacent
            # entries are concatenated into
            #   (compare_sample, compare_genotype, eval_sample, eval_genotype)
            # so the list lines up with vcf_sample_eval_objects.
            vrec_ziptuple = vrec.zipGenotypes(samples)
            compare_eval = [compare + evalu for (compare, evalu) in grouper(2, vrec_ziptuple)]

            # what set are you in?
            # NOTE(review): assumes every analysed record carries ';set=' in
            # its INFO column; a record without it still raises here.
            field = set_re.search(info).groups()[0]
            fieldsfh.write(field + "\n")
            totalrecords += 1

            for (genotype_tuple, evalObj) in izip(compare_eval, vcf_sample_eval_objects):
                comp_genotype = genotype_tuple[1]
                eval_genotype = genotype_tuple[3]

                (comp_allele1, comp_allele2) = comp_genotype.getAlleles()
                (eval_allele1, eval_allele2) = eval_genotype.getAlleles()

                eval_alleletype = typeofGenotype(eval_allele1, eval_allele2)
                comp_alleletype = typeofGenotype(comp_allele1, comp_allele2)

                # increment the cell count
                concordancetable[eval_alleletype, comp_alleletype] += 1
                evalObj.incrementcellcount(eval_alleletype, comp_alleletype)

                # NRS penalty: comparison has a variant genotype (1 or 2) but
                # the evaluation call is type 0 or type 3
                if comp_alleletype in (1, 2) and eval_alleletype in (0, 3):
                    outstring = vrec.toStringwithGenotypes() + "\n"
                    nrsfh.write(outstring)
                    evalObj.writeNrs(outstring)

                # Both sides called (types 0..2): equal types are concordant,
                # different types contribute to the NRD penalty.
                if eval_alleletype in (0, 1, 2) and comp_alleletype in (0, 1, 2):
                    outstring = vrec.toStringwithGenotypes() + "\n"
                    if eval_alleletype == comp_alleletype:
                        concordancefh.write(outstring)
                        evalObj.writeConcordance(outstring)
                    else:
                        nrdfh.write(outstring)
                        evalObj.writeNrd(outstring)

        for evalObj in vcf_sample_eval_objects:
            evalObj.writeEvalOutput()

        outputfh.write("total records analyzed: " + str(totalrecords) + "\n")

        outputfh.write("rows are eval genotypes columns comparison genotypes\n")
        outputfh.write("\t".join(['', 'AA', 'AB', 'BB', './.']) + "\n")
        for (i, gt) in enumerate(('AA', 'AB', 'BB', './.')):
            # slicing an np.matrix row gives a 1xN matrix, hence the nested list
            for r in concordancetable[i, :].tolist():
                outputfh.write(gt + "\t" + "\t".join(map(str, r)) + "\n")

        outputfh.write("matrix sum: \n")
        outputfh.write(str(np.sum(concordancetable)) + "\n")

        # now we figure out how many sites were called or not called
        calledtable[0, 0] = concordancetable[0:3, 0:3].sum()
        calledtable[0, 1] = concordancetable[0:3, 3].sum()
        calledtable[1, 0] = concordancetable[3, 0:3].sum()
        calledtable[1, 1] = concordancetable[3, 3]
        outputfh.write("\n")
        outputfh.write("rows are eval genotypes columns comparison genotypes\n")
        outputfh.write("\t".join(['', 'called', './.']) + "\n")
        for (i, gt) in enumerate(('called', './.')):
            for r in calledtable[i, :].tolist():
                outputfh.write(gt + "\t" + "\t".join(map(str, r)) + "\n")
        outputfh.write("matrix sum: \n")
        outputfh.write(str(np.sum(calledtable)) + "\n")

        outputfh.write("\n")

        if not options.matrixonly:
            discordance = concordancetable[0,1]+concordancetable[0,2]+concordancetable[1,0]+concordancetable[1,2]+concordancetable[2,0]+concordancetable[2,1]
            # NRD denominator: every called pair except the [0,0] cell
            total = concordancetable[0,1]+concordancetable[0,2]+concordancetable[1,0]+concordancetable[1,1]+ concordancetable[1,2]+concordancetable[2,0]+concordancetable[2,1] +concordancetable[2,2]

            variant_count_evaluation = concordancetable[1,1]+ concordancetable[1,2]+ concordancetable[2,1]+ concordancetable[2,2]
            variant_count_comparison = concordancetable[0,1]+concordancetable[0,2]+concordancetable[1,1]+concordancetable[1,2]+concordancetable[2,1]+concordancetable[2,2]+concordancetable[3,1]+concordancetable[3,2]

            # guard the divisions: an empty or all-reference input has zero denominators
            if total > 0:
                nrd = round((float(discordance) / float(total)) * 100, 2)
            else:
                nrd = "NA"
            if variant_count_comparison > 0:
                nrs = round(float(variant_count_evaluation) / float(variant_count_comparison) * 100, 2)
            else:
                nrs = "NA"

            outputfh.write("NRD: " + str(nrd) + " \n")
            outputfh.write("NRS " + str(nrs) + " \n")
    finally:
        for fh in handles:
            fh.close()
def main():
    """Print the data lines of a VCF that belong to one variant class.

    The meta and header lines are echoed first (unless ``--noheader``).  A
    data line is printed when its FILTER column equals ``--filter`` (if
    given) and its INFO column matches ``<--info>=(<--type>)`` (if ``--type``
    is given).  Without ``--type`` every line passing the filter is printed.

    With ``--noheader`` the header is neither parsed nor echoed and the check
    that ``--info`` is declared in the ##INFO headers is skipped, because
    there are no headers to check against.
    """
    usage = "usage: %prog [options] file.vcf\n print records belonging to a certain type of variant class (e.g. SNP) in a VCF file\n\n"
    parser = OptionParser(usage)
    parser.add_option(
        "--info",
        type="string",
        dest="infotag",
        help="INFO tag id that annotates what type of variant the VCF record is",
        default="TYPE",
    )
    parser.add_option("--type", type="string", dest="variantype", help="type of variant (SNP INS DEL)", default=None)
    parser.add_option(
        "--filter", type="string", dest="filter", help="extract records matching filter (default is None)", default=None
    )
    parser.add_option("--noheader", action="store_true", dest="noheader", help="VCF file  has no header file")

    (options, args) = parser.parse_args()
    if options.infotag == "":
        sys.stderr.write("provide a value for --info parameter!\n")
        sys.exit(1)
    if options.variantype == "":
        sys.stderr.write("provide a value of --type parameter!\n")
        sys.exit(1)
    if not args:
        parser.error("missing file.vcf argument")

    # build the variant-type pattern once rather than once per data line
    type_re = None
    if options.variantype is not None:
        type_re = re.compile(options.infotag + "=(" + options.variantype + ")")

    vcfilename = args[0]
    vcfh = open(vcfilename, "r")
    try:
        # instantiate a VcfFile object
        vcfobj = VcfFile(vcfilename)

        # options.noheader is None (not False) when the flag is absent
        if not options.noheader:
            # parse its metainfo lines (ones that begin with ##)
            vcfobj.parseMetaAndHeaderLines(vcfh)
            vcfobj.printMetaAndHeaderLines()

            infoids = [tag for (tag, description) in vcfobj.getMetaInfoDescription()]
            if options.infotag not in infoids and options.infotag != "QUAL":
                sys.stderr.write(options.infotag + " tag not in ##INFO headers!\n")
                sys.exit(1)

        for dataline in vcfobj.yieldVcfDataLine(vcfh):
            fields = dataline.strip().split("\t")
            # columns 7 and 8 of a VCF data line are FILTER and INFO
            (filtercode, info) = fields[6:8]
            if options.filter is not None and filtercode != options.filter:
                continue
            if type_re is not None and type_re.search(info) is None:
                continue
            print(dataline)
    finally:
        vcfh.close()