def main():
    """Parse command-line options, load annotations and run `parse_snps`.

    When a genome annotation (-g) is given, the genome FASTA (-f) becomes
    mandatory; both paths are checked before anything is loaded. The
    functional annotations (-q, -p, -t) are optional and are loaded only
    when the corresponding option is set.

    Invalid option combinations are reported through `parser.error`, which
    prints the usage and exits.
    """
    usage = "%(prog)s [options]"
    parser = argparse.ArgumentParser(
        usage=usage, description=desc, epilog=epilog,
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("-v", "--verbose", default=False, action="store_true",
                        help="verbose")
    parser.add_argument('--version', action='version', version='1.1')
    parser.add_argument("-g", "--gtf",
                        help="genome annotation gtf/gff [requires -f]")
    parser.add_argument("-f", "--fasta",
                        help="genome fasta [can be gzipped]")
    # No default is set, so o.input is None unless -i is given.
    parser.add_argument("-i", "--input", type=file,
                        help="input stream [stdin]")
    parser.add_argument("-o", "--out", default=sys.stdout,
                        help="output stream [stdout]")
    parser.add_argument("-p", "--pfam", default="",
                        help="pfam tblout file")
    parser.add_argument("-q", "--faa", default="",
                        help="proteome fasta (to get protein annotation)")
    parser.add_argument("-t", "--tab", default="",
                        help="tab-delimited annotation")

    o = parser.parse_args()
    if o.verbose:
        sys.stderr.write("Options: %s\n" % str(o))

    ctg2cds, id2gene, ctg2seq = {}, {}, {}
    if o.gtf:
        # The annotation is only used together with the genome sequence.
        if not o.fasta:
            parser.error("Fasta file (-f) is requeired!")
        elif not os.path.isfile(o.fasta):
            parser.error("No such file: %s" % o.fasta)
        ctg2seq = genome2dict(o.fasta)

        if not os.path.isfile(o.gtf):
            parser.error("No such file: %s" % o.gtf)
        # The loader is chosen by file extension: .gff vs anything else.
        if o.gtf.endswith(".gff"):
            id2gene, ctg2cds = load_gff(o.gtf)
        else:
            id2gene, ctg2cds = load_gtf(o.gtf)
        if o.verbose:
            sys.stderr.write("Loaded annotation of %s CDS from %s\n"
                             % (len(id2gene), o.gtf))

    # Separate dicts, so the three annotation mappings never alias each other.
    trans2ann, trans2pfam, trans2tab = {}, {}, {}
    if o.faa:
        trans2ann = load_fasta_headers(o.faa)
    if o.pfam:
        trans2pfam = load_pfam(o.pfam)
    if o.tab:
        trans2tab = load_tab(o.tab)

    parse_snps(o.input, o.out, ctg2cds, id2gene, ctg2seq, trans2ann,
               trans2pfam, trans2tab, o.verbose)
def main():
    """Parse options, validate the inputs and plot SNPs for every VCF file.

    Positional arguments are the VCF files; at least one is required and
    each must exist. The annotation given with -g is loaded with `load_gff`
    when its name ends with ".gff" and with `load_gtf` otherwise.

    Note that -v *disables* verbose output: the flag defaults to True and
    uses the "store_false" action.
    """
    usage = "usage: %prog [options] vcf1 [ vcf2 ... vcfN ]"
    parser = OptionParser(usage=usage, version="%prog 1.0")
    parser.add_option("-g", dest="gtf", help="genome annotation")
    parser.add_option("-f", dest="fasta", help="genome fasta")
    parser.add_option("-o", dest="outbase", default="plots",
                      help="output directory [%default]")
    parser.add_option("-s", dest="splitFn", default=False, action="store_true",
                      help="split fname (sheet name) by dot")
    parser.add_option("-w", dest="window", default=10, type=int,
                      help="window size in kb [%default]")
    parser.add_option("-p", dest="ext", default="png",
                      help="Supported: emf, eps, pdf, png, ps, raw, rgba, svg, svgz [%default]")
    parser.add_option("-v", dest="verbose", default=True, action="store_false")

    (o, args) = parser.parse_args()
    if o.verbose:
        sys.stderr.write("%s\nFiles to process: %s\n" % (str(o), ", ".join(args)))

    # At least one input file is needed and every one of them has to exist.
    if not args:
        parser.error("At least one input file has to be specified!")
    for fn in args:
        if not os.path.isfile(fn):
            parser.error("No such file: %s" % fn)

    # The annotation path is dereferenced unconditionally below, so report a
    # missing -g as a usage error instead of failing on None.endswith.
    if not o.gtf:
        parser.error("Genome annotation (-g) has to be specified!")

    # Load genome - only the contig sizes are needed.
    contig2size = get_contig2size(o.fasta)

    if o.gtf.endswith(".gff"):
        gene2position, contig2gene = load_gff(o.gtf)
    else:
        gene2position, contig2gene = load_gtf(o.gtf)

    for fn in args:
        sys.stdout.write("%s\n" % fn)
        snps2plot(fn, o.window, contig2gene, contig2size, o.outbase,
                  o.splitFn, o.ext, o.verbose)
def main():
    """Command-line entry point: validate the options and call `bam2counts`.

    The BAM file (-i) and the annotation (-g) are mandatory and must exist.
    With -r the genome FASTA (-f) is required too and is loaded with
    `genome2dict`.
    """
    parser = OptionParser(
        usage="usage: %prog [options]\nfor f in *.bam; do echo `date` $f; bam2counts.py -rv -i $f -g F.oxysporum.gtf > $f.genecounts.txt; done",
        version="%prog 1.0")
    parser.add_option("-i", dest="bam", default="", help="bam file")
    parser.add_option("-g", dest="gtf", default="",
                      help="genome annotation gtf/gff")
    parser.add_option("-r", dest="rpkm", default=False, action="store_true",
                      help="RPKM normalisation (reads per kb of gene per million of aligned reads)")
    parser.add_option("-f", dest="fasta", default="",
                      help="genome fasta [required if -r]")
    parser.add_option("-v", dest="verbose", default=False, action="store_true")

    opts, extra = parser.parse_args()
    if opts.verbose:
        sys.stderr.write("Options: %s\nArgs: %s\n" % (opts, extra))

    # Both input paths have to be given and point to regular files.
    for path in (opts.bam, opts.gtf):
        if not path:
            parser.error("Provide input file!")
        if not os.path.isfile(path):
            parser.error("No such file: %s" % path)

    genes, contig_cds, genome = {}, {}, {}

    # Annotation: the loader is picked by file extension.
    if opts.gtf:
        loader = load_gff if opts.gtf.endswith(".gff") else load_gtf
        genes, contig_cds = loader(opts.gtf)
        if opts.verbose:
            sys.stderr.write("Loaded annotation of %s CDS from %s\n"
                             % (len(genes), opts.gtf))

    # The genome sequence is only loaded for RPKM normalisation.
    if opts.rpkm:
        if not opts.fasta:
            parser.error("Specify genome fasta file!")
        if not os.path.isfile(opts.fasta):
            parser.error("No such file: %s" % opts.fasta)
        genome = genome2dict(opts.fasta)

    bam2counts(opts.bam, opts.rpkm, genes, contig_cds, genome, opts.verbose)
def main():
    """Parse command-line options, load annotations and run `parse_snps`.

    A genome annotation (-g) requires the genome FASTA (-f); both files are
    validated before loading. Protein (-q), Pfam (-p) and tab-delimited (-t)
    annotations are loaded only if their option was given.

    Usage problems are reported with `parser.error`, which prints the usage
    message and exits.
    """
    usage = "%(prog)s [options]"
    parser = argparse.ArgumentParser(
        usage=usage,
        description=desc,
        epilog=epilog,
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument("-v", "--verbose", default=False, action="store_true", help="verbose")
    parser.add_argument("--version", action="version", version="1.1")
    parser.add_argument("-g", "--gtf", help="genome annotation gtf/gff [requires -f]")
    parser.add_argument("-f", "--fasta", help="genome fasta [can be gzipped]")
    # Without an explicit default, o.input stays None when -i is omitted.
    parser.add_argument("-i", "--input", type=file, help="input stream [stdin]")
    parser.add_argument("-o", "--out", default=sys.stdout, help="output stream [stdout]")
    parser.add_argument("-p", "--pfam", default="", help="pfam tblout file")
    parser.add_argument("-q", "--faa", default="", help="proteome fasta (to get protein annotation)")
    parser.add_argument("-t", "--tab", default="", help="tab-delimited annotation")

    o = parser.parse_args()
    if o.verbose:
        sys.stderr.write("Options: %s\n" % str(o))

    ctg2cds, id2gene, ctg2seq = {}, {}, {}
    if o.gtf:
        # Load the genome first: the FASTA has to accompany the annotation.
        if not o.fasta:
            parser.error("Fasta file (-f) is requeired!")
        elif not os.path.isfile(o.fasta):
            parser.error("No such file: %s" % o.fasta)
        ctg2seq = genome2dict(o.fasta)

        # Then the annotation itself; format is decided by the extension.
        if not os.path.isfile(o.gtf):
            parser.error("No such file: %s" % o.gtf)
        if o.gtf.endswith(".gff"):
            id2gene, ctg2cds = load_gff(o.gtf)
        else:
            id2gene, ctg2cds = load_gtf(o.gtf)
        if o.verbose:
            sys.stderr.write("Loaded annotation of %s CDS from %s\n" % (len(id2gene), o.gtf))

    # Independent dicts: a chained assignment would bind all three names to
    # the very same object.
    trans2ann, trans2pfam, trans2tab = {}, {}, {}
    if o.faa:
        trans2ann = load_fasta_headers(o.faa)
    if o.pfam:
        trans2pfam = load_pfam(o.pfam)
    if o.tab:
        trans2tab = load_tab(o.tab)

    parse_snps(o.input, o.out, ctg2cds, id2gene, ctg2seq, trans2ann, trans2pfam, trans2tab, o.verbose)
def main():
    """Parse options, load the optional genome annotation and run `parse_vcf`.

    If an annotation (-g) is supplied, the genome FASTA (-f) is mandatory;
    both paths are validated and loaded before the VCF is parsed.

    The -n, -b and -v flags all default to True and use "store_false", so
    passing them switches the respective behaviour off.
    """
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage, version="%prog 1.0")
    parser.add_option("-g", dest="gtf",
                      help="genome annotation gtf/gff [requires -f]")
    parser.add_option("-f", dest="fasta",
                      help="genome fasta [can be gzipped]")
    parser.add_option("-i", dest="fpath",
                      help="input file [stdin]")
    parser.add_option("-o", dest="outfn",
                      help="output fname [stdout]")
    parser.add_option("-d", dest="minDepth", default=10, type=int,
                      help="minimal depth [%default]")
    parser.add_option("-m", dest="minFreq", default=0.8, type=float,
                      help="min frequency of alternative base [%default]")
    parser.add_option("-n", dest="indels", default=True, action="store_false",
                      help="ignore indels")
    parser.add_option("-b", dest="bothStrands", default=True, action="store_false",
                      help="report events confirmed by single strand algs")
    parser.add_option("-v", dest="verbose", default=True, action="store_false")

    (o, args) = parser.parse_args()
    if o.verbose:
        sys.stderr.write("%s\n" % (str(o),))

    ctg2cds, id2gene, ctg2seq = {}, {}, {}
    if o.gtf:
        # The genome sequence has to be provided along with the annotation.
        if not o.fasta:
            parser.error("Fasta file (-f) is requeired!")
        elif not os.path.isfile(o.fasta):
            parser.error("No such file: %s" % o.fasta)
        ctg2seq = genome2dict(o.fasta)

        if not os.path.isfile(o.gtf):
            parser.error("No such file: %s" % o.gtf)
        # The loader is chosen by file extension.
        if o.gtf.endswith(".gff"):
            id2gene, ctg2cds = load_gff(o.gtf)
        else:
            id2gene, ctg2cds = load_gtf(o.gtf)
        if o.verbose:
            sys.stderr.write("Loaded annotation of %s CDS from %s\n"
                             % (len(id2gene), o.gtf))

    parse_vcf(o.fpath, o.outfn, ctg2cds, id2gene, ctg2seq, o.minDepth,
              o.minFreq, o.indels, o.bothStrands)
def main():
    """Validate the command line and run `snps2plot` on each VCF file.

    The positional arguments are VCF files (one or more, all must exist).
    The genome FASTA (-f) is passed to `get_contig2size`, and the annotation
    (-g) is read with `load_gff` for ".gff" files or `load_gtf` otherwise.

    The -v flag defaults to True with action "store_false", i.e. giving -v
    silences the diagnostic output.
    """
    usage = "usage: %prog [options] vcf1 [ vcf2 ... vcfN ]"
    parser = OptionParser(usage=usage, version="%prog 1.0")
    parser.add_option("-g", dest="gtf",
                      help="genome annotation")
    parser.add_option("-f", dest="fasta",
                      help="genome fasta")
    parser.add_option("-o", dest="outbase", default="plots",
                      help="output directory [%default]")
    parser.add_option("-s", dest="splitFn", default=False, action="store_true",
                      help="split fname (sheet name) by dot")
    parser.add_option("-w", dest="window", default=10, type=int,
                      help="window size in kb [%default]")
    parser.add_option("-p", dest="ext", default="png",
                      help="Supported: emf, eps, pdf, png, ps, raw, rgba, svg, svgz [%default]")
    parser.add_option("-v", dest="verbose", default=True, action="store_false")

    (o, args) = parser.parse_args()
    if o.verbose:
        sys.stderr.write("%s\nFiles to process: %s\n" % (str(o), ", ".join(args)))

    # Check that there is any input file at all ...
    if not args:
        parser.error("At least one input file has to be specified!")
    # ... and that each of them exists; name the offending path.
    for fn in args:
        if not os.path.isfile(fn):
            parser.error("No such file: %s" % fn)

    # o.gtf is used unconditionally further down; fail with a usage error
    # rather than an AttributeError on None when -g was not given.
    if not o.gtf:
        parser.error("Genome annotation (-g) has to be specified!")

    # Load genome - in fact only the contig sizes are needed.
    contig2size = get_contig2size(o.fasta)

    if o.gtf.endswith(".gff"):
        gene2position, contig2gene = load_gff(o.gtf)
    else:
        gene2position, contig2gene = load_gtf(o.gtf)

    for fn in args:
        sys.stdout.write("%s\n" % fn)
        snps2plot(fn, o.window, contig2gene, contig2size, o.outbase,
                  o.splitFn, o.ext, o.verbose)
def main():
    """Entry point: parse the options, load the inputs and run `bam2counts`.

    Required: BAM file (-i) and gtf/gff annotation (-g), both existing
    files. The genome FASTA (-f) is needed, and loaded, only when RPKM
    normalisation (-r) is requested.
    """
    usage_text = "usage: %prog [options]\nfor f in *.bam; do echo `date` $f; bam2counts.py -rv -i $f -g F.oxysporum.gtf > $f.genecounts.txt; done"
    cli = OptionParser(usage=usage_text, version="%prog 1.0")
    cli.add_option("-i", dest="bam", default="", help="bam file")
    cli.add_option("-g", dest="gtf", default="", help="genome annotation gtf/gff")
    cli.add_option(
        "-r", dest="rpkm", default=False, action="store_true",
        help="RPKM normalisation (reads per kb of gene per million of aligned reads)")
    cli.add_option("-f", dest="fasta", default="", help="genome fasta [required if -r]")
    cli.add_option("-v", dest="verbose", default=False, action="store_true")

    options, positional = cli.parse_args()
    if options.verbose:
        sys.stderr.write("Options: %s\nArgs: %s\n" % (options, positional))

    # Mandatory inputs: each must be named and be an existing file.
    for required in (options.bam, options.gtf):
        if not required:
            cli.error("Provide input file!")
        if not os.path.isfile(required):
            cli.error("No such file: %s" % required)

    id2gene, ctg2cds, ctg2seq = {}, {}, {}

    if options.gtf:
        # .gff files go through load_gff, everything else through load_gtf.
        read_annotation = load_gff if options.gtf.endswith(".gff") else load_gtf
        id2gene, ctg2cds = read_annotation(options.gtf)
        if options.verbose:
            sys.stderr.write("Loaded annotation of %s CDS from %s\n"
                             % (len(id2gene), options.gtf))

    if options.rpkm:
        # RPKM needs the genome sequence.
        if not options.fasta:
            cli.error("Specify genome fasta file!")
        if not os.path.isfile(options.fasta):
            cli.error("No such file: %s" % options.fasta)
        ctg2seq = genome2dict(options.fasta)

    bam2counts(options.bam, options.rpkm, id2gene, ctg2cds, ctg2seq,
               options.verbose)
def process(fnames, faa, pfam, gtf, log2th, splitFn, skipExons, verbose):
    """Print a per-gene table of counts and log2 ratios against the first sample.

    Args:
        fnames: count files; the first one is the reference sample. Must not
            be empty (the header is built from the first sample name).
        faa: proteome FASTA passed to `load_fasta_headers`, or a false value.
        pfam: Pfam table passed to `load_pfam`, or a false value.
        gtf: gtf/gff annotation, or a false value; ".gff" files are read with
            `load_gff`, anything else with `load_gtf`.
        log2th: log2 threshold. When set, a gene is reported only if some
            sample has a log2 ratio outside (-log2th, log2th) or a zero
            count is involved; when false, any computed ratio qualifies.
        splitFn: if true, sample names are the file names up to the first dot.
        skipExons: if true, skip entries whose id ends with ".<digits>".
        verbose: if true, progress messages are written to stderr.

    The table is written to stdout, one tab-delimited line per reported gene.
    """
    # Functional annotation (kept as two independent dicts).
    trans2ann, trans2pfam = {}, {}
    if faa:
        trans2ann = load_fasta_headers(faa)
    if pfam:
        trans2pfam = load_pfam(pfam)

    ctg2cds, id2gene = {}, {}
    if gtf:
        if gtf.endswith(".gff"):
            id2gene, ctg2cds = load_gff(gtf)
        else:
            id2gene, ctg2cds = load_gtf(gtf)
        if verbose:
            # Report the `gtf` argument; there is no `o` in this scope.
            sys.stderr.write("Loaded annotation of %s CDS from %s\n"
                             % (len(id2gene), gtf))

    # Sample names.
    samples = []
    for fn in fnames:
        if splitFn:
            fn = fn.split(".")[0]
        samples.append(fn)

    # Gene counts, accumulated over all files.
    if verbose:
        sys.stderr.write("Loading gene counts...\n")
    gene2counts = {}
    for fn in fnames:
        if verbose:
            sys.stderr.write(" %s \r" % fn)
        gene2counts = load_counts(fn, gene2counts)

    # Header.
    if verbose:
        sys.stderr.write("Calculating...\n")
    header = "#gene\tcoordinate\t%s" % samples[0]
    for s in samples[1:]:
        header += "\t%s\tlog2(%s/%s)" % (s, s, samples[0])
    header += "\tannotation\tpfam"
    sys.stdout.write(header + "\n")

    # Per-gene scores.
    for gene in sorted(gene2counts):
        # When only genes are requested, skip exon entries.
        if skipExons and gene.split(".")[-1].isdigit():
            continue

        coord, counts = gene2counts[gene]
        passed = False
        line = "%s\t%s\t%.2f" % (gene, coord, counts[0])
        for c in counts[1:]:
            line += "\t%.2f" % c
            if not counts[0]:
                # Reference count is 0: the ratio is undefined.
                line += "\t+NA"
                passed = True
            elif not c:
                line += "\t-NA"
                passed = True
            else:
                log2 = log(c * 1.0 / counts[0], 2)
                line += "\t%.2f" % log2
                # Keep lines with log2 >= log2th or log2 <= -log2th;
                # without a threshold every computed ratio passes.
                if log2th:
                    if not -log2th < log2 < log2th:
                        passed = True
                else:
                    passed = True

        # Print only if the gene passed the filtering.
        if passed:
            # Distinct local names so the `pfam` argument is not shadowed.
            ann = pfam_ann = ""
            if gene in id2gene:
                ann = id2gene[gene][-1]  # contig, cdsList, strand, function
            if gene in trans2ann:
                ann = trans2ann[gene]
            if gene in trans2pfam:
                pfam_ann = trans2pfam[gene]
            line += "\t%s\t%s" % (ann, pfam_ann)
            sys.stdout.write(line + "\n")
def process(fnames, expCov, genome, faa, pfam, gtf, log2th, splitFn, skipExons, verbose):
    """Print per-gene counts with their log2 ratio against the expected coverage.

    If `expCov` is false it is derived from `get_contig2size_samtools(genome)`
    as total read count * 1000 / total contig size. Every sample count is
    then compared with that value; a gene is printed to stdout when a count
    is zero, when no `log2th` is set, or when some log2 ratio lies outside
    (-log2th, log2th). Progress goes to stderr when `verbose` is true.
    """
    # Optional functional annotation.
    trans2ann = load_fasta_headers(faa) if faa else {}
    trans2pfam = load_pfam(pfam) if pfam else {}

    # Optional structural annotation; the loader depends on the extension.
    ctg2cds, id2gene = {}, {}
    if gtf:
        read_annotation = load_gff if gtf.endswith(".gff") else load_gtf
        id2gene, ctg2cds = read_annotation(gtf)
        if verbose:
            sys.stderr.write("Loaded annotation of %s CDS from %s\n" % (len(id2gene), gtf))

    # Sample names, optionally cut at the first dot.
    samples = [name.split(".")[0] if splitFn else name for name in fnames]

    # Expected coverage (RPKM) from the (reads, size) pairs per contig.
    if not expCov:
        contig_stats = get_contig2size_samtools(genome)
        gsize = sum(size for _reads, size in contig_stats.itervalues())
        rcount = sum(reads for reads, _size in contig_stats.itervalues())
        expCov = rcount * 10.0**3 / gsize
    if verbose:
        sys.stderr.write("Set expected coverage [RPKM]: %.3f\n" % expCov)

    # Gene counts from all files end up in a single mapping.
    if verbose:
        sys.stderr.write("Loading gene counts...\n")
    gene2counts = {}
    for path in fnames:
        if verbose:
            sys.stderr.write(" %s \r" % path)
        gene2counts = load_counts(path, gene2counts)

    # Header line.
    if verbose:
        sys.stderr.write("Calculating...\n")
    columns = ["#gene\tcoordinate"]
    columns.extend("%s\tlog2 vs mean" % (sample,) for sample in samples)
    columns.append("annotation\tpfam")
    sys.stdout.write("\t".join(columns) + "\n")

    # One line per gene that passes the filter.
    for gene in sorted(gene2counts):
        if skipExons and gene.split(".")[-1].isdigit():
            continue  # exon entry while only genes were requested

        coord, counts = gene2counts[gene]
        fields = ["%s\t%s" % (gene, coord)]
        passed = False
        for count in counts:
            fields.append("%.2f" % count)
            if not count:
                fields.append("-NA")
                passed = True
                continue
            ratio = log(count * 1.0 / expCov, 2)
            fields.append("%.2f" % ratio)
            # No threshold: everything passes; otherwise only ratios
            # outside the open interval (-log2th, log2th).
            if not log2th or not -log2th < ratio < log2th:
                passed = True

        if not passed:
            continue

        annotation = domains = ""
        if gene in id2gene:
            annotation = id2gene[gene][-1]  # contig, cdsList, strand, function
        if gene in trans2ann:
            annotation = trans2ann[gene]
        if gene in trans2pfam:
            domains = trans2pfam[gene]
        fields.append("%s\t%s" % (annotation, domains))
        sys.stdout.write("\t".join(fields) + "\n")
def main():
    """Summarise differentially expressed genes across several result tables.

    Every input file is read as a tab-delimited table with one header line
    and eight columns per row: id, baseMean, baseMeanA, baseMeanB,
    foldChange, log2FoldChange, pval and padj. A gene is reported when its
    padj is below the FDR cut-off (-f) in at least one file; rows whose padj
    is not a number are kept for the expression columns but never count as
    differentially expressed.

    The sample name of a file is its name up to the first dot. For each
    reported gene the output line (stdout) holds the gene id, its locus (if
    an annotation was given with -a and contains the gene), the number of
    files in which it was significant, baseMeanA from the first file the
    gene occurred in, the log2 fold changes and the annotated function.
    """
    usage = "%prog [options] file1 [file2 ... fileN]"
    parser = OptionParser(usage)
    parser.add_option("-a", dest="annotation", default='', type=str,
                      help="GTF annotation file [%default]")
    parser.add_option("-f", dest="fdr", default=1e-04, type=float,
                      help="false dicovery rate [%default]")

    (o, fnames) = parser.parse_args()
    sys.stderr.write("Options: %s\nFiles to be processed: %s\n" % (o, fnames))
    if not fnames:
        sys.exit("Speficy at least one input file")

    # The annotation is optional: start from an empty mapping so the lookup
    # below also works when -a was not given.
    gene2position = {}
    if o.annotation.endswith(".gff"):
        gene2position, _contig2gene = load_gff(o.annotation)
    elif o.annotation:
        gene2position, _contig2gene = load_gtf(o.annotation)

    samples = []
    gene2fc = {}     # gene -> log2 fold changes (as read, strings)
    gene2reads = {}  # gene -> (baseMeanA, baseMeanB) per occurrence
    de2sample = {}   # gene -> samples in which padj < fdr
    sys.stderr.write("Processing files...\n")
    for fn in fnames:
        sys.stderr.write("%s\t%s\n" % (datetime.now(), fn))
        s = fn.split('.')[0]
        samples.append(s)
        with open(fn) as handle:
            next(handle, None)  # skip the header line
            for l in handle:
                (gene_id, baseMean, baseMeanA, baseMeanB, foldChange,
                 log2FoldChange, pval, padj) = l.split('\t')
                baseMeanA, baseMeanB = float(baseMeanA), float(baseMeanB)
                # Expression info is stored for every row.
                gene2fc.setdefault(gene_id, []).append(log2FoldChange)
                gene2reads.setdefault(gene_id, []).append((baseMeanA, baseMeanB))
                try:
                    padj = float(padj)
                except ValueError:
                    # Non-numeric padj: cannot be called significant.
                    continue
                if padj < o.fdr:
                    de2sample.setdefault(gene_id, []).append(s)

    header = "#gene\tlocus\t#de\tcontrol"
    for s in samples:
        header += "\t%s log2" % s
    header += "\tannotation"
    sys.stdout.write(header + "\n")

    for gene in sorted(de2sample):
        # Reset both per gene so nothing leaks over from the previous one.
        function = ''
        locus = ''
        if gene in gene2position:
            contig, coords, strand, function = gene2position[gene]
            locus = "%s:%s-%s %s" % (contig, coords[0][0], coords[-1][-1], strand)

        fields = [gene, locus, "%s" % len(de2sample[gene])]
        # Control expression: baseMeanA of the first occurrence only.
        fields.append("%s" % gene2reads[gene][0][0])
        fields.extend("%s" % fc for fc in gene2fc[gene])
        fields.append("%s" % function)
        sys.stdout.write("\t".join(fields) + "\n")
def main():
    """Parse options, load the candidate SNPs from VCF files and verify them.

    Positional arguments are VCF files (at least one, all must exist). When
    an annotation (-g) is supplied the genome FASTA (-f) is mandatory, and
    both are loaded before `load_vcf` and `check_snps` are called.

    The -n, -b and -v flags default to True and use "store_false", so
    passing them switches the respective behaviour off.
    """
    usage = "usage: %prog [options] *.vcf"
    parser = OptionParser(usage=usage, version="%prog 1.0")
    parser.add_option("-g", dest="gtf",
                      help="genome annotation gtf/gff [requires -f]")
    parser.add_option("-f", dest="fasta",
                      help="genome fasta")
    parser.add_option("-1", dest="bam1",
                      help="sample bam")
    parser.add_option("-2", dest="bam2",
                      help="reference bam")
    parser.add_option("-o", dest="outfn",
                      help="output fname [stdout]")
    parser.add_option("-d", dest="minDepth", default=5, type=int,
                      help="""minimal depth; note both samples need to have pass depth filtering [%default]""")
    parser.add_option("-m", dest="minFreq", default=0.8, type=float,
                      help="min frequency of alternative base [%default]")
    parser.add_option("-n", dest="indels", default=True, action="store_false",
                      help="ignore indels [%default]")
    parser.add_option("-b", dest="bothStrands", default=True, action="store_false",
                      help="report events confirmed by single strand algs")
    parser.add_option("-v", dest="verbose", default=True, action="store_false")

    (o, args) = parser.parse_args()
    if o.verbose:
        sys.stderr.write("%s\n" % (str(o),))

    if not args:
        parser.error("At least one vcf file has to be specified!")
    for fn in args:
        if not os.path.isfile(fn):
            parser.error("No such file: %s" % fn)

    ctg2cds, id2gene, ctg2seq = {}, {}, {}
    if o.gtf:
        # The genome sequence has to be provided along with the annotation.
        if not o.fasta:
            parser.error("Fasta file (-f) is requeired!")
        elif not os.path.isfile(o.fasta):
            parser.error("No such file: %s" % o.fasta)
        ctg2seq = genome2dict(o.fasta)

        if not os.path.isfile(o.gtf):
            parser.error("No such file: %s" % o.gtf)
        # The loader is chosen by file extension.
        if o.gtf.endswith(".gff"):
            id2gene, ctg2cds = load_gff(o.gtf)
        else:
            id2gene, ctg2cds = load_gtf(o.gtf)
        if o.verbose:
            sys.stderr.write("Loaded annotation of %s CDS from %s\n"
                             % (len(id2gene), o.gtf))

    # Coordinates of possible SNPs ...
    coords = load_vcf(args, o.indels)
    # ... which are then checked against both BAM files.
    check_snps(coords, o.bam1, o.bam2, o.fasta, o.outfn, ctg2cds, id2gene,
               ctg2seq, o.minDepth, o.minFreq, o.indels, o.bothStrands)
def process(fnames, expCov, genome, faa, pfam, gtf, log2th, splitFn, skipExons, verbose):
    """Write a table of gene counts and log2(count / expected coverage) to stdout.

    The expected coverage is `expCov` if it is true; otherwise it is computed
    from the per-contig (read count, size) pairs returned by
    `get_contig2size_samtools(genome)`. A gene line is emitted if any of its
    counts is zero, if `log2th` is false, or if any log2 ratio falls outside
    (-log2th, log2th). With `verbose`, progress is reported on stderr.
    """
    # Functional annotation, each source optional.
    trans2ann = {}
    trans2pfam = {}
    if faa:
        trans2ann = load_fasta_headers(faa)
    if pfam:
        trans2pfam = load_pfam(pfam)

    # Gene models, optional; gff and gtf have separate loaders.
    ctg2cds, id2gene = {}, {}
    if gtf:
        parse_annotation = load_gff if gtf.endswith(".gff") else load_gtf
        id2gene, ctg2cds = parse_annotation(gtf)
        if verbose:
            sys.stderr.write("Loaded annotation of %s CDS from %s\n" % (len(id2gene), gtf))

    # Column names for the samples.
    if splitFn:
        samples = [fname.split(".")[0] for fname in fnames]
    else:
        samples = list(fnames)

    # Expected coverage (RPKM).
    if not expCov:
        c2cs = get_contig2size_samtools(genome)
        gsize = sum(pair[1] for pair in _as_pairs(c2cs.itervalues()))
        rcount = sum(pair[0] for pair in _as_pairs(c2cs.itervalues()))
        expCov = rcount * 10.0**3 / gsize
    if verbose:
        sys.stderr.write("Set expected coverage [RPKM]: %.3f\n" % expCov)

    # Counts of all files are merged by load_counts.
    if verbose:
        sys.stderr.write("Loading gene counts...\n")
    gene2counts = {}
    for fname in fnames:
        if verbose:
            sys.stderr.write(" %s \r" % fname)
        gene2counts = load_counts(fname, gene2counts)

    if verbose:
        sys.stderr.write("Calculating...\n")
    header = "#gene\tcoordinate"
    header += "".join("\t%s\tlog2 vs mean" % (sample,) for sample in samples)
    header += "\tannotation\tpfam"
    sys.stdout.write(header + "\n")

    for gene in sorted(gene2counts):
        # Exon entries are dropped when only genes are wanted.
        if skipExons and gene.split(".")[-1].isdigit():
            continue

        coord, counts = gene2counts[gene]
        cells = ["%s\t%s" % (gene, coord)]
        report = False
        for value in counts:
            cells.append("%.2f" % value)
            if value:
                log2 = log(value * 1.0 / expCov, 2)
                cells.append("%.2f" % log2)
                if log2th:
                    # Only ratios outside (-log2th, log2th) qualify.
                    if not -log2th < log2 < log2th:
                        report = True
                else:
                    report = True
            else:
                cells.append("-NA")
                report = True

        if report:
            function = pfam_hit = ""
            if gene in id2gene:
                function = id2gene[gene][-1]  # contig, cdsList, strand, function
            if gene in trans2ann:
                function = trans2ann[gene]
            if gene in trans2pfam:
                pfam_hit = trans2pfam[gene]
            cells.append("%s\t%s" % (function, pfam_hit))
            sys.stdout.write("\t".join(cells) + "\n")


def _as_pairs(values):
    """Yield each value unpacked as a 2-tuple (same check as `for c, s in`)."""
    for first, second in values:
        yield first, second
def main():
    """Validate the command line, collect SNP candidates and run `check_snps`.

    At least one existing VCF file has to be given as a positional argument.
    An annotation (-g) requires the genome FASTA (-f); when present, both are
    loaded and handed to `check_snps` together with the two BAM files.

    -n, -b and -v are "store_false" flags with a True default: giving them
    turns indels, the both-strands requirement and verbosity off.
    """
    usage = "usage: %prog [options] *.vcf"
    parser = OptionParser(usage=usage, version="%prog 1.0")
    parser.add_option("-g", dest="gtf", help="genome annotation gtf/gff [requires -f]")
    parser.add_option("-f", dest="fasta", help="genome fasta")
    parser.add_option("-1", dest="bam1", help="sample bam")
    parser.add_option("-2", dest="bam2", help="reference bam")
    parser.add_option("-o", dest="outfn", help="output fname [stdout]")
    parser.add_option(
        "-d", dest="minDepth", default=5, type=int,
        help="""minimal depth; note both samples need to have pass depth filtering [%default]""")
    parser.add_option("-m", dest="minFreq", default=0.8, type=float,
                      help="min frequency of alternative base [%default]")
    parser.add_option("-n", dest="indels", default=True, action="store_false",
                      help="ignore indels [%default]")
    parser.add_option("-b", dest="bothStrands", default=True, action="store_false",
                      help="report events confirmed by single strand algs")
    parser.add_option("-v", dest="verbose", default=True, action="store_false")

    (o, args) = parser.parse_args()
    if o.verbose:
        sys.stderr.write("%s\n" % (str(o), ))

    if not args:
        parser.error("At least one vcf file has to be specified!")
    for fn in args:
        if not os.path.isfile(fn):
            parser.error("No such file: %s" % fn)

    ctg2cds, id2gene, ctg2seq = {}, {}, {}
    if o.gtf:
        # Load the genome: the FASTA is mandatory once -g is used.
        if not o.fasta:
            parser.error("Fasta file (-f) is requeired!")
        elif not os.path.isfile(o.fasta):
            parser.error("No such file: %s" % o.fasta)
        ctg2seq = genome2dict(o.fasta)

        # Load the annotation with the loader matching its extension.
        if not os.path.isfile(o.gtf):
            parser.error("No such file: %s" % o.gtf)
        if o.gtf.endswith(".gff"):
            id2gene, ctg2cds = load_gff(o.gtf)
        else:
            id2gene, ctg2cds = load_gtf(o.gtf)
        if o.verbose:
            sys.stderr.write("Loaded annotation of %s CDS from %s\n" % (len(id2gene), o.gtf))

    # Load possible SNP coordinates, then check them against the alignments.
    coords = load_vcf(args, o.indels)
    check_snps(coords, o.bam1, o.bam2, o.fasta, o.outfn, ctg2cds, id2gene,
               ctg2seq, o.minDepth, o.minFreq, o.indels, o.bothStrands)
def process(fnames, faa, pfam, gtf, log2th, splitFn, skipExons, verbose):
    """Print per-gene counts and log2 ratios of each sample vs the first one.

    Args:
        fnames: count files; the first is the reference sample. Must not be
            empty, since the header uses the first sample name.
        faa: proteome FASTA for `load_fasta_headers`, or a false value.
        pfam: Pfam table for `load_pfam`, or a false value.
        gtf: gtf/gff annotation or a false value; names ending in ".gff" are
            read with `load_gff`, all others with `load_gtf`.
        log2th: log2 threshold; when set, only genes with some ratio outside
            (-log2th, log2th), or with a zero count involved, are printed.
            When false, any computed ratio makes the gene pass.
        splitFn: if true, sample names are cut at the first dot.
        skipExons: if true, ids ending with ".<digits>" are skipped.
        verbose: if true, progress is written to stderr.

    Output is a tab-delimited table on stdout.
    """
    # Functional annotation; two separate dicts rather than one shared object.
    trans2ann, trans2pfam = {}, {}
    if faa:
        trans2ann = load_fasta_headers(faa)
    if pfam:
        trans2pfam = load_pfam(pfam)

    ctg2cds, id2gene = {}, {}
    if gtf:
        if gtf.endswith(".gff"):
            id2gene, ctg2cds = load_gff(gtf)
        else:
            id2gene, ctg2cds = load_gtf(gtf)
        if verbose:
            # Use the `gtf` argument: no options object exists in this scope.
            sys.stderr.write("Loaded annotation of %s CDS from %s\n" % (len(id2gene), gtf))

    # Sample names.
    samples = []
    for fn in fnames:
        if splitFn:
            fn = fn.split(".")[0]
        samples.append(fn)

    # Gene counts.
    if verbose:
        sys.stderr.write("Loading gene counts...\n")
    gene2counts = {}
    for fn in fnames:
        if verbose:
            sys.stderr.write(" %s \r" % fn)
        gene2counts = load_counts(fn, gene2counts)

    # Header.
    if verbose:
        sys.stderr.write("Calculating...\n")
    header = "#gene\tcoordinate\t%s" % samples[0]
    for s in samples[1:]:
        header += "\t%s\tlog2(%s/%s)" % (s, s, samples[0])
    header += "\tannotation\tpfam"
    sys.stdout.write(header + "\n")

    # Per-gene scores.
    for gene in sorted(gene2counts):
        # If only genes are requested, skip exon entries.
        if skipExons and gene.split(".")[-1].isdigit():
            continue

        coord, counts = gene2counts[gene]
        passed = False
        line = "%s\t%s\t%.2f" % (gene, coord, counts[0])
        for c in counts[1:]:
            line += "\t%.2f" % c
            if not counts[0]:
                # Zero reference count: no ratio can be computed.
                line += "\t+NA"
                passed = True
            elif not c:
                line += "\t-NA"
                passed = True
            else:
                log2 = log(c * 1.0 / counts[0], 2)
                line += "\t%.2f" % log2
                # Pass lines with log2 >= log2th or log2 <= -log2th; with
                # no threshold every computed ratio passes.
                if log2th:
                    if not -log2th < log2 < log2th:
                        passed = True
                else:
                    passed = True

        # Print only what passed the filtering.
        if passed:
            # Local names chosen so the `pfam` argument is not overwritten.
            ann = pfam_ann = ""
            if gene in id2gene:
                ann = id2gene[gene][-1]  # contig, cdsList, strand, function
            if gene in trans2ann:
                ann = trans2ann[gene]
            if gene in trans2pfam:
                pfam_ann = trans2pfam[gene]
            line += "\t%s\t%s" % (ann, pfam_ann)
            sys.stdout.write(line + "\n")