Ejemplo n.º 1
def main():
    """Command-line entry point: parse options, load annotations, run parse_snps.

    When ``-g/--gtf`` is supplied, ``-f/--fasta`` becomes mandatory: the genome
    is loaded with ``genome2dict`` and the annotation with ``load_gff`` (file
    name ends with ".gff") or ``load_gtf`` (any other name). ``-q/--faa``,
    ``-p/--pfam`` and ``-t/--tab`` each load an optional functional
    annotation. All loaded data is then handed to ``parse_snps``.

    Missing or inconsistent inputs are reported through ``parser.error``,
    which prints the usage message and exits.
    """
    usage = "%(prog)s [options]"
    # `desc` and `epilog` are read from module scope.
    parser = argparse.ArgumentParser(
        usage=usage, description=desc, epilog=epilog,
        formatter_class=argparse.RawTextHelpFormatter)

    parser.add_argument("-v", "--verbose", default=False, action="store_true", help="verbose")
    parser.add_argument("--version", action="version", version="1.1")
    parser.add_argument("-g", "--gtf",
                        help="genome annotation gtf/gff [requires -f]")
    parser.add_argument("-f", "--fasta",
                        help="genome fasta [can be gzipped]")
    # argparse.FileType replaces the Python 2-only builtin `file`; the default
    # is left as None, exactly as before.
    parser.add_argument("-i", "--input", type=argparse.FileType("r"),  # default=sys.stdin,
                        help="input stream [stdin]")
    # NOTE(review): a user-supplied -o value arrives as a path string while the
    # default is a stream object - confirm parse_snps copes with both.
    parser.add_argument("-o", "--out", default=sys.stdout,
                        help="output stream [stdout]")
    parser.add_argument("-p", "--pfam", default="",
                        help="pfam tblout file")
    parser.add_argument("-q", "--faa", default="",
                        help="proteome fasta (to get protein annotation)")
    parser.add_argument("-t", "--tab", default="",
                        help="tab-delimited annotation")
    o = parser.parse_args()
    if o.verbose:
        sys.stderr.write("Options: %s\n" % str(o))

    ctg2cds, id2gene, ctg2seq = {}, {}, {}
    if o.gtf:  # if annotation
        # load genome - fasta has to be provided together with the annotation
        if not o.fasta:
            parser.error("Fasta file (-f) is required!")
        if not os.path.isfile(o.fasta):
            parser.error("No such file: %s" % o.fasta)
        ctg2seq = genome2dict(o.fasta)

        # load genome annotation
        if not os.path.isfile(o.gtf):  # check if correct file
            parser.error("No such file: %s" % o.gtf)
        # the parser is selected by file extension
        if o.gtf.endswith(".gff"):
            id2gene, ctg2cds = load_gff(o.gtf)
        else:
            id2gene, ctg2cds = load_gtf(o.gtf)
        if o.verbose:
            sys.stderr.write("Loaded annotation of %s CDS from %s\n" % (len(id2gene), o.gtf))

    # load function annotation; three separate dicts so they never alias
    trans2ann, trans2pfam, trans2tab = {}, {}, {}
    if o.faa:
        trans2ann = load_fasta_headers(o.faa)
    if o.pfam:
        trans2pfam = load_pfam(o.pfam)
    if o.tab:
        trans2tab = load_tab(o.tab)
    # parse pileup
    parse_snps(o.input, o.out, ctg2cds, id2gene, ctg2seq, trans2ann, trans2pfam,
               trans2tab, o.verbose)
Ejemplo n.º 2
def main():
    """Command-line entry point: plot SNPs from one or more VCF files.

    Positional arguments are the VCF files; each must exist. The genome fasta
    (``-f``) is passed to ``get_contig2size`` and the annotation (``-g``) to
    ``load_gff`` (file name ends with ".gff") or ``load_gtf`` (any other
    name). Each input file is then processed by ``snps2plot``.

    Missing files or options are reported through ``parser.error``, which
    prints the usage message and exits.
    """
    usage = "usage: %prog [options] vcf1 [ vcf2 ... vcfN ]"
    parser = OptionParser(usage=usage,
                          version="%prog 1.0")  # allow_interspersed_args=True

    parser.add_option("-g", dest="gtf", help="genome annotation")
    parser.add_option("-f", dest="fasta", help="genome fasta")
    parser.add_option("-o", dest="outbase", default="plots",
                      help="output directory [%default]")
    parser.add_option("-s", dest="splitFn", default=False, action="store_true",
                      help="split fname (sheet name) by dot")
    parser.add_option("-w", dest="window", default=10, type=int,
                      help="window size in kb [%default]")
    parser.add_option("-p", dest="ext", default="png",
                      help="Supported: emf, eps, pdf, png, ps, raw, rgba, svg, svgz [%default]")
    # NOTE: verbosity is ON by default; passing -v switches it off.
    parser.add_option("-v", dest="verbose", default=True, action="store_false")

    (o, args) = parser.parse_args()
    if o.verbose:
        sys.stderr.write("%s\nFiles to process: %s\n" %
                         (str(o), ", ".join(args)))

    # check if any input file
    if not args:
        parser.error("At least one input file has to be specified!")

    # check if files exists
    for fn in args:
        if not os.path.isfile(fn):
            parser.error("No such file: %s" % fn)

    # both the genome and the annotation are used unconditionally below, so
    # fail with a clear message instead of a traceback when one is missing
    if not o.fasta:
        parser.error("Genome fasta (-f) is required!")
    if not os.path.isfile(o.fasta):
        parser.error("No such file: %s" % o.fasta)
    if not o.gtf:
        parser.error("Genome annotation (-g) is required!")
    if not os.path.isfile(o.gtf):
        parser.error("No such file: %s" % o.gtf)

    # load genome - in fact need only contig sizes
    contig2size = get_contig2size(o.fasta)
    # load gtf/gff - the parser is selected by file extension
    if o.gtf.endswith(".gff"):
        gene2position, contig2gene = load_gff(o.gtf)
    else:
        gene2position, contig2gene = load_gtf(o.gtf)

    # process vcf
    for fn in args:
        print(fn)
        snps2plot(fn, o.window, contig2gene, contig2size, o.outbase, o.splitFn,
                  o.ext, o.verbose)
Ejemplo n.º 3
def main():
    """Command-line entry point: count reads per gene from a BAM file.

    Both the BAM file (``-i``) and the annotation (``-g``) are required and
    must exist. The annotation is loaded with ``load_gff`` (file name ends
    with ".gff") or ``load_gtf`` (any other name). With ``-r`` the genome
    fasta (``-f``) is required as well and is loaded with ``genome2dict``.
    The work itself is done by ``bam2counts``.

    Missing files or options are reported through ``parser.error``, which
    prints the usage message and exits.
    """
    usage = "usage: %prog [options]\nfor f in *.bam; do echo `date` $f; bam2counts.py -rv -i $f -g F.oxysporum.gtf > $f.genecounts.txt; done"
    parser = OptionParser(usage=usage,
                          version="%prog 1.0")  # allow_interspersed_args=True

    parser.add_option("-i", dest="bam", default="", help="bam file")
    parser.add_option("-g", dest="gtf", default="",
                      help="genome annotation gtf/gff")
    parser.add_option("-r", dest="rpkm", default=False, action="store_true",
                      help="RPKM normalisation (reads per kb of gene per million of aligned reads)")
    parser.add_option("-f", dest="fasta", default="",
                      help="genome fasta [required if -r]")
    parser.add_option("-v", dest="verbose", default=False, action="store_true")

    (o, fnames) = parser.parse_args()
    if o.verbose:
        sys.stderr.write("Options: %s\nArgs: %s\n" % (o, fnames))

    # bam and annotation are both mandatory
    for fn in (o.bam, o.gtf):
        if not fn:
            parser.error("Provide input file!")
        if not os.path.isfile(fn):
            parser.error("No such file: %s" % fn)

    ctg2cds, id2gene, ctg2seq = {}, {}, {}
    # load gtf/gff - the parser is selected by file extension
    if o.gtf.endswith(".gff"):
        id2gene, ctg2cds = load_gff(o.gtf)
    else:
        id2gene, ctg2cds = load_gtf(o.gtf)
    if o.verbose:
        sys.stderr.write("Loaded annotation of %s CDS from %s\n" %
                         (len(id2gene), o.gtf))

    # the genome is loaded only when RPKM normalisation was requested
    if o.rpkm:
        if not o.fasta:
            parser.error("Specify genome fasta file!")
        if not os.path.isfile(o.fasta):
            parser.error("No such file: %s" % o.fasta)
        ctg2seq = genome2dict(o.fasta)

    bam2counts(o.bam, o.rpkm, id2gene, ctg2cds, ctg2seq, o.verbose)
Ejemplo n.º 4
def main():
    """Command-line entry point: parse options, load annotations, run parse_snps.

    When ``-g/--gtf`` is supplied, ``-f/--fasta`` becomes mandatory: the genome
    is loaded with ``genome2dict`` and the annotation with ``load_gff`` (file
    name ends with ".gff") or ``load_gtf`` (any other name). ``-q/--faa``,
    ``-p/--pfam`` and ``-t/--tab`` each load an optional functional
    annotation. All loaded data is then handed to ``parse_snps``.

    Missing or inconsistent inputs are reported through ``parser.error``,
    which prints the usage message and exits.
    """
    usage = "%(prog)s [options]"
    # `desc` and `epilog` are read from module scope.
    parser = argparse.ArgumentParser(
        usage=usage,
        description=desc,
        epilog=epilog,
        formatter_class=argparse.RawTextHelpFormatter,
    )

    parser.add_argument("-v", "--verbose", default=False, action="store_true", help="verbose")
    parser.add_argument("--version", action="version", version="1.1")
    parser.add_argument("-g", "--gtf", help="genome annotation gtf/gff [requires -f]")
    parser.add_argument("-f", "--fasta", help="genome fasta [can be gzipped]")
    # argparse.FileType replaces the Python 2-only builtin `file`; the default
    # is left as None, exactly as before.
    parser.add_argument("-i", "--input", type=argparse.FileType("r"),
                        help="input stream [stdin]")  # default=sys.stdin,
    # NOTE(review): a user-supplied -o value arrives as a path string while the
    # default is a stream object - confirm parse_snps copes with both.
    parser.add_argument("-o", "--out", default=sys.stdout, help="output stream [stdout]")
    parser.add_argument("-p", "--pfam", default="", help="pfam tblout file")
    parser.add_argument("-q", "--faa", default="", help="proteome fasta (to get protein annotation)")
    parser.add_argument("-t", "--tab", default="", help="tab-delimited annotation")

    o = parser.parse_args()
    if o.verbose:
        sys.stderr.write("Options: %s\n" % str(o))

    ctg2cds, id2gene, ctg2seq = {}, {}, {}
    if o.gtf:  # if annotation
        # load genome - fasta has to be provided together with the annotation
        if not o.fasta:
            parser.error("Fasta file (-f) is required!")
        if not os.path.isfile(o.fasta):
            parser.error("No such file: %s" % o.fasta)
        ctg2seq = genome2dict(o.fasta)

        # load genome annotation
        if not os.path.isfile(o.gtf):  # check if correct file
            parser.error("No such file: %s" % o.gtf)
        # the parser is selected by file extension
        if o.gtf.endswith(".gff"):
            id2gene, ctg2cds = load_gff(o.gtf)
        else:
            id2gene, ctg2cds = load_gtf(o.gtf)
        if o.verbose:
            sys.stderr.write("Loaded annotation of %s CDS from %s\n" % (len(id2gene), o.gtf))

    # load function annotation; three separate dicts so they never alias
    trans2ann, trans2pfam, trans2tab = {}, {}, {}
    if o.faa:
        trans2ann = load_fasta_headers(o.faa)
    if o.pfam:
        trans2pfam = load_pfam(o.pfam)
    if o.tab:
        trans2tab = load_tab(o.tab)
    # parse pileup
    parse_snps(o.input, o.out, ctg2cds, id2gene, ctg2seq, trans2ann, trans2pfam, trans2tab, o.verbose)
Ejemplo n.º 5
def main():
    """Command-line entry point: parse options, load annotation, run parse_vcf.

    When ``-g`` is supplied, ``-f`` becomes mandatory: the genome is loaded
    with ``genome2dict`` and the annotation with ``load_gff`` (file name ends
    with ".gff") or ``load_gtf`` (any other name). The input and output
    names, the loaded data and the filtering options are then handed to
    ``parse_vcf``.

    Missing or inconsistent inputs are reported through ``parser.error``,
    which prints the usage message and exits.
    """
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage, version="%prog 1.0")  # allow_interspersed_args=True

    parser.add_option("-g", dest="gtf",
                      help="genome annotation gtf/gff [requires -f]")
    parser.add_option("-f", dest="fasta",
                      help="genome fasta [can be gzipped]")
    parser.add_option("-i", dest="fpath",
                      help="input file [stdin]")
    parser.add_option("-o", dest="outfn",
                      help="output fname [stdout]")
    parser.add_option("-d", dest="minDepth", default=10, type=int,
                      help="minimal depth [%default]")
    parser.add_option("-m", dest="minFreq", default=0.8, type=float,
                      help="min frequency of alternative base [%default]")
    # -n, -b and -v are all "store_false" switches: each is True unless given.
    parser.add_option("-n", dest="indels", default=True, action="store_false",
                      help="ignore indels")
    parser.add_option("-b", dest="bothStrands", default=True, action="store_false",
                      help="report events confirmed by single strand algs")
    parser.add_option("-v", dest="verbose", default=True, action="store_false")
    (o, args) = parser.parse_args()
    if o.verbose:
        sys.stderr.write("%s\n" % (str(o),))

    ctg2cds, id2gene, ctg2seq = {}, {}, {}
    if o.gtf:  # if annotation
        # load genome - fasta has to be provided together with the annotation
        if not o.fasta:
            parser.error("Fasta file (-f) is required!")
        if not os.path.isfile(o.fasta):
            parser.error("No such file: %s" % o.fasta)
        ctg2seq = genome2dict(o.fasta)

        # load genome annotation
        if not os.path.isfile(o.gtf):  # check if correct file
            parser.error("No such file: %s" % o.gtf)
        # the parser is selected by file extension
        if o.gtf.endswith(".gff"):
            id2gene, ctg2cds = load_gff(o.gtf)
        else:
            id2gene, ctg2cds = load_gtf(o.gtf)
        if o.verbose:
            sys.stderr.write("Loaded annotation of %s CDS from %s\n" % (len(id2gene), o.gtf))

    # parse pileup
    parse_vcf(o.fpath, o.outfn, ctg2cds, id2gene, ctg2seq, o.minDepth, o.minFreq,
              o.indels, o.bothStrands)
Ejemplo n.º 6
def main():
    """Command-line entry point: parse options, load annotation, run parse_vcf.

    When ``-g`` is supplied, ``-f`` becomes mandatory: the genome is loaded
    with ``genome2dict`` and the annotation with ``load_gff`` (file name ends
    with ".gff") or ``load_gtf`` (any other name). The input and output
    names, the loaded data and the filtering options are then handed to
    ``parse_vcf``.

    Missing or inconsistent inputs are reported through ``parser.error``,
    which prints the usage message and exits.
    """
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage, version="%prog 1.0")  # allow_interspersed_args=True

    parser.add_option("-g", dest="gtf",
                      help="genome annotation gtf/gff [requires -f]")
    parser.add_option("-f", dest="fasta",
                      help="genome fasta [can be gzipped]")
    parser.add_option("-i", dest="fpath",
                      help="input file [stdin]")
    parser.add_option("-o", dest="outfn",
                      help="output fname [stdout]")
    parser.add_option("-d", dest="minDepth", default=10, type=int,
                      help="minimal depth [%default]")
    parser.add_option("-m", dest="minFreq", default=0.8, type=float,
                      help="min frequency of alternative base [%default]")
    # -n, -b and -v are all "store_false" switches: each is True unless given.
    parser.add_option("-n", dest="indels", default=True, action="store_false",
                      help="ignore indels")
    parser.add_option("-b", dest="bothStrands", default=True, action="store_false",
                      help="report events confirmed by single strand algs")
    parser.add_option("-v", dest="verbose", default=True, action="store_false")
    (o, args) = parser.parse_args()
    if o.verbose:
        sys.stderr.write("%s\n" % (str(o),))

    ctg2cds, id2gene, ctg2seq = {}, {}, {}
    if o.gtf:  # if annotation
        # load genome - fasta has to be provided together with the annotation
        if not o.fasta:
            parser.error("Fasta file (-f) is required!")
        if not os.path.isfile(o.fasta):
            parser.error("No such file: %s" % o.fasta)
        ctg2seq = genome2dict(o.fasta)

        # load genome annotation
        if not os.path.isfile(o.gtf):  # check if correct file
            parser.error("No such file: %s" % o.gtf)
        # the parser is selected by file extension
        if o.gtf.endswith(".gff"):
            id2gene, ctg2cds = load_gff(o.gtf)
        else:
            id2gene, ctg2cds = load_gtf(o.gtf)
        if o.verbose:
            sys.stderr.write("Loaded annotation of %s CDS from %s\n" % (len(id2gene), o.gtf))

    # parse pileup
    parse_vcf(o.fpath, o.outfn, ctg2cds, id2gene, ctg2seq, o.minDepth, o.minFreq,
              o.indels, o.bothStrands)
Ejemplo n.º 7
def main():
    """Command-line entry point: plot SNPs from one or more VCF files.

    Positional arguments are the VCF files; each must exist. The genome fasta
    (``-f``) is passed to ``get_contig2size`` and the annotation (``-g``) to
    ``load_gff`` (file name ends with ".gff") or ``load_gtf`` (any other
    name). Each input file is then processed by ``snps2plot``.

    Missing files or options are reported through ``parser.error``, which
    prints the usage message and exits.
    """
    usage = "usage: %prog [options] vcf1 [ vcf2 ... vcfN ]"
    parser = OptionParser(usage=usage, version="%prog 1.0")  # allow_interspersed_args=True

    parser.add_option("-g", dest="gtf",
                      help="genome annotation")
    parser.add_option("-f", dest="fasta",
                      help="genome fasta")
    parser.add_option("-o", dest="outbase", default="plots",
                      help="output directory [%default]")
    parser.add_option("-s", dest="splitFn", default=False, action="store_true",
                      help="split fname (sheet name) by dot")
    parser.add_option("-w", dest="window", default=10, type=int,
                      help="window size in kb [%default]")
    parser.add_option("-p", dest="ext", default="png",
                      help="Supported: emf, eps, pdf, png, ps, raw, rgba, svg, svgz [%default]")
    # NOTE: verbosity is ON by default; passing -v switches it off.
    parser.add_option("-v", dest="verbose", default=True, action="store_false")
    (o, args) = parser.parse_args()
    if o.verbose:
        sys.stderr.write("%s\nFiles to process: %s\n" % (str(o), ", ".join(args)))

    # check if any input file
    if not args:
        parser.error("At least one input file has to be specified!")

    # check if files exists
    for fn in args:
        if not os.path.isfile(fn):
            parser.error("No such file: %s" % fn)

    # both the genome and the annotation are used unconditionally below, so
    # fail with a clear message instead of a traceback when one is missing
    if not o.fasta:
        parser.error("Genome fasta (-f) is required!")
    if not os.path.isfile(o.fasta):
        parser.error("No such file: %s" % o.fasta)
    if not o.gtf:
        parser.error("Genome annotation (-g) is required!")
    if not os.path.isfile(o.gtf):
        parser.error("No such file: %s" % o.gtf)

    # load genome - in fact need only contig sizes
    contig2size = get_contig2size(o.fasta)
    # load gtf/gff - the parser is selected by file extension
    if o.gtf.endswith(".gff"):
        gene2position, contig2gene = load_gff(o.gtf)
    else:
        gene2position, contig2gene = load_gtf(o.gtf)
    # process vcf
    for fn in args:
        print(fn)
        snps2plot(fn, o.window, contig2gene, contig2size, o.outbase, o.splitFn,
                  o.ext, o.verbose)
Ejemplo n.º 8
def main():
    """Command-line entry point: count reads per gene from a BAM file.

    Both the BAM file (``-i``) and the annotation (``-g``) are required and
    must exist. The annotation is loaded with ``load_gff`` (file name ends
    with ".gff") or ``load_gtf`` (any other name). With ``-r`` the genome
    fasta (``-f``) is required as well and is loaded with ``genome2dict``.
    The work itself is done by ``bam2counts``.

    Missing files or options are reported through ``parser.error``, which
    prints the usage message and exits.
    """
    usage = "usage: %prog [options]\nfor f in *.bam; do echo `date` $f; bam2counts.py -rv -i $f -g F.oxysporum.gtf > $f.genecounts.txt; done"
    parser = OptionParser(usage=usage, version="%prog 1.0")  # allow_interspersed_args=True

    parser.add_option("-i", dest="bam", default="",
                      help="bam file")
    parser.add_option("-g", dest="gtf", default="",
                      help="genome annotation gtf/gff")
    parser.add_option("-r", dest="rpkm", default=False, action="store_true",
                      help="RPKM normalisation (reads per kb of gene per million of aligned reads)")
    parser.add_option("-f", dest="fasta", default="",
                      help="genome fasta [required if -r]")
    parser.add_option("-v", dest="verbose", default=False, action="store_true")
    (o, fnames) = parser.parse_args()
    if o.verbose:
        sys.stderr.write("Options: %s\nArgs: %s\n" % (o, fnames))

    # bam and annotation are both mandatory
    for fn in (o.bam, o.gtf):
        if not fn:
            parser.error("Provide input file!")
        if not os.path.isfile(fn):
            parser.error("No such file: %s" % fn)
    ctg2cds, id2gene, ctg2seq = {}, {}, {}
    # load gtf/gff - the parser is selected by file extension
    if o.gtf.endswith(".gff"):
        id2gene, ctg2cds = load_gff(o.gtf)
    else:
        id2gene, ctg2cds = load_gtf(o.gtf)
    if o.verbose:
        sys.stderr.write("Loaded annotation of %s CDS from %s\n" % (len(id2gene), o.gtf))

    # the genome is loaded only when RPKM normalisation was requested
    if o.rpkm:
        if not o.fasta:
            parser.error("Specify genome fasta file!")
        if not os.path.isfile(o.fasta):
            parser.error("No such file: %s" % o.fasta)
        ctg2seq = genome2dict(o.fasta)

    bam2counts(o.bam, o.rpkm, id2gene, ctg2cds, ctg2seq, o.verbose)
Ejemplo n.º 9
def process(fnames, faa, pfam, gtf, log2th, splitFn, skipExons, verbose):
    """Print per-gene counts with log2 ratios of every sample against the first.

    A tab-delimited table is printed to stdout: a header line, then one line
    per reported gene with its coordinate, the count of the first sample and,
    for every further sample, its count followed by log2(count / first count).
    The ratio column reads "+NA" when the first count is 0 and "-NA" when the
    sample count is 0. Annotation and pfam columns close each line.

    Args:
        fnames: count files; all are merged into one table by ``load_counts``.
            The first file is the reference sample.
        faa: if truthy, passed to ``load_fasta_headers`` for gene annotation.
        pfam: if truthy, passed to ``load_pfam`` for the pfam column.
        gtf: if truthy, loaded with ``load_gff`` (name ends with ".gff") or
            ``load_gtf``; used as fallback annotation source.
        log2th: if truthy, a gene is reported only when some ratio lies
            outside the open interval (-log2th, log2th) or is "+NA"/"-NA";
            if falsy, every gene with at least one further sample is reported.
        splitFn: if truthy, sample names are file names cut at the first ".".
        skipExons: if truthy, skip ids whose last "."-separated field is
            all digits.
        verbose: if truthy, write progress messages to stderr.
    """
    # load function annotation (two separate dicts, so they never alias)
    trans2ann, trans2pfam = {}, {}
    if faa:
        trans2ann = load_fasta_headers(faa)
    if pfam:
        trans2pfam = load_pfam(pfam)

    ctg2cds, id2gene = {}, {}
    if gtf:
        # load gtf/gff - the parser is selected by file extension
        if gtf.endswith(".gff"):
            id2gene, ctg2cds = load_gff(gtf)
        else:
            id2gene, ctg2cds = load_gtf(gtf)
        if verbose:
            sys.stderr.write("Loaded annotation of %s CDS from %s\n" %
                             (len(id2gene), gtf))

    # get samples names
    samples = []
    for fn in fnames:
        if splitFn:
            fn = fn.split(".")[0]
        samples.append(fn)

    # load gene counts
    if verbose:
        sys.stderr.write("Loading gene counts...\n")
    gene2counts = {}
    for fn in fnames:
        if verbose:
            sys.stderr.write(" %s      \r" % fn)
        gene2counts = load_counts(fn, gene2counts)

    ## print results
    # header
    if verbose:
        sys.stderr.write("Calculating...\n")
    header = "#gene\tcoordinate\t%s" % samples[0]
    for s in samples[1:]:
        header += "\t%s\tlog2(%s/%s)" % (s, s, samples[0])
    header += "\tannotation\tpfam"
    print(header)

    # per gene scores
    for gene in sorted(gene2counts):
        # if genes only requested then skip exon entries
        if skipExons and gene.split(".")[-1].isdigit():
            continue
        coord, counts = gene2counts[gene]
        passed = False
        line = "%s\t%s\t%.2f" % (gene, coord, counts[0])
        for c in counts[1:]:
            line += "\t%.2f" % c
            if not counts[0]:
                # reference count is 0: ratio undefined
                line += "\t+NA"
                passed = True
            elif not c:
                # sample count is 0: log undefined
                line += "\t-NA"
                passed = True
            else:
                log2 = log(c * 1.0 / counts[0], 2)
                line += "\t%.2f" % log2
                # without a threshold everything passes; otherwise keep only
                # ratios with log2 >= log2th or log2 <= -log2th
                if not log2th or not -log2th < log2 < log2th:
                    passed = True

        # print only if passed filtering
        if passed:
            # local name differs from the `pfam` parameter on purpose
            ann = pfam_ann = ""
            if gene in id2gene:
                ann = id2gene[gene][-1]  # contig,cdsList,strand,function
            if gene in trans2ann:
                ann = trans2ann[gene]
            if gene in trans2pfam:
                pfam_ann = trans2pfam[gene]
            line += "\t%s\t%s" % (ann, pfam_ann)
            print(line)
Ejemplo n.º 10
def process(fnames, expCov, genome, faa, pfam, gtf, log2th, splitFn, skipExons,
            verbose):
    """Print per-gene counts with log2 ratios against an expected coverage.

    A tab-delimited table is printed to stdout: a header line, then one line
    per reported gene with its coordinate and, for every sample, its count
    followed by log2(count / expCov) ("-NA" when the count is 0). Annotation
    and pfam columns close each line.

    Args:
        fnames: count files; all are merged into one table by ``load_counts``.
        expCov: denominator of the log2 ratio. If falsy it is computed from
            the values returned by ``get_contig2size_samtools(genome)`` as
            1000 * sum(first items) / sum(second items).
        genome: passed to ``get_contig2size_samtools``; used only when
            ``expCov`` is falsy.
        faa: if truthy, passed to ``load_fasta_headers`` for gene annotation.
        pfam: if truthy, passed to ``load_pfam`` for the pfam column.
        gtf: if truthy, loaded with ``load_gff`` (name ends with ".gff") or
            ``load_gtf``; used as fallback annotation source.
        log2th: if truthy, a gene is reported only when some ratio lies
            outside the open interval (-log2th, log2th) or is "-NA"; if
            falsy, every gene with at least one count is reported.
        splitFn: if truthy, sample names are file names cut at the first ".".
        skipExons: if truthy, skip ids whose last "."-separated field is
            all digits.
        verbose: if truthy, write progress messages to stderr.
    """
    # load function annotation (two separate dicts, so they never alias)
    trans2ann, trans2pfam = {}, {}
    if faa:
        trans2ann = load_fasta_headers(faa)
    if pfam:
        trans2pfam = load_pfam(pfam)

    ctg2cds, id2gene = {}, {}
    if gtf:
        # load gtf/gff - the parser is selected by file extension
        if gtf.endswith(".gff"):
            id2gene, ctg2cds = load_gff(gtf)
        else:
            id2gene, ctg2cds = load_gtf(gtf)
        if verbose:
            sys.stderr.write("Loaded annotation of %s CDS from %s\n" %
                             (len(id2gene), gtf))

    # get samples names
    samples = []
    for fn in fnames:
        if splitFn:
            fn = fn.split(".")[0]
        samples.append(fn)

    # get expected coverage (RPKMs)
    if not expCov:
        # presumably contig -> (read count, contig size); verify against
        # get_contig2size_samtools
        c2cs = get_contig2size_samtools(genome)
        gsize = sum(s for c, s in c2cs.values())
        rcount = sum(c for c, s in c2cs.values())
        expCov = rcount * 10.0**3 / gsize
        if verbose:
            sys.stderr.write("Set expected coverage [RPKM]: %.3f\n" % expCov)

    # load gene counts
    if verbose:
        sys.stderr.write("Loading gene counts...\n")
    gene2counts = {}
    for fn in fnames:
        if verbose:
            sys.stderr.write(" %s      \r" % fn)
        gene2counts = load_counts(fn, gene2counts)

    ## print results
    # header
    if verbose:
        sys.stderr.write("Calculating...\n")
    header = "#gene\tcoordinate"
    for s in samples:
        header += "\t%s\tlog2 vs mean" % (s, )
    header += "\tannotation\tpfam"
    print(header)

    # per gene scores
    for gene in sorted(gene2counts):
        # if genes only requested then skip exon entries
        if skipExons and gene.split(".")[-1].isdigit():
            continue
        coord, counts = gene2counts[gene]
        passed = False
        line = "%s\t%s" % (gene, coord)
        for c in counts:
            line += "\t%.2f" % c
            if not c:
                # count is 0: log undefined
                line += "\t-NA"
                passed = True
            else:
                log2 = log(c * 1.0 / expCov, 2)
                line += "\t%.2f" % log2
                # without a threshold everything passes; otherwise keep only
                # ratios with log2 >= log2th or log2 <= -log2th
                if not log2th or not -log2th < log2 < log2th:
                    passed = True

        # print only if passed filtering
        if passed:
            # local name differs from the `pfam` parameter on purpose
            ann = pfam_ann = ""
            if gene in id2gene:
                ann = id2gene[gene][-1]  # contig,cdsList,strand,function
            if gene in trans2ann:
                ann = trans2ann[gene]
            if gene in trans2pfam:
                pfam_ann = trans2pfam[gene]
            line += "\t%s\t%s" % (ann, pfam_ann)
            print(line)
Ejemplo n.º 11
def main():
    """Summarise differential-expression tables from several files per gene.

    Reads options with OptionParser, optionally loads a gene annotation with
    ``load_gff`` / ``load_gtf``, parses eight fields per line of every input
    file and prints one tab-delimited line for each gene stored in
    ``de2sample`` (genes whose adjusted p-value is below ``o.fdr``).

    NOTE(review): this function is incomplete as shown. Several statements
    are cut off mid-call and some branch bodies are missing; each spot is
    marked below. The original text must be recovered before the code can run.
    """
    usage = "%prog [options] file1 [file2 ... fileN]"
    parser = OptionParser(usage)

    # NOTE(review): the two lines below are the tails of add_option() calls
    # whose beginnings are missing; the options presumably set
    # `o.annotation` and `o.fdr`, which are read further down - confirm the
    # flags and defaults.
                      help="GTF annotation file [%default]")
                      help="false dicovery rate [%default]")

    (o, fnames) = parser.parse_args()
    sys.stderr.write("Options: %s\nFiles to be processed: %s\n" % (o, fnames))

    if not fnames:
        sys.exit("Speficy at least one input file")

    # NOTE(review): prot2ann and prot2locus are not used anywhere in the
    # visible code.
    prot2ann, prot2locus = {}, {}
    # The annotation parser is selected by file extension. When no annotation
    # is given, gene2position stays undefined although it is read below.
    if o.annotation.endswith(".gff"):
        gene2position, contig2gene = load_gff(o.annotation)
    elif o.annotation:
        gene2position, contig2gene = load_gtf(o.annotation)
    #process files
    i = 0
    samples = []
    gene2fc = {}
    gene2reads = {}
    sys.stderr.write("Processing files...\n")
    de2sample = {}
    for fn in fnames:
        i += 1
        # NOTE(review): the arguments of this write() call are cut off.
        sys.stderr.write("%s\t%s\n" % (
        # sample name = file name up to the first dot
        # NOTE(review): nothing visible appends to `samples`, which is
        # iterated when the header is built - a line is presumably missing.
        s = fn.split('.')[0]

        #load files
        # NOTE: `i` is reused as the line counter, overwriting the file
        # counter from the outer loop.
        i = 0
        for l in open(fn):
            #skip header
            i += 1
            # NOTE(review): the body of this `if` is missing (presumably a
            # `continue` that skips the header line).
            if i == 1:

            # NOTE(review): the split() call is cut off; the delimiter is
            # unknown.
            id, baseMean, baseMeanA, baseMeanB, foldChange, log2FoldChange, pval, padj = l.split(
            baseMeanA, baseMeanB = float(baseMeanA), float(baseMeanB)

            if id not in gene2fc:
                gene2fc[id] = []
                gene2reads[id] = []  #baseMeanA

            #add expression info
            # NOTE(review): nothing visible appends to gene2fc[id], which is
            # zipped with gene2reads[id] when printing.
            gene2reads[id].append((baseMeanA, baseMeanB))

            # NOTE(review): the statement that introduced this indented line
            # is missing.
                padj = float(padj)

            if padj < o.fdr:
                # NOTE(review): as written, a gene already present is reset to
                # [s] and a new gene is never stored; lines are presumably
                # missing here.
                if id in de2sample:
                    de2sample[id] = [s]

    #print out
    header = "#gene\tlocus\t#de\tcontrol"
    for s in samples:
        header += "\t%s log2" % s
    header += "\tannotation"
    print header
    for gene in sorted(de2sample.keys()):
        function = ''
        if gene in gene2position:
            contig, coords, strand, function = gene2position[gene]
            # NOTE(review): the tuple of format arguments is cut off. `locus`
            # is only assigned inside this branch but is used for every gene.
            locus = "%s:%s-%s %s" % (contig, coords[0][0], coords[-1][-1],

        out = "%s\t%s\t%s" % (
            gene, locus, len(de2sample[gene])
        )  #,gene2control[gene],'\t'.join(gene2fc[gene]),ann )
        i = 0
        for reads, fc in zip(gene2reads[gene], gene2fc[gene]):
            a, b = reads
            # the first value of the first pair is written once, before the
            # per-file values
            if not i:
                out += "\t%s" % a
            i += 1
            out += "\t%s" % fc
        out += "\t%s" % function
        print out
Ejemplo n.º 12
def main():
    """Command-line entry point: verify candidate SNPs from VCF files.

    Positional arguments are the VCF files; each must exist. When ``-g`` is
    supplied, ``-f`` becomes mandatory: the genome is loaded with
    ``genome2dict`` and the annotation with ``load_gff`` (file name ends with
    ".gff") or ``load_gtf`` (any other name). Candidate coordinates are
    collected with ``load_vcf`` and checked by ``check_snps``.

    Missing or inconsistent inputs are reported through ``parser.error``,
    which prints the usage message and exits.
    """
    usage = "usage: %prog [options] *.vcf"
    parser = OptionParser(usage=usage, version="%prog 1.0")  # allow_interspersed_args=True

    parser.add_option("-g", dest="gtf",
                      help="genome annotation gtf/gff [requires -f]")
    parser.add_option("-f", dest="fasta",
                      help="genome fasta")
    parser.add_option("-1", dest="bam1",
                      help="sample bam")
    parser.add_option("-2", dest="bam2",
                      help="reference bam")
    parser.add_option("-o", dest="outfn",
                      help="output fname [stdout]")
    parser.add_option("-d", dest="minDepth", default=5, type=int,
                      help="""minimal depth; note both samples need to have pass depth filtering [%default]""")
    parser.add_option("-m", dest="minFreq", default=0.8, type=float,
                      help="min frequency of alternative base [%default]")
    # -n, -b and -v are all "store_false" switches: each is True unless given.
    parser.add_option("-n", dest="indels", default=True, action="store_false",
                      help="ignore indels [%default]")
    parser.add_option("-b", dest="bothStrands", default=True, action="store_false",
                      help="report events confirmed by single strand algs")
    parser.add_option("-v", dest="verbose", default=True, action="store_false")
    (o, args) = parser.parse_args()
    if o.verbose:
        sys.stderr.write("%s\n" % (str(o),))

    if not args:
        parser.error("At least one vcf file has to be specified!")

    for fn in args:
        if not os.path.isfile(fn):
            parser.error("No such file: %s" % fn)

    ctg2cds, id2gene, ctg2seq = {}, {}, {}
    if o.gtf:  # if annotation
        # load genome - fasta has to be provided together with the annotation
        if not o.fasta:
            parser.error("Fasta file (-f) is required!")
        if not os.path.isfile(o.fasta):
            parser.error("No such file: %s" % o.fasta)
        ctg2seq = genome2dict(o.fasta)

        # load genome annotation
        if not os.path.isfile(o.gtf):  # check if correct file
            parser.error("No such file: %s" % o.gtf)
        # the parser is selected by file extension
        if o.gtf.endswith(".gff"):
            id2gene, ctg2cds = load_gff(o.gtf)
        else:
            id2gene, ctg2cds = load_gtf(o.gtf)
        if o.verbose:
            sys.stderr.write("Loaded annotation of %s CDS from %s\n" % (len(id2gene), o.gtf))

    # load possible SNPs coordinates
    coords = load_vcf(args, o.indels)
    # check with mpileup
    check_snps(coords, o.bam1, o.bam2, o.fasta, o.outfn, ctg2cds, id2gene,
               ctg2seq, o.minDepth, o.minFreq, o.indels, o.bothStrands)
Ejemplo n.º 13
def process(fnames, expCov, genome, faa, pfam, gtf, log2th, splitFn, skipExons, verbose):
    """Print per-gene counts with log2 ratios against an expected coverage.

    A tab-delimited table is printed to stdout: a header line, then one line
    per reported gene with its coordinate and, for every sample, its count
    followed by log2(count / expCov) ("-NA" when the count is 0). Annotation
    and pfam columns close each line.

    Args:
        fnames: count files; all are merged into one table by ``load_counts``.
        expCov: denominator of the log2 ratio. If falsy it is computed from
            the values returned by ``get_contig2size_samtools(genome)`` as
            1000 * sum(first items) / sum(second items).
        genome: passed to ``get_contig2size_samtools``; used only when
            ``expCov`` is falsy.
        faa: if truthy, passed to ``load_fasta_headers`` for gene annotation.
        pfam: if truthy, passed to ``load_pfam`` for the pfam column.
        gtf: if truthy, loaded with ``load_gff`` (name ends with ".gff") or
            ``load_gtf``; used as fallback annotation source.
        log2th: if truthy, a gene is reported only when some ratio lies
            outside the open interval (-log2th, log2th) or is "-NA"; if
            falsy, every gene with at least one count is reported.
        splitFn: if truthy, sample names are file names cut at the first ".".
        skipExons: if truthy, skip ids whose last "."-separated field is
            all digits.
        verbose: if truthy, write progress messages to stderr.
    """
    # load function annotation (two separate dicts, so they never alias)
    trans2ann, trans2pfam = {}, {}
    if faa:
        trans2ann = load_fasta_headers(faa)
    if pfam:
        trans2pfam = load_pfam(pfam)

    ctg2cds, id2gene = {}, {}
    if gtf:
        # load gtf/gff - the parser is selected by file extension
        if gtf.endswith(".gff"):
            id2gene, ctg2cds = load_gff(gtf)
        else:
            id2gene, ctg2cds = load_gtf(gtf)
        if verbose:
            sys.stderr.write("Loaded annotation of %s CDS from %s\n" % (len(id2gene), gtf))
    # get samples names
    samples = []
    for fn in fnames:
        if splitFn:
            fn = fn.split(".")[0]
        samples.append(fn)
    # get expected coverage (RPKMs)
    if not expCov:
        # presumably contig -> (read count, contig size); verify against
        # get_contig2size_samtools
        c2cs = get_contig2size_samtools(genome)
        gsize = sum(s for c, s in c2cs.values())
        rcount = sum(c for c, s in c2cs.values())
        expCov = rcount * 10.0**3 / gsize
        if verbose:
            sys.stderr.write("Set expected coverage [RPKM]: %.3f\n" % expCov)
    # load gene counts
    if verbose:
        sys.stderr.write("Loading gene counts...\n")
    gene2counts = {}
    for fn in fnames:
        if verbose:
            sys.stderr.write(" %s      \r" % fn)
        gene2counts = load_counts(fn, gene2counts)

    ## print results
    # header
    if verbose:
        sys.stderr.write("Calculating...\n")
    header = "#gene\tcoordinate"
    for s in samples:
        header += "\t%s\tlog2 vs mean" % (s,)
    header += "\tannotation\tpfam"
    print(header)
    # per gene scores
    for gene in sorted(gene2counts):
        # if genes only requested then skip exon entries
        if skipExons and gene.split(".")[-1].isdigit():
            continue
        coord, counts = gene2counts[gene]
        passed = False
        line = "%s\t%s" % (gene, coord)
        for c in counts:
            line += "\t%.2f" % c
            if not c:
                # count is 0: log undefined
                line += "\t-NA"
                passed = True
            else:
                log2 = log(c * 1.0 / expCov, 2)
                line += "\t%.2f" % log2
                # without a threshold everything passes; otherwise keep only
                # ratios with log2 >= log2th or log2 <= -log2th
                if not log2th or not -log2th < log2 < log2th:
                    passed = True
        # print only if passed filtering
        if passed:
            # local name differs from the `pfam` parameter on purpose
            ann = pfam_ann = ""
            if gene in id2gene:
                ann = id2gene[gene][-1]  # contig,cdsList,strand,function
            if gene in trans2ann:
                ann = trans2ann[gene]
            if gene in trans2pfam:
                pfam_ann = trans2pfam[gene]
            line += "\t%s\t%s" % (ann, pfam_ann)
            print(line)
Ejemplo n.º 14
def main():
    """Command-line entry point: verify candidate SNPs from VCF files.

    Positional arguments are the VCF files; each must exist. When ``-g`` is
    supplied, ``-f`` becomes mandatory: the genome is loaded with
    ``genome2dict`` and the annotation with ``load_gff`` (file name ends with
    ".gff") or ``load_gtf`` (any other name). Candidate coordinates are
    collected with ``load_vcf`` and checked by ``check_snps``.

    Missing or inconsistent inputs are reported through ``parser.error``,
    which prints the usage message and exits.
    """
    usage = "usage: %prog [options] *.vcf"
    parser = OptionParser(usage=usage,
                          version="%prog 1.0")  # allow_interspersed_args=True

    parser.add_option("-g", dest="gtf",
                      help="genome annotation gtf/gff [requires -f]")
    parser.add_option("-f", dest="fasta", help="genome fasta")
    parser.add_option("-1", dest="bam1", help="sample bam")
    parser.add_option("-2", dest="bam2", help="reference bam")
    parser.add_option("-o", dest="outfn", help="output fname [stdout]")
    parser.add_option("-d", dest="minDepth", default=5, type=int,
                      help="""minimal depth; note both samples need to have pass depth filtering [%default]""")
    parser.add_option("-m", dest="minFreq", default=0.8, type=float,
                      help="min frequency of alternative base [%default]")
    # -n, -b and -v are all "store_false" switches: each is True unless given.
    parser.add_option("-n", dest="indels", default=True, action="store_false",
                      help="ignore indels [%default]")
    parser.add_option("-b", dest="bothStrands", default=True, action="store_false",
                      help="report events confirmed by single strand algs")
    parser.add_option("-v", dest="verbose", default=True, action="store_false")

    (o, args) = parser.parse_args()
    if o.verbose:
        sys.stderr.write("%s\n" % (str(o), ))

    if not args:
        parser.error("At least one vcf file has to be specified!")

    for fn in args:
        if not os.path.isfile(fn):
            parser.error("No such file: %s" % fn)

    ctg2cds, id2gene, ctg2seq = {}, {}, {}
    if o.gtf:  # if annotation
        # load genome - fasta has to be provided together with the annotation
        if not o.fasta:
            parser.error("Fasta file (-f) is required!")
        if not os.path.isfile(o.fasta):
            parser.error("No such file: %s" % o.fasta)
        ctg2seq = genome2dict(o.fasta)

        # load genome annotation
        if not os.path.isfile(o.gtf):  # check if correct file
            parser.error("No such file: %s" % o.gtf)
        # the parser is selected by file extension
        if o.gtf.endswith(".gff"):
            id2gene, ctg2cds = load_gff(o.gtf)
        else:
            id2gene, ctg2cds = load_gtf(o.gtf)
        if o.verbose:
            sys.stderr.write("Loaded annotation of %s CDS from %s\n" %
                             (len(id2gene), o.gtf))

    # load possible SNPs coordinates
    coords = load_vcf(args, o.indels)

    # check with mpileup
    check_snps(coords, o.bam1, o.bam2, o.fasta, o.outfn, ctg2cds, id2gene,
               ctg2seq, o.minDepth, o.minFreq, o.indels, o.bothStrands)
Ejemplo n.º 15
def process(fnames, faa, pfam, gtf, log2th, splitFn, skipExons, verbose):
    """Print per-gene counts with log2 ratios of every sample against the first.

    A tab-delimited table is printed to stdout: a header line, then one line
    per reported gene with its coordinate, the count of the first sample and,
    for every further sample, its count followed by log2(count / first count).
    The ratio column reads "+NA" when the first count is 0 and "-NA" when the
    sample count is 0. Annotation and pfam columns close each line.

    Args:
        fnames: count files; all are merged into one table by ``load_counts``.
            The first file is the reference sample.
        faa: if truthy, passed to ``load_fasta_headers`` for gene annotation.
        pfam: if truthy, passed to ``load_pfam`` for the pfam column.
        gtf: if truthy, loaded with ``load_gff`` (name ends with ".gff") or
            ``load_gtf``; used as fallback annotation source.
        log2th: if truthy, a gene is reported only when some ratio lies
            outside the open interval (-log2th, log2th) or is "+NA"/"-NA";
            if falsy, every gene with at least one further sample is reported.
        splitFn: if truthy, sample names are file names cut at the first ".".
        skipExons: if truthy, skip ids whose last "."-separated field is
            all digits.
        verbose: if truthy, write progress messages to stderr.
    """
    # load function annotation (two separate dicts, so they never alias)
    trans2ann, trans2pfam = {}, {}
    if faa:
        trans2ann = load_fasta_headers(faa)
    if pfam:
        trans2pfam = load_pfam(pfam)

    ctg2cds, id2gene = {}, {}
    if gtf:
        # load gtf/gff - the parser is selected by file extension
        if gtf.endswith(".gff"):
            id2gene, ctg2cds = load_gff(gtf)
        else:
            id2gene, ctg2cds = load_gtf(gtf)
        if verbose:
            sys.stderr.write("Loaded annotation of %s CDS from %s\n" % (len(id2gene), gtf))
    # get samples names
    samples = []
    for fn in fnames:
        if splitFn:
            fn = fn.split(".")[0]
        samples.append(fn)

    # load gene counts
    if verbose:
        sys.stderr.write("Loading gene counts...\n")
    gene2counts = {}
    for fn in fnames:
        if verbose:
            sys.stderr.write(" %s      \r" % fn)
        gene2counts = load_counts(fn, gene2counts)

    ## print results
    # header
    if verbose:
        sys.stderr.write("Calculating...\n")
    header = "#gene\tcoordinate\t%s" % samples[0]
    for s in samples[1:]:
        header += "\t%s\tlog2(%s/%s)" % (s, s, samples[0])
    header += "\tannotation\tpfam"
    print(header)
    # per gene scores
    for gene in sorted(gene2counts):
        # if genes only requested then skip exon entries
        if skipExons and gene.split(".")[-1].isdigit():
            continue
        coord, counts = gene2counts[gene]
        passed = False
        line = "%s\t%s\t%.2f" % (gene, coord, counts[0])
        for c in counts[1:]:
            line += "\t%.2f" % c
            if not counts[0]:
                # reference count is 0: ratio undefined
                line += "\t+NA"
                passed = True
            elif not c:
                # sample count is 0: log undefined
                line += "\t-NA"
                passed = True
            else:
                log2 = log(c * 1.0 / counts[0], 2)
                line += "\t%.2f" % log2
                # without a threshold everything passes; otherwise keep only
                # ratios with log2 >= log2th or log2 <= -log2th
                if not log2th or not -log2th < log2 < log2th:
                    passed = True
        # print only if passed filtering
        if passed:
            # local name differs from the `pfam` parameter on purpose
            ann = pfam_ann = ""
            if gene in id2gene:
                ann = id2gene[gene][-1]  # contig,cdsList,strand,function
            if gene in trans2ann:
                ann = trans2ann[gene]
            if gene in trans2pfam:
                pfam_ann = trans2pfam[gene]
            line += "\t%s\t%s" % (ann, pfam_ann)
            print(line)