コード例 #1
0
ファイル: coords2ordered.py プロジェクト: jpmtavares/bin
def main():
    """Command-line entry point: parse options and order the query contigs.

    The query FASTA (-f) and the nucmer coords file (-i) are loaded by path
    through genome2dict() and nucmer2list(); the resulting matches are passed
    to sort_hits() together with the -o output stream and the verbose flag.
    """
    usage = "%(prog)s [options]"
    parser = argparse.ArgumentParser(usage=usage, description=desc, epilog=epilog)

    parser.add_argument("-v", dest="verbose", default=False, action="store_true")
    # argparse.FileType replaces the Python 2-only `file` builtin, matching the
    # way the output option is already declared below.
    parser.add_argument("-i", dest="coords", default=sys.stdin,
                        type=argparse.FileType("r"),
                        help="coords file name [stdin]")
    parser.add_argument("-f", dest="fasta", required=True,
                        type=argparse.FileType("r"),
                        help="query fasta file")
    parser.add_argument("-o", dest="out", default=sys.stdout,
                        type=argparse.FileType("w"),
                        help="output base name [stdout]")

    o = parser.parse_args()
    if o.verbose:
        sys.stderr.write("Options: %s\n" % str(o))

    # Only the paths are used below, so release the handles argparse opened.
    fasta_fn = o.fasta.name
    o.fasta.close()
    # NOTE(review): when -i is omitted the name is "<stdin>", which is handed
    # to nucmer2list() as if it were a path - confirm that this is supported.
    coords_fn = o.coords.name
    if o.coords is not sys.stdin:
        o.coords.close()

    # load query genome
    query2fasta = genome2dict(fasta_fn)

    # load nucmer
    matches = nucmer2list(coords_fn)

    # sort hits by ref position
    sort_hits(matches, query2fasta, o.out, o.verbose)
コード例 #2
0
def main():
    """Command-line entry point: parse options and order the query contigs.

    The query FASTA (-f) and the nucmer coords file (-i) are loaded by path
    through genome2dict() and nucmer2list(); the resulting matches are passed
    to sort_hits() together with the -o output stream and the verbose flag.
    """
    usage = "%(prog)s [options]"
    parser = argparse.ArgumentParser(usage=usage, description=desc, epilog=epilog)

    parser.add_argument("-v", dest="verbose", default=False, action="store_true")
    # argparse.FileType replaces the Python 2-only `file` builtin, matching the
    # way the output option is already declared below.
    parser.add_argument("-i", dest="coords", default=sys.stdin,
                        type=argparse.FileType("r"),
                        help="coords file name [stdin]")
    parser.add_argument("-f", dest="fasta", required=True,
                        type=argparse.FileType("r"),
                        help="query fasta file")
    parser.add_argument("-o", dest="out", default=sys.stdout,
                        type=argparse.FileType("w"),
                        help="output base name [stdout]")

    o = parser.parse_args()
    if o.verbose:
        sys.stderr.write("Options: %s\n" % str(o))

    # Only the paths are used below, so release the handles argparse opened.
    fasta_fn = o.fasta.name
    o.fasta.close()
    # NOTE(review): when -i is omitted the name is "<stdin>", which is handed
    # to nucmer2list() as if it were a path - confirm that this is supported.
    coords_fn = o.coords.name
    if o.coords is not sys.stdin:
        o.coords.close()

    # load query genome
    query2fasta = genome2dict(fasta_fn)

    # load nucmer
    matches = nucmer2list(coords_fn)

    # sort hits by ref position
    sort_hits(matches, query2fasta, o.out, o.verbose)
コード例 #3
0
ファイル: augustus_gff2fasta.py プロジェクト: lpryszcz/bin
def gff2fasta(gff, fasta, entireGene, codonTable, verbose):
    """Report gene, cds, peptide.
    not reporting 1000bp flanking intergenic sequence.

    Three FASTA files are written next to the annotation:
    <gff.name>.gene.fa, <gff.name>.cds.fa and <gff.name>.pep.fa.

    Arguments:
      gff        -- annotation handle; passed to load_gtf(), and its .name
                    attribute is used to build the output file names
      fasta      -- genome fasta, passed to genome2dict()
      entireGene -- not used by this function
      codonTable -- forwarded to get_cds_pep()
      verbose    -- not used by this function
    """
    # load genome
    chr2seq = genome2dict(fasta)
    # load gff
    gene2position, contig2gene = load_gtf(gff)
    # gene ids already written to the gene file (one record per gene id)
    genes = set()
    # the context manager guarantees all three outputs are flushed and closed,
    # also when an entry fails half-way through
    with open(gff.name + ".gene.fa", "w") as genout, \
            open(gff.name + ".cds.fa", "w") as cdsout, \
            open(gff.name + ".pep.fa", "w") as pepout:
        for ci, contig in enumerate(sorted(contig2gene), 1):
            # progress indicator, overwritten in place
            sys.stderr.write(" %s %s            \r" % (ci, contig))
            for s, e, feature, gene in contig2gene[contig]:
                # use a separate name so the outer loop variable is not rebound
                gene_contig, boundaries, strand, function, frames = gene2position[gene]
                # store CDS and peptide
                cds, pep = get_cds_pep(chr2seq[gene_contig], gene, boundaries,
                                       strand, function, frames, codonTable)
                cdsout.write(cds.format('fasta'))
                pepout.write(pep.format('fasta'))
                # gene id = entry id without its last dot-separated field
                geneid = ".".join(gene.split(".")[:-1])
                if geneid in genes:
                    continue
                genes.add(geneid)
                # s/e are used as 1-based inclusive coordinates
                seq = chr2seq[gene_contig][s-1:e]
                if strand == "-":
                    seq = seq.reverse_complement()
                gen = SeqRecord(seq, id=geneid, name="", description=function)
                genout.write(gen.format('fasta'))
コード例 #4
0
ファイル: augustus_gff2fasta.py プロジェクト: jpmtavares/bin
def gff2fasta(gff, fasta, entireGene, codonTable, verbose):
    """Report gene, cds, peptide.
    not reporting 1000bp flanking intergenic sequence.

    Three FASTA files are written next to the annotation:
    <gff.name>.gene.fa, <gff.name>.cds.fa and <gff.name>.pep.fa.

    Arguments:
      gff        -- annotation handle; passed to load_gtf(), and its .name
                    attribute is used to build the output file names
      fasta      -- genome fasta, passed to genome2dict()
      entireGene -- not used by this function
      codonTable -- forwarded to get_cds_pep()
      verbose    -- not used by this function
    """
    # load genome
    chr2seq = genome2dict(fasta)
    # load gff
    gene2position, contig2gene = load_gtf(gff)
    # gene ids already written to the gene file (one record per gene id)
    genes = set()
    # the context manager guarantees all three outputs are flushed and closed,
    # also when an entry fails half-way through
    with open(gff.name + ".gene.fa", "w") as genout, \
            open(gff.name + ".cds.fa", "w") as cdsout, \
            open(gff.name + ".pep.fa", "w") as pepout:
        for ci, contig in enumerate(sorted(contig2gene), 1):
            # progress indicator, overwritten in place
            sys.stderr.write(" %s %s            \r" % (ci, contig))
            for s, e, feature, gene in contig2gene[contig]:
                # use a separate name so the outer loop variable is not rebound
                gene_contig, boundaries, strand, function, frames = gene2position[gene]
                # store CDS and peptide
                cds, pep = get_cds_pep(chr2seq[gene_contig], gene, boundaries,
                                       strand, function, frames, codonTable)
                cdsout.write(cds.format('fasta'))
                pepout.write(pep.format('fasta'))
                # gene id = entry id without its last dot-separated field
                geneid = ".".join(gene.split(".")[:-1])
                if geneid in genes:
                    continue
                genes.add(geneid)
                # s/e are used as 1-based inclusive coordinates
                seq = chr2seq[gene_contig][s-1:e]
                if strand == "-":
                    seq = seq.reverse_complement()
                gen = SeqRecord(seq, id=geneid, name="", description=function)
                genout.write(gen.format('fasta'))
コード例 #5
0
def main():
    """Extract sequences listed in an id file from a multifasta file.

    Every whitespace-separated id found in the -i file is looked up in the
    dictionary returned by genome2dict(-f); if the id itself is missing,
    "<id>_1" is tried. Matching sequences are written to
    "<out>_00001.fasta", or to one numbered file per input line with -s.

    Note: -v is declared with store_false and default True, so passing -v
    switches the option dump OFF.
    """
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage, version="%prog 1.0")

    parser.add_option("-i", dest="input", default="",
                      help="input file with tab-separated ids [mandatory]")
    parser.add_option("-f", dest="fasta", default="",
                      help="multifasta file                   [mandatory]")
    parser.add_option("-o", dest="out", default="out/out",
                      help="output fname                      [%default]")
    parser.add_option("-s", dest="split", default=False, action="store_true",
                      help="split fasta for ids from every line")
    parser.add_option("-v", dest="verbose", default=True, action="store_false")

    (o, args) = parser.parse_args()
    if o.verbose:
        sys.stderr.write("%s\n" % (str(o),))

    for f in (o.input, o.fasta):
        if not f:
            parser.error("Specify mandatory parameters!")
        # The original tested `not f.isdigit` (a bound method, always truthy),
        # which silently disabled this existence check.
        if not os.path.isfile(f):
            parser.error("No such file: %s" % f)

    # create the output directory; dirname is "" when -o has no directory
    # part, and os.makedirs("") would raise
    outdir = os.path.dirname(o.out)
    if outdir and not os.path.isdir(outdir):
        os.makedirs(outdir)

    # get id2fasta
    sys.stderr.write("Loading multifasta...\n")
    id2fasta = genome2dict(o.fasta)

    sys.stderr.write("Saving fastas...\n")
    # %05i zero-pads the counter only; the previous replace(" ", "0") also
    # rewrote spaces that belong to the output path itself
    outfn_template = "%s_%05i.fasta"
    out = None
    # open common output
    if not o.split:
        out = open(outfn_template % (o.out, 1), "w")
    try:
        with open(o.input) as handle:
            for i, l in enumerate(handle, 1):
                # open output for each line if requested
                if o.split:
                    if out is not None:
                        out.close()
                    out = open(outfn_template % (o.out, i), "w")
                for seqid in l.split():
                    sys.stderr.write(" %s   \r" % seqid)
                    if seqid in id2fasta:
                        seq = id2fasta[seqid]
                    elif seqid + "_1" in id2fasta:
                        seq = id2fasta[seqid + "_1"]
                    else:
                        sys.stderr.write("  No fasta for: %s\n" % seqid)
                        continue
                    out.write(">%s\n%s\n" % (seqid, seq))
    finally:
        if out is not None:
            out.close()
コード例 #6
0
def main():
    """Parse command-line options, load optional annotations and run parse_snps().

    When a gtf/gff annotation (-g) is given, the genome fasta (-f) becomes
    mandatory; both are validated and loaded. The optional proteome headers
    (-q), pfam table (-p) and tab-delimited annotation (-t) are loaded when
    provided, otherwise empty dictionaries are passed on.
    """
    usage = "%(prog)s [options]"
    parser = argparse.ArgumentParser(usage=usage, description=desc, epilog=epilog,
                                     formatter_class=argparse.RawTextHelpFormatter)

    parser.add_argument("-v", "--verbose", default=False, action="store_true", help="verbose")
    parser.add_argument('--version', action='version', version='1.1')
    parser.add_argument("-g", "--gtf",
                        help="genome annotation gtf/gff [requires -f]")
    parser.add_argument("-f", "--fasta",
                        help="genome fasta [can be gzipped]")
    # argparse.FileType replaces the Python 2-only `file` builtin; the value is
    # still None when -i is not given.
    parser.add_argument("-i", "--input", type=argparse.FileType("r"),
                        help="input stream [stdin]")
    # NOTE(review): the default is a stream but a user-supplied -o arrives as a
    # plain string - confirm parse_snps() copes with both.
    parser.add_argument("-o", "--out", default=sys.stdout,
                        help="output stream [stdout]")
    parser.add_argument("-p", "--pfam", default="",
                        help="pfam tblout file")
    parser.add_argument("-q", "--faa", default="",
                        help="proteome fasta (to get protein annotation)")
    parser.add_argument("-t", "--tab", default="",
                        help="tab-delimited annotation")

    o = parser.parse_args()
    if o.verbose:
        sys.stderr.write("Options: %s\n" % str(o))

    ctg2cds, id2gene, ctg2seq = {}, {}, {}
    if o.gtf:  # if annotation
        # load genome
        if not o.fasta:  # fasta has to be provided
            # was `parser.errer(...)`, which raised AttributeError instead of
            # printing the usage error
            parser.error("Fasta file (-f) is required!")
        elif not os.path.isfile(o.fasta):
            parser.error("No such file: %s" % o.fasta)
        ctg2seq = genome2dict(o.fasta)

        # load genome annotation
        if not os.path.isfile(o.gtf):  # check if correct file
            parser.error("No such file: %s" % o.gtf)
        # load gtf/gff
        if o.gtf.endswith(".gff"):
            id2gene, ctg2cds = load_gff(o.gtf)
        else:
            id2gene, ctg2cds = load_gtf(o.gtf)
        if o.verbose:
            sys.stderr.write("Loaded annotation of %s CDS from %s\n" % (len(id2gene), o.gtf))

    # load function annotation; three distinct dicts, so a mutation of one
    # inside parse_snps() cannot leak into the others
    trans2ann, trans2pfam, trans2tab = {}, {}, {}
    if o.faa:
        trans2ann = load_fasta_headers(o.faa)
    if o.pfam:
        trans2pfam = load_pfam(o.pfam)
    if o.tab:
        trans2tab = load_tab(o.tab)
    # parse pileup
    parse_snps(o.input, o.out, ctg2cds, id2gene, ctg2seq, trans2ann, trans2pfam,
               trans2tab, o.verbose)
コード例 #7
0
ファイル: csv2fasta.py プロジェクト: jpmtavares/bin
def main():
    """Extract sequences listed in an id file from a multifasta file.

    Every whitespace-separated id found in the -i file is looked up in the
    dictionary returned by genome2dict(-f); if the id itself is missing,
    "<id>_1" is tried. Matching sequences are written to
    "<out>_00001.fasta", or to one numbered file per input line with -s.

    Note: -v is declared with store_false and default True, so passing -v
    switches the option dump OFF.
    """
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage, version="%prog 1.0")

    parser.add_option("-i", dest="input", default="",
                      help="input file with tab-separated ids [mandatory]")
    parser.add_option("-f", dest="fasta", default="",
                      help="multifasta file                   [mandatory]")
    parser.add_option("-o", dest="out", default="out/out",
                      help="output fname                      [%default]")
    parser.add_option("-s", dest="split", default=False, action="store_true",
                      help="split fasta for ids from every line")
    parser.add_option("-v", dest="verbose", default=True, action="store_false")

    (o, args) = parser.parse_args()
    if o.verbose:
        sys.stderr.write("%s\n" % (str(o),))

    for f in (o.input, o.fasta):
        if not f:
            parser.error("Specify mandatory parameters!")
        # The original tested `not f.isdigit` (a bound method, always truthy),
        # which silently disabled this existence check.
        if not os.path.isfile(f):
            parser.error("No such file: %s" % f)

    # create the output directory; dirname is "" when -o has no directory
    # part, and os.makedirs("") would raise
    outdir = os.path.dirname(o.out)
    if outdir and not os.path.isdir(outdir):
        os.makedirs(outdir)

    # get id2fasta
    sys.stderr.write("Loading multifasta...\n")
    id2fasta = genome2dict(o.fasta)

    sys.stderr.write("Saving fastas...\n")
    # %05i zero-pads the counter only; the previous replace(" ", "0") also
    # rewrote spaces that belong to the output path itself
    outfn_template = "%s_%05i.fasta"
    out = None
    # open common output
    if not o.split:
        out = open(outfn_template % (o.out, 1), "w")
    try:
        with open(o.input) as handle:
            for i, l in enumerate(handle, 1):
                # open output for each line if requested
                if o.split:
                    if out is not None:
                        out.close()
                    out = open(outfn_template % (o.out, i), "w")
                for seqid in l.split():
                    sys.stderr.write(" %s   \r" % seqid)
                    if seqid in id2fasta:
                        seq = id2fasta[seqid]
                    elif seqid + "_1" in id2fasta:
                        seq = id2fasta[seqid + "_1"]
                    else:
                        sys.stderr.write("  No fasta for: %s\n" % seqid)
                        continue
                    out.write(">%s\n%s\n" % (seqid, seq))
    finally:
        if out is not None:
            out.close()
コード例 #8
0
ファイル: blat2collapse.py プロジェクト: jpmtavares/bin
def main():
    """Collapse a multi-fasta file to sequences that do not overlap each other.

    The input is aligned against itself with run_blat(); contigs are then
    visited from the longest to the shortest and appended to
    "<infile>.collapsed_o<overlap>_i<minIdentity>.fa" unless overlapping()
    reports an overlap with a contig that has already been written.
    """
    parser = OptionParser(
        usage="usage: %prog [options]",
        version="%prog 1.0",
        description="""Parse multi-fasta file and report sequences without overlap
with already reported sequences. Starts from the longest.""",
        epilog="",
    )
    add = parser.add_option
    add("-i", dest="infile",
        help="multi-fasta file       [mandatory]")
    add("-m", dest="minIdentity", default=90, type=int,
        help="min identity           [%default]")
    add("-o", dest="overlap", default=0.3, type=float,
        help="max overlap allowed    [%default]")
    add("-v", dest="verbose", default=False, action="store_true")

    o, args = parser.parse_args()
    if o.verbose:
        sys.stderr.write("Options: %s\nArgs: %s\n" % (o, args))

    # the input file is the only mandatory argument
    if not o.infile:
        parser.error("Provide input file!")
    if not os.path.isfile(o.infile):
        parser.error("No such file: %s" % o.infile)

    # load fastas
    fastas = genome2dict(o.infile)

    # contigs by descending length
    by_length = sorted(fastas.keys(), key=lambda name: len(fastas[name]), reverse=True)

    added = set()
    skipped = set()
    # start from a clean output file, as entries are appended one by one
    outfn = o.infile + ".collapsed_o%s_i%s.fa" % (o.overlap, o.minIdentity)
    if os.path.isfile(outfn):
        os.unlink(outfn)

    # blat the input against itself
    pslfn = run_blat(o.infile, o.infile, o.minIdentity, o.verbose)
    matches = parse_blat(pslfn, o.verbose, header=0, skipSelfMatches=1)

    # add contigs without overlap
    for rank, name in enumerate(by_length, 1):
        if o.verbose:
            sys.stderr.write(" %3s %20s [ %7.2f kb]\n" % (rank, name, len(fastas[name]) / 1000.0))
        # fasta entry for this contig
        entry = ">%s\n%s\n" % (name, _get_formatted_seq(fastas[name]))
        # the first contig is always kept; later ones only without overlap
        if added and overlapping(name, added, matches, o.overlap, o.verbose):
            skipped.add(name)
        else:
            added.add(name)
            out = open(outfn, "a")
            out.write(entry)
            out.close()

    kept_kb = sum(len(fastas[name]) for name in added) / 10.0**3
    total_kb = sum(len(fastas[name]) for name in fastas) / 10.0**3
    sys.stderr.write("Selected %s [ %7.2f kb] out of %s [ %7.2f kb] contigs.\n"
                     % (len(added), kept_kb, len(fastas), total_kb))
コード例 #9
0
ファイル: nucmer2ordered.py プロジェクト: jpmtavares/bin
def main():
    """Entry point: order contigs from nucmer coords, configured via optparse.

    The coords (-i), query (-q) and reference (-r) files are mandatory and
    must exist. Both fasta files are loaded with genome2dict(), the coords
    with nucmer2list(), and everything is handed to sort_hits().
    """
    parser = OptionParser(
        usage="usage: %prog [options]",
        version="%prog 1.0",
        description="""Order contigs based on nucmer output.""",
        epilog="""Make sure, coords file is sorted by reference (show-coords -r).
Monoploid number (-x) has to be specified correctly. For details look at: http://en.wikipedia.org/wiki/Ploidy""",
    )

    # (flag, keyword arguments), registered in the order shown by --help
    option_specs = (
        ("-o", dict(dest="outfn", default="out",
                    help="output base name       [%default]")),
        ("-i", dict(dest="coords", default="",
                    help="coords file name       [mandatory]")),
        ("-q", dict(dest="query", default="",
                    help="query file name        [mandatory]")),
        ("-r", dict(dest="ref", default="",
                    help="reference file name    [mandatory]")),
        ("-c", dict(dest="qOverlap", default=0.05, type=float,
                    help="fract of query aligned [%default]")),
        ("-n", dict(dest="haploid", default=2, type=int,
                    help="haploid number         [%default]")),
        ("-x", dict(dest="monoploid", default=2, type=int,
                    help="monoploid number       [%default]")),
        ("-v", dict(dest="verbose", default=False, action="store_true")),
    )
    for flag, settings in option_specs:
        parser.add_option(flag, **settings)

    opts, positional = parser.parse_args()
    if opts.verbose:
        sys.stderr.write("Options: %s\nFastQ files: %s\n" % (opts, positional))

    # every input file has to be given and has to exist
    for path in (opts.coords, opts.query, opts.ref):
        if not path:
            parser.error("Provide input file!")
        if not os.path.isfile(path):
            parser.error("No such file: %s" % path)

    # load query and reference genomes
    query_seqs = genome2dict(opts.query)
    ref_seqs = genome2dict(opts.ref)

    # load nucmer
    hits = nucmer2list(opts.coords)

    # sort hits by ref position
    sort_hits(hits, query_seqs, ref_seqs, opts.outfn, opts.qOverlap,
              opts.haploid, opts.monoploid, opts.verbose)
コード例 #10
0
ファイル: bam2counts.py プロジェクト: lpryszcz/bin
def main():
    """Entry point: validate the options and run bam2counts() on one bam file.

    The bam (-i) and annotation (-g) files are mandatory. The annotation is
    read with load_gff() for *.gff names and load_gtf() otherwise. With -r
    the genome fasta (-f) is required and loaded with genome2dict().
    """
    parser = OptionParser(
        usage="usage: %prog [options]\nfor f in *.bam; do echo `date` $f; bam2counts.py -rv -i $f -g F.oxysporum.gtf > $f.genecounts.txt; done",
        version="%prog 1.0",
    )
    add = parser.add_option
    add("-i", dest="bam", default="", help="bam file")
    add("-g", dest="gtf", default="", help="genome annotation gtf/gff")
    add("-r", dest="rpkm", default=False, action="store_true",
        help="RPKM normalisation (reads per kb of gene per million of aligned reads)")
    add("-f", dest="fasta", default="", help="genome fasta [required if -r]")
    add("-v", dest="verbose", default=False, action="store_true")

    opts, positional = parser.parse_args()
    if opts.verbose:
        sys.stderr.write("Options: %s\nArgs: %s\n" % (opts, positional))

    # both input files have to be given and have to exist
    for path in (opts.bam, opts.gtf):
        if not path:
            parser.error("Provide input file!")
        if not os.path.isfile(path):
            parser.error("No such file: %s" % path)

    ctg2cds = {}
    id2gene = {}
    ctg2seq = {}
    # pick the annotation parser from the file extension
    if opts.gtf:
        load_annotation = load_gff if opts.gtf.endswith(".gff") else load_gtf
        id2gene, ctg2cds = load_annotation(opts.gtf)
    if opts.verbose:
        summary = "Loaded annotation of %s CDS from %s\n" % (len(id2gene), opts.gtf)
        sys.stderr.write(summary)

    # the genome sequence is loaded only for RPKM normalisation
    if opts.rpkm:
        if not opts.fasta:
            parser.error("Specify genome fasta file!")
        if not os.path.isfile(opts.fasta):
            parser.error("No such file: %s" % opts.fasta)
        ctg2seq = genome2dict(opts.fasta)

    bam2counts(opts.bam, opts.rpkm, id2gene, ctg2cds, ctg2seq, opts.verbose)
コード例 #11
0
ファイル: snp2annotate.py プロジェクト: jpmtavares/bin
def main():
    """Parse command-line options, load optional annotations and run parse_snps().

    When a gtf/gff annotation (-g) is given, the genome fasta (-f) becomes
    mandatory; both are validated and loaded. The optional proteome headers
    (-q), pfam table (-p) and tab-delimited annotation (-t) are loaded when
    provided, otherwise empty dictionaries are passed on.
    """
    usage = "%(prog)s [options]"
    parser = argparse.ArgumentParser(
        usage=usage, description=desc, epilog=epilog, formatter_class=argparse.RawTextHelpFormatter
    )

    parser.add_argument("-v", "--verbose", default=False, action="store_true", help="verbose")
    parser.add_argument("--version", action="version", version="1.1")
    parser.add_argument("-g", "--gtf", help="genome annotation gtf/gff [requires -f]")
    parser.add_argument("-f", "--fasta", help="genome fasta [can be gzipped]")
    # argparse.FileType replaces the Python 2-only `file` builtin; the value is
    # still None when -i is not given.
    parser.add_argument("-i", "--input", type=argparse.FileType("r"), help="input stream [stdin]")
    # NOTE(review): the default is a stream but a user-supplied -o arrives as a
    # plain string - confirm parse_snps() copes with both.
    parser.add_argument("-o", "--out", default=sys.stdout, help="output stream [stdout]")
    parser.add_argument("-p", "--pfam", default="", help="pfam tblout file")
    parser.add_argument("-q", "--faa", default="", help="proteome fasta (to get protein annotation)")
    parser.add_argument("-t", "--tab", default="", help="tab-delimited annotation")

    o = parser.parse_args()
    if o.verbose:
        sys.stderr.write("Options: %s\n" % str(o))

    ctg2cds, id2gene, ctg2seq = {}, {}, {}
    if o.gtf:  # if annotation
        # load genome
        if not o.fasta:  # fasta has to be provided
            # was `parser.errer(...)`, which raised AttributeError instead of
            # printing the usage error
            parser.error("Fasta file (-f) is required!")
        elif not os.path.isfile(o.fasta):
            parser.error("No such file: %s" % o.fasta)
        ctg2seq = genome2dict(o.fasta)

        # load genome annotation
        if not os.path.isfile(o.gtf):  # check if correct file
            parser.error("No such file: %s" % o.gtf)
        # load gtf/gff
        if o.gtf.endswith(".gff"):
            id2gene, ctg2cds = load_gff(o.gtf)
        else:
            id2gene, ctg2cds = load_gtf(o.gtf)
        if o.verbose:
            sys.stderr.write("Loaded annotation of %s CDS from %s\n" % (len(id2gene), o.gtf))

    # load function annotation; three distinct dicts, so a mutation of one
    # inside parse_snps() cannot leak into the others
    trans2ann, trans2pfam, trans2tab = {}, {}, {}
    if o.faa:
        trans2ann = load_fasta_headers(o.faa)
    if o.pfam:
        trans2pfam = load_pfam(o.pfam)
    if o.tab:
        trans2tab = load_tab(o.tab)
    # parse pileup
    parse_snps(o.input, o.out, ctg2cds, id2gene, ctg2seq, trans2ann, trans2pfam, trans2tab, o.verbose)
コード例 #12
0
ファイル: vcf2snp.py プロジェクト: jpmtavares/bin
def main():
    """Parse command-line options, load the optional annotation and run parse_vcf().

    When a gtf/gff annotation (-g) is given, the genome fasta (-f) becomes
    mandatory; the fasta is loaded with genome2dict() and the annotation with
    load_gff() for *.gff names or load_gtf() otherwise.

    Note: -n, -b and -v are store_false options with default True, so passing
    them switches the respective behaviour OFF.
    """
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage, version="%prog 1.0")

    parser.add_option("-g", dest="gtf",
                      help="genome annotation gtf/gff [requires -f]")
    parser.add_option("-f", dest="fasta",
                      help="genome fasta [can be gzipped]")
    parser.add_option("-i", dest="fpath",
                      help="input file [stdin]")
    parser.add_option("-o", dest="outfn",
                      help="output fname [stdout]")
    parser.add_option("-d", dest="minDepth", default=10, type=int,
                      help="minimal depth [%default]")
    parser.add_option("-m", dest="minFreq", default=0.8, type=float,
                      help="min frequency of alternative base [%default]")
    parser.add_option("-n", dest="indels", default=True, action="store_false",
                      help="ignore indels")
    parser.add_option("-b", dest="bothStrands", default=True, action="store_false",
                      help="report events confirmed by single strand algs")
    parser.add_option("-v", dest="verbose", default=True, action="store_false")

    (o, args) = parser.parse_args()
    if o.verbose:
        sys.stderr.write("%s\n" % (str(o),))

    ctg2cds, id2gene, ctg2seq = {}, {}, {}
    if o.gtf:  # if annotation
        # load genome
        if not o.fasta:  # fasta has to be provided
            # was `parser.errer(...)`, which raised AttributeError instead of
            # printing the usage error
            parser.error("Fasta file (-f) is required!")
        elif not os.path.isfile(o.fasta):
            parser.error("No such file: %s" % o.fasta)
        ctg2seq = genome2dict(o.fasta)

        # load genome annotation
        if not os.path.isfile(o.gtf):  # check if correct file
            parser.error("No such file: %s" % o.gtf)
        # load gtf/gff
        if o.gtf.endswith(".gff"):
            id2gene, ctg2cds = load_gff(o.gtf)
        else:
            id2gene, ctg2cds = load_gtf(o.gtf)
        if o.verbose:
            sys.stderr.write("Loaded annotation of %s CDS from %s\n" % (len(id2gene), o.gtf))

    # parse the vcf input
    parse_vcf(o.fpath, o.outfn, ctg2cds, id2gene, ctg2seq, o.minDepth, o.minFreq,
              o.indels, o.bothStrands)
コード例 #13
0
ファイル: vcf2snp.py プロジェクト: lpryszcz/bin
def main():
    """Parse command-line options, load the optional annotation and run parse_vcf().

    When a gtf/gff annotation (-g) is given, the genome fasta (-f) becomes
    mandatory; the fasta is loaded with genome2dict() and the annotation with
    load_gff() for *.gff names or load_gtf() otherwise.

    Note: -n, -b and -v are store_false options with default True, so passing
    them switches the respective behaviour OFF.
    """
    usage = "usage: %prog [options]"
    parser = OptionParser(usage=usage, version="%prog 1.0")

    parser.add_option("-g", dest="gtf",
                      help="genome annotation gtf/gff [requires -f]")
    parser.add_option("-f", dest="fasta",
                      help="genome fasta [can be gzipped]")
    parser.add_option("-i", dest="fpath",
                      help="input file [stdin]")
    parser.add_option("-o", dest="outfn",
                      help="output fname [stdout]")
    parser.add_option("-d", dest="minDepth", default=10, type=int,
                      help="minimal depth [%default]")
    parser.add_option("-m", dest="minFreq", default=0.8, type=float,
                      help="min frequency of alternative base [%default]")
    parser.add_option("-n", dest="indels", default=True, action="store_false",
                      help="ignore indels")
    parser.add_option("-b", dest="bothStrands", default=True, action="store_false",
                      help="report events confirmed by single strand algs")
    parser.add_option("-v", dest="verbose", default=True, action="store_false")

    (o, args) = parser.parse_args()
    if o.verbose:
        sys.stderr.write("%s\n" % (str(o),))

    ctg2cds, id2gene, ctg2seq = {}, {}, {}
    if o.gtf:  # if annotation
        # load genome
        if not o.fasta:  # fasta has to be provided
            # was `parser.errer(...)`, which raised AttributeError instead of
            # printing the usage error
            parser.error("Fasta file (-f) is required!")
        elif not os.path.isfile(o.fasta):
            parser.error("No such file: %s" % o.fasta)
        ctg2seq = genome2dict(o.fasta)

        # load genome annotation
        if not os.path.isfile(o.gtf):  # check if correct file
            parser.error("No such file: %s" % o.gtf)
        # load gtf/gff
        if o.gtf.endswith(".gff"):
            id2gene, ctg2cds = load_gff(o.gtf)
        else:
            id2gene, ctg2cds = load_gtf(o.gtf)
        if o.verbose:
            sys.stderr.write("Loaded annotation of %s CDS from %s\n" % (len(id2gene), o.gtf))

    # parse the vcf input
    parse_vcf(o.fpath, o.outfn, ctg2cds, id2gene, ctg2seq, o.minDepth, o.minFreq,
              o.indels, o.bothStrands)
コード例 #14
0
ファイル: bam2counts.py プロジェクト: jpmtavares/bin
def main():
    """Entry point: check the command-line options and call bam2counts().

    Requires an existing bam file (-i) and annotation (-g). Annotation files
    ending with ".gff" go through load_gff(), all others through load_gtf().
    The genome fasta (-f) is only needed, and only loaded, when -r is set.
    """
    parser = OptionParser(
        usage="usage: %prog [options]\nfor f in *.bam; do echo `date` $f; bam2counts.py -rv -i $f -g F.oxysporum.gtf > $f.genecounts.txt; done",
        version="%prog 1.0",
    )

    # (flag, keyword arguments), registered in the order shown by --help
    option_specs = (
        ("-i", dict(dest="bam", default="",
                    help="bam file")),
        ("-g", dict(dest="gtf", default="",
                    help="genome annotation gtf/gff")),
        ("-r", dict(dest="rpkm", default=False, action="store_true",
                    help="RPKM normalisation (reads per kb of gene per million of aligned reads)")),
        ("-f", dict(dest="fasta", default="",
                    help="genome fasta [required if -r]")),
        ("-v", dict(dest="verbose", default=False, action="store_true")),
    )
    for flag, settings in option_specs:
        parser.add_option(flag, **settings)

    opts, positional = parser.parse_args()
    if opts.verbose:
        sys.stderr.write("Options: %s\nArgs: %s\n" % (opts, positional))

    # mandatory inputs: given and present on disk
    for path in (opts.bam, opts.gtf):
        if not path:
            parser.error("Provide input file!")
        if not os.path.isfile(path):
            parser.error("No such file: %s" % path)

    ctg2cds, id2gene, ctg2seq = {}, {}, {}
    # load gtf/gff, choosing the parser by extension
    if opts.gtf:
        is_gff = opts.gtf.endswith(".gff")
        if is_gff:
            id2gene, ctg2cds = load_gff(opts.gtf)
        else:
            id2gene, ctg2cds = load_gtf(opts.gtf)
    if opts.verbose:
        sys.stderr.write("Loaded annotation of %s CDS from %s\n" % (len(id2gene), opts.gtf))

    # genome sequence is needed for RPKM normalisation only
    if opts.rpkm:
        if not opts.fasta:
            parser.error("Specify genome fasta file!")
        if not os.path.isfile(opts.fasta):
            parser.error("No such file: %s" % opts.fasta)
        ctg2seq = genome2dict(opts.fasta)

    bam2counts(opts.bam, opts.rpkm, id2gene, ctg2cds, ctg2seq, opts.verbose)
コード例 #15
0
def main():
    """Entry point: read the options, load the inputs and call sort_hits().

    The coords (-i), query (-q) and reference (-r) file names are mandatory
    and must point to existing files. The two fasta files are read with
    genome2dict() and the coords file with nucmer2list().
    """
    description = """Order contigs based on nucmer output."""
    footer = """Make sure, coords file is sorted by reference (show-coords -r).
Monoploid number (-x) has to be specified correctly. For details look at: http://en.wikipedia.org/wiki/Ploidy"""
    parser = OptionParser(usage="usage: %prog [options]", version="%prog 1.0",
                          description=description, epilog=footer)

    add = parser.add_option
    add("-o", dest="outfn", default="out",
        help="output base name       [%default]")
    add("-i", dest="coords", default="",
        help="coords file name       [mandatory]")
    add("-q", dest="query", default="",
        help="query file name        [mandatory]")
    add("-r", dest="ref", default="",
        help="reference file name    [mandatory]")
    add("-c", dest="qOverlap", default=0.05, type=float,
        help="fract of query aligned [%default]")
    add("-n", dest="haploid", default=2, type=int,
        help="haploid number         [%default]")
    add("-x", dest="monoploid", default=2, type=int,
        help="monoploid number       [%default]")
    add("-v", dest="verbose", default=False, action="store_true")

    opts, positional = parser.parse_args()
    if opts.verbose:
        sys.stderr.write("Options: %s\nFastQ files: %s\n" % (opts, positional))

    # check input files: each one must be named and must exist
    required = [opts.coords, opts.query, opts.ref]
    for path in required:
        if not path:
            parser.error("Provide input file!")
        if not os.path.isfile(path):
            parser.error("No such file: %s" % path)

    # load query and reference genomes
    query_seqs = genome2dict(opts.query)
    ref_seqs = genome2dict(opts.ref)

    # load nucmer
    hits = nucmer2list(opts.coords)

    # sort hits by ref position
    sort_hits(hits, query_seqs, ref_seqs, opts.outfn, opts.qOverlap,
              opts.haploid, opts.monoploid, opts.verbose)
コード例 #16
0
ファイル: vcf2snp.ref.py プロジェクト: jpmtavares/bin
def main():
    """Parse command-line options, load inputs and run check_snps().

    At least one existing vcf file has to be given as a positional argument.
    When a gtf/gff annotation (-g) is given, the genome fasta (-f) becomes
    mandatory; the fasta is loaded with genome2dict() and the annotation with
    load_gff() for *.gff names or load_gtf() otherwise. Candidate positions
    come from load_vcf() and are passed to check_snps() with both bam files.

    Note: -n, -b and -v are store_false options with default True, so passing
    them switches the respective behaviour OFF.
    """
    usage = "usage: %prog [options] *.vcf"
    parser = OptionParser(usage=usage, version="%prog 1.0")

    parser.add_option("-g", dest="gtf",
                      help="genome annotation gtf/gff [requires -f]")
    parser.add_option("-f", dest="fasta",
                      help="genome fasta")
    parser.add_option("-1", dest="bam1",
                      help="sample bam")
    parser.add_option("-2", dest="bam2",
                      help="reference bam")
    parser.add_option("-o", dest="outfn",
                      help="output fname [stdout]")
    parser.add_option("-d", dest="minDepth", default=5, type=int,
                      help="""minimal depth; note both samples need to have pass depth filtering [%default]""")
    parser.add_option("-m", dest="minFreq", default=0.8, type=float,
                      help="min frequency of alternative base [%default]")
    parser.add_option("-n", dest="indels", default=True, action="store_false",
                      help="ignore indels [%default]")
    parser.add_option("-b", dest="bothStrands", default=True, action="store_false",
                      help="report events confirmed by single strand algs")
    parser.add_option("-v", dest="verbose", default=True, action="store_false")

    (o, args) = parser.parse_args()
    if o.verbose:
        sys.stderr.write("%s\n" % (str(o),))

    if not args:
        parser.error("At least one vcf file has to be specified!")

    for fn in args:
        if not os.path.isfile(fn):
            parser.error("No such file: %s" % fn)

    ctg2cds, id2gene, ctg2seq = {}, {}, {}
    if o.gtf:  # if annotation
        # load genome
        if not o.fasta:  # fasta has to be provided
            # was `parser.errer(...)`, which raised AttributeError instead of
            # printing the usage error
            parser.error("Fasta file (-f) is required!")
        elif not os.path.isfile(o.fasta):
            parser.error("No such file: %s" % o.fasta)
        ctg2seq = genome2dict(o.fasta)

        # load genome annotation
        if not os.path.isfile(o.gtf):  # check if correct file
            parser.error("No such file: %s" % o.gtf)
        # load gtf/gff
        if o.gtf.endswith(".gff"):
            id2gene, ctg2cds = load_gff(o.gtf)
        else:
            id2gene, ctg2cds = load_gtf(o.gtf)
        if o.verbose:
            sys.stderr.write("Loaded annotation of %s CDS from %s\n" % (len(id2gene), o.gtf))

    # load possible SNPs coordinates
    coords = load_vcf(args, o.indels)

    # check with mpileup
    check_snps(coords, o.bam1, o.bam2, o.fasta, o.outfn, ctg2cds, id2gene, ctg2seq,
               o.minDepth, o.minFreq, o.indels, o.bothStrands)
コード例 #17
0
ファイル: blat2collapse.py プロジェクト: lpryszcz/bin
def main():
    """Command-line entry point: collapse a multi-fasta file.

    Reads the multi-fasta given with ``-i``, runs BLAT of that file against
    itself and walks the contigs from the longest to the shortest.  The first
    contig is always kept; each following contig is kept only when
    ``overlapping()`` reports no overlap with the contigs kept so far.  Kept
    contigs are appended to
    ``<infile>.collapsed_o<overlap>_i<minIdentity>.fa`` and a summary line is
    written to stderr.

    Exits through ``parser.error`` when ``-i`` is missing or does not point
    to an existing file.
    """
    usage = "usage: %prog [options]"
    desc = """Parse multi-fasta file and report sequences without overlap
with already reported sequences. Starts from the longest."""
    epilog = ""
    parser = OptionParser(usage=usage,
                          version="%prog 1.0",
                          description=desc,
                          epilog=epilog)

    parser.add_option("-i",
                      dest="infile",
                      help="multi-fasta file       [mandatory]")
    parser.add_option("-m",
                      dest="minIdentity",
                      default=90,
                      type=int,
                      help="min identity           [%default]")
    parser.add_option("-o",
                      dest="overlap",
                      default=0.3,
                      type=float,
                      help="max overlap allowed    [%default]")
    parser.add_option("-v", dest="verbose", default=False, action="store_true")

    (o, args) = parser.parse_args()
    if o.verbose:
        sys.stderr.write("Options: %s\nArgs: %s\n" % (o, args))

    # the input file is mandatory and has to exist
    if not o.infile:
        parser.error("Provide input file!")
    if not os.path.isfile(o.infile):
        parser.error("No such file: %s" % o.infile)

    # load fastas
    fastas = genome2dict(o.infile)

    # contigs by descending length
    contigs = sorted(fastas.keys(), key=lambda x: len(fastas[x]), reverse=True)

    # the output is written in append mode below, so a file left over from
    # a previous run has to be removed first
    outfn = o.infile + ".collapsed_o%s_i%s.fa" % (o.overlap, o.minIdentity)
    if os.path.isfile(outfn):
        os.unlink(outfn)

    # execute blat vs itself
    pslfn = run_blat(o.infile, o.infile, o.minIdentity, o.verbose)
    matches = parse_blat(pslfn, o.verbose, header=0, skipSelfMatches=1)

    # add contigs without overlap
    added = set()
    for i, c in enumerate(contigs, 1):
        if o.verbose:
            sys.stderr.write(" %3s %20s [ %7.2f kb]\n" %
                             (i, c, len(fastas[c]) / 1000.0))
        # get fasta entry
        fasta = ">%s\n%s\n" % (c, _get_formatted_seq(fastas[c]))
        # save contig if first or if no overlapping already processed
        if not added or not overlapping(c, added, matches, o.overlap,
                                        o.verbose):
            added.add(c)
            # context manager guarantees the handle is closed even if the
            # write fails
            with open(outfn, "a") as out:
                out.write(fasta)

    sys.stderr.write(
        "Selected %s [ %7.2f kb] out of %s [ %7.2f kb] contigs.\n" %
        (len(added), sum(len(fastas[c]) for c in added) / 10.0**3,
         len(fastas), sum(len(fastas[c]) for c in fastas) / 10.0**3))
コード例 #18
0
def main():
    """Command-line entry point: check candidate variants from VCF files.

    Positional arguments are one or more VCF files.  When a genome
    annotation is given with ``-g`` the genome fasta (``-f``) is mandatory;
    the fasta is loaded with ``genome2dict`` and the annotation with
    ``load_gff`` (file name ending in ``.gff``) or ``load_gtf`` (anything
    else).  Candidate coordinates are then read with ``load_vcf`` and handed
    to ``check_snps`` together with both BAM files and the filtering
    thresholds.

    Exits through ``parser.error`` when no VCF file is given, when any of
    the given files does not exist, or when ``-g`` is used without ``-f``.
    """
    usage = "usage: %prog [options] *.vcf"
    parser = OptionParser(usage=usage,
                          version="%prog 1.0")  # allow_interspersed_args=True

    parser.add_option("-g",
                      dest="gtf",
                      help="genome annotation gtf/gff [requires -f]")
    parser.add_option("-f", dest="fasta", help="genome fasta")
    parser.add_option("-1", dest="bam1", help="sample bam")
    parser.add_option("-2", dest="bam2", help="reference bam")
    parser.add_option("-o", dest="outfn", help="output fname [stdout]")
    parser.add_option(
        "-d",
        dest="minDepth",
        default=5,
        type=int,
        help=
        """minimal depth; note both samples need to have pass depth filtering [%default]"""
    )
    parser.add_option("-m",
                      dest="minFreq",
                      default=0.8,
                      type=float,
                      help="min frequency of alternative base [%default]")
    parser.add_option("-n",
                      dest="indels",
                      default=True,
                      action="store_false",
                      help="ignore indels [%default]")
    parser.add_option("-b",
                      dest="bothStrands",
                      default=True,
                      action="store_false",
                      help="report events confirmed by single strand algs")
    # NOTE(review): verbose defaults to True and -v switches it OFF;
    # kept as-is for backward compatibility - confirm this is intended.
    parser.add_option("-v", dest="verbose", default=True, action="store_false")

    (o, args) = parser.parse_args()
    if o.verbose:
        sys.stderr.write("%s\n" % (str(o), ))

    if not args:
        parser.error("At least one vcf file has to be specified!")

    for fn in args:
        if not os.path.isfile(fn):
            parser.error("No such file: %s" % fn)

    ctg2cds, id2gene, ctg2seq = {}, {}, {}
    if o.gtf:  # if annotation
        # load genome
        if not o.fasta:  # fasta has to be provided
            # was `parser.errer`, which raised AttributeError instead of
            # reporting a usage error
            parser.error("Fasta file (-f) is required!")
        elif not os.path.isfile(o.fasta):
            parser.error("No such file: %s" % o.fasta)
        ctg2seq = genome2dict(o.fasta)

        # load genome annotation
        if not os.path.isfile(o.gtf):  # check if correct file
            parser.error("No such file: %s" % o.gtf)
        # load gtf/gff
        if o.gtf.endswith(".gff"):
            id2gene, ctg2cds = load_gff(o.gtf)
        else:
            id2gene, ctg2cds = load_gtf(o.gtf)
        if o.verbose:
            sys.stderr.write("Loaded annotation of %s CDS from %s\n" %
                             (len(id2gene), o.gtf))

    # load possible SNPs coordinates
    coords = load_vcf(args, o.indels)

    # check with mpileup
    check_snps(coords, o.bam1, o.bam2, o.fasta, o.outfn, ctg2cds, id2gene,
               ctg2seq, o.minDepth, o.minFreq, o.indels, o.bothStrands)