Beispiel #1
0
def main():
    """Print BLAST matches that pass the cut-offs as a tab-delimited table.

    Command-line entry point. Three existing files are required: the blast
    output (-i), the query FASTA (-j) and the target FASTA (-k); a missing
    option or a non-existent file ends the run through ``parser.error``.
    The E-value (-e), query coverage (-q) and target coverage (-t) cut-offs
    are handed to ``parse_blast`` together with the values returned by
    ``get_contig2size`` for both FASTA files. A header line followed by one
    line per returned match is written to stdout; with -v the parsed
    options are echoed to stderr first.
    """
    usage = "usage: %prog [options] [ > out.bed ]"
    desc = """Blast has to be run with -m8."""
    epilog = ""
    parser = OptionParser(usage=usage, version="%prog 1.0",
                          description=desc, epilog=epilog)

    parser.add_option("-i", dest="infile", default="",
                      help="blast output")
    parser.add_option("-j", dest="query", default="",
                      help="query fasta")
    parser.add_option("-k", dest="target", default="",
                      help="target fasta")
    parser.add_option("-e", dest="evalue", default=1e-05, type=float,
                      help="E-value cut-off [%default]")
    parser.add_option("-q", dest="qcov", default=0, type=float,
                      help="query coverage  [%default]")
    parser.add_option("-t", dest="tcov", default=0, type=float,
                      help="target coverage [%default]")
    parser.add_option("-v", dest="verbose", default=False,
                      action="store_true")

    (o, fnames) = parser.parse_args()
    if o.verbose:
        sys.stderr.write("Options: %s\nArgs: %s\n" % (o, fnames))

    # all three input files have to be given and have to exist
    for fn in (o.infile, o.query, o.target):
        if not fn:
            parser.error("Provide input file!")
        if not os.path.isfile(fn):
            parser.error("No such file: %s" % fn)

    # sizes of queries and targets (presumably name -> length mappings;
    # the helper is defined outside this function)
    q2len = get_contig2size(o.query)
    t2len = get_contig2size(o.target)
    # matches passing the E-value / coverage cut-offs
    matches = parse_blast(o.infile, q2len, t2len,
                          o.evalue, o.qcov, o.tcov, o.verbose)

    # NOTE: the "Indentity" spelling and the trailing tab are kept exactly
    # as they were so the output stays byte-compatible for downstream tools.
    header = ("#Query\tTarget\tIndentity\tAlignment length\tMismatches\tGaps\t"
              "Query start\tQuery end\tTarget start\tTarget end\tE-value\tScore\t"
              "Query aligned [%]\tTarget aligned [%]\t")
    row = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%.1f\t%.1f\n"

    # sys.stdout.write instead of the print statement: same bytes on
    # Python 2, and the function still parses on Python 3.
    write = sys.stdout.write
    write(header + "\n")
    for (qlocus, tlocus, identity, algLen, mismatches, gaps,
         qstart, qend, tstart, tend, e, score, qcov, tcov) in matches:
        # coverages are multiplied by 100 to match the "[%]" columns
        write(row % (qlocus, tlocus, identity, algLen, mismatches, gaps,
                     qstart, qend, tstart, tend, e, score,
                     qcov * 100, tcov * 100))
Beispiel #2
0
def main():
    """Write significant BLAST matches to stdout as a tab-separated table.

    Command-line entry point. The blast output (-i), query FASTA (-j) and
    target FASTA (-k) options have no default, so each is ``None`` unless
    given; an unset option or a path that is not a file aborts through
    ``parser.error``. ``get_contig2size`` is called on both FASTA files and
    the results go to ``parse_blast`` along with the E-value (-e), query
    coverage (-q) and target coverage (-t) cut-offs. One header line and
    one line per match returned by ``parse_blast`` are printed; -v echoes
    the parsed options to stderr.
    """
    usage = "usage: %prog [options] [ > out.bed ]"
    desc = """Blast has to be run with -m8."""
    epilog = ""
    parser = OptionParser(usage=usage, version="%prog 1.0",
                          description=desc, epilog=epilog)

    parser.add_option("-i", dest="infile", help="blast output")
    parser.add_option("-j", dest="query", help="query fasta")
    parser.add_option("-k", dest="target", help="target fasta")
    parser.add_option("-e", dest="evalue", default=1e-05, type=float,
                      help="E-value cut-off [%default]")
    parser.add_option("-q", dest="qcov", default=0, type=float,
                      help="query coverage  [%default]")
    parser.add_option("-t", dest="tcov", default=0, type=float,
                      help="target coverage [%default]")
    parser.add_option("-v", dest="verbose", default=False,
                      action="store_true")

    (o, fnames) = parser.parse_args()
    if o.verbose:
        sys.stderr.write("Options: %s\nArgs: %s\n" % (o, fnames))

    # every input must be provided (None when the option is absent)
    # and must point at an existing file
    for fn in (o.infile, o.query, o.target):
        if not fn:
            parser.error("Provide input file!")
        if not os.path.isfile(fn):
            parser.error("No such file: %s" % fn)

    # sizes of queries and targets, as returned by the external helper
    q2len = get_contig2size(o.query)
    t2len = get_contig2size(o.target)
    # matches passing the E-value / coverage cut-offs
    matches = parse_blast(o.infile, q2len, t2len,
                          o.evalue, o.qcov, o.tcov, o.verbose)

    # NOTE: header text (including the "Indentity" spelling and the
    # trailing tab) is intentionally left untouched to keep the output
    # format stable.
    header = ("#Query\tTarget\tIndentity\tAlignment length\tMismatches\tGaps\t"
              "Query start\tQuery end\tTarget start\tTarget end\tE-value\tScore\t"
              "Query aligned [%]\tTarget aligned [%]\t")
    # the print statement was Python-2-only syntax; writing to sys.stdout
    # emits the same bytes and is valid on Python 2 and 3 alike
    sys.stdout.write(header + "\n")

    line_format = ("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s"
                   "\t%.1f\t%.1f\n")
    for (qlocus, tlocus, identity, algLen, mismatches, gaps,
         qstart, qend, tstart, tend, e, score, qcov, tcov) in matches:
        # the last two columns are printed as percentages
        out = line_format % (qlocus, tlocus, identity, algLen, mismatches,
                             gaps, qstart, qend, tstart, tend, e, score,
                             qcov * 100, tcov * 100)
        sys.stdout.write(out)
Beispiel #3
0
def main():
    """Tabulate the best-scoring BLAST match per target for several samples.

    Command-line entry point. Positional arguments are blast output files
    (one per sample); -k names the target FASTA. For every blast file the
    matches returned by ``parse_blast`` are reduced to the highest-scoring
    match per target, and a table with one row per target (sorted by name)
    and an identity / coverage column pair per sample is written to stdout.
    Targets without a match in a sample are reported as 0 / 0.

    Sample names are the file names cut at the first "." unless -s is
    given (the option is ``store_false``, so -s *disables* the split).
    Missing or non-existent input files end the run via ``parser.error``.
    """
    usage = "usage: %prog [options] blastout1 [blastout2 ... blastoutN]  [ > out ]"
    desc = """Blast has to be run with -m8."""
    epilog = ""
    parser = OptionParser(usage=usage,
                          version="%prog 1.0",
                          description=desc,
                          epilog=epilog)

    parser.add_option("-k", dest="target", default="", help="target fasta")
    parser.add_option("-e",
                      dest="evalue",
                      default=1e-05,
                      type=float,
                      help="E-value cut-off [%default]")
    parser.add_option("-q",
                      dest="qcov",
                      default=0,
                      type=float,
                      help="query coverage  [%default]")
    parser.add_option("-t",
                      dest="tcov",
                      default=0,
                      type=float,
                      help="target coverage [%default]")
    parser.add_option("-s",
                      dest="fnsplit",
                      default=True,
                      action="store_false",
                      help="split fnames    [%default]")
    parser.add_option("-v", dest="verbose", default=False, action="store_true")

    (o, fnames) = parser.parse_args()
    if o.verbose:
        sys.stderr.write("Options: %s\nArgs: %s\n" % (o, fnames))

    # the usage line requires at least one blast output; without this
    # check an empty table would be printed silently
    if not fnames:
        parser.error("Provide input file!")
    # every blast output and the target FASTA must exist
    for fn in fnames + [o.target]:
        if not fn:
            parser.error("Provide input file!")
        if not os.path.isfile(fn):
            parser.error("No such file: %s" % fn)

    # sizes of targets; only the keys (target names) are used here
    t2len = get_contig2size(o.target)
    targets = sorted(t2len.keys())
    # name -> row index, so each match is placed in O(1) instead of the
    # linear scan of list.index() per match
    target2idx = dict((name, idx) for idx, name in enumerate(targets))

    samples = []    # sample names, in command-line order
    s2matches = []  # per sample: list aligned with `targets`
    for fn in fnames:
        # sample name: file name up to the first dot unless -s was given
        samples.append(fn.split(".")[0] if o.fnsplit else fn)

        # best match per target; None means "no match seen yet"
        smatch = [None] * len(targets)

        # query sizes are not loaded in this script; an empty mapping is
        # handed to parse_blast instead
        q2len = {}
        matches = parse_blast(fn, q2len, t2len, o.evalue, o.qcov, o.tcov,
                              o.verbose)

        for (qlocus, tlocus, identity, algLen, mismatches, gaps,
             qstart, qend, tstart, tend, e, score, qcov, tcov) in matches:
            i = target2idx[tlocus]
            best = smatch[i]
            # keep the first match, or replace it by a strictly better score
            if best is None or score > best[2]:
                smatch[i] = (qlocus, e, score, identity, tcov)
        s2matches.append(smatch)

    # sys.stdout.write replaces the Python-2-only print statement; the
    # emitted bytes are the same
    write = sys.stdout.write
    # two header lines: sample names, then the column pair per sample
    write("Target" + "".join("\t%s\t" % s for s in samples) + "\n")
    write("\t" + "identity [%]\tcoverage [%]\t" * len(samples) + "\n")
    # one row per target
    for i, target in enumerate(targets):
        line = target
        for smatch in s2matches:
            if smatch[i] is not None:
                qlocus, e, score, identity, tcov = smatch[i]
            else:
                identity = tcov = 0
            # coverage is multiplied by 100 to match the "[%]" column
            line += "\t%6.2f\t%6.2f" % (identity, tcov * 100)
        write(line + "\n")
Beispiel #4
0
def main():
    """Report, per target, the best BLAST match found in each blast output.

    Command-line entry point. Each positional argument is a blast output
    file treated as one sample; -k is the target FASTA. The matches that
    ``parse_blast`` returns for a sample are collapsed to the match with
    the highest score for every target. The result is printed to stdout as
    a table: one row per target (sorted by name), and for every sample the
    identity and the target coverage of its best match (0 and 0 when the
    sample has no match for that target).

    By default the sample name is the file name up to the first "."; the
    -s flag is ``store_false`` and therefore switches that splitting off.
    ``parser.error`` is called when no input is given or a file is missing.
    """
    usage = "usage: %prog [options] blastout1 [blastout2 ... blastoutN]  [ > out ]"
    desc = """Blast has to be run with -m8."""
    epilog = ""
    parser = OptionParser(usage=usage, version="%prog 1.0",
                          description=desc, epilog=epilog)

    parser.add_option("-k", dest="target", default="",
                      help="target fasta")
    parser.add_option("-e", dest="evalue", default=1e-05, type=float,
                      help="E-value cut-off [%default]")
    parser.add_option("-q", dest="qcov", default=0, type=float,
                      help="query coverage  [%default]")
    parser.add_option("-t", dest="tcov", default=0, type=float,
                      help="target coverage [%default]")
    parser.add_option("-s", dest="fnsplit", default=True, action="store_false",
                      help="split fnames    [%default]")
    parser.add_option("-v", dest="verbose", default=False,
                      action="store_true")

    (o, fnames) = parser.parse_args()
    if o.verbose:
        sys.stderr.write("Options: %s\nArgs: %s\n" % (o, fnames))

    # at least one blast output is mandatory according to the usage line;
    # previously an empty argument list produced an empty table
    if not fnames:
        parser.error("Provide input file!")
    # check blast outputs and the target FASTA
    for fn in fnames + [o.target]:
        if not fn:
            parser.error("Provide input file!")
        if not os.path.isfile(fn):
            parser.error("No such file: %s" % fn)

    # sizes of targets; the sorted keys define the row order of the table
    t2len = get_contig2size(o.target)
    targets = sorted(t2len.keys())
    # precomputed position of every target: avoids calling
    # targets.index() (a linear search) once per match
    position = dict((name, idx) for idx, name in enumerate(targets))

    samples = []
    s2matches = []
    for fn in fnames:
        # define sample name, split by dot unless disabled with -s
        s = fn
        if o.fnsplit:
            s = fn.split(".")[0]
        samples.append(s)

        # one slot per target; None marks a target without any match
        smatch = [None] * len(targets)

        # no query sizes are read here, parse_blast gets an empty mapping
        q2len = {}
        matches = parse_blast(fn, q2len, t2len,
                              o.evalue, o.qcov, o.tcov, o.verbose)

        for (qlocus, tlocus, identity, algLen, mismatches, gaps,
             qstart, qend, tstart, tend, e, score, qcov, tcov) in matches:
            i = position[tlocus]
            # store the first match for a target, afterwards only a match
            # with a strictly higher score replaces it
            if smatch[i] is None or score > smatch[i][2]:
                smatch[i] = (qlocus, e, score, identity, tcov)
        s2matches.append(smatch)

    # header: sample names, then an identity/coverage pair per sample.
    # Written with sys.stdout.write (same output as the former print
    # statements, but also valid Python 3 syntax).
    header = "Target" + "".join("\t%s\t" % s for s in samples)
    sys.stdout.write(header + "\n")
    sys.stdout.write("\t" + "identity [%]\tcoverage [%]\t" * len(samples) + "\n")

    # data: one line per target
    for i, target in enumerate(targets):
        cells = [target]
        for smatch in s2matches:
            if smatch[i] is not None:
                qlocus, e, score, identity, tcov = smatch[i]
            else:
                identity = tcov = 0
            # coverage is scaled by 100 for the "[%]" column
            cells.append("\t%6.2f\t%6.2f" % (identity, tcov * 100))
        sys.stdout.write("".join(cells) + "\n")
Beispiel #5
0
def main():
    """Report query/target pairs whose summed BLAST coverage passes cut-offs.

    Command-line entry point. Needs the blast output (-i), the query FASTA
    (-j) and the target FASTA (-k); ``parser.error`` ends the run if one is
    missing. ``parse_blast`` is called with both coverage cut-offs set to 0
    and its matches are grouped by (query, target). For every pair the
    query and target coverages of all its matches are summed; pairs whose
    sums reach -q / -t are written to stdout, one line per pair. The
    per-match columns of that line come from the last match stored for the
    pair; the two coverage columns hold the sums.

    Queries that end up without a reported pair are listed on stderr
    (together with their collapsed matches, if any) and their sequences
    are written to ``<query fasta>.nomatch.fa``.
    """
    usage = "usage: %prog [options] [ 1> matches.table.txt ]"
    desc = """Blast has to be run with -m8."""
    epilog = ""
    parser = OptionParser(usage=usage, version="%prog 1.0",
                          description=desc, epilog=epilog)

    parser.add_option("-i", dest="infile", default="",
                      help="blast output")
    parser.add_option("-j", dest="query", default="",
                      help="query fasta")
    parser.add_option("-k", dest="target", default="",
                      help="target fasta")
    parser.add_option("-e", dest="evalue", default=1e-05, type=float,
                      help="E-value cut-off [%default]")
    parser.add_option("-q", dest="qcov", default=0.3, type=float,
                      help="query coverage  [%default]")
    parser.add_option("-t", dest="tcov", default=0, type=float,
                      help="target coverage [%default]")
    parser.add_option("-v", dest="verbose", default=False,
                      action="store_true")

    (o, fnames) = parser.parse_args()
    if o.verbose:
        sys.stderr.write("Options: %s\nArgs: %s\n" % (o, fnames))

    # all three input files have to be given and have to exist
    for fn in (o.infile, o.query, o.target):
        if not fn:
            parser.error("Provide input file!")
        if not os.path.isfile(fn):
            parser.error("No such file: %s" % fn)

    # sizes of queries and targets, as returned by the external helper
    q2len = get_contig2size(o.query)
    t2len = get_contig2size(o.target)
    # coverage filters are passed as 0 on purpose: -q / -t are applied
    # further down to the coverage summed over all matches of a pair
    matches = parse_blast(o.infile, q2len, t2len, o.evalue, 0, 0, o.verbose)

    # NOTE: header text ("Indentity", trailing tab) is kept byte-identical.
    # sys.stdout.write replaces the Python-2-only print statement.
    header = ("#Query\tTarget\tIndentity\tAlignment length\tMismatches\tGaps\t"
              "Query start\tQuery end\tTarget start\tTarget end\tE-value\tScore\t"
              "Query aligned [%]\tTarget aligned [%]\t")
    sys.stdout.write(header + "\n")

    # group matches: query -> target -> list of match tuples.
    # identity is stored with each match; it used to be left out, which
    # made every output row show the identity of the very last match read.
    matches_collapsed = {}
    for (qlocus, tlocus, identity, algLen, mismatches, gaps,
         qstart, qend, tstart, tend, e, score, qcov, tcov) in matches:
        hsp = (identity, algLen, mismatches, gaps,
               qstart, qend, tstart, tend, e, score, qcov, tcov)
        matches_collapsed.setdefault(qlocus, {}).setdefault(tlocus, []).append(hsp)

    row = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%.1f\t%.1f\n"
    matched_queries = set()
    for qlocus in sorted(matches_collapsed):
        t2hsps = matches_collapsed[qlocus]
        for tlocus in sorted(t2hsps):
            hsps = t2hsps[tlocus]
            # coverage summed over all matches of this pair; the sum is
            # not corrected for overlapping matches
            qCov = sum(hsp[-2] for hsp in hsps)
            tCov = sum(hsp[-1] for hsp in hsps)
            if qCov < o.qcov or tCov < o.tcov:
                continue
            # per-match columns are taken from the last stored match
            (identity, algLen, mismatches, gaps,
             qstart, qend, tstart, tend, e, score) = hsps[-1][:-2]
            sys.stdout.write(row % (qlocus, tlocus, identity, algLen,
                                    mismatches, gaps, qstart, qend,
                                    tstart, tend, e, score,
                                    qCov * 100, tCov * 100))
            matched_queries.add(qlocus)

    # queries without a reported pair: list on stderr, save as FASTA
    sys.stderr.write("Queries without valid matches:\n")
    count = 0
    # both handles are closed deterministically (they were leaked before)
    with open(o.query + ".nomatch.fa", "w") as nomatch:
        with open(o.query) as handle:
            for r in SeqIO.parse(handle, "fasta"):
                if r.id in matched_queries:
                    continue
                count += 1
                line = "%s\t%s\t%s" % (count, r.id, len(r.seq))
                # show the matches that were found but did not pass
                if r.id in matches_collapsed:
                    line += "\t%s" % str(matches_collapsed[r.id])
                sys.stderr.write(line + "\n")
                nomatch.write(">%s\n%s\n" % (r.id, _get_formatted_seq(r.seq)))
Beispiel #6
0
def main():
    """Collapse BLAST matches per query/target pair and filter by coverage.

    Command-line entry point. The blast output (-i), query FASTA (-j) and
    target FASTA (-k) must all be given and exist, otherwise
    ``parser.error`` is called. Matches come from ``parse_blast`` (called
    with coverage cut-offs of 0) and are collected per query and target.
    The query and target coverages of all matches of a pair are added up
    and compared against -q and -t; each passing pair is printed to stdout
    as one line, using the last collected match for the per-match columns
    and the summed coverages for the two "[%]" columns.

    Every query of the query FASTA that has no passing pair is reported on
    stderr (with its collected matches when there are any) and its
    sequence is saved to ``<query fasta>.nomatch.fa``.
    """
    usage = "usage: %prog [options] [ 1> matches.table.txt ]"
    desc = """Blast has to be run with -m8."""
    epilog = ""
    parser = OptionParser(usage=usage,
                          version="%prog 1.0",
                          description=desc,
                          epilog=epilog)

    parser.add_option("-i", dest="infile", default="", help="blast output")
    parser.add_option("-j", dest="query", default="", help="query fasta")
    parser.add_option("-k", dest="target", default="", help="target fasta")
    parser.add_option("-e",
                      dest="evalue",
                      default=1e-05,
                      type=float,
                      help="E-value cut-off [%default]")
    parser.add_option("-q",
                      dest="qcov",
                      default=0.3,
                      type=float,
                      help="query coverage  [%default]")
    parser.add_option("-t",
                      dest="tcov",
                      default=0,
                      type=float,
                      help="target coverage [%default]")
    parser.add_option("-v", dest="verbose", default=False, action="store_true")

    (o, fnames) = parser.parse_args()
    if o.verbose:
        sys.stderr.write("Options: %s\nArgs: %s\n" % (o, fnames))

    # check files
    for fn in (o.infile, o.query, o.target):
        if not fn:
            parser.error("Provide input file!")
        if not os.path.isfile(fn):
            parser.error("No such file: %s" % fn)

    # sizes of queries and targets, as returned by the external helper
    q2len = get_contig2size(o.query)
    t2len = get_contig2size(o.target)
    # the two zeros disable coverage filtering inside parse_blast; the
    # -q / -t cut-offs are applied to the per-pair sums below
    matches = parse_blast(o.infile, q2len, t2len, o.evalue, 0, 0, o.verbose)

    # the header is emitted unchanged (spelling and trailing tab included);
    # sys.stdout.write is used because the print statement is Python-2-only
    sys.stdout.write(
        "#Query\tTarget\tIndentity\tAlignment length\tMismatches\tGaps\t"
        "Query start\tQuery end\tTarget start\tTarget end\tE-value\tScore\t"
        "Query aligned [%]\tTarget aligned [%]\t" + "\n")

    # query -> target -> list of matches. Each stored tuple now starts
    # with the identity: it was missing before, so the report printed a
    # stale identity left over from the last match of this loop.
    matches_collapsed = {}
    for (qlocus, tlocus, identity, algLen, mismatches, gaps,
         qstart, qend, tstart, tend, e, score, qcov, tcov) in matches:
        per_target = matches_collapsed.setdefault(qlocus, {})
        per_target.setdefault(tlocus, []).append(
            (identity, algLen, mismatches, gaps, qstart, qend, tstart, tend,
             e, score, qcov, tcov))

    out_format = ("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s"
                  "\t%.1f\t%.1f\n")
    matched_queries = set()
    for qlocus in sorted(matches_collapsed):
        for tlocus in sorted(matches_collapsed[qlocus]):
            pair_matches = matches_collapsed[qlocus][tlocus]
            # add up coverage over all matches of the pair (overlaps
            # between matches are not subtracted)
            qCov = tCov = 0
            for match in pair_matches:
                qCov += match[10]
                tCov += match[11]

            if qCov < o.qcov or tCov < o.tcov:
                continue
            # columns describing a single match come from the last one
            last = pair_matches[-1]
            sys.stdout.write(out_format % (
                (qlocus, tlocus) + last[:10] + (qCov * 100, tCov * 100)))
            matched_queries.add(qlocus)

    # report queries with no valid match and save their sequences
    sys.stderr.write("Queries without valid matches:\n")
    i = 0
    # context managers close both files; the original never closed them
    with open(o.query + ".nomatch.fa", "w") as nomatch_fasta:
        with open(o.query) as query_fasta:
            for r in SeqIO.parse(query_fasta, "fasta"):
                if r.id in matched_queries:
                    continue
                i += 1
                line = "%s\t%s\t%s" % (i, r.id, len(r.seq))
                # append rejected matches of this query, if there are any
                if r.id in matches_collapsed:
                    line += "\t%s" % str(matches_collapsed[r.id])
                sys.stderr.write(line + "\n")
                # save fasta
                nomatch_fasta.write(
                    ">%s\n%s\n" % (r.id, _get_formatted_seq(r.seq)))