Example #1
0
def main():
    """Collapse a multi-fasta file into sequences that do not overlap each other.

    Command-line entry point. The input multi-fasta is aligned against itself
    with ``run_blat`` and the sequences are then visited from the longest to
    the shortest. A sequence is written to
    ``<infile>.collapsed_o<overlap>_i<minIdentity>.fa`` when it is the first
    one visited or when ``overlapping`` reports no overlap with the sequences
    selected so far. A one-line summary is written to stderr at the end.

    The output file is opened once for the whole run, so it is created (empty)
    even if the input holds no sequences.
    """
    usage  = "usage: %prog [options]"
    desc   = """Parse multi-fasta file and report sequences without overlap
with already reported sequences. Starts from the longest."""
    epilog = ""
    parser = OptionParser( usage=usage,version="%prog 1.0",description=desc,epilog=epilog )

    parser.add_option("-i", dest="infile",
                      help="multi-fasta file       [mandatory]")
    parser.add_option("-m", dest="minIdentity",  default=90, type=int,
                      help="min identity           [%default]")
    parser.add_option("-o", dest="overlap",  default=0.3, type=float,
                      help="max overlap allowed    [%default]")
    parser.add_option("-v", dest="verbose", default=False, action="store_true" )

    ( o, args ) = parser.parse_args()
    if o.verbose:
        sys.stderr.write( "Options: %s\nArgs: %s\n" % ( o,args ) )

    # parser.error() prints the message and terminates the program
    if not o.infile:
        parser.error( "Provide input file!" )
    if not os.path.isfile( o.infile ):
        parser.error( "No such file: %s" % o.infile )

    # load fastas
    fastas = genome2dict( o.infile )

    # contigs by descending length
    contigs = sorted( fastas.keys(), key=lambda x: len(fastas[x]), reverse=True )

    # drop a stale result up front, so a failing blat run cannot leave an
    # outdated output file behind
    outfn = o.infile + ".collapsed_o%s_i%s.fa" % ( o.overlap,o.minIdentity )
    if os.path.isfile( outfn ):
        os.unlink( outfn )

    # execute blat vs itself
    pslfn = run_blat( o.infile,o.infile,o.minIdentity,o.verbose )
    matches = parse_blat( pslfn,o.verbose,header=0,skipSelfMatches=1 )

    # add contigs without overlap; the handle is opened once and closed by
    # the with-block even if an exception interrupts the loop
    added = set()
    with open( outfn,"w" ) as out:
        for i, c in enumerate( contigs, 1 ):
            if o.verbose:
                sys.stderr.write( " %3s %20s [ %7.2f kb]\n" % (i,c,len(fastas[c])/1000.0) )
            # the first (longest) contig is always kept; later ones only if
            # they do not overlap with anything already selected
            if added and overlapping( c,added,matches,o.overlap,o.verbose ):
                continue
            added.add( c )
            # format the record only for contigs that are really written
            out.write( ">%s\n%s\n" % (c,_get_formatted_seq(fastas[c])) )

    selected_kb = sum( len(fastas[c]) for c in added ) / 1000.0
    total_kb    = sum( len(fastas[c]) for c in fastas ) / 1000.0
    sys.stderr.write( "Selected %s [ %7.2f kb] out of %s [ %7.2f kb] contigs.\n" % ( len(added),selected_kb,len(fastas),total_kb ) )
Example #2
0
def main():
    """Report query-target pairs from tabular blast output that pass coverage cut-offs.

    Command-line entry point. Hits returned by ``parse_blast`` are grouped by
    query and then by target. The query and target coverage of all hits of a
    pair are summed, and pairs whose summed coverage reaches ``-q`` / ``-t``
    are written to stdout as a tab-delimited table. Coverages are printed as
    percentages.

    The per-hit columns (identity, alignment length, positions, e-value,
    score) of a reported row all come from the last hit stored for that pair.

    Queries without any reported pair are listed on stderr and saved to
    ``<query>.nomatch.fa``. For such queries the stderr line also dumps the
    stored hits; each hit tuple starts with its identity.
    """
    usage  = "usage: %prog [options] [ 1> matches.table.txt ]"
    desc   = """Blast has to be run with -m8."""
    epilog = ""
    parser = OptionParser( usage=usage,version="%prog 1.0",description=desc,epilog=epilog )

    parser.add_option("-i", dest="infile",  default="",
                      help="blast output")
    parser.add_option("-j", dest="query",  default="",
                      help="query fasta")
    parser.add_option("-k", dest="target",  default="",
                      help="target fasta")
    parser.add_option("-e", dest="evalue", default=1e-05, type=float,
                      help="E-value cut-off [%default]" )
    parser.add_option("-q", dest="qcov",   default=0.3, type=float,
                      help="query coverage  [%default]")
    parser.add_option("-t", dest="tcov",   default=0, type=float,
                      help="target coverage [%default]")
    parser.add_option("-v", dest="verbose",  default=False, action="store_true" )

    ( o, fnames ) = parser.parse_args()
    if o.verbose:
        sys.stderr.write( "Options: %s\nArgs: %s\n" % ( o,fnames ) )

    # check files; parser.error() prints the message and terminates
    for fn in ( o.infile,o.query,o.target ):
        if not fn:
            parser.error( "Provide input file!" )
        if not os.path.isfile( fn ):
            parser.error( "No such file: %s" % fn )

    # get sizes of queries and targets
    q2len = get_contig2size( o.query  )
    t2len = get_contig2size( o.target )
    # get significant matches
    matches = parse_blast( o.infile,q2len,t2len,o.evalue,0,0,o.verbose )

    # NOTE: the header text is kept verbatim (spelling and trailing tab
    # included) so existing consumers of the table are not affected
    sys.stdout.write( "#Query\tTarget\tIndentity\tAlignment length\tMismatches\tGaps\tQuery start\tQuery end\tTarget start\tTarget end\tE-value\tScore\tQuery aligned [%]\tTarget aligned [%]\t" + "\n" )

    # group hits: query -> target -> list of hits
    matches_collapsed = {}
    for qlocus,tlocus,identity,algLen,mismatches,gaps,qstart,qend,tstart,tend,e,score,qcov,tcov in matches:
        # identity is stored with the hit; otherwise every reported row would
        # show the identity of the very last hit of the blast file
        hit = ( identity,algLen,mismatches,gaps,qstart,qend,tstart,tend,e,score,qcov,tcov )
        matches_collapsed.setdefault( qlocus,{} ).setdefault( tlocus,[] ).append( hit )

    # report pairs with enough summed coverage
    matched_queries = set()
    for qlocus in sorted( matches_collapsed ):
        for tlocus in sorted( matches_collapsed[qlocus] ):
            hits = matches_collapsed[qlocus][tlocus]
            qCov = sum( hit[-2] for hit in hits )
            tCov = sum( hit[-1] for hit in hits )
            if qCov<o.qcov or tCov<o.tcov:
                continue
            # per-hit columns are taken from the last hit of the pair
            identity,algLen,mismatches,gaps,qstart,qend,tstart,tend,e,score = hits[-1][:10]
            row = "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%.1f\t%.1f\n" % (qlocus,tlocus,identity,algLen,mismatches,gaps,qstart,qend,tstart,tend,e,score,qCov*100,tCov*100, )
            sys.stdout.write( row )
            matched_queries.add( qlocus )

    # list and save queries with no valid match; both handles are closed by
    # the with-blocks
    sys.stderr.write( "Queries without valid matches:\n" )
    i = 0
    with open( o.query + ".nomatch.fa","w" ) as out:
        with open( o.query ) as handle:
            for r in SeqIO.parse( handle,"fasta" ):
                if r.id in matched_queries:
                    continue
                i += 1
                line = "%s\t%s\t%s" % (i,r.id,len(r.seq))
                # query had hits, but none of its targets passed the cut-offs
                if r.id in matches_collapsed:
                    line += "\t%s" % str( matches_collapsed[r.id] )
                sys.stderr.write( line+"\n" )
                # save fasta
                out.write( ">%s\n%s\n" % ( r.id,_get_formatted_seq( r.seq ) ) )
Example #3
0
def sort_hits( matches,query2fasta,ref2fasta,outbase,qOverlapTh,haploid,monoploid,verbose ):
    """Write query contigs oriented by their best reference to ``<outbase>.sorted.fa``.

    For every query the alignments are summed per reference, and the
    reference with the most aligned query bases is taken as the best one.
    Queries with no alignment, or whose best reference covers less than
    ``qOverlapTh * len(query)`` bases, are left out. The remaining queries
    are written once each, in the order in which their best reference first
    shows up in ``matches``. A query is written as is when more bases align
    forward than reverse; otherwise its reverse complement is written under
    the name ``<query>|rev``.

    Parameters:
      matches     -- sequence of (ref, refStart, refStop, query, queryStart,
                     queryStop, identity) tuples; it is iterated twice, so it
                     must not be a one-shot iterator. A query absent from
                     ``query2fasta`` raises KeyError.
      query2fasta -- mapping of query name to sequence; sequences need
                     ``len()`` and ``reverse_complement()``
      outbase     -- prefix of the output file name
      qOverlapTh  -- minimal aligned fraction of the query
      verbose     -- if true, report the ranked references of every selected
                     query on stderr
      ref2fasta, haploid, monoploid -- currently unused; kept so existing
                     callers keep working

    Returns None.
    """
    # per query and reference:
    # [aligned bases, lowest ref start, highest ref stop, fwd bases, rev bases]
    q2r = dict( (q,{}) for q in query2fasta )
    for r,rStart,rStop,q,qStart,qStop,identity in matches:
        qAligned = abs( qStop-qStart )
        # alignment is reverse when query coordinates run backwards
        if qStop<qStart:
            fwd,rev = 0,qAligned
        else:
            fwd,rev = qAligned,0
        stats = q2r[q].get( r )
        if stats is None:
            q2r[q][r] = [qAligned,rStart,rStop,fwd,rev]
            continue
        stats[0] += qAligned
        stats[1]  = min( stats[1],rStart )
        stats[2]  = max( stats[2],rStop )
        stats[3] += fwd
        stats[4] += rev

    # get best match for each query
    q2rBest = {}
    for q in query2fasta:
        refs = sorted( q2r[q].items(), key=lambda x: x[1][0], reverse=True )
        # skip contigs without a match or with too small fraction aligned
        if not refs or refs[0][1][0] < qOverlapTh * len(query2fasta[q]):
            continue
        q2rBest[q] = refs[0]
        # diagnostics belong on stderr and only when asked for
        if verbose:
            sys.stderr.write( "%s %s\n" % ( q,refs ) )

    # NOTE: unreachable drafts that followed the first return (reference-ordered
    # output, splitting into several files) were removed; they never executed.
    with open( "%s.sorted.fa" % outbase,"w" ) as qOut:
        for r,rStart,rStop,q,qStart,qStop,identity in matches:
            best = q2rBest.get( q )
            # only the best reference of a not-yet-saved query triggers output
            if best is None or r != best[0]:
                continue
            # forget the query so it is saved only once
            del q2rBest[q]
            fwd,rev = best[1][3],best[1][4]
            if fwd>rev:
                qOut.write( ">%s\n%s\n" % ( q, _get_formatted_seq( query2fasta[q] ) ) )
            # ties and mostly-reverse alignments get the reverse complement
            else:
                qOut.write( ">%s|rev\n%s\n" % ( q, _get_formatted_seq( query2fasta[q].reverse_complement() ) ) )