def main():
    """Post-process a blast XML output file into index and fasta files.

    Command-line driven pipeline:

    1. (``-P``) parse the blast XML given with ``-i`` and write the requested
       fields to ``<output>.index.0``, optionally filtered (``-f``).
    2. (``-U``) keep only the best e-value for each header.
    3. Read the identifier (second ``|``-separated field) of every index line
       and fetch the sequences into ``<output>.fasta.0`` with the fetcher
       selected by ``-b`` (``legacy`` or anything else).
    4. (``-M``) add full headers, keep the top sequences and save them to
       ``<output>.<exponent>.<count>.fasta``.

    Progress messages are written to stderr according to ``-v``.
    Exits through ``parser.error`` (status 2) when a required option is
    missing.
    """
    parser = optparse.OptionParser()
    parser.add_option('-i', '--inputfile', dest='inputfilename',
                      help='blast output file, in xml format.',
                      metavar='FILE.xml')
    parser.add_option('-o', '--outputfile', dest='outputfilename',
                      help='base output filename',
                      metavar='FILE')
    parser.add_option('-d', '--db', dest='database',
                      help='database from which the sequences should be fetched.',
                      metavar='FILE')
    parser.add_option('-e', '--evalue', dest='evalue', type='float',
                      help='e-value threshold.',
                      metavar='FLOAT')
    parser.add_option('-E', '--start_expo_evalue', dest='startexpoeval',
                      type='int',
                      help='exponent of the evalue threshold used when refiltering.',
                      metavar='INT')
    parser.add_option('-b', '--blast_version', dest='blastversion',
                      help='set the blast version to use, either `legacy` or `plus`.',
                      metavar='VERSION')
    parser.add_option('-f', '--filter', action='store_true', dest='dofilter',
                      default=False,
                      help='do the filter step.')
    parser.add_option('-p', '--keep_patterns_iff', dest='keeppatiff',
                      help='Keep only if patterns match exactly. The patterns should be coma seperated.',
                      metavar='keyword1:pat1,pat2,pat3,,keyword2:pat1,pat2')
    parser.add_option('-q', '--keep_patterns', dest='keeppat',
                      help='Keep patterns that match exactly, no matter what. The patterns should be coma seperated.',
                      metavar='keyword1:pat1,pat2,pat3,,keyword2:pat1,pat2')
    parser.add_option('-g', '--gis', dest='gis',
                      help='pickle file containing the gis that should match',
                      metavar='FILE')
    parser.add_option('-F', '--format', dest='formatop',
                      help='format of the output. default is `header,evalue`',
                      metavar='INTEGER')
    # The help fragments below are concatenated, so each one must carry its
    # own separating space.
    parser.add_option('-M', '--max_num_start_seq', dest='maxnumstartseq',
                      type='int',
                      help='maximum number of sequences in the first alignement to be '
                           'processed. If set, a new input file with the top sequences ordered '
                           'by evalue is created and used.',
                      metavar='INTEGER')
    parser.add_option('-k', '--keep_U', action='store_true', dest='keepu',
                      default=False,
                      help='Should U containing sequences be kept regardless of their evalues ?. '
                           'Use in conjunction of -M')
    parser.add_option('-T', '--temp', dest='temp',
                      help='set the temp folder to use.',
                      metavar='FOLDER')
    parser.add_option('-P', '--parse', dest='parse', action='store_true',
                      default=False,
                      help='do not do extra fancy steps. Just parse the file and return the disired output in a file.')
    parser.add_option('-U', '--uniq', dest='uniq', action='store_true',
                      default=False,
                      help='remove duplicates.')
    parser.add_option('-v', '--verbose', dest='verbosity', type='int',
                      help='verbosity level : 0=none ; 1=standard ; 2=detailed ; 3=full',
                      metavar='INTEGER')
    parser.set_defaults(verbosity=1,
                        database='nr',
                        evalue=10,
                        startexpoeval=-10,
                        keeppat=None,
                        blastversion='legacy',
                        temp='/tmp/',
                        maxnumstartseq=None,
                        formatop='header,evalue')
    (options, args) = parser.parse_args()

    # Validate before touching the filesystem: every output path derives from
    # -o, and the parse step cannot run without -i.
    if not options.outputfilename:
        parser.error('an output base filename is required (-o/--outputfile).')
    if options.parse and not options.inputfilename:
        parser.error('an input file is required when parsing (-i/--inputfile).')

    verbosity = options.verbosity
    database = options.database
    evalue = options.evalue
    maxnumstartseq = options.maxnumstartseq

    def report(level, message):
        """Write message to stderr when the verbosity is at least level."""
        if verbosity >= level:
            sys.stderr.write(message)

    blastindexfile = ''.join((options.outputfilename, '.index.0'))
    blastfastafile = ''.join((options.outputfilename, '.fasta.0'))
    # Same effect as `touch`: create the file if needed and refresh its
    # timestamps, without handing a user-supplied path to a shell.
    for path in (blastindexfile, blastfastafile):
        with open(path, 'a'):
            os.utime(path, None)

    if options.blastversion == 'legacy':
        fetcher = FastaCmdWrapper(entry=[], db=database,
                                  outfile=blastfastafile)
    else:
        fetcher = BlastDbCmdWrapper(entry=[], db=database,
                                    outfile=blastfastafile)

    ## Parse the blast output file.
    if options.parse:
        report(1, '\n' + '>>> Parsing blast output : ' +
               options.inputfilename + '\n')
        # The input stays open until extraction is done, in case the parser
        # reads from the handle lazily.
        with open(options.inputfilename, 'r') as infile:
            blastparser = PsiBlastXMLParser(infile)
            blastparser.parse()
            report(2, ' >>> Extracting required data.\n')
            if options.dofilter:
                blastparser.extractData(
                    evalue=evalue,
                    fmt=options.formatop,
                    outfile=blastindexfile,
                    includepatternsiff=fmtOptPat(options.keeppatiff),
                    includepatterns=fmtOptPat(options.keeppat),
                    excludepatterns=({'title': ['hypothetical',
                                                'predicted',
                                                'PREDICTED']}))
            else:
                blastparser.extractData(evalue=evalue,
                                        fmt=options.formatop,
                                        outfile=blastindexfile)

    ## Only keep one copy of a header, the one with the best evalue.
    if options.uniq:
        report(1, '\n' + '>>> Keeping only best evalues.\n')
        uniq(blastindexfile)

    ## Gather all GIs in list
    report(2, '\n' + '>>> Gathering all Gis.\n')
    entries = []
    with open(blastindexfile, 'r') as bif:
        for line in bif:
            # A blank line has no `|` field and would raise IndexError.
            if not line.strip():
                continue
            entries.append(line.split('|')[1])
    fetcher.entry = entries

    ## Fetch the sequences from the local databases.
    ## TODO : Fetch failed from the web.
    report(1, '\n' + '>>> Building fasta.0 file by fetching sequences from local database.\n')
    fetcher.run()

    ## Apply final filters : keep only top evalues and U containing until a threshold is reached
    if maxnumstartseq:
        report(1, '\n' + '>>> Applying final filters on ' +
               blastfastafile + '.\n')
        report(2, ' >>> Adding evalue to headers.\n')
        ### TODO : use .fasta.fh in tmp dir.
        tmpfullheadfasta = blastfastafile + '.fh'
        addheaders = AddFullHeadersWrapper2(blastfastafile, tmpfullheadfasta,
                                            blastindexfile)
        addheaders.run()

        report(3, ' >>> Loading sequences.\n')
        with open(tmpfullheadfasta, 'r') as ff:
            allseqs = Fasta.loadSequences(ff)

        report(2, ' >>> Keeping valid sequences.\n')
        tmppat = 'U' if options.keepu else None
        validseqs = getTopSeqs(seqs=allseqs,
                               maxnumseqs=maxnumstartseq,
                               startevalue=options.startexpoeval,
                               pattern=tmppat,
                               verbose=verbosity >= 4)
        # validseqs[0] holds the kept sequences, validseqs[1] the e-value
        # exponent that was reached; both go into the output file name.
        kept = validseqs[0]
        exponent = validseqs[1]
        keptseqs = '.'.join((options.outputfilename,
                             str(exponent),
                             str(len(kept)),
                             'fasta'))
        report(2, ' >>> Found ' + str(len(kept)) +
               ' sequences with evalue <= 1e' + str(exponent) + '\n')
        with open(keptseqs, 'w') as ff:
            kept.save(ff)

    sys.stderr.write('\n')
def main():
    """Blast a query against a database and report its top hits.

    The query is either a GI (``-s``), whose sequence is first fetched from
    the ``-D`` database into the temp folder, or an existing fasta file
    (``-q``). The blast XML output is parsed into an ``evalue header`` index
    file, hits are grouped by e-value, and the headers belonging to the
    ``-n`` best e-values are written to ``-o`` (stdout by default). With
    ``-f`` only headers containing at least one of the filters are kept.

    Exits through ``parser.error`` (status 2) when no option is given or
    when neither ``-s`` nor ``-q`` is provided.
    """
    parser = optparse.OptionParser()
    fetchgroup = optparse.OptionGroup(parser, 'Options to work with a GI')
    blastgroup = optparse.OptionGroup(parser, 'Blast related options')
    fetchgroup.add_option('-s', '--entry', dest='gi_entry',
                          help='GI to check against the database',
                          metavar='GI')
    fetchgroup.add_option('-D', '--database_fetch', dest='dbf',
                          help='location of the database that should be used for fetching the sequence from the gi provided.',
                          metavar='DB')
    blastgroup.add_option('-b', '--blast_flavour', dest='blast_flavour',
                          help='what kind of blast should be performed ?',
                          metavar='BLAST')
    blastgroup.add_option('-d', '--database_check', dest='dbc',
                          help='location of the database that should be used for checking.',
                          metavar='DB')
    blastgroup.add_option('-a', '--ncore', dest='ncore', type='int',
                          help='number of cores to use for the blast.',
                          metavar='INT')
    parser.add_option('-q', '--query', dest='fasta_query',
                      help='query in fasta format',
                      metavar='FILE')
    parser.add_option('-o', '--output', dest='outputfile',
                      help='name of the output file. default is stdout',
                      metavar='FILE')
    parser.add_option('-n', '--num_top_hits', dest='num_top_hits',
                      type='int',
                      help='number of top hits to consider.',
                      metavar='INT')
    parser.add_option('-f', '--filters_file', dest='filters_file',
                      help='location of the file containing filters.',
                      metavar='FILE')
    parser.add_option('-v', '--verbosity', dest='verbosity', action='count',
                      help='set verbosity level')
    parser.add_option('-T', '--temp', dest='temp',
                      help='temporary folder.',
                      metavar='DIR')
    parser.add_option_group(fetchgroup)
    parser.add_option_group(blastgroup)
    # verbosity defaults to 0 so the `count` action never leaves None, which
    # cannot be ordered against an int on Python 3.
    parser.set_defaults(temp='/tmp/',
                        ncore=1,
                        num_top_hits=1,
                        outputfile=sys.stdout,
                        verbosity=0)
    (options, args) = parser.parse_args()
    if len(sys.argv) == 1:
        parser.error('No options specified. check_with_blast.py --help for details.')
    # Without a query there is nothing to blast; fail here instead of hitting
    # an undefined `blastqueryfile` further down.
    if not options.gi_entry and not options.fasta_query:
        parser.error('A query is required: use -s/--entry or -q/--query.')

    if options.verbosity >= 2:
        log_level = logging.DEBUG
    elif options.verbosity == 1:
        log_level = logging.INFO
    else:
        log_level = logging.WARNING
    logging.basicConfig(level=log_level,
                        format='%(levelname)-6s:%(filename)s %(message)s')

    if options.filters_file:
        includefilters = parsefilters(options.filters_file)
    else:
        includefilters = None
    logging.info('Filters : '+str(includefilters))

    fetcher = None
    if options.gi_entry:
        basename = options.gi_entry
        outputentryfa = os.path.join(options.temp, basename + '.fa')
        fetcher = FastaCmdWrapper([options.gi_entry], db=options.dbf,
                                  outfile=outputentryfa)
        blastqueryfile = outputentryfa
    else:
        basename = os.path.basename(options.fasta_query)
        blastqueryfile = options.fasta_query
    outputblast = os.path.join(options.temp, basename + '.xml')
    outputpf = os.path.join(options.temp, basename + '.index')

    blaster = BlastAllWrapper(blastqueryfile, outputblast,
                              flavour=options.blast_flavour,
                              db=options.dbc, gis=True,
                              ncore=options.ncore)
    if fetcher is not None:
        logging.info('Fetching the sequence from local database : '+fetcher.cline)
        fetcher.run()
    logging.info('Running blast : '+blaster.cline)
    blaster.run()

    with open(outputblast, 'r') as iff:
        logging.info('Parsing the xml output -> '+outputpf)
        xmlparser = PsiBlastXMLParser(iff)
        xmlparser.parse()
        xmlparser.extractData(fmt='evalue,header', outfile=outputpf)

    # Group headers by their e-value string (first whitespace-separated
    # field); the remaining fields form the header.
    results = HeadEvalueDict()
    with open(outputpf, 'r') as iff:
        for line in iff:
            fields = line.split()
            if not fields:
                # Blank line: nothing to index.
                continue
            evalue = fields[0]
            header = ' '.join(fields[1:])
            if evalue not in results:
                results[evalue] = [header]
            else:
                results[evalue].append(header)

    # E-values are kept as strings, so order them numerically.
    topindexes = sorted(results.keys(), key=float)
    topseqs = [(e, results[e]) for e in topindexes[:options.num_top_hits]]

    finaloutput = []
    for evalue, headers in topseqs:
        for header in headers:
            if options.filters_file:
                # Emit a header once, even when several filters match it.
                if any(ikw in header for ikw in includefilters):
                    finaloutput.append((evalue, header))
            else:
                finaloutput.append((evalue, header))
    logging.debug(str(finaloutput))

    try:
        if options.outputfile is not sys.stdout:
            off = open(options.outputfile, 'w')
        else:
            off = sys.stdout
        try:
            off.write('# '+str(includefilters)+'\n')
            for oo in finaloutput:
                off.write(oo[0]+' : '+oo[1]+'\n')
        finally:
            # Close only what was opened here; stdout stays open.
            if off is not sys.stdout:
                off.close()
    except EnvironmentError as err:
        # Writing the report is best effort: log the failure, do not crash.
        logging.error('Could not write the output : %s', err)