def spDiff( fileA, fileB ):
    """Compare the 'U' containing sequences of two fasta files.

    Both files are loaded with ``Fasta.loadSequences``.  The sequences
    matching the pattern ``'U'`` are extracted from each of them with
    ``findPattern('U', mode='full')`` and the two results are compared with
    ``symetric_difference(..., method='raw')``.

    fileA, fileB -- paths of the two fasta files to compare.

    Returns the value produced by ``symetric_difference``.
    """
    # Both files are opened before anything is loaded (as before), but the
    # ``with`` blocks guarantee they are closed even if loading fails.
    with open(fileA, 'r') as sfileA:
        with open(fileB, 'r') as sfileB:
            seqsA = Fasta.loadSequences(sfileA)
            seqsB = Fasta.loadSequences(sfileB)
    spA = seqsA.findPattern('U', mode='full')
    spB = seqsB.findPattern('U', mode='full')
    return spA.symetric_difference(spB, method='raw')
def main():
    """Parse a blast XML output and build the initial index / fasta files.

    Steps, in order:

    1. create ``<output>.index.0`` and ``<output>.fasta.0``;
    2. with ``-P``, parse the blast XML file and extract the requested data
       into the index file (optionally filtered with ``-f``);
    3. with ``-U``, keep only one copy of each header in the index file;
    4. collect the second ``|`` separated field of every index line and
       fetch the matching sequences from the local database;
    5. with ``-M``, add the full headers, keep the top sequences returned by
       ``getTopSeqs`` and save them in
       ``<output>.<exponent>.<count>.fasta``.

    Exits through ``parser.error`` when a required option is missing.
    """
    parser = optparse.OptionParser()
    parser.add_option('-i', '--inputfile', dest='inputfilename',
                      help='blast output file, in xml format.',
                      metavar='FILE.xml')
    parser.add_option('-o', '--outputfile', dest='outputfilename',
                      help='base output filename',
                      metavar='FILE')
    parser.add_option('-d', '--db', dest='database',
                      help='database from which the sequences should be fetched.',
                      metavar='FILE')
    parser.add_option('-e', '--evalue', dest='evalue', type='float',
                      help='e-value threshold.',
                      metavar='FLOAT')
    parser.add_option('-E', '--start_expo_evalue', dest='startexpoeval',
                      type='int',
                      help='exponent of the evalue threshold used when refiltering.',
                      metavar='INT')
    parser.add_option('-b', '--blast_version', dest='blastversion',
                      help='set the blast version to use, either `legacy` or `plus`.',
                      metavar='VERSION')
    parser.add_option('-f', '--filter', action='store_true', dest='dofilter',
                      default=False,
                      help='do the filter step.')
    parser.add_option('-p', '--keep_patterns_iff', dest='keeppatiff',
                      help='Keep only if patterns match exactly. The patterns should be coma seperated.',
                      metavar='keyword1:pat1,pat2,pat3,,keyword2:pat1,pat2')
    parser.add_option('-q', '--keep_patterns', dest='keeppat',
                      help='Keep patterns that match exactly, no matter what. The patterns should be coma seperated.',
                      metavar='keyword1:pat1,pat2,pat3,,keyword2:pat1,pat2')
    parser.add_option('-g', '--gis', dest='gis',
                      help='pickle file containing the gis that should match',
                      metavar='FILE')
    parser.add_option('-F', '--format', dest='formatop',
                      help='format of the output. default is `header,evalue`',
                      metavar='INTEGER')
    # The help fragments are joined with explicit spaces; they used to be
    # glued together ("to beprocessed", "orderedby").
    parser.add_option('-M', '--max_num_start_seq', dest='maxnumstartseq',
                      type='int',
                      help='maximum number of sequences in the first alignement to be '
                           'processed. If set, a new input file with the top sequences ordered '
                           'by evalue is created and used.',
                      metavar='INTEGER')
    parser.add_option('-k', '--keep_U', action='store_true', dest='keepu',
                      default=False,
                      help='Should U containing sequences be kept regardless of their evalues ?. '
                           'Use in conjunction of -M')
    parser.add_option('-T', '--temp', dest='temp',
                      help='set the temp folder to use.',
                      metavar='FOLDER')
    parser.add_option('-P', '--parse', dest='parse', action='store_true',
                      default=False,
                      help='do not do extra fancy steps. Just parse the file and return the disired output in a file.')
    parser.add_option('-U', '--uniq', dest='uniq', action='store_true',
                      default=False,
                      help='remove duplicates.')
    parser.add_option('-v', '--verbose', dest='verbosity', type='int',
                      help='verbosity level : 0=none ; 1=standard ; 2=detailed ; 3=full',
                      metavar='INTEGER')
    parser.set_defaults(verbosity=1,
                        database='nr',
                        evalue=10,
                        startexpoeval=-10,
                        keeppat=None,
                        blastversion='legacy',
                        temp='/tmp/',
                        maxnumstartseq=None,
                        formatop='header,evalue')
    (options, args) = parser.parse_args()

    # Every output name derives from -o, and -P reads -i: fail with a usage
    # message instead of a TypeError deep inside the script.
    if not options.outputfilename:
        parser.error('You must provide a base output filename (-o).')
    if options.parse and not options.inputfilename:
        parser.error('You must provide a blast output file (-i) when using -P.')

    verbosity = options.verbosity
    database = options.database
    evalue = options.evalue
    maxnumstartseq = options.maxnumstartseq

    blastindexfile = ''.join((options.outputfilename, '.index.0'))
    blastfastafile = ''.join((options.outputfilename, '.fasta.0'))
    # Same effect as `touch`, without going through a shell (paths containing
    # spaces or shell metacharacters are handled correctly).
    for path in (blastindexfile, blastfastafile):
        with open(path, 'a'):
            os.utime(path, None)

    if options.blastversion == 'legacy':
        fetcher = FastaCmdWrapper(entry=[], db=database,
                                  outfile=blastfastafile)
    else:
        fetcher = BlastDbCmdWrapper(entry=[], db=database,
                                    outfile=blastfastafile)

    ## Parse the blast output file.
    if options.parse:
        if verbosity >= 1:
            sys.stderr.write('\n')
            sys.stderr.write('>>> Parsing blast output : ' +
                             options.inputfilename + '\n')
        with open(options.inputfilename, 'r') as infile:
            blastparser = PsiBlastXMLParser(infile)
            blastparser.parse()
        if verbosity >= 2:
            sys.stderr.write(' >>> Extracting required data.\n')
        # The extracted data is written to the index file through `outfile`;
        # the return value is not used here.
        if options.dofilter:
            blastparser.extractData(
                evalue=evalue,
                fmt=options.formatop,
                outfile=blastindexfile,
                includepatternsiff=fmtOptPat(options.keeppatiff),
                includepatterns=fmtOptPat(options.keeppat),
                excludepatterns=({'title': ['hypothetical', 'predicted',
                                            'PREDICTED']}))
        else:
            blastparser.extractData(evalue=evalue,
                                    fmt=options.formatop,
                                    outfile=blastindexfile)

    ## Only keep one copy of a header, the one with the best evalue.
    if options.uniq:
        if verbosity >= 1:
            sys.stderr.write('\n')
            sys.stderr.write('>>> Keeping only best evalues.\n')
        uniq(blastindexfile)

    ## Gather all GIs in list
    if verbosity >= 2:
        sys.stderr.write('\n')
        sys.stderr.write('>>> Gathering all Gis.\n')
    entries = []
    with open(blastindexfile, 'r') as bif:
        for line in bif:
            # The identifier is the second '|' separated field of the line.
            entries.append(line.split('|')[1])
    fetcher.entry = entries

    ## Fetch the sequences from the local databases.
    ## TODO : Fetch failed from the web.
    if verbosity >= 1:
        sys.stderr.write('\n')
        sys.stderr.write('>>> Building fasta.0 file by fetching sequences from local database.\n')
    fetcher.run()

    ## Apply final filters : keep only top evalues and U containing until a
    ## threshold is reached
    if maxnumstartseq:
        if verbosity >= 1:
            sys.stderr.write('\n')
            sys.stderr.write('>>> Applying final filters on ' +
                             blastfastafile + '.\n')
        if verbosity >= 2:
            sys.stderr.write(' >>> Adding evalue to headers.\n')
        ### TODO : use .fasta.fh in tmp dir.
        tmpfullheadfasta = blastfastafile + '.fh'
        addheaders = AddFullHeadersWrapper2(blastfastafile, tmpfullheadfasta,
                                            blastindexfile)
        addheaders.run()
        if verbosity >= 3:
            sys.stderr.write(' >>> Loading sequences.\n')
        with open(tmpfullheadfasta, 'r') as ff:
            allseqs = Fasta.loadSequences(ff)
        if verbosity >= 2:
            sys.stderr.write(' >>> Keeping valid sequences.\n')
        tmppat = None
        if options.keepu:
            tmppat = 'U'
        validseqs = getTopSeqs(seqs=allseqs,
                               maxnumseqs=maxnumstartseq,
                               startevalue=options.startexpoeval,
                               pattern=tmppat,
                               verbose=verbosity >= 4)
        # validseqs[0] holds the kept sequences, validseqs[1] the exponent of
        # the evalue threshold that was reached (see the message below).
        topseqs = validseqs[0]
        expo = validseqs[1]
        keptseqs = '.'.join((options.outputfilename,
                             str(expo),
                             str(len(topseqs)),
                             'fasta'))
        if verbosity >= 2:
            sys.stderr.write(' >>> Found ' + str(len(topseqs)) +
                             ' sequences with evalue <= 1e' +
                             str(expo) + '\n')
        with open(keptseqs, 'w') as ff:
            topseqs.save(ff)
    sys.stderr.write('\n')
def main():
    """Extract the selenoproteins found in one or several fasta files.

    The input filenames are given as a coma separated list with ``-i``.  For
    each file the sequences are loaded with ``FastaLib.loadSequences``,
    filtered with ``findSelenoproteins`` and saved with
    ``FastaLib.saveSequences`` in the output file (``-o``) or on the standard
    output when no output file is given.

    Exits with a message when no input filename is provided.
    """
    parser = optparse.OptionParser()
    parser.add_option('-i', '--inputfile', dest='inputfilename',
                      help='fasta file in which selenoproteins should be looked for.',
                      metavar='FILE')
    parser.add_option('-o', '--outputfile', dest='outputfilename',
                      help='fasta file containing the selenoproteins',
                      metavar='FILE')
    # Let optparse validate the integer so that a bad value produces a usage
    # error instead of a ValueError traceback.
    parser.add_option('-v', '--verbose', dest='verbosity', type='int',
                      help='verbosity level : 0=none ; 1=standard ; 2=detailed ; 3=full',
                      metavar='INTEGER')
    parser.set_defaults(verbosity=1)
    (options, args) = parser.parse_args()
    verbosity = options.verbosity

    if not options.inputfilename:
        sys.exit('You must provide an input filename.')

    # The found sequences are echoed on stdout (verbosity >= 3) only when the
    # real output goes to a file; otherwise they would be printed twice.
    echo_to_stdout = bool(options.outputfilename)

    infiles = []
    outfile = sys.stdout
    try:
        # All inputs are opened up front so that a missing file is reported
        # before the output file is created.
        for name in options.inputfilename.split(','):
            infiles.append(open(name, 'r'))
        if echo_to_stdout:
            outfile = open(options.outputfilename, 'w')

        for f in infiles:
            if verbosity >= 1:
                print('')
                print('>>> Searching for selenoproteins in file ' + f.name)
                print('')
            if verbosity >= 2:
                print('>>> Loading sequences ...')
            sequences = FastaLib.loadSequences(f)
            if verbosity >= 2:
                print('>>> ... Done.')
                print('')
            if verbosity >= 2:
                print('>>> Searching for U containing sequences ...')
            selenoproteins = findSelenoproteins(sequences)
            if verbosity >= 2:
                print('>>> ... Done.')
                print('')
            FastaLib.saveSequences(selenoproteins, outfile)
            if verbosity >= 3 and echo_to_stdout:
                for selP in selenoproteins:
                    print(selP.header.strip())
                    print(selP.sequence.strip())
            if verbosity >= 1:
                print('')
                print('Found ' + str(len(selenoproteins)) + ' selenoproteins')
                print('')
    finally:
        for handle in infiles:
            handle.close()
        # Never close sys.stdout: it does not belong to this function.
        if outfile is not sys.stdout:
            outfile.close()
def main():
    """Run the alignment preparation pipeline on a fasta file.

    The input sequences are first written, without gaps, in a temporary
    file.  The following optional steps are then chained, each one taking
    the output of the previous enabled step as its input:

    filter (-F), mafft (-M), trimal (-T, only when the mafft output holds
    more than 200 sequences), gap removal (only when both -T and -C are set),
    t_coffee (-C), full headers (-B) and prepare_alignment_selenoprofiles
    (-P).  ``-A`` enables the header, filter, mafft, trimal and t_coffee
    steps.  With ``-Y`` the command lines are printed instead of being run.

    The temporary ungapped file is removed at the end unless ``-D`` is set.
    Exits through ``parser.error`` when -i or -o is missing.
    """
    parser = optparse.OptionParser()
    parser.add_option('-i', '--inputfile', dest='inputfilename',
                      help='file containing the alignments that will be used to build the PSSM using prepare_alignment_selenoprofiles.py.',
                      metavar='FILE')
    parser.add_option('-r', '--datadir', dest='datadir',
                      help='directory containing, for each familly FAM, a directory FAM.blast and a directory FAM.selenoprofiles.prep',
                      metavar='DIR')
    parser.add_option('-o', '--outputfile', dest='outputfilename',
                      help='base name used for outputs',
                      metavar='NAME')
    parser.add_option('-a', '--n_core', dest='ncore', type='int',
                      help='number of cores to use during the various operations.',
                      metavar='INTEGER')
    parser.add_option('-M', '--mafft', action='store_true', dest='domafft',
                      default=False,
                      help='do the mafft step.')
    parser.add_option('-T', '--trimal', action='store_true', dest='dotrimal',
                      default=False,
                      help='do the trimal step.')
    parser.add_option('-C', '--tcoffee', action='store_true',
                      dest='dotcoffee', default=False,
                      help='do the t_coffee step.')
    parser.add_option('-B', '--headers', action='store_true',
                      dest='doheaders', default=False,
                      help='do the addheaders step.')
    parser.add_option('-p', '--patternfile', dest='patternfile',
                      help='pattern file to use if the -D option is used.',
                      metavar='FILE')
    parser.add_option('-F', '--filter', action='store_true', dest='dofilter',
                      default=False,
                      help='do the filter step.')
    parser.add_option('-P', '--prepare', action='store_true', dest='doprepal',
                      default=False,
                      help='do the prepare_alignment_selenoprofiles step.')
    parser.add_option('-g', '--tag_threshold', dest='tagthreshold',
                      type='float',
                      help='tag threshold to use if the -P or --prepare is used.',
                      metavar='FLOAT')
    parser.add_option('-A', '--all', action='store_true', dest='doall',
                      default=False,
                      help='do all steps.')
    parser.add_option('-Y', '--dry', action='store_true', dest='dryrun',
                      default=False,
                      help="Prints the commands without executing them.")
    parser.add_option('-D', '--debug', action='store_true', dest='debug',
                      default=False,
                      help="Debug mode. Nothing is cleaned.")
    parser.add_option('-t', '--temp', dest='temp',
                      help='set the temp folder to use.',
                      metavar='FOLDER')
    parser.add_option('-v', '--verbose', dest='verbosity', type='int',
                      help='verbosity level : 0=none ; 1=standard ; 2=detailed ; 3=full',
                      metavar='INTEGER')
    parser.set_defaults(verbosity=1,
                        ncore=1,
                        tagthreshold=0.5,
                        temp='/tmp/',
                        patternfile='None')
    (options, args) = parser.parse_args()

    # Both names are used unconditionally below; report their absence as a
    # usage error rather than a TypeError.
    if not (options.inputfilename and options.outputfilename):
        parser.error('You must provide an input file (-i) and a base output name (-o).')

    if options.doall:
        options.doheaders = True
        options.dofilter = True
        options.domafft = True
        options.dotrimal = True
        options.dotcoffee = True

    ncore = options.ncore
    verbosity = options.verbosity
    temp = options.temp

    # Work on a gap-free copy of the input.
    tmpinitfilename = genTempfilename(temp, 'ungapped_')
    with open(options.inputfilename, 'r') as iff:
        tmpseqs = Fasta.loadSequences(iff)
    with open(tmpinitfilename, 'w') as ugf:
        for seq in tmpseqs:
            removeGaps(seq).prints(ugf)
    # tmpinfile always names the file the next step must read.
    tmpinfile = tmpinitfilename

    mafftoutfile = ''.join((options.outputfilename, '_mafft.fasta'))
    trimaloutfile1 = ''.join((options.outputfilename, '_trimmed_native.fasta'))
    trimaloutfile2 = ''.join((options.outputfilename, '_trimmed_spadded.fasta'))
    tcoffeeoutfile = ''.join((options.outputfilename, '_tcoffee.fasta'))
    fullheadoutfile = ''.join((options.outputfilename, '.det.fasta'))
    patternfile = options.patternfile
    filteroutfile = ''.join((options.outputfilename, '.filt.fasta'))

    addheaders = UtilityWrappers.AddFullHeadersWrapper2(tmpinfile,
                                                        fullheadoutfile,
                                                        patternfile)
    filterseqs = UtilityWrappers.FilterWrapper(tmpinfile, filteroutfile,
                                               inverse=True,
                                               titlematch=('PREDICTED',
                                                           'predicted',
                                                           'hypothetical'))
    mafft = UtilityWrappers.MafftWrapper(tmpinfile, mafftoutfile, auto=True)
    trimal = UtilityWrappers.TrimalWrapper(tmpinfile, trimaloutfile1,
                                           clusters=100)
    tcoffee = UtilityWrappers.TcoffeeWrapper(trimaloutfile2, tcoffeeoutfile,
                                             ncore=ncore)
    prepsp = UtilityWrappers.SelenoprofilesPreWrapper(
        tmpinfile, options.outputfilename, all=True,
        tagthreshold=options.tagthreshold, temp=temp)

    def run_step(wrapper, label):
        """Print the command of *wrapper* (dry run) or announce and run it."""
        if options.dryrun:
            print(wrapper.cline)
        else:
            if verbosity >= 1:
                sys.stderr.write('\n >>> ' + label + '\n\n')
            wrapper.run()

    try:
        if options.dryrun:
            print('\nThis is a dry run. Relaunch the command without the option -Y to do the actual stuff.\n')

        ## Filter out the 'fake' proteins
        if options.dofilter:
            time.sleep(0.5)
            filterseqs.infile = tmpinfile
            tmpinfile = filteroutfile
            run_step(filterseqs, 'Filtering out')

        ## run mafft
        numseqinmafftoutput = 0
        if options.domafft:
            time.sleep(0.5)
            mafft.infile = tmpinfile
            tmpinfile = mafftoutfile
            run_step(mafft, 'Running Mafft')
            if not options.dryrun:
                # The size of the alignment decides whether trimal is run.
                with open(mafftoutfile, 'r') as mfo:
                    numseqinmafftoutput = len(Fasta.loadSequences(mfo))

        ## run trimal
        if options.dotrimal and numseqinmafftoutput > 200:
            time.sleep(0.5)
            trimal.infile = tmpinfile
            tmpinfile = trimaloutfile1
            run_step(trimal, 'Running Trimal')

        if not options.dryrun and options.dotcoffee and options.dotrimal:
            if verbosity >= 1:
                sys.stderr.write('\n >>> Removing gaps\n\n')
            gappedfile = tmpinfile
            tmpinfile = trimaloutfile2
            # The input is opened before the output so that a missing input
            # is reported before the output file is created; both handles
            # are closed even if a step below fails.
            with open(gappedfile, 'r') as ti:
                with open(tmpinfile, 'w') as to:
                    si = Fasta.loadSequences(ti)
                    ## saves the sequences with no gaps
                    refs = Fasta.SequenceList()
                    for s in si:
                        refs.append(removeGaps(s))
                    Fasta.saveSequences(refs, to)
                    # Only meaningful when trimal actually ran (same
                    # threshold as the trimal step above).
                    if numseqinmafftoutput > 200:
                        if verbosity >= 1:
                            sys.stderr.write('\n >>> Adding ommited selenoproteins\n')
                        ## Gather the non intersecting proteins from the 2 files
                        diffSelenoproteins = spDiff(mafftoutfile,
                                                    trimaloutfile1)
                        ## remove gaps from selenoproteins
                        spDiffr = Fasta.SequenceList()
                        for s in diffSelenoproteins:
                            spDiffr.append(removeGaps(s))
                        ## append to the file the selenoproteins that were
                        ## not present
                        Fasta.saveSequences(spDiffr, to)

        ## run t_coffee
        if options.dotcoffee:
            time.sleep(0.5)
            tcoffee.infile = tmpinfile
            tmpinfile = tcoffeeoutfile
            run_step(tcoffee, 'Running T_coffee')

        ## Add full headers
        if options.doheaders:
            time.sleep(0.5)
            addheaders.infile = tmpinfile
            tmpinfile = fullheadoutfile
            run_step(addheaders, 'Adding headers')

        ## prepare alignments for selenoprofiles
        if options.doprepal:
            time.sleep(0.5)
            prepsp.infile = tmpinfile
            run_step(prepsp, 'preparing for selenoprofiles')
    except KeyboardInterrupt:
        sys.exit('manual exit.')
    finally:
        if not options.debug:
            if verbosity >= 2:
                sys.stderr.write('\n >>> Removing temporary file ' +
                                 tmpinitfilename + '\n\n')
            os.remove(tmpinitfilename)
def main():
    """Replace the incomplete headers of a fasta file with complete ones.

    The sequences are read from the input file (``-i``) and the complete
    headers are the lines starting with ``>`` in the pattern file (``-p``).

    With the ``gi`` method (default) each sequence receives the first
    pattern header holding the same ``gi|<digits>|`` identifier; headers of
    sequences without any match are written on stderr.  With the ``inplace``
    method the n-th sequence receives the n-th pattern header, and both files
    must hold the same number of entries.

    The result is written in the output file (``-o``) or on the standard
    output.  Exits through ``parser.error`` on missing files or unknown
    method.
    """
    parser = optparse.OptionParser()
    parser.add_option('-i', '--inputfile', dest='inputfilename',
                      help='file with incomplete headers.',
                      metavar='FILE')
    parser.add_option('-o', '--outputfile', dest='outputfilename',
                      help='outputfile.',
                      metavar='FILE')
    parser.add_option('-p', '--pattern', dest='patternfilename',
                      help='pattern file containing the complete headers.',
                      metavar='FILE')
    # The fragments are joined with explicit spaces; they used to be glued
    # together ("headers.gi", "thatheader").
    parser.add_option('-m', '--method', dest='method',
                      help='Method to use when filling the headers. '
                           'gi means match will be done by gi. inplace means that '
                           'header substitution is made by following the order.',
                      metavar='{gi}|inplace')
    parser.set_defaults(outputfilename=None,
                        method='gi')
    (options, args) = parser.parse_args()

    if not (options.inputfilename and options.patternfilename):
        parser.error('You have to provide two files, check help.')
    # Validate before the output file is opened, so that a typo in the
    # method name does not truncate an existing file.
    if options.method not in ('gi', 'inplace'):
        parser.error('Wrong method')

    with open(options.inputfilename, 'r') as iff:
        inlines = Fasta.loadSequences(iff)
    with open(options.patternfilename, 'r') as pff:
        patlines = [line for line in pff if line.startswith('>')]

    if options.method == 'inplace' and len(inlines) != len(patlines):
        raise Exception('Different number of sequences')

    if options.outputfilename:
        outfile = open(options.outputfilename, 'w')
    else:
        outfile = sys.stdout

    try:
        if options.method == 'gi':
            GI_REGEX = re.compile(r'gi\|(\d+)\|')
            for iseq in inlines:
                nofound = True
                for phead in patlines:
                    try:
                        giq = GI_REGEX.search(iseq.header).group(1)
                        gis = GI_REGEX.search(phead).group(1)
                        if giq == gis:
                            tmpseq = Fasta.Sequence(phead, iseq.sequence)
                            tmpseq.prints(outfile)
                            nofound = False
                            break
                    except AttributeError:
                        # search() returned None: one of the two headers
                        # holds no gi.
                        sys.stderr.write(iseq.header + ' ' + phead)
                        sys.exit(-1)
                    except IndexError:
                        # Report the two headers being compared (the handler
                        # used to reference undefined names).
                        sys.stderr.write('\nError while processing the files:\n')
                        sys.stderr.write(phead + '\n')
                        sys.stderr.write(iseq.header + '\n')
                        break
                if nofound:
                    sys.stderr.write('\n' + iseq.header + '\n')
        else:
            for seq, pat in zip(inlines, patlines):
                Fasta.Sequence(pat, seq.sequence).prints(outfile, 60)
    finally:
        # Never close sys.stdout: it does not belong to this function.
        if outfile is not sys.stdout:
            outfile.close()
def main():
    """Interactively sort the sequences of a fasta file into kept / thrown.

    Sequences whose header contains one of the ``-f`` patterns are kept and
    those containing one of the ``-F`` patterns are thrown without asking.
    Every other sequence is shown to the user, who answers with one key:

    y / Enter -- keep the sequence;
    n         -- throw the sequence;
    b         -- go back to the previous sequence and undo its decision;
    s         -- display the sequence with the external ``fetch_seq.g``;
    d         -- print details from the alignment given with ``-a``;
    q         -- quit, with or without saving.

    The kept sequences are finally written in the output file (``-o``).
    Exits through ``parser.error`` when -i or -o is missing.
    """
    # Function-scope import: only the 's' command needs it.
    import subprocess

    # raw_input only exists in Python 2.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    parser = optparse.OptionParser()
    parser.add_option('-i', '--inputfile', dest='inputfilename',
                      help='fasta file in which selenoproteins should be looked for.',
                      metavar='FILE')
    parser.add_option('-a', '--alignmentfile', dest='alfilename',
                      help='alignment file used when details are requested.',
                      metavar='FILE')
    parser.add_option('-o', '--outputfile', dest='outputfilename',
                      help='base output filename',
                      metavar='FILE')
    parser.add_option('-f', '--keep_prefilter', dest='keepprefilter',
                      help='prefilters all sequences that have the given pattern in their name and keep them.',
                      metavar='PATTERN')
    parser.add_option('-F', '--throw_prefilter', dest='throwprefilter',
                      help='prefilters all sequences that have the given pattern in their name and throw them.',
                      metavar='PATTERN')
    parser.add_option('-b', '--autothrow_abscents', action='store_true',
                      dest='atabscent', default=False,
                      help='Throw all sequences not present in the alignment provided.')
    parser.set_defaults(keepprefilter=False,
                        throwprefilter=False,
                        alfilename=False)
    (options, args) = parser.parse_args()

    # Check the output name now: discovering that it is missing after the
    # whole manual session would lose all the work.
    if not (options.inputfilename and options.outputfilename):
        parser.error('You must provide an input file (-i) and an output file (-o).')

    with open(options.inputfilename, 'r') as inf:
        sequences = Fasta.loadSequences(inf)

    alignment = None
    rdetail = None
    if options.alfilename:
        with open(options.alfilename, 'r') as alf:
            alignment = Fasta.Alignment(Fasta.loadSequences(alf))
        # NOTE(review): nrdetail is never read afterwards; the call is kept
        # in case findPositions has side effects -- confirm and remove.
        nrdetail = alignment.findPositions(('U', 'C', '-'), False)
        rdetail = alignment.findPositions(('U', 'C', '-'), True)

    kpatterns = []
    if options.keepprefilter:
        kpatterns = options.keepprefilter.split(',')
    tpatterns = []
    if options.throwprefilter:
        tpatterns = options.throwprefilter.split(',')

    kept_seq = Fasta.SequenceList()
    thrown_seq = Fasta.SequenceList()
    man_check_list = Fasta.SequenceList()
    for seq in sequences:
        # any() adds the sequence once, even if several patterns match (it
        # used to be appended, hence written, once per matching pattern).
        kept = any(pattern in seq.header for pattern in kpatterns)
        thrown = any(pattern in seq.header for pattern in tpatterns)
        if kept:
            kept_seq.append(seq)
        if thrown:
            thrown_seq.append(seq)
        if not kept and not thrown:
            man_check_list.append(seq)

    idx = 0
    quit_requested = False
    while idx < len(man_check_list) and not quit_requested:
        seq = man_check_list[idx]
        decided = False
        print(seq.header)
        while not decided:
            print('%s %s' % (len(kept_seq), len(thrown_seq)))
            choice = getch('# ' + str(idx + 1) + ' / ' +
                           str(len(man_check_list)) + ' -- Keep ? [Y/n]')
            if choice == 'b':
                if idx > 0:
                    idx -= 1
                seq = man_check_list[idx]
                print(seq.header)
                # Undo the previous decision, whichever it was.  The sequence
                # is in at most one of the two lists, so a failing remove()
                # is expected; KeyboardInterrupt / SystemExit still
                # propagate.
                for decided_list in (thrown_seq, kept_seq):
                    try:
                        decided_list.remove(seq)
                    except Exception:
                        pass
            elif choice in ('y', '\n'):
                kept_seq.append(seq)
                decided = True
                idx += 1
            elif choice == 'n':
                thrown_seq.append(seq)
                decided = True
                idx += 1
            elif choice == 's':
                # The gi is the second '|' separated field of the header.
                fields = seq.header.split('|')
                if len(fields) > 1:
                    # Argument list instead of a shell string: the header
                    # text and the filename are never interpreted by a shell.
                    try:
                        subprocess.call(['fetch_seq.g',
                                         '-v', 'TITLE=' + fields[1],
                                         '-v', 'ALL=1',
                                         options.inputfilename])
                    except OSError as err:
                        sys.stderr.write('Could not run fetch_seq.g: ' +
                                         str(err) + '\n')
                else:
                    sys.stderr.write('No gi found in this header.\n')
            elif choice == 'd' and options.alfilename:
                _print_alignment_detail(rdetail, alignment, seq)
            elif choice == 'q':
                cc = 'r'
                while cc not in ('y', 'n'):
                    cc = read_line('Manual quit. Would you like to save your changes ? [y/N]')
                if cc == 'n':
                    sys.exit('Quiting without saving.')
                # 'y': leave both loops so that the kept sequences are saved
                # below (this answer used to be ignored and the question for
                # the current sequence was simply asked again).
                quit_requested = True
                break
            else:
                print('Wrong command')

    with open(options.outputfilename, 'w') as of:
        kept_seq.prints(of, 80)


def _print_alignment_detail(rdetail, alignment, seq):
    """Print what the alignment holds at the 'U' positions, for *seq*.

    rdetail   -- result of ``alignment.findPositions(('U','C','-'), True)``;
                 it is indexed by symbol, then by position tuple.
    alignment -- iterable of aligned sequences.
    seq       -- the sequence currently examined; it is looked up in the
                 alignment by header.
    """
    print('')
    print('General Detail :')
    for pos in rdetail['U']:
        sys.stdout.write(' ' + str(pos) + ' ')
        for xpos in rdetail:
            try:
                sys.stdout.write(str(xpos) + ': ')
                sys.stdout.write(str(len(rdetail[xpos][pos])) + ' ; ')
            except KeyError:
                # This symbol never occurs at that position.
                sys.stdout.write('0 ; ')
        sys.stdout.write('\n')
    print('')

    tmpseq = None
    for seqal in alignment:
        if seqal.header == seq.header:
            tmpseq = seqal
    if tmpseq:
        tmppos = [i for i, x in enumerate(tmpseq.sequence) if x == 'U']
        print('In the sequence provided :')
        print(' U : %s' % (tmppos,))
        print(' U in those positions : %s'
              % ([len(rdetail['U'][(l,)]) for l in tmppos],))
        print(' C in those positions: %s'
              % ([len(rdetail['C'][(l,)]) for l in tmppos],))
        print(' - in those positions: %s'
              % ([len(rdetail['-'][(l,)]) for l in tmppos],))
        print('')
        print(' Symbols present at the positions of each U :')
        for pos in [p for p in rdetail['U'] if p != ()]:
            spos = str(pos[0])
            print(' Position : %s --- %s'
                  % (spos, tmpseq.sequence[int(spos)]))
    else:
        print('Not present in the alignment provided')
    print('')