def analyzeMali(mali, options, prefix_row=""):
    """Compute and write occupancy statistics for a multiple alignment.

    Writes a tab-separated row with mean/median non-gap character counts
    per column and per row (absolute and as percent of alignment size)
    to options.stdout, prefixed with *prefix_row*.

    Returns True if a row was written, False if there was no data.
    Raises ValueError on an empty alignment.
    """
    if len(mali) == 0:
        # fix: was `raise "..."` -- string exceptions are a TypeError in Python 3
        raise ValueError("not analyzing empty multiple alignment")

    # per-row and per-column character statistics.
    # fix: materialize as lists -- map() returns an iterator in Python 3,
    # which breaks the len() checks below.
    row_data = [Mali.MaliData(x.mString, options.gap_chars, options.mask_chars)
                for x in mali.values()]
    col_data = [Mali.MaliData(x, options.gap_chars, options.mask_chars)
                for x in mali.getColumns()]

    if len(row_data) == 0 or len(col_data) == 0:
        return False

    if options.loglevel >= 2:
        for row in row_data:
            options.stdlog.write("# row: %s\n" % str(row))
        for col in col_data:
            options.stdlog.write("# col: %s\n" % str(col))

    options.stdout.write(prefix_row)

    # calculate average column occupancy
    col_counts = [x.mNChars for x in col_data]
    col_mean = scipy.mean(col_counts)
    col_median = scipy.median(col_counts)
    length = mali.getLength()

    # print the median as an integer when it is a whole number
    if float(int(col_median)) == col_median:
        options.stdout.write("%5.2f\t%5.2f\t%i\t%5.2f" % (
            col_mean, 100.0 * col_mean / length,
            col_median, 100.0 * col_median / length))
    else:
        options.stdout.write("%5.2f\t%5.2f\t%5.1f\t%5.2f" % (
            col_mean, 100.0 * col_mean / length,
            col_median, 100.0 * col_median / length))

    # calculate average row occupancy
    row_counts = [x.mNChars for x in row_data]
    row_mean = scipy.mean(row_counts)
    row_median = scipy.median(row_counts)
    width = mali.getWidth()

    if float(int(row_median)) == row_median:
        options.stdout.write("\t%5.2f\t%5.2f\t%i\t%5.2f" % (
            row_mean, 100.0 * row_mean / width,
            row_median, 100.0 * row_median / width))
    else:
        options.stdout.write("\t%5.2f\t%5.2f\t%5.1f\t%5.2f" % (
            row_mean, 100.0 * row_mean / width,
            row_median, 100.0 * row_median / width))

    options.stdout.write("\n")
    return True
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: malis2profiles.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.set_defaults()

    (options, args) = E.Start(parser)

    current = Mali.SequenceCollection()
    previous_id = None
    ninput, noutput, nskipped = 0, 0, 0

    def flush(collection, name):
        # write one completed profile block to stdout
        collection.setName(name)
        collection.writeToFile(sys.stdout, format="profile")

    for line in sys.stdin:
        if line.startswith("#"):
            continue

        start, ali, end, id = line[:-1].split("\t")
        ninput += 1

        # a new identifier starts a new collection; flush the previous one
        if id != previous_id:
            if previous_id:
                flush(current, previous_id)
                noutput += 1
            current = Mali.SequenceCollection()
            previous_id = id

        current.addSequence(id, start, end, ali)

    # flush the final collection
    if previous_id:
        flush(current, previous_id)
        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i.\n" %
                             (ninput, noutput, nskipped))

    E.Stop()
def runXrate(mali, has_non_overlaps, pairs, map_old2new, options):
    """run xrate on a multiple alignment.

    For every index pair in *pairs*, build a two-sequence alignment from
    *mali* and run the configured xrate model on it. Pairs with fewer
    than options.min_overlap aligned residues are skipped.
    """
    ids = mali.getIdentifiers()
    xgram = XGram.XGram()
    if options.xrate_min_increment:
        xgram.setMinIncrement(options.xrate_min_increment)

    ninput, noutput, nskipped = 0, 0, 0
    # fix: ntotal and tstart were used in the progress report below
    # but never defined
    ntotal = len(pairs)
    tstart = time.time()

    # do pairwise run
    for x, y in pairs:
        m1 = mali.getSequence(ids[x])
        ninput += 1
        temp_mali = Mali.Mali()
        m2 = mali.getSequence(ids[y])

        temp_mali.addSequence(m1.mId, m1.mFrom, m1.mTo, m1.mString)
        temp_mali.addSequence(m2.mId, m2.mFrom, m2.mTo, m2.mString)

        # remove empty columns and masked columns
        if options.clean_mali:
            temp_mali.mGapChars = temp_mali.mGapChars + ("n", "N")
            temp_mali.removeGaps(minimum_gaps=1, frame=3)

        if temp_mali.getWidth() < options.min_overlap:
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# pair %s-%s: not computed because only %i residues overlap\n"
                    % (mali.getEntry(ids[x]).mId,
                       mali.getEntry(ids[y]).mId,
                       temp_mali.getWidth()))
            nskipped += 1
            continue

        if options.xrate_model in ("sn", ):
            runXrateSN(xgram, temp_mali, options)
        # fix: was `in ("akaksgc")` -- parentheses without a comma make a
        # plain string, so the test did substring matching; use a tuple
        elif options.xrate_model in ("akaksgc", ):
            runXrateAKaKsGc(xgram, temp_mali, options)
        else:
            runXrateF3X4(xgram, temp_mali, options)

        if options.loglevel >= 1 and ninput % options.report_step == 0:
            options.stdlog.write(
                "# pairwise computation: %i/%i -> %i%% in %i seconds.\n" %
                (ninput, ntotal, 100.0 * ninput / ntotal,
                 time.time() - tstart))
            options.stdlog.flush()

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# pairwise computation: ninput=%i, noutput=%i, nskipped=%i\n" %
            (ninput, noutput, nskipped))
        options.stdlog.flush()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: jalview.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("list2annotation", ),
                      help="methods.")
    parser.add_option("--filename-mali", dest="filename_mali", type="string",
                      help="filename with multiple alignment used for calculating sites - used for filtering")
    parser.add_option("--jalview-title", dest="jalview_title", type="string",
                      help="title for jalview annotation.")

    parser.set_defaults(
        method=None,
        jalview_symbol="*",
        jalview_title="anno",
        filename_mali=None,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if not options.filename_mali:
        # fix: was `raise "..."` -- string exceptions are a TypeError in Python 3
        raise ValueError("please specify a multiple alignment.")

    mali = Mali.Mali()
    mali.readFromFile(open(options.filename_mali, "r"))

    if options.method == "list2annotation":

        options.stdout.write("JALVIEW_ANNOTATION\n")
        options.stdout.write("# Created: %s\n\n" %
                             (time.asctime(time.localtime(time.time()))))

        codes = [""] * mali.getWidth()

        first = True
        for line in sys.stdin:
            if line[0] == "#":
                continue
            # skip the header line of the input table
            if first:
                first = False
                continue
            # first column is a one-based alignment position
            position = int(line[:-1].split("\t")[0])
            codes[position - 1] = options.jalview_symbol

        options.stdout.write("NO_GRAPH\t%s\t%s\n" %
                             (options.jalview_title, "|".join(codes)))

    E.Stop()
def loadPair(self, seq1, seq2):
    """Run baseml on the two given sequences and store the result.

    On a WrapperCodeML.UsageError the result is recorded as None.
    """
    pair_mali = Mali.Mali()
    for name, sequence in (("seq1", seq1), ("seq2", seq2)):
        pair_mali.addSequence(name, 0, len(sequence), sequence)

    try:
        self.mResult = self.mBaseml.Run(pair_mali,
                                        tree="(seq1,seq2);",
                                        dump=self.mDump,
                                        test=self.mTest)
    except WrapperCodeML.UsageError:
        self.mResult = None
def getMali(mali, columns, block_size=1):
    """Return a new alignment restricted to the given columns.

    Each entry in *columns* selects a block of *block_size* characters
    starting at column * block_size in every sequence.
    """
    new_mali = Mali.Mali()

    for id, val in mali.items():
        sequence = val.mString
        # concatenate the selected blocks in column order
        new_sequence = "".join(
            sequence[c * block_size:(c + 1) * block_size] for c in columns)
        new_mali.addSequence(id, 0,
                             mali.countCharacters(new_sequence),
                             new_sequence)

    return new_mali
def Run(self, mali, tree=None, dump=0, test=False, options=None):
    """Run the external alignment executable on *mali* and return the result.

    The alignment is written in fasta format to a fresh temporary
    directory, the executable is run there, and its fasta output is read
    back into a new Mali object. The temporary directory is kept when
    *test* is set, and also on error (the error message names it);
    otherwise it is removed.

    Raises UsageError if the executable exits with a non-zero status.

    Note: *tree* and *options* are accepted for interface compatibility
    but are not used by this implementation.
    """
    # fix: the default for *options* was a mutable dict ({}); use None
    # instead (the argument is unused in this body).
    self.mTempdir = tempfile.mkdtemp()
    self.mFilenameInput = "input"
    self.mFilenameOutput = "output"

    if test:
        print("# temporary directory is %s" % self.mTempdir)

    # fix: close files deterministically via context managers
    with open(self.mTempdir + "/" + self.mFilenameInput, "w") as outfile:
        mali.writeToFile(outfile, format="fasta")

    statement = " ".join((self.mExecutable,
                          "-in %s" % self.mFilenameInput,
                          "-out %s" % self.mFilenameOutput))

    s = subprocess.Popen(statement,
                         shell=True,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         cwd=self.mTempdir,
                         close_fds=True)

    (out, err) = s.communicate()

    if s.returncode != 0:
        # keep the temporary directory for post-mortem inspection
        raise UsageError(
            "Error in running %s \n%s\n%s\nTemporary directory in %s" %
            (self.mExecutable, err, out, self.mTempdir))

    if dump:
        print("# stdout output of %s:\n%s\n######################################" %
              (self.mExecutable, out))

    result = Mali.Mali()
    with open("%s/%s" % (self.mTempdir, self.mFilenameOutput), "r") as infile:
        result.readFromFile(infile, format="fasta")

    if not test:
        shutil.rmtree(self.mTempdir)

    return result
def filterMali(mali, method="3rd"):
    """build a new multiple alignment based on a filter.

    valid methods are

    3rd:        only third codon positions
    4d:         only four-fold degenerate sites

    The alignment is modified in place via mali.takeColumns().
    Raises ValueError for an unknown method.
    """
    if method not in ("3rd", "4d"):
        # fix: was `raise "..."` -- string exceptions are a TypeError in Python 3
        raise ValueError("unknown method %s" % method)

    if method == "3rd":
        # every third column (0-based index 2, 5, 8, ...)
        columns = list(range(2, mali.getWidth(), 3))

    elif method == "4d":
        # translate codons to amino acids
        trans_mali = Mali.Mali()
        for id, seq in mali.items():
            s = []
            sequence = seq.mString
            l = len(sequence)
            for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                aa = Genomics.MapCodon2AA(codon)
                s.append(aa)

            trans_mali.addSequence(id, 0, l, "".join(s))

        # get four-fold (or higher) degenerate amino acids: keep codon
        # columns whose translated column is a single, >=4-fold
        # degenerate residue across all sequences
        aa_columns = trans_mali.getColumns()
        columns = []
        for c in range(len(aa_columns)):
            chars = set(aa_columns[c])
            chars = chars.difference(set(mali.mGapChars))
            if len(chars) == 1:
                char = list(chars)[0].upper()
                try:
                    deg = Genomics.DegeneracyAA[char]
                except KeyError:
                    continue
                if deg >= 4:
                    columns.append(c * 3)

    mali.takeColumns(columns)
def outputAnnotations(result, options):
    """output the annotations in the model.

    Reads the STATE annotation from the stockholm-formatted result and
    writes it run-length encoded as "state:count" pairs, comma-separated
    and preceded by a tab.
    """
    mali = Mali.Mali()
    mali.readFromFile(result.getData(), format="stockholm")
    annotation = mali.getAnnotation("STATE")

    # run-length encode the per-column state sequence
    segments = []
    run_length, current = 0, None
    for state in annotation:
        if state != current:
            if current:
                segments.append("%s:%i" % (current, run_length))
            current = state
            run_length = 0
        run_length += 1
    segments.append("%s:%i" % (current, run_length))

    options.stdout.write("\t%s" % ",".join(segments))
def create(self, infile):
    """create profile library from file.

    Reads profile-format alignments, converts each to an alignlib
    profile and appends it to the database.

    Returns (ninput, noutput).
    """
    self.mOutfileDatabase = open(self.mFilenameProfiles, "wb")
    outfile_index = open(self.mFilenameIndex, "w")

    ninput, noutput = 0, 0
    # fix: `mali` was referenced below but never initialized (NameError)
    mali = Mali.Mali()
    # NOTE(review): this reads from sys.stdin although *infile* is passed
    # in -- confirm whether infile should be used instead.
    while mali.readFromFile(sys.stdin, format="profile"):
        ninput += 1

        m = Mali.convertMali2Alignlib(mali)
        p = alignlib_lite.py_makeProfile(m, weightor=self.mWeightor)
        p.prepare()
        self.appendProfile(mali.getName(), p)
        noutput += 1

    return ninput, noutput
def verify(self, infile):
    """verify data in database against original data.

    Re-reads profiles, rebuilds each one from scratch and compares it
    with the profile stored in the database under the same name.

    Returns (ninput, nfound, nnotfound, ndifferent).
    """
    if not self.mIndex:
        self.__loadIndex()

    ninput, nfound, nnotfound, ndifferent = 0, 0, 0, 0
    # fix: `mali` was referenced below but never initialized (NameError)
    mali = Mali.Mali()
    # NOTE(review): this reads from sys.stdin although *infile* is passed
    # in -- confirm whether infile should be used instead.
    while mali.readFromFile(sys.stdin, format="profile"):
        ninput += 1

        m = Mali.convertMali2Alignlib(mali)
        p1 = alignlib_lite.py_makeProfile(m)
        p1.prepare()
        p2 = self.getProfile(mali.getName())

        if p1.getLength() != p2.getLength() or str(p1) != str(p2):
            ndifferent += 1
            continue

        nfound += 1

    return ninput, nfound, nnotfound, ndifferent
def _alignToProfile( infile, outfile, min_score = 0 ):
    '''align sequences in *infile* against mali

    Only alignments with a score higher than *min_score* are accepted.

    Output multiple alignment in fasta format to *outfile* and a table in :file:`outfile.log`.
    '''

    # reference alignment.
    # NOTE(review): the path is hard-coded -- confirm this is intentional
    # and not meant to be a parameter.
    mali = Mali.Mali()
    mali.readFromFile( open("../data/mouse.fasta") )
    src_mali = Mali.convertMali2Alignlib( mali )

    E.debug( "read mali: %i sequences x %i columns" % (mali.getNumSequences(), mali.getNumColumns() ))

    # add pseudocounts: two homopolymer copies of each of A,C,G,T so that
    # every residue has non-zero frequency in each profile column
    profile_mali = mali.getClone()
    n = profile_mali.getNumColumns()
    for x in "ACGT":
        for y in range(0,2):
            profile_mali.addSequence( "%s%i" % (x,y), 0, n, x * n )

    profile_mali = Mali.convertMali2Alignlib( profile_mali )
    alignlib.setDefaultEncoder( alignlib.getEncoder( alignlib.DNA4 ) )
    alignlib.setDefaultLogOddor( alignlib.makeLogOddorUniform() )

    # bg = alignlib.FrequencyVector()
    # bg.extend( ( 0.3, 0.1, 0.2, 0.2, 0.2) )
    # alignlib.setDefaultRegularizor( alignlib.makeRegularizorTatusov(
    #         alignlib.makeSubstitutionMatrixDNA4(),
    #         bg,
    #         "ACGTN",
    #         10.0, 1.0) )

    profile = alignlib.makeProfile( profile_mali )

    # wrap-around alignment mode
    alignment_mode = alignlib.ALIGNMENT_WRAP

    alignator = alignlib.makeAlignatorDPFull( alignment_mode, -5.0, -0.5 )

    map_seq2profile = alignlib.makeAlignmentVector()
    map_rseq2profile = alignlib.makeAlignmentVector()
    profile.prepare()

    # print profile

    # seed the output alignment with the original mali as one diagonal block
    build_mali = alignlib.makeMultAlignment()
    m = alignlib.makeAlignmentVector()
    m.addDiagonal( 0, n, 0 )
    build_mali.add( src_mali, m )

    outf = open( outfile, "w" )
    outf_log = open( outfile + ".info", "w" )
    outf_log.write( "read_id\tlength\tstart\tend\tparts\tcovered\tpcovered\tscore\tmali_start\tmali_end\tmali_covered\tmali_pcovered\n" )

    sequences, aa = alignlib.StringVector(), alignlib.AlignandumVector()
    ids = []

    # collect gap-free versions of the reference sequences
    for pid in mali.getIdentifiers():
        sequences.append( re.sub( "-", "", mali[pid] ) )
        ids.append( pid )

    # print str(alignlib.MultAlignmentFormatPlain( build_mali, sequences ))

    c = E.Counter()

    for s in FastaIterator.FastaIterator( open(infile)):

        E.debug("adding %s" % s.title )
        c.input += 1

        # align both the forward and the reverse-complement strand and
        # keep whichever scores higher
        rsequence = Genomics.complement(s.sequence)
        seq = alignlib.makeSequence( s.sequence )
        rseq = alignlib.makeSequence( rsequence )

        alignator.align( map_seq2profile, seq, profile )
        alignator.align( map_rseq2profile, rseq, profile )

        if map_seq2profile.getScore() > map_rseq2profile.getScore():
            m, seq, sequence = map_seq2profile, seq, s.sequence
        else:
            m, seq, sequence = map_rseq2profile, rseq, rsequence

        if m.getLength() == 0:
            c.skipped += 1
            continue

        if m.getScore() < min_score:
            c.skipped += 1
            continue

        # a wrap-around alignment may consist of several parts; add each
        # part as its own row in the output alignment
        r = getParts( m )

        covered = 0
        for mm in r:
            build_mali.add( mm )
            sequences.append( sequence )
            ids.append( s.title )
            covered += mm.getLength() - mm.getNumGaps()

        mali_covered = m.getColTo() - m.getColFrom()

        outf_log.write( "\t".join( map(str, (
            s.title,
            len(s.sequence),
            m.getRowFrom(),
            m.getRowTo(),
            len(r),
            covered,
            "%5.2f" % (100.0 * covered / len(s.sequence) ),
            m.getScore(),
            m.getColFrom(),
            m.getColTo(),
            mali_covered,
            "%5.2f" % ((100.0 * mali_covered) / mali.getNumColumns()) ) ) ) + "\n" )

        c.output += 1

    #build_mali.expand( aa )
    result = str(alignlib.MultAlignmentFormatPlain( build_mali,
                                                    sequences,
                                                    alignlib.UnalignedStacked ))

    # emit each aligned row as a fasta record with a /from-to range suffix
    for pid, data in zip(ids, result.split("\n") ):
        start, sequence, end = data.split("\t")
        outf.write(">%s/%i-%i\n%s\n" % (pid, int(start)+1, int(end), sequence) )

    outf.close()
    outf_log.close()

    E.info( "%s\n" % str(c) )
def getMali(component_id, map_component2seq_id, map_component2input_id,
            id_filter, options):
    """Retrieve the multiple alignment for a component.

    Sequences are taken either from a single master alignment (when
    options.pattern_mali contains no %s placeholder) or from a per-component
    file built from the pattern. Sequences failing options.pattern_filter
    or not part of the component are removed.

    Returns the alignment, or None when it is skipped (double entry with
    skip_doubles, or missing file with ignore_missing).

    Raises ValueError on duplicate entries or sequences whose length is
    not a multiple of 3 (codeml output), OSError on a missing alignment.
    """
    global master_mali

    rx_component = re.compile(options.pattern_component)

    mali = Mali.Mali()

    # number of %s placeholders decides master-mali vs per-component file
    nsubstitutions = len(re.findall("%s", options.pattern_mali))

    input_id = rx_component.search(component_id).groups()[0]
    input_id = map_component2input_id[input_id]

    if nsubstitutions == 0:
        # fix: identity comparison with None (`== None` before)
        if master_mali is None:
            master_mali = Mali.Mali()
            E.debug("retrieving multiple alignment from file %s" %
                    (options.pattern_mali))
            master_mali.readFromFile(open(options.pattern_mali, "r"),
                                     format=options.input_format)

        for s in map_component2seq_id[component_id]:
            if options.pattern_filter and id_filter:
                f = re.search(options.pattern_filter, s).groups()[0]
                if f not in id_filter:
                    E.debug("removing %s from %s: not in filter" %
                            (f, component_id))
                    continue

            if options.output_format == "codeml":
                if len(master_mali[s]) % 3 != 0:
                    raise ValueError(
                        "length of sequence %s is not a multiple of 3: %i" %
                        (s, len(master_mali[s])))

            if s in mali:
                if options.skip_doubles:
                    E.warn("skipped double entry %s in component %s" %
                           (s, component_id))
                    return None
                else:
                    raise ValueError("duplicate entry %s in component %s" %
                                     (s, component_id))

            mali.addEntry(master_mali.getEntry(s))
    else:
        input_filename = options.pattern_mali % tuple(
            [input_id] * nsubstitutions)

        E.debug("retrieving multiple alignment for component %s from file %s"
                % (component_id, input_filename))

        if not os.path.exists(input_filename):
            if options.ignore_missing:
                E.warn("alignment %s not found" % input_filename)
                return None
            else:
                raise OSError("alignment %s not found" % input_filename)

        mali.readFromFile(open(input_filename, "r"),
                          format=options.input_format)

        # get identifiers (and make a copy, as entries are deleted below)
        s = tuple(mali.getIdentifiers())

        for ss in s:
            if options.pattern_filter and id_filter:
                f = re.search(options.pattern_filter, ss).groups()[0]
                if f not in id_filter:
                    mali.deleteEntry(ss)
                    if options.loglevel >= 5:
                        options.stdlog.write(
                            "# removing %s from %s: not in filter.\n" %
                            (ss, component_id))
                    continue

            if ss not in map_component2seq_id[component_id]:
                if options.loglevel >= 5:
                    options.stdlog.write(
                        "# removing %s from %s: not in component list.\n" %
                        (ss, component_id))
                mali.deleteEntry(ss)
            else:
                if options.output_format == "codeml":
                    if len(mali[ss]) % 3 != 0:
                        # fix: was `raise "..."` -- string exceptions are a
                        # TypeError in Python 3
                        raise ValueError(
                            "length of sequence %s is not a multiple of 3: %i"
                            % (ss, len(mali[ss])))

    mali.setName(component_id)

    return mali
# NOTE(review): map_species2sp is only assigned when --filename-map is
# given but is used unconditionally below -- confirm the option is
# effectively mandatory here.
if options.filename_map:
    map_species2sp = IOTools.ReadMap(open(options.filename_map, "r"))
    E.debug("species map: %s" % str(map_species2sp))

identifier_parser = IdentifierParserGPipe(map_species2sp=map_species2sp)

njtree = NJTree(identifier_parser=identifier_parser)
njtree.SetLog(options.stdlog)
njtree.SetErr(options.stderr)

if options.filename_tree:
    njtree.SetSpeciesTree(options.filename_tree)

# read the multiple alignment (fasta) from stdin or a file
mali = Mali.Mali()

if options.filename_alignment == "-":
    infile = sys.stdin
else:
    infile = open(options.filename_alignment, "r")

mali.readFromFile(infile, format="fasta")

# trees over one or two genes are trivial -- emit them directly without
# running the tree builder
if mali.getLength() == 1:
    if options.loglevel >= 1:
        options.stdlog.write("# Warning: single gene tree\n")
    options.stdout.write("(%s:1);\n" % tuple(mali.getIdentifiers()))
elif mali.getLength() == 2:
    if options.loglevel >= 1:
        options.stdlog.write("# Warning: two gene tree\n")
    options.stdout.write("(%s:1,%s:1);\n" % tuple(mali.getIdentifiers()))
def selectPositiveSites(results, selection_mode, options, mali=None):
    """returns sites, which are consistently estimated to be positively selected.

    Depending on the option selection_mode, various sites are selected:

    'all': all positive sites are returned
    'consistent': only positive sites that are positive in all models and runs
    'emes': only sites that are > 0.9 in one model and at least > 0.5 in
    all other models

    If mali is given, positions that are not fully aligned are removed.

    Returns a tuple (sites, max_per_site); max_per_site maps a site to the
    maximum probability seen for it over all models/analyses.
    """
    # filter and extract functions
    if selection_mode == "emes":
        filter_f = lambda x: x.mProbability >= 0.5 and x.mOmega >= options.filter_omega
    else:
        filter_f = lambda x: x.mProbability >= options.filter_probability and x.mOmega >= options.filter_omega

    extract_f = lambda x: x.mResidue

    # maximum significance per site (for emes)
    max_per_site = {}

    total_sites = set()
    first = True
    for result in results:
        for model in options.models:
            sites = result.mSites[model]

            s1, s2 = set(), set()
            if "neb" in options.analysis:
                s1 = set(
                    map(extract_f, filter(filter_f,
                                          sites.mNEB.mPositiveSites)))
                for x in filter(filter_f, sites.mNEB.mPositiveSites):
                    if x.mResidue not in max_per_site:
                        max_per_site[x.mResidue] = 0
                    max_per_site[x.mResidue] = max(x.mProbability,
                                                   max_per_site[x.mResidue])

            if "beb" in options.analysis:
                s2 = set(
                    map(extract_f, filter(filter_f,
                                          sites.mBEB.mPositiveSites)))
                for x in filter(filter_f, sites.mBEB.mPositiveSites):
                    if x.mResidue not in max_per_site:
                        max_per_site[x.mResidue] = 0
                    max_per_site[x.mResidue] = max(x.mProbability,
                                                   max_per_site[x.mResidue])

            s = s1.union(s2)

            if first:
                total_sites = s
                first = False
            else:
                if selection_mode == "all":
                    total_sites = total_sites.union(s)
                elif selection_mode == "consistent":
                    total_sites = total_sites.intersection(s)
                elif selection_mode == "emes":
                    total_sites = total_sites.intersection(s)

    if selection_mode == "emes":
        if options.loglevel >= 2:
            options.stdlog.write(
                "# before EMES filtering %i positive sites: mode %s, P>%5.2f\n"
                % (len(total_sites), selection_mode, 0.5))

        # filter according to emes: maximum significance larger than 0.9
        total_sites = set(
            [x for x in total_sites if max_per_site[x] > 0.9])

        if options.loglevel >= 2:
            options.stdlog.write(
                "# after EMES filtering %i positive sites: mode %s, P>%5.2f\n"
                % (len(total_sites), selection_mode, 0.9))
    else:
        if options.loglevel >= 2:
            options.stdlog.write(
                "# extracted %i positive sites: mode %s, P>%5.2f\n" %
                # fix: attribute name was misspelled (`filter_probabiltiy`),
                # which raised AttributeError at runtime
                (len(total_sites), selection_mode,
                 options.filter_probability))

    if mali and options.filter_mali:

        if options.filter_mali == "gaps":
            nfiltered = 0
            mali_length = mali.getLength()
            # fix: materialize as a list -- map() returns an iterator in
            # Python 3, which cannot be indexed below
            column_data = [
                Mali.MaliData(x, gap_chars="Nn", mask_chars="-.")
                for x in mali.getColumns()]

            new_sites = set()
            for x in total_sites:
                # PAML uses one-based coordinates
                column = column_data[x - 1]
                if column.mNChars != mali_length:
                    nfiltered += 1
                    if options.loglevel >= 3:
                        options.stdlog.write(
                            "# rejected position %i due to mali\n" % x)
                    continue
                new_sites.add(x)

            total_sites = new_sites

            if options.loglevel >= 2:
                options.stdlog.write(
                    "# after MALI filtering %i positive sites\n" %
                    (len(total_sites)))

    return total_sites, max_per_site
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: codemls2tsv.py 2781 2009-09-10 11:33:14Z andreas $"
    )

    parser.add_option("--methods", dest="methods", type="choice",
                      action="append",
                      choices=("summary-numbers", "jalview",
                               "positive-site-table", "positive-site-list",
                               "count-positive-sites"),
                      help="methods for analysis.")
    parser.add_option("--selection-mode", dest="selection_mode",
                      type="choice",
                      choices=("all", "consistent", "emes"),
                      help="how to select positive sites.")
    parser.add_option("--prefix", dest="prefix", type="string",
                      help="prefix for rows.")
    parser.add_option("--pattern-input-filenames",
                      dest="pattern_input_filenames", type="string",
                      help="input pattern.")
    parser.add_option(
        "--filter-probability", dest="filter_probability", type="float",
        help=
        "threshold for probability above which to include positive sites [default=%default]."
    )
    parser.add_option(
        "--filter-omega", dest="filter_omega", type="float",
        help=
        "threshold for omega above which to include positive sites [default=%default]."
    )
    parser.add_option("--models", dest="models", type="string",
                      help="restrict output to set of site specific models.")
    parser.add_option("--analysis", dest="analysis", type="string",
                      help="restrict output to set of analysis [beb|neb].")
    parser.add_option("--significance-threshold",
                      dest="significance_threshold", type="float",
                      help="significance threshold for log-likelihood test.")
    parser.add_option("--filter-mali", dest="filter_mali", type="choice",
                      choices=("none", "gaps"),
                      help="filter by mali to remove gapped positions.")
    parser.add_option(
        "--filename-mali", dest="filename_mali", type="string",
        help=
        "filename with multiple alignment used for calculating sites - used for filtering"
    )
    parser.add_option(
        "--filename-map-mali", dest="filename_map_mali", type="string",
        help="filename with multiple alignment to map sites onto.")
    parser.add_option(
        "--jalview-titles", dest="jalview_titles", type="string",
        help="comma separated list of jalview annotation titles.")
    parser.add_option("--jalview-symbol", dest="jalview_symbol",
                      type="string",
                      help="symbol to use in jalview.")

    parser.set_defaults(
        methods=[],
        prefix=None,
        filter_probability=0,
        filter_omega=0,
        models="",
        analysis="",
        significance_threshold=0.05,
        selection_mode="consistent",
        filename_mali=None,
        filename_map_mali=None,
        jalview_symbol="*",
        jalview_titles="",
        filter_mali=None,
    )

    (options, args) = E.Start(parser)

    if options.jalview_titles:
        options.jalview_titles = options.jalview_titles.split(",")
    else:
        options.jalview_titles = args

    options.models = options.models.split(",")
    options.analysis = options.analysis.split(",")

    for a in options.analysis:
        if a not in ("beb", "neb"):
            # fix: was `raise "..."` -- string exceptions are a TypeError
            # in Python 3
            raise ValueError(
                "unknown analysis section: '%s', possible values are 'beb' and/or 'neb'"
                % a)

    for a in options.models:
        if a not in ("8", "2", "3"):
            raise ValueError(
                "unknown model: '%s', possible values are 2, 3, 8" % a)

    codeml = WrapperCodeML.CodeMLSites()

    # filter and extract functions
    filter_f = lambda x: x.mProbability >= options.filter_probability and x.mOmega >= options.filter_omega
    extract_f = lambda x: x.mResidue

    # read multiple results
    results = []
    ninput, noutput, nskipped = 0, 0, 0

    headers = []
    for f in args:
        ninput += 1
        try:
            results.append(codeml.parseOutput(open(f, "r").readlines()))
        except WrapperCodeML.UsageError:
            if options.loglevel >= 1:
                options.stdlog.write("# no input from %s\n" % f)
            nskipped += 1
            continue
        noutput += 1
        headers.append(f)

    # map of nested model (key) to more general model
    map_nested_models = {'8': '7', '2': '1', '3': '0'}

    if options.filename_mali:
        mali = Mali.Mali()
        mali.readFromFile(open(options.filename_mali, "r"))
    else:
        mali = None

    ###############################################################
    # use multiple alignment to map residues to a reference mali
    # or a sequence.
    ###############################################################
    if options.filename_map_mali:

        if not mali:
            raise ValueError(
                "please supply the input multiple alignment, if residues are to be mapped."
            )

        # translate the alignments
        def translate(s):
            # in-place codon -> amino acid translation of one entry
            sequence = s.mString
            seq = []
            for codon in [sequence[x:x + 3]
                          for x in range(0, len(sequence), 3)]:
                aa = Genomics.MapCodon2AA(codon)
                seq.append(aa)
            s.mString = "".join(seq)

        tmali = Mali.Mali()
        tmali.readFromFile(open(options.filename_mali, "r"))
        tmali.apply(translate)

        tmap_mali = Mali.Mali()
        tmap_mali.readFromFile(open(options.filename_map_mali, "r"))

        if tmap_mali.getAlphabet() == "na":
            tmap_mali.apply(translate)

        map_old2new = alignlib_lite.py_makeAlignmentVector()

        mali1 = alignlib_lite.py_makeProfileFromMali(
            convertMali2Mali(tmali))

        if tmap_mali.getLength() == 1:
            s = tmap_mali.values()[0].mString
            mali2 = alignlib_lite.py_makeSequence(s)
            # see if you can find an identical subsequence and then align
            # to thisD
            for x in tmali.values():
                if s in re.sub("[- .]+", "", x.mString):
                    mali1 = alignlib_lite.py_makeSequence(x.mString)
                    break
        else:
            mali2 = alignlib_lite.py_makeProfileFromMali(
                convertMali2Mali(tmap_mali))

        alignator = alignlib_lite.py_makeAlignatorDPFull(
            alignlib_lite.py_ALIGNMENT_LOCAL, -10.0, -2.0)
        alignator.align(map_old2new, mali1, mali2)

        consensus = tmap_mali.getConsensus()

        if options.loglevel >= 4:
            options.stdlog.write("# alphabet: %s\n" %
                                 tmap_mali.getAlphabet())
            options.stdlog.write("# orig : %s\n" % tmali.getConsensus())
            options.stdlog.write("# mapped: %s\n" % consensus)
            options.stdlog.write("# alignment: %s\n" % map_old2new.Write())
    else:
        map_old2new = None

    for method in options.methods:

        if method == "summary-numbers":

            options.stdlog.write( \
"""# Numbers of positive sites.
#
# The consistent row/column contains positive sites that are significant
# (above thresholds for probability and omega) for all models/analysis
# that have been selected (label: cons).
#
# The log-likelihood ratio test is performed for model pairs, depending
# on the output chosen.
# Significance threshold: %6.4f
# The pairs are 8 versus 7 and 2 versus 1 and 3 versus 0.
#
""" % options.significance_threshold )

            # write header
            if options.prefix:
                options.stdout.write("prefix\t")

            options.stdout.write("method\tnseq\t")

            h = []
            for model in options.models:
                for analysis in options.analysis:
                    h.append("%s%s" % (analysis, model))
                h.append("p%s" % (model))
                h.append("df%s" % (model))
                h.append("chi%s" % (model))
                h.append("lrt%s" % (model))

            options.stdout.write("\t".join(h))
            options.stdout.write("\tcons\tpassed\tfilename\n")

            nmethod = 0

            consistent_cols = [None for x in range(len(options.analysis))]
            passed_tests = {}
            for m in options.models:
                passed_tests[m] = 0

            for result in results:

                row_consistent = None

                if options.prefix:
                    options.stdout.write("%s" % (options.prefix))

                options.stdout.write("%i" % nmethod)
                options.stdout.write("\t%i" % (result.mNumSequences))

                npassed = 0

                for model in options.models:

                    sites = result.mSites[model]

                    # do significance test
                    full_model, null_model = model, map_nested_models[model]

                    lrt = Stats.doLogLikelihoodTest(
                        result.mSites[full_model].mLogLikelihood,
                        result.mSites[full_model].mNumParameters,
                        result.mSites[null_model].mLogLikelihood,
                        result.mSites[null_model].mNumParameters,
                        options.significance_threshold)

                    x = 0
                    for analysis in options.analysis:

                        if analysis == "neb":
                            s = set(
                                map(extract_f,
                                    filter(filter_f,
                                           sites.mNEB.mPositiveSites)))

                        elif analysis == "beb":
                            s = set(
                                map(extract_f,
                                    filter(filter_f,
                                           sites.mBEB.mPositiveSites)))

                        options.stdout.write("\t%i" % (len(s)))

                        # a failed LRT clears the site set for consistency
                        # tracking
                        if not lrt.mPassed:
                            s = set()

                        if row_consistent is None:
                            row_consistent = s
                        else:
                            row_consistent = row_consistent.intersection(s)

                        if consistent_cols[x] is None:
                            consistent_cols[x] = s
                        else:
                            consistent_cols[x] = consistent_cols[
                                x].intersection(s)

                        x += 1

                    if lrt.mPassed:
                        c = "passed"
                        passed_tests[model] += 1
                        npassed += 1
                    else:
                        c = "failed"

                    options.stdout.write("\t%5.2e\t%i\t%5.2f\t%s" %
                                         (lrt.mProbability,
                                          lrt.mDegreesFreedom,
                                          lrt.mChiSquaredValue,
                                          c))

                options.stdout.write(
                    "\t%i\t%i\t%s\n" %
                    (len(row_consistent), npassed, headers[nmethod]))

                nmethod += 1

            # summary row over all results
            if options.prefix:
                options.stdout.write("%s\t" % options.prefix)

            options.stdout.write("cons")

            row_consistent = None
            total_passed = 0
            for model in options.models:
                x = 0
                for analysis in options.analysis:
                    s = consistent_cols[x]
                    if s is None:
                        s = set()

                    options.stdout.write("\t%i" % (len(s)))

                    if row_consistent is None:
                        row_consistent = s
                    else:
                        row_consistent = row_consistent.intersection(s)

                    x += 1

                options.stdout.write("\tna\t%i" % passed_tests[model])
                total_passed += passed_tests[model]

            options.stdout.write("\t%i\t%i\n" %
                                 (len(row_consistent), total_passed))

        elif method == "jalview":

            options.stdout.write("JALVIEW_ANNOTATION\n")
            options.stdout.write("# Created: %s\n\n" %
                                 (time.asctime(time.localtime(time.time()))))

            x = 0
            for result in results:
                sites, significance = selectPositiveSites(
                    [result], options.selection_mode, options, mali)

                codes = [""] * result.mLength

                if len(sites) == 0:
                    continue

                for site in sites:
                    codes[site - 1] = options.jalview_symbol

                options.stdout.write(
                    "NO_GRAPH\t%s\t%s\n" %
                    (options.jalview_titles[x], "|".join(codes)))
                x += 1

        elif method == "count-positive-sites":

            sites, significance = selectPositiveSites(
                results, options.selection_mode, options, mali)

            options.stdout.write("%i\n" % (len(sites)))

        elif method in ("positive-site-table", ):

            sites, significance = selectPositiveSites(
                results, options.selection_mode, options, mali)

            headers = ["site", "P"]
            if map_old2new:
                headers.append("mapped")
                headers.append("Pm")

            options.stdout.write("\t".join(headers) + "\n")

            sites = list(sites)
            sites.sort()
            nmapped, nunmapped = 0, 0
            for site in sites:
                values = [site, "%6.4f" % significance[site]]

                if map_old2new:
                    r = map_old2new.mapRowToCol(site)
                    if r == 0:
                        values.append("na")
                        values.append("")
                        nunmapped += 1
                        if options.loglevel >= 2:
                            options.stdlog.write(
                                "# unmapped residue: %i\n" % site)
                    else:
                        values.append(r)
                        values.append(consensus[r - 1])
                        nmapped += 1

                options.stdout.write("\t".join(map(str, (values))) + "\n")

            if options.loglevel >= 1:
                options.stdlog.write(
                    "# sites: ninput=%i, noutput=%i, nskipped=%i\n" %
                    (len(sites), nmapped, nunmapped))

    E.info("ninput=%i, noutput=%i, nskipped=%i" %
           (ninput, noutput, nskipped))

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """
    # fix: the default was `argv=sys.argv`, which is evaluated once at
    # import time; resolve it at call time instead (backward compatible).
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: mali2mali.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-i", "--input-format", dest="input_format", type="choice",
        choices=("plain", "fasta", "clustal", "stockholm", "phylip"),
        help="input format of multiple alignment [default=%default].")

    parser.add_option(
        "-o", "--output-format", dest="output_format", type="choice",
        choices=("plain", "fasta", "stockholm", "phylip", "nexus",
                 "plain-fasta"),
        help="output format of multiple alignment [default=%default].")

    parser.add_option(
        "--with-ranges", dest="with_ranges", action="store_true",
        help=
        "output alignment ranges (suffix /from-to after identifier) [default=%default]."
    )

    parser.add_option(
        "--without-ranges", dest="with_ranges", action="store_false",
        help=
        "do not output alignment ranges (suffix /from-to after identifier) [default=%default]."
    )

    parser.add_option("-u", "--allow-duplicates", dest="allow_duplicates",
                      action="store_true",
                      help="permit duplicate entries [default=%default].")

    parser.add_option(
        "-m", "--method", dest="methods", type="string",
        help=
        """methods to apply. Several methods can be specified in a ','-separated list [default=%default]."""
    )

    parser.add_option(
        "-p", "--parameters", dest="parameters", type="string",
        help="parameter stack for methods that require one [default=%default]."
    )

    parser.add_option(
        "-a", "--mask-char", dest="mask_char", type="string",
        help="character to identify/set masked characters [default=%default].")

    parser.set_defaults(
        input_format="fasta",
        output_format="fasta",
        methods="",
        parameters="",
        mask_char="x",
        gap_chars="-.nN",
        with_ranges=True,
        allow_duplicates=False,
    )

    (options, args) = E.Start(parser)

    options.methods = options.methods.split(",")
    options.parameters = options.parameters.split(",")

    # 1. read multiple alignment in various formats
    if options.allow_duplicates:
        mali = Mali.SequenceCollection()
    else:
        mali = Mali.Mali()

    t1 = time.time()

    mali.readFromFile(options.stdin, format=options.input_format)

    E.info("read mali with %i entries in %i seconds." %
           (len(mali), time.time() - t1))

    if len(mali) == 0:
        raise ValueError("empty multiple alignment")

    for method in options.methods:

        t1 = time.time()

        if method == "remove-unaligned-ends":
            mali.removeUnalignedEnds()
        elif method == "remove-end-gaps":
            mali.removeEndGaps()
        elif method == "remove-all-gaps":
            mali.removeGaps(minimum_gaps=len(mali))
        elif method == "remove-any-gaps":
            mali.removeGaps(minimum_gaps=1)
        elif method == "remove-some-gaps":
            minimum_gaps = int(options.parameters[0])
            del options.parameters[0]
            mali.removeGaps(minimum_gaps=minimum_gaps)
        elif method == "remove-empty-sequences":
            mali.removeEmptySequences()
        elif method == "upper":
            mali.upperCase()
        elif method == "lower":
            mali.lowerCase()
        elif method == "mark-codons":
            mali.markCodons()
        elif method == "remove-stops":
            mali.removePattern(
                lambda x: x.upper() in ("TAG", "TAA", "TGA"),
                allowed_matches=0,
                minimum_matches=1,
                delete_frame=3,
                search_frame=3)
        elif method == "shift-alignment":
            map_id2offset = IOTools.ReadMap(open(options.parameters[0], "r"),
                                            map_functions=(str, int))
            del options.parameters[0]
            mali.shiftAlignment(map_id2offset)
        elif method == "propagate-masks":
            mali.propagateMasks(mask_char=options.mask_char)
        elif method == "recount":
            mali.recount()
        elif method in ("mark-transitions", "filter-odd-transitions",
                        "filter-even-transitions", "keep-even-segments",
                        "keep-odd-segments"):

            # transitions come either from a file or from a ':'-separated
            # list on the parameter stack
            if os.path.exists(options.parameters[0]):
                map_id2transitions = IOTools.readMultiMap(
                    open(options.parameters[0], "r"),
                    map_functions=(str, int))
            else:
                map_id2transitions = {}
                # fix: map() objects have no .sort() in Python 3; build
                # a sorted list instead
                r = sorted(map(int, options.parameters[0].split(':')))
                map_id2transitions["mali"] = r

            del options.parameters[0]
            if method == "mark-transitions":
                mali.markTransitions(map_id2transitions)
            elif method in ("filter-odd-transitions", "keep-even-segments"):
                mali.markTransitions(map_id2transitions, mode="keep-odd")
            elif method in ("filter-even-transitions", "keep-odd-segments"):
                mali.markTransitions(map_id2transitions, mode="keep-even")

        elif method == "propagate-transitions":
            mali.propagateTransitions()

        elif method == "map-annotation":
            # map annotations in one mali (stockholm-format) to the
            # annotations in another. Note: the first two sequence
            # identifiers must be shared and the sequence of the same length
            other_mali = Mali.Mali()
            other_mali.readFromFile(open(options.parameters[0], "r"),
                                    format="stockholm")
            del options.parameters[0]
            mali.copyAnnotations(other_mali)

        elif method == "add-annotation":
            annotation_type, annotation_file = options.parameters[:2]
            del options.parameters[:2]
            AddAnnotation(mali, annotation_type, annotation_file)

        elif method == "mask-columns":
            annotation_type, annotation_file = options.parameters[:2]
            del options.parameters[:2]
            maskColumns(mali, annotation_type, annotation_file)

        elif method == "remove-unaligned-pairs":
            removeUnalignedPairs(mali, options)

        elif method == "filter-3rd":
            filterMali(mali, "3rd")

        elif method == "filter-4d":
            filterMali(mali, "4d")

        elif method in ("mask-seg", "mask-bias"):
            a, b = method.split("-")
            maskMali(mali, b)

        elif method == "exclude-with-stop":
            mali.filter(method="with-stop")

        # fix: this branch compared against "exclude-with-stop" a second
        # time and was therefore unreachable; the body clearly belongs to
        # frameshift filtering.
        elif method == "exclude-with-frameshift":
            mali.filter(method="with-frameshift")

        E.info("applied method %s in %i seconds." %
               (method, time.time() - t1))

    mali.writeToFile(options.stdout,
                     format=options.output_format,
                     write_ranges=options.with_ranges)

    E.Stop()
def runXrate(mali, pairs, options):
    """Estimate pairwise distances with xrate.

    For each (x, y) index pair in *pairs* a two-sequence stockholm
    alignment is written to a temporary file and an xrate model
    (selected by options.distance: K80, JC69 or REV) is trained on it.
    Results are written tab-separated to options.stdout.

    NOTE(review): this function reads the module-level ``ids`` list
    (sequence identifiers) rather than deriving it from *mali* —
    callers must have set it up; verify against the calling script.

    :param mali: multiple alignment providing the sequences.
    :param pairs: iterable of (x, y) index pairs into ``ids``.
    :param options: command line options (distance, format, ...).
    :raises NotImplementedError: for distances without an xrate model.
    """
    from XGram.Generator.Prebuilt import DNA
    from XGram.Model import Annotation
    import XGram.Run

    xgram = XGram.XGram()
    if options.xrate_min_increment:
        xgram.setMinIncrement(options.xrate_min_increment)

    ninput, noutput, nskipped = 0, 0, 0

    tempdir = tempfile.mkdtemp()
    data = tempdir + "/data"

    # ensure the temporary directory is removed even if training fails
    try:
        if options.distance == "K80":
            model = DNA.buildModel(substitution_model="k80")
        elif options.distance == "JC69":
            model = DNA.buildModel(substitution_model="jc69")
        elif options.distance == "REV":
            model = DNA.buildModel(substitution_model="gtr")
        else:
            # was a string exception - a TypeError since Python 2.6
            raise NotImplementedError(
                "distance %s not implemented for xrate" % (options.distance))

        writeModel(model, "input", options)

        if options.output_format == "list":
            options.stdout.write("\t".join(
                ("seq1", "seq2", "distance", "lnL", "alpha", "kappa", "msg")))

            if options.with_counts:
                options.stdout.write(
                    "\t%s" % Genomics.SequencePairInfo().getHeader())
            options.stdout.write("\n")

        for x, y in pairs:
            m1 = mali.getSequence(ids[x])
            ninput += 1
            temp_mali = Mali.Mali()
            m2 = mali.getSequence(ids[y])

            temp_mali.addSequence(m1.mId, m1.mFrom, m1.mTo, m1.mString)
            temp_mali.addSequence(m2.mId, m2.mFrom, m2.mTo, m2.mString)

            outfile = open(data, "w")
            # a two-taxon star tree with unit branch length; rates below
            # are therefore interpreted relative to a total length of 1.
            temp_mali.writeToFile(
                outfile,
                format="stockholm",
                write_ranges=False,
                options=("#=GF NH (%s:1.0)%s;" %
                         tuple(temp_mali.getIdentifiers()), ))
            outfile.close()

            o_alpha, o_kappa = "na", "na"
            o_distance = "na"
            msg = ""

            if options.test_xrate:
                # grid over start values to test convergence of the optimizer
                for alpha in (0.1, 0.5, 1.0, 1.5):
                    for beta in (0.1, 0.5, 1.0, 1.5):
                        model.mGrammar.setParameter("alpha", alpha)
                        model.mGrammar.setParameter("beta", beta)
                        result = xgram.train(model, data)
                        trained_model = result.getModel()
                        xalpha, xbeta = \
                            (trained_model.mGrammar.getParameter('alpha'),
                             trained_model.mGrammar.getParameter('beta'))
                        # this assumes that the branch length in the input is
                        # normalized to 1 - this is the normalization constant
                        o_distance = options.format % (2 * xbeta + xalpha)
                        o_kappa = options.format % (xalpha / xbeta)
                        msg = "alpha=%6.4f, beta=%6.4f" % (xalpha, xbeta)

                        options.stdout.write("\t".join(
                            ("%f" % alpha, "%f" % beta, o_distance,
                             options.format % result.getLogLikelihood(),
                             o_alpha, o_kappa, msg)))
                        options.stdout.write("\n")
                continue

            options.stdout.write("%s\t%s\t" % (m1.mId, m2.mId))

            if options.distance in ("K80", ):
                result = xgram.train(model, data)
                trained_model = result.getModel()
            elif options.distance in ("REV", ):
                result = xgram.train(model, data)
                trained_model = result.getModel()
                alpha, beta, gamma, delta, epsilon, theta = \
                    (trained_model.mGrammar.getParameter('alpha'),
                     trained_model.mGrammar.getParameter('beta'),
                     trained_model.mGrammar.getParameter('gamma'),
                     trained_model.mGrammar.getParameter('delta'),
                     trained_model.mGrammar.getParameter('epsilon'),
                     trained_model.mGrammar.getParameter('theta'))

                pi = trained_model.evaluateTerminalFrequencies(
                    ('A0', ))[('A0', )]
                matrix = trained_model.evaluateRateMatrix(('A0', ))[('A0', )]
                q, d = RateEstimation.getDistanceGTR(pi, matrix)
                o_distance = options.format % (d)
                o_kappa = ""
                msg = "alpha=%6.4f, beta=%6.4f, gamma=%6.4f, delta=%6.4f, epsilon=%6.4f, theta=%6.4f" % (
                    alpha, beta, gamma, delta, epsilon, theta)
            elif options.distance in ('JC69', ):
                result = xgram.buildTree(model, data)

            if options.distance == "K80":
                alpha, beta = \
                    (trained_model.mGrammar.getParameter('alpha'),
                     trained_model.mGrammar.getParameter('beta'))
                # this assumes that the branch length in the input is
                # normalized to 1 - this is the normalization constant
                o_distance = options.format % (2 * beta + alpha)
                o_kappa = options.format % (alpha / beta)
                msg = "alpha=%6.4f, beta=%6.4f" % (alpha, beta)
                alpha = "na"
            elif options.distance == "JC69":
                tree = result.getTree()
                # multiply distance by 3, as rates are set to 1 and
                # thus the matrix is scaled by a factor of 3
                o_distance = options.format % (
                    3.0 *
                    float(re.search("\(\S+:([0-9.]+)\)", tree).groups()[0]))
                o_kappa = "na"
                msg = ""

            writeModel(result.mModel, "trained", options)

            options.stdout.write("\t".join(
                (o_distance, options.format % result.getLogLikelihood(),
                 o_alpha, o_kappa, msg)))

            if options.with_counts:
                info = Genomics.CalculatePairIndices(
                    mali[ids[x]], mali[ids[y]],
                    with_codons=options.is_codons)
                options.stdout.write("\t%s" % (str(info)))

            options.stdout.write("\n")
    finally:
        shutil.rmtree(tempdir)
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser( version= "%prog version: $Id: mali2predictions.py 2781 2009-09-10 11:33:14Z andreas $", usage=globals()["__doc__"]) parser.add_option("-g", "--genome-file", dest="genome_file", type="string", help="filename with genome.") parser.add_option("-l", "--filename-locations", dest="filename_locations", type="string", help="filename with locations") parser.add_option("-m", "--master", dest="master", type="string", help="the master determines the frame.") parser.set_defaults(filename_locations=None, gap_chars="-.", master=None) (options, args) = E.Start(parser, add_pipe_options=True) if len(args) > 0: print USAGE, "no arguments required." sys.exit(2) mali = Mali.Mali() mali.readFromFile(sys.stdin) identifiers = mali.getIdentifiers() aligned_columns, aligned_exons = getAlignedColumns(mali, options) map_id2location = {} if options.filename_locations: map_id2location = IOTools.ReadMap(open(options.filename_locations, "r")) options.stdout.write(Prediction.Prediction().getHeader() + "\n") nid = 1 for identifier in identifiers: if options.loglevel >= 2: options.stdlog.write("# processing %s\n" % (identifier)) entry = mali.getEntry(identifier) sequence = entry.mString if sequence[0] not in string.lowercase: raise "all sequences should start with an exon." was_exon = True d = 0 alignment = [] carry_over = 0 last_codon = [] codon = [] nchars_in_codon = 0 n = 0 last_master_residue = 0 master_residue = 0 for column in range(len(sequence)): c = sequence[column] is_gap = c in options.gap_chars is_aligned = column in aligned_columns is_exon = column in aligned_exons if is_gap: continue if is_exon: master_residue = aligned_exons[column] codon.append((n, master_residue)) n += 1 # check if we have a complete codon if is_exon: # A codon is complete, if it ends at frame 2 or # it spans more than one codons in the master. 
# Gaps in the master that are a multiple of 3 are ignored d = master_residue - last_master_residue - 1 if master_residue % 3 == 2 or (d % 3 != 0 and d > 0): if last_codon: d = codon[0][0] - last_codon[-1][0] - 1 if d > 0: # add in-frame introns if d > 10: alignment.append(["5", 0, 2]) alignment.append(["I", 0, d - 4]) alignment.append(["3", 0, 2]) else: raise "untreated case" alignment += processCodon(codon) last_codon = codon codon = [] last_master_residue = master_residue last = alignment[0] new_alignment = [] for this in alignment[1:]: if this[0] == last[0]: last[1] += this[1] last[2] += this[2] continue new_alignment.append(last) last = this new_alignment.append(last) if options.loglevel >= 4: options.stdlog.write("# output=%s\n" % (str(new_alignment))) assert (new_alignment[-1][2] % 3 == 0) lalignment = sum(map(lambda x: x[2], new_alignment)) prediction = Prediction.Prediction() prediction.mQueryToken = identifier genomic_sequence = re.sub("[%s]" % options.gap_chars, "", mali[identifier]) prediction.mPredictionId = nid nid += 1 if identifier in map_id2location: prediction.mSbjctToken, prediction.mSbjctStrand, sfrom, sto = map_id2location[ identifier].split(":")[:4] prediction.mSbjctGenomeFrom = int(sfrom) + entry.mFrom prediction.mSbjctGenomeTo = int(sto) else: prediction.mSbjctToken = "unk" prediction.mSbjctStrand = "+" prediction.mSbjctGenomeFrom = 0 prediction.mQueryCoverage = 100 prediction.mPercentIdentity = 100 prediction.mPercentSimilarity = 100 prediction.mQueryLength = prediction.mQueryTo prediction.mSbjctGenomeTo = prediction.mSbjctGenomeFrom + lalignment prediction.mMapPeptide2Genome = new_alignment prediction.mAlignmentString = string.join( map(lambda x: string.join(map(str, x), " "), prediction.mMapPeptide2Genome), " ") prediction.mMapPeptide2Translation, prediction.mTranslation = Genomics.Alignment2PeptideAlignment( prediction.mMapPeptide2Genome, 0, 0, genomic_sequence) (prediction.mNIntrons, prediction.mNFrameShifts, prediction.mNGaps, 
prediction.mNSplits, prediction.mNStopCodons, disruptions) = \ Genomics.CountGeneFeatures(0, prediction.mMapPeptide2Genome, genomic_sequence) options.stdout.write(str(prediction) + "\n") E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Computes ka/ks (dN/dS) values for a multiple alignment read from
    stdin, either with PAML's codeml or with xrate.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: mali2kaks.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("--set-omega", dest="omega", type="float",
                      help="initial omega value.")

    parser.add_option("--set-kappa", dest="kappa", type="float",
                      help="initial kappa value.")

    parser.add_option("--fix-kappa", dest="fix_kappa", action="store_true",
                      help="do not estimate kappa.")

    parser.add_option("--fix-omega", dest="fix_omega", action="store_true",
                      help="do not estimate omega.")

    parser.add_option("--set-codon-frequencies", dest="codon_frequencies",
                      type="choice",
                      choices=("uniform", "fequal", "f3x4", "f1x4", "f61"),
                      help="set codon frequencies.")

    parser.add_option("--set-method", dest="paml_method", type="int",
                      help="set paml optimization method [0|1].")

    parser.add_option("--set-sequence-type", dest="seqtype", type="choice",
                      choices=("codon", "aa", "trans"),
                      help="sequence type.")

    parser.add_option(
        "--set-clean-data", dest="clean_data", type="choice",
        choices=("0", "1"),
        help=
        "PAML should cleanup data: 0=only gaps within pair are removed, 1=columns in the mali with gaps are removed."
    )

    parser.add_option("--dump", dest="dump", action="store_true",
                      help="dump raw output [%default].")

    parser.add_option("--set-optimization-threshold",
                      dest="optimization_threshold", type="string",
                      help="set paml optimization threshold [%default].")

    parser.add_option("-i", "--input-format", dest="input_format",
                      type="choice",
                      choices=("plain", "fasta", "clustal", "stockholm",
                               "phylip"),
                      help="input format of multiple alignment [%default].")

    parser.add_option("--pairwise", dest="pairwise", action="store_true",
                      help="force pairwise comparison [%default].")

    parser.add_option("--iteration", dest="iteration", type="choice",
                      choices=("all-vs-all", "first-vs-all", "pairwise",
                               "tree"),
                      help="iteration mode [%default].")

    parser.add_option(
        "--no-clean", dest="clean_mali", action="store_false",
        help=
        "do not clean multiple alignment before submitting to codeml. It might take too long for very large sequences."
    )

    parser.add_option("--method", dest="method", type="choice",
                      choices=("paml", "xrate"),
                      help="choose method for rate computation [%default]")

    parser.add_option("--xrate-model", dest="xrate_model", type="choice",
                      choices=("f3x4-two", "f3x4-four", "sn", "akaksgc",
                               "ef3x4-four", "f3x4-fourproducts"),
                      help="models to use [%default].")

    parser.add_option("-w", "--write", dest="write", type="choice",
                      action="append",
                      choices=("input_fixed", "trained_fixed",
                               "input_variable", "trained_variable", "all"),
                      help="output sections to write [%default].")

    parser.add_option("-o", "--output-pattern", dest="output_pattern",
                      type="string",
                      help="output pattern for output files [%default].")

    parser.add_option("--xrate-insert-frequencies",
                      dest="xrate_insert_frequencies", action="store_true",
                      help="estimate codon frequencies from input [%default].")

    parser.add_option("--xrate-uniform-frequencies",
                      dest="xrate_insert_frequencies", action="store_false",
                      help="use uniform codon frequencies [%default].")

    parser.add_option("--xrate-fix-frequencies",
                      dest="xrate_fix_frequencies", action="store_true",
                      help="set initial frequencies to const [%default].")

    parser.add_option("--xrate-estimate-frequencies",
                      dest="xrate_fix_frequencies", action="store_false",
                      help="estimate nucleotide frequencies [%default].")

    parser.add_option(
        "--xrate-fix-rates", dest="fix_rates", type="string",
        help=
        """fix rates to specified values. Note that the number of rates has to match the ones in the model. Provide values in a comma-separated list [%default].""")

    parser.add_option(
        "--xrate-min-increment", dest="xrate_min_increment", type=float,
        help="minimum increment to stop iteration in xrate [%default].")

    parser.add_option(
        "--min-overlap", dest="min_overlap", type="int",
        help="minimum overlap between a sequence pair in residues [%default].")

    parser.add_option(
        "--with-rho", dest="with_rho", action="store_true",
        help=
        "output rho values (substitution rates per codon). This requires a patched version of PAML [%default]."
    )

    parser.add_option(
        "--with-counts", dest="with_counts", action="store_true",
        help=
        "output counts of aligned positions, transitions and transversions [%default]."
    )

    parser.add_option("--remove-stops", dest="remove_stops",
                      action="store_true",
                      help="remove stop codons [%default].")

    parser.add_option(
        "--replicates", dest="replicates", type="int",
        help="in benchmarking mode expect ## replicates [%default].")

    parser.add_option("--tree", dest="tree", type="string",
                      help="use tree for estimation [%default].")

    parser.set_defaults(
        input_format="fasta",
        omega=None,
        codon_frequencies=None,
        paml_method=None,
        optimization_threshold=None,
        seqtype="codon",
        dump=False,
        clean_data=False,
        min_overlap=60,
        gap_chars="-.",
        mask_chars="nN",
        pairwise=False,
        kappa=None,
        fix_kappa=False,
        fix_omega=False,
        clean_mali=True,
        method="paml",
        report_step=1000,
        loglevel=1,
        xrate_insert_frequencies=False,
        xrate_fix_frequencies=False,
        write=[],
        output_pattern="%s.eg",
        value_format="%6.4f",
        fix_rates=None,
        xrate_from_parameters=False,
        xrate_model="f3x4-four",
        with_rho=False,
        with_counts=False,
        iteration="all-vs-all",
        remove_stops=False,
        xrate_min_increment=0.000001,
        replicates=None,
        tree=None,
    )

    (options, args) = E.Start(parser)

    if options.method == "xrate":
        # imports for xrate computation
        from XGram.Generator.Prebuilt import Codons
        from XGram.Model import Annotation
        import XGram.Run
        import Bio.Data.CodonTable

        # paml like estimation using xrate
        if options.codon_frequencies == "uniform":
            options.xrate_fix_frequencies = True
            options.xrate_insert_frequencies = False
        elif options.codon_frequencies == "f3x4":
            options.xrate_fix_frequencies = True
            options.xrate_insert_frequencies = True
    elif options.method == "paml":
        if not options.codon_frequencies:
            options.codon_frequencies = "F3X4"

    if options.fix_rates:
        options.fix_rates = map(float, options.fix_rates.split(","))

    if options.pairwise or options.replicates:
        # read sequences, but not as a multiple alignment. This permits
        # multiple names.
        mali = Mali.SequenceCollection()
    else:
        mali = Mali.Mali()

    mali.readFromFile(sys.stdin, format=options.input_format)

    E.info("read multiple alignment")

    if mali.getLength() == 0:
        # was a string exception - a TypeError since Python 2.6
        raise ValueError("refusing to process empty alignment.")

    ################################################################
    # setup methods
    ################################################################
    options.stdout.write(
        "seq1\tseq2\tdN\tdS\tdNdS\tN\tS\tdN_err\tdS_err\tkappa\tlnL\ttau")

    if options.with_rho:
        options.stdout.write("\trN\trS\tt\trN0\trS0\tt0")

    if options.with_counts:
        options.stdout.write("\t%s" % Genomics.SequencePairInfo().getHeader())

    options.stdout.write("\terror_str\n")

    if options.replicates is not None:
        # benchmarking mode: the input contains len(ids)/replicates
        # alignments of equal size, processed one after the other
        ids = mali.getIdentifiers()
        assert (len(ids) % options.replicates == 0)
        s = len(ids) / options.replicates
        for x in range(0, len(ids), s):
            m = Mali.Mali()
            for id in ids[x:x + s]:
                m.addEntry(mali.getEntry(id))
            processMali(m, options)
    else:
        processMali(mali, options)

    E.Stop()
def __init__(self):
    """Initialise the container with an empty alignment and an
    empty slot for the raw stockholm input (set later by callers)."""
    self.mMali = Mali.Mali()
    self.mStockholm = None
def ProcessResult(result, options, mali=None, prefix=None, p_value=None):
    """Output a (SLR) site-analysis *result* according to options.method.

    Methods:
      summary-slr         -- one summary line of the raw SLR result.
      summary-filtered    -- summary after filtering sites against *mali*
                             (only fully occupied columns are counted).
      *-site-table/-list  -- per-sequence tables/lists of selected sites.

    :param result: SLR result object with mSites and summary statistics.
    :param options: command line options.
    :param mali: multiple alignment (required for all filtered methods).
    :param prefix: optional prefix column for each output line.
    :param p_value: optional p-value to report; "na" if not given.
    :returns: a Result with filtered counts for summary-filtered,
        otherwise None.
    :raises ValueError: if mali width and number of sites disagree.
    """
    counts = None

    if options.method == "summary-slr":

        thresholds = "95%", "99%", "95% corrected", "99% corrected"

        if prefix:
            options.stdout.write("%s\t" % prefix)

        options.stdout.write("%5.2f\t%5.2f\t%5.2f\t%6.4f\t%i\t%i\t%i\t" % (
            result.mTreeLength,
            result.mOmega,
            result.mKappa,
            result.mLogLikelihood,
            len(result.mSites),
            result.mNSitesSynonymous,
            result.mNSitesGaps + result.mNSitesSingleChar,
        ))
        options.stdout.write("\t".join(
            map(lambda x: "%i" % result.mNPositiveSites[x][0], thresholds)))
        options.stdout.write("\t")
        options.stdout.write("\t".join(
            map(lambda x: "%i" % result.mNNegativeSites[x], thresholds)))
        options.stdout.write("\n")

    elif options.method in ("summary-filtered", "positive-site-table",
                            "negative-site-table", "neutral-site-table",
                            "positive-site-list", "negative-site-list",
                            "neutral-site-list"):

        mali_length = mali.getLength()
        mali_width = mali.getWidth()
        column_data = map(
            lambda x: Mali.MaliData(x, gap_chars="Nn", mask_chars="-."),
            mali.getColumns())

        # sanity check: do lengths of mali and # of sites correspond
        if len(result.mSites) * 3 != mali_width:
            # was a string exception - a TypeError since Python 2.6
            raise ValueError(
                "mali (%i) and # of sites (%i) do not correspond." %
                (mali_width, len(result.mSites)))

        if options.method == "summary-filtered":

            # count sites, but filter with multiple alignment
            ntotal = 0
            npositive = 0
            nnegative = 0
            nneutral = 0
            nfiltered = 0
            nsynonymous = 0

            if prefix:
                options.stdout.write("%s\t" % prefix)

            for x in range(len(result.mSites)):
                site = result.mSites[x]
                column = column_data[x * 3]

                # skip columns that are not fully occupied
                if column.mNChars != mali_length:
                    nfiltered += 1
                    continue

                if site.isPositive(options.significance_threshold,
                                   options.use_adjusted):
                    npositive += 1
                elif site.isNegative(options.significance_threshold,
                                     options.use_adjusted):
                    nnegative += 1

                if site.isSynonymous():
                    nsynonymous += 1

                ntotal += 1

            options.stdout.write(
                "%5.2f\t%5.2f\t%5.2f\t%6.4f\t%i\t%i\t%i\t%i\t%i\t%i\n" %
                (result.mTreeLength, result.mOmega, result.mKappa,
                 result.mLogLikelihood, len(result.mSites), nfiltered, ntotal,
                 nsynonymous, nnegative, npositive))
            counts = Result(nfiltered, ntotal, nsynonymous, nnegative,
                            npositive)

        elif options.method in (
                "positive-site-table",
                "negative-site-table",
                "neutral-site-table",
                "positive-site-list",
                "negative-site-list",
                "neutral-site-list",
        ):

            select_positive_sites = options.method in ("positive-site-table",
                                                       "positive-site-list")
            select_negative_sites = options.method in ("negative-site-table",
                                                       "negative-site-list")

            # iterate over sites and output those under positive/negative
            # selection
            identifiers = mali.getIdentifiers()
            chars_per_row = [[] for x in range(mali_length)]

            sites = []

            for col in range(len(result.mSites)):

                site = result.mSites[col]
                column = column_data[col * 3]

                # skip columns that are not fully occupied
                if column.mNChars != mali_length:
                    continue

                keep = False

                if select_positive_sites and site.isPositive(
                        options.significance_threshold, options.use_adjusted):
                    keep = True

                elif select_negative_sites and site.isNegative(
                        options.significance_threshold, options.use_adjusted):
                    keep = True

                if not keep:
                    continue

                sites.append((col, site))

            nsites = len(sites)

            if options.truncate_sites_list:
                # truncate sites list, sort by significance
                sites.sort(lambda x, y: cmp(x[1].mPValue, y[1].mPValue))
                sites = sites[:options.truncate_sites_list]

            for col, site in sites:

                site = result.mSites[col]
                xcol = col * 3

                for row in range(mali_length):
                    id = identifiers[row]
                    x = max(xcol - options.context_size * 3, 0)
                    y = min(xcol + 3 + options.context_size * 3, mali_width)
                    segment = mali[id][x:y]
                    codon = mali[id][xcol:xcol + 3]
                    pos = mali.getResidueNumber(id, xcol)
                    pos /= 3

                    # save as real-world coordinates
                    chars_per_row[row].append(
                        PositionInformation(
                            Genomics.MapCodon2AA(codon), pos + 1, xcol,
                            Genomics.TranslateDNA2Protein(segment).upper()))

            if p_value is not None:
                pp_value = p_value
            else:
                pp_value = "na"

            if options.method in ("positive-site-table",
                                  "negative-site-table",
                                  "neutral-site-table"):

                if options.context_size:
                    for row in range(mali_length):
                        if prefix:
                            options.stdout.write("%s\t" % prefix)

                        options.stdout.write(
                            "%s\t%i\t%s\t%s\n" %
                            (identifiers[row], nsites, pp_value, ";".join([
                                "%s%i in %s" %
                                (x.mAA, x.mSequencePosition, x.mContext)
                                for x in chars_per_row[row]
                            ])))
                else:
                    for row in range(mali_length):
                        if prefix:
                            options.stdout.write("%s\t" % prefix)

                        options.stdout.write(
                            "%s\t%i\t%s\t%s\n" %
                            (identifiers[row], nsites, pp_value, ";".join([
                                "%s%i" % (x.mAA, x.mSequencePosition)
                                for x in chars_per_row[row]
                            ])))

            elif options.method in ("positive-site-list",
                                    "negative-site-list",
                                    "neutral-site-list"):

                for row in range(mali_length):

                    if prefix:
                        xprefix = "%s\t%s" % (prefix, identifiers[row])
                    else:
                        xprefix = "%s" % (identifiers[row])

                    x = 0
                    for chars in chars_per_row[row]:
                        x += 1
                        options.stdout.write(
                            "%s\t%i\t%s\t%i\t%i\t%s\n" %
                            (xprefix, x, chars.mAA, chars.mSequencePosition,
                             chars.mMaliPosition, chars.mContext))

    options.stdout.flush()

    return counts
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Computes pairwise distances for sequences in a multiple alignment,
    dispatching to phylip, baseml, xrate or built-in estimators
    depending on the alphabet and the chosen method/distance.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: mali2rates.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-i", "--input-format", dest="input_format",
                      type="choice",
                      choices=("plain", "fasta", "clustal", "stockholm",
                               "phylip"),
                      help="input format of multiple alignment")

    parser.add_option(
        "-s", "--sites", dest="sites", type="string",
        help="sites to use [default=%default].",
    )

    parser.add_option(
        "-f", "--file", dest="filename", type="string",
        help="filename of multiple alignment (- for stdin) [default=%default].",
        metavar="FILE")

    parser.add_option("-o", "--format", dest="format", type="string",
                      help="format [default=%default].",
                      metavar="format")

    parser.add_option(
        "-d", "--distance", dest="distance", type="choice",
        choices=("PID", "T92", "JC69", "POVL", "F84", "LogDet", "K80", "F81",
                 "HKY85", "TN93", "REV", "UNREST", "REVU", "UNRESTU", "JTT",
                 "PMB", "PAM", "Kimura", "CategoriesModel"),
        help="method to use for distance calculation [default=%default].")

    parser.add_option("--method", dest="method", type="choice",
                      choices=("phylip", "baseml", "own", "xrate"),
                      help="program to use for rate calculation.")

    parser.add_option("--output-format", dest="output_format", type="choice",
                      choices=("list", "tree"),
                      help="output format.")

    parser.add_option(
        "-m", "--min-sites", dest="min_sites", type="int",
        help="minimum number of sites for output[default=%default].",
    )

    parser.add_option(
        "-a", "--alphabet", dest="alphabet", type="choice",
        choices=("aa", "na", "auto"),
        help="alphabet to use.",
    )

    parser.add_option("-t", "--filename-tree", dest="filename_tree",
                      type="string",
                      help="filename with tree information.")

    parser.add_option("--set-alpha", dest="alpha", type="float",
                      help="initial alpha value.")

    parser.add_option("--fix-alpha", dest="fix_alpha", action="store_true",
                      help="do not estimate alpha.")

    parser.add_option("--set-kappa", dest="kappa", type="float",
                      help="initial kappa value.")

    parser.add_option("--fix-kappa", dest="fix_kappa", action="store_true",
                      help="do not estimate kappa.")

    parser.add_option("--dump", dest="dump", action="store_true",
                      help="dump output.")

    parser.add_option("--test", dest="test", action="store_true",
                      help="test run - does not clean up.")

    parser.add_option("--pairwise", dest="pairwise", action="store_true",
                      help="force pairwise comparison.")

    parser.add_option(
        "--set-clean-data", dest="clean_data", type="choice",
        choices=("0", "1"),
        help=
        "PAML should cleanup data: 0=only gaps within pair are removed, 1=columns in the mali with gaps are removed."
    )

    parser.add_option(
        "--with-counts", dest="with_counts", action="store_true",
        help=
        "output counts of aligned positions, transitions and transversions.")

    parser.add_option("-w", "--write", dest="write", type="choice",
                      action="append",
                      choices=("input", "trained", "all"),
                      help="output sections to write for xrate.")

    parser.add_option("--output-pattern", dest="output_pattern",
                      type="string",
                      help="output pattern for output files.")

    parser.add_option("--xrate-min-increment", dest="xrate_min_increment",
                      type=float,
                      help="minimum increment to stop iteration in xrate.")

    parser.set_defaults(
        input_format="fasta",
        filename_tree=None,
        with_counts=False,
        sites="d4",
        distance="T92",
        min_sites=1,
        filename="-",
        alphabet="auto",
        format="%6.4f",
        method="phylip",
        kappa=None,
        fix_kappa=False,
        alpha=None,
        fix_alpha=False,
        dump=False,
        clean_data=None,
        output_format="list",
        iteration="all-vs-all",
        pairwise=False,
        report_step=1000,
        output_pattern="%s.eg",
        write=[],
        test_xrate=False,
        xrate_min_increment=None,
        is_codons=False,
    )

    (options, args) = E.Start(parser)

    if options.filename != "-":
        infile = open(options.filename, "r")
    else:
        infile = sys.stdin

    # read multiple alignment
    if options.pairwise:
        # read sequences, but not as a multiple alignment. This permits
        # multiple names.
        mali = Mali.SequenceCollection()
        options.iteration = "pairwise"
    else:
        mali = Mali.Mali()

    mali.readFromFile(infile, format=options.input_format)

    ids = mali.getIdentifiers()

    if options.alphabet == "auto":
        # guess the alphabet: mostly [acgtxn] -> nucleotides
        s = "".join(map(lambda x: x.mString, mali.values())).lower()
        ss = re.sub("[acgtxn]", "", s)
        if float(len(ss)) < (len(s) * 0.1):
            options.alphabet = "na"
            if mali.getNumColumns() % 3 == 0:
                options.is_codons = True
        else:
            options.alphabet = "aa"

        if options.loglevel >= 1:
            options.stdlog.write("# autodetected alphabet: %s\n" %
                                 options.alphabet)

    if options.filename != "-":
        infile.close()

    npairs = 0
    nskipped_length = 0
    nskipped_distance = 0

    pairs = []

    if options.iteration == "all-vs-all":
        for x in range(len(ids) - 1):
            for y in range(x + 1, len(ids)):
                pairs.append((x, y))
    elif options.iteration == "first-vs-all":
        for y in range(1, len(ids)):
            pairs.append((0, y))
    elif options.iteration == "pairwise":
        if len(ids) % 2 != 0:
            # was a string exception - a TypeError since Python 2.6
            raise ValueError(
                "uneven number of sequences (%i) not compatible with --iteration=pairwise"
                % len(ids))
        for x in range(0, len(ids), 2):
            pairs.append((x, x + 1))

    if options.alphabet == "na":

        if options.method == "baseml":
            runBaseML(mali, pairs, options)
        elif options.method == "phylip" and options.distance in (
                "F84", "K80", "JC69", "LogDet"):
            runDNADIST(mali, pairs, options)
        elif options.method == "xrate":
            runXrate(mali, pairs, options)
        else:
            # built-in estimators
            if options.is_codons:
                h = Genomics.SequencePairInfoCodons().getHeader()
            else:
                h = Genomics.SequencePairInfo().getHeader()
            options.stdout.write("seq1\tseq2\tdist\tvar\t%s\n" % (h))

            for x, y in pairs:
                id_x = ids[x]
                npairs += 1

                id_y = ids[y]

                info = Genomics.CalculatePairIndices(
                    mali[id_x], mali[id_y], with_codons=options.is_codons)

                if options.distance in ("T92", "JC69"):
                    if options.sites == "d4":
                        seq1, seq2 = Genomics.GetDegenerateSites(mali[id_x],
                                                                 mali[id_y],
                                                                 position=3,
                                                                 degeneracy=4)

                        if len(seq1) < options.min_sites:
                            nskipped_length += 1
                            continue
                    else:
                        # was a string exception
                        raise ValueError("unknown sites %s" % options.sites)

                if options.distance == "T92":
                    distance, variance = CalculateDistanceT92(info)
                elif options.distance == "JC69":
                    distance, variance = CalculateDistanceJC69(info)
                elif options.distance == "PID":
                    distance, variance = CalculateDistancePID(
                        mali[id_x], mali[id_y])
                elif options.distance == "POVL":
                    distance, variance = CalculateDistancePOVL(
                        mali[id_x], mali[id_y])

                if distance >= 0:
                    options.stdout.write("\t".join(
                        map(str, (id_x, id_y, options.format % distance,
                                  options.format % variance, info))) + "\n")
                else:
                    nskipped_distance += 1

    elif options.alphabet == "aa":

        if options.distance in ("JTT", "PMB", "PAM", "Kimura",
                                "CategoriesModel"):

            # use phylip for these
            phylip = WrapperPhylip.Phylip()
            phylip.setProgram("protdist")
            phylip.setMali(mali)

            phylip_options = []
            # protdist cycles models with "D"; the number of presses
            # selects the model (default with no "D" is JTT)
            if options.distance == "PMB":
                # fixed: was "PMG", which can never match the "PMB"
                # choice and silently fell through to JTT
                phylip_options += ["D"] * 1
            elif options.distance == "PAM":
                phylip_options += ["D"] * 2
            elif options.distance == "Kimura":
                phylip_options += ["D"] * 3
            elif options.distance == "CategoriesModel":
                phylip_options += ["D"] * 4

            phylip_options.append("Y")
            phylip.setOptions(phylip_options)
            result = phylip.run()

            writePhylipResult(result, options)

        else:
            options.stdout.write("id1\tid2\tdist\tvar\n")

            # iterate over all pairs of sequences
            for x, y in pairs:
                id_x = ids[x]
                npairs += 1

                id_y = ids[y]

                if options.distance == "PID":
                    distance, variance = CalculateDistancePID(
                        mali[id_x], mali[id_y])
                elif options.distance == "POVL":
                    # percentage overlap
                    distance, variance = CalculateDistancePOVL(
                        mali[id_x], mali[id_y])

                if distance >= 0:
                    options.stdout.write("\t".join(
                        (id_x, id_y, options.format % distance,
                         options.format % variance)) + "\n")
                else:
                    nskipped_distance += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# nseqs=%i, npairs=%i, nskipped_length=%i, nskipped_distance=%i\n"
            % (len(ids), npairs, nskipped_length, nskipped_distance))

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Creates bootstrap samples of a multiple alignment read from stdin,
    writing each sample either to its own file (via
    --output-filename-pattern) or to stdout separated by --separator.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: mali2bootstrap.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-i", "--input-format", dest="input_format",
                      type="choice",
                      choices=("plain", "fasta", "clustal", "stockholm",
                               "phylip"),
                      help="input format of multiple alignment")

    parser.add_option("-o", "--output-format", dest="output_format",
                      type="choice",
                      choices=("plain", "fasta", "stockholm", "phylip"),
                      help="output format of multiple alignment")

    parser.add_option(
        "-p", "--output-filename-pattern", dest="output_filename_pattern",
        type="string",
        help=
        "pattern for output filenames. Should contain a %(id)i. If not given, the output is to stdout with --separator [default=%default]."
    )

    parser.add_option("-n", "--samples", dest="samples", type="int",
                      help="number of samples.")

    parser.add_option(
        "-r", "--no-replacement", dest="no_replacement", type="int",
        help=
        "sample without replacement. The parameter gives the size of the multiple alignment [default=%default]."
    )

    parser.add_option("-b", "--block-size", dest="block_size", type="int",
                      help="block size. Use 3 for sampling from codons.")

    parser.set_defaults(input_format="fasta",
                        output_format="fasta",
                        samples=10,
                        block_size=1,
                        output_filename_pattern=None,
                        no_replacement=None,
                        separator="//")

    (options, args) = E.Start(parser)

    mali = Mali.Mali()
    mali.readFromFile(sys.stdin, format=options.input_format)

    for x in range(options.samples):

        if options.no_replacement is not None:
            new_mali = getSampledMali(mali, options.no_replacement,
                                      options.block_size)
        else:
            new_mali = getBootstrappedMali(mali, options.block_size)

        if options.output_filename_pattern:
            filename = options.output_filename_pattern % {"id": x + 1}
            target_directory = os.path.dirname(filename)
            # guard against dirname == "" (pattern without a directory):
            # os.makedirs("") raises OSError
            if target_directory and not os.path.exists(target_directory):
                os.makedirs(target_directory)
            outfile = open(filename, "w")
            E.info("creating mali %s" % filename)
        else:
            outfile = options.stdout

        new_mali.writeToFile(outfile, format=options.output_format)

        if outfile == sys.stdout:
            # separate samples on the shared stream; no separator after
            # the last one
            if options.separator and x < options.samples - 1:
                options.stdout.write(options.separator + "\n")
        else:
            outfile.close()

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    # fall back to the process arguments when none are supplied
    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-o", "--gop", dest="gop", type="float",
                      help="gap opening penalty [default=%default].")

    parser.add_option("-e", "--gep", dest="gep", type="float",
                      help="gap extension penalty [default=%default].")

    parser.add_option(
        "-m", "--mode", dest="mode", type="choice",
        choices=("global", "local"),
        help="alignment mode, global=nw, local=sw [default=%default].")

    parser.set_defaults(
        gop=-12.0,
        gep=-2.0,
        format="fasta",
        mode="local",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please supply two multiple alignments in FASTA format.")

    first_mali = Mali.Mali()
    second_mali = Mali.Mali()

    E.info("read 2 multiple alignments")
    first_mali.readFromFile(IOTools.openFile(args[0], "r"),
                            format=options.format)
    second_mali.readFromFile(IOTools.openFile(args[1], "r"),
                             format=options.format)

    alignlib_mali1 = Mali.convertMali2Alignlib(first_mali)
    alignlib_mali2 = Mali.convertMali2Alignlib(second_mali)

    if options.mode == "local":
        alignment_mode = alignlib_lite.py_ALIGNMENT_LOCAL
    elif options.mode == "global":
        alignment_mode = alignlib_lite.py_ALIGNMENT_GLOBAL

    alignator = alignlib_lite.py_makeAlignatorDPFull(alignment_mode,
                                                     options.gop,
                                                     options.gep)

    # configure the alignlib defaults used when building the profiles
    alignlib_lite.py_setDefaultEncoder(
        alignlib_lite.py_getEncoder(alignlib_lite.py_Protein20))
    alignlib_lite.py_setDefaultLogOddor(
        alignlib_lite.py_makeLogOddorDirichlet(0.3))
    alignlib_lite.py_setDefaultRegularizor(
        alignlib_lite.py_makeRegularizorDirichletPrecomputed())

    profile1 = alignlib_lite.py_makeProfile(alignlib_mali1)
    profile2 = alignlib_lite.py_makeProfile(alignlib_mali2)

    # align profile against profile
    result = alignlib_lite.py_makeAlignmentVector()
    alignator.align(result, profile1, profile2)

    E.debug("result=\n%s" % alignlib_lite.py_AlignmentFormatEmissions(result))

    # merge the second alignment into the first via the computed mapping
    alignlib_mali1.add(alignlib_mali2, result)

    merged = Mali.convertAlignlib2Mali(
        alignlib_mali1,
        identifiers=first_mali.getIdentifiers() +
        second_mali.getIdentifiers())

    merged.writeToFile(options.stdout, format=options.format)

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    Parses command line options in sys.argv, unless *argv* is given.
    Reads a codon multiple alignment from stdin, optionally annotates it
    with exon information, splits misaligned exons, removes/masks
    frameshifted columns relative to one or more master sequences, and
    writes the pruned alignment to stdout in FASTA format.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/prune_multiple_alignment.py 2654 2009-05-06 13:51:22Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-m", "--master", dest="master", type="string",
                      help="master sequence.")

    parser.add_option("-p", "--master-pattern", dest="master_pattern",
                      type="string",
                      help="master pattern.")

    parser.add_option("--master-species", dest="master_species",
                      type="string",
                      help="species to use as master sequences.")

    parser.add_option("-t", "--translate", dest="filename_translation",
                      type="string",
                      help="filename on where to store translated sequences.")

    parser.add_option("-e", "--exons", dest="filename_exons", type="string",
                      help="filename on where to exon information.")

    parser.add_option("-c", "--mark-codons", dest="mark_codons",
                      action="store_true",
                      help="mark codons.")

    parser.add_option(
        "-i", "--ignore-case", dest="ignore_case", action="store_true",
        help="ignore case (otherwise: lowercase are unaligned chars).")

    parser.add_option("--remove-stops", dest="remove_stops",
                      action="store_true",
                      help="remove stop codons.")

    parser.add_option("--mask-stops", dest="mask_stops", action="store_true",
                      help="mask stop codons.")

    parser.add_option("--mask-char", dest="mask_char", type="string",
                      help="masking character to use.")

    parser.add_option("-f", "--remove-frameshifts", dest="remove_frameshifts",
                      action="store_true",
                      help="remove columns corresponding to frameshifts.")

    parser.add_option(
        "--mask-master", dest="mask_master", action="store_true",
        help=
        "columns in master to be removed are masked to keep residue numbering."
    )

    parser.add_option(
        "-s", "--split-exons", dest="split_exons", action="store_true",
        help="split columns aligned to different exons in the same gene.")

    parser.add_option("-a", "--target", dest="target", type="choice",
                      choices=("paml", ),
                      help="perform cleaning up for certain targets.")

    parser.set_defaults(
        gap_char="-",
        mask_char="n",
        gap_chars="-.",
        separator="|",
        master=None,
        master_species=None,
        filename_translation=None,
        filename_exons=None,
        master_pattern=None,
        remove_stops=False,
        mark_codons=False,
        mask_unaligned=False,
        split_exons=False,
        remove_frameshifts=False,
        min_segment_length=5,
        ignore_case=False,
        mask_stops=False,
        target=None,
        mask_master=False,
    )

    (options, args) = E.Start(parser)

    if options.target == "paml":
        options.mask_stops = True
        options.mask_char = "n"
        options.remove_frameshifts = True

        if options.loglevel >= 1:
            options.stdlog.write(
                "# setting output to paml : removing frameshifts, masking stops with '%s'.\n"
                % (options.mask_char))

    ## 1. read multiple alignment in fasta format
    mali = Mali.Mali()
    mali.readFromFile(sys.stdin)

    if options.loglevel >= 1:
        options.stdlog.write("# read mali with %i entries.\n" % len(mali))

    if len(mali) == 0:
        # BUG FIX: raising a string is a TypeError since Python 2.6;
        # raise a proper exception instead.
        raise ValueError("empty multiple alignment")

    identifiers = mali.getIdentifiers()

    # select master sequences: explicit list > pattern > species > first entry
    masters = []
    if options.master:
        masters = options.master.split(",")
    elif options.master_pattern:
        for id in identifiers:
            if re.search(options.master_pattern, id):
                masters.append(id)
    elif options.master_species:
        for id in identifiers:
            if options.master_species == id.split(options.separator)[0]:
                masters.append(id)
    else:
        masters.append(identifiers[0])

    if options.loglevel >= 2:
        options.stdlog.write("# master sequences are: %s\n" % str(masters))
        options.stdlog.flush()

    if options.filename_exons:
        exons = Exons.ReadExonBoundaries(open(options.filename_exons, "r"),
                                         filter=set(identifiers),
                                         from_zero=True)

        if options.loglevel >= 2:
            options.stdlog.write("# read exons %i sequences.\n" % len(exons))
    else:
        exons = {}

    #########################################################################
    ## translate characters to upper/lower case according to exon info.
    #########################################################################
    if exons:
        for id in identifiers:
            if id in exons:
                mali.getSequence(id).mString = AddExonInformation(
                    mali[id], exons[id], mask_char=options.mask_char)
    elif options.ignore_case:
        ## convert all to uppercase
        mali.upper()

    #########################################################################
    ## untangle misaligned exons
    #########################################################################
    if exons and options.split_exons:

        ## first split with masters
        if len(masters) > 0:
            SplitExons(mali, exons, masters=masters, options=options)

            if options.loglevel >= 4:
                mali.writeToFile(open("log_mali1", "w"), format="fasta")

        SplitExons(mali, exons, options)

    #########################################################################
    ## remove frameshifts
    #########################################################################
    # ids flagged for removal; defined here so the output loop below can
    # consult it even when frameshift removal is disabled.
    to_delete = []

    if options.remove_frameshifts:
        out_of_frame_columns = []
        if len(masters) == 1:
            frame_columns = GetFrameColumns(mali, masters[0],
                                            gap_chars=options.gap_chars)
        else:
            columns = []

            for id in masters:
                columns += GetFrameColumns(mali, id,
                                           gap_chars=options.gap_chars)

            if len(columns) == 0:
                columns += GetFrameColumns(mali, identifiers[0],
                                           gap_chars=options.gap_chars)

            # sort all columns by tuple. The "shortest" codon will be first:
            # (1,2,3) before (1,2,100), and (1,2,100) before (1,3,4).
            columns.sort(lambda x, y: cmp((x[0], x[2]), (y[0], y[2])))

            # select codons
            frame_columns = []
            last_codon = columns[0]

            for codon in columns[1:]:
                # skip identical codons
                if codon == last_codon:
                    continue

                # take first (shortest) codon in case of identical first
                # residue
                if codon[0] == last_codon[0]:
                    continue

                # if not overlapping, keep
                if codon[0] > last_codon[2]:
                    frame_columns.append(last_codon)
                else:
                    # if overlapping, but out of register: skip
                    out_of_frame_columns += last_codon

                last_codon = codon

            frame_columns.append(last_codon)

        # build set of in-frame columns
        frame_set = set()
        for column in frame_columns:
            for c in column:
                frame_set.add(c)

        # columns that contain a master sequence that is out of frame
        out_of_frame_set = set(out_of_frame_columns)
        out_of_frame_set = out_of_frame_set.difference(frame_set)

        if options.loglevel >= 1:
            options.stdlog.write("# found %i/%i columns in frame\n" %
                                 (len(frame_columns) * 3, mali.getWidth()))

        if options.loglevel >= 5:
            options.stdlog.write("# frame columns: %i\n" %
                                 (len(frame_columns)))
            x = 0
            for column in frame_columns:
                options.stdlog.write("# %i\t%s\n" %
                                     (x, ",".join(map(str, column))))
                x += 1

        if options.loglevel >= 5:
            options.stdlog.write(
                "# Out-of frame columns with residue of masters: %i\n" %
                (len(out_of_frame_set)))
            options.stdlog.write("# %s" %
                                 ",".join(map(str, out_of_frame_columns)))

        mask_chars = (string.upper(options.mask_char),
                      string.lower(options.mask_char))

        ignore_case = exons or options.ignore_case

        for id in identifiers:

            ngaps, nmasked = 0, 0

            sequence = mali.getSequence(id).mString

            if options.loglevel >= 7:
                options.stdlog.write(
                    "# processing sequence %s of length %i with gaps\n" %
                    (id, len(sequence)))

            ## treat masters differently if they are only to be masked, not
            ## pruned.
            ## simple mask all characters that are to skipped
            fragments = []
            nstops, ncodons, naligned = 0, 0, 0

            codon = []
            chars = []

            is_master = id in masters

            for x in range(len(sequence)):
                c = sequence[x]

                ## delete columns that do not align to
                ## a master.
                if x not in frame_set and x not in out_of_frame_set:
                    continue

                chars.append(c)
                if c not in options.gap_chars:
                    codon.append(c)

                # a complete (or empty) codon has been collected
                if len(codon) % 3 == 0:
                    codon = "".join(codon)
                    codon_is_ok, codon_is_aligned, codon_is_all_gaps = checkCodon(
                        codon, options)

                    if codon_is_aligned:
                        naligned += 1

                    to_mask = False
                    if codon_is_all_gaps:
                        ngaps += len(chars)
                    elif codon_is_ok:
                        ncodons += 1
                        if string.upper(codon) in ("TAG", "TAA", "TGA"):
                            nstops += 1
                            to_mask = True
                    else:
                        to_mask = True
                        nmasked += 1

                    if to_mask:
                        for i in range(len(chars)):
                            if chars[i] not in options.gap_chars:
                                chars[i] = options.mask_char

                    fragments.append("".join(chars))
                    chars = []
                    codon = []

            ## mask incomplete codons at the end
            if chars:
                for i in range(len(chars)):
                    if chars[i] not in options.gap_chars:
                        chars[i] = options.mask_char
                fragments.append("".join(chars))

            if options.loglevel >= 1:
                options.stdlog.write(
                    "# sequence: %s\tpositions: %i\taligned:%i\tcodons: %i\t stops: %i\tgaps: %i\tnmasked: %i\n"
                    % (id, len(fragments), naligned, ncodons, nstops, ngaps,
                       nmasked))
                options.stdlog.flush()

            ## postpone deletion in order to not
            ## confuse the iteration of ids
            if naligned == 0:
                options.stdlog.write(
                    "# sequence: %s removed because there are no aligned nucleotides.\n"
                    % id)
                to_delete.append(id)
            elif ncodons == 0:
                options.stdlog.write(
                    "# sequence: %s removed because there are no aligned codons.\n"
                    % id)
                to_delete.append(id)
            else:
                mali.setSequence(id, string.join(fragments, ""))

        for id in to_delete:
            del mali[id]

    for id in identifiers:
        # BUG FIX: sequences pruned above were deleted from the mali;
        # indexing them here raised a KeyError.
        if id in to_delete:
            continue
        if options.mark_codons:
            a = mali[id]
            f = lambda x: a[x:x + 3]
            s = string.join([f(x) for x in range(0, len(a), 3)], " ")
        else:
            s = mali[id]
        options.stdout.write(">%s\n%s\n" % (id, s))

    if options.filename_translation:
        outfile = open(options.filename_translation, "w")
        for id in mali.keys():
            outfile.write(">%s\n%s\n" %
                          (id, Genomics.TranslateDNA2Protein(mali[id])))
        outfile.close()

    E.Stop()
def runCodeML(mali, tree, has_non_overlaps, pairs, map_new2old, options):
    """setup codeml wrapper.

    Sets options and returns a wrapper.

    Translates command-line options into a codeml option dictionary, then
    either runs codeml pairwise over *pairs* (when ``--pairwise`` is set or
    the alignment has non-overlapping pairs) or once over the whole *mali*
    with *tree*. Results are printed via printPairs().
    """

    ids = mali.getIdentifiers()

    ## setup codeml
    codeml_options = {}

    # map symbolic sequence type onto codeml's numeric codes
    if options.seqtype == "codon":
        codeml_options["seqtype"] = "1"
    elif options.seqtype == "aa":
        codeml_options["seqtype"] = "2"
    elif options.seqtype == "trans":
        codeml_options["seqtype"] = "3"

    if options.clean_data:
        codeml_options["cleandata"] = options.clean_data

    if options.omega is not None:
        codeml_options["omega"] = str(options.omega)

    if options.kappa is not None:
        codeml_options["kappa"] = str(options.kappa)

    if options.fix_kappa:
        codeml_options["fix_kappa"] = "1"

    if options.fix_omega:
        codeml_options["fix_omega"] = "1"

    if options.codon_frequencies is not None:
        # translate symbolic codon-frequency models to codeml codes;
        # pass anything unrecognized through verbatim
        c = options.codon_frequencies.upper()
        if c == "UNIFORM":
            a = "0"
        elif c == "F1X4":
            a = "1"
        elif c == "F3X4":
            a = "2"
        elif c == "F61":
            a = "3"
        else:
            a = options.codon_frequencies
        codeml_options["CodonFreq"] = a

    if options.paml_method is not None:
        # BUG FIX: the guard tests options.paml_method but the value was
        # read from options.method -- use the guarded attribute.
        codeml_options["paml_method"] = str(options.paml_method)

    if options.optimization_threshold is not None:
        codeml_options["Small_Diff"] = str(options.optimization_threshold)

    ninput, noutput, nskipped = 0, 0, 0
    tstart = time.time()

    if pairs and (options.pairwise or has_non_overlaps):
        wrapper = WrapperCodeML.CodeMLPairwise()
        ## do pairwise run
        result = WrapperCodeML.CodeMLResultPairs()

        ntotal = (len(ids) * (len(ids) - 1)) / 2

        for x, y in pairs:
            m1 = mali.getSequence(ids[x])
            ninput += 1
            temp_mali = Mali.Mali()
            m2 = mali.getSequence(ids[y])

            temp_mali.addSequence(ids[x], m1.mFrom, m1.mTo, m1.mString)
            temp_mali.addSequence(ids[y], m2.mFrom, m2.mTo, m2.mString)

            ## remove empty columns and masked columns
            if options.clean_mali:
                temp_mali.mGapChars = temp_mali.mGapChars + ("n", "N")
                temp_mali.removeGaps(minimum_gaps=1, frame=3)

            if temp_mali.getWidth() < options.min_overlap:
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# pair %s-%s: not computed because only %i residues overlap\n"
                        % (mali.getEntry(ids[x]).mId,
                           mali.getEntry(ids[y]).mId,
                           temp_mali.getWidth()))
                nskipped += 1
                continue

            sub_result = wrapper.Run(temp_mali,
                                     options=codeml_options,
                                     dump=options.dump)
            result.mPairs += sub_result.mPairs

            if options.loglevel >= 1 and ninput % options.report_step == 0:
                options.stdlog.write(
                    "# pairwise computation: %i/%i -> %i%% in %i seconds.\n" %
                    (ninput, ntotal, 100.0 * ninput / ntotal,
                     time.time() - tstart))
                options.stdlog.flush()

            noutput += printPairs(sub_result.mPairs, mali, map_new2old,
                                  options)
            options.stdout.flush()

        if options.loglevel >= 1:
            options.stdlog.write(
                "# pairwise computation: ninput=%i, noutput=%i, nskipped=%i\n"
                % (ninput, noutput, nskipped))
            options.stdlog.flush()
    else:
        wrapper = WrapperCodeML.CodeML()
        result = wrapper.Run(mali,
                             tree=tree,
                             options=codeml_options,
                             dump=options.dump)

        result_pairs = WrapperCodeML.CodeMLResultPairs()
        result_pairs.fromResult(result)

        noutput += printPairs(result_pairs.mPairs, mali, map_new2old, options)

        l = mali.getLength()

        if options.loglevel >= 1:
            options.stdlog.write("# input=%i, npairs=%i, noutput=%i\n" %
                                 (l, l * (l - 1) / 2,
                                  len(result_pairs.mPairs)))
def main(argv=None):
    """script main.

    Parses command line options in sys.argv, unless *argv* is given.
    Reads sequences in FASTA format from stdin and, for method "add",
    aligns each one into an existing multiple alignment (given as the
    first parameter of ``--parameters``), writing the extended alignment
    to stdout.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: sequences2mali.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-i", "--input-format", dest="input_format",
                      type="choice",
                      choices=("plain", "fasta", "clustal", "stockholm",
                               "phylip"),
                      help="input format of multiple alignment")

    parser.add_option("-o", "--output-format", dest="output_format",
                      type="choice",
                      choices=("plain", "fasta", "stockholm", "phylip"),
                      help="output format of multiple alignment")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("add", ),
                      help="""method to use to build multiple alignment.""")

    parser.add_option("-p", "--parameters", dest="parameters", type="string",
                      help="parameter stack for methods that require one.")

    parser.add_option("-a", "--alignment-method", dest="alignment_method",
                      type="choice",
                      choices=("sw", "nw"),
                      help="alignment_method [%default].")

    parser.set_defaults(
        input_format="fasta",
        output_format="fasta",
        method=None,
        parameters="",
        gop=-10.0,
        gep=-1.0,
        alignment_method="sw",
    )

    (options, args) = E.Start(parser)

    options.parameters = options.parameters.split(",")

    iterator = FastaIterator.iterate(sys.stdin)

    if options.method == "add":

        mali = Mali.Mali()
        mali.readFromFile(open(options.parameters[0], "r"),
                          format=options.input_format)
        del options.parameters[0]

        old_length = mali.getLength()

        new_mali = convertMali2Mali(mali)

        # sw = Smith-Waterman (local), otherwise Needleman-Wunsch (global)
        if options.alignment_method == "sw":
            alignator = alignlib_lite.py_makeAlignatorFullDP(
                options.gop, options.gep)
        else:
            alignator = alignlib_lite.py_makeAlignatorFullDPGlobal(
                options.gop, options.gep)

        while 1:
            # ROBUSTNESS: generator-based iterators signal exhaustion by
            # raising StopIteration rather than returning None.
            try:
                cur_record = iterator.next()
            except StopIteration:
                break
            if cur_record is None:
                break

            map_mali2seq = alignlib_lite.py_makeAlignataVector()

            sequence = alignlib_lite.py_makeSequence(cur_record.sequence)
            profile = alignlib_lite.py_makeProfileFromMali(new_mali)

            if options.loglevel >= 4:
                options.stdlog.write(profile.Write())

            alignator.Align(profile, sequence, map_mali2seq)

            if options.loglevel >= 3:
                options.stdlog.write(map_mali2seq.Write())

            ## add sequence to mali
            a = alignlib_lite.py_makeAlignatumFromString(cur_record.sequence)
            # transfer ownership to the C++ side
            a.thisown = 0

            new_mali.addAlignatum(a, map_mali2seq, 1, 1, 1, 1, 1)

            id = cur_record.title
            mali.mIdentifiers.append(id)
            mali.mMali[id] = Mali.AlignedString(
                id, 0, len(cur_record.sequence),
                new_mali.getRow(new_mali.getWidth() - 1).getString())

        # substitute: copy the (possibly re-gapped) rows of the original
        # alignment back into the Mali object
        for x in range(old_length):
            mali.mMali[mali.mIdentifiers[x]].mString = new_mali.getRow(
                x).getString()

        mali.writeToFile(sys.stdout, format=options.output_format)

    E.Stop()
xrate_min_increment=0.000001, with_rho=True, separator="|", single_omega=False, shared_frequencies=False, shared_rates=False, block_size=None, replicates=None, ) (options, args) = Experiment.Start(parser) if options.replicates != None: # read a sequence collection with possible duplicate names # used for benchmarking mali = Mali.SequenceCollection() else: mali = Mali.Mali() mali.readFromFile(sys.stdin, format=options.input_format) options.stdout.write( "seq1\tseq2\tdN\tdS\tdNdS\tN\tS\tdN_err\tdS_err\tkappa\tlnL\ttau\tlen") if options.with_rho: options.stdout.write("\trN\trS\tt\trN0\trS0\tt0") options.stdout.write("\terror_str\n") if options.replicates != None: ids = mali.getIdentifiers()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-o", "--gop", dest="gop", type="float",
                      help="gap opening penalty [default=%default].")
    parser.add_option("-e", "--gep", dest="gep", type="float",
                      help="gap extension penalty [default=%default].")
    parser.add_option("-m", "--mode", dest="mode", type="choice",
                      choices=("global", "local"),
                      help="alignment mode, global=nw, local=sw [default=%default].")

    parser.set_defaults(
        gop=-12.0,
        gep=-2.0,
        format="fasta",
        mode="local",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please supply two multiple alignments in FASTA format.")

    def _read_mali(filename):
        # read one multiple alignment from a (possibly compressed) file
        m = Mali.Mali()
        m.readFromFile(IOTools.openFile(filename, "r"), format=options.format)
        return m

    E.info("read 2 multiple alignments")
    mali1 = _read_mali(args[0])
    mali2 = _read_mali(args[1])

    cmali1 = Mali.convertMali2Alignlib(mali1)
    cmali2 = Mali.convertMali2Alignlib(mali2)

    if options.mode == "local":
        mode = alignlib_lite.py_ALIGNMENT_LOCAL
    elif options.mode == "global":
        mode = alignlib_lite.py_ALIGNMENT_GLOBAL

    alignator = alignlib_lite.py_makeAlignatorDPFull(
        mode, options.gop, options.gep)

    # configure alignlib defaults for building the protein profiles
    alignlib_lite.py_setDefaultEncoder(
        alignlib_lite.py_getEncoder(alignlib_lite.py_Protein20))
    alignlib_lite.py_setDefaultLogOddor(
        alignlib_lite.py_makeLogOddorDirichlet(0.3))
    alignlib_lite.py_setDefaultRegularizor(
        alignlib_lite.py_makeRegularizorDirichletPrecomputed())

    cprofile1 = alignlib_lite.py_makeProfile(cmali1)
    cprofile2 = alignlib_lite.py_makeProfile(cmali2)

    # profile-profile alignment
    result = alignlib_lite.py_makeAlignmentVector()
    alignator.align(result, cprofile1, cprofile2)

    E.debug("result=\n%s" % alignlib_lite.py_AlignmentFormatEmissions(result))

    # merge the second alignment into the first using the mapping
    cmali1.add(cmali2, result)

    outmali = Mali.convertAlignlib2Mali(
        cmali1,
        identifiers=mali1.getIdentifiers() + mali2.getIdentifiers())

    outmali.writeToFile(options.stdout, format=options.format)

    # write footer and output benchmark information.
    E.Stop()
def main(argv=None):
    """script main.

    Parses command line options in sys.argv, unless *argv* is given.
    Post-processes SLR (site-wise likelihood ratio) output: summarizes
    positive/neutral/negative sites, filters them against a multiple
    alignment, emits per-site tables/lists, or tests genes for
    over-representation of positive sites.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/analyze_sites_slr.py 2781 2009-09-10 11:33:14Z andreas $"
    )

    parser.add_option("--method", dest="method", type="choice",
                      choices=("summary-slr", "summary-filtered",
                               "over-representation", "positive-site-table",
                               "negative-site-table", "neutral-site-table",
                               "positive-site-list", "negative-site-list",
                               "neutral-site-list"),
                      help="method to apply.")

    parser.add_option("--prefix", dest="prefix", type="string",
                      help="prefix for rows.")

    parser.add_option("-s", "--filename-sites", dest="filename_sites",
                      type="string",
                      help="filename with sites information.")

    parser.add_option("-l", "--filename-log", dest="filename_log",
                      type="string",
                      help="filename with logging information.")

    parser.add_option(
        "-m", "--filename-mali", dest="filename_mali", type="string",
        help=
        "filename of multiple alignment, that was input to SLR. If given, is used to filter indels."
    )

    parser.add_option(
        "--filter-probability", dest="filter_probability", type="float",
        help="threshold for probability above which to include positive sites."
    )

    parser.add_option("--no-header", dest="write_header",
                      action="store_false",
                      help="only output header.")

    parser.add_option("--only-header", dest="only_header",
                      action="store_true",
                      help="only output header.")

    parser.add_option("--significance-threshold",
                      dest="significance_threshold", type="float",
                      help="threshold for significance tests [%default].")

    parser.add_option("--use-adjusted", dest="use_adjusted",
                      action="store_true",
                      help="use SLR adjusted probability values.")

    parser.add_option("--truncate-sites-list", dest="truncate_sites_list",
                      type="int",
                      help="truncate sites list after ## entries (0 for all).")

    parser.add_option(
        "--context-size", dest="context_size", type="int",
        help="size of left/right context around a selected residue.")

    parser.set_defaults(
        prefix=None,
        filter_probability=0,
        filter_omega=0,
        filename_sites="-",
        filename_log=None,
        filename_mali=None,
        significance_threshold=0.05,
        write_header=True,
        only_header=False,
        use_adjusted=False,
        context_size=0,
        truncate_sites_list=0,
    )

    (options, args) = E.Start(parser)

    slr = WrapperSlr.Slr()

    # a "%s" in the sites filename means one file per cluster; rows then
    # get a prefix column
    if "%s" in options.filename_sites:
        options.prefix = True

    # write headers
    if options.method == "summary-slr":
        if options.write_header or options.only_header:
            if options.loglevel >= 1:
                options.stdlog.write(
                    """# Numbers of positive/neutral/negative sites according to SLR
#
# This uses the thresholds as set in SLR. Use "counts" for filtering
# residues based on your own thresholds
""")

        thresholds = "95%", "99%", "95% corrected", "99% corrected"

        if options.prefix:
            options.stdout.write("prefix\t")
        options.stdout.write("ltree\tomega\tkappa\tlnL\tnsites\tnsyn\tngap\t")
        options.stdout.write("\t".join(
            map(lambda x: "npos_" + x.replace(" ", "_"), thresholds)))
        options.stdout.write("\t")
        options.stdout.write("\t".join(
            map(lambda x: "nneg_" + x.replace(" ", "_"), thresholds)))
        options.stdout.write("\n")

    elif options.method == "summary-filtered":
        if options.write_header or options.only_header:
            if options.loglevel >= 1:
                options.stdlog.write(
                    """# Numbers of positive/neutral/negative sites according to SLR
#
# This method uses the supplied threshold and the multiple alignment to filter.
# All positions that are above the threshold (P-Value) and which are located in
# indels: >= 1 sequence missing from column, are removed.
""")

        if options.prefix:
            options.stdout.write("prefix\t")
        options.stdout.write(
            "ltree\tomega\tkappa\tlnL\tnsites\tnfiltered\tntotal\tnsyn\tnneg\tnpos\n"
        )

    elif options.method in ("positive-site-table", "negative-site-table",
                            "neutral-site-table"):
        if options.write_header or options.only_header:
            if options.loglevel >= 1:
                options.stdlog.write(
                    """# Numbers of positive/neutral/negative sites according to SLR
#
# Note: sequence positions are 1-based, but mali positions are 0-based.
# Residues in indel positions have been removed and signifnicance was determined according
# with a threshold of %5.2e
""" % options.significance_threshold)

        if options.prefix:
            options.stdout.write("prefix\t")
        options.stdout.write("cluster\tnsites\tp-value\tsites\n")

    elif options.method in ("positive-site-list", "negative-site-list",
                            "neutral-site-list"):
        if options.write_header or options.only_header:
            if options.loglevel >= 1:
                options.stdlog.write(
                    """# Sites under positive/neutral/negative selection according to SLR
#
# Note: sequence positions are 1-based, but mali positions are 0-based.
# Residues in indel positions have been removed and signifnicance was determined according
# with a threshold of %5.2e
""" % options.significance_threshold)

        if options.prefix:
            options.stdout.write("prefix\t")
        options.stdout.write("sequence\tn\taa\tseq_pos\tmali_pos\tcontext\n")

    elif options.method == "over-representation":
        if options.write_header or options.only_header:
            if options.loglevel >= 1:
                options.stdlog.write("""# Genes with over-represented sites.
#
# This method uses as input the output of summary-filtered.
""")

    if options.only_header:
        sys.exit(0)

    if options.method in ("summary-slr", "summary-filtered",
                          "positive-site-table", "negative-site-table",
                          "neutral-site-table", "positive-site-list",
                          "negative-site-list", "neutral-site-list"):

        ninput, noutput, nskipped = 0, 0, 0

        if "%s" in options.filename_sites:
            # one SLR output file per cluster; cluster ids come from a
            # table on stdin
            headers, table = CSV.ReadTable(sys.stdin)

            fprefix = headers.index("prefix")

            try:
                fsignificance = headers.index("p")
            except ValueError:
                fsignificance = None

            for row in table:
                id = row[fprefix]
                if fsignificance is not None:
                    p_value = row[fsignificance]
                else:
                    p_value = None

                ninput += 1

                fn = re.sub("%s", id, options.filename_sites)
                if not os.path.exists(fn):
                    nskipped += 1
                    continue

                lines_sites = open(fn, "r").readlines()

                # ROBUSTNESS: default to no log lines; previously lines_log
                # was undefined when --filename-log was not given, causing
                # a NameError below.
                lines_log = []
                if options.filename_log:
                    lines_log = open(re.sub("%s", id, options.filename_log),
                                     "r").readlines()

                result = slr.parseOutput(lines_sites, lines_log)

                if options.method in ("summary-filtered",
                                      "positive-site-table",
                                      "negative-site-table",
                                      "neutral-site-table"):
                    mali = Mali.Mali()
                    mali.readFromFile(
                        open(re.sub("%s", id, options.filename_mali), "r"))
                else:
                    mali = None

                ProcessResult(result, options, mali, prefix=id,
                              p_value=p_value)
                noutput += 1
        else:
            if options.filename_sites == "-":
                lines_sites = sys.stdin.readlines()
            else:
                lines_sites = open(options.filename_sites, "r").readlines()

            ninput += 1

            # ROBUSTNESS: see above -- keep lines_log defined.
            lines_log = []
            if options.filename_log:
                lines_log = open(options.filename_log, "r").readlines()

            result = slr.parseOutput(lines_sites, lines_log)

            if options.filename_mali:
                mali = Mali.Mali()
                mali.readFromFile(open(options.filename_mali, "r"))
            else:
                if options.method == "summary-filtered":
                    # BUG FIX: raising a string is a TypeError since
                    # Python 2.6; raise a proper exception.
                    raise ValueError(
                        "please supply a multiple alignment for filtering.")
                mali = None

            ProcessResult(result, options, mali, prefix=options.prefix)
            noutput += 1

        if options.loglevel >= 1:
            options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i.\n" %
                                 (ninput, noutput, nskipped))
    else:
        if options.method == "over-representation":

            results = []
            for line in sys.stdin:
                if line[0] == "#":
                    continue
                data = line[:-1].split("\t")
                if data[0] == "prefix":
                    continue
                results.append(
                    Result(data[0], int(data[6]), int(data[7]), int(data[8]),
                           int(data[9]), int(data[10])))

            # probability of a single site being positive
            ntotal = sum(map(lambda x: x.mNTotal, results))
            npositives = sum(map(lambda x: x.mNPositive, results))
            p = float(npositives) / float(ntotal)

            if options.loglevel >= 1:
                options.stdlog.write("# sites: total=%i, positive=%i, p=%f\n" %
                                     (ntotal, npositives, p))

            new_results = []
            for result in results:
                if result.mNTotal == 0:
                    continue

                # use -1, because I need P( x >= X)
                # sf = 1 - cdf and cdf = P( x <= X ), thus sf = 1 - P( x <= X )
                # = P (x > X ).
                r = scipy.stats.binom.sf(result.mNPositive - 1,
                                         result.mNTotal, p)

                result.mSignificance = r

                if r < options.significance_threshold:
                    new_results.append(result)

            new_results.sort(
                lambda x, y: cmp(x.mSignificance, y.mSignificance))

            # NOTE(review): the header goes to stdlog while the result rows
            # go to stdout -- looks inconsistent; confirm before changing.
            options.stdlog.write(Result().getHeader() + "\n")

            for result in new_results:
                options.stdout.write(str(result) + "\n")

            if options.loglevel >= 1:
                options.stdlog.write("# ntotal=%i, npos=%i\n" %
                                     (len(results), len(new_results)))

    E.Stop()
def main(argv=None):
    """script main.

    Parses command line options in sys.argv, unless *argv* is given.
    Cuts a multiple alignment read from stdin into sub-alignments defined
    by a coordinates file (component id, length, start position) and
    writes each part to its own file named by ``--pattern-mali``.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: mali2malis.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-a", "--pattern-mali", dest="pattern_mali",
                      type="string",
                      help="filename pattern for multiple alignment files.")

    parser.add_option(
        "--filename-coordinates", dest="filename_coordinates", type="string",
        help="filename of coordinates that constitute the multiple alignment.")

    parser.add_option("-i", "--input-format", dest="input_format",
                      type="choice",
                      choices=("plain", "fasta", "clustal"),
                      help="input format of multiple alignment")

    parser.add_option("-o", "--output-format", dest="output_format",
                      type="choice",
                      choices=("fasta", "codeml", "phylip"),
                      help="output format of multiple alignment")

    parser.set_defaults(
        input_format="fasta",
        output_format="fasta",
        filename_coordinates=None,
        pattern_mali="%s.fasta",
    )

    (options, args) = E.Start(parser)

    ## read coordinates
    # BUG FIX: initialize unconditionally -- previously a missing
    # --filename-coordinates left 'coordinates' unbound and the loop
    # below raised a NameError.
    coordinates = []
    if options.filename_coordinates:
        for line in open(options.filename_coordinates, "r"):
            if line[0] == "#":
                continue
            id, length, position = line[:-1].split("\t")
            # skip the header row of the coordinates table
            if id == "component":
                continue
            coordinates.append((id, int(length), int(position)))

    mali = Mali.Mali()
    mali.readFromFile(sys.stdin, format=options.input_format)

    ids = mali.getIdentifiers()

    ninput, noutput = 0, 0
    for id, length, position in coordinates:
        ninput += 1
        part_mali = Mali.Mali()
        for x in ids:
            part_mali.addSequence(x, 0, length,
                                  mali[x][position:position + length])
        outfile_name = options.pattern_mali % id
        outfile = open(outfile_name, "w")
        part_mali.writeToFile(outfile, format=options.output_format)
        # BUG FIX: close each per-segment file (previously leaked handles)
        outfile.close()
        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write("# input=%i, output=%i\n" % (ninput, noutput))

    E.Stop()