def readComponents(options):
    """read components from filename supplied in the options."""

    if options.filename_components:
        map_seq_id2component = IOTools.ReadMap(
            open(options.filename_components, "r"),
            columns="all",
            both_directions=False)

        map_component2seq_id = {}
        map_component2input_id = {}
        for key, val in map_seq_id2component.items():

            if type(val) == types.StringType:
                input_id = val
                output_id = val
            elif type(val) == types.TupleType:
                if len(val) == 2:
                    input_id = val[0]
                    output_id = val[1]
                else:
                    input_id = val[0]
                    output_id = val[0]
            else:
                raise ValueError("error in reading %s: %s->%s" %
                                 (options.filename_components, key, val))

            if output_id not in map_component2seq_id:
                map_component2seq_id[output_id] = []
            map_component2seq_id[output_id].append(key)
            map_component2input_id[output_id] = input_id

        return map_seq_id2component, map_component2seq_id, map_component2input_id
    else:
        return None, None, None
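# Illustrative sketch (not part of the original script): the component map is
# assumed to be a tab-separated file of "seq_id<TAB>component". IOTools.ReadMap
# is a CGAT helper, so this stand-in shows the dictionaries readComponents()
# builds using only the standard library.
def _example_read_components(lines):
    """demo: build seq->component and component->seqs maps from tab-separated lines."""
    map_seq_id2component = {}
    map_component2seq_id = {}
    for line in lines:
        fields = line.rstrip("\n").split("\t")
        seq_id, component = fields[0], fields[-1]
        map_seq_id2component[seq_id] = component
        map_component2seq_id.setdefault(component, []).append(seq_id)
    return map_seq_id2component, map_component2seq_id

# _example_read_components(["seq1\tclusterA", "seq2\tclusterA", "seq3\tclusterB"])
# -> ({'seq1': 'clusterA', ...}, {'clusterA': ['seq1', 'seq2'], 'clusterB': ['seq3']})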
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: split_fasta.py 1714 2007-12-11 16:51:12Z andreas $")

    parser.add_option("-f", "--file", dest="input_filename", type="string",
                      help="input filename. If not given, stdin is used.",
                      metavar="FILE")
    parser.add_option("-i", "--input-pattern", dest="input_pattern",
                      type="string",
                      help="input pattern. Parses description line in order to extract id.")
    parser.add_option("-o", "--output-pattern", dest="output_pattern",
                      type="string",
                      help="output pattern. Gives filename for a given sequence.")
    parser.add_option("-n", "--num-sequences", dest="num_sequences", type="int",
                      help="split by number of sequences (not implemented yet).")
    parser.add_option("-m", "--map", dest="map_filename", type="string",
                      help="map filename. Map identifiers to filenames",
                      metavar="FILE")
    parser.add_option("-s", "--skip-identifiers", dest="skip_identifiers",
                      action="store_true",
                      help="do not write identifiers.",
                      metavar="FILE")
    parser.add_option("--min-size", dest="min_size", type="int",
                      help="minimum cluster size.")

    parser.set_defaults(
        input_filename=None,
        map_filename=None,
        skip_identifiers=False,
        input_pattern="^(\S+)",
        min_size=0,
        num_sequences=None,
        output_pattern="%s")

    (options, args) = E.Start(parser)

    if options.input_filename:
        infile = IOTools.openFile(options.input_filename, "r")
    else:
        infile = sys.stdin

    if options.map_filename:
        map_id2filename = IOTools.ReadMap(open(options.map_filename, "r"))
    else:
        map_id2filename = {}

    if options.num_sequences:
        files = FilesChunks(chunk_size=options.num_sequences,
                            output_pattern=options.output_pattern,
                            skip_identifiers=options.skip_identifiers)
    else:
        files = Files(output_pattern=options.output_pattern,
                      skip_identifiers=options.skip_identifiers)

    if options.input_pattern:
        rx = re.compile(options.input_pattern)
    else:
        rx = None

    ninput = 0
    noutput = 0
    identifier = None
    chunk = 0

    for seq in FastaIterator.iterate(infile):

        ninput += 1

        if rx:
            try:
                identifier = rx.search(seq.title).groups()[0]
            except AttributeError:
                print "# parsing error in description line %s" % (seq.title)
        else:
            identifier = seq.title

        if map_id2filename:
            if identifier in map_id2filename:
                identifier = map_id2filename[identifier]
            else:
                continue

        files.Write(identifier, seq)
        noutput += 1

    if options.input_filename:
        infile.close()

    # delete all clusters below a minimum size
    # Note: this has to be done at the end, because
    # cluster sizes are only available once both the fasta
    # file and the map have been parsed.
    if options.min_size:
        ndeleted = files.DeleteFiles(min_size=options.min_size)
    else:
        ndeleted = 0

    if options.loglevel >= 1:
        print "# input=%i, output=%i, ndeleted=%i" % (ninput, noutput, ndeleted)

    E.Stop()
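# Illustrative sketch (an assumption, not the original): Files and FilesChunks
# are helper classes defined elsewhere in this script. A minimal version like
# the one below would satisfy the Write()/DeleteFiles() calls used above; the
# attribute names seq.title/seq.sequence mirror the fasta records used here.
class _ExampleFiles(object):
    """demo: write sequences to per-identifier files named via output_pattern."""

    def __init__(self, output_pattern="%s", skip_identifiers=False):
        self.output_pattern = output_pattern
        self.skip_identifiers = skip_identifiers
        self.counts = {}

    def Write(self, identifier, seq):
        filename = self.output_pattern % identifier
        with open(filename, "a") as outfile:
            if not self.skip_identifiers:
                outfile.write(">%s\n" % seq.title)
            outfile.write("%s\n" % seq.sequence)
        self.counts[filename] = self.counts.get(filename, 0) + 1

    def DeleteFiles(self, min_size=0):
        import os
        ndeleted = 0
        for filename, n in self.counts.items():
            if n < min_size:
                os.remove(filename)
                ndeleted += 1
        return ndeleted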
def Process(lines, other_trees, options, map_old2new, ntree):

    nexus = TreeTools.Newick2Nexus(map(lambda x: x[:-1], lines))

    if options.loglevel >= 1:
        options.stdlog.write("# read %i trees.\n" % len(nexus.trees))

    nskipped = 0
    ntotal = len(nexus.trees)
    extract_pattern = None
    species2remove = None
    write_map = False

    phylip_executable = None
    phylip_options = None

    index = 0

    # default: do not output internal node names
    write_all_taxa = False

    for tree in nexus.trees:

        if options.outgroup:
            tree.root_with_outgroup(options.outgroup)

        for method in options.methods:

            if options.loglevel >= 3:
                options.stdlog.write("# applying method %s to tree %i.\n" %
                                     (method, index))

            if method == "midpoint-root":
                tree.root_midpoint()

            elif method == "balanced-root":
                tree.root_balanced()

            elif method == "unroot":
                TreeTools.Unroot(tree)

            elif method == "phylip":
                if not phylip_executable:
                    phylip_executable = options.parameters[0]
                    del options.parameters[0]
                    phylip_options = re.split("@", options.parameters[0])
                    del options.parameters[0]

                    phylip = WrapperPhylip.Phylip()
                    phylip.setProgram(phylip_executable)
                    phylip.setOptions(phylip_options)

                phylip.setTree(tree)

                result = phylip.run()

                nexus.trees[index] = result.mNexus.trees[0]

            elif method == "normalize":
                if options.value == 0:
                    v = 0
                    for n in tree.chain.keys():
                        v = max(v, tree.node(n).data.branchlength)
                else:
                    v = options.value

                for n in tree.chain.keys():
                    # divide by the chosen normalization value v; dividing by
                    # options.value would be a divide-by-zero when the maximum
                    # branch length is used.
                    tree.node(n).data.branchlength /= float(v)

            elif method == "divide-by-tree":
                if len(other_trees) > 1:
                    other_tree = other_trees[ntree]
                else:
                    other_tree = other_trees[0]

                # the trees have to be exactly the same!!
                if options.loglevel >= 2:
                    print tree.display()
                    print other_tree.display()

                if not tree.is_identical(other_tree):
                    nskipped += 1
                    continue

                # even if the trees are the same (in topology), the node
                # numbering might not be the same. Thus build a map of node ids.
                map_a2b = TreeTools.GetNodeMap(tree, other_tree)

                for n in tree.chain.keys():
                    try:
                        tree.node(n).data.branchlength /= float(
                            other_tree.node(map_a2b[n]).data.branchlength)
                    except ZeroDivisionError:
                        options.stdlog.write(
                            "# Warning: branch for nodes %i and %i in tree-pair %i: divide by zero\n"
                            % (n, map_a2b[n], ntree))
                        continue

            elif method == "rename":
                if not map_old2new:
                    map_old2new = IOTools.ReadMap(
                        open(options.parameters[0], "r"), columns=(0, 1))
                    if options.invert_map:
                        map_old2new = IOTools.getInvertedDictionary(
                            map_old2new, make_unique=True)
                    del options.parameters[0]

                unknown = []
                for n, node in tree.chain.items():
                    if node.data.taxon:
                        try:
                            node.data.taxon = map_old2new[node.data.taxon]
                        except KeyError:
                            unknown.append(node.data.taxon)

                for taxon in unknown:
                    tree.prune(taxon)

            # reformat terminals
            elif method == "extract-with-pattern":
                if not extract_pattern:
                    extract_pattern = re.compile(options.parameters[0])
                    del options.parameters[0]

                for n in tree.get_terminals():
                    node = tree.node(n)
                    node.data.taxon = extract_pattern.search(
                        node.data.taxon).groups()[0]

            elif method == "set-uniform-branchlength":
                for n in tree.chain.keys():
                    tree.node(n).data.branchlength = options.value

            elif method == "build-map":
                # build a map of identifiers
                options.write_map = True
                for n in tree.get_terminals():
                    node = tree.node(n)
                    if node.data.taxon not in map_old2new:
                        new = options.template_identifier % (len(map_old2new) + 1)
                        map_old2new[node.data.taxon] = new
                    node.data.taxon = map_old2new[node.data.taxon]

            elif method == "remove-pattern":
                if species2remove is None:
                    species2remove = re.compile(options.parameters[0])
                    del options.parameters[0]

                taxa = []
                for n in tree.get_terminals():
                    t = tree.node(n).data.taxon
                    skip = False
                    if species2remove.search(t):
                        continue
                    if not skip:
                        taxa.append(t)
                TreeTools.PruneTree(tree, taxa)

            elif method == "add-node-names":

                inode = 0
                write_all_taxa = True
                for n, node in tree.chain.items():
                    if not node.data.taxon:
                        node.data.taxon = "inode%i" % inode
                        inode += 1

            elif method == "newick2nhx":
                # convert names to species names
                for n in tree.get_terminals():
                    t = tree.node(n).data.taxon
                    d = t.split("|")
                    if len(d) >= 2:
                        tree.node(n).data.species = d[0]

        index += 1
        ntree += 1

    if options.output_format == "nh":
        options.stdout.write(TreeTools.Nexus2Newick(
            nexus,
            write_all_taxa=True,
            with_branchlengths=options.with_branchlengths) + "\n")
    else:
        for tree in nexus.trees:
            tree.writeToFile(options.stdout, format=options.output_format)

    return ntotal, nskipped, ntree
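# Illustrative sketch (not from the original script): the "normalize" method
# above rescales every branch length by a constant v, either a user-supplied
# value or the maximum branch length in the tree. The same arithmetic on a
# plain dict of branch lengths:
def _example_normalize(branchlengths, value=0):
    """demo: divide branch lengths by value, or by their maximum if value == 0."""
    v = value if value != 0 else max(branchlengths.values())
    return dict((node, length / float(v))
                for node, length in branchlengths.items())

# _example_normalize({'A': 0.5, 'B': 1.0, 'C': 2.0})
# -> {'A': 0.25, 'B': 0.5, 'C': 1.0}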
def main(argv=sys.argv):

    parser = E.OptionParser(
        version="%prog version: $Id: mali2mali.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-i", "--input-format", dest="input_format",
                      type="choice",
                      choices=("plain", "fasta", "clustal", "stockholm",
                               "phylip"),
                      help="input format of multiple alignment [default=%default].")
    parser.add_option("-o", "--output-format", dest="output_format",
                      type="choice",
                      choices=("plain", "fasta", "stockholm", "phylip",
                               "nexus", "plain-fasta"),
                      help="output format of multiple alignment [default=%default].")
    parser.add_option("--with-ranges", dest="with_ranges", action="store_true",
                      help="output alignment ranges (suffix /from-to after "
                      "identifier) [default=%default].")
    parser.add_option("--without-ranges", dest="with_ranges",
                      action="store_false",
                      help="do not output alignment ranges (suffix /from-to "
                      "after identifier) [default=%default].")
    parser.add_option("-u", "--allow-duplicates", dest="allow_duplicates",
                      action="store_true",
                      help="permit duplicate entries [default=%default].")
    parser.add_option("-m", "--method", dest="methods", type="string",
                      help="methods to apply. Several methods can be specified "
                      "in a ','-separated list [default=%default].")
    parser.add_option("-p", "--parameters", dest="parameters", type="string",
                      help="parameter stack for methods that require one "
                      "[default=%default].")
    parser.add_option("-a", "--mask-char", dest="mask_char", type="string",
                      help="character to identify/set masked characters "
                      "[default=%default].")

    parser.set_defaults(
        input_format="fasta",
        output_format="fasta",
        methods="",
        parameters="",
        mask_char="x",
        gap_chars="-.nN",
        with_ranges=True,
        allow_duplicates=False,
    )

    (options, args) = E.Start(parser)

    options.methods = options.methods.split(",")
    options.parameters = options.parameters.split(",")

    # 1. read multiple alignment in various formats
    if options.allow_duplicates:
        mali = Mali.SequenceCollection()
    else:
        mali = Mali.Mali()

    t1 = time.time()

    mali.readFromFile(options.stdin, format=options.input_format)

    E.info("read mali with %i entries in %i seconds." %
           (len(mali), time.time() - t1))

    if len(mali) == 0:
        raise ValueError("empty multiple alignment")

    for method in options.methods:

        t1 = time.time()

        if method == "remove-unaligned-ends":
            mali.removeUnalignedEnds()
        elif method == "remove-end-gaps":
            mali.removeEndGaps()
        elif method == "remove-all-gaps":
            mali.removeGaps(minimum_gaps=len(mali))
        elif method == "remove-any-gaps":
            mali.removeGaps(minimum_gaps=1)
        elif method == "remove-some-gaps":
            minimum_gaps = int(options.parameters[0])
            del options.parameters[0]
            mali.removeGaps(minimum_gaps=minimum_gaps)
        elif method == "remove-empty-sequences":
            mali.removeEmptySequences()
        elif method == "upper":
            mali.upperCase()
        elif method == "lower":
            mali.lowerCase()
        elif method == "mark-codons":
            mali.markCodons()
        elif method == "remove-stops":
            mali.removePattern(
                lambda x: x.upper() in ("TAG", "TAA", "TGA"),
                allowed_matches=0,
                minimum_matches=1,
                delete_frame=3,
                search_frame=3)
        elif method == "shift-alignment":
            map_id2offset = IOTools.ReadMap(open(options.parameters[0], "r"),
                                            map_functions=(str, int))
            del options.parameters[0]
            mali.shiftAlignment(map_id2offset)
        elif method == "propagate-masks":
            mali.propagateMasks(mask_char=options.mask_char)
        elif method == "recount":
            mali.recount()
        elif method in ("mark-transitions", "filter-odd-transitions",
                        "filter-even-transitions", "keep-even-segments",
                        "keep-odd-segments"):

            if os.path.exists(options.parameters[0]):
                map_id2transitions = IOTools.readMultiMap(
                    open(options.parameters[0], "r"),
                    map_functions=(str, int))
            else:
                map_id2transitions = {}
                r = map(int, options.parameters[0].split(':'))
                r.sort()
                map_id2transitions["mali"] = r

            del options.parameters[0]
            if method == "mark-transitions":
                mali.markTransitions(map_id2transitions)
            elif method in ("filter-odd-transitions", "keep-even-segments"):
                mali.markTransitions(map_id2transitions, mode="keep-odd")
            elif method in ("filter-even-transitions", "keep-odd-segments"):
                mali.markTransitions(map_id2transitions, mode="keep-even")

        elif method == "propagate-transitions":
            mali.propagateTransitions()

        elif method == "map-annotation":
            # map annotations in one mali (stockholm-format) to the
            # annotations in another. Note: the first two sequence identifiers
            # must be shared and the sequences must be of the same length.
            other_mali = Mali.Mali()
            other_mali.readFromFile(open(options.parameters[0], "r"),
                                    format="stockholm")
            del options.parameters[0]
            mali.copyAnnotations(other_mali)

        elif method == "add-annotation":
            annotation_type, annotation_file = options.parameters[:2]
            del options.parameters[:2]
            AddAnnotation(mali, annotation_type, annotation_file)

        elif method == "mask-columns":
            annotation_type, annotation_file = options.parameters[:2]
            del options.parameters[:2]
            maskColumns(mali, annotation_type, annotation_file)

        elif method == "remove-unaligned-pairs":
            removeUnalignedPairs(mali, options)

        elif method == "filter-3rd":
            filterMali(mali, "3rd")

        elif method == "filter-4d":
            filterMali(mali, "4d")

        elif method in ("mask-seg", "mask-bias"):
            a, b = method.split("-")
            maskMali(mali, b)

        elif method == "exclude-with-stop":
            mali.filter(method="with-stop")

        elif method == "exclude-with-frameshift":
            mali.filter(method="with-frameshift")

        E.info("applied method %s in %i seconds." % (method, time.time() - t1))

    mali.writeToFile(options.stdout,
                     format=options.output_format,
                     write_ranges=options.with_ranges)

    E.Stop()
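# Illustrative sketch (an assumption about the expected input, not part of the
# original): the transition-based methods above accept either a file of
# per-identifier transition columns or a single ':'-separated list of column
# positions, which is applied to the whole alignment under the key "mali".
def _example_parse_transitions(parameter):
    """demo: parse '30:10:20' into {'mali': [10, 20, 30]}."""
    positions = sorted(int(x) for x in parameter.split(":"))
    return {"mali": positions}

# _example_parse_transitions("30:10:20") -> {'mali': [10, 20, 30]}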
help="aggregation function.") parser.set_defaults(filename_map=None, filename_info=None, filename_tissues=None, headers=True, aggregate="mean", value_format="%5.2f", method="counts") (options, args) = E.Start(parser) if not options.filename_map: raise "please supply filename mapping probesets to identifiers." map_probe2locus = IOTools.ReadMap(open(options.filename_map, "r")) matrix, row_headers, col_headers = MatlabTools.readMatrix( sys.stdin, format="full", headers=options.headers) if options.filename_tissues: tissues, nerrors = IOTools.ReadList(open(options.filename_tissues, "r")) tissues = set(tissues) columns = [] for x in range(len(col_headers)): if col_headers[x] in tissues: columns.append(x) else: columns = range(len(col_headers))
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-w", "--weights-tsv-file", dest="filename_weights",
                      type="string",
                      help="filename with codon frequencies. Multiple filenames "
                      "can be separated by comma.")
    parser.add_option("-s", "--section", dest="sections", type="choice",
                      action="append",
                      choices=("length", "sequence", "hid", "na", "aa", "cpg",
                               "dn", "degeneracy", "gaps", "codons",
                               "codon-usage", "codon-translator", "codon-bias"),
                      help="which sections to output [%default]")
    parser.add_option("-t", "--sequence-type", dest="seqtype", type="choice",
                      choices=("na", "aa"),
                      help="type of sequence: na=nucleotides, aa=amino acids "
                      "[%default].")
    parser.add_option("-e", "--regex-identifier", dest="regex_identifier",
                      type="string",
                      help="regular expression to extract identifier from fasta "
                      "description line.")
    parser.add_option("--split-fasta-identifier", dest="split_id",
                      action="store_true",
                      help="split fasta description line (starting >) and use "
                      "only text before first space")
    parser.add_option("--add-total", dest="add_total", action="store_true",
                      help="add a row with column totals at the end of the "
                      "table [%default]")

    parser.set_defaults(
        filename_weights=None,
        pseudocounts=1,
        sections=[],
        regex_identifier="(.+)",
        seqtype="na",
        gap_chars='xXnN',
        split_id=False,
        add_total=False,
    )

    (options, args) = E.Start(parser, argv=argv)

    rx = re.compile(options.regex_identifier)

    reference_codons = []
    if options.filename_weights:
        options.filename_weights = options.filename_weights.split(",")

        for filename in options.filename_weights:
            if filename == "uniform":
                reference_codons.append(Genomics.GetUniformCodonUsage())
            else:
                reference_codons.append(
                    IOTools.ReadMap(IOTools.openFile(filename, "r"),
                                    has_header=True,
                                    map_functions=(str, float)))

        # print codon table differences
        options.stdlog.write(
            "# Difference between supplied codon usage preferences.\n")
        for x in range(0, len(reference_codons)):
            for y in range(0, len(reference_codons)):
                if x == y:
                    continue
                # calculate KL distance
                a = reference_codons[x]
                b = reference_codons[y]
                d = 0
                for codon, p in a.items():
                    if Genomics.IsStopCodon(codon):
                        continue
                    d += b[codon] * math.log(b[codon] / p)

                options.stdlog.write("# tablediff\t%s\t%s\t%f\n" %
                                     (options.filename_weights[x],
                                      options.filename_weights[y], d))

    iterator = FastaIterator.FastaIterator(options.stdin)

    def getCounter(section):

        if options.seqtype == "na":
            if section == "length":
                s = SequenceProperties.SequencePropertiesLength()
            elif section == "sequence":
                s = SequenceProperties.SequencePropertiesSequence()
            elif section == "hid":
                s = SequenceProperties.SequencePropertiesHid()
            elif section == "na":
                s = SequenceProperties.SequencePropertiesNA()
            elif section == "gaps":
                s = SequenceProperties.SequencePropertiesGaps(
                    options.gap_chars)
            elif section == "cpg":
                s = SequenceProperties.SequencePropertiesCpg()
            elif section == "dn":
                s = SequenceProperties.SequencePropertiesDN()
            # these sections require the sequence length to be a multiple of 3
            elif section == "aa":
                s = SequenceProperties.SequencePropertiesAA()
            elif section == "degeneracy":
                s = SequenceProperties.SequencePropertiesDegeneracy()
            elif section == "codon-bias":
                s = SequenceProperties.SequencePropertiesBias(reference_codons)
            elif section == "codons":
                s = SequenceProperties.SequencePropertiesCodons()
            elif section == "codon-usage":
                s = SequenceProperties.SequencePropertiesCodonUsage()
            elif section == "codon-translator":
                s = SequenceProperties.SequencePropertiesCodonTranslator()
            else:
                raise ValueError("unknown section %s" % section)
        elif options.seqtype == "aa":
            if section == "length":
                s = SequenceProperties.SequencePropertiesLength()
            elif section == "sequence":
                s = SequenceProperties.SequencePropertiesSequence()
            elif section == "hid":
                s = SequenceProperties.SequencePropertiesHid()
            elif section == "aa":
                s = SequenceProperties.SequencePropertiesAminoAcids()
            else:
                raise ValueError("unknown section %s" % section)
        return s

    # setup totals
    totals = {}
    for section in options.sections:
        totals[section] = getCounter(section)

    options.stdout.write("id")
    for section in options.sections:
        options.stdout.write("\t" + "\t".join(totals[section].getHeaders()))

    options.stdout.write("\n")
    options.stdout.flush()

    s = getCounter("hid")
    s.loadSequence("AAAAAAAAA", "na")

    for cur_record in iterator:

        sequence = re.sub(" ", "", cur_record.sequence).upper()

        if len(sequence) == 0:
            raise ValueError("empty sequence %s" % cur_record.title)

        id = rx.search(cur_record.title).groups()[0]

        if options.split_id is True:
            options.stdout.write("%s" % id.split()[0])
        else:
            options.stdout.write("%s" % id)
        options.stdout.flush()

        for section in options.sections:
            s = getCounter(section)
            s.loadSequence(sequence, options.seqtype)
            totals[section].addProperties(s)

            options.stdout.write("\t" + "\t".join(s.getFields()))

        options.stdout.write("\n")

    if options.add_total:
        options.stdout.write("total")
        for section in options.sections:
            options.stdout.write("\t" + "\t".join(totals[section].getFields()))
        options.stdout.write("\n")

    E.Stop()
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="methods", type="choice",
                      action="append",
                      choices=("translate", "translate-to-stop",
                               "truncate-at-stop", "back-translate",
                               "mark-codons", "apply-map", "build-map",
                               "pseudo-codons", "filter", "interleaved-codons",
                               "map-codons", "remove-gaps", "mask-seg",
                               "mask-bias", "mask-codons",
                               "mask-incomplete-codons", "mask-stops",
                               "mask-soft", "remove-stops", "upper", "lower",
                               "reverse-complement", "sample", "shuffle"),
                      help="method to apply to sequences.")
    parser.add_option("-p", "--parameters", dest="parameters", type="string",
                      help="parameter stack for methods that require one "
                      "[default=%default].")
    parser.add_option("-x", "--ignore-errors", dest="ignore_errors",
                      action="store_true",
                      help="ignore errors [default = %default].")
    parser.add_option("--sample-proportion", dest="sample_proportion",
                      type="float",
                      help="sample proportion [default = %default].")
    parser.add_option("--exclude-pattern", dest="exclude_pattern",
                      type="string",
                      help="exclude all sequences with ids matching pattern "
                      "[default = %default].")
    parser.add_option("--include-pattern", dest="include_pattern",
                      type="string",
                      help="include only sequences with ids matching pattern "
                      "[default = %default].")
    parser.add_option("--filter-method", dest="filter_methods", type="string",
                      action="append",
                      help="filtering methods to apply "
                      "[default = %default].")
    parser.add_option("-t", "--sequence-type", dest="type", type="choice",
                      choices=("aa", "na"),
                      help="sequence type (aa or na) [%default]. This option "
                      "determines which characters to use for masking "
                      "[default = %default].")
    parser.add_option("-l", "--template-identifier", dest="template_identifier",
                      type="string",
                      help="template for numerical identifier "
                      "[default = %default] for the operation --build-map. "
                      "A %i is replaced by the position of the sequence in the "
                      "file.")

    parser.set_defaults(
        methods=[],
        parameters="",
        type="na",
        aa_mask_chars="xX",
        aa_mask_char="x",
        na_mask_chars="nN",
        na_mask_char="n",
        gap_chars="-.",
        gap_char="-",
        template_identifier="ID%06i",
        ignore_errors=False,
        exclude_pattern=None,
        include_pattern=None,
        sample_proportion=None,
        filter_methods=[],
    )

    (options, args) = E.Start(parser)
    options.parameters = options.parameters.split(",")

    rx_include, rx_exclude = None, None
    if options.include_pattern:
        rx_include = re.compile(options.include_pattern)
    if options.exclude_pattern:
        rx_exclude = re.compile(options.exclude_pattern)

    iterator = FastaIterator.FastaIterator(options.stdin)

    nseq = 0

    map_seq2nid = {}

    if "apply-map" in options.methods:
        map_seq2nid = IOTools.ReadMap(open(options.parameters[0], "r"))
        del options.parameters[0]

    if options.type == "na":
        mask_chars = options.na_mask_chars
        mask_char = options.na_mask_char
    else:
        mask_chars = options.aa_mask_chars
        mask_char = options.aa_mask_char

    if "map-codons" in options.methods:
        map_codon2code = IOTools.ReadMap(open(options.parameters[0], "r"))
        del options.parameters[0]

    if "mask-soft" in options.methods:
        f = options.parameters[0]
        del options.parameters[0]
        hard_masked_iterator = FastaIterator.FastaIterator(open(f, "r"))

    if "mask-codons" in options.methods or "back-translate" in options.methods:

        # open a second stream to read sequences from
        f = options.parameters[0]
        del options.parameters[0]

        other_iterator = FastaIterator.FastaIterator(open(f, "r"))

    ninput, noutput, nerrors, nskipped = 0, 0, 0, 0

    if "sample" in options.methods:
        if not options.sample_proportion:
            raise ValueError("specify a sample proportion")
        sample_proportion = options.sample_proportion
    else:
        sample_proportion = None

    filter_min_sequence_length = None
    filter_max_sequence_length = None
    filter_id_list = None
    for f in options.filter_methods:
        if f.startswith("min-length"):
            filter_min_sequence_length = int(f.split("=")[1])
        elif f.startswith("max-length"):
            filter_max_sequence_length = int(f.split("=")[1])
        elif f.startswith("id-file"):
            filter_id_list = [line[:-1] for line in
                              IOTools.openFile(f.split("=")[1])]

    def raiseIfNotCodon(l, title):
        '''raise ValueError if sequence length l is not divisible by 3'''
        if l % 3 != 0:
            raise ValueError(
                "length of sequence %s not divisible by 3" % (title))

    while 1:
        try:
            cur_record = next(iterator)
        except StopIteration:
            break

        if cur_record is None:
            break
        nseq += 1
        ninput += 1

        sequence = re.sub(" ", "", cur_record.sequence)
        l = len(sequence)

        if rx_include and not rx_include.search(cur_record.title):
            nskipped += 1
            continue

        if rx_exclude and rx_exclude.search(cur_record.title):
            nskipped += 1
            continue

        if sample_proportion:
            if random.random() > sample_proportion:
                continue

        if not (filter_id_list is None or cur_record.title in filter_id_list):
            nskipped += 1
            continue

        for method in options.methods:

            if method == "translate":
                # translate such that gaps are preserved
                seq = []

                ls = len(re.sub('[%s]' % options.gap_chars, "", sequence))

                if ls % 3 != 0:
                    msg = "length of sequence %s (%i) not divisible by 3" % (
                        cur_record.title, ls)
                    nerrors += 1
                    if options.ignore_errors:
                        E.warn(msg)
                        continue
                    else:
                        raise ValueError(msg)

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "back-translate":
                # translate from an amino acid alignment to codon alignment
                seq = []

                try:
                    other_record = next(other_iterator)
                except StopIteration:
                    raise ValueError("run out of sequences")

                if cur_record.title != other_record.title:
                    raise ValueError("sequence titles don't match: %s %s" %
                                     (cur_record.title, other_record.title))

                other_sequence = re.sub("[ %s]" % options.gap_chars, "",
                                        other_record.sequence)

                if len(other_sequence) % 3 != 0:
                    raise ValueError(
                        "length of sequence %s not divisible by 3" %
                        (other_record.title))

                r = re.sub("[%s]" % options.gap_chars, "", sequence)
                if len(other_sequence) != len(r) * 3:
                    raise ValueError(
                        "length of sequences do not match: %i vs %i" %
                        (len(other_sequence), len(r)))

                x = 0
                for aa in sequence:
                    if aa in options.gap_chars:
                        c = options.gap_char * 3
                    else:
                        c = other_sequence[x:x + 3]
                        x += 3
                    seq.append(c)

                sequence = "".join(seq)

            elif method == "pseudo-codons":
                raiseIfNotCodon(l, cur_record.title)
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = " ".join(seq)

            elif method == "reverse-complement":
                sequence = string.translate(
                    sequence, string.maketrans("ACGTacgt", "TGCAtgca"))[::-1]

            elif method in ("mask-stops", "remove-stops"):
                c = []
                codon = []
                new_sequence = []

                if method == "mask-stops":
                    char = options.na_mask_char
                elif method == "remove-stops":
                    char = options.gap_char

                for x in sequence:

                    if x not in options.gap_chars:
                        codon.append(x.upper())

                    c.append(x)

                    if len(codon) == 3:
                        codon = "".join(codon).upper()
                        # mask all non-gaps
                        if Genomics.IsStopCodon(codon):
                            for x in c:
                                if x in options.gap_chars:
                                    new_sequence.append(x)
                                else:
                                    new_sequence.append(char)
                        else:
                            new_sequence += c

                        c = []
                        codon = []

                new_sequence += c

                sequence = "".join(new_sequence)

            elif method == "mask-soft":
                # Get next hard masked record and extract sequence and length
                try:
                    cur_hm_record = next(hard_masked_iterator)
                except StopIteration:
                    break
                hm_sequence = re.sub(" ", "", cur_hm_record.sequence)
                lhm = len(hm_sequence)
                new_sequence = []

                # Check lengths of unmasked and soft masked sequences the same
                if l != lhm:
                    raise ValueError(
                        "length of unmasked and hard masked sequences not "
                        "identical for record %s" % (cur_record.title))

                # Check if hard masked seq contains repeat (N), if so replace N
                # with lowercase sequence from unmasked version
                if sequence == hm_sequence:
                    pass
                else:
                    for x, y in zip_longest(sequence, hm_sequence):
                        if y == "N":
                            new_sequence += x.lower()
                        else:
                            new_sequence += x.upper()
                    sequence = "".join(new_sequence)

            elif method == "map-codons":
                raiseIfNotCodon(l, cur_record.title)
                seq = []

                for codon in (sequence[x:x + 3].upper()
                              for x in range(0, l, 3)):

                    if codon not in map_codon2code:
                        aa = "X"
                    else:
                        aa = map_codon2code[codon]
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "interleaved-codons":
                raiseIfNotCodon(l, cur_record.title)
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    aa = Genomics.MapCodon2AA(codon)
                    seq.append("%s:%s" % (aa, codon))

                sequence = " ".join(seq)

            elif method == "translate-to-stop":
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    if Genomics.IsStopCodon(codon):
                        break
                    aa = Genomics.MapCodon2AA(codon)
                    seq.append(aa)

                sequence = "".join(seq)

            elif method == "truncate-at-stop":
                seq = []

                for codon in [sequence[x:x + 3] for x in range(0, l, 3)]:
                    if Genomics.IsStopCodon(codon):
                        break
                    seq.append(codon)
                sequence = "".join(seq)

            elif method == "remove-gaps":

                seq = []
                for s in sequence:
                    if s in options.gap_chars:
                        continue
                    seq.append(s)

                sequence = "".join(seq)

            elif method == "upper":
                sequence = sequence.upper()

            elif method == "lower":
                sequence = sequence.lower()

            elif method == "mark-codons":
                raiseIfNotCodon(l, cur_record.title)
                seq = []

                sequence = " ".join([sequence[x:x + 3]
                                     for x in range(0, l, 3)])

            elif method == "apply-map":
                id = re.match("^(\S+)", cur_record.title).groups()[0]
                if id in map_seq2nid:
                    rest = cur_record.title[len(id):]
                    cur_record.title = map_seq2nid[id] + rest

            elif method == "build-map":
                # build a map of identifiers
                id = re.match("^(\S+)", cur_record.title).groups()[0]
                new_id = options.template_identifier % nseq
                if id in map_seq2nid:
                    raise ValueError(
                        "duplicate fasta entries - can't map those: %s" % id)
                map_seq2nid[id] = new_id
                cur_record.title = new_id

            elif method == "mask-bias":
                masker = Masker.MaskerBias()
                sequence = masker(sequence)

            elif method == "mask-seg":
                masker = Masker.MaskerSeg()
                sequence = masker(sequence)

            elif method == "shuffle":
                s = list(sequence)
                random.shuffle(s)
                sequence = "".join(s)

            elif method == "mask-incomplete-codons":
                seq = list(sequence)
                for x in range(0, l, 3):
                    nm = len([x for x in seq[x:x + 3] if x in mask_chars])
                    if 0 < nm < 3:
                        seq[x:x + 3] = [mask_char] * 3
                sequence = "".join(seq)

            elif method == "mask-codons":
                # mask codons based on amino acids given as reference
                # sequences.
                other_record = next(other_iterator)

                if other_record is None:
                    raise ValueError("run out of sequences.")

                if cur_record.title != other_record.title:
                    raise ValueError("sequence titles don't match: %s %s" %
                                     (cur_record.title, other_record.title))

                other_sequence = re.sub(" ", "", other_record.sequence)
                if len(other_sequence) * 3 != len(sequence):
                    raise ValueError(
                        "sequences for %s don't have matching lengths %i - %i" %
                        (cur_record.title, len(other_sequence) * 3,
                         len(sequence)))

                seq = list(sequence)
                c = 0
                for x in other_sequence:
                    if x in options.aa_mask_chars:
                        if x.isupper():
                            seq[c:c + 3] = [options.na_mask_char.upper()] * 3
                        else:
                            seq[c:c + 3] = [options.na_mask_char.lower()] * 3
                    c += 3
                sequence = "".join(seq)

        l = len(sequence)
        if filter_min_sequence_length is not None and \
           l < filter_min_sequence_length:
            nskipped += 1
            continue

        if filter_max_sequence_length is not None and \
           l > filter_max_sequence_length:
            nskipped += 1
            continue

        options.stdout.write(">%s\n%s\n" % (cur_record.title, sequence))
        noutput += 1

    if "build-map" in options.methods:
        p = options.parameters[0]
        if p:
            outfile = IOTools.openFile(p, "w")
        else:
            outfile = options.stdout

        outfile.write("old\tnew\n")
        for old_id, new_id in list(map_seq2nid.items()):
            outfile.write("%s\t%s\n" % (old_id, new_id))
        if p:
            outfile.close()

    E.info("ninput=%i, noutput=%i, nskipped=%i, nerrors=%i" %
           (ninput, noutput, nskipped, nerrors))

    E.Stop()
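# Illustrative sketch (not from the original; Genomics.MapCodon2AA and
# Genomics.IsStopCodon are CGAT helpers, so a tiny stand-in stop-codon table is
# used here): the codon-based methods above all walk the sequence in steps of
# three. For example, "truncate-at-stop" keeps codons up to, but excluding, the
# first stop codon:
def _example_truncate_at_stop(sequence):
    """demo: truncate a nucleotide sequence at the first stop codon."""
    stop_codons = ("TAG", "TAA", "TGA")
    seq = []
    for x in range(0, len(sequence), 3):
        codon = sequence[x:x + 3].upper()
        if codon in stop_codons:
            break
        seq.append(codon)
    return "".join(seq)

# _example_truncate_at_stop("ATGAAATAGCCC") -> "ATGAAA"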
def main():

    parser = E.OptionParser(
        version="%prog version: $Id: plot_tree.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-i", "--title", dest="title", type="string",
                      help="page title.")
    parser.add_option("-f", "--footer", dest="footer", type="string",
                      help="page footer.")
    parser.add_option("-s", "--filename-tree", dest="filename_tree",
                      type="string",
                      help="filename with tree.")
    parser.add_option("-t", "--tree", dest="tree", type="string",
                      help="tree.")
    parser.add_option("-r", "--species-regex", dest="species_regex",
                      type="string",
                      help="regular expression to extract species from identifier.")
    parser.add_option("--colour-by-species", dest="colour_by_species",
                      action="store_true",
                      help="colour by species.")
    parser.add_option("--support-style", dest="support_style", type="choice",
                      choices=("pie", "number"),
                      help="style for support information.")
    parser.add_option("--error-style", dest="error_style", type="choice",
                      choices=("pie", "number"),
                      help="style for error information.")
    parser.add_option("--branch-scale", dest="branch_scale", type="float",
                      help="branch length scale factor.")
    parser.add_option("--height-scale", dest="height_scale", type="float",
                      help="height scale factor.")
    parser.add_option("-a", "--annotations", dest="annotations", type="choice",
                      action="append",
                      choices=("support", "error", "kaks", "master", "value",
                               "tables"),
                      help="annotations given by further trees.")
    parser.add_option("--filename-tables", dest="filename_tables",
                      type="string",
                      help="add tables from file (need also set options -a tables) "
                      "[%default]")
    parser.add_option("--show-branchlengths", dest="show_branchlengths",
                      action="store_true",
                      help="show branch lengths.")
    parser.add_option("--leaf-symbol", dest="plot_leaf_symbol", type="choice",
                      choices=("square", "circle"),
                      help="Symbol for leaves.")
    parser.add_option("--font-size-branches", dest="font_size_branches",
                      type="int",
                      help="set font size for branches.")
    parser.add_option("--font-size-tips", dest="font_size_tips", type="int",
                      help="set font size for tips.")
    parser.add_option("--font-style-tips", dest="font_style_tips",
                      type="choice",
                      choices=("normal", "italic",),
                      help="set font style for tips.")
    parser.add_option("--filename-map", dest="filename_map", type="string",
                      help="filename with a name translation table.")
    parser.add_option("--filename-map-species2colour",
                      dest="filename_colour_map", type="string",
                      help="filename with a map of species to colour.")
    parser.add_option("--no-leaf-labels", dest="plot_leaf_labels",
                      action="store_false",
                      help="do not show labels at leafs.")
    parser.add_option("--no-ruler", dest="plot_ruler", action="store_false",
                      help="do not plot ruler.")

    parser.set_defaults(
        titles="",
        title="",
        footer="",
        filename_tree=None,
        species_regex="^([^|]+)\|",
        colour_by_species=None,
        tree=None,
        branch_scale=0,
        height_scale=0,
        support_style=None,
        error_style="number",
        kaks_style="number",
        annotations=None,
        show_branchlengths=False,
        branch_length_format="%5.2f",
        font_size_tips=None,
        font_size_branches=None,
        font_style_tips=None,
        filename_map=None,
        filename_colour_map=None,
        plot_leaf_labels=True,
        plot_leaf_symbol=None,
        plot_ruler=True,
        filename_tables=None,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if options.filename_tree:
        tree_lines = open(options.filename_tree, "r").readlines()
    elif options.tree:
        tree_lines = options.tree
    else:
        tree_lines = sys.stdin.readlines()

    nexus = TreeTools.Newick2Nexus(tree_lines)
    master_tree = nexus.trees[0]

    if options.filename_map:
        map_names = IOTools.ReadMap(open(options.filename_map, "r"))
        for id, node in master_tree.chain.items():
            if node.data.taxon in map_names:
                node.data.taxon = map_names[node.data.taxon]

    if options.loglevel >= 2:
        master_tree.display()

    plot = SVGTree.SVGTree(master_tree)

    if options.branch_scale:
        plot.setBranchScale(options.branch_scale)

    if options.height_scale is not None:
        plot.setHeightScale(options.height_scale)

    if options.font_size_tips is not None:
        plot.setFontSize(options.font_size_tips)

    if not options.plot_ruler:
        plot.setRulerElements([])

    if options.show_branchlengths:
        b = SVGTree.BranchDecoratorHorizontalBranchLength(master_tree)
        if options.font_size_branches:
            b.setFontSize(options.font_size_branches)
        plot.setDecoratorHorizontalBranches(b)

    if options.colour_by_species:
        if options.filename_colour_map:
            map_species2colour = IOTools.ReadMap(
                open(options.filename_colour_map, "r"))
        else:
            map_species2colour = None

        rx = re.compile(options.species_regex)
        extract_species = lambda x: rx.search(x).groups()[0]
        plot.setDecoratorExternalNodes(
            SVGTree.NodeDecoratorBySpecies(
                master_tree,
                plot_symbol=options.plot_leaf_symbol,
                plot_label=options.plot_leaf_labels,
                map_species2colour=map_species2colour,
                extract_species=extract_species))

    if options.font_style_tips:
        plot.getDecoratorExternalNodes().setFontStyle(options.font_style_tips)

    plot.getDecoratorExternalNodes().setPlotLabel(options.plot_leaf_labels)

    current_tree = 1

    # add annotations by further trees given on the command line
    branch_length_annotations = []

    current_reference_tree = master_tree

    if options.annotations:
        for annotation in options.annotations:

            tree = nexus.trees[current_tree]

            if annotation == "support":
                tree.branchlength2support()
                for id, node in tree.chain.items():
                    node.data.branchlength = 1.0

                if options.support_style == "pie":
                    plot.setDecoratorInternalNodes(
                        NodeDecoratorSupportPieChart(
                            nexus.trees[current_tree]))

            elif annotation == "error":
                if options.error_style == "number":
                    b = SVGTree.BranchDecoratorHorizontalBranchLengthError(
                        current_reference_tree, tree)
                    if options.font_size_branches:
                        b.setFontSize(options.font_size_branches)
                    branch_length_annotations.append(b)

            elif annotation == "kaks":
                if options.kaks_style == "number":
                    b = SVGTree.BranchDecoratorHorizontalBranchLengthWithKaks(
                        current_reference_tree, tree)
                    if options.font_size_branches:
                        b.setFontSize(options.font_size_branches)
                    branch_length_annotations.append(b)

            elif annotation == "value":
                b = SVGTree.BranchDecoratorHorizontalBranchLength(tree)
                if options.font_size_branches:
                    b.setFontSize(options.font_size_branches)
                branch_length_annotations.append(b)

            elif annotation == "master":
                current_reference_tree = tree

            elif annotation == "tables":
                b = BranchDecoratorTable(tree,
                                         filename=options.filename_tables)
                plot.setDecoratorHorizontalBranches(b)

            current_tree += 1

        if len(branch_length_annotations) == 1:
            b = branch_length_annotations[0]
        elif len(branch_length_annotations) == 2:
            b1, b2 = branch_length_annotations
            b1.setFontColour(SVGTree.BLUE)
            b2.setFontColour(SVGTree.RED)
            b = SVGTree.BranchDecoratorHorizontalAboveBelow(
                master_tree, b1, b2)
        elif len(branch_length_annotations) > 2:
            raise ValueError("obtained more than two branch length "
                             "annotations. Layout not implemented")

        plot.setDecoratorHorizontalBranches(b)

    plot.initializePlot()

    plot.writeToFile(sys.stdout)

    E.Stop()
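# Illustrative sketch (not part of the script): the default species_regex
# "^([^|]+)\|" takes everything before the first '|' of an identifier as the
# species name, which is what the extract_species lambda above does:
def _example_extract_species(identifier, pattern="^([^|]+)\\|"):
    """demo: extract the species prefix from a 'species|gene' identifier."""
    import re
    return re.search(pattern, identifier).groups()[0]

# _example_extract_species("hsapiens|ENSG00000139618") -> "hsapiens"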
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: compare_clusters.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-o", "--output-pattern", dest="output_pattern",
                      type="string",
                      help="output pattern for filenames.")

    parser.set_defaults(
        output_pattern=None,
        format="%5.2f",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) != 2:
        raise ValueError("please supply two filenames with the clusters.")

    map_id2cluster1, map_cluster2ids1 = IOTools.ReadMap(open(args[0]),
                                                        both_directions=True)
    map_id2cluster2, map_cluster2ids2 = IOTools.ReadMap(open(args[1]),
                                                        both_directions=True)

    graph = networkx.Graph()

    for a in map_cluster2ids1.keys():
        graph.add_node((1, a))
    for b in map_cluster2ids2.keys():
        graph.add_node((2, b))

    # build graph between clusters
    for cluster1, ids1 in map_cluster2ids1.items():
        for id1 in ids1:
            if id1 in map_id2cluster2:
                graph.add_edge((1, cluster1), (2, map_id2cluster2[id1]))

    # note: newer versions of networkx return a generator of node sets;
    # materialize as lists so that components can be iterated and indexed
    # below.
    components = [list(c) for c in networkx.connected_components(graph)]

    #######################################################
    # write components and compute counts
    #######################################################
    outfile = getFile("components", options)
    outfile.write("id\ttotal\tn1\tn2\tmembers1\tmembers2\n")
    n = 0
    counts = {}
    subsets = []
    for component in components:
        m1, m2 = [], []

        for x in component:
            if x[0] == 1:
                m1.append(x[1])
            else:
                m2.append(x[1])

        t = len(component)
        n1 = len(m1)
        n2 = len(m2)
        cc = (n1, n2)
        if cc not in counts:
            counts[cc] = 0
        counts[cc] += 1

        if cc == (1, 1):
            subsets.append(n)

        n += 1
        outfile.write("%i\t%i\t%i\t%i\t%s\t%s\n" %
                      (n, t, n1, n2, ",".join(m1), ",".join(m2)))

    if outfile != options.stdout:
        outfile.close()
    else:
        outfile.write("//\n")

    #######################################################
    # write counts
    #######################################################
    outfile = getFile("counts", options)
    outfile.write("n1\tn2\tcounts\tpcounts1\tpcounts2\n")
    for cc, c in counts.items():
        outfile.write(
            "%i\t%i\t%i\t%s\t%s\n" %
            (cc[0], cc[1], c,
             options.format % (100.0 * float(c) / len(map_cluster2ids1)),
             options.format % (100.0 * float(c) / len(map_cluster2ids2))))

    if outfile != options.stdout:
        outfile.close()
    else:
        outfile.write("//\n")

    #######################################################
    # analyze subsets - how many of the 1:1 clusters
    # contain the exact same members?
    #######################################################
    outfile = getFile("subsets", options)
    outfile.write("id\tn1\tn2\tunion\tinter\tunique1\tunique2\n")

    ntrue = 0
    nrest1 = 0
    nrest2 = 0
    nother = 0

    for component_id in subsets:
        component = components[component_id]
        if component[0][0] == 1:
            id1, id2 = component[0][1], component[1][1]
        else:
            id1, id2 = component[1][1], component[0][1]

        members1 = set(map_cluster2ids1[id1])
        members2 = set(map_cluster2ids2[id2])

        union = len(members1.union(members2))
        intersection = len(members1.intersection(members2))
        rest1 = len(members1.difference(members2))
        rest2 = len(members2.difference(members1))

        if rest1 == 0 and rest2 == 0:
            ntrue += 1
        elif rest1 == 0:
            nrest1 += 1
        elif rest2 == 0:
            nrest2 += 1
        else:
            nother += 1

        outfile.write("%i\t%i\t%i\t%i\t%i\t%i\t%i\n" %
                      (component_id, len(members1), len(members2),
                       union, intersection, rest1, rest2))

    if outfile != options.stdout:
        outfile.close()
    else:
        outfile.write("//\n")

    # write subset statistics
    ntotal = len(subsets)
    options.stdout.write("# subset statistics of 1:1 corresponding clusters\n")
    options.stdout.write("class\tcounts\ttotal\n")
    options.stdout.write("%s\t%i\t%s\n" %
                         ("total", ntotal, options.format % 100))
    options.stdout.write("%s\t%i\t%s\n" %
                         ("true", ntrue,
                          options.format % (100.0 * ntrue / ntotal)))
    options.stdout.write("%s\t%i\t%s\n" %
                         ("unique1", nrest1,
                          options.format % (100.0 * nrest1 / ntotal)))
    options.stdout.write("%s\t%i\t%s\n" %
                         ("unique2", nrest2,
                          options.format % (100.0 * nrest2 / ntotal)))
    options.stdout.write("%s\t%i\t%s\n" %
                         ("other", nother,
                          options.format % (100.0 * nother / ntotal)))

    E.Stop()
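# Illustrative sketch (not from the original): the comparison above links
# clusters from two clusterings whenever they share a member, then reads 1:1
# correspondences off the connected components. A tiny self-contained run:
def _example_compare_clusterings():
    """demo: connected components between two small clusterings."""
    import networkx
    map_id2cluster2 = {"s1": "B1", "s2": "B1", "s3": "B2"}
    map_cluster2ids1 = {"A1": ["s1", "s2"], "A2": ["s3"]}
    graph = networkx.Graph()
    for cluster1, ids1 in map_cluster2ids1.items():
        for id1 in ids1:
            if id1 in map_id2cluster2:
                graph.add_edge((1, cluster1), (2, map_id2cluster2[id1]))
    return [sorted(c) for c in networkx.connected_components(graph)]

# _example_compare_clusterings()
# -> [[(1, 'A1'), (2, 'B1')], [(1, 'A2'), (2, 'B2')]]  (component order may vary)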
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/analyze_queries.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("-g", "--genomes", dest="genomes", type="string",
                      help="genomes to analyse.")
    parser.add_option("-i", "--priority", dest="priority", type="string",
                      help="quality priority.")
    parser.add_option("-s", "--sort", dest="sort", type="string",
                      help="sort order.")
    parser.add_option("-p", "--peptides", dest="filename_peptides",
                      type="string",
                      help="filename with template peptide sequences.")
    parser.add_option("-m", "--methods", dest="methods", type="string",
                      help="methods to apply [missed].")
    parser.add_option("-f", "--filter", dest="filename_filter", type="string",
                      help="filename with schema|prediction_id|gene to use as "
                      "filter. The prediction_ids are used for filtering.")
    parser.add_option("-q", "--filter-quality", dest="filter_quality",
                      type="string",
                      help="only consider predictions of given qualities.")
    parser.add_option("--pattern-output", dest="pattern_output", type="string",
                      help="output pattern for multiple file output.")
    parser.add_option("--pattern-stats", dest="pattern_stats", type="string",
                      help="output pattern for multiple statistics output.")
    parser.add_option("--outfile-clusters", dest="outfile_clusters",
                      type="string",
                      help="output filename for clusters.")
    parser.add_option("--infile-clusters", dest="infile_clusters",
                      type="string",
                      help="input filename for clusters.")
    parser.add_option("-n", "--non-redundant", dest="non_redundant",
                      action="store_true",
                      help="use non-redundant set for output.")
    parser.add_option("--clustering-method", dest="clustering_method",
                      type="choice",
                      choices=("fragment", "hid"),
                      help="clustering method to use.")

    parser.set_defaults(
        genomes="",
        priority="CG,PG,SG,RG,CP,PP,SP,RP,CF,PF,SF,UG,UP,UF,BF,UK",
        sort="CG,PG,SG,RG,CP,PP,SP,RP,CF,PF,SF,UG,UP,UF,BF,UK",
        methods="missed",
        peptides=None,
        filename_filter=None,
        separator="|",
        filter_quality=None,
        pattern_output="%s",
        pattern_stats=None,
        clustering_method="fragment",
        outfile_clusters=None,
        infile_clusters=None,
        non_redundant=False,
        format_percent="%5.2f",
    )

    (options, args) = E.Start(parser, add_psql_options=True)

    if options.filename_peptides:
        peptides = Genomics.ReadPeptideSequences(
            open(options.filename_peptides, "r"))
    else:
        peptides = {}

    if options.genomes:
        options.genomes = options.genomes.split(",")
    if options.priority:
        options.priority = options.priority.split(",")
    if options.methods:
        options.methods = options.methods.split(",")
    if options.sort:
        options.sort = options.sort.split(",")
    if options.filter_quality:
        options.filter_quality = options.filter_quality.split(",")

    subset = {}
    if options.filename_filter:
        data = map(
            lambda x: x[:-1].split(options.separator)[:3],
            filter(lambda x: x[0] != "#",
                   open(options.filename_filter, "r").readlines()))
        for s, p, g in data:
            if s not in subset:
                subset[s] = {}
            subset[s][p] = 1

    if len(options.sort) != len(options.priority):
        raise ValueError(
            "different number of classes in sort order and priority order")

    dbhandle = pgdb.connect(options.psql_connection)

    # Cluster peptides
    if options.infile_clusters:
        map_peptide2cluster, map_cluster2peptide = IOTools.ReadMap(
            open(options.infile_clusters, "r"), both_directions=True)
    elif peptides:
        if options.clustering_method == "fragment":
            map_cluster2peptide, map_peptide2cluster = \
                ClusterPeptidesByFragment(peptides)
        elif options.clustering_method == "hid":
            map_cluster2peptide, map_peptide2cluster = \
                ClusterPeptidesByHid(peptides)
    else:
        map_cluster2peptide = {}
        map_peptide2cluster = {}

    if map_cluster2peptide and options.loglevel >= 1:
        options.stdlog.write(
            "# clustering of peptides: %i clusters for %i peptides\n" %
            (len(map_cluster2peptide), len(map_peptide2cluster)))
        sys.stdout.flush()

    if options.outfile_clusters and not options.infile_clusters:
        options.stdlog.write("# writing clusters to %s\n" %
                             options.outfile_clusters)
        outfile = open(options.outfile_clusters, "w")
        for k, v in map_peptide2cluster.items():
            outfile.write("%s\t%s\n" % (k, v))
        outfile.close()

    for method in options.methods:

        if method == "stats":
            # count number of missed unique genes/transcripts
            headers = ("species",
                       "genes", "found_genes", "missed_genes",
                       "pfound_genes", "pmissed_genes",
                       "nr_found_genes", "nr_missed_genes",
                       "pnr_found_genes", "pnr_missed_genes",
                       "transcripts", "found_transcripts",
                       "missed_transcripts",
                       "pfound_transcripts", "pmissed_transcripts",
                       "nr_found_transcripts", "nr_missed_transcripts",
                       "pnr_found_transcripts", "pnr_missed_transcripts")

            options.stdout.write("\t".join(headers) + "\n")

            for genome in options.genomes:

                r = GetQueryInfo(dbhandle, genome, options, subset)

                found_genes, genes, found_transcripts, transcripts = \
                    CountFoundGenes(map(lambda x: (x[0], x[1], x[2]), r),
                                    map_peptide2cluster)

                nrfound_genes, nrgenes, nrfound_transcripts, nrtranscripts = \
                    CountFoundGenes(map(lambda x: (x[0], x[1], x[3]), r),
                                    map_peptide2cluster)

                nfound_genes = len(found_genes)
                nfound_transcripts = len(found_transcripts)
                nnrfound_genes = len(nrfound_genes)
                nnrfound_transcripts = len(nrfound_transcripts)
                ngenes = len(genes)
                ntranscripts = len(transcripts)

                if ngenes == 0 or ntranscripts == 0:
                    continue

                f1 = lambda x: 100 * float(x) / ngenes
                f2 = lambda x: 100 * float(x) / ntranscripts

                options.stdout.write(
                    "%s\t%i\t%i\t%i\t%5.2f\t%5.2f\t%i\t%i\t%5.2f\t%5.2f\t%i\t%i\t%i\t%5.2f\t%5.2f\t%i\t%i\t%5.2f\t%5.2f\n" %
                    (genome,
                     ngenes, nfound_genes, ngenes - nfound_genes,
                     f1(nfound_genes), f1(ngenes - nfound_genes),
                     nnrfound_genes, ngenes - nnrfound_genes,
                     f1(nnrfound_genes), f1(ngenes - nnrfound_genes),
                     ntranscripts, nfound_transcripts,
                     ntranscripts - nfound_transcripts,
                     f2(nfound_transcripts),
                     f2(ntranscripts - nfound_transcripts),
                     nnrfound_transcripts,
                     ntranscripts - nnrfound_transcripts,
                     f2(nnrfound_transcripts),
                     f2(ntranscripts - nnrfound_transcripts)))

        elif method == "missed":
            headers = ("species",
                       "genes", "transcripts",
                       "missed_genes", "missed_transcripts",
                       "percent_missed_genes", "percent_missed_transcripts")

            options.stdout.write("\t".join(headers) + "\n")

            all_missed_genes = {}
            all_missed_transcripts = {}

            for genome in options.genomes:

                r = GetQueryInfo(dbhandle, genome, options, subset)

                if options.non_redundant:
                    found_genes, genes, found_transcripts, transcripts = \
                        CountFoundGenes(
                            map(lambda x: (x[0], x[1], x[3]), r),
                            map_peptide2cluster)
                else:
                    found_genes, genes, found_transcripts, transcripts = \
                        CountFoundGenes(
                            map(lambda x: (x[0], x[1], x[2]), r),
                            map_peptide2cluster)

                sg = set(genes)
                missed_genes = sg.difference(found_genes)
                for x in missed_genes:
                    if x not in all_missed_genes:
                        all_missed_genes[x] = []
                    all_missed_genes[x].append(genome)

                sm = set(transcripts)
                missed_transcripts = sm.difference(found_transcripts)
                for x in missed_transcripts:
                    if x not in all_missed_transcripts:
                        all_missed_transcripts[x] = []
                    all_missed_transcripts[x].append(genome)

                options.stdout.write(
                    "%s\t%i\t%i\t%i\t%i\t%s\t%s\n" %
                    (genome,
                     len(genes), len(transcripts),
                     len(missed_genes), len(missed_transcripts),
                     options.format_percent %
                     (100.0 * float(len(missed_genes)) / len(genes)),
                     options.format_percent %
                     (100.0 * float(len(missed_transcripts)) /
                      len(transcripts))))

            for section in ("genes", "transcripts"):

                if section == "genes":
                    missed = all_missed_genes
                else:
                    missed = all_missed_transcripts

                writeListMissed(open(options.pattern_output % section, "w"),
                                missed, options.genomes, options)

                if options.pattern_stats:
                    outfile = open(options.pattern_stats % section, "w")
                else:
                    outfile = options.stdout

                outfile.write("# statistics for %s\n" % section)
                writeStatsMissed(outfile, missed, options.genomes, options)

                if outfile != options.stdout:
                    outfile.close()

    E.Stop()
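# Illustrative sketch (not from the original; CountFoundGenes and the database
# access live elsewhere in this script): the "missed" bookkeeping above is a
# set difference per genome, accumulated into a gene -> genomes map:
def _example_collect_missed(genes, found_genes, genome, all_missed_genes):
    """demo: record which genomes miss which genes."""
    for x in set(genes).difference(found_genes):
        all_missed_genes.setdefault(x, []).append(genome)
    return all_missed_genes

# _example_collect_missed(["g1", "g2", "g3"], ["g2"], "mm9", {})
# -> {'g1': ['mm9'], 'g3': ['mm9']}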
        url=None,
        radius_increment=40,
        min_contig_size=10000,
        remove_empty_contigs=True,
        separator="|",
        quality2symbol={'CG': "circle", 'PG': "circle", 'SG': "circle"},
        quality2mask=("RG", "CP", "PP", "SP", "RP", "CF", "PF", "SF",
                      "UG", "UP", "UF", "BF", "UK"),
        sort_by_size=True,
        input_format="pairwise",
    )

    (options, args) = Experiment.Start(parser, add_pipe_options=True)

    if options.filename_contig_sizes:
        map_contig2size = IOTools.ReadMap(
            open(options.filename_contig_sizes, "r"),
            map_functions=(str, int))

    # read data and get contigs that are used (i.e.: remove empty contigs)
    chrs = {}
    lines = sys.stdin.readlines()

    if options.remove_empty_contigs:
        for line in lines:
            if line[0] == "#":
                continue

            d = line[:-1].split("\t")

            cluster_id, in_locations, in_tree = d[:3]

            for l in in_locations.split(";"):
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: mali2predictions.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome.")
    parser.add_option("-l", "--filename-locations", dest="filename_locations",
                      type="string",
                      help="filename with locations")
    parser.add_option("-m", "--master", dest="master", type="string",
                      help="the master determines the frame.")

    parser.set_defaults(filename_locations=None,
                        gap_chars="-.",
                        master=None)

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) > 0:
        print USAGE, "no arguments required."
        sys.exit(2)

    mali = Mali.Mali()

    mali.readFromFile(sys.stdin)

    identifiers = mali.getIdentifiers()

    aligned_columns, aligned_exons = getAlignedColumns(mali, options)

    map_id2location = {}

    if options.filename_locations:
        map_id2location = IOTools.ReadMap(
            open(options.filename_locations, "r"))

    options.stdout.write(Prediction.Prediction().getHeader() + "\n")

    nid = 1

    for identifier in identifiers:

        if options.loglevel >= 2:
            options.stdlog.write("# processing %s\n" % (identifier))

        entry = mali.getEntry(identifier)

        sequence = entry.mString

        if sequence[0] not in string.lowercase:
            raise ValueError("all sequences should start with an exon.")

        was_exon = True
        d = 0
        alignment = []
        carry_over = 0

        last_codon = []
        codon = []
        nchars_in_codon = 0

        n = 0

        last_master_residue = 0
        master_residue = 0

        for column in range(len(sequence)):

            c = sequence[column]

            is_gap = c in options.gap_chars
            is_aligned = column in aligned_columns
            is_exon = column in aligned_exons

            if is_gap:
                continue

            if is_exon:
                master_residue = aligned_exons[column]
                codon.append((n, master_residue))

            n += 1

            # check if we have a complete codon
            if is_exon:

                # a codon is complete if it ends at frame 2 or
                # if it spans more than one codon in the master.
                # Gaps in the master that are a multiple of 3 are ignored.
                d = master_residue - last_master_residue - 1
                if master_residue % 3 == 2 or (d % 3 != 0 and d > 0):

                    if last_codon:
                        d = codon[0][0] - last_codon[-1][0] - 1
                        if d > 0:
                            # add in-frame introns
                            if d > 10:
                                alignment.append(["5", 0, 2])
                                alignment.append(["I", 0, d - 4])
                                alignment.append(["3", 0, 2])
                            else:
                                raise ValueError("untreated case")

                    alignment += processCodon(codon)

                    last_codon = codon
                    codon = []

                last_master_residue = master_residue

        last = alignment[0]
        new_alignment = []

        for this in alignment[1:]:

            if this[0] == last[0]:
                last[1] += this[1]
                last[2] += this[2]
                continue

            new_alignment.append(last)
            last = this

        new_alignment.append(last)

        if options.loglevel >= 4:
            options.stdlog.write("# output=%s\n" % (str(new_alignment)))

        assert (new_alignment[-1][2] % 3 == 0)

        lalignment = sum(map(lambda x: x[2], new_alignment))

        prediction = Prediction.Prediction()

        prediction.mQueryToken = identifier

        genomic_sequence = re.sub("[%s]" % options.gap_chars, "",
                                  mali[identifier])

        prediction.mPredictionId = nid
        nid += 1

        if identifier in map_id2location:

            prediction.mSbjctToken, prediction.mSbjctStrand, sfrom, sto = \
                map_id2location[identifier].split(":")[:4]

            prediction.mSbjctGenomeFrom = int(sfrom) + entry.mFrom
            prediction.mSbjctGenomeTo = int(sto)

        else:
            prediction.mSbjctToken = "unk"
            prediction.mSbjctStrand = "+"
            prediction.mSbjctGenomeFrom = 0

        prediction.mQueryCoverage = 100
        prediction.mPercentIdentity = 100
        prediction.mPercentSimilarity = 100

        prediction.mQueryLength = prediction.mQueryTo

        prediction.mSbjctGenomeTo = prediction.mSbjctGenomeFrom + lalignment

        prediction.mMapPeptide2Genome = new_alignment
        prediction.mAlignmentString = string.join(
            map(lambda x: string.join(map(str, x), " "),
                prediction.mMapPeptide2Genome), " ")

        prediction.mMapPeptide2Translation, prediction.mTranslation = \
            Genomics.Alignment2PeptideAlignment(
                prediction.mMapPeptide2Genome, 0, 0, genomic_sequence)

        (prediction.mNIntrons, prediction.mNFrameShifts, prediction.mNGaps,
         prediction.mNSplits, prediction.mNStopCodons, disruptions) = \
            Genomics.CountGeneFeatures(0,
                                       prediction.mMapPeptide2Genome,
                                       genomic_sequence)

        options.stdout.write(str(prediction) + "\n")

    E.Stop()
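# Illustrative sketch (not part of the script): the compaction loop above
# merges adjacent alignment operations of the same type by summing their query
# and genome lengths, a simple run-length encoding over
# [state, query_len, sbjct_len] triples:
def _example_merge_alignment(alignment):
    """demo: merge adjacent [state, query_len, sbjct_len] operations."""
    last = list(alignment[0])
    merged = []
    for this in alignment[1:]:
        if this[0] == last[0]:
            last[1] += this[1]
            last[2] += this[2]
            continue
        merged.append(last)
        last = list(this)
    merged.append(last)
    return merged

# _example_merge_alignment([["M", 1, 3], ["M", 1, 3], ["I", 0, 50], ["M", 1, 3]])
# -> [['M', 2, 6], ['I', 0, 50], ['M', 1, 3]]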
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser( version= "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $", usage=globals()["__doc__"]) parser.add_option("-t", "--template-bam", dest="filename_genome_bam", type="string", help="input bam file for header information [%default]") parser.add_option("-s", "--contig-sizes", dest="filename_contigs", type="string", help="filename with contig sizes [%default]") parser.add_option( "-o", "--colour", dest="colour_mismatches", action="store_true", help="mismatches will use colour differences (CM tag) [%default]") parser.add_option("-i", "--ignore-mismatches", dest="ignore_mismatches", action="store_true", help="ignore mismatches [%default]") parser.add_option( "-c", "--remove-contigs", dest="remove_contigs", type="string", help="','-separated list of contigs to remove [%default]") parser.add_option("-f", "--force", dest="force", action="store_true", help="force overwriting of existing files [%default]") parser.add_option("-u", "--unique", dest="unique", action="store_true", help="remove reads not matching uniquely [%default]") parser.set_defaults( filename_genome_bam=None, filename_gtf=None, filename_mismapped=None, remove_contigs=None, force=False, unique=False, colour_mismatches=False, ignore_mismatches=False, ) ## add common options (-h/--help, ...) and parse command line (options, args) = E.Start(parser, argv=argv) genomefile, referencenames, referencelengths = None, None, None if options.filename_genome_bam: genomefile = pysam.Samfile(options.filename_genome_bam, "rb") elif options.filename_contigs: contigs = IOTools.ReadMap(IOTools.openFile(options.filename_contigs)) data = zip(*list(contigs.iteritems())) referencenames, referencelengths = data[0], map(int, data[1]) else: raise ValueError( "please provide either --template-bam or --contig-sizes") infile = pysam.Samfile("-", "rb") outfile = pysam.Samfile("-", "wb", template=genomefile, referencenames=referencenames, referencelengths=referencelengths) if options.colour_mismatches: tag = "CM" else: tag = "NM" nambiguous = 0 ninput = 0 nunmapped = 0 ncigar = 0 nfull = 0 noutput = 0 contig2tid = dict([(y, x) for x, y in enumerate(outfile.references)]) for qname, readgroup in itertools.groupby(infile, lambda x: x.qname): ninput += 1 reads = list(readgroup) if reads[0].is_unmapped: nunmapped += 1 continue # filter for best match best = min([x.opt(tag) for x in reads]) reads = [x for x in reads if x.opt(tag) == best] if len(reads) > 1: nambiguous += 1 continue read = reads[0] # reject complicated matches (indels, etc) # to simplify calculations below. 
if len(read.cigar) > 1: ncigar += 1 continue # set NH flag to latest count t = dict(read.tags) t['NH'] = 1 read.tags = list(t.iteritems()) sname = infile.getrname(read.tid) contig, first_exon_start, middle, last_exon_end, splice, strand = sname.split( "|") first_exon_end, last_exon_start = middle.split("-") first_exon_start, first_exon_end, last_exon_start, last_exon_end = map(int, (\ first_exon_start, first_exon_end, last_exon_start, last_exon_end ) ) first_exon_end += 1 total = first_exon_end - first_exon_start + last_exon_end - last_exon_start first_exon_length = first_exon_end - first_exon_start match1 = first_exon_length - read.pos intron_length = last_exon_start - first_exon_end match2 = read.qlen - match1 # match lies fully in one exon - ignore if match1 <= 0 or match2 <= 0: nfull += 1 continue # increment pos read.pos = first_exon_start + read.pos read.tid = contig2tid[contig] # 3 = BAM_CREF_SKIP read.cigar = [(0, match1), (3, intron_length), (0, match2)] outfile.write(read) noutput += 1 outfile.close() if genomefile: genomefile.close() c = E.Counter() c.input = ninput c.output = noutput c.full = nfull c.cigar = ncigar c.ambiguous = nambiguous c.unmapped = nunmapped E.info("%s" % str(c)) ## write footer and output benchmark information. E.Stop()
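# --- Editor's sketch (illustrative, not part of the original script) ---
# The coordinate lifting above translates a read aligned to a two-exon
# junction contig into a genomic alignment whose CIGAR gains an
# N (BAM_CREF_SKIP) operation spanning the intron. The same arithmetic
# without pysam, using the variable names parsed from the contig name
# above; coordinates are 0-based and the helper name is hypothetical.
def liftToGenome(read_pos, read_length, first_exon_start, first_exon_end,
                 last_exon_start):
    """return (genomic_pos, cigar) for a read starting at read_pos on the
    spliced contig, or None if the read lies fully within one exon."""
    first_exon_length = first_exon_end - first_exon_start
    match1 = first_exon_length - read_pos      # bases in the first exon
    intron_length = last_exon_start - first_exon_end
    match2 = read_length - match1              # bases in the second exon
    if match1 <= 0 or match2 <= 0:
        # no junction crossed - the script above skips these reads
        return None
    genomic_pos = first_exon_start + read_pos
    # CIGAR operation codes: 0 = BAM_CMATCH, 3 = BAM_CREF_SKIP
    cigar = [(0, match1), (3, intron_length), (0, match2)]
    return genomic_pos, cigar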
parser.add_option("--dump", dest="dump", action="store_true", help="dump output.") parser.set_defaults( separator="|", dump=False, filename_map=None, filename_alignment="-", filename_tree=None, ) (options, args) = E.Start(parser) if options.filename_map: map_species2sp = IOTools.ReadMap(open(options.filename_map, "r")) E.debug("species map: %s" % str(map_species2sp)) identifier_parser = IdentifierParserGPipe(map_species2sp=map_species2sp) njtree = NJTree(identifier_parser=identifier_parser) njtree.SetLog(options.stdlog) njtree.SetErr(options.stderr) if options.filename_tree: njtree.SetSpeciesTree(options.filename_tree) mali = Mali.Mali() if options.filename_alignment == "-":
def main(argv=None): parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option( "-w", "--filename-weights", dest="filename_weights", type="string", help= "filename with codon frequencies. Multiple filenames can be separated by comma [default=%default]." ) parser.add_option("-s", "--sections", dest="sections", type="choice", action="append", choices=("length", "hid", "na", "aa", "degeneracy", "bias", "codons", "codon-usage", "codon-translator"), help="which sections to output [default=%default]") parser.add_option( "-t", "--type", dest="seqtype", type="choice", choices=("na", "aa"), help= "type of sequence: na=nucleotides, aa=amino acids [default=%default].") parser.add_option( "-e", "--regex-identifier", dest="regex_identifier", type="string", help= "regular expression to extract identifier from fasta description line [default=%default]." ) parser.set_defaults( filename_weights="uniform", pseudocounts=1, sections=[], regex_identifier="(.+)", seqtype="na", ) (options, args) = E.Start(parser, argv=argv) options.filename_weights = options.filename_weights.split(",") rx = re.compile(options.regex_identifier) reference_codons = [] if options.filename_weights: for filename in options.filename_weights: if filename == "uniform": reference_codons.append(Genomics.GetUniformCodonUsage()) else: reference_codons.append( IOTools.ReadMap(open(filename, "r"), has_header=True, map_functions=(str, float))) ## print codon table differences E.info("difference between supplied codon usage preferences.") for x in range(0, len(reference_codons)): for y in range(0, len(reference_codons)): if x == y: continue # calculate KL distance a = reference_codons[x] b = reference_codons[y] d = 0 for codon, p in a.items(): if Genomics.IsStopCodon(codon): continue d += b[codon] * math.log(b[codon] / p) E.info("tablediff\t%s\t%s\t%f" % (options.filename_weights[x], options.filename_weights[y], d)) iterator = FastaIterator.FastaIterator(options.stdin) def getCounter(section): if options.seqtype == "na": if section == "length": s = SequencePropertiesLength() elif section == "hid": s = SequencePropertiesHid() elif section == "na": s = SequencePropertiesNA() elif section == "aa": s = SequencePropertiesAA() elif section == "degeneracy": s = SequencePropertiesDegeneracy() elif section == "bias": s = SequencePropertiesBias(reference_codons) elif section == "codons": s = SequencePropertiesCodons() elif section == "codon-usage": s = SequencePropertiesCodonUsage() elif section == "codon-translator": s = SequencePropertiesCodonTranslator() else: raise ValueError("unknown section %s" % section) elif options.seqtype == "aa": if section == "length": s = SequencePropertiesLength() elif section == "hid": s = SequencePropertiesHid() elif section == "aa": s = SequencePropertiesAminoAcids() else: raise ValueError("unknown section %s" % section) return s ## setup totals totals = {} for section in options.sections: totals[section] = getCounter(section) options.stdout.write("id") for section in options.sections: options.stdout.write("\t" + "\t".join(totals[section].getHeaders())) options.stdout.write("\n") options.stdout.flush() for cur_record in iterator: sequence = re.sub(" ", "", cur_record.sequence).upper() if len(sequence) == 0: E.warning("empty sequence %s" % cur_record.title) continue id = rx.search(cur_record.title).groups()[0] options.stdout.write("%s" % id) options.stdout.flush() for section in options.sections: s = getCounter(section) s.loadSequence(sequence) totals[section].addProperties(s) 
options.stdout.write("\t" + "\t".join(s.getFields())) options.stdout.write("\n") options.stdout.write("total") for section in options.sections: options.stdout.write("\t" + "\t".join(totals[section].getFields())) options.stdout.write("\n") E.Stop()
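# --- Editor's sketch (illustrative, not part of the original script) ---
# The "tablediff" block above reports a Kullback-Leibler style distance
# between pairs of codon-usage tables, skipping stop codons. The same
# computation in isolation; the tables are assumed to map codon strings
# to non-zero frequencies, and the function name is hypothetical.
import math

def codonUsageKL(a, b, stop_codons=("TAA", "TAG", "TGA")):
    """return sum over non-stop codons c of b[c] * log(b[c] / a[c])."""
    d = 0.0
    for codon, p in a.items():
        if codon in stop_codons:
            continue
        d += b[codon] * math.log(b[codon] / p)
    return d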
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser(version="%prog version: $Id: matrix2stats.py 2795 2009-09-16 15:29:23Z andreas $", usage=globals()["__doc__"]) parser.add_option("-m", "--method", dest="method", type="choice", choices=("chi-squared", "pearson-chi-squared"), help="statistical methods to apply.") parser.add_option("-t", "--header-names", dest="headers", action="store_true", help="matrix has row/column headers.") parser.add_option("--no-headers", dest="headers", action="store_false", help="matrix has no row/column headers.") parser.add_option("-i", "--input-format", dest="input_format", type="choice", choices=("full", "sparse", "phylip"), help="""input format for matrix.""" ) parser.add_option("-o", "--output-format", dest="output_format", type="choice", choices=("full", "sparse", "phylip"), help="""output format for matrix.""" ) parser.add_option("-p", "--parameters", dest="parameters", action="append", type="string", help="parameters for various functions.") parser.add_option("-a", "--iteration", dest="iteration", type="choice", choices=("pairwise", "all-vs-all"), help="""how to compute stats [%default].""" ) parser.set_defaults( method="chi-squared", headers=True, value_format="%6.4f", pvalue_format="%6.4e", input_format="full", write_separators=True, parameters=[], iteration=None, ) (options, args) = E.Start(parser) lines = [x for x in sys.stdin.readlines() if x[0] != "#"] chunks = [x for x in range(len(lines)) if lines[x][0] == ">"] if not chunks: options.write_separators = False chunks = [-1] chunks.append(len(lines)) ninput, noutput, nskipped = 0, 0, 0 if options.write_separators: options.stdout.write("test\t") header_prefix = "" if options.method == "chi-squared": header_prefix = "observed\texpected" options.stdout.write("\t".join( (header_prefix, "n", "min", "max", "chi", "df", "P", "passed", "phi")) + "\n") elif options.method in ("pearson-chi-squared",): options.stdout.write("column\t") options.stdout.write("\t".join( (header_prefix, "n", "prob", "obs", "exp", "chi", "df", "P", "passed", "phi")) + "\n") if len(options.parameters) == 0: raise ValueError("out of parameters - please supply probability or filename with probabilities.")
param = options.parameters[0] del options.parameters[0] if options.write_separators: probabilities = IOTools.ReadMap( IOTools.openFile(param, "r"), map_functions=(str, float)) else: probability = float(param) for x in range(len(chunks) - 1): ninput += 1 matrix, row_headers, col_headers = MatlabTools.readMatrix( StringIO("".join(lines[chunks[x] + 1:chunks[x + 1]])), format=options.input_format, headers=options.headers) nrows, ncols = matrix.shape if options.loglevel >= 2: options.stdlog.write("# read matrix: %i x %i, %i row titles, %i column titles.\n" % (nrows, ncols, len(row_headers), len(col_headers))) if options.write_separators: options.stdout.write(lines[chunks[x]][1:-1] + "\t") pairs = [] if options.iteration == "pairwise": pairs = [] for row1 in range(0, len(row_headers)): for row2 in range(row1 + 1, len(row_headers)): pairs.append((row1, row2)) elif options.iteration == "all-vs-all": pairs = [] for row1 in range(0, len(row_headers)): for row2 in range(0, len(row_headers)): if row1 == row2: continue pairs.append((row1, row2)) if options.method == "chi-squared": for row1, row2 in pairs: row_header1 = row_headers[row1] row_header2 = row_headers[row2] try: result = Stats.doChiSquaredTest( numpy.vstack((matrix[row1], matrix[row2]))) except ValueError: nskipped += 1 continue noutput += 1 options.stdout.write("\t".join(( "%s" % row_header1, "%s" % row_header2, "%i" % result.mSampleSize, "%i" % min(matrix.flat), "%i" % max(matrix.flat), options.value_format % result.mChiSquaredValue, "%i" % result.mDegreesFreedom, options.pvalue_format % result.mProbability, "%s" % result.mSignificance, options.value_format % result.mPhi)) + "\n") elif options.method == "pearson-chi-squared": if nrows != 2: raise ValueError("only implemented for 2xn table") if options.write_separators: id = re.match("(\S+)", lines[chunks[x]][1:-1]).groups()[0] probability = probabilities[id] for col in range(ncols): options.stdout.write("%s\t" % col_headers[col]) result = Stats.doPearsonChiSquaredTest( probability, sum(matrix[:, col]), matrix[0, col]) options.stdout.write("\t".join(( "%i" % result.mSampleSize, "%f" % probability, "%i" % result.mObserved, "%f" % result.mExpected, options.value_format % result.mChiSquaredValue, "%i" % result.mDegreesFreedom, options.pvalue_format % result.mProbability, "%s" % result.mSignificance, options.value_format % result.mPhi))) if col < ncols - 1: options.stdout.write("\n") if options.write_separators: options.stdout.write(lines[chunks[x]][1:-1] + "\t") options.stdout.write("\n") E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped)) E.Stop()
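# --- Editor's sketch (illustrative, not part of the original script) ---
# The nested loops above that build row-index pairs for the "pairwise"
# and "all-vs-all" iteration modes can be expressed with itertools; an
# equivalent sketch, with a hypothetical function name.
import itertools

def buildPairs(nrows, iteration):
    """return the row-index pairs compared under the given mode."""
    if iteration == "pairwise":
        # unordered pairs with row1 < row2
        return list(itertools.combinations(range(nrows), 2))
    elif iteration == "all-vs-all":
        # ordered pairs, excluding self-comparisons
        return list(itertools.permutations(range(nrows), 2))
    raise ValueError("unknown iteration mode %s" % iteration)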
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser(version="%prog version: $Id: tree_strain2species.py 2782 2009-09-10 11:40:29Z andreas $", usage=globals()["__doc__"]) parser.add_option("--filename-synonyms", dest="filename_synonyms", type="string", help="filename with synonyms. Use this to aggregate several strains for a species.") parser.add_option("--filename-genes", dest="output_filename_genes", type="string", help="output filename with new gene names.") parser.add_option("--species-tree", dest="species_tree", action="store_true", help="input trees are species trees. If not given, the trees are assumed to be gene trees.") parser.add_option("--merge-mode", dest="merge_mode", type="choice", choices=("ignore", "add-mean", "add-max", "add-min"), help="how to deal with branch lengths of merged nodes.") parser.set_defaults( filename_synonyms="map_strain2species", pattern_gene="J%06i", output_format="nh", separator="|", output_filename_genes=None, keep_old_names=False, species_tree=False, merge_mode="ignore", ) (options, args) = E.Start(parser, add_pipe_options=True) ######################################################################## ######################################################################## ######################################################################## # read synonyms if options.filename_synonyms: infile = open(options.filename_synonyms, "r") map_strain2species = IOTools.ReadMap(infile) infile.close() else: map_strain2species = {} lines = map(lambda x: x[:-1], sys.stdin.readlines()) ninput, noutput, nskipped, nmerged = 0, 0, 0, 0 # iterate over chunks chunks = filter(lambda x: lines[x][0] == ">", range(len(lines))) if len(chunks) == 0: chunks = [0] chunks.append(len(lines)) if options.species_tree: processSpeciesTrees(chunks, lines, map_strain2species, options) else: processGeneTrees(chunks, lines, map_strain2species, options) E.Stop()
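# --- Editor's sketch (illustrative, not part of the original scripts) ---
# Both this script and matrix2stats above locate chunk boundaries by
# indexing lines that start with ">". The same grouping as a generator;
# a sketch assuming headers start with ">" and the first chunk may be
# header-less, with a hypothetical function name.
def iterateChunks(lines):
    """yield (header, chunk_lines) for each ">"-delimited block."""
    header, chunk = None, []
    for line in lines:
        if line.startswith(">"):
            if chunk or header is not None:
                yield header, chunk
            header, chunk = line[1:].strip(), []
        else:
            chunk.append(line)
    if chunk or header is not None:
        yield header, chunk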
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser( version= "%prog version: $Id: optic/plot_duplications.py 2782 2009-09-10 11:40:29Z andreas $", usage=globals()["__doc__"]) parser.add_option("-e", "--headers", dest="headers", action="store_true", help="first row is a header [ignored].") parser.add_option("-t", "--title", dest="title", type="string", help="page title.") parser.add_option("-f", "--footer", dest="footer", type="string", help="page footer.") parser.add_option("-c", "--contig-sizes", dest="filename_contig_sizes", type="string", help="filename with contig sizes.") parser.add_option("-r", "--radius", dest="radius", type="int", help="radius.") parser.add_option("-i", "--increment", dest="radius_increment", type="int", help="radius increment.") parser.add_option("-u", "--url", dest="url", type="string", help="string to build url for annotation.") parser.add_option("--min-contig", dest="min_contig_size", type="string", help="minimum contig size to delineate.") parser.add_option("--min-value", dest="min_value", type="float", help="minimum branch length.") parser.add_option("--max-value", dest="max_value", type="float", help="maximum branch length.") parser.set_defaults( filename_contig_sizes=None, headers=False, titles="", pattern_filename=None, title="", footer="", radius=3000, min_value=0.0, max_value=0.2, url=None, radius_increment=40, min_contig_size=10000, remove_empty_contigs=True, separator="|", quality2symbol={ 'CG': "circle", 'PG': "circle", 'SG': "circle" }, quality2mask=("RG", "CP", "PP", "SP", "RP", "CF", "PF", "SF", "UG", "UP", "UF", "BF", "UK"), sort_by_size=True, input_format="pairwise", ) (options, args) = E.Start(parser, add_pipe_options=True) if options.filename_contig_sizes: map_contig2size = IOTools.ReadMap(open(options.filename_contig_sizes, "r"), map_functions=(str, int)) # read data and get contigs that are used (i.e.: remove empty contigs) chrs = {} lines = sys.stdin.readlines() if options.remove_empty_contigs: for line in lines: if line[0] == "#": continue d = line[:-1].split("\t") cluster_id, in_locations, in_tree = d[:3] for l in in_locations.split(";"): gene_id, chr, strand, sbjct_from, sbjct_to = l.split(":") if chr not in map_contig2size: continue chrs[chr] = 1 for k in map_contig2size.keys(): if k not in chrs: del map_contig2size[k] k = map_contig2size.keys() if len(k) == 0: E.Stop() sys.exit(0) k.sort() if options.sort_by_size: k.sort(lambda x, y: cmp(map_contig2size[x], map_contig2size[y])) plot = DuplicationPlot(k, map_contig2size, num_entries=0) plot.mRadiusIncrement = options.radius_increment plot.mRadius = options.radius plot.mMaxValue = options.max_value plot.mMinValue = options.min_value if options.title: plot.setTitle(options.title) if options.footer: plot.setFooter(options.footer) plot.initializePlot() data = [] if options.input_format == "pairwise": # read data from pairwise analysis # format is: cluster_id, locations of duplications, tree of # duplications for line in lines: if line[0] == "#": continue d = line[:-1].split("\t") cluster_id, in_locations, in_tree = d[:3] mi, ma = 0, 0 found = False n = 0 chrs = {} for l in in_locations.split(";"): gene_id, chr, strand, sbjct_from, sbjct_to = l.split(":") if chr not in map_contig2size: continue chrs[chr] = 1 sbjct_from, sbjct_to = int(sbjct_from), int(sbjct_to) xi = plot.getPosition(chr, strand, sbjct_from) xa = plot.getPosition(chr, strand, sbjct_to) if not mi: mi = xi else: mi = 
min(mi, xi) n += 1 ma = max(ma, xa) found = True if not found: continue cis = len(chrs) == 1 if options.loglevel >= 2: options.stdlog.write( "# adding duplications in cluster %s: %s with tree %s\n" % (cluster_id, in_locations, in_tree)) data.append((cis, n, mi, ma, cluster_id, in_locations, in_tree)) data.sort() plot.mNumEntries = len(data) plot.initializePlot() last_ndups = 0 for cis, ndups, mi, ma, cluster_id, in_locations, in_tree in data[:]: if ndups != last_ndups: plot.pushRadius() plot.addSeparator() last_ndups = ndups map_gene2location = {} for l in in_locations.split(";"): gene_id, chr, strand, sbjct_from, sbjct_to = l.split(":") if chr not in map_contig2size: continue sbjct_from, sbjct_to = int(sbjct_from), int(sbjct_to) map_gene2location[gene_id] = (chr, strand, sbjct_from, sbjct_to) if not map_gene2location: continue tree = TreeTools.Newick2Tree(in_tree) # the last subset is all nodes again. s = TreeTools.GetSubsets(tree) is_first = True for children, height, branchlength in s[:-1]: if len(children) == 1: continue c = map(lambda x: x.split(options.separator), children) plot.addDuplication(c, map_gene2location, height, url=options.url, with_separator=is_first, link_to_previous=not is_first, quality2symbol=options.quality2symbol, quality2mask=options.quality2mask) is_first = False plot.writeToFile(sys.stdout) E.Stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser( version= "%prog version: $Id: r_compare_distributions.py 2782 2009-09-10 11:40:29Z andreas $" ) parser.add_option( "-m", "--method", dest="method", type="choice", help= "method to use: ks=Kolmogorov-Smirnov, mwu=Mann-WhitneyU, shapiro=Shapiro-Wilk, paired-mwu=paired Mann-WhitneyU, paired-t=paired t-test [default=%default]", choices=("ks", "mwu", "shapiro", "paired-mwu", "paired-t")) parser.add_option("-a", "--hardcopy", dest="hardcopy", type="string", help="write hardcopy to file.", metavar="FILE") parser.add_option("-1", "--infile1", dest="filename_input1", type="string", help="input filename for distribution 1.") parser.add_option("-2", "--infile2", dest="filename_input2", type="string", help="input filename for distribution 2.") parser.add_option("--plot-legend", dest="legend", type="string", help="legend for histograms.") parser.add_option("-f", "--infile-map", dest="filename_input_map", type="string", help="input filename for mapping categories to values.") parser.add_option( "-n", "--norm-test", dest="norm_test", action="store_true", help= """test if a set of values is normally distributed. Mean and variance are calculated from the data.""") parser.add_option("-b", "--num-bins", dest="num_bins", type="int", help="""number of bins (for plotting purposes only).""") parser.add_option("--bin-size", dest="bin_size", type="float", help="""bin size for plot.""") parser.add_option("--min-value", dest="min_value", type="float", help="""minimum value for plot.""") parser.add_option("--max-value", dest="max_value", type="float", help="""maximum value for plot.""") parser.add_option("--skip-plot", dest="plot", action="store_false", help="""skip plotting.""") parser.add_option("--header-names", dest="header", type="string", help="""header of value column [default=%default].""") parser.add_option("--title", dest="title", type="string", help="""plot title [default=%default].""") parser.set_defaults( method="ks", filename_input1=None, filename_input2=None, filename_input_map=None, legend=None, norm_test=False, num_bins=0, legend_range="2,2", bin_size=None, min_value=None, plot=True, header="value", title=None, ) (options, args) = E.Start(parser, add_pipe_options=True) kwargs = {} xargs = [] for arg in args: if "=" in arg: key, value = arg.split("=") kwargs[key] = value else: xargs.append(arg) if options.legend: options.legend = options.legend.split(",") map_category2value = {} if options.filename_input_map: map_category2value = IOTools.ReadMap(open(options.filename_input_map, "r"), map_functions=(str, float)) f = str else: f = float if options.filename_input1: infile1 = IOTools.openFile(options.filename_input1, "r") else: infile1 = sys.stdin values1, errors1 = IOTools.ReadList(infile1, map_function=f, map_category=map_category2value) if options.filename_input1: infile1.close() if errors1 and options.loglevel >= 3: options.stdlog.write("# errors in input1: %s\n" % ";".join(map(str, errors1))) if options.norm_test: mean = R.mean(values1) stddev = R.sd(values1) options.stdlog.write( "# creating %i samples from normal distribution with mean %f and stddev %f\n" % (len(values1), mean, stddev)) values2 = R.rnorm(len(values1), mean, stddev) errors2 = () else: values2, errors2 = IOTools.ReadList(open(options.filename_input2, "r"), map_function=f, map_category=map_category2value) if errors2 and options.loglevel >= 3: options.stdlog.write("# 
errors in input2: %s\n" % ";".join(map(str, errors2))) if options.loglevel >= 1: options.stdlog.write( "# ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i\n" % (len(values1), len(errors1), len(values2), len(errors2))) if options.method in ("paired-mwu", "paired-t"): if len(values1) != len(values2): raise ValueError( "number of values must be equal for paired tests.") if options.hardcopy: R.png(options.hardcopy, width=1024, height=768) if options.method == "ks": result = R.ks_test(values1, values2, *xargs, **kwargs) elif options.method == "mwu": result = R.wilcox_test(values1, values2, paired=False, correct=True, *xargs, **kwargs) elif options.method == "paired-mwu": result = R.wilcox_test(values1, values2, paired=True, correct=True, *xargs, **kwargs) elif options.method == "paired-t": result = R.t_test(values1, values2, paired=True, *xargs, **kwargs) elif options.method == "shapiro": if len(values1) > 5000: E.warn( "shapiro-wilk test only accepts < 5000 values, a random sample has been created." ) values1 = random.sample(values1, 5000) result = R.shapiro_test(values1, *xargs, **kwargs) if options.plot: R.assign("v1", values1) R.assign("v2", values2) if options.title: # set the size of the outer margins - the title needs to be added at the end # after plots have been created R.par(oma=R.c(0, 0, 4, 0)) R.layout(R.matrix((1, 2, 3, 4), 2, 2, byrow=True)) R.boxplot(values1, values2, col=('white', 'red'), main="Boxplot") R("""qqplot( v1, v2, main ='Quantile-quantile plot' ); lines( c(0,1), c(0,1) );""" ) # compute breaks: min_value = min(min(values1), min(values2)) if options.min_value is not None: min_value = min(min_value, options.min_value) max_value = max(max(values1), max(values2)) if options.max_value is not None: max_value = max(max_value, options.max_value) extra_options = "" if options.num_bins and not (options.min_value or options.max_value): extra_options += ", breaks=%i" % options.num_bins elif options.num_bins and (options.min_value or options.max_value): bin_size = float((max_value - min_value)) / (options.num_bins + 1) breaks = [ min_value + x * bin_size for x in range(options.num_bins) ] extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks)) elif options.bin_size is not None: num_bins = int(((max_value - min_value) / options.bin_size)) + 1 breaks = [ min_value + x * options.bin_size for x in range(num_bins + 1) ] extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks)) R("""h1 <- hist( v1, freq=FALSE, density=20, main='Relative frequency histogram' %s)""" % extra_options) R("""h2 <- hist( v2, freq=FALSE, add=TRUE, density=20, col='red', offset=0.5, angle=135 %s)""" % extra_options) if options.legend: R("""legend( ( max(c(h1$breaks[-1], h2$breaks[-1])) - min(c(h1$breaks[1], h2$breaks[1]) ) ) / 2, max( max(h1$density), max(h2$density)) / 2, c('%s'), fill=c('white','red'))""" % ("','".join(options.legend))) R("""h1 <- hist( v1, freq=TRUE, density=20, main='Absolute frequency histogram' %s)""" % extra_options) R("""h2 <- hist( v2, freq=TRUE, add=TRUE, density=20, col='red', offset=0.5, angle=135 %s )""" % extra_options) if options.legend: R("""legend( ( max(c(h1$breaks[-1], h2$breaks[-1])) - min(c(h1$breaks[1], h2$breaks[1]) ) ) / 2, max( max(h1$counts), max(h2$counts)) / 2, c('%s'), fill=c('white','red'))""" % ("','".join(options.legend))) if options.title: R.mtext(options.title, 3, outer=True, line=1, cex=1.5) if options.loglevel >= 1: options.stdout.write("## Results for %s\n" % result['method']) options.stdout.write("%s\t%s\n" % ("key", options.header)) for key in 
list(result.keys()): if key == "data.name": continue options.stdout.write("\t".join((key, str(result[key]))) + "\n") stat = Stats.Summary(values1) for key, value in list(stat.items()): options.stdout.write("%s1\t%s\n" % (str(key), str(value))) stat = Stats.Summary(values2) for key, value in list(stat.items()): options.stdout.write("%s2\t%s\n" % (str(key), str(value))) if options.plot: if options.hardcopy: R.dev_off() E.Stop()
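# --- Editor's sketch (illustrative, not part of the original script) ---
# The histogram calls above assemble an R "breaks" vector from either a
# bin count or a bin size. The same arithmetic in isolation, mirroring
# the two branches above that build explicit break lists; a sketch with
# a hypothetical function name, not a drop-in replacement.
def computeBreaks(min_value, max_value, num_bins=None, bin_size=None):
    """return a list of bin boundaries covering [min_value, max_value]."""
    if num_bins:
        # bin size derived from the requested number of bins
        bin_size = float(max_value - min_value) / (num_bins + 1)
        return [min_value + x * bin_size for x in range(num_bins)]
    elif bin_size is not None:
        # number of bins derived from the requested bin size
        num_bins = int((max_value - min_value) / bin_size) + 1
        return [min_value + x * bin_size for x in range(num_bins + 1)]
    return []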
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser( version= "%prog version: $Id: fasta2nj.py 2781 2009-09-10 11:33:14Z andreas $") parser.add_option( "-m", "--map", dest="filename_map", type="string", help="filename with mapping of species ids to swissprot species ids.") parser.set_defaults( separator="|", filename_map=None, ) (options, args) = E.Start(parser) if options.filename_map: map_species2sp = IOTools.ReadMap(open(options.filename_map, "r")) else: map_species2sp = {} ninput, noutput, nerrors = 0, 0, 0 for line in sys.stdin: if line[0] == ">": ninput += 1 id = re.match(">([^/ \t]+)", line[:-1]).groups()[0] data = id.split(options.separator) species = data[0] if len(data) == 2: gene = data[1] transcript = None elif len(data) >= 3: gene = data[2] transcript = data[1] else: gene = data[0] transcript = None if map_species2sp: try: species = map_species2sp[species] except KeyError: nerrors += 1 if options.loglevel >= 1: options.stdlog.write("# could not map species %s\n" % species) if transcript: options.stdout.write(">%s_%s GENEID=%s\n" % (transcript, species, gene)) else: options.stdout.write(">%s_%s\n" % (species, gene)) noutput += 1 else: options.stdout.write(line) if options.loglevel >= 1: options.stdlog.write("# ninput=%i, noutput=%i, nerrors=%i\n" % (ninput, noutput, nerrors)) E.Stop()
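# --- Editor's sketch (illustrative, not part of the original script) ---
# The identifier convention handled above is "species|gene" or
# "species|transcript|gene" with a configurable separator. A compact
# version of that dispatch; the function name is hypothetical.
def parseGPipeId(id, separator="|"):
    """return (species, transcript, gene); transcript is None for
    two-field identifiers, and gene falls back to the first field."""
    data = id.split(separator)
    if len(data) == 2:
        return data[0], None, data[1]
    elif len(data) >= 3:
        return data[0], data[1], data[2]
    return data[0], None, data[0]

# example: parseGPipeId("dmel|CG1234-RA|CG1234")
# -> ("dmel", "CG1234-RA", "CG1234")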
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser( version= "%prog version: $Id: r_mann_whitney_u.py 2782 2009-09-10 11:40:29Z andreas $" ) parser.add_option( "-m", "--method", dest="method", type="string", help="method to use [ks=Kolmogorov-Smirnov,mwu=Mann-WhitneyU]") parser.add_option("-a", "--hardcopy", dest="hardcopy", type="string", help="write hardcopy to file.", metavar="FILE") parser.add_option("-1", "--infile1", dest="filename_input1", type="string", help="input filename for distribution 1.") parser.add_option("-2", "--infile2", dest="filename_input2", type="string", help="input filename for distribution 2.") parser.add_option("-p", "--infile-map", dest="filename_input_map", type="string", help="input filename for mapping categories to values.") parser.set_defaults( method="ks", filename_input1=None, filename_input2=None, filename_input_map=None, ) (options, args) = E.Start( parser, add_pipe_options=True, add_psql_options=True, ) map_category2value = {} if options.filename_input_map: map_category2value = IOTools.ReadMap(open(options.filename_input_map, "r"), map_functions=(str, float)) values1, errors1 = IOTools.ReadList(open(options.filename_input1, "r"), map_category=map_category2value) values2, errors2 = IOTools.ReadList(open(options.filename_input2, "r"), map_category=map_category2value) E.info("ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i" % (len(values1), len(errors1), len(values2), len(errors2))) if options.hardcopy: R.png(options.hardcopy, width=1024, height=768) if options.method == "ks": result = R.ks_test(values1, values2) elif options.method == "mwu": result = R.wilcox_test(values1, values2, paired=False) R.assign("v1", values1) R.assign("v2", values2) R.layout(R.matrix((1, 2, 3, 4), 2, 2, byrow=True)) R.boxplot(values1, values2, col=('white', 'red'), main="Boxplot") R("""qqplot( v1, v2, main ='Quantile-quantile plot' ); lines( c(0,1), c(0,1) );""" ) R("""hist( v1, freq=FALSE, width=0.5, density=10, main='Relative frequency histogram')""" ) R("""hist( v2, freq=FALSE, add=TRUE, width=0.5, col='red', offset=0.5, density=20, angle=135)""" ) R("""hist( v1, freq=TRUE, width=0.5, density=10, main='Absolute frequency histogram')""" ) R("""hist( v2, freq=TRUE, add=TRUE, width=0.5, col='red', offset=0.5, density=20, angle=135)""" ) print "## Results for %s" % result['method'] for x in ['p.value', 'statistic', 'alternative', 'method']: print x, result[x] E.Stop()
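# --- Editor's sketch (illustrative, not part of the original script) ---
# The two-sample tests called via R above can also be run without rpy;
# a sketch using scipy.stats, which the original script does not use.
# Statistics and p-values may differ slightly from R because of
# different continuity-correction and tie-handling defaults.
from scipy import stats

def compareDistributions(values1, values2, method="ks"):
    """run a two-sample test and return the scipy result object."""
    if method == "ks":
        return stats.ks_2samp(values1, values2)
    elif method == "mwu":
        return stats.mannwhitneyu(values1, values2)
    raise ValueError("unknown method %s" % method)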
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser( version= "%prog version: $Id: shuffle_fasta.py 2782 2009-09-10 11:40:29Z andreas $" ) parser.add_option( "-c", "--codons", dest="codons", action="store_true", help="make sure that shuffled sequences only contain valid codons.") parser.add_option("-a", "--conserve-aminos", dest="conserve_aminos", action="store_true", help="conserve amino acids.") parser.add_option( "-b", "--bias", dest="bias", type="float", help= "introduce bias into codon usage choice. Complete bias is 1.0, while no bias is 0.0." ) parser.add_option( "-i", "--biased-codon-usage", dest="filename_biased_codon_usage", type="string", help="Filename with reference codon usage table for biased codon usage." ) parser.add_option( "-u", "--bulk-codon-usage", dest="filename_bulk_codon_usage", type="string", help= "Filename with reference codon usage table for unbiased codon usage.") parser.set_defaults( codons=False, conserve_aminos=False, bias=0.0, filename_biased_codon_usage=None, filename_bulk_codon_usage=None, stop_codons=("TAG", "TAA", "TGA"), precision=10000, ) (options, args) = E.Start(parser, add_pipe_options=True) iterator = FastaIterator.FastaIterator(sys.stdin) # get map of amino acids to codons map_aa2codons = Genomics.GetMapAA2Codons() # for codon based shuffling: build ranges based on strength of bias and on reference codon usage # Bias switches from completely biased to unbiased. Unbiased is uniform # usage. if options.filename_biased_codon_usage: map_codon2frequency = IOTools.ReadMap(open( options.filename_biased_codon_usage, "r"), map_functions=(str, float), has_header=True) if options.filename_bulk_codon_usage: map_codon2frequency_bulk = IOTools.ReadMap( open(options.filename_bulk_codon_usage, "r"), map_functions=(str, float), has_header=True) codon_ranges = {} for aa in map_aa2codons.keys(): c = [] x = 0 for codon in map_aa2codons[aa]: if options.filename_bulk_codon_usage: u = map_codon2frequency_bulk[codon] else: # uniform usage u = 1.0 / len(map_aa2codons[aa]) g = map_codon2frequency[codon] f = g + (u - g) * (1.0 - options.bias) x += f * options.precision c.append(x) codon_ranges[aa] = c while 1: cur_record = iterator.next() if cur_record is None: break sequence = re.sub(" ", "", cur_record.sequence) l = len(sequence) if options.conserve_aminos: n = [] for codon in [sequence[x:x + 3] for x in range(0, l, 3)]: aa = Genomics.MapCodon2AA(codon) if aa not in map_aa2codons: continue if options.bias or options.filename_biased_codon_usage: # get random number from 0 to precision v = random.randint(0, options.precision) # find the corresponding interval: ncodons = len(map_aa2codons[aa]) x = 0 while x < ncodons - 1: if v < codon_ranges[aa][x]: break x += 1 else: x = random.randint(0, len(map_aa2codons[aa]) - 1) n.append(map_aa2codons[aa][x]) sequence = "".join(n) else: sequence = list(sequence) if options.codons: while 1: random.shuffle(sequence) for codon in [sequence[x:x + 3] for x in range(0, l, 3)]: if codon in options.stop_codons: break else: break else: random.shuffle(sequence) sequence = "".join(sequence) options.stdout.write(">%s\n%s\n" % (cur_record.title, sequence)) E.Stop()
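# --- Editor's sketch (illustrative, not part of the original script) ---
# The cumulative "codon_ranges" lookup above is a roulette-wheel draw:
# each synonymous codon gets an interval proportional to an interpolation
# between the biased and the bulk (or uniform) frequency. The same draw
# expressed with bisect; a sketch with hypothetical names, assuming the
# frequency tables cover all codons passed in.
import bisect
import random

def sampleCodon(codons, biased, bulk, bias):
    """pick one codon from `codons`, interpolating between the bulk
    frequencies (bias=0.0) and the biased frequencies (bias=1.0)."""
    cumulative, x = [], 0.0
    for codon in codons:
        g, u = biased[codon], bulk[codon]
        # linear interpolation between biased (g) and bulk (u) usage
        x += g + (u - g) * (1.0 - bias)
        cumulative.append(x)
    v = random.uniform(0, cumulative[-1])
    return codons[bisect.bisect_left(cumulative, v)]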
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser( version= "%prog version: $Id: optic/annotate_clusters.py 2781 2009-09-10 11:33:14Z andreas $" ) parser.add_option( "-r", "--species-regex", dest="species_regex", type="string", help="regular expression to extract species from identifier.") parser.add_option( "--filename-map", dest="filename_map_id2cluster", type="string", help="filename with mapping information from id to cluster.") parser.add_option("--filename-interpro", dest="filename_interpro", type="string", help="filename with interpro domain information.") parser.add_option("--filename-pfam", dest="filename_pfam", type="string", help="filename with pfam domain information.") parser.set_defaults( master_species="dmel_vs_dmel4", separator="|", filename_map_id2cluster="input.map", filename_interpro="/home/andreas/projects/flies/data_1v5/interpro.list", filename_pfam="/home/andreas/projects/flies/data_1v5/pfam.list", write_no_annotation=True, separator_fields=";", ) (options, args) = E.Start(parser, add_psql_options=True, add_csv_options=True) clusters, nerrors = IOTools.ReadList(sys.stdin) map_id2cluster, map_cluster2id = IOTools.ReadMap(open( options.filename_map_id2cluster, "r"), both_directions=True) if len(clusters) == 0: clusters = map_cluster2id.keys() clusters.sort() map_id2interpro, map_id2pfam = {}, {} if options.filename_interpro: map_id2interpro = readAnnotationInterpro( open(options.filename_interpro, "r")) if options.filename_pfam: map_id2pfam = readAnnotationPfam(open(options.filename_pfam, "r")) ninput, noutput, nnomaster, nnoannotation = 0, 0, 0, 0 nskipped = 0 options.stdout.write("cluster\tgenes") if map_id2interpro: options.stdout.write("\tinterpro\tidescription") if map_id2pfam: options.stdout.write("\tpfam\tpdescription") options.stdout.write("\n") for cluster in clusters: ninput += 1 if cluster not in map_cluster2id: if options.loglevel >= 1: options.stdlog.write("# cluster %s not in map.\n" % cluster) nskipped += 1 continue genes = set() for id in map_cluster2id[cluster]: s, t, g, q = id.split(options.separator) if s != options.master_species: continue genes.add(g) if not genes: nnomaster += 1 continue annotations_interpro = {} if map_id2interpro: for gene in genes: if gene in map_id2interpro: for annotation in map_id2interpro[gene]: annotations_interpro[ annotation.mIdentifier] = annotation annotations_pfam = {} if map_id2pfam: for gene in genes: if gene in map_id2pfam: for annotation in map_id2pfam[gene]: annotations_pfam[annotation.mIdentifier] = annotation nannotations = max(len(annotations_pfam), len(annotations_interpro)) if nannotations == 0 and not options.write_no_annotation: nnoannotation += 1 continue options.stdout.write("%s\t%s" % (cluster, ";".join(genes))) if map_id2interpro: printAnnotations(options.stdout, annotations_interpro, options) if map_id2pfam: printAnnotations(options.stdout, annotations_pfam, options) options.stdout.write("\n") noutput += 1 if options.loglevel >= 1: options.stdlog.write( "# ninput=%i, noutput=%i, nskipped=%i, nnomaster=%i, nnoannotation=%i\n" % (ninput, noutput, nskipped, nnomaster, nnoannotation)) E.Stop()
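# --- Editor's sketch (illustrative, not part of the original script) ---
# The cluster annotation above keeps only genes from the master species;
# member ids have the four-field layout "schema|transcript|gene|quality"
# split above. A compact version of that filter; the function name is
# hypothetical.
def masterGenes(member_ids, master_species, separator="|"):
    """return the set of gene ids belonging to the master species."""
    genes = set()
    for id in member_ids:
        schema, transcript, gene, quality = id.split(separator)
        if schema == master_species:
            genes.add(gene)
    return genes

# example: masterGenes(["dmel_vs_dmel4|t1|g1|CG", "dpse|t2|g2|CG"],
#                      "dmel_vs_dmel4")
# -> set(["g1"])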