def writeOrthologSets(outfile, nexus,
                      extract_species,
                      extract_gene,
                      options,
                      reference_tree=None,
                      method="strict",
                      outgroups=None):
    """Output ortholog sets.

    A "strict" ortholog set contains exactly one gene for each species,
    while a "degenerate" ortholog set contains at least one gene for each
    species.

    One row per ortholog cluster is written to ``options.stdout``. A
    summary table with one row per species set follows; it is written to
    ``options.filename_summary`` if that option is set and to
    ``options.stdout`` otherwise.

    Arguments
    ---------
    outfile :
        Not used. Kept in the signature for backward compatibility.
    nexus :
        Object providing ``trees``. Each tree provides ``name`` and
        ``get_taxa(node_id)``.
    extract_species : callable
        Maps a taxon label to its species name.
    extract_gene : callable
        Maps a taxon label to its gene identifier.
    options :
        Option object. The attributes read here are ``column2org``,
        ``org2column``, ``enumeration``, ``species_set``, ``reroot``,
        ``loglevel``, ``stdlog``, ``stdout`` and ``filename_summary``.
    reference_tree :
        Species tree. Required for ``enumeration == "monophyletic"``.
    method : string
        Passed as ``selector`` to ``getOrthologNodes``. With "lineage",
        nodes sharing genes with an earlier node are dropped.
    outgroups :
        Optional iterable of species names, translated to column
        indices before being passed to ``getOrthologNodes``.

    Raises
    ------
    ValueError
        If monophyletic enumeration is requested without a reference
        tree.
    """

    ######################################################################
    # build species sets to compare
    sets = []
    species = options.column2org
    nspecies = len(species)

    if options.enumeration == "monophyletic":
        if not reference_tree:
            # a string is not a valid exception object - raise a real one
            raise ValueError(
                "please specify a species tree for monophyletic enumeration")
        for members, h1, h2 in TreeTools.GetSubsets(reference_tree):
            if len(members) > 1:
                sets.append(members)

    elif options.enumeration == "exhaustive":
        for size in range(2, len(species)):
            sets.extend(SetTools.xuniqueCombinations(species, size))
        sets.append(species)

    elif options.enumeration == "pairwise":
        for x in range(len(species) - 1):
            for y in range(x + 1, len(species)):
                sets.append((species[x], species[y]))

    elif options.enumeration == "full":
        sets.append(species)

    elif options.enumeration == "lineage":
        for s in species:
            sets.append((s,))

    elif options.enumeration == "explicit":
        for size in range(2, len(options.species_set)):
            sets.extend(
                SetTools.xuniqueCombinations(options.species_set, size))
        sets.append(options.species_set)

    ######################################################################
    # build sets with positional information (species -> column index)
    xsets = [frozenset(options.org2column[s] for s in sl) for sl in sets]

    ######################################################################
    # collect outgroups
    if outgroups:
        noutgroups = set(options.org2column[x] for x in outgroups)
    else:
        noutgroups = None

    ######################################################################
    # loop over each tree and set
    # I did not see a way to loop a tree once for all sets without doing
    # complicated counting. The problem is that counting has to be stopped
    # at different tree heights for different sets.
    ninput, noutput, nempty, nskipped = 0, 0, 0, 0
    nerrors = 0
    counts = [0] * len(sets)

    options.stdout.write(
        "nspecies\tname\tid\tcluster\tpattern\t%s\tnode_id\tmembers\n" %
        "\t".join(species))

    cluster_id = 0

    for tree in nexus.trees:

        ninput += 1
        ntotal_tree = 0

        if options.loglevel >= 3:
            options.stdlog.write("# processing tree %s\n" % tree.name)

        if options.reroot:
            rerootTree(tree, extract_species, options)

        # sn: numbered species set: 0,1,...
        # sl: literal species set: species1, species2, ...
        for c, (sn, sl) in enumerate(zip(xsets, sets)):

            ortholog_nodes = getOrthologNodes(tree, sn, options,
                                              selector=method,
                                              outgroups=noutgroups)
            ntotal_tree += len(ortholog_nodes)

            n = 0
            pattern = buildPattern(nspecies, sn)

            # check for inconsistent partitions (the same gene in different
            # ortholog clusters) within the current tree
            found_genes = set()
            ortho_sets = set()

            # reverse ortholog_nodes - work in top-down manner.
            ortholog_nodes.reverse()

            for node_id, members in ortholog_nodes:
                n += 1
                cluster_id += 1

                # Materialise as a list: it is consumed twice (gene
                # extraction and the members column); a lazy filter()
                # would be exhausted after the first use.
                otus = [taxon for taxon in tree.get_taxa(node_id)
                        if extract_species(taxon) in sl]
                genes = set(map(extract_gene, otus))

                shared = found_genes.intersection(genes)
                if shared:
                    # only take largest cluster for lineage specific
                    # duplications
                    if method == "lineage":
                        continue

                    if frozenset(genes) in ortho_sets:
                        nskipped += 1
                        if options.loglevel >= 1:
                            options.stdlog.write(
                                "# %s: cluster %i: redundant node: %i - skipped because already present: %s\n" %
                                (tree.name, n, node_id, str(shared)))
                    else:
                        nerrors += 1
                        if options.loglevel >= 1:
                            options.stdlog.write(
                                "# %s: cluster %i: inconsistent node: %i - the same gene in different clusters: %s\n" %
                                (tree.name, n, node_id, str(shared)))

                # NOTE(review): redundant/inconsistent nodes are only
                # counted and logged; they are still written below.
                found_genes = found_genes.union(genes)
                ortho_sets.add(frozenset(genes))

                xpattern = buildPattern(nspecies, sn, members)

                options.stdout.write(
                    "%i\t%s\t%i\t%i\t%s\t%s\t%i\t%s\n" %
                    (len(sl),
                     tree.name,
                     n,
                     cluster_id,
                     "".join(pattern),
                     "\t".join(xpattern),
                     node_id,
                     ";".join(otus)))

            counts[c] += n

        if ntotal_tree == 0:
            nempty += 1
        else:
            noutput += 1

    if options.loglevel >= 1:
        options.stdout.write(
            "# ninput=%i, nempty=%i, noutput=%i, nskipped=%i, nerrors=%i\n" %
            (ninput, nempty, noutput, nskipped, nerrors))

    ######################################################################
    # write summary information
    if options.filename_summary:
        summary = open(options.filename_summary, "w")
    else:
        summary = options.stdout

    try:
        summary.write("//\n")
        summary.write("cluster\tpattern\tcounts\t%s\n" % ("\t".join(species)))

        for c, sn in enumerate(xsets):
            pattern = buildPattern(nspecies, sn)
            summary.write("%i\t%s\t%i\t%s\n" % (c,
                                                "".join(pattern),
                                                counts[c],
                                                "\t".join(pattern)))
    finally:
        # close only a file opened here, never the caller's stdout
        if summary is not options.stdout:
            summary.close()
def writeOrthologSets(outfile, nexus,
                      extract_species,
                      extract_gene,
                      options,
                      reference_tree=None,
                      method="strict",
                      outgroups=None):
    """Output ortholog sets.

    A "strict" ortholog set contains exactly one gene for each species,
    while a "degenerate" ortholog set contains at least one gene for each
    species.

    The cluster table (one row per ortholog node) goes to
    ``options.stdout``. The per-set summary goes to the file named by
    ``options.filename_summary`` when that is set, else to
    ``options.stdout`` as well.

    Arguments
    ---------
    outfile :
        Ignored; retained so existing callers keep working.
    nexus :
        Container whose ``trees`` attribute is iterated; every tree must
        offer ``name`` and ``get_taxa(node_id)``.
    extract_species : callable
        Returns the species of a taxon label.
    extract_gene : callable
        Returns the gene of a taxon label.
    options :
        Options object; this function reads ``column2org``,
        ``org2column``, ``enumeration``, ``species_set``, ``reroot``,
        ``loglevel``, ``stdlog``, ``stdout`` and ``filename_summary``.
    reference_tree :
        Species tree, mandatory for "monophyletic" enumeration.
    method : string
        Forwarded to ``getOrthologNodes`` as ``selector``. For
        "lineage", a node overlapping previously seen genes is dropped.
    outgroups :
        Optional species names; converted to column numbers.

    Raises
    ------
    ValueError
        When "monophyletic" enumeration is requested but no reference
        tree was supplied.
    """

    species = options.column2org
    nspecies = len(species)
    enumeration = options.enumeration

    ######################################################################
    # build species set to compare
    sets = []

    if enumeration == "monophyletic":
        if not reference_tree:
            # raising a plain string is not valid; use a proper exception
            raise ValueError(
                "please specify a species tree for monophyletic enumeration")
        sets.extend(members
                    for members, h1, h2
                    in TreeTools.GetSubsets(reference_tree)
                    if len(members) > 1)

    elif enumeration == "exhaustive":
        for k in range(2, nspecies):
            sets.extend(SetTools.xuniqueCombinations(species, k))
        sets.append(species)

    elif enumeration == "pairwise":
        for i in range(nspecies - 1):
            for j in range(i + 1, nspecies):
                sets.append((species[i], species[j]))

    elif enumeration == "full":
        sets.append(species)

    elif enumeration == "lineage":
        sets.extend((s,) for s in species)

    elif enumeration == "explicit":
        explicit = options.species_set
        for k in range(2, len(explicit)):
            sets.extend(SetTools.xuniqueCombinations(explicit, k))
        sets.append(explicit)

    ######################################################################
    # build sets with positional information
    org2column = options.org2column
    xsets = [frozenset(org2column[name] for name in literal)
             for literal in sets]

    ######################################################################
    # collect outgroups
    noutgroups = None
    if outgroups:
        noutgroups = set(org2column[name] for name in outgroups)

    ######################################################################
    # loop over each tree and set
    # I did not see a way to loop a tree once for all sets without doing
    # complicated counting. The problem is that counting has to be stopped
    # at different tree heights for different sets.
    ninput = noutput = nempty = nskipped = nerrors = 0
    counts = [0] * len(sets)
    cluster_id = 0
    verbose = options.loglevel >= 1

    options.stdout.write(
        "nspecies\tname\tid\tcluster\tpattern\t%s\tnode_id\tmembers\n" %
        "\t".join(species))

    for tree in nexus.trees:
        ninput += 1
        ntotal_tree = 0

        if options.loglevel >= 3:
            options.stdlog.write("# processing tree %s\n" % tree.name)

        if options.reroot:
            rerootTree(tree, extract_species, options)

        for c in range(len(xsets)):
            # numbered species set: 0,1,...
            sn = xsets[c]
            # literal species set: species1, species2, ...
            sl = sets[c]

            ortholog_nodes = getOrthologNodes(tree, sn, options,
                                              selector=method,
                                              outgroups=noutgroups)
            ntotal_tree += len(ortholog_nodes)

            n = 0
            pattern = buildPattern(nspecies, sn)

            # check for inconsistent partitions (the same gene in different
            # ortholog clusters) within the current tree
            found_genes = set()
            ortho_sets = set()

            # reverse ortholog_node - work in top-down manner.
            ortholog_nodes.reverse()

            for node_id, members in ortholog_nodes:
                n += 1
                cluster_id += 1

                # A list, not filter(): the taxa are needed both for the
                # gene set and for the members column, and a lazy
                # iterator would be empty on the second pass.
                otus = [otu for otu in tree.get_taxa(node_id)
                        if extract_species(otu) in sl]
                genes = frozenset(map(extract_gene, otus))

                overlap = found_genes.intersection(genes)
                if overlap:
                    # only take largest cluster for lineage specific
                    # duplications
                    if method == "lineage":
                        continue

                    if genes in ortho_sets:
                        nskipped += 1
                        if verbose:
                            options.stdlog.write(
                                "# %s: cluster %i: redundant node: %i - skipped because already present: %s\n" %
                                (tree.name, n, node_id, str(overlap)))
                    else:
                        nerrors += 1
                        if verbose:
                            options.stdlog.write(
                                "# %s: cluster %i: inconsistent node: %i - the same gene in different clusters: %s\n" %
                                (tree.name, n, node_id, str(overlap)))

                # NOTE(review): flagged nodes are counted and logged but
                # still emitted in the table below.
                found_genes = found_genes.union(genes)
                ortho_sets.add(genes)

                xpattern = buildPattern(nspecies, sn, members)

                options.stdout.write(
                    "%i\t%s\t%i\t%i\t%s\t%s\t%i\t%s\n" %
                    (len(sl), tree.name, n, cluster_id,
                     "".join(pattern), "\t".join(xpattern),
                     node_id, ";".join(otus)))

            counts[c] += n

        if ntotal_tree == 0:
            nempty += 1
        else:
            noutput += 1

    if verbose:
        options.stdout.write(
            "# ninput=%i, nempty=%i, noutput=%i, nskipped=%i, nerrors=%i\n" %
            (ninput, nempty, noutput, nskipped, nerrors))

    ######################################################################
    # write summary information
    owns_summary = bool(options.filename_summary)
    if owns_summary:
        summary = open(options.filename_summary, "w")
    else:
        summary = options.stdout

    try:
        summary.write("//\n")
        summary.write("cluster\tpattern\tcounts\t%s\n" % ("\t".join(species)))
        for c in range(len(xsets)):
            pattern = buildPattern(nspecies, xsets[c])
            summary.write("%i\t%s\t%i\t%s\n" %
                          (c, "".join(pattern), counts[c],
                           "\t".join(pattern)))
    finally:
        # only close what was opened here; stdout belongs to the caller
        if owns_summary and summary is not options.stdout:
            summary.close()
def _selectThresholdedGenes(genelist, track, section):
    '''return the index labels of rows in *genelist* that pass a threshold.

    *section* is "foreground" or "background". The column to test and
    the inclusive lower/upper bounds are looked up in PARAMS under
    ``<track>_<section>_field``, ``<track>_<section>_min_threshold`` and
    ``<track>_<section>_max_threshold`` (resolved via P.matchParameter).
    '''
    field = PARAMS[P.matchParameter(
        "%s_%s_field" % (track, section))]
    min_threshold = PARAMS[P.matchParameter(
        "%s_%s_min_threshold" % (track, section))]
    max_threshold = PARAMS[P.matchParameter(
        "%s_%s_max_threshold" % (track, section))]

    E.info('%s: %s: %f <= %s <= %f' %
           (track, section, min_threshold, field, max_threshold))

    values = genelist[field]
    selected = genelist[(values >= min_threshold) &
                        (values <= max_threshold)]
    return set(selected.index)


def buildGeneListMatrix(infiles, outfile):
    '''build a gene list matrix for simple pathway analysis
    based on hypergeometric test.

    A gene list is derived from a gene set by
    applying thresholds to the input data set. The
    thresholds are defined in the configuration file.

    Each file in *infiles* is read as a tab-separated table whose
    first column is the index. The track name is the file's basename
    without ".tsv.gz". For every track a foreground and a background
    set of index labels is selected.

    Files written:

    * *outfile*: foreground sets
    * *outfile* + ".bg.tsv.gz": background sets
    * *outfile* + ".matrix.gz": union/intersection matrix of the
      foreground sets
    * *outfile* + ".bg.matrix.gz": union/intersection matrix of the
      background sets
    '''

    genesets = []
    backgrounds = []
    headers = []

    for infile in infiles:
        # open inside a context manager so that the input handle is
        # closed once the table has been parsed.
        with IOTools.openFile(infile) as inf:
            genelist = pandas.read_csv(inf, index_col=0, sep='\t')

        track = P.snip(os.path.basename(infile), ".tsv.gz")
        headers.append(track)

        genesets.append(
            _selectThresholdedGenes(genelist, track, "foreground"))
        backgrounds.append(
            _selectThresholdedGenes(genelist, track, "background"))

        E.info("%s: fg=%i, bg=%i" %
               (track, len(genesets[-1]), len(backgrounds[-1])))

    E.info("writing gene list matrix")
    with IOTools.openFile(outfile, "w") as outf:
        SetTools.writeSets(outf, genesets, labels=headers)
    with IOTools.openFile(outfile + ".bg.tsv.gz", "w") as outf:
        SetTools.writeSets(outf, backgrounds, labels=headers)

    E.info("writing intersection/union matrix")
    # build set intersection matrix
    matrix = SetTools.unionIntersectionMatrix(genesets)
    with IOTools.openFile(outfile + ".matrix.gz", "w") as outf:
        IOTools.writeMatrix(outf, matrix, headers, headers)
    matrix = SetTools.unionIntersectionMatrix(backgrounds)
    with IOTools.openFile(outfile + ".bg.matrix.gz", "w") as outf:
        IOTools.writeMatrix(outf, matrix, headers, headers)
def buildGeneListMatrix(infiles, outfile):
    '''build a gene list matrix for simple pathway analysis
    based on hypergeometric test.

    A gene list is derived from a gene set by
    applying thresholds to the input data set. The
    thresholds are defined in the configuration file.

    For each input file (tab-separated, first column used as index)
    the track name is the basename with ".tsv.gz" removed. The column
    to threshold and its inclusive bounds are read from PARAMS using
    the keys ``<track>_<section>_field``,
    ``<track>_<section>_min_threshold`` and
    ``<track>_<section>_max_threshold``, where section is
    "foreground" or "background".

    Output is written to *outfile* (foreground sets) and to *outfile*
    with the suffixes ".bg.tsv.gz" (background sets), ".matrix.gz"
    and ".bg.matrix.gz" (union/intersection matrices of foreground
    and background sets, respectively).
    '''

    genesets = []
    backgrounds = []
    headers = []

    # collected gene sets, keyed by the section name used in PARAMS
    collected = (("foreground", genesets), ("background", backgrounds))

    for infile in infiles:
        # the context manager closes the input handle after parsing;
        # handing an anonymous handle to pandas would leave it open.
        with IOTools.openFile(infile) as inf:
            genelist = pandas.read_csv(inf, index_col=0, sep='\t')

        track = P.snip(os.path.basename(infile), ".tsv.gz")
        headers.append(track)

        for section, target in collected:
            prefix = "%s_%s" % (track, section)
            field = PARAMS[P.matchParameter(prefix + "_field")]
            min_threshold = PARAMS[P.matchParameter(
                prefix + "_min_threshold")]
            max_threshold = PARAMS[P.matchParameter(
                prefix + "_max_threshold")]

            E.info('%s: %s: %f <= %s <= %f' %
                   (track, section, min_threshold, field, max_threshold))

            in_range = ((genelist[field] >= min_threshold) &
                        (genelist[field] <= max_threshold))
            target.append(set(genelist[in_range].index))

        E.info("%s: fg=%i, bg=%i" %
               (track, len(genesets[-1]), len(backgrounds[-1])))

    E.info("writing gene list matrix")
    with IOTools.openFile(outfile, "w") as outf:
        SetTools.writeSets(outf, genesets, labels=headers)
    with IOTools.openFile(outfile + ".bg.tsv.gz", "w") as outf:
        SetTools.writeSets(outf, backgrounds, labels=headers)

    E.info("writing intersection/union matrix")
    # build set intersection matrix
    matrix = SetTools.unionIntersectionMatrix(genesets)
    with IOTools.openFile(outfile + ".matrix.gz", "w") as outf:
        IOTools.writeMatrix(outf, matrix, headers, headers)
    matrix = SetTools.unionIntersectionMatrix(backgrounds)
    with IOTools.openFile(outfile + ".bg.matrix.gz", "w") as outf:
        IOTools.writeMatrix(outf, matrix, headers, headers)