Example 1
def writeOrthologSets(outfile,
                      nexus,
                      extract_species,
                      extract_gene,
                      options,
                      reference_tree=None,
                      method="strict",
                      outgroups=None):
    """output ortholog sets.

    A "strict" ortholog set contains exactly one gene for each species,
    while a "degenerate" ortholog set contains at least one gene for each
    species. 
    """

    ######################################################################
    # build species set to compare
    sets = []
    species = options.column2org
    nspecies = len(species)
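
    # enumeration modes:
    #   monophyletic - species subsets defined by internal nodes of the reference tree
    #   exhaustive   - every combination of two or more species
    #   pairwise     - all species pairs
    #   full         - the complete species set only
    #   lineage      - each species on its own
    #   explicit     - combinations drawn from options.species_set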

    if options.enumeration == "monophyletic":
        if reference_tree:
            for members, h1, h2 in TreeTools.GetSubsets(reference_tree):
                if len(members) > 1:
                    sets.append(members)
        else:
            raise ValueError(
                "please specify a species tree for monophyletic enumeration")

    elif options.enumeration == "exhaustive":
        for x in range(2, len(species)):
            sets += list(SetTools.xuniqueCombinations(species, x))
        sets.append(species)

    elif options.enumeration == "pairwise":

        for x in range(len(species) - 1):
            for y in range(x + 1, len(species)):
                sets.append((species[x], species[y]))

    elif options.enumeration == "full":
        sets.append(species)

    elif options.enumeration == "lineage":
        for s in species:
            sets.append((s, ))

    elif options.enumeration == "explicit":
        for x in range(2, len(options.species_set)):
            sets += list(SetTools.xuniqueCombinations(options.species_set, x))
        sets.append(options.species_set)

    ######################################################################
    # build sets with positional information
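    # each species set is mapped to a frozenset of column indices (via
    # options.org2column); map_frozenset2set records its position in `sets`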
    xsets = []
    map_frozenset2set = {}
    for x in range(len(sets)):
        ss = frozenset(map(lambda x: options.org2column[x], sets[x]))
        xsets.append(ss)
        map_frozenset2set[ss] = x

    ######################################################################
    # collect outgroups
    if outgroups:
        noutgroups = set()
        for x in outgroups:
            noutgroups.add(options.org2column[x])
    else:
        noutgroups = None

    ######################################################################
    # loop over each tree and set
    # I did not see a way to traverse a tree only once for all sets without
    # complicated counting: the counting would have to stop at different tree
    # heights for different sets.
    ninput, noutput, nempty, nskipped = 0, 0, 0, 0

    counts = [0] * len(sets)

    options.stdout.write(
        "nspecies\tname\tid\tcluster\tpattern\t%s\tnode_id\tmembers\n" %
        "\t".join(species))

    cluster_id = 0
    nerrors = 0

    for tree in nexus.trees:

        ninput += 1
        ntotal_tree = 0

        if options.loglevel >= 3:
            options.stdlog.write("# processing tree %s\n" % tree.name)

        if options.reroot:
            rerootTree(tree, extract_species, options)

        for c in range(len(xsets)):
            # numbered species set: 0,1,...
            sn = xsets[c]
            # literal species set: species1, species2, ...
            sl = sets[c]

            ortholog_nodes = getOrthologNodes(tree,
                                              sn,
                                              options,
                                              selector=method,
                                              outgroups=noutgroups)
            ntotal_tree += len(ortholog_nodes)

            n = 0

            pattern = buildPattern(nspecies, sn)
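            # pattern: one flag per species column, marking membership in the
            # queried set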

            # check for inconsistent partitions (the same gene in different
            # ortholog clusters) within the current tree
            found_genes = set()
            ortho_sets = set()

            # reverse ortholog_nodes so that clusters are processed top-down.
            ortholog_nodes.reverse()

            for node_id, members in ortholog_nodes:
                n += 1
                cluster_id += 1

                otus = [x for x in tree.get_taxa(node_id)
                        if extract_species(x) in sl]
                genes = set(map(extract_gene, otus))

                if found_genes.intersection(genes):

                    # only take largest cluster for lineage specific
                    # duplications
                    if method == "lineage":
                        continue

                    if frozenset(genes) in ortho_sets:
                        nskipped += 1
                        if options.loglevel >= 1:
                            options.stdlog.write(
                                "# %s: cluster %i: redundant node: %i - skipped because already present: %s\n"
                                % (tree.name, n, node_id,
                                   str(found_genes.intersection(genes))))
                    else:
                        nerrors += 1
                        if options.loglevel >= 1:
                            options.stdlog.write(
                                "# %s: cluster %i: inconsistent node: %i - the same gene in different clusters: %s\n"
                                % (tree.name, n, node_id,
                                   str(found_genes.intersection(genes))))

                found_genes = found_genes.union(genes)
                ortho_sets.add(frozenset(genes))

                xpattern = buildPattern(nspecies, sn, members)
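                # xpattern: per-species pattern for the members actually found
                # under this node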

                options.stdout.write(
                    "%i\t%s\t%i\t%i\t%s\t%s\t%i\t%s\n" %
                    (len(sl), tree.name, n, cluster_id, "".join(pattern),
                     "\t".join(xpattern), node_id, ";".join(otus)))

            counts[c] += n

        if ntotal_tree == 0:
            nempty += 1
        else:
            noutput += 1

    if options.loglevel >= 1:
        options.stdout.write(
            "# ninput=%i, nempty=%i, noutput=%i, nskipped=%i, nerrors=%i\n" %
            (ninput, nempty, noutput, nskipped, nerrors))

    # write summary information

    if options.filename_summary:
        outfile = open(options.filename_summary, "w")
    else:
        outfile = options.stdout
        outfile.write("//\n")

    outfile.write("cluster\tpattern\tcounts\t%s\n" % ("\t".join(species)))
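    # one row per species set: index, pattern, number of ortholog clusters
    # found across all trees, and the pattern again as per-species columns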

    for c in range(len(xsets)):
        pattern = buildPattern(nspecies, xsets[c])
        outfile.write("%i\t%s\t%i\t%s\n" %
                      (c, "".join(pattern), counts[c], "\t".join(pattern)))

    if outfile != options.stdout:
        outfile.close()
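
For orientation, the species-set enumeration above can be reproduced with the
standard library. A minimal sketch, assuming SetTools.xuniqueCombinations
yields the same unique combinations as itertools.combinations; the function
name and species names below are made up for illustration.

import itertools

def enumerate_species_sets(species, mode):
    # sketch of the enumeration logic in writeOrthologSets (not the original helper)
    if mode == "pairwise":
        return list(itertools.combinations(species, 2))
    elif mode == "exhaustive":
        result = []
        for size in range(2, len(species)):
            result.extend(itertools.combinations(species, size))
        result.append(tuple(species))
        return result
    elif mode == "lineage":
        return [(s,) for s in species]
    elif mode == "full":
        return [tuple(species)]
    raise ValueError("unknown enumeration mode: %s" % mode)

# example: three species give three pairwise sets
print(enumerate_species_sets(["hsapiens", "mmusculus", "rnorvegicus"], "pairwise"))
# [('hsapiens', 'mmusculus'), ('hsapiens', 'rnorvegicus'), ('mmusculus', 'rnorvegicus')]
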
Example 2
def writeOrthologSets(outfile, nexus,
                      extract_species,
                      extract_gene,
                      options,
                      reference_tree=None,
                      method="strict",
                      outgroups=None):
    """output ortholog sets.

    A "strict" ortholog set contains exactly one gene for each species,
    while a "degenerate" ortholog set contains at least one gene for each
    species. 
    """

    ######################################################################
    # build species set to compare
    sets = []
    species = options.column2org
    nspecies = len(species)

    if options.enumeration == "monophyletic":
        if reference_tree:
            for members, h1, h2 in TreeTools.GetSubsets(reference_tree):
                if len(members) > 1:
                    sets.append(members)
        else:
            raise ValueError(
                "please specify a species tree for monophyletic enumeration")

    elif options.enumeration == "exhaustive":
        for x in range(2, len(species)):
            sets += list(SetTools.xuniqueCombinations(species, x))
        sets.append(species)

    elif options.enumeration == "pairwise":

        for x in range(len(species) - 1):
            for y in range(x + 1, len(species)):
                sets.append((species[x], species[y]))

    elif options.enumeration == "full":
        sets.append(species)

    elif options.enumeration == "lineage":
        for s in species:
            sets.append((s,))

    elif options.enumeration == "explicit":
        for x in range(2, len(options.species_set)):
            sets += list(SetTools.xuniqueCombinations(options.species_set, x))
        sets.append(options.species_set)

    ######################################################################
    # build sets with positional information
    xsets = []
    map_frozenset2set = {}
    for x in range(len(sets)):
        ss = frozenset(map(lambda x: options.org2column[x], sets[x]))
        xsets.append(ss)
        map_frozenset2set[ss] = x

    ######################################################################
    # collect outgroups
    if outgroups:
        noutgroups = set()
        for x in outgroups:
            noutgroups.add(options.org2column[x])
    else:
        noutgroups = None

    ######################################################################
    # loop over each tree and set
    # I did not see a way to traverse a tree only once for all sets without
    # complicated counting: the counting would have to stop at different tree
    # heights for different sets.
    ninput, noutput, nempty, nskipped = 0, 0, 0, 0

    counts = [0] * len(sets)

    options.stdout.write(
        "nspecies\tname\tid\tcluster\tpattern\t%s\tnode_id\tmembers\n" % "\t".join(species))

    cluster_id = 0
    nerrors = 0

    for tree in nexus.trees:

        ninput += 1
        ntotal_tree = 0

        if options.loglevel >= 3:
            options.stdlog.write("# processing tree %s\n" % tree.name)

        if options.reroot:
            rerootTree(tree, extract_species, options)

        for c in range(len(xsets)):
            # numbered species set: 0,1,...
            sn = xsets[c]
            # literal species set: species1, species2, ...
            sl = sets[c]

            ortholog_nodes = getOrthologNodes(tree, sn, options, selector=method,
                                              outgroups=noutgroups)
            ntotal_tree += len(ortholog_nodes)

            n = 0

            pattern = buildPattern(nspecies, sn)

            # check for inconsistent partitions (the same gene in different
            # ortholog clusters) within the current tree
            found_genes = set()
            ortho_sets = set()
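            # found_genes: genes already assigned to a cluster for this tree/set;
            # ortho_sets: gene sets already emitted (used to flag redundant nodes)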

            # reverse ortholog_nodes so that clusters are processed top-down.
            ortholog_nodes.reverse()

            for node_id, members in ortholog_nodes:
                n += 1
                cluster_id += 1

                otus = [x for x in tree.get_taxa(node_id)
                        if extract_species(x) in sl]
                genes = set(map(extract_gene, otus))

                if found_genes.intersection(genes):

                    # only take largest cluster for lineage specific
                    # duplications
                    if method == "lineage":
                        continue

                    if frozenset(genes) in ortho_sets:
                        nskipped += 1
                        if options.loglevel >= 1:
                            options.stdlog.write("# %s: cluster %i: redundant node: %i - skipped because already present: %s\n" %
                                                 (tree.name, n, node_id, str(found_genes.intersection(genes))))
                    else:
                        nerrors += 1
                        if options.loglevel >= 1:
                            options.stdlog.write("# %s: cluster %i: inconsistent node: %i - the same gene in different clusters: %s\n" %
                                                 (tree.name, n, node_id, str(found_genes.intersection(genes))))

                found_genes = found_genes.union(genes)
                ortho_sets.add(frozenset(genes))

                xpattern = buildPattern(nspecies, sn, members)

                options.stdout.write(
                    "%i\t%s\t%i\t%i\t%s\t%s\t%i\t%s\n" %
                    (len(sl), tree.name, n, cluster_id, "".join(pattern),
                     "\t".join(xpattern), node_id, ";".join(otus)))

            counts[c] += n

        if ntotal_tree == 0:
            nempty += 1
        else:
            noutput += 1

    if options.loglevel >= 1:
        options.stdout.write("# ninput=%i, nempty=%i, noutput=%i, nskipped=%i, nerrors=%i\n" % (
            ninput, nempty, noutput, nskipped, nerrors))

    # write summary information

    if options.filename_summary:
        outfile = open(options.filename_summary, "w")
    else:
        outfile = options.stdout
        outfile.write("//\n")

    outfile.write("cluster\tpattern\tcounts\t%s\n" % ("\t".join(species)))

    for c in range(len(xsets)):
        pattern = buildPattern(nspecies, xsets[c])
        outfile.write("%i\t%s\t%i\t%s\n" % (c,
                                            "".join(pattern),
                                            counts[c],
                                            "\t".join(pattern)))

    if outfile != options.stdout:
        outfile.close()
Example 3
def buildGeneListMatrix(infiles, outfile):
    '''build a gene list matrix for simple pathway analysis
    based on a hypergeometric test.

    A gene list is derived from a gene set by
    applying thresholds to the input data set. The
    thresholds are defined in the configuration file.
    '''

    genesets = []
    backgrounds = []
    headers = []
    for infile in infiles:
        genelist = pandas.read_csv(
            IOTools.openFile(infile),
            index_col=0,
            sep='\t')

        track = P.snip(os.path.basename(infile), ".tsv.gz")
        headers.append(track)

        field = PARAMS[P.matchParameter("%s_foreground_field" % track)]
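        # threshold keys are looked up per track in the configuration, e.g. for
        # a hypothetical track "deseq": deseq_foreground_min_threshold and
        # deseq_foreground_max_threshold (background keys follow the same scheme)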
        min_threshold = PARAMS[P.matchParameter(
            "%s_foreground_min_threshold" % track)]
        max_threshold = PARAMS[P.matchParameter(
            "%s_foreground_max_threshold" % track)]
        genesets.append(set(genelist[
            (genelist[field] >= min_threshold) &
            (genelist[field] <= max_threshold)].index))

        E.info('%s: foreground: %f <= %s <= %f' % (track,
                                                   min_threshold,
                                                   field,
                                                   max_threshold))

        field = PARAMS[P.matchParameter("%s_background_field" % track)]
        min_threshold = PARAMS[P.matchParameter(
            "%s_background_min_threshold" % track)]
        max_threshold = PARAMS[P.matchParameter(
            "%s_background_max_threshold" % track)]

        E.info('%s: background: %f <= %s <= %f' % (track,
                                                   min_threshold,
                                                   field,
                                                   max_threshold))
        backgrounds.append(set(genelist[
            (genelist[field] >= min_threshold) &
            (genelist[field] <= max_threshold)].index))

        E.info("%s: fg=%i, bg=%i" % (track,
                                     len(genesets[-1]),
                                     len(backgrounds[-1])))

    E.info("writing gene list matrix")
    with IOTools.openFile(outfile, "w") as outf:
        SetTools.writeSets(outf, genesets, labels=headers)
    with IOTools.openFile(outfile + ".bg.tsv.gz", "w") as outf:
        SetTools.writeSets(outf, backgrounds, labels=headers)

    E.info("writing intersection/union matrix")
    # build union/intersection matrices for foreground and background sets
    matrix = SetTools.unionIntersectionMatrix(genesets)
    with IOTools.openFile(outfile + ".matrix.gz", "w") as outf:
        IOTools.writeMatrix(outf, matrix, headers, headers)
    matrix = SetTools.unionIntersectionMatrix(backgrounds)
    with IOTools.openFile(outfile + ".bg.matrix.gz", "w") as outf:
        IOTools.writeMatrix(outf, matrix, headers, headers)
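
The foreground/background selection above is plain pandas boolean indexing. A
minimal, self-contained sketch with a made-up gene list; the column name and
thresholds are illustrative, not taken from the pipeline configuration.

import pandas

# made-up gene list indexed by gene id, with a single score column
genelist = pandas.DataFrame(
    {"padj": [0.001, 0.20, 0.04, 0.75]},
    index=["geneA", "geneB", "geneC", "geneD"])

min_threshold, max_threshold = 0.0, 0.05
foreground = set(genelist[
    (genelist["padj"] >= min_threshold) &
    (genelist["padj"] <= max_threshold)].index)

print(sorted(foreground))  # ['geneA', 'geneC']
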
Example 4
def buildGeneListMatrix(infiles, outfile):
    '''build a gene list matrix for simple pathway analysis
    based on a hypergeometric test.

    A gene list is derived from a gene set by
    applying thresholds to the input data set. The
    thresholds are defined in the configuration file.
    '''

    genesets = []
    backgrounds = []
    headers = []
    for infile in infiles:
        genelist = pandas.read_csv(IOTools.openFile(infile),
                                   index_col=0,
                                   sep='\t')

        track = P.snip(os.path.basename(infile), ".tsv.gz")
        headers.append(track)

        field = PARAMS[P.matchParameter("%s_foreground_field" % track)]
        min_threshold = PARAMS[P.matchParameter("%s_foreground_min_threshold" %
                                                track)]
        max_threshold = PARAMS[P.matchParameter("%s_foreground_max_threshold" %
                                                track)]
        genesets.append(
            set(genelist[(genelist[field] >= min_threshold)
                         & (genelist[field] <= max_threshold)].index))

        E.info('%s: foreground: %f <= %s <= %f' %
               (track, min_threshold, field, max_threshold))

        field = PARAMS[P.matchParameter("%s_background_field" % track)]
        min_threshold = PARAMS[P.matchParameter("%s_background_min_threshold" %
                                                track)]
        max_threshold = PARAMS[P.matchParameter("%s_background_max_threshold" %
                                                track)]

        E.info('%s: background: %f <= %s <= %f' %
               (track, min_threshold, field, max_threshold))
        backgrounds.append(
            set(genelist[(genelist[field] >= min_threshold)
                         & (genelist[field] <= max_threshold)].index))

        E.info("%s: fg=%i, bg=%i" %
               (track, len(genesets[-1]), len(backgrounds[-1])))

    E.info("writing gene list matrix")
    with IOTools.openFile(outfile, "w") as outf:
        SetTools.writeSets(outf, genesets, labels=headers)
    with IOTools.openFile(outfile + ".bg.tsv.gz", "w") as outf:
        SetTools.writeSets(outf, backgrounds, labels=headers)

    E.info("writing intersection/union matrix")
    # build union/intersection matrices for foreground and background sets
    matrix = SetTools.unionIntersectionMatrix(genesets)
    with IOTools.openFile(outfile + ".matrix.gz", "w") as outf:
        IOTools.writeMatrix(outf, matrix, headers, headers)
    matrix = SetTools.unionIntersectionMatrix(backgrounds)
    with IOTools.openFile(outfile + ".bg.matrix.gz", "w") as outf:
        IOTools.writeMatrix(outf, matrix, headers, headers)
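
The docstring refers to a downstream hypergeometric test. For orientation
only, a minimal sketch using scipy.stats.hypergeom with made-up
foreground/background/pathway sets; the test itself is not part of this
function.

from scipy.stats import hypergeom

# made-up sets for illustration
background = set("gene%i" % i for i in range(1000))  # population
foreground = set("gene%i" % i for i in range(50))    # genes passing the thresholds
pathway = set("gene%i" % i for i in range(25, 75))   # gene set being tested

k = len(foreground & pathway)          # observed overlap
M = len(background)                    # population size
n = len(pathway & background)          # pathway genes in the population
N = len(foreground)                    # number of draws
pvalue = hypergeom.sf(k - 1, M, n, N)  # P(overlap >= k)
print(k, pvalue)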