Python SetTools Examples

Programming Language: Python

Namespace/Package Name: CGAT

Class/Type: SetTools

Examples at hotexamples.com: 4

Python SetTools - 4 examples found. These are the top-rated real-world Python examples of CGAT.SetTools, extracted from open source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

unionIntersectionMatrix(1)

writeSets(1)

xuniqueCombinations(1)

Example #1

Show file

def writeOrthologSets(outfile,
                      nexus,
                      extract_species,
                      extract_gene,
                      options,
                      reference_tree=None,
                      method="strict",
                      outgroups=None):
    """output ortholog sets.

    A "strict" ortholog set contains exactly one gene for each species,
    while a "degenerate" ortholog set contains at least one gene for each
    species.

    One line per ortholog cluster is written to ``options.stdout``.
    A summary table (one line per species set) is written afterwards,
    either to the file ``options.filename_summary`` or, if that option
    is not set, to ``options.stdout`` after a ``//`` separator line.

    Arguments:

    outfile
        Not read by this function; kept for interface compatibility.
    nexus
        Object whose ``trees`` attribute is iterated. Each tree must
        provide ``name`` and ``get_taxa(node_id)``.
    extract_species
        Callable applied to each taxon; the result is tested for
        membership in the current species set.
    extract_gene
        Callable applied to each selected taxon to obtain its gene.
    options
        Object supplying ``column2org``, ``org2column``, ``enumeration``,
        ``species_set`` (for "explicit" enumeration), ``reroot``,
        ``loglevel``, ``stdout``, ``stdlog`` and ``filename_summary``.
    reference_tree
        Species tree; required for "monophyletic" enumeration.
    method
        Passed as ``selector`` to ``getOrthologNodes``. With "lineage",
        clusters sharing genes with an earlier cluster are dropped.
    outgroups
        Optional species names, translated via ``options.org2column``
        and handed to ``getOrthologNodes``.

    Raises:

    ValueError
        If "monophyletic" enumeration is requested without a
        ``reference_tree``.
    """

    ######################################################################
    # build species set to compare
    species = options.column2org
    nspecies = len(species)
    enumeration = options.enumeration
    sets = []

    if enumeration == "monophyletic":
        if not reference_tree:
            # a bare string is not a valid exception object; raise a
            # proper exception so that the message reaches the caller.
            raise ValueError(
                "please specify a species tree for monophyletic enumeration")
        for members, h1, h2 in TreeTools.GetSubsets(reference_tree):
            if len(members) > 1:
                sets.append(members)

    elif enumeration == "exhaustive":
        for size in range(2, nspecies):
            sets.extend(SetTools.xuniqueCombinations(species, size))
        sets.append(species)

    elif enumeration == "pairwise":
        for x in range(nspecies - 1):
            for y in range(x + 1, nspecies):
                sets.append((species[x], species[y]))

    elif enumeration == "full":
        sets.append(species)

    elif enumeration == "lineage":
        for s in species:
            sets.append((s, ))

    elif enumeration == "explicit":
        for size in range(2, len(options.species_set)):
            sets.extend(
                SetTools.xuniqueCombinations(options.species_set, size))
        sets.append(options.species_set)

    ######################################################################
    # build sets with positional information (species name -> column)
    xsets = [frozenset(options.org2column[org] for org in members)
             for members in sets]

    ######################################################################
    # collect outgroups
    if outgroups:
        noutgroups = set(options.org2column[org] for org in outgroups)
    else:
        noutgroups = None

    ######################################################################
    # loop over each tree and set
    # I did not see a way to loop a tree once for all sets without doing
    # complicated counting. The problem is that counting has to be stopped
    # at different tree heights for different sets.
    ninput, noutput, nempty, nskipped, nerrors = 0, 0, 0, 0, 0

    counts = [0] * len(sets)

    options.stdout.write(
        "nspecies\tname\tid\tcluster\tpattern\t%s\tnode_id\tmembers\n" %
        "\t".join(species))

    cluster_id = 0

    for tree in nexus.trees:

        ninput += 1
        ntotal_tree = 0

        if options.loglevel >= 3:
            options.stdlog.write("# processing tree %s\n" % tree.name)

        if options.reroot:
            rerootTree(tree, extract_species, options)

        # sn: numbered species set: 0,1,...
        # sl: literal species set: species1, species2, ...
        for c, (sn, sl) in enumerate(zip(xsets, sets)):

            ortholog_nodes = getOrthologNodes(tree,
                                              sn,
                                              options,
                                              selector=method,
                                              outgroups=noutgroups)
            ntotal_tree += len(ortholog_nodes)

            n = 0

            pattern = buildPattern(nspecies, sn)

            # check for inconsistent partitions (the same gene in different
            # ortholog clusters) within the current tree
            found_genes = set()
            ortho_sets = set()

            # reverse ortholog_node - work in top-down manner.
            ortholog_nodes.reverse()

            for node_id, members in ortholog_nodes:
                n += 1
                cluster_id += 1

                # A real list is required: the taxa are consumed twice
                # (gene extraction and the members column). A lazy
                # filter() object would be empty on the second pass.
                otus = [taxon for taxon in tree.get_taxa(node_id)
                        if extract_species(taxon) in sl]
                genes = set(map(extract_gene, otus))

                overlap = found_genes.intersection(genes)
                if overlap:

                    # only take largest cluster for lineage specific
                    # duplications
                    if method == "lineage":
                        continue

                    # NOTE(review): redundant/inconsistent clusters are
                    # counted and logged but still written below - confirm
                    # that this is the intended behaviour.
                    if frozenset(genes) in ortho_sets:
                        nskipped += 1
                        if options.loglevel >= 1:
                            options.stdlog.write(
                                "# %s: cluster %i: redundant node: %i - skipped because already present: %s\n"
                                % (tree.name, n, node_id, str(overlap)))
                    else:
                        nerrors += 1
                        if options.loglevel >= 1:
                            options.stdlog.write(
                                "# %s: cluster %i: inconsistent node: %i - the same gene in different clusters: %s\n"
                                % (tree.name, n, node_id, str(overlap)))

                found_genes.update(genes)
                ortho_sets.add(frozenset(genes))

                xpattern = buildPattern(nspecies, sn, members)

                options.stdout.write(
                    "%i\t%s\t%i\t%i\t%s\t%s\t%i\t%s\n" %
                    (len(sl), tree.name, n, cluster_id, "".join(pattern),
                     "\t".join(xpattern), node_id, ";".join(otus)))

            counts[c] += n

        if ntotal_tree == 0:
            nempty += 1
        else:
            noutput += 1

    if options.loglevel >= 1:
        options.stdout.write(
            "# ninput=%i, nempty=%i, noutput=%i, nskipped=%i, nerrors=%i\n" %
            (ninput, nempty, noutput, nskipped, nerrors))

    # write summary information
    opened_summary = bool(options.filename_summary)
    if opened_summary:
        summary = open(options.filename_summary, "w")
    else:
        summary = options.stdout
        summary.write("//\n")

    try:
        summary.write("cluster\tpattern\tcounts\t%s\n" % ("\t".join(species)))

        for c, sn in enumerate(xsets):
            pattern = buildPattern(nspecies, sn)
            summary.write(
                "%i\t%s\t%i\t%s\n" %
                (c, "".join(pattern), counts[c], "\t".join(pattern)))
    finally:
        # only close what was opened here, even if writing failed
        if opened_summary:
            summary.close()

Example #2

Show file

File: trees2sets.py Project: Charlie-George/cgat

def _enumerateSpeciesSets(options, reference_tree=None):
    """return the list of species sets selected by ``options.enumeration``.

    Raises ValueError if "monophyletic" enumeration is requested
    without a reference tree. An unrecognised enumeration yields an
    empty list.
    """
    species = options.column2org
    mode = options.enumeration
    result = []

    if mode == "monophyletic":
        if not reference_tree:
            raise ValueError(
                "please specify a species tree for monophyletic enumeration")
        for members, h1, h2 in TreeTools.GetSubsets(reference_tree):
            if len(members) > 1:
                result.append(members)

    elif mode == "exhaustive":
        # all combinations of size 2 .. n-1, followed by the full set
        for size in range(2, len(species)):
            result.extend(SetTools.xuniqueCombinations(species, size))
        result.append(species)

    elif mode == "pairwise":
        for first in range(len(species) - 1):
            for second in range(first + 1, len(species)):
                result.append((species[first], species[second]))

    elif mode == "full":
        result.append(species)

    elif mode == "lineage":
        result.extend((s,) for s in species)

    elif mode == "explicit":
        for size in range(2, len(options.species_set)):
            result.extend(
                SetTools.xuniqueCombinations(options.species_set, size))
        result.append(options.species_set)

    return result


def writeOrthologSets(outfile, nexus,
                      extract_species,
                      extract_gene,
                      options,
                      reference_tree=None,
                      method="strict",
                      outgroups=None):
    """output ortholog sets.

    A "strict" ortholog set contains exactly one gene for each species,
    while a "degenerate" ortholog set contains at least one gene for each
    species.

    For every tree in ``nexus.trees`` and every enumerated species set,
    the ortholog nodes reported by ``getOrthologNodes`` are written to
    ``options.stdout``, one cluster per line. A summary table with the
    number of clusters per species set follows; it goes to the file
    ``options.filename_summary`` if that is set, otherwise to
    ``options.stdout`` after a ``//`` line.

    The ``outfile`` argument is not read; it is kept so that existing
    callers continue to work. ``method`` is forwarded to
    ``getOrthologNodes`` as ``selector``; ``outgroups`` (species names)
    are translated to columns through ``options.org2column``.

    Raises ValueError if ``options.enumeration`` is "monophyletic" and
    no ``reference_tree`` is given.
    """

    ######################################################################
    # build species set to compare
    species = options.column2org
    nspecies = len(species)
    sets = _enumerateSpeciesSets(options, reference_tree)

    ######################################################################
    # build sets with positional information
    xsets = []
    for members in sets:
        xsets.append(frozenset([options.org2column[org] for org in members]))

    ######################################################################
    # collect outgroups
    noutgroups = None
    if outgroups:
        noutgroups = set([options.org2column[org] for org in outgroups])

    ######################################################################
    # loop over each tree and set
    # I did not see a way to loop a tree once for all sets without doing
    # complicated counting. The problem is that counting has to be stopped
    # at different tree heights for different sets.
    ninput, noutput, nempty, nskipped = 0, 0, 0, 0

    counts = [0] * len(sets)

    options.stdout.write(
        "nspecies\tname\tid\tcluster\tpattern\t%s\tnode_id\tmembers\n" % "\t".join(species))

    cluster_id = 0
    nerrors = 0

    for tree in nexus.trees:

        ninput += 1
        ntotal_tree = 0

        if options.loglevel >= 3:
            options.stdlog.write("# processing tree %s\n" % tree.name)

        if options.reroot:
            rerootTree(tree, extract_species, options)

        for c in range(len(xsets)):
            # numbered species set: 0,1,...
            sn = xsets[c]
            # literal species set: species1, species2, ...
            sl = sets[c]

            ortholog_nodes = getOrthologNodes(tree, sn, options, selector=method,
                                              outgroups=noutgroups)
            ntotal_tree += len(ortholog_nodes)

            n = 0

            pattern = buildPattern(nspecies, sn)

            # check for inconsistent partitions (the same gene in different
            # ortholog clusters) within the current tree
            found_genes = set()
            ortho_sets = set()

            # reverse ortholog_node - work in top-down manner.
            ortholog_nodes.reverse()

            for node_id, members in ortholog_nodes:
                n += 1
                cluster_id += 1

                # materialise the selection: it is used both to extract
                # the genes and to print the members column, which a
                # single-pass filter() iterator cannot serve.
                otus = [otu for otu in tree.get_taxa(node_id)
                        if extract_species(otu) in sl]
                genes = set(map(extract_gene, otus))

                shared = found_genes.intersection(genes)
                if shared:

                    # only take largest cluster for lineage specific
                    # duplications
                    if method == "lineage":
                        continue

                    # NOTE(review): the cluster is only counted and logged
                    # here; it is still written out below - confirm intent.
                    if frozenset(genes) in ortho_sets:
                        nskipped += 1
                        if options.loglevel >= 1:
                            options.stdlog.write("# %s: cluster %i: redundant node: %i - skipped because already present: %s\n" %
                                                 (tree.name, n, node_id, str(shared)))
                    else:
                        nerrors += 1
                        if options.loglevel >= 1:
                            options.stdlog.write("# %s: cluster %i: inconsistent node: %i - the same gene in different clusters: %s\n" %
                                                 (tree.name, n, node_id, str(shared)))

                found_genes = found_genes.union(genes)
                ortho_sets.add(frozenset(genes))

                xpattern = buildPattern(nspecies, sn, members)

                options.stdout.write("%i\t%s\t%i\t%i\t%s\t%s\t%i\t%s\n" % (
                    len(sl),
                    tree.name,
                    n,
                    cluster_id,
                    "".join(pattern),
                    "\t".join(xpattern),
                    node_id,
                    ";".join(otus)))

            counts[c] += n

        if ntotal_tree == 0:
            nempty += 1
        else:
            noutput += 1

    if options.loglevel >= 1:
        options.stdout.write("# ninput=%i, nempty=%i, noutput=%i, nskipped=%i, nerrors=%i\n" % (
            ninput, nempty, noutput, nskipped, nerrors))

    # write summary information
    if options.filename_summary:
        summary_file = open(options.filename_summary, "w")
        close_summary = True
    else:
        summary_file = options.stdout
        summary_file.write("//\n")
        close_summary = False

    try:
        summary_file.write(
            "cluster\tpattern\tcounts\t%s\n" % ("\t".join(species)))

        for c in range(len(xsets)):
            pattern = buildPattern(nspecies, xsets[c])
            summary_file.write("%i\t%s\t%i\t%s\n" % (c,
                                                     "".join(pattern),
                                                     counts[c],
                                                     "\t".join(pattern)))
    finally:
        # release the summary file even if writing raised
        if close_summary:
            summary_file.close()

Example #3

Show file

File: pipeline_genesets.py Project: gjaime/CGATPipelines

def buildGeneListMatrix(infiles, outfile):
    '''build a gene list matrix for simple pathway analysis
    based on hypergeometric test.

    A gene list is derived from a gene set by
    applying thresholds to the input data set. The
    thresholds are defined in the configuration file.

    Each file in `infiles` is read as a tab-separated table whose
    first column is the index. The track name is the file's basename
    passed through ``P.snip(..., ".tsv.gz")``. For each track, the
    parameters ``<track>_foreground_*`` and ``<track>_background_*``
    (``field``, ``min_threshold``, ``max_threshold``) are looked up
    in ``PARAMS`` via ``P.matchParameter``.

    Four files are written:

    * `outfile`: foreground gene sets
    * `outfile` + ".bg.tsv.gz": background gene sets
    * `outfile` + ".matrix.gz": union/intersection matrix of the
      foreground sets
    * `outfile` + ".bg.matrix.gz": union/intersection matrix of the
      background sets
    '''

    def selectGenes(genelist, track, section):
        '''return the set of index labels of `genelist` whose configured
        field lies within [min_threshold, max_threshold] (inclusive).

        `section` is either "foreground" or "background".
        '''
        prefix = "%s_%s" % (track, section)
        field = PARAMS[P.matchParameter("%s_field" % prefix)]
        min_threshold = PARAMS[P.matchParameter(
            "%s_min_threshold" % prefix)]
        max_threshold = PARAMS[P.matchParameter(
            "%s_max_threshold" % prefix)]

        E.info('%s: %s: %f <= %s <= %f' % (track,
                                           section,
                                           min_threshold,
                                           field,
                                           max_threshold))

        values = genelist[field]
        within = (values >= min_threshold) & (values <= max_threshold)
        return set(genelist[within].index)

    genesets = []
    backgrounds = []
    headers = []
    for infile in infiles:
        # close the input handle as soon as the table is loaded
        # instead of leaving it to the garbage collector.
        with IOTools.openFile(infile) as inf:
            genelist = pandas.read_csv(
                inf,
                index_col=0,
                sep='\t')

        track = P.snip(os.path.basename(infile), ".tsv.gz")
        headers.append(track)

        genesets.append(selectGenes(genelist, track, "foreground"))
        backgrounds.append(selectGenes(genelist, track, "background"))

        E.info("%s: fg=%i, bg=%i" % (track,
                                     len(genesets[-1]),
                                     len(backgrounds[-1])))

    E.info("writing gene list matrix")
    with IOTools.openFile(outfile, "w") as outf:
        SetTools.writeSets(outf, genesets, labels=headers)
    with IOTools.openFile(outfile + ".bg.tsv.gz", "w") as outf:
        SetTools.writeSets(outf, backgrounds, labels=headers)

    E.info("writing intersection/union matrix")
    # build set intersection matrix
    matrix = SetTools.unionIntersectionMatrix(genesets)
    with IOTools.openFile(outfile + ".matrix.gz", "w") as outf:
        IOTools.writeMatrix(outf, matrix, headers, headers)
    matrix = SetTools.unionIntersectionMatrix(backgrounds)
    with IOTools.openFile(outfile + ".bg.matrix.gz", "w") as outf:
        IOTools.writeMatrix(outf, matrix, headers, headers)

Example #4

Show file

def buildGeneListMatrix(infiles, outfile):
    '''build a gene list matrix for simple pathway analysis
    based on hypergeometric test.

    A gene list is derived from a gene set by
    applying thresholds to the input data set. The
    thresholds are defined in the configuration file.

    Arguments:

    infiles
        Tab-separated tables (first column used as index), one per
        track. The track name is derived from the file's basename
        via ``P.snip(..., ".tsv.gz")``.
    outfile
        Destination of the foreground gene sets. Three companion
        files are written next to it with the suffixes
        ".bg.tsv.gz" (background gene sets), ".matrix.gz" and
        ".bg.matrix.gz" (union/intersection matrices of the
        foreground and background sets).
    '''

    genesets = []
    backgrounds = []
    headers = []
    for infile in infiles:
        # read the table inside a ``with`` block so that the input
        # handle is closed deterministically once it has been parsed.
        with IOTools.openFile(infile) as inf:
            genelist = pandas.read_csv(inf,
                                       index_col=0,
                                       sep='\t')

        track = P.snip(os.path.basename(infile), ".tsv.gz")
        headers.append(track)

        # foreground: rows with min_threshold <= field <= max_threshold
        field = PARAMS[P.matchParameter("%s_foreground_field" % track)]
        min_threshold = PARAMS[P.matchParameter("%s_foreground_min_threshold" %
                                                track)]
        max_threshold = PARAMS[P.matchParameter("%s_foreground_max_threshold" %
                                                track)]
        in_range = ((genelist[field] >= min_threshold)
                    & (genelist[field] <= max_threshold))
        genesets.append(set(genelist[in_range].index))

        E.info('%s: foreground: %f <= %s <= %f' %
               (track, min_threshold, field, max_threshold))

        # background: same selection with the background parameters
        field = PARAMS[P.matchParameter("%s_background_field" % track)]
        min_threshold = PARAMS[P.matchParameter("%s_background_min_threshold" %
                                                track)]
        max_threshold = PARAMS[P.matchParameter("%s_background_max_threshold" %
                                                track)]

        E.info('%s: background: %f <= %s <= %f' %
               (track, min_threshold, field, max_threshold))
        in_range = ((genelist[field] >= min_threshold)
                    & (genelist[field] <= max_threshold))
        backgrounds.append(set(genelist[in_range].index))

        E.info("%s: fg=%i, bg=%i" %
               (track, len(genesets[-1]), len(backgrounds[-1])))

    E.info("writing gene list matrix")
    with IOTools.openFile(outfile, "w") as outf:
        SetTools.writeSets(outf, genesets, labels=headers)
    with IOTools.openFile(outfile + ".bg.tsv.gz", "w") as outf:
        SetTools.writeSets(outf, backgrounds, labels=headers)

    E.info("writing intersection/union matrix")
    # build set intersection matrix
    matrix = SetTools.unionIntersectionMatrix(genesets)
    with IOTools.openFile(outfile + ".matrix.gz", "w") as outf:
        IOTools.writeMatrix(outf, matrix, headers, headers)
    matrix = SetTools.unionIntersectionMatrix(backgrounds)
    with IOTools.openFile(outfile + ".bg.matrix.gz", "w") as outf:
        IOTools.writeMatrix(outf, matrix, headers, headers)