Beispiel #1
0
def log_matrix(MATRIX):
    # Log-transform MATRIX in place unless binreg already considers
    # its data to be logged.  Returns True when the transform was
    # applied and False when the matrix was left untouched.
    from genomicode import jmath, binreg

    needs_log = not binreg.is_logged_array_data(MATRIX)
    if needs_log:
        print("I will log the matrix.")
        # Base-2 log; safe=1 is handed through to jmath.log unchanged.
        MATRIX._X = jmath.log(MATRIX._X, base=2, safe=1)
    return needs_log
Beispiel #2
0
def log_matrices(names, matrices):
    # Walk the (name, matrix) pairs and log-transform, in place, every
    # matrix that binreg does not already consider logged.  One status
    # line is printed per pair and stdout is flushed at the end.
    # Returns True if at least one matrix was transformed.
    from genomicode import jmath, binreg

    transformed = False
    for label, mat in zip(names, matrices):
        if binreg.is_logged_array_data(mat):
            status = "I will not log %s." % label
        else:
            status = "I will log %s." % label
            mat._X = jmath.log(mat._X, base=2, safe=1)
            transformed = True
        print(status)
    sys.stdout.flush()
    return transformed
def main():
    # Command-line entry point.  Reads the gene set(s), the background
    # gene set and the annotations, restricts the annotations to genes
    # in the background, scores each annotation with annotate_genes,
    # corrects the p-values with jmath.cmh_bonferroni and
    # jmath.cmh_fdr_bh, and prints one tab-delimited row per
    # annotation to stdout, ordered by decreasing neg log_10(p value).
    #
    # Raises AssertionError when required arguments are missing or the
    # inputs are inconsistent (e.g. genes missing from the
    # background).  Exits through parser.error when -j is outside
    # 1..100.
    import argparse
    from genomicode import jmath

    parser = argparse.ArgumentParser(
        description="Annotate a gene set with Gene Ontology codes.")
    parser.add_argument(
        "-j", dest="num_procs", type=int, default=1,
        help="Number of jobs to run in parallel.")
    parser.add_argument(
        "--min_genes", type=int, default=0,
        help="Ignore annotations that do not have at least this number of "
        "genes.")

    group = parser.add_argument_group(title="Required arguments")
    group.add_argument(
        "--background",
        help="Genes are selected from this background geneset.  "
        "Format: <gmx/gmt_file>,<geneset>")
    group.add_argument(
        "--geneset",
        help="Annotate this geneset.  If multiple gene sets are provided, "
        "their genes will be combined.  "
        "Format: <gmx/gmt_file>[,<geneset>,<geneset>,...]")
    group.add_argument(
        "--all_genesets",
        help="Use all gene sets in this file.  Format: <gmx/gmt_file>")
    group.add_argument(
        "--ignore_genes_not_in_background", default=False, action="store_true",
        help="Ignore any gene in the gene set that is not in the background.")
    group.add_argument(
        "--annotation", default=None,
        help="A gene set file that contain the annotations for the genes.  "
        "Format: <gmx/gmt_file>,<geneset>")

    group = parser.add_argument_group(title="Descriptors")
    group.add_argument(
        "--annotation_descriptor", default=None,
        help="A text file that contains more information about the "
        "annotations.  "
        "One of the columns should contain descriptors that match the "
        "names of the annotations given in the annotation file.  "
        "This is not used for the scoring, only for the output.")
    group.add_argument(
        "--gene_descriptor", default=None,
        help="A text file that contains alternate names for the genes.  "
        "One of the columns should contain IDs that match geneset.  "
        "gene_name_header should contain the name of the genes to show "
        "in the output file.  This is only used for the output.  "
        "Format: <filename>,<gene_name_header>")

    args = parser.parse_args()
    assert args.geneset or args.all_genesets, "Please specify a gene set."
    assert not (args.geneset and args.all_genesets)
    assert args.background, "Please specify a background gene set."
    assert args.annotation, "Please specify an annotation file."
    if args.num_procs < 1 or args.num_procs > 100:
        parser.error("Please specify between 1 and 100 processes.")
    assert args.min_genes >= 0, "Need a positive min_genes."

    # Read the gene sets.
    if args.geneset:
        x1 = read_geneset(args.geneset)
    else:
        assert args.all_genesets
        x1 = read_all_genesets(args.all_genesets)
    x2 = read_geneset(args.background)
    assert len(x1) >= 1
    assert len(x2) == 1
    background = x2[0][-1]
    assert background, "No genes in background."
    assert len(background) >= 10, "Very few genes in background."
    # Membership is tested once per gene below; use a set so each test
    # is O(1).  The original sequence is kept because it is sliced for
    # an error message and handed to the helper functions.
    background_set = set(background)

    # Save the gene sets into separate gene sets.
    geneset2genes = {}
    for filename, gs_name, genes in x1:
        geneset2genes[gs_name] = genes
    if args.ignore_genes_not_in_background:
        for gs_name, genes in list(geneset2genes.items()):
            kept = [g for g in genes if g in background_set]
            assert kept, \
                "All genes from %s are missing in background." % gs_name
            geneset2genes[gs_name] = kept
    # Combine the genes in the gene sets (unique, sorted).
    geneset = []
    for genes in geneset2genes.values():
        geneset.extend(genes)
    geneset = sorted(set(geneset))
    assert geneset, "No genes in gene set."

    # Make sure each gene is in the background.  These are raised
    # explicitly (rather than "assert False") so that the check is not
    # stripped when Python runs with -O.
    missing = [g for g in geneset if g not in background_set]
    if len(missing) == len(geneset):
        raise AssertionError(
            "All genes from the geneset are missing in background.")
    elif missing and len(missing) <= 5:
        raise AssertionError(
            "Genes are missing from the background: %s" % ", ".join(missing))
    elif missing:
        shown = ", ".join(missing[:5] + ["..."])
        raise AssertionError(
            "%d genes are missing from the background: %s" % (
                len(missing), shown))

    # Read the annotations.
    annot2genes = read_annotations(args.annotation)
    assert annot2genes, "No annotations read."

    # For each of the annotations, keep only the genes that show up in
    # our background list.  Annotations left with no genes are dropped.
    clean = {}
    for annot, genes in annot2genes.items():
        kept = [g for g in genes if g in background_set]
        if kept:
            clean[annot] = kept
    if not clean:
        # Show a sample of both ID spaces to help diagnose a mismatch.
        genes = annot2genes[sorted(annot2genes)[0]]
        lines = [
            "None of the background genes are annotated.",
            "Genes look like: %s" % ",".join(genes[:3]),
            "Background looks like: %s" % ",".join(background[:3])
            ]
        raise AssertionError("\n".join(lines))
    annot2genes = clean

    # Filter out annotations that do not meet our min_genes criteria.
    if args.min_genes:
        clean = {}
        for annot, genes in annot2genes.items():
            if len(genes) >= args.min_genes:
                clean[annot] = genes
        assert clean, "None of the annotations have >= %d genes." % \
               args.min_genes
        annot2genes = clean
    assert annot2genes, "No annotations."

    # Calculate the score for each of the annotations.  Each score is a
    # 7-tuple: L1A1, L1A0, L0A1, L0A0, fe, nl10p, L1A1_genes.
    scores = annotate_genes(geneset, background, annot2genes, args.num_procs)

    # Do multiple hypothesis correction.
    nl10ps = [score[5] for score in scores.values()]
    p_values = [10**-x for x in nl10ps]
    bonferroni = jmath.cmh_bonferroni(p_values)
    fdrs = jmath.cmh_fdr_bh(p_values)
    # nl10p -> p_value, bonf, fdr.  Annotations with an identical nl10p
    # share one entry (the last one seen).
    nl10p2stats = {}
    for n, p, b, f in zip(nl10ps, p_values, bonferroni, fdrs):
        nl10p2stats[n] = p, b, f

    # Sort the annotations by decreasing nl10p, ties broken by name.
    all_annots = sorted(scores, key=lambda a: (-scores[a][5], a))

    # Read the descriptors for output.  annot_descriptors must exist
    # even when --annotation_descriptor is not given; it was previously
    # left undefined in that case and the code below raised NameError.
    gene2pretty = {}
    if args.gene_descriptor:
        gene2pretty = read_gene_descriptor(args.gene_descriptor, background)
    annot_descriptors = []
    if args.annotation_descriptor:
        annot_descriptors = read_annotation_descriptor(
            args.annotation_descriptor, all_annots)

    annot_headers = [x[0] for x in annot_descriptors]
    all_genesets = sorted(geneset2genes)
    # Per-gene-set membership sets, built once for the output loop.
    geneset2members = {
        gs_name: set(genes) for gs_name, genes in geneset2genes.items()}
    header = annot_headers + [
        "Annotation",
        "Your Genes (With Ann)", "Your Genes (No Ann)",
        "Other Genes (With Ann)", "Other Genes (No Ann)",
        "Your Genes Annotated", "Other Genes Annotated",
        "Fold Enrichment", "neg log_10(p value)",
        "Bonferroni", "FDR", "neg log_10(Bonf)", "neg log_10(FDR)"] + \
        all_genesets
    print("\t".join(header))
    for i, annot in enumerate(all_annots):
        L1A1, L1A0, L0A1, L0A0, fe, nl10p, L1A1_genes = scores[annot]
        perc_geneset = float(L1A1) / (L1A1+L1A0)
        # NOTE(review): divides by zero if L0A1 + L0A0 == 0; confirm
        # whether annotate_genes can return that (e.g. when the gene
        # set covers the whole background).
        perc_background = float(L0A1) / (L0A1+L0A0)
        pvalue, bonf, fdr = nl10p2stats[nl10p]
        nl10bonf = -jmath.log(bonf, 10)
        nl10fdr = -jmath.log(fdr, 10)

        # Pull out the descriptors for the annotations.  Column values
        # are indexed by the annotation's position in all_annots.
        annot_info = [x[1][i] for x in annot_descriptors]

        # For each gene set, list its genes that are among L1A1_genes,
        # converted to pretty names.  The names are sorted only when
        # pretty names are in use.
        geneset_list = []
        for gs_name in all_genesets:
            members = geneset2members[gs_name]
            shown = [
                gene2pretty.get(g, g) for g in L1A1_genes if g in members]
            if gene2pretty:
                shown.sort()
            geneset_list.append(" ".join(shown))

        row = annot_info + [
            annot, L1A1, L1A0, L0A1, L0A0, perc_geneset, perc_background,
            fe, nl10p, bonf, fdr, nl10bonf, nl10fdr] + geneset_list
        assert len(row) == len(header), "%d %d" % (len(row), len(header))
        print("\t".join(map(str, row)))
Beispiel #4
0
def main():
    # Command-line entry point.  Reads a matrix, optionally selects a
    # subset of genes with pcalib.select_genes_var, projects the columns
    # onto their principal components with pcalib.svd_project_cols, and
    # plots the first two components to the output file.  Optionally
    # also writes a Prism-format scatter plot and tables of the column
    # and row principal components.
    #
    # Exits through parser.error on bad command-line usage and raises
    # AssertionError on out-of-range option values or an empty matrix.
    from optparse import OptionParser, OptionGroup
    import numpy
    import arrayio
    from genomicode import jmath
    from genomicode import pcalib
    from genomicode import colorlib
    from genomicode import prismlib

    # Does a PCA on the columns.
    usage = "usage: %prog [options] filename outfile.png"
    parser = OptionParser(usage=usage, version="%prog 01")

    #parser.add_option(
    #    "-l", "--log_transform", default=False,
    #    action="store_true",
    #    help="Log transform the data first.")

    parser.add_option(
        "--num_header_cols",
        type=int,
        help="This number of columns are headers.  If not given, will guess.")
    parser.add_option("-g",
                      "--genes",
                      default=None,
                      type="int",
                      help="Number of genes to use.")
    parser.add_option(
        "--prism_file",
        help="Write the column principal components to a prism-formatted "
        "file.")
    parser.add_option(
        "--row_pc_file",
        help="Write the principal components of the rows to this file.")
    parser.add_option(
        "--col_pc_file",
        help="Write the principal components of the cols to this file.")
    #parser.add_option(
    #    "-v", "--verbose", default=False, action="store_true",
    #    help="")

    group = OptionGroup(parser, "Clustering")
    parser.add_option_group(group)
    group.add_option(
        "-c",
        "--cluster",
        default=[],
        action="append",
        help="Group samples into a cluster (e.g. -c 1-5); 1-based.")
    group.add_option(
        "--indexes_include_headers",
        "--iih",
        action="store_true",
        help="If not given (default), then index 1 is the first column "
        "with data.  If given, then index 1 is the very first column "
        "in the file, including the headers.")
    group.add_option(
        "--cluster_file",
        help="A KGG format file of the clusters for the samples.  "
        "Clusters in this file can be 0-based or 1-based.")

    group = OptionGroup(parser, "Visualization")
    parser.add_option_group(group)
    group.add_option("--title", help="Put a title on the plot.")
    group.add_option("--width",
                     default=None,
                     type="int",
                     help="Width (in pixels) of the plot.")
    group.add_option("--label",
                     default=False,
                     action="store_true",
                     help="Label the samples.")
    group.add_option("--label_axes",
                     default=False,
                     action="store_true",
                     help="Label the axes.")
    group.add_option("--scale_label",
                     type=float,
                     default=1.0,
                     help="Scale the size of the labels.")

    # Parse the input arguments.
    options, args = parser.parse_args()
    if len(args) < 2:
        parser.error("Please specify an infile and an outfile.")
    elif len(args) > 2:
        parser.error("Too many input parameters (%d)." % len(args))
    filename, outfile = args
    if not os.path.exists(filename):
        parser.error("I could not find file %s." % filename)
    if options.num_header_cols is not None:
        assert options.num_header_cols > 0 and options.num_header_cols < 100
    if options.width is not None:
        assert options.width > 10, "too small"
        assert options.width < 4096 * 16, "width too big"
    assert options.scale_label > 0.01 and options.scale_label < 100
    # The --log_transform option is commented out above, so the log
    # transform below is currently always skipped.
    options.log_transform = False

    num_genes = options.genes
    #K = 10  # number of dimensions

    MATRIX = read_matrix(filename, options.num_header_cols)
    if options.log_transform:
        MATRIX._X = jmath.log(MATRIX._X, base=2, safe=1)
    assert MATRIX.nrow() and MATRIX.ncol(), "Empty matrix."

    cluster = None
    if options.cluster and options.cluster_file:
        parser.error("Cannot specify clusters and a cluster file.")
    if options.cluster:
        cluster = _parse_cluster(options.cluster,
                                 options.indexes_include_headers, MATRIX)
    if options.cluster_file:
        if not os.path.exists(options.cluster_file):
            parser.error("I could not find cluster file: %s" %
                         options.cluster_file)
        cluster = _parse_cluster_file(options.cluster_file, MATRIX)

    # Select a subset of the genes.
    if num_genes:
        assert MATRIX.ncol() > 1, "Not enough samples to select genes."
        I = pcalib.select_genes_var(MATRIX._X, num_genes)
        MATRIX = MATRIX.matrix(I, None)

    # Calculate the principal components and plot them.
    K = min(MATRIX.nrow(), MATRIX.ncol())
    principal_components, perc_var = pcalib.svd_project_cols(MATRIX._X, K)
    # NOTE(review): components 0 and 1 are indexed below, so this
    # presumably needs K >= 2 -- confirm against svd_project_cols.
    X = [pc[0] for pc in principal_components]
    Y = [pc[1] for pc in principal_components]
    color = None
    if cluster is not None:
        color = pcalib.choose_colors(cluster)
    LABEL = None
    if options.label:
        LABEL = MATRIX.col_names(arrayio.COL_ID)
    assert not LABEL or len(LABEL) == len(X), "%d %d" % (len(X), len(LABEL))
    height = width = None
    if options.width is not None:
        # Height is fixed at three quarters of the requested width.
        height, width = int(options.width * 0.75), options.width
    pcalib.plot_scatter(X,
                        Y,
                        outfile,
                        group=cluster,
                        color=color,
                        title=options.title,
                        label=LABEL,
                        xlabel=options.label_axes,
                        ylabel=options.label_axes,
                        scale_label=options.scale_label,
                        height=height,
                        width=width)

    # Write out the scatter plot in Prism format.
    if options.prism_file:
        # One series per cluster index 0..max(cluster), or a single
        # series holding every sample when there are no clusters.
        num_series = 1
        if cluster:
            num_series = max(cluster) + 1
        names = ["CLUSTER-%d" % (i + 1) for i in range(num_series)]
        sample_names = MATRIX.col_names(arrayio.COL_ID)
        DATA = {}
        rownames = {}
        for i in range(num_series):
            xy = []
            n = []
            for j, pc in enumerate(principal_components):
                if cluster and cluster[j] != i:
                    continue
                xy.append([pc[0], pc[1]])
                n.append(sample_names[j])
            # Series with no samples are left out of the file.
            if xy:
                DATA[names[i]] = xy
                rownames[names[i]] = n

        prismlib.write_scatterplot(options.prism_file, DATA, rownames)

    if options.col_pc_file:
        # Write out the principal components of the columns.  The
        # "with" block closes the file even if an assertion fails.
        with open(options.col_pc_file, 'w') as handle:
            assert cluster is None or len(cluster) == len(principal_components)
            x = ["PC%02d (%.2f%%)" % (i, 100 * perc_var[i]) for i in range(K)]
            header = ["Index", "Sample", "Cluster", "Color"] + x
            handle.write("\t".join(header) + "\n")
            sample_names = MATRIX.col_names(arrayio.COL_ID)
            for i in range(len(principal_components)):
                c = ""
                if color and color[i] is not None:
                    c = colorlib.rgb2hex(color[i])
                clust = ""
                if cluster is not None and cluster[i] is not None:
                    clust = cluster[i]
                x = [i + 1, sample_names[i], clust, c] + \
                    principal_components[i]
                assert len(x) == len(header)
                handle.write("\t".join(map(str, x)) + "\n")

    # Look at the principal components on the rows.
    if options.row_pc_file:
        with open(options.row_pc_file, 'w') as handle:
            row_names = MATRIX.row_names()
            x = ["PC%02d (%.2f%%)" % (i, 100 * perc_var[i]) for i in range(K)]
            header = ["Index"] + row_names + x
            handle.write("\t".join(header) + "\n")

            # U  nrow x k  columns are principal components
            # V  k x ncol  rows are principal components
            U, s, V = numpy.linalg.svd(MATRIX._X, full_matrices=False)
            # Row annotations, one list per header, fetched once.
            row_annots = [MATRIX.row_names(name) for name in row_names]
            for i in range(len(U)):
                # The message now has one placeholder per argument; the
                # original had two placeholders for three values.
                assert len(U[i]) == K, "%d %d %d" % (len(U), len(U[i]), K)
                n = [annots[i] for annots in row_annots]
                x = [i + 1] + n + list(U[i])
                assert len(x) == len(header)
                handle.write("\t".join(map(str, x)) + "\n")