def log_matrix(MATRIX):
    # Log-transform MATRIX (base 2) in place, but only when binreg does
    # not already consider its data logged.  Returns True when the
    # matrix was transformed and False when it was left untouched.
    from genomicode import jmath
    from genomicode import binreg

    needs_log = not binreg.is_logged_array_data(MATRIX)
    if needs_log:
        print("I will log the matrix.")
        MATRIX._X = jmath.log(MATRIX._X, base=2, safe=1)
    return needs_log
def log_matrices(names, matrices):
    # Walk the (name, matrix) pairs and log-transform (base 2), in
    # place, every matrix that binreg does not already consider logged.
    # One status line is printed per matrix.  Returns True if at least
    # one matrix was transformed, otherwise False.
    from genomicode import jmath
    from genomicode import binreg

    logged_any = False
    for name, matrix in zip(names, matrices):
        already_logged = binreg.is_logged_array_data(matrix)
        if already_logged:
            status = "I will not log %s." % name
        else:
            status = "I will log %s." % name
            matrix._X = jmath.log(matrix._X, base=2, safe=1)
            logged_any = True
        # The status is reported only after the transform has succeeded.
        print(status)
    sys.stdout.flush()
    return logged_any
def main():
    # Command-line entry point.  Reads the gene set(s), the background
    # gene set and the annotations named on the command line, scores
    # every annotation with annotate_genes(), applies the Bonferroni and
    # FDR corrections from jmath, and prints a tab-delimited table to
    # stdout with one row per annotation, ordered by decreasing
    # -log10(p value) (ties broken by annotation name).
    #
    # Invalid input is reported with AssertionError (or parser.error for
    # a bad -j value).  Takes no arguments and returns nothing.
    import argparse
    from genomicode import jmath

    parser = argparse.ArgumentParser(
        description="Annotate a gene set with Gene Ontology codes.")
    parser.add_argument(
        "-j", dest="num_procs", type=int, default=1,
        help="Number of jobs to run in parallel.")
    parser.add_argument(
        "--min_genes", type=int, default=0,
        help="Ignore annotations that do not have at least this number of "
        "genes.")

    group = parser.add_argument_group(title="Required arguments")
    group.add_argument(
        "--background",
        help="Genes are selected from this background geneset. "
        "Format: <gmx/gmt_file>,<geneset>")
    group.add_argument(
        "--geneset",
        help="Annotate this geneset. If multiple gene sets are provided, "
        "their genes will be combined. "
        "Format: <gmx/gmt_file>[,<geneset>,<geneset>,...]")
    group.add_argument(
        "--all_genesets",
        help="Use all gene sets in this file. Format: <gmx/gmt_file>")
    group.add_argument(
        "--ignore_genes_not_in_background", default=False,
        action="store_true",
        help="Ignore any gene in the gene set that is not in the background.")
    group.add_argument(
        "--annotation", default=None,
        help="A gene set file that contain the annotations for the genes. "
        "Format: <gmx/gmt_file>,<geneset>")

    group = parser.add_argument_group(title="Descriptors")
    group.add_argument(
        "--annotation_descriptor", default=None,
        help="A text file that contains more information about the "
        "annotations. "
        "One of the columns should contain descriptors that match the "
        "names of the annotations given in the annotation file. "
        "This is not used for the scoring, only for the output.")
    group.add_argument(
        "--gene_descriptor", default=None,
        help="A text file that contains alternate names for the genes. "
        "One of the columns should contain IDs that match geneset. "
        "gene_name_header should contain the name of the genes to show "
        "in the output file. This is only used for the output. "
        "Format: <filename>,<gene_name_header>")

    args = parser.parse_args()
    assert args.geneset or args.all_genesets, "Please specify a gene set."
    assert not (args.geneset and args.all_genesets)
    assert args.background, "Please specify a background gene set."
    assert args.annotation, "Please specify an annotation file."
    if args.num_procs < 1 or args.num_procs > 100:
        parser.error("Please specify between 1 and 100 processes.")
    assert args.min_genes >= 0, "Need a positive min_genes."

    # Read the gene sets.
    if args.geneset:
        x1 = read_geneset(args.geneset)
    else:
        assert args.all_genesets
        x1 = read_all_genesets(args.all_genesets)
    x2 = read_geneset(args.background)
    assert len(x1) >= 1
    assert len(x2) == 1
    background = x2[0][-1]
    assert background, "No genes in background."
    assert len(background) >= 10, "Very few genes in background."
    # Membership in the background is tested once per gene and per
    # annotation below, so do the lookups against a set.  The original
    # sequence is still what gets passed on to the helper functions.
    background_set = set(background)

    # Save the gene sets into separate gene sets.
    geneset2genes = {}
    for _filename, gs_name, genes in x1:
        geneset2genes[gs_name] = genes
    if args.ignore_genes_not_in_background:
        for gs_name, genes in list(geneset2genes.items()):
            kept = [g for g in genes if g in background_set]
            assert kept, \
                "All genes from %s are missing in background." % gs_name
            geneset2genes[gs_name] = kept

    # Combine the genes in the gene sets (unique, sorted).
    geneset = set()
    for genes in geneset2genes.values():
        geneset.update(genes)
    geneset = sorted(geneset)
    assert geneset, "No genes in gene set."

    # Make sure each gene is in the background.  Raise explicitly
    # rather than "assert False" so the check survives python -O.
    missing = [g for g in geneset if g not in background_set]
    if missing:
        if len(missing) == len(geneset):
            msg = "All genes from the geneset are missing in background."
        elif len(missing) <= 5:
            msg = "Genes are missing from the background: %s" % \
                ", ".join(missing)
        else:
            shown = ", ".join(missing[:5] + ["..."])
            msg = "%d genes are missing from the background: %s" % (
                len(missing), shown)
        raise AssertionError(msg)

    # Read the annotations.
    annot2genes = read_annotations(args.annotation)
    assert annot2genes, "No annotations read."

    # For each of the annotations, keep only the genes that show up in
    # our background list.  Annotations left without genes are dropped.
    clean = {}
    for annot, genes in annot2genes.items():
        kept = [g for g in genes if g in background_set]
        if kept:
            clean[annot] = kept
    if not clean:
        # Show a sample of both name spaces to help diagnose an ID
        # mismatch between the annotation file and the background.
        example = annot2genes[sorted(annot2genes)[0]]
        lines = [
            "None of the background genes are annotated.",
            "Genes look like: %s" % ",".join(example[:3]),
            "Background looks like: %s" % ",".join(background[:3]),
        ]
        raise AssertionError("\n".join(lines))
    annot2genes = clean

    # Filter out annotations that do not meet our min_genes criteria.
    if args.min_genes:
        annot2genes = {
            annot: genes for annot, genes in annot2genes.items()
            if len(genes) >= args.min_genes}
        assert annot2genes, "None of the annotations have >= %d genes." % \
            args.min_genes
    assert annot2genes, "No annotations."

    # Calculate the score for each of the annotations.  Each score is
    # unpacked below as
    #   (L1A1, L1A0, L0A1, L0A0, fe, nl10p, L1A1_genes).
    scores = annotate_genes(geneset, background, annot2genes, args.num_procs)
    annots = sorted(scores)

    # Do multiple hypothesis correction.  The corrected values are keyed
    # by annotation (not by nl10p) so that two annotations with the same
    # p-value can never overwrite each other's statistics.
    nl10ps = []
    for annot in annots:
        _, _, _, _, _, nl10p, _ = scores[annot]
        nl10ps.append(nl10p)
    p_values = [10 ** -x for x in nl10ps]
    bonferroni = jmath.cmh_bonferroni(p_values)
    fdr = jmath.cmh_fdr_bh(p_values)
    annot2stats = {}  # annot -> bonf, fdr
    for annot, b, f in zip(annots, bonferroni, fdr):
        annot2stats[annot] = b, f

    # Sort the annotations by decreasing nl10p, then by name.
    annot2nl10p = dict(zip(annots, nl10ps))
    all_annots = sorted(annots, key=lambda a: (-annot2nl10p[a], a))

    # Read the descriptors for output.  Both are optional;
    # annot_descriptors must default to an empty list so that the table
    # can still be produced when --annotation_descriptor is not given.
    gene2pretty = {}
    if args.gene_descriptor:
        gene2pretty = read_gene_descriptor(args.gene_descriptor, background)
    annot_descriptors = []
    if args.annotation_descriptor:
        annot_descriptors = read_annotation_descriptor(
            args.annotation_descriptor, all_annots)
    # Each descriptor is indexed as d[0] = column header and d[1] = one
    # value per annotation, in the order of all_annots.
    annot_headers = [d[0] for d in annot_descriptors]

    all_genesets = sorted(geneset2genes)
    geneset2members = {
        gs_name: set(genes) for gs_name, genes in geneset2genes.items()}
    header = annot_headers + [
        "Annotation", "Your Genes (With Ann)", "Your Genes (No Ann)",
        "Other Genes (With Ann)", "Other Genes (No Ann)",
        "Your Genes Annotated", "Other Genes Annotated",
        "Fold Enrichment", "neg log_10(p value)", "Bonferroni", "FDR",
        "neg log_10(Bonf)", "neg log_10(FDR)"] + \
        all_genesets
    print("\t".join(header))
    for i, annot in enumerate(all_annots):
        L1A1, L1A0, L0A1, L0A0, fe, nl10p, L1A1_genes = scores[annot]
        # NOTE(review): these divisions raise ZeroDivisionError when a
        # denominator is 0 -- presumably L0A1 + L0A0 is 0 when the gene
        # set covers the whole background; confirm whether that case
        # needs handling.
        perc_geneset = float(L1A1) / (L1A1 + L1A0)
        perc_background = float(L0A1) / (L0A1 + L0A0)
        bonf, fdr_value = annot2stats[annot]
        nl10bonf = -jmath.log(bonf, 10)
        nl10fdr = -jmath.log(fdr_value, 10)

        # Pull out the descriptors for the annotations.
        annot_info = [d[1][i] for d in annot_descriptors]

        # For each input gene set, list the genes that carry this
        # annotation, using the pretty names when available.
        geneset_list = []
        for gs_name in all_genesets:
            members = geneset2members[gs_name]
            pretty = [
                gene2pretty.get(g, g) for g in L1A1_genes if g in members]
            if gene2pretty:
                pretty.sort()
            geneset_list.append(" ".join(pretty))

        row = annot_info + [
            annot, L1A1, L1A0, L0A1, L0A0, perc_geneset, perc_background,
            fe, nl10p, bonf, fdr_value, nl10bonf, nl10fdr] + geneset_list
        assert len(row) == len(header), "%d %d" % (len(row), len(header))
        print("\t".join(map(str, row)))
def main():
    # Command-line entry point.  Does a PCA on the columns (samples) of
    # the matrix in <filename> and plots the first two principal
    # components to <outfile>.  Optionally also writes the scatter plot
    # in Prism format (--prism_file) and the principal components of the
    # columns (--col_pc_file) and of the rows (--row_pc_file) as
    # tab-delimited text.
    #
    # Bad command lines are reported through parser.error; other invalid
    # input raises AssertionError.  Takes no arguments and returns
    # nothing.
    from optparse import OptionParser, OptionGroup
    import numpy
    import arrayio
    from genomicode import jmath
    from genomicode import pcalib
    from genomicode import colorlib
    from genomicode import prismlib

    usage = "usage: %prog [options] filename outfile.png"
    parser = OptionParser(usage=usage, version="%prog 01")

    #parser.add_option(
    #    "-l", "--log_transform", default=False,
    #    action="store_true",
    #    help="Log transform the data first.")
    parser.add_option(
        "--num_header_cols", type=int,
        help="This number of columns are headers. If not given, will guess.")
    parser.add_option(
        "-g", "--genes", default=None, type="int",
        help="Number of genes to use.")
    parser.add_option(
        "--prism_file",
        help="Write the column principal components to a prism-formatted "
        "file.")
    parser.add_option(
        "--row_pc_file",
        help="Write the principal components of the rows to this file.")
    parser.add_option(
        "--col_pc_file",
        help="Write the principal components of the cols to this file.")
    #parser.add_option(
    #    "-v", "--verbose", default=False, action="store_true",
    #    help="")

    group = OptionGroup(parser, "Clustering")
    parser.add_option_group(group)
    group.add_option(
        "-c", "--cluster", default=[], action="append",
        help="Group samples into a cluster (e.g. -c 1-5); 1-based.")
    group.add_option(
        "--indexes_include_headers", "--iih", action="store_true",
        help="If not given (default), then index 1 is the first column "
        "with data. If given, then index 1 is the very first column "
        "in the file, including the headers.")
    group.add_option(
        "--cluster_file",
        help="A KGG format file of the clusters for the samples. "
        "Clusters in this file can be 0-based or 1-based.")

    group = OptionGroup(parser, "Visualization")
    parser.add_option_group(group)
    group.add_option(
        "--title", help="Put a title on the plot.")
    group.add_option(
        "--width", default=None, type="int",
        help="Width (in pixels) of the plot.")
    group.add_option(
        "--label", default=False, action="store_true",
        help="Label the samples.")
    group.add_option(
        "--label_axes", default=False, action="store_true",
        help="Label the axes.")
    group.add_option(
        "--scale_label", type=float, default=1.0,
        help="Scale the size of the labels.")

    # Parse the input arguments.
    options, args = parser.parse_args()
    if len(args) < 2:
        parser.error("Please specify an infile and an outfile.")
    elif len(args) > 2:
        parser.error("Too many input parameters (%d)." % len(args))
    filename, outfile = args
    if not os.path.exists(filename):
        parser.error("I could not find file %s." % filename)
    if options.num_header_cols is not None:
        assert options.num_header_cols > 0 and options.num_header_cols < 100
    if options.width is not None:
        assert options.width > 10, "too small"
        assert options.width < 4096 * 16, "width too big"
    assert options.scale_label > 0.01 and options.scale_label < 100
    # The --log_transform option is commented out above, so the
    # transform is always disabled.
    options.log_transform = False

    num_genes = options.genes
    #K = 10  # number of dimensions

    MATRIX = read_matrix(filename, options.num_header_cols)
    if options.log_transform:
        MATRIX._X = jmath.log(MATRIX._X, base=2, safe=1)
    assert MATRIX.nrow() and MATRIX.ncol(), "Empty matrix."

    cluster = None
    if options.cluster and options.cluster_file:
        parser.error("Cannot specify clusters and a cluster file.")
    if options.cluster:
        cluster = _parse_cluster(
            options.cluster, options.indexes_include_headers, MATRIX)
    if options.cluster_file:
        if not os.path.exists(options.cluster_file):
            parser.error(
                "I could not find cluster file: %s" % options.cluster_file)
        cluster = _parse_cluster_file(options.cluster_file, MATRIX)

    # Select a subset of the genes.
    if num_genes:
        assert MATRIX.ncol() > 1, "Not enough samples to select genes."
        I = pcalib.select_genes_var(MATRIX._X, num_genes)
        MATRIX = MATRIX.matrix(I, None)

    # Calculate the principal components and plot them.
    K = min(MATRIX.nrow(), MATRIX.ncol())
    principal_components, perc_var = pcalib.svd_project_cols(MATRIX._X, K)
    # The plot uses the first two components of every sample.  Fail with
    # a clear message instead of a bare IndexError when the matrix is
    # too small to provide them.
    assert all(len(pc) >= 2 for pc in principal_components), (
        "Need at least 2 principal components to plot "
        "(matrix is %d x %d)." % (MATRIX.nrow(), MATRIX.ncol()))
    X = [pc[0] for pc in principal_components]
    Y = [pc[1] for pc in principal_components]
    color = None
    if cluster is not None:
        color = pcalib.choose_colors(cluster)
    LABEL = None
    if options.label:
        LABEL = MATRIX.col_names(arrayio.COL_ID)
    assert not LABEL or len(LABEL) == len(X), "%d %d" % (len(X), len(LABEL))
    height = width = None
    if options.width is not None:
        # Height is fixed at 3/4 of the requested width.
        height, width = int(options.width * 0.75), options.width
    pcalib.plot_scatter(
        X, Y, outfile, group=cluster, color=color, title=options.title,
        label=LABEL, xlabel=options.label_axes, ylabel=options.label_axes,
        scale_label=options.scale_label, height=height, width=width)

    # Write out the scatter plot in Prism format: one series per
    # cluster, or a single series when no clusters were given.
    if options.prism_file:
        sample_names = MATRIX.col_names(arrayio.COL_ID)
        num_series = 1
        if cluster:
            num_series = max(cluster) + 1
        DATA = {}
        rownames = {}
        for i in range(num_series):
            series_name = "CLUSTER-%d" % (i + 1)
            xy = []
            n = []
            for j, pc in enumerate(principal_components):
                if cluster and cluster[j] != i:
                    continue
                xy.append([pc[0], pc[1]])
                n.append(sample_names[j])
            # Series without any samples are left out of the file.
            if xy:
                DATA[series_name] = xy
                rownames[series_name] = n
        prismlib.write_scatterplot(options.prism_file, DATA, rownames)

    # Write out the principal components of the columns (samples).
    if options.col_pc_file:
        assert cluster is None or len(cluster) == len(principal_components)
        sample_names = MATRIX.col_names(arrayio.COL_ID)
        pc_header = [
            "PC%02d (%.2f%%)" % (i, 100 * perc_var[i]) for i in range(K)]
        header = ["Index", "Sample", "Cluster", "Color"] + pc_header
        # "with" closes the file even if a row fails its sanity check.
        with open(options.col_pc_file, 'w') as handle:
            handle.write("\t".join(header) + "\n")
            for i, pc in enumerate(principal_components):
                c = ""
                if color and color[i] is not None:
                    c = colorlib.rgb2hex(color[i])
                clust = ""
                if cluster is not None and cluster[i] is not None:
                    clust = cluster[i]
                row = [i + 1, sample_names[i], clust, c] + list(pc)
                assert len(row) == len(header)
                handle.write("\t".join(map(str, row)) + "\n")

    # Look at the principal components on the rows.
    if options.row_pc_file:
        row_names = MATRIX.row_names()
        # One list of row annotations per row header, fetched once.
        row_annots = [MATRIX.row_names(name) for name in row_names]
        pc_header = [
            "PC%02d (%.2f%%)" % (i, 100 * perc_var[i]) for i in range(K)]
        header = ["Index"] + row_names + pc_header
        # U  nrow x k  columns are principal components
        # V  k x ncol  rows are principal components
        U, s, V = numpy.linalg.svd(MATRIX._X, full_matrices=False)
        with open(options.row_pc_file, 'w') as handle:
            handle.write("\t".join(header) + "\n")
            for i in range(len(U)):
                # Three values need three placeholders; with only two
                # the message itself raised TypeError.
                assert len(U[i]) == K, "%d %d %d" % (len(U), len(U[i]), K)
                annots = [values[i] for values in row_annots]
                row = [i + 1] + annots + list(U[i])
                assert len(row) == len(header)
                handle.write("\t".join(map(str, row)) + "\n")