def main():
    """Score GO-term enrichment for a correlations file and print a table.

    Command line: ``%prog [options] <cors file>``.

    Each line of the correlations file is split on whitespace: column 0 is
    used as the gene identifier and the absolute value of column 1 as its
    score.  Genes are ranked by that score, highest first.

    For every term in the map returned by ``read_go`` whose entry count lies
    in ``[--go_min, --go_max]``, the enrichment from ``gsea_enrichment`` is
    compared against a normal distribution whose mean and standard deviation
    come from ``interpolate_null`` over the null distributions built by
    ``make_null_dist``.  The resulting p-values are passed to ``fdr.storey``
    and one row per term is printed to stdout:

        term  enrichment  p-value  q-value  term size  description

    Exits through ``parser.error`` when the positional argument count is
    not exactly one.
    """
    usage = 'usage: %prog [options] <cors file>'
    parser = OptionParser(usage)
    parser.add_option('--go_min', dest='go_min', type='int', default=10,
                      help='Minimum number of genes assigned a GO term '
                           'to consider enrichment of that term')
    parser.add_option('--go_max', dest='go_max', type='int', default=300,
                      help='Maximum number of genes assigned a GO term '
                           'to consider enrichment of that term')
    parser.add_option('-n', dest='num_shuffles', type='int', default=250,
                      help='Number of shuffles to obtain p-value [Default: %default]')
    parser.add_option('-s', dest='size_skip', type='int', default=4,
                      help='Gene set sizes to skip when computing null '
                           'distributions [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        # parser.error() prints the usage message and exits.
        parser.error('Must provide correlations file')
    cors_file = args[0]

    # get genes, correlations
    correlations_genes = []
    genes = []
    # Context manager so the input handle is closed even if a line is
    # malformed and float()/indexing raises.
    with open(cors_file) as cors_in:
        for line in cors_in:
            a = line.split()
            correlations_genes.append((abs(float(a[1])), a[0]))
            genes.append(a[0])
    correlations_genes.sort(reverse=True)

    # GO
    go_map, go_descs = read_go(set(genes))

    # make null distributions
    geneset_size_enrichments = make_null_dist(options.go_min, options.go_max,
                                              options.num_shuffles,
                                              options.size_skip, go_map,
                                              correlations_genes, genes)

    # do stats
    output_cols = []
    for go_term in go_map:
        go_size = len(go_map[go_term])
        if not (options.go_min <= go_size <= options.go_max):
            continue

        # compute enrichment
        enrichment = gsea_enrichment(correlations_genes, go_map[go_term])

        # compute p-value using normal approximation
        (mean, sd) = interpolate_null(geneset_size_enrichments,
                                      options.size_skip, go_size)
        # norm.sf is the upper-tail probability computed directly; the
        # equivalent 1.0 - norm.cdf(...) cancels to exactly 0.0 once the
        # enrichment is a few standard deviations above the null mean.
        # The tiny random draw is kept as a floor for the cases that still
        # underflow.  loc/scale are given by keyword only: the first
        # positional parameter of norm.rvs is loc, so an extra positional
        # value would clash with loc=0.
        p_val = max(norm.rvs(loc=0, scale=1e-17),
                    norm.sf(enrichment, loc=mean, scale=sd))

        # output (index 3 is a placeholder until q-values are known)
        output_cols.append([go_term, enrichment, p_val, 99, go_size,
                            go_descs[go_term]])

    if not output_cols:
        # No GO term fell inside the size window: nothing to correct or print.
        return

    # FDR multiple hypothesis correction
    p_values = [oc[2] for oc in output_cols]
    q_values = fdr.storey(p_values)
    for oc, q_val in zip(output_cols, q_values):
        oc[3] = q_val

    for oc in output_cols:
        # Single parenthesised argument prints identically on Python 2 and 3.
        print('%-12s %.3f %.3e %.3e %4d %s' % tuple(oc))
def main():
    """Score GO-term enrichment against per-term null samples read from a file.

    Command line: ``%prog [options] <cors file> <null file>``.

    Each line of the correlations file is split on whitespace: column 0 is
    used as the gene identifier and the absolute value of column 1 as its
    score.  Genes are ranked by that score, highest first.

    The null file is a sequence of samples.  A line starting with ``>``
    opens a new sample; every other line is parsed like a correlations
    line.  At most ``-n`` samples are handed to ``process_null_sample``
    together with ``null_go_enrichments``, which is later indexed by GO
    term (presumably filled in by that helper -- it is defined elsewhere).

    For every GO term whose entry count lies in ``[--go_min, --go_max]``:

    * its null enrichments are written, one per line, to
      ``null_<term>_<size>.txt`` in the current directory;
    * its enrichment from ``gsea_enrichment`` is compared against a normal
      distribution parameterised by ``util.mean_sd`` of those null values.

    The p-values are passed to ``fdr.storey`` and one row per term is
    printed to stdout:

        term  enrichment  p-value  q-value  term size  description

    Exits through ``parser.error`` when the positional argument count is
    not exactly two or when the null file contains no usable sample.
    """
    usage = 'usage: %prog [options] <cors file> <null file>'
    parser = OptionParser(usage)
    parser.add_option('--go_min', dest='go_min', type='int', default=10,
                      help='Minimum number of genes assigned a GO term '
                           'to consider enrichment of that term')
    parser.add_option('--go_max', dest='go_max', type='int', default=300,
                      help='Maximum number of genes assigned a GO term '
                           'to consider enrichment of that term')
    parser.add_option('-n', dest='null_samples', type='int', default=50,
                      help='Number of null samples to obtain p-value [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 2:
        # Both positional arguments are required; say so in the message.
        parser.error('Must provide correlations file and null file')
    cors_file = args[0]
    null_file = args[1]

    # get genes, correlations
    correlations_genes = []
    genes = []
    with open(cors_file) as cors_in:
        for line in cors_in:
            a = line.split()
            correlations_genes.append((abs(float(a[1])), a[0]))
            genes.append(a[0])
    correlations_genes.sort(reverse=True)

    # GO
    go_map, go_descs = read_go(set(genes))
    consider_go = [go_term for go_term in go_map
                   if options.go_min <= len(go_map[go_term]) <= options.go_max]

    # compute null GO term enrichments
    null_go_enrichments = {}
    samples = 0
    # None means "no '>' header seen yet".  Tracking the open sample itself
    # (rather than the header text) keeps a sample with an empty header
    # name from being dropped when the next header arrives.
    null_correlations_genes = None
    with open(null_file) as null_in:
        for line in null_in:
            if line.startswith('>'):
                # A new header closes the previous sample, if there is one.
                if null_correlations_genes is not None and samples < options.null_samples:
                    process_null_sample(null_correlations_genes,
                                        null_go_enrichments, go_map,
                                        consider_go)
                    samples += 1
                null_correlations_genes = []
            else:
                if null_correlations_genes is None:
                    parser.error('Null file must begin with a ">" header line')
                a = line.split()
                null_correlations_genes.append((abs(float(a[1])), a[0]))

    if null_correlations_genes is None:
        parser.error('Null file contains no ">" sample headers')

    # The last sample has no following header to close it.
    if samples < options.null_samples:
        process_null_sample(null_correlations_genes, null_go_enrichments,
                            go_map, consider_go)
        samples += 1

    # Dump each term's null enrichments for inspection.
    for go_term in consider_go:
        null_name = 'null_%s_%d.txt' % (go_term, len(go_map[go_term]))
        with open(null_name, 'w') as null_out:
            null_out.write('\n'.join([str(enr) for enr in null_go_enrichments[go_term]]) + '\n')

    # do stats
    output_cols = []
    for go_term in consider_go:
        # compute enrichment
        enrichment = gsea_enrichment(correlations_genes, go_map[go_term])

        # compute p-value using normal approximation
        (mean, sd) = util.mean_sd(null_go_enrichments[go_term])
        # norm.sf is the upper-tail probability computed directly; the
        # equivalent 1.0 - norm.cdf(...) cancels to exactly 0.0 once the
        # enrichment is a few standard deviations above the null mean.
        p_val = norm.sf(enrichment, loc=mean, scale=sd)

        # output (index 3 is a placeholder until q-values are known)
        output_cols.append([go_term, enrichment, p_val, 99,
                            len(go_map[go_term]), go_descs[go_term]])

    if not output_cols:
        # No GO term fell inside the size window: nothing to correct or print.
        return

    # FDR multiple hypothesis correction
    p_values = [oc[2] for oc in output_cols]
    q_values = fdr.storey(p_values)
    for oc, q_val in zip(output_cols, q_values):
        oc[3] = q_val

    for oc in output_cols:
        # Single parenthesised argument prints identically on Python 2 and 3.
        print('%-12s %.3f %.3e %.3e %4d %s' % tuple(oc))
def main():
    """Score GO-term enrichment for a correlations file and print a table.

    Command line: ``%prog [options] <cors file>``.

    Each line of the correlations file is split on whitespace: column 0 is
    used as the gene identifier and the absolute value of column 1 as its
    score.  Genes are ranked by that score, highest first.

    For every term in the map returned by ``read_go`` whose entry count lies
    in ``[--go_min, --go_max]``, the enrichment from ``gsea_enrichment`` is
    compared against a normal distribution whose mean and standard deviation
    come from ``interpolate_null`` over the null distributions built by
    ``make_null_dist``.  The resulting p-values are passed to ``fdr.storey``
    and one row per term is printed to stdout:

        term  enrichment  p-value  q-value  term size  description

    Exits through ``parser.error`` when the positional argument count is
    not exactly one.
    """
    usage = 'usage: %prog [options] <cors file>'
    parser = OptionParser(usage)
    parser.add_option(
        '--go_min',
        dest='go_min',
        type='int',
        default=10,
        help='Minimum number of genes assigned a GO term '
        'to consider enrichment of that term')
    parser.add_option(
        '--go_max',
        dest='go_max',
        type='int',
        default=300,
        help='Maximum number of genes assigned a GO term '
        'to consider enrichment of that term')
    parser.add_option(
        '-n',
        dest='num_shuffles',
        type='int',
        default=250,
        help='Number of shuffles to obtain p-value [Default: %default]')
    parser.add_option(
        '-s',
        dest='size_skip',
        type='int',
        default=4,
        help='Gene set sizes to skip when computing null '
        'distributions [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        # parser.error() prints the usage message and exits.
        parser.error('Must provide correlations file')
    cors_file = args[0]

    # get genes, correlations
    correlations_genes = []
    genes = []
    # Context manager so the input handle is closed even if a line is
    # malformed and float()/indexing raises.
    with open(cors_file) as cors_in:
        for line in cors_in:
            a = line.split()
            correlations_genes.append((abs(float(a[1])), a[0]))
            genes.append(a[0])
    correlations_genes.sort(reverse=True)

    # GO
    go_map, go_descs = read_go(set(genes))

    # make null distributions
    geneset_size_enrichments = make_null_dist(
        options.go_min, options.go_max, options.num_shuffles,
        options.size_skip, go_map, correlations_genes, genes)

    # do stats
    output_cols = []
    for go_term in go_map:
        go_size = len(go_map[go_term])
        if not (options.go_min <= go_size <= options.go_max):
            continue

        # compute enrichment
        enrichment = gsea_enrichment(correlations_genes, go_map[go_term])

        # compute p-value using normal approximation
        (mean, sd) = interpolate_null(geneset_size_enrichments,
                                      options.size_skip, go_size)
        # norm.sf is the upper-tail probability computed directly; the
        # equivalent 1.0 - norm.cdf(...) cancels to exactly 0.0 once the
        # enrichment is a few standard deviations above the null mean.
        # The tiny random draw is kept as a floor for the cases that still
        # underflow.  loc/scale are given by keyword only: the first
        # positional parameter of norm.rvs is loc, so an extra positional
        # value would clash with loc=0.
        p_val = max(
            norm.rvs(loc=0, scale=1e-17),
            norm.sf(enrichment, loc=mean, scale=sd))

        # output (index 3 is a placeholder until q-values are known)
        output_cols.append(
            [go_term, enrichment, p_val, 99, go_size, go_descs[go_term]])

    if not output_cols:
        # No GO term fell inside the size window: nothing to correct or print.
        return

    # FDR multiple hypothesis correction
    p_values = [oc[2] for oc in output_cols]
    q_values = fdr.storey(p_values)
    for oc, q_val in zip(output_cols, q_values):
        oc[3] = q_val

    for oc in output_cols:
        # Single parenthesised argument prints identically on Python 2 and 3.
        print('%-12s %.3f %.3e %.3e %4d %s' % tuple(oc))