def flush_to_stdout(self, epoch):
    # NOTE(review): this definition is immediately shadowed by an identical
    # redefinition of flush_to_stdout later in the file, so this copy is dead
    # code -- one of the two duplicates should be deleted.
    #
    # Emit the accumulated training statistics as a single tab-separated
    # "STATS\t<json>" line on stdout, then reset the accumulators.
    stats = dict(self.base_stats)  # start from the run-constant base fields
    stats.update({"dts_h": util.dts(),  # timestamp string from project util
                  "epoch": epoch,
                  "n_egs_trained": self.n_egs_trained,
                  "elapsed_time": int(time.time()) - self.start_time,  # seconds since start
                  "train_cost": util.mean_sd(self.train_costs),  # presumably (mean, sd) pair -- defined in project util
                  "dev_cost": util.mean_sd(self.dev_costs),
                  "dev_acc": self.dev_accuracy})
    if self.norms:  # only include norms when something was collected
        stats.update({"norms": self.norms})
    print "STATS\t%s" % json.dumps(stats)
    sys.stdout.flush()  # make the line visible immediately to log followers
    self.reset()  # clear accumulators for the next reporting period
def flush_to_stdout(self, epoch): stats = dict(self.base_stats) stats.update({ "dts_h": util.dts(), "epoch": epoch, "n_egs_trained": self.n_egs_trained, "elapsed_time": int(time.time()) - self.start_time, "train_cost": util.mean_sd(self.train_costs), "dev_cost": util.mean_sd(self.dev_costs), "dev_acc": self.dev_accuracy }) if self.norms: stats.update({"norms": self.norms}) print "STATS\t%s" % json.dumps(stats) sys.stdout.flush() self.reset()
def main(): usage = 'usage: %prog [options] <cors file> <null file>' parser = OptionParser(usage) parser.add_option('--go_min', dest='go_min', type='int', default=10, help='Minimum number of genes assigned a GO term to consider enrichment of that term') parser.add_option('--go_max', dest='go_max', type='int', default=300, help='Maximum number of genes assigned a GO term to consider enrichment of that term') parser.add_option('-n', dest='null_samples', type='int', default=50, help='Number of null samples to obtain p-value [Default: %default]') (options,args) = parser.parse_args() if len(args) != 2: parser.error('Must provide correlations file') else: cors_file = args[0] null_file = args[1] # get genes, correlations correlations_genes = [] genes = [] for line in open(cors_file): a = line.split() correlations_genes.append((abs(float(a[1])),a[0])) genes.append(a[0]) correlations_genes.sort(reverse=True) # GO go_map, go_descs = read_go(set(genes)) consider_go = [go_term for go_term in go_map if options.go_min <= len(go_map[go_term]) <= options.go_max] # compute null GO term enrichments null_go_enrichments = {} header = '' samples = 0 for line in open(null_file): if line[0] == '>': if header: if samples < options.null_samples: process_null_sample(null_correlations_genes, null_go_enrichments, go_map, consider_go) samples += 1 header = line[1:].rstrip() null_correlations_genes = [] else: a = line.split() null_correlations_genes.append((abs(float(a[1])),a[0])) if samples < options.null_samples: process_null_sample(null_correlations_genes, null_go_enrichments, go_map, consider_go) samples += 1 for go_term in consider_go: null_out = open('null_%s_%d.txt' % (go_term,len(go_map[go_term])), 'w') print >> null_out, '\n'.join([str(enr) for enr in null_go_enrichments[go_term]]) null_out.close() # do stats output_cols = [] for go_term in consider_go: # compute enrichment enrichment = gsea_enrichment(correlations_genes, go_map[go_term]) # compute p-value using normal approximation 
#p_val = (1+len([e for e in geneset_size_enrichments[go_size] if e >= enrichment])) / float(options.num_shuffles) (mean, sd) = util.mean_sd(null_go_enrichments[go_term]) p_val = 1.0 - norm.cdf(enrichment, loc=mean, scale=sd) # output output_cols.append([go_term, enrichment, p_val, 99, len(go_map[go_term]), go_descs[go_term]]) # FDR multiple hypothesis correction p_values = [oc[2] for oc in output_cols] q_values = fdr.storey(p_values) for i in range(len(output_cols)): output_cols[i][3] = q_values[i] for oc in output_cols: print '%-12s %.3f %.3e %.3e %4d %s' % tuple(oc)