Code example #1 (score: 0)
File: gsea.py — project: radaniba/utility
def main():
    usage = 'usage: %prog [options] <cors file>'
    parser = OptionParser(usage)
    parser.add_option('--go_min', dest='go_min', type='int', default=10, help='Minimum number of genes assigned a GO term to consider enrichment of that term')
    parser.add_option('--go_max', dest='go_max', type='int', default=300, help='Maximum number of genes assigned a GO term to consider enrichment of that term')
    parser.add_option('-n', dest='num_shuffles', type='int', default=250, help='Number of shuffles to obtain p-value [Default: %default]')
    parser.add_option('-s', dest='size_skip', type='int', default=4, help='Gene set sizes to skip when computing null distributions [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide correlations file')
    else:
        cors_file = args[0]

    # get genes, correlations
    correlations_genes = []
    genes = []
    for line in open(cors_file):
        a = line.split()
        correlations_genes.append((abs(float(a[1])),a[0]))
        genes.append(a[0])
    correlations_genes.sort(reverse=True)

    # GO
    go_map, go_descs = read_go(set(genes))

    # make null distributions
    geneset_size_enrichments = make_null_dist(options.go_min, options.go_max, options.num_shuffles, options.size_skip, go_map, correlations_genes, genes)

    # do stats
    output_cols = []
    for go_term in go_map:
        go_size = len(go_map[go_term])
        if options.go_min <= go_size <= options.go_max:
            # compute enrichment
            enrichment = gsea_enrichment(correlations_genes, go_map[go_term])

            # compute p-value using normal approximation
            #p_val = (1+len([e for e in geneset_size_enrichments[go_size] if e >= enrichment])) / float(options.num_shuffles)
            (mean, sd) = interpolate_null(geneset_size_enrichments, options.size_skip, go_size)
            p_val = max(norm.rvs(1,loc=0,scale=1e-17), 1.0 - norm.cdf(enrichment, loc=mean, scale=sd))
            
            # output
            output_cols.append([go_term, enrichment, p_val, 99, go_size, go_descs[go_term]])

    # FDR multiple hypothesis correction
    p_values = [oc[2] for oc in output_cols]
    q_values = fdr.storey(p_values)
    for i in range(len(output_cols)):
        output_cols[i][3] = q_values[i]

    for oc in output_cols:
        print '%-12s %.3f %.3e %.3e %4d %s' % tuple(oc)
Code example #2 (score: 0)
File: cors_gsea.py — project: BioXiao/utility
def main():
    usage = 'usage: %prog [options] <cors file> <null file>'
    parser = OptionParser(usage)
    parser.add_option('--go_min', dest='go_min', type='int', default=10, help='Minimum number of genes assigned a GO term to consider enrichment of that term')
    parser.add_option('--go_max', dest='go_max', type='int', default=300, help='Maximum number of genes assigned a GO term to consider enrichment of that term')
    parser.add_option('-n', dest='null_samples', type='int', default=50, help='Number of null samples to obtain p-value [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 2:
        parser.error('Must provide correlations file')
    else:
        cors_file = args[0]
        null_file = args[1]

    # get genes, correlations
    correlations_genes = []
    genes = []
    for line in open(cors_file):
        a = line.split()
        correlations_genes.append((abs(float(a[1])),a[0]))
        genes.append(a[0])
    correlations_genes.sort(reverse=True)

    # GO
    go_map, go_descs = read_go(set(genes))
    consider_go = [go_term for go_term in go_map if options.go_min <= len(go_map[go_term]) <= options.go_max]

    # compute null GO term enrichments
    null_go_enrichments = {}
    header = ''
    samples = 0
    for line in open(null_file):
        if line[0] == '>':
            if header:
                if samples < options.null_samples:
                    process_null_sample(null_correlations_genes, null_go_enrichments, go_map, consider_go)
                    samples += 1
            header = line[1:].rstrip()
            null_correlations_genes = []            
        else:
            a = line.split()
            null_correlations_genes.append((abs(float(a[1])),a[0]))
    if samples < options.null_samples:
        process_null_sample(null_correlations_genes, null_go_enrichments, go_map, consider_go)
        samples += 1

    for go_term in consider_go:
        null_out = open('null_%s_%d.txt' % (go_term,len(go_map[go_term])), 'w')
        print >> null_out, '\n'.join([str(enr) for enr in null_go_enrichments[go_term]])
        null_out.close()

    # do stats
    output_cols = []
    for go_term in consider_go:
        # compute enrichment
        enrichment = gsea_enrichment(correlations_genes, go_map[go_term])

        # compute p-value using normal approximation
        #p_val = (1+len([e for e in geneset_size_enrichments[go_size] if e >= enrichment])) / float(options.num_shuffles)
        (mean, sd) = util.mean_sd(null_go_enrichments[go_term])
        p_val = 1.0 - norm.cdf(enrichment, loc=mean, scale=sd)

        # output
        output_cols.append([go_term, enrichment, p_val, 99, len(go_map[go_term]), go_descs[go_term]])

    # FDR multiple hypothesis correction
    p_values = [oc[2] for oc in output_cols]
    q_values = fdr.storey(p_values)
    for i in range(len(output_cols)):
        output_cols[i][3] = q_values[i]

    for oc in output_cols:
        print '%-12s %.3f %.3e %.3e %4d %s' % tuple(oc)
Code example #3 (score: 0)
def main():
    usage = 'usage: %prog [options] <cors file>'
    parser = OptionParser(usage)
    parser.add_option(
        '--go_min',
        dest='go_min',
        type='int',
        default=10,
        help=
        'Minimum number of genes assigned a GO term to consider enrichment of that term'
    )
    parser.add_option(
        '--go_max',
        dest='go_max',
        type='int',
        default=300,
        help=
        'Maximum number of genes assigned a GO term to consider enrichment of that term'
    )
    parser.add_option(
        '-n',
        dest='num_shuffles',
        type='int',
        default=250,
        help='Number of shuffles to obtain p-value [Default: %default]')
    parser.add_option(
        '-s',
        dest='size_skip',
        type='int',
        default=4,
        help=
        'Gene set sizes to skip when computing null distributions [Default: %default]'
    )
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide correlations file')
    else:
        cors_file = args[0]

    # get genes, correlations
    correlations_genes = []
    genes = []
    for line in open(cors_file):
        a = line.split()
        correlations_genes.append((abs(float(a[1])), a[0]))
        genes.append(a[0])
    correlations_genes.sort(reverse=True)

    # GO
    go_map, go_descs = read_go(set(genes))

    # make null distributions
    geneset_size_enrichments = make_null_dist(options.go_min, options.go_max,
                                              options.num_shuffles,
                                              options.size_skip, go_map,
                                              correlations_genes, genes)

    # do stats
    output_cols = []
    for go_term in go_map:
        go_size = len(go_map[go_term])
        if options.go_min <= go_size <= options.go_max:
            # compute enrichment
            enrichment = gsea_enrichment(correlations_genes, go_map[go_term])

            # compute p-value using normal approximation
            #p_val = (1+len([e for e in geneset_size_enrichments[go_size] if e >= enrichment])) / float(options.num_shuffles)
            (mean, sd) = interpolate_null(geneset_size_enrichments,
                                          options.size_skip, go_size)
            p_val = max(norm.rvs(1, loc=0, scale=1e-17),
                        1.0 - norm.cdf(enrichment, loc=mean, scale=sd))

            # output
            output_cols.append(
                [go_term, enrichment, p_val, 99, go_size, go_descs[go_term]])

    # FDR multiple hypothesis correction
    p_values = [oc[2] for oc in output_cols]
    q_values = fdr.storey(p_values)
    for i in range(len(output_cols)):
        output_cols[i][3] = q_values[i]

    for oc in output_cols:
        print '%-12s %.3f %.3e %.3e %4d %s' % tuple(oc)