コード例 #1
0
                 "background population. Please check.\n".format(overlap))

    assoc = read_associations(assoc_fn)

    methods = args.method.split(",")

    obo_dag = GODag(obo_file=args.obo)
    propagate_counts = not args.no_propagate_counts
    g = GOEnrichmentStudy(pop, assoc, obo_dag,
                          propagate_counts=propagate_counts,
                          alpha=args.alpha,
                          pvalcalc=args.pvalcalc,
                          methods=methods)
    results = g.run_study(study)
    if args.outfile is None:
        g.print_summary(results, min_ratio=min_ratio, indent=args.indent, pval=args.pval)
    else:
        # Users can print to both tab-separated file and xlsx file in one run.
        outfiles = args.outfile.split(",")
        prt_if = None # Print all values
        if args.pval is not None:
            # Only print out when uncorrected p-value < this value.
            prt_if = lambda nt: nt.p_uncorrected < args.pval
        for outfile in outfiles:
            if outfile.endswith(".xlsx"):
                g.wr_xlsx(outfile, results, prt_if=prt_if)
            else:
                g.wr_tsv(outfile, results, prt_if=prt_if)

# Copyright (C) 2010-2016, H Tang et al., All rights reserved.
コード例 #2
0
ファイル: find_enrichment.py プロジェクト: dangeles/goatools
    assoc = read_associations(assoc_fn)

    methods = args.method.split(",")
  
    if args.fdr:
        methods.append("fdr")

    obo_dag = GODag(obo_file=args.obo)
    propagate_counts = not args.no_propagate_counts
    g = GOEnrichmentStudy(pop, assoc, obo_dag,
                          propagate_counts=propagate_counts,
                          alpha=args.alpha,
                          methods=methods)
    results = g.run_study(study)
    if args.outfile is None:
        g.print_summary(results, min_ratio=min_ratio, indent=args.indent, pval=args.pval)
    else:
        # Users can print to both tab-separated file and xlsx file in one run.
        outfiles = args.outfile.split(",") 
        prt_if = None # Print all values
        if args.pval is not None:
            # Only print out when uncorrected p-value < this value.
            prt_if = lambda nt: nt.p_uncorrected < args.pval
        for outfile in outfiles:
            if outfile.endswith(".xlsx"):
                g.wr_xlsx(outfile, results, prt_if=prt_if)
            else:
                g.wr_tsv(outfile, results, prt_if=prt_if)
            
# Copyright (C) 2010-2016, H Tang et al., All rights reserved.
コード例 #3
0
ファイル: find_enrichment.py プロジェクト: yaoxingqi/goatools
    assoc = read_associations(assoc_fn)

    methods = args.method.split(",")

    obo_dag = GODag(obo_file=args.obo)
    propagate_counts = not args.no_propagate_counts
    g = GOEnrichmentStudy(pop, assoc, obo_dag,
                          propagate_counts=propagate_counts,
                          alpha=args.alpha,
                          pvalcalc=args.pvalcalc,
                          methods=methods)
    results = g.run_study(study)
    if args.outfile is None:
        g.print_summary(results, min_ratio=min_ratio, indent=args.indent, pval=args.pval)
    else:
        # Users can print to both tab-separated file and xlsx file in one run.
        outfiles = args.outfile.split(",")
        if args.pval is not None:
            # Only keep results whose uncorrected p-value is <= this value.
            num_orig = len(results)
            results = [r for r in results if r.p_uncorrected <= args.pval]
            sys.stdout.write("{N:7,} of {M:,} results have uncorrected P-values <= {PVAL}=pval\n".format(
                N=len(results), M=num_orig, PVAL=args.pval))
        for outfile in outfiles:
            if outfile.endswith(".xlsx"):
                g.wr_xlsx(outfile, results, indent=args.indent)
            else:
                g.wr_tsv(outfile, results, indent=args.indent)

# Copyright (C) 2010-2018, H Tang et al. All rights reserved.
コード例 #4
0
def goea(gene_ids, gene_symbols, trajectory, cluster, out_dir, alpha=0.05):
    """Run a GO enrichment analysis for one cluster of genes and save it.

    The gene symbols are mapped to the gene ids of the
    ``GENEID2NT`` background table (NCBI taxid 9606, protein-coding genes),
    the matching ids are used as the study set of a goatools
    ``GOEnrichmentStudy`` corrected with ``fdr_bh``, and the significant
    results are written to ``out_dir``.

    Files written (``<prefix>`` is ``out_dir + '/' + trajectory[-8:] +
    'cluster ' + str(cluster)``):

    * ``<prefix>genes.txt``         -- one study gene id per line
    * ``<prefix>goea_symbols.xlsx`` -- significant results, genes as symbols
    * ``<prefix>goea_geneids.xlsx`` -- significant results, genes as ids

    Files read, relative to the current working directory:
    ``goea/go-basic.obo`` and ``goea/gene2go``.

    Args:
        gene_ids: Currently unused; kept so existing callers keep working.
        gene_symbols: Iterable of gene symbols that make up the cluster.
            Symbols absent from the background table are silently ignored.
        trajectory: String whose last 8 characters are used in the output
            file names.
        cluster: Cluster identifier; ``str(cluster)`` is used in the output
            file names.
        out_dir: Output directory; created (with parents) if missing.
        alpha: Significance cut-off handed to ``GOEnrichmentStudy`` and used
            to filter results on ``p_fdr_bh``. Defaults to 0.05, the value
            that used to be hard-coded.

    Returns:
        None. The function is used for its side effects (files + stdout).
    """
    # exist_ok avoids the check-then-create race and also creates parents.
    os.makedirs(out_dir, exist_ok=True)

    from goatools.obo_parser import GODag
    from goatools.associations import read_ncbi_gene2go
    from goatools.go_enrichment import GOEnrichmentStudy
    from goea.genes_NCBI_9606_ProteinCoding import GENEID2NT as GeneID2nt_human

    # Ontology DAG and geneid -> GO-term associations restricted to taxid 9606.
    obodag = GODag("goea/go-basic.obo")
    geneid2gos_human = read_ncbi_gene2go("goea/gene2go", taxids=[9606])

    goeaobj = GOEnrichmentStudy(
        GeneID2nt_human.keys(),  # background population: human protein-coding genes
        geneid2gos_human,  # geneid/GO associations
        obodag,  # ontologies
        propagate_counts=False,
        alpha=alpha,  # significance cut-off
        methods=['fdr_bh'])  # multiple-test correction method

    # Index the background table by symbol once instead of rescanning it for
    # every requested symbol.
    # NOTE(review): field 5 of each GENEID2NT record is assumed to be the gene
    # symbol (that is how the original comparison used it) -- confirm.
    symbol2geneids = {}
    for geneid, record in GeneID2nt_human.items():
        symbol2geneids.setdefault(record[5], []).append(geneid)

    # Iterate the requested symbols in their given order so that the
    # insertion order of geneid2symbol (and hence genes.txt) is unchanged.
    geneid2symbol = {}
    for gene_symbol in gene_symbols:
        for geneid in symbol2geneids.get(gene_symbol, ()):
            geneid2symbol[int(geneid)] = gene_symbol

    # Diagnostic output of the resolved study gene ids.
    gene_names = np.array(list(geneid2symbol.keys()))
    print(gene_names)

    geneids_study = geneid2symbol.keys()

    # Common prefix shared by all three output files.
    out_prefix = out_dir + '/' + trajectory[-8:] + 'cluster ' + str(cluster)

    with open(out_prefix + 'genes.txt', 'w') as f:
        f.writelines("%s\n" % gene for gene in geneids_study)

    # 'p_fdr_bh' is the p-value corrected with the 'fdr_bh' method above.
    goea_results_all = goeaobj.run_study(geneids_study)
    goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < alpha]

    # Write the significant results twice: with symbols and with raw gene ids.
    goeaobj.wr_xlsx(out_prefix + 'goea_symbols.xlsx',
                    goea_results_sig,
                    itemid2name=geneid2symbol)
    goeaobj.wr_xlsx(out_prefix + 'goea_geneids.xlsx', goea_results_sig)
コード例 #5
0
    obo_dag = GODag(obo_file=args.obo)
    propagate_counts = not args.no_propagate_counts
    g = GOEnrichmentStudy(pop,
                          assoc,
                          obo_dag,
                          propagate_counts=propagate_counts,
                          alpha=args.alpha,
                          pvalcalc=args.pvalcalc,
                          methods=methods)
    results = g.run_study(study)
    if args.outfile is None:
        g.print_summary(results,
                        min_ratio=min_ratio,
                        indent=args.indent,
                        pval=args.pval)
    else:
        # Users can print to both tab-separated file and xlsx file in one run.
        outfiles = args.outfile.split(",")
        prt_if = None  # Print all values
        if args.pval is not None:
            # Only print out when uncorrected p-value < this value.
            prt_if = lambda nt: nt.p_uncorrected < args.pval
        for outfile in outfiles:
            if outfile.endswith(".xlsx"):
                g.wr_xlsx(outfile, results, prt_if=prt_if, indent=args.indent)
            else:
                g.wr_tsv(outfile, results, prt_if=prt_if, indent=args.indent)

# Copyright (C) 2010-2017, H Tang et al. All rights reserved.
コード例 #6
0
                 "background population. Please check.\n".format(overlap))

    assoc = read_associations(assoc_fn)

    methods = args.method.split(",")

    obo_dag = GODag(obo_file=args.obo)
    propagate_counts = not args.no_propagate_counts
    g = GOEnrichmentStudy(pop, assoc, obo_dag,
                          propagate_counts=propagate_counts,
                          alpha=args.alpha,
                          pvalcalc=args.pvalcalc,
                          methods=methods)
    results = g.run_study(study)
    if args.outfile is None:
        g.print_summary(results, min_ratio=min_ratio, indent=args.indent, pval=args.pval)
    else:
        # Users can print to both tab-separated file and xlsx file in one run.
        outfiles = args.outfile.split(",")
        prt_if = None # Print all values
        if args.pval is not None:
            # Only print out when uncorrected p-value < this value.
            prt_if = lambda nt: nt.p_uncorrected < args.pval
        for outfile in outfiles:
            if outfile.endswith(".xlsx"):
                g.wr_xlsx(outfile, results, prt_if=prt_if, indent=args.indent)
            else:
                g.wr_tsv(outfile, results, prt_if=prt_if, indent=args.indent)

# Copyright (C) 2010-2016, H Tang et al. All rights reserved.