コード例 #1
1
ファイル: __init__.py プロジェクト: mofiarska/PyPathway
 def run(study, pop, assoc, alpha=0.05, p_value=0.05, compare=False, ratio=None, obo='go-basic.obo', no_propagate_counts=False,
         method='bonferroni,sidak,holm', pvalcalc='fisher'):
     '''
     Run a goatools enrichment study and wrap the outcome in a ``GO`` object.

     :param study: study genes, either an iterable of ids or (together with
         ``pop``) a path handed to ``GO._read_geneset``
     :param pop: population genes, either an iterable of ids or a path
     :param assoc: gene-to-GO-term association; a plain ``dict`` is written
         to a file next to this module and re-read, a ``defaultdict`` is
         used as is, anything else is passed to ``read_associations``
     :param alpha: significance level forwarded to ``GOEnrichmentStudy``
     :param obo: path of the ontology file; the default value is mapped to
         ``obo/go.obo`` next to this module and downloaded when missing
     :param no_propagate_counts: when true, counts are not propagated
     :param method: comma separated multiple-test correction methods
     :param pvalcalc: p-value calculation forwarded to ``GOEnrichmentStudy``
     :return: a ``GO`` instance built from the result table and the inputs
     '''
     def module_dir():
         # Resolved lazily: only needed for the default obo and dict assoc.
         return os.path.dirname(os.path.realpath(__file__))

     both_are_paths = type(study) is str and type(pop) is str
     if both_are_paths:
         study, pop = GO._read_geneset(study, pop, compare=compare)
     else:
         study, pop = frozenset(study), set(pop)

     methods = method.split(",")

     if obo == 'go-basic.obo':
         obo = module_dir() + "/obo/go.obo"
     if not os.path.exists(obo):
         print("obo file not found, start to download")
         wget.download('http://purl.obolibrary.org/obo/go/go-basic.obo', obo)
     obo_dag = GODag(obo)

     assoc_type = type(assoc)
     if assoc_type is dict:
         # Serialise as "<gene>\t<go;go;...>" lines, skipping empty entries.
         rows = [
             "{}\t{}\n".format(gene, ";".join([str(term) for term in terms if term]))
             for gene, terms in assoc.items()
             if terms
         ]
         path = module_dir() + "/assoc"
         with open(path, 'w') as fp:
             fp.write("".join(rows))
         assoc = read_associations(path)
     elif assoc_type is not defaultdict:
         # Anything else is treated as an association file.
         assoc = read_associations(assoc)

     g = GOEnrichmentStudy(pop, assoc, obo_dag,
                           propagate_counts=not no_propagate_counts,
                           alpha=alpha,
                           pvalcalc=pvalcalc,
                           methods=methods)
     results = g.run_study(study)

     header = 'GO\tNS\tenrichment\tname\tratio_in_study\tratio_in_pop\tp_uncorrected\tdepth\tstudy_count\tp_bonferroni\tp_sidak\tp_holm\thit\n'
     table_text = header + "".join(str(record) + "\n" for record in results)
     tb = pd.read_table(StringIO(table_text))
     return GO(tb, study, pop, assoc, alpha, p_value, compare, ratio, obo, no_propagate_counts, method, pvalcalc, obo_dag)
コード例 #2
0
    def perform_gene_enrichment_analysis(self, metagene_matrix, method='fdr'):
        """Run a GO enrichment study for each component of ``metagene_matrix``.

        The population is ``self.gene_symbols()``; the study set of component
        ``ci`` is ``self.ranked_genes_by_component(metagene_matrix)[ci]``.
        The per-component dataframes produced by
        ``self._perform_gene_enrichment_analysis_one_component`` are merged
        and written as a tab-separated file to
        ``self.cache_dir + '%s_gea_all.tsv' % self.prefix``.

        :param metagene_matrix: matrix whose second dimension
            (``shape[1]``) gives the number of components
        :param method: multiple-test correction method passed to
            ``GOEnrichmentStudy``
        :return: None
        """
        n_comps = metagene_matrix.shape[1]

        # Download ontology and annotations, if necessary
        self.download_and_cache_resources()
        gene_ontology = obo_parser.GODag('../DownloadedResources/go-basic.obo')

        # Build the gene symbol -> set of GO ids association straight from
        # the GAF records.  A symbol normally appears on several records
        # (one per annotated term), so the terms must be accumulated;
        # keeping only the last record per symbol would leave every gene
        # with a single GO term.
        associations = {}
        with gzip.open('../DownloadedResources/goa_human.gaf.gz', 'rt') as gaf:
            for entry in GOA.gafiterator(gaf):
                symbol = entry['DB_Object_Symbol']
                associations.setdefault(symbol, set()).add(str(entry['GO_ID']))

        # Our population is the set of genes we are analysing
        population = self.gene_symbols()
        print("We have %d genes in our population" % len(population))

        gea = GOEnrichmentStudy(population,
                                associations,
                                gene_ontology,
                                propagate_counts=True,
                                alpha=0.05,
                                methods=[method])
        gea_results_by_component = {}
        rankings = self.ranked_genes_by_component(metagene_matrix)
        for ci in range(n_comps):
            study_genes = rankings[ci]
            print('\nComp. %d: %s...' % (ci, str(study_genes[:10])))
            gea_results_by_component[ci] = gea.run_study(study_genes)

        # Get results into a dataframe per component; components for which
        # the helper returns None are skipped.
        gea_results_df_by_component = []
        for ci in range(n_comps):
            ge_df = self._perform_gene_enrichment_analysis_one_component(
                ci, gea_results_by_component, gea)
            if ge_df is not None:
                gea_results_df_by_component.append(ge_df)

        # Merge the per-component dataframes into a single one.
        # pd.concat replaces DataFrame.append (removed in pandas 2.0) and,
        # unlike it, cannot take an empty list, hence the fallback.
        if gea_results_df_by_component:
            gea_all_sig_results_df = pd.concat(gea_results_df_by_component)
        else:
            gea_all_sig_results_df = pd.DataFrame()

        gea_all_sig_results_df.to_csv(self.cache_dir +
                                      '%s_gea_all.tsv' % self.prefix,
                                      sep='\t')
コード例 #3
0
class GoEnrich():
    """GO enrichment helper using a yeast GO-slim ontology and association file.

    The ontology, background gene list and associations are read from
    ``../Data/evaluation_reference/`` when the object is created.
    """

    # Gene set analysed when the caller does not supply one.
    DEFAULT_GENE_SET = ('YML106W', 'YKL135C', 'YDR516C',
                        'YLR420W', 'YNL111C', 'YHR007C',
                        'YLR014C', 'YKL216W', 'YNL078W',
                        'YJR005W', 'YJL130C')

    def __init__(self):
        obodag = GODag("../Data/evaluation_reference/goslim_yeast.obo")
        # Close the background file deterministically instead of relying on
        # garbage collection.
        with open('../Data/evaluation_reference/gene_list.txt') as handle:
            background = [line.strip() for line in handle]
        geneid2gos_yeast = read_associations('../Data/evaluation_reference/geneid2gos_yeast.txt')

        # The keyword must be spelled ``propagate_counts``; a misspelled
        # name is not the option goatools reads, so count propagation
        # would stay at its default instead of being switched off.
        self.goeaobj = GOEnrichmentStudy(
            background,
            geneid2gos_yeast,
            obodag,
            propagate_counts=False,
            alpha=0.05,
            methods=['fdr_bh'])

    def measure_enrichment(self,
                           gene_set=DEFAULT_GENE_SET,
                           run_name='base',
                           cluster_id=1):
        """Run the enrichment study on ``gene_set`` and write a text report.

        :param gene_set: iterable of gene ids forming the study set
        :param run_name: prefix of the report file name
        :param cluster_id: suffix of the report file name
        :return: None; results are written to
            ``../Results/<run_name>_<cluster_id>.txt``
        """
        # Analyse the genes the caller passed in rather than a fixed list.
        gene_ids = list(gene_set)

        goea_results_all = self.goeaobj.run_study(gene_ids)

        # we can get significant only
        # goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < 0.05]

        self.goeaobj.wr_txt("../Results/" + run_name + "_" + str(cluster_id) +
                            ".txt", goea_results_all)
コード例 #4
0
def check_group_enrichment(tested_gene_file_name, total_gene_file_name):
    """Test the genes of one file for GO enrichment against a background file.

    The ontology and the gene2go association file are downloaded into
    ``constants.GO_DIR`` when missing.  Terms with ``p_fdr_bh <= 0.05`` are
    collected and handed to ``print_to_excel`` as a single output row.
    """
    total_gene_list = load_gene_list(total_gene_file_name)
    tested_gene = load_gene_list(tested_gene_file_name)

    # Ontology: fetch on first use.
    if not os.path.exists(os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)):
        download(constants.GO_OBO_URL, constants.GO_DIR)
    obo_dag = GODag(os.path.join(constants.GO_DIR, constants.GO_FILE_NAME))

    # Associations: fetch the gzipped file and unpack it on first use.
    if not os.path.exists(os.path.join(constants.GO_DIR, constants.GO_ASSOCIATION_FILE_NAME)):
        download(constants.GO_ASSOCIATION_GENE2GEO_URL, constants.GO_DIR)
        archive_name = os.path.basename(constants.GO_ASSOCIATION_GENE2GEO_URL)
        with gzip.open(os.path.join(constants.GO_DIR, archive_name), 'rb') as f_in:
            with open(os.path.join(constants.GO_DIR, constants.GO_ASSOCIATION_FILE_NAME), 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
    assoc = read_ncbi_gene2go(os.path.join(constants.GO_DIR, constants.GO_ASSOCIATION_FILE_NAME), no_top=True)

    population_ids = [int(cur) for cur in ensembl2entrez_convertor(total_gene_list)]
    g = GOEnrichmentStudy(population_ids, assoc, obo_dag, methods=["bonferroni", "fdr_bh"])
    study_ids = [int(cur) for cur in ensembl2entrez_convertor(tested_gene)]
    g_res = g.run_study(study_ids)

    significant = [
        (rec.NS, rec.GO, rec.goterm.name, rec.p_uncorrected, rec.p_fdr_bh)
        for rec in g_res
        if rec.p_fdr_bh <= 0.05
    ]
    if len(significant) > 0:
        go_ns, go_terms, go_names, uncorrectd_pvals, FDRs = zip(*significant)
    else:
        go_ns, go_terms, go_names, uncorrectd_pvals, FDRs = ([] for _ in range(5))

    # One row, each cell holding its values separated by CRLF.
    cells = [
        e2g_convertor(tested_gene),
        go_ns,
        go_terms,
        go_names,
        map(str, uncorrectd_pvals),
        map(str, FDRs),
    ]
    output_rows = [tuple("\r\n".join(cell) for cell in cells)]
    print_to_excel(output_rows, tested_gene_file_name, total_gene_file_name)
コード例 #5
0
def test_goea():
    """Test GOEA with method, fdr."""
    obo_dag = GODag("go-basic.obo")
    assoc = read_associations("../data/association", no_top=True)
    # Read the id lists through context managers so the files are closed
    # as soon as they have been consumed.
    with open("../data/population") as handle:
        popul_ids = [line.rstrip() for line in handle]
    with open("../data/study") as handle:
        study_ids = [line.rstrip() for line in handle]
    goeaobj = GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=['fdr'])
    goea_results = goeaobj.run_study(study_ids)
    goeaobj.print_summary(goea_results)
コード例 #6
0
def get_goea_results(method="fdr_bh"):
    """Get GOEA results.

    Runs an enrichment study on the small GO-slim data set stored in the
    ``data`` directory next to this file, using ``method`` as the
    multiple-test correction, and returns the result records.
    """
    root_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
    obo_fin = os.path.join(root_dir, "goslim_generic.obo")
    obo_dag = GODag(obo_fin)
    assoc = read_associations(os.path.join(root_dir, "slim_association"), no_top=True)
    # Context managers close the id files once they have been read.
    with open(os.path.join(root_dir, "small_population")) as handle:
        popul_ids = [line.rstrip() for line in handle]
    goeaobj = GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=[method])
    with open(os.path.join(root_dir, "small_study")) as handle:
        study_ids = [line.rstrip() for line in handle]
    goea_results = goeaobj.run_study(study_ids, methods=[method])
    return goea_results
コード例 #7
0
def get_goea_results(method="fdr_bh"):
    """Get GOEA results.

    Loads the GO-slim ontology, associations, population and study ids from
    the ``data`` directory beside this file and returns the records of an
    enrichment study corrected with ``method``.
    """
    root_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
    obo_dag = GODag(os.path.join(root_dir, "goslim_generic.obo"))
    assoc = read_associations(os.path.join(root_dir, "slim_association"), no_top=True)
    # Open the id files in ``with`` blocks so no handle is left dangling.
    with open(os.path.join(root_dir, "small_population")) as population_file:
        popul_ids = [line.rstrip() for line in population_file]
    goeaobj = GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=[method])
    with open(os.path.join(root_dir, "small_study")) as study_file:
        study_ids = [line.rstrip() for line in study_file]
    return goeaobj.run_study(study_ids, methods=[method])
コード例 #8
0
def test_i96():
    """Test to re-produce issue#96: Passes currently."""
    # Attempts to trigger: ValueError("All values in table must be nonnegative.
    # Study and population genes
    genes_study = _get_geneids()
    genes_population = GeneID2nt.keys()
    # Associations (taxid 9606) and ontology
    associations = get_assoc_ncbi_taxids([9606], loading_bar=None)
    ontology = get_godag(os.path.join(os.getcwd(), "go-basic.obo"), loading_bar=None)
    study = GOEnrichmentStudy(genes_population, associations, ontology, methods=['fdr_bh'])
    # The test passes as long as running the analysis does not raise.
    study.run_study(genes_study)
コード例 #9
0
def run_bonferroni():
    """Do Gene Ontology Enrichment Analysis w/Bonferroni multipletest. Print results 3 ways.

    Returns the result records together with the ``GOEnrichmentStudy``
    object that produced them.
    """
    # ---------------------------------------------------------------------
    # Run Gene Ontology Analysis (GOEA)
    #
    # 1. Initialize
    godag = get_godag(os.path.join(os.getcwd(), "go-basic.obo"), loading_bar=None)
    assoc = read_associations(os.path.join(REPO, "data/association"), no_top=True)
    # Use context managers so the id files are closed after reading.
    with open(os.path.join(REPO, "data/population")) as handle:
        popul_ids = [line.rstrip() for line in handle]
    with open(os.path.join(REPO, "data/study")) as handle:
        study_ids = [line.rstrip() for line in handle]
    # 2. Run enrichment analysis
    goea = GOEnrichmentStudy(popul_ids, assoc, godag, alpha=0.05, methods=['bonferroni'])
    results_nt = goea.run_study(study_ids)
    return results_nt, goea
コード例 #10
0
def _get_results(godag, propagate_counts, relationships, prt=sys.stdout):
    """Run a GOEA on the mouse study genes and return its results."""
    mouse_taxid = 10090
    population = set(GeneID2nt_mus.keys())
    gene2gos = get_assoc_ncbi_taxids([mouse_taxid], loading_bar=None)
    study = get_geneid2symbol("nbt.3102-S4_GeneIDs.xlsx")
    # Keyword options of the study, gathered in one place.
    options = dict(
        propagate_counts=propagate_counts,
        relationships=relationships,
        alpha=0.05,
        methods=['fdr_bh'],
    )
    enrichment = GOEnrichmentStudy(population, gene2gos, godag, **options)
    return enrichment.run_study(study, prt=prt)
コード例 #11
0
def run_bonferroni(log):
    """Do Gene Ontology Enrichment Analysis w/Bonferroni multipletest. Print results 3 ways.

    ``log`` is accepted but not used here.  Returns the result records and
    the ``GOEnrichmentStudy`` object.
    """
    # ---------------------------------------------------------------------
    # Run Gene Ontology Analysis (GOEA)
    #
    # 1. Initialize
    obo_dag = GODag("go-basic.obo")
    assoc = read_associations("../data/association", no_top=True)
    # ``with`` guarantees the id files are closed once they are read.
    with open("../data/population") as handle:
        popul_ids = [line.rstrip() for line in handle]
    with open("../data/study") as handle:
        study_ids = [line.rstrip() for line in handle]
    # 2. Run enrichment analysis
    goea = GOEnrichmentStudy(popul_ids, assoc, obo_dag, alpha=0.05, methods=['bonferroni'])
    results_nt = goea.run_study(study_ids)
    return results_nt, goea
コード例 #12
0
def run_bonferroni():
    """Do Gene Ontology Enrichment Analysis w/Bonferroni multipletest. Print results 3 ways.

    Returns the result records together with the ``GOEnrichmentStudy``
    object that produced them.
    """
    # ---------------------------------------------------------------------
    # Run Gene Ontology Analysis (GOEA)
    #
    # 1. Initialize
    godag = get_godag(os.path.join(os.getcwd(), "go-basic.obo"), loading_bar=None)
    fin_assc = os.path.join(REPO, "data/association")
    assoc = read_associations(fin_assc, 'id2gos', no_top=True)
    # Read the id lists inside ``with`` blocks so the files get closed.
    with open(os.path.join(REPO, "data/population")) as population_file:
        popul_ids = [line.rstrip() for line in population_file]
    with open(os.path.join(REPO, "data/study")) as study_file:
        study_ids = [line.rstrip() for line in study_file]
    # 2. Run enrichment analysis
    goea = GOEnrichmentStudy(popul_ids, assoc, godag, alpha=0.05, methods=['bonferroni'])
    results_nt = goea.run_study(study_ids)
    return results_nt, goea
コード例 #13
0
def _get_results(godag, propagate_counts, relationships, prt=sys.stdout):
    """Run a GOEA for the mouse study and hand back the result records."""
    taxids = [10090]  # Mouse study
    pop_geneids = set(GeneID2nt_mus.keys())
    geneid2gos = get_assoc_ncbi_taxids(taxids, loading_bar=None)
    study_geneids = get_geneid2symbol("nbt.3102-S4_GeneIDs.xlsx")
    study_kwargs = {
        'propagate_counts': propagate_counts,
        'relationships': relationships,
        'alpha': 0.05,
        'methods': ['fdr_bh'],
    }
    goea = GOEnrichmentStudy(pop_geneids, geneid2gos, godag, **study_kwargs)
    return goea.run_study(study_geneids, prt=prt)
コード例 #14
0
def run_bonferroni(log):
    """Do Gene Ontology Enrichment Analysis w/Bonferroni multipletest. Print results 3 ways.

    ``log`` is accepted but not used here.  Returns the result records and
    the ``GOEnrichmentStudy`` object.
    """
    # ---------------------------------------------------------------------
    # Run Gene Ontology Analysis (GOEA)
    #
    # 1. Initialize
    obo_dag = GODag("go-basic.obo")
    assoc = read_associations("../data/association", no_top=True)
    # Close each id file as soon as its lines have been collected.
    with open("../data/population") as population_file:
        popul_ids = [line.rstrip() for line in population_file]
    with open("../data/study") as study_file:
        study_ids = [line.rstrip() for line in study_file]
    # 2. Run enrichment analysis
    goea = GOEnrichmentStudy(popul_ids,
                             assoc,
                             obo_dag,
                             alpha=0.05,
                             methods=['bonferroni'])
    results_nt = goea.run_study(study_ids)
    return results_nt, goea
コード例 #15
0
def test_i96():
    """Test to re-produce issue#96: Passes currently."""
    # Attempts to trigger: ValueError("All values in table must be nonnegative.
    print('CWD', os.getcwd())
    # Study and population genes
    genes_study = _get_geneids()
    genes_population = GENEID2NT.keys()

    # Associations: download the NCBI gene2go file, keep taxid 9606
    print(os.getcwd())
    gene2go_path = os.path.join(REPO, 'gene2go')
    dnld_ncbi_gene_file(gene2go_path, loading_bar=None)
    associations = read_ncbi_gene2go(gene2go_path, [9606])

    # Ontology
    ontology = get_godag(os.path.join(REPO, "go-basic.obo"), loading_bar=None)
    study = GOEnrichmentStudy(genes_population, associations, ontology, methods=['fdr_bh'])
    # The test passes as long as running the analysis does not raise.
    study.run_study(genes_study)
コード例 #16
0
def test_i96():
    """Test to re-produce issue#96: Passes currently."""
    # Goal: provoke ValueError("All values in table must be nonnegative.
    print('CWD', os.getcwd())
    study_geneids = _get_geneids()
    pop_geneids = GENEID2NT.keys()

    # Databases: NCBI gene2go associations (taxid 9606) and the GO DAG
    print(os.getcwd())
    fin_gene2go = os.path.join(REPO, 'gene2go')
    dnld_ncbi_gene_file(fin_gene2go, loading_bar=None)
    geneid2gos = read_ncbi_gene2go(fin_gene2go, [9606])
    dag = get_godag(os.path.join(REPO, "go-basic.obo"), loading_bar=None)

    study_options = {'methods': ['fdr_bh']}
    goea = GOEnrichmentStudy(pop_geneids, geneid2gos, dag, **study_options)
    # Run the Gene Ontology Enrichment Analysis; results are not inspected.
    goea.run_study(study_geneids)
コード例 #17
0
    min_dist = 5000
    max_dist = 50000
else:
    sys.exit(-1)
# Collect the gene in column 8 of every record whose distance (last column)
# lies within [min_dist, max_dist]: once per cluster and once in the shared
# background set.
with open(snakemake.input.clusters) as f:
    for line in f:
        cols = line.strip().split()
        # Cluster index 0, 1, 2, ... becomes the letter 'A', 'B', 'C', ...
        cluster = chr(int(cols[3]) + 65)
        distance = int(cols[-1])
        if min_dist <= distance <= max_dist:
            genes[cluster].add(cols[7])
            background.add(cols[7])

obodag = GODag("go-basic.obo")
id2go = read_associations("sym2go.txt")
goeaobj = GOEnrichmentStudy(background,
                            id2go,
                            obodag,
                            propagate_counts=False,
                            alpha=0.05,
                            methods=['fdr_bh'])

# Write, per cluster, the terms with an FDR below 0.2 ordered by FDR.  The
# context manager makes sure the report is flushed and closed.
with open(snakemake.output.txt, 'w') as outfile:
    for cluster, geneids in sorted(genes.items()):
        outfile.write("Cluster {}\n".format(cluster))
        goea_results_all = goeaobj.run_study(geneids)
        for fdr, name, enrichment in sorted([(r.p_fdr_bh, r.name, r.enrichment)
                                             for r in goea_results_all
                                             if r.p_fdr_bh < 0.2]):
            outfile.write("\t{}\t{}\t{}\n".format(name, fdr, enrichment))
        outfile.write("\n")
コード例 #18
0
 def run(study,
         pop,
         assoc,
         alpha=0.05,
         p_value=0.05,
         compare=False,
         ratio=None,
         obo='go-basic.obo',
         no_propagate_counts=False,
         method='bonferroni,sidak,holm',
         pvalcalc='fisher'):
     '''
     Wrapper around the goatools enrichment study returning a ``GO`` object.

     :param study: study genes; an iterable of ids, or a path when ``pop``
         is a path as well
     :param pop: population genes; an iterable of ids or a path
     :param assoc: association from gene to GO terms: a plain ``dict``
         (dumped to a file beside this module and re-read), a
         ``defaultdict`` (used unchanged) or an association file path
     :param obo: ontology file; the default name resolves to ``obo/go.obo``
         beside this module and is downloaded if it does not exist
     :param method: comma separated list of correction methods
     :return: ``GO`` instance holding the result table and the inputs
     '''
     if type(study) is str and type(pop) is str:
         # Both arguments name files: let GO load them.
         study, pop = GO._read_geneset(study, pop, compare=compare)
     else:
         study, pop = frozenset(study), set(pop)

     methods = method.split(",")

     if obo == 'go-basic.obo':
         obo = os.path.dirname(os.path.realpath(__file__)) + "/obo/go.obo"
     obo_missing = not os.path.exists(obo)
     if obo_missing:
         print("obo file not found, start to download")
         wget.download('http://purl.obolibrary.org/obo/go/go-basic.obo',
                       obo)
     obo_dag = GODag(obo)

     kind = type(assoc)
     if kind is dict:
         # One "<gene>\t<go;go;...>" line per gene with a non-empty entry.
         lines = []
         for gene, terms in assoc.items():
             if terms:
                 joined = ";".join([str(term) for term in terms if term])
                 lines.append("{}\t{}\n".format(gene, joined))
         path = os.path.dirname(os.path.realpath(__file__)) + "/assoc"
         with open(path, 'w') as fp:
             fp.write("".join(lines))
         assoc = read_associations(path)
     elif kind is not defaultdict:
         # Not a mapping we recognise: treat it as an association file.
         assoc = read_associations(assoc)

     study_options = dict(propagate_counts=not no_propagate_counts,
                          alpha=alpha,
                          pvalcalc=pvalcalc,
                          methods=methods)
     g = GOEnrichmentStudy(pop, assoc, obo_dag, **study_options)
     results = g.run_study(study)

     pieces = ['GO\tNS\tenrichment\tname\tratio_in_study\tratio_in_pop\tp_uncorrected\tdepth\tstudy_count\tp_bonferroni\tp_sidak\tp_holm\thit\n']
     pieces.extend(str(record) + "\n" for record in results)
     tb = pd.read_table(StringIO("".join(pieces)))
     return GO(tb, study, pop, assoc, alpha, p_value, compare, ratio, obo,
               no_propagate_counts, method, pvalcalc, obo_dag)
コード例 #19
0
ファイル: genes_clustering.py プロジェクト: hag007/melanoma
def find_clusters_and_gene_enrichment(tested_gene_list_file_name,
                                      total_gene_list_file_name,
                                      gene_expression_file_name,
                                      phenotype_file_name,
                                      gene_filter_file_name=None,
                                      tested_gene_list_path=None,
                                      total_gene_list_path=None,
                                      gene_expression_path=None,
                                      phenotype_path=None,
                                      gene_filter_file_path=None,
                                      var_th_index=None,
                                      start_k=2,
                                      end_k=6,
                                      calc_go=True,
                                      enrichment_list_file_names=None,
                                      meta_groups=None,
                                      filter_expression=None,
                                      cluster_algorithm=None):
    """Cluster the tested genes by expression and report enrichment per cluster.

    Steps performed, in order:

    1. Load the expression profile of the tested genes and, when
       ``filter_expression`` is given, restrict it to the patients selected
       by ``divided_patient_ids_by_label``.
    2. Drop the genes whose variance is below the variance found at
       position ``var_th_index`` of the descending variance ranking.
    3. When ``calc_go`` is true, run a GO enrichment study of the retained
       genes against ``total_gene_list_file_name`` and print the terms with
       ``p_fdr_bh <= 0.05``.
    4. When ``cluster_algorithm == "kmeans"``, cluster for every k in
       ``[start_k, end_k]``, collect one output row per cluster and plot
       heatmaps.  When ``cluster_algorithm == "hierarchical"``, draw a
       seaborn clustermap and save it as a PNG.
    5. Plot one heatmap per patient label assignment and pass the collected
       rows to ``print_to_excel``.

    ``total_gene_list_path`` and ``phenotype_path`` are accepted but not
    referenced in the body.

    Returns ``None``; an early ``return None`` happens when the filtered
    expression matrix has exactly one column.

    This function uses Python 2 ``print`` statements.
    """
    # fetch gene expression by gene_id, divided by tumor type
    # NOTE(review): gene_sets, expression_sets and averaged_expression_sets
    # are never read below.
    gene_sets = []
    expression_sets = []
    averaged_expression_sets = []
    tested_gene_expression = load_gene_expression_profile_by_genes(
        tested_gene_list_file_name, gene_expression_file_name,
        gene_filter_file_name, tested_gene_list_path, gene_expression_path,
        gene_filter_file_path)
    tested_gene_expression_headers_rows, tested_gene_expression_headers_columns, tested_gene_expression = separate_headers(
        tested_gene_expression)

    # Select the patients to keep: the flattened groups returned for
    # filter_expression, or every column header when no filter is given.
    if filter_expression is not None:
        filtered_patients = [
            y for x in divided_patient_ids_by_label(phenotype_file_name,
                                                    groups=filter_expression)
            for y in x
        ]
        print "number of filtered patients from phenotypes: {}".format(
            len(filtered_patients))
    else:
        print "no filter applied"
        filtered_patients = tested_gene_expression_headers_columns

    tested_gene_expression, tested_gene_expression_headers_columns = filter_genes_dataset_by_patients(
        filtered_patients, tested_gene_expression_headers_columns,
        tested_gene_expression)
    # NOTE(review): a single remaining column is treated as "nothing left";
    # presumably that column is not patient data - confirm against
    # filter_genes_dataset_by_patients.
    if np.shape(tested_gene_expression)[1] == 1:
        print "no expressions were found after filtering by labels {}. skipping...".format(
            filter_expression)
        return None

    total_gene_list = load_gene_list(total_gene_list_file_name)
    tested_gene_list = load_gene_list(tested_gene_list_file_name)
    # Per-gene (row) variance, plus the same values sorted in descending order.
    row_var = np.var(tested_gene_expression, axis=1)
    row_var_sorted = np.sort(row_var)[::-1]

    labels_assignment_patients = None
    if meta_groups is not None:
        print "clustering patients by groups"
        labels_assignment_patients = labels_assignments(
            meta_groups, phenotype_file_name,
            tested_gene_expression_headers_columns)

    enrichment_lists = []
    if enrichment_list_file_names is not None:
        for cur in enrichment_list_file_names:
            enrichment_lists.append(load_gene_list(cur))

    # With no explicit index the threshold is the smallest variance, so no
    # row is strictly below it and every gene is kept.
    if var_th_index is None:
        var_th_index = len(row_var_sorted) - 1
    row_var_th = row_var_sorted[var_th_index]
    # Indices of the rows whose variance is strictly below the threshold;
    # these rows are removed from both the data and the row headers.
    row_var_masked_indices = np.where(row_var_th > row_var)[0]
    gene_expression_top_var = np.delete(tested_gene_expression,
                                        row_var_masked_indices,
                                        axis=0)
    gene_expression_top_var_header_rows = np.delete(
        tested_gene_expression_headers_rows, row_var_masked_indices, axis=0)
    gene_expression_top_var_header_columns = tested_gene_expression_headers_columns

    clfs_results = {}
    output_rows = []
    if calc_go:
        # Download the ontology file on first use.
        if not os.path.exists(
                os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)):
            wget.download(
                constants.GO_OBO_URL,
                os.path.join(constants.GO_DIR, constants.GO_FILE_NAME))
        # if not os.path.exists(os.path.join(constants.TCGA_DATA_DIR, 'goa_human.gaf')):
        #     wget.download(go_obo_url, os.path.join(constants.TCGA_DATA_DIR, 'goa_human.gaf'))
        obo_dag = GODag(os.path.join(constants.GO_DIR, constants.GO_FILE_NAME))

        assoc = read_ncbi_gene2go(os.path.join(
            constants.GO_DIR, constants.GO_ASSOCIATION_FILE_NAME),
                                  no_top=True)
        # Population: all genes of the total list; study: the retained genes.
        g = GOEnrichmentStudy(
            [int(cur) for cur in ensembl2entrez_convertor(total_gene_list)],
            assoc,
            obo_dag,
            methods=["bonferroni", "fdr_bh"])
        g_res = g.run_study([
            int(cur) for cur in ensembl2entrez_convertor(
                gene_expression_top_var_header_rows)
        ])
        GO_results = [(cur.NS, cur.GO, cur.goterm.name, cur.p_uncorrected,
                       cur.p_fdr_bh) for cur in g_res if cur.p_fdr_bh <= 0.05]
        print GO_results

    if cluster_algorithm == "kmeans":

        for n_clusters in range(start_k, end_k + 1):
            clfs_results[n_clusters] = []
            centres, km_clf, dist = kmeanssample(X=gene_expression_top_var,
                                                 k=n_clusters,
                                                 metric="euclidean")
            for i in range(n_clusters):

                # Rank the clusters by their average expression and relabel
                # them accordingly.  The result does not depend on i, so it
                # is recomputed identically for every cluster.
                ranks = []
                for j in range(n_clusters):
                    ranks.append(
                        np.average(
                            np.delete(gene_expression_top_var,
                                      np.where(km_clf != j)[0],
                                      axis=0)))
                ranks = rankdata(ranks)
                cluster_labels = np.array(km_clf)
                for j in range(n_clusters):
                    cluster_labels[np.where(km_clf == ranks[j] - 1)] = j
                labels_assignment = [cluster_labels + 1]

                # Rows that do NOT belong to cluster i; deleting them leaves
                # the members of cluster i.
                cluster_indices = np.where(km_clf != i)[0]
                # NOTE(review): both arrays below are built from the row
                # headers, so gene_expression_cluster holds gene ids rather
                # than expression values - confirm whether
                # gene_expression_top_var was intended.
                gene_expression_cluster = np.delete(
                    gene_expression_top_var_header_rows,
                    cluster_indices,
                    axis=0)
                gene_headers_row_cluster = np.delete(
                    gene_expression_top_var_header_rows,
                    cluster_indices,
                    axis=0)
                clfs_results[n_clusters].append(
                    (gene_headers_row_cluster, gene_headers_row_cluster))
                desc = "k={} clustering cluster {} has {} genes".format(
                    n_clusters, i, len(gene_expression_cluster))
                gene_list = ",".join(gene_headers_row_cluster)
                url = check_enrichment(gene_list)

                go_terms = []
                uncorrectd_pvals = []
                FDRs = []
                go_names = []
                go_ns = []
                if calc_go:
                    g_res = g.run_study([
                        int(cur) for cur in ensembl2entrez_convertor(
                            gene_headers_row_cluster)
                    ])
                    GO_results = [(cur.NS, cur.GO, cur.goterm.name,
                                   cur.p_uncorrected, cur.p_fdr_bh)
                                  for cur in g_res if cur.p_fdr_bh <= 0.05]
                    if len(GO_results) > 0:
                        go_ns, go_terms, go_names, uncorrectd_pvals, FDRs = zip(
                            *GO_results)

                # NOTE(review): when the zip above ran, these five names are
                # tuples, and the .append calls below would raise
                # AttributeError - confirm whether GO results and
                # enrichment lists are ever combined.
                if len(enrichment_lists) != 0:
                    for j, cur in enumerate(enrichment_lists):
                        go_terms.append(
                            enrichment_list_file_names[j].split(".")[0])
                        uncorrectd_pvals.append(
                            calc_HG_test(
                                [x.split(".")[0] for x in tested_gene_list],
                                [x.split(".")[0] for x in cur], [
                                    x.split(".")[0]
                                    for x in gene_headers_row_cluster
                                ]))
                        FDRs.append(".")
                        go_names.append(".")
                        go_ns.append(".")

                # One row per cluster; multi-valued cells are CRLF separated.
                output_rows.append((desc, "\r\n".join([
                    x.split(".")[0] for x in gene_headers_row_cluster
                ]), url, "\r\n".join(go_ns), "\r\n".join(go_terms),
                                    "\r\n".join(go_names),
                                    "\r\n".join(map(str, uncorrectd_pvals)),
                                    "\r\n".join(map(str, FDRs))))

        # cluster_labels and labels_assignment hold the values left by the
        # last iteration of the loops above (k == end_k).
        gene_sorted_heatmap = np.rot90(np.flip(
            gene_expression_top_var[cluster_labels.argsort(), :], 1),
                                       k=-1,
                                       axes=(1, 0))
        find_clusters(end_k,
                      gene_sorted_heatmap,
                      gene_expression_top_var_header_columns,
                      start_k,
                      e2g_convertor(gene_expression_top_var_header_rows),
                      tested_gene_list_file_name,
                      labels_assignment=labels_assignment_patients)

        plot_heatmap(gene_expression_top_var,
                     gene_expression_top_var_header_columns,
                     labels_assignment,
                     gene_expression_top_var_header_rows,
                     tested_gene_list_file_name,
                     n_clusters=None,
                     label_index=None,
                     phenotype_heatmap=None)

    # Recomputed without the cluster ordering; this overwrites the matrix
    # built inside the k-means branch.
    gene_sorted_heatmap = np.rot90(np.flip(gene_expression_top_var, 1),
                                   k=-1,
                                   axes=(1, 0))
    if cluster_algorithm == "hierarchical":
        df = pd.DataFrame(data=gene_sorted_heatmap,
                          index=gene_expression_top_var_header_columns,
                          columns=gene_expression_top_var_header_rows)

        # correlations = df.corr()
        # correlations_array = np.asarray(df.corr())
        #
        # row_linkage = hierarchy.linkage(
        #     distance.pdist(correlations_array), method='average')
        #
        # col_linkage = hierarchy.linkage(
        #     distance.pdist(correlations_array.T), method='average')

        # enrichment_gene_list = load_gene_list("uvm_mito_part.txt")
        # NOTE(review): labels_assignment_patients is None when meta_groups
        # is None, in which case the indexing below would raise TypeError.
        dct = dict(zip(np.unique(labels_assignment_patients[0]), "rbg"))
        row_colors = map(dct.get, labels_assignment_patients[0])
        # dct is rebound here; only the commented-out col_colors line uses it.
        dct = {1: 'b', 2: 'r'}
        gene_expression_top_var_header_rows_trimmed = [
            x.split(".")[0] for x in gene_expression_top_var_header_rows
        ]
        # col_colors = map(dct.get, [2 if x in enrichment_gene_list else 1 for x in gene_expression_top_var_header_rows_trimmed])
        # g is rebound from the GO study object to the clustermap grid.
        g = sns.clustermap(df,
                           row_colors=row_colors,
                           metric="euclidean",
                           robust=True,
                           method="single")
        # den_patients = scipy.cluster.hierarchy.dendrogram(g.dendrogram_row.linkage,
        #                                          labels=df.index,
        #                                          color_threshold=0.60)
        den_genes = scipy.cluster.hierarchy.dendrogram(
            g.dendrogram_col.linkage, labels=df.columns, color_threshold=0.7)
        # NOTE(review): clusters is computed but not used afterwards.
        clusters = get_cluster_classes(den_genes)

        g.savefig(
            os.path.join(constants.BASE_PROFILE, "output",
                         "hierarchical_cluster_{}.png".format(time.time())))

    # NOTE(review): iterating None raises TypeError, so this loop requires
    # meta_groups to have been supplied - confirm with callers.
    for cur_labels_assignment_patient in labels_assignment_patients:
        plot_heatmap(gene_sorted_heatmap,
                     gene_expression_top_var_header_rows,
                     [cur_labels_assignment_patient],
                     gene_expression_top_var_header_columns,
                     tested_gene_list_file_name,
                     n_clusters=None,
                     label_index=None,
                     phenotype_heatmap=None)

    # File names are passed without their extension (text before the first ".").
    print_to_excel(
        output_rows=output_rows,
        gene_list_file_name=tested_gene_list_file_name.split(".")[0],
        gene_expression_file_name=gene_expression_file_name.split(".")[0],
        var_th_index=var_th_index)
コード例 #20
0
    methods=['fdr_bh'])  # defult multipletest correction method

# Also test for GO term enrichment among the 10 genes with the largest
# background fitness coefficients in the full model (ranked by absolute
# value; the coefficients are all negative).
tp['abs_slope_in_full_model'] = np.abs(tp['full_model_x_slope'])
significant_rows = tp.loc[tp['model_comp_p_full_vs_qtl'] < 0.05]
ranked_rows = significant_rows.sort_values(by='abs_slope_in_full_model',
                                           ascending=False)
fit_effect_genes = ranked_rows.iloc[:10]['Gene.Use']

# Run the enrichment study for every group: one per QTL plus the top-effect
# group built above.
results = dict()
go_groups = list(mhq_dat.as_matrix(['QTL', 'Genes_with_interactions']))
top_effect_names = ';'.join([s.split(' ')[1] for s in fit_effect_genes])
go_groups.append(['Top_x_effects', top_effect_names])
for entry in go_groups:
    # Drop placeholder names, then translate names to ids ('NA' if unknown).
    study_set_names = [i for i in entry[1].split(';') if 'None' not in i]
    study_set = [genename_2_id.setdefault(i, 'NA') for i in study_set_names]
    goea_results_all = goeaobj.run_study(
        study_set, keep_if=lambda x: x.p_uncorrected < 0.05)
    # Keep the ten records with the smallest corrected p-value.
    by_fdr = sorted(goea_results_all, key=lambda r: r.p_fdr_bh)
    results[entry[0]] = by_fdr[:10]

# One CSV row per retained record, grouped in insertion order of results.
with open('../../Analysis/GO_results.csv', 'w') as outfile:
    writer = csv.writer(outfile)
    header = [
        'Group', 'GO term', 'pval_uncorrected', 'pval_benjamini_hochberg',
        'hits'
    ]
    writer.writerow(header)
    for r in results:
        for record in results[r]:
            hit_names = ';'.join(
                [id_2_genename[gene_id] for gene_id in record.study_items])
            writer.writerow([
                r, record.name, record.p_uncorrected, record.p_fdr_bh,
                hit_names
            ])
コード例 #21
0
def goea(gene_ids, gene_symbols, trajectory, cluster, out_dir):
    """Run a GO enrichment analysis for one gene cluster and save the results.

    The gene symbols are mapped to gene ids through the bundled ``GENEID2NT``
    table, the ids are tested with goatools against that table's keys as
    background, and the terms whose Benjamini-Hochberg corrected p-value is
    below 0.05 are written to two xlsx files.

    :param gene_ids: identifiers of the cluster genes (Ensembl ids according
        to the original signature comment); not used by the function body.
    :param gene_symbols: iterable of gene symbols of the cluster genes.
    :param trajectory: string whose last eight characters start every output
        file name.
    :param cluster: cluster label; ``str(cluster)`` is embedded in every
        output file name.
    :param out_dir: output directory, created when missing.
    :return: None
    """
    # makedirs(exist_ok=True) also creates missing parent directories and
    # does not fail if the directory appears between a check and the creation.
    os.makedirs(out_dir, exist_ok=True)

    # Load the ontology.
    from goatools.obo_parser import GODag
    obodag = GODag("goea/go-basic.obo")

    # Load the gene/GO associations for taxid 9606.
    from goatools.associations import read_ncbi_gene2go
    geneid2gos_human = read_ncbi_gene2go("goea/gene2go", taxids=[9606])

    # Background gene set.
    from goea.genes_NCBI_9606_ProteinCoding import GENEID2NT as GeneID2nt_human

    # GOEA object.
    from goatools.go_enrichment import GOEnrichmentStudy
    goeaobj = GOEnrichmentStudy(
        GeneID2nt_human.keys(),  # background: every gene id in the table
        geneid2gos_human,  # geneid/GO associations
        obodag,  # Ontologies
        propagate_counts=False,
        alpha=0.05,  # default significance cut-off
        methods=['fdr_bh'])  # default multiple-test correction method

    # Index the table by symbol (field 5 of each record) once, instead of
    # scanning the whole table for every requested symbol.
    symbol2geneids = {}
    for gene_id, record in GeneID2nt_human.items():
        symbol2geneids.setdefault(record[5], []).append(gene_id)

    # Iterating the requested symbols keeps the original insertion order of
    # geneid2symbol, and therefore the order of the written gene list.
    geneid2symbol = {}
    for gene_symbol in gene_symbols:
        for gene_id in symbol2geneids.get(gene_symbol, ()):
            geneid2symbol[int(gene_id)] = gene_symbol

    gene_names = np.array(list(geneid2symbol.keys()))

    print(gene_names)

    geneids_study = geneid2symbol.keys()

    # Common prefix of the three output files.
    out_prefix = out_dir + '/' + trajectory[-8:] + 'cluster ' + str(cluster)

    with open(out_prefix + 'genes.txt', 'w') as f:
        for gene in geneids_study:
            f.write("%s\n" % gene)

    # Run GOEA. 'p_' means "pvalue"; 'fdr_bh' is the multiple-test method in
    # use.
    goea_results_all = goeaobj.run_study(geneids_study)
    goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < 0.05]

    # Write the significant terms twice: once with gene symbols and once
    # with the raw gene ids.
    goeaobj.wr_xlsx(out_prefix + 'goea_symbols.xlsx',
                    goea_results_sig,
                    itemid2name=geneid2symbol)
    goeaobj.wr_xlsx(out_prefix + 'goea_geneids.xlsx', goea_results_sig)
コード例 #22
0
ファイル: find_enrichment.py プロジェクト: dangeles/goatools
                 "background population. Please check.\n".format(overlap))

    assoc = read_associations(assoc_fn)

    methods = args.method.split(",")
  
    if args.fdr:
        methods.append("fdr")

    obo_dag = GODag(obo_file=args.obo)
    propagate_counts = not args.no_propagate_counts
    g = GOEnrichmentStudy(pop, assoc, obo_dag,
                          propagate_counts=propagate_counts,
                          alpha=args.alpha,
                          methods=methods)
    results = g.run_study(study)
    if args.outfile is None:
        g.print_summary(results, min_ratio=min_ratio, indent=args.indent, pval=args.pval)
    else:
        # Users can print to both tab-separated file and xlsx file in one run.
        outfiles = args.outfile.split(",") 
        prt_if = None # Print all values
        if args.pval is not None:
            # Only print out when uncorrected p-value < this value.
            prt_if = lambda nt: nt.p_uncorrected < args.pval
        for outfile in outfiles:
            if outfile.endswith(".xlsx"):
                g.wr_xlsx(outfile, results, prt_if=prt_if)
            else:
                g.wr_tsv(outfile, results, prt_if=prt_if)
            
コード例 #23
0
ファイル: go_enrichment.py プロジェクト: seyuboglu/milieu
class GOEnrichment(Experiment):
    """
    Experiment that runs gene ontology (GO) term enrichment on the proteins
    associated with each disease and on the proteins predicted for it by each
    method, then compares the two sets of enriched terms.
    """
    def __init__(self, dir, params):
        """
        Constructor
        Args:
            dir (string) directory of the experiment to be run
            params (dict) experiment parameters; the keys read by this class
                are "associations_path", "disease_subset", "ppi_network",
                "go_path", "gene_to_go_path", "enrichment_params",
                "method_to_preds", "top_k", "num_preds" and "n_processes"
        """
        super().__init__(dir, params)

        # Set the logger
        set_logger(os.path.join(self.dir, 'experiment.log'),
                   level=logging.INFO, console=True)

        # Log title
        logging.info("Disease Protein Prediction")
        logging.info("Sabri Eyuboglu  -- SNAP Group")
        logging.info("======================================")

        logging.info("Loading Disease Associations...")
        self.diseases_dict = load_diseases(self.params["associations_path"],
                                           self.params["disease_subset"],
                                           exclude_splits=['none'])

        logging.info("Loading Network...")
        self.network = Network(self.params["ppi_network"])

        logging.info("Loading enrichment study...")
        obodag = GODag(self.params["go_path"])
        geneid2go = read_ncbi_gene2go(self.params["gene_to_go_path"],
                                      taxids=[9606])
        # The names of the network's nodes form the background population.
        self.enrichment_study = GOEnrichmentStudy(
            self.network.get_names(),
            geneid2go,
            obodag,
            log=None,
            **self.params["enrichment_params"])

        logging.info("Loading predictions...")
        self.method_to_preds = {
            name: pd.read_csv(os.path.join(preds, "predictions.csv"),
                              index_col=0)
            for name, preds in self.params["method_to_preds"].items()
        }

        # Reuse the enrichment outputs of a previous run when available.
        outputs_path = os.path.join(self.dir, "outputs.pkl")
        if os.path.exists(outputs_path):
            logging.info("Loading outputs...")
            # NOTE(security): pickle.load can execute arbitrary code; only
            # run this on experiment directories you trust.
            with open(outputs_path, 'rb') as f:
                self.outputs = pickle.load(f)
        else:
            self.outputs = {}

    def run_study(self, proteins):
        """
        Runs the enrichment study on `proteins`.
        Returns:
            dict mapping GO term name to its `p_fdr_bh` value
        """
        results = self.enrichment_study.run_study(proteins)
        term_to_pval = {r.goterm.name: r.p_fdr_bh for r in results}

        return term_to_pval

    def compute_spearman_correlation(self, a_term_to_pval, b_term_to_pval):
        """
        Computes the Spearman correlation between two term -> p-value
        mappings, pairing the values by term. Every term of `a_term_to_pval`
        must also be a key of `b_term_to_pval`.
        Returns:
            (correlation, p-value) as returned by `spearmanr`
        """
        terms = list(a_term_to_pval.keys())
        sp_corr, sp_pval = spearmanr([a_term_to_pval[term] for term in terms],
                                     [b_term_to_pval[term] for term in terms])
        return sp_corr, sp_pval

    @staticmethod
    def _significant_terms(term_to_pval):
        """Returns the set of terms whose p-value is below 0.05."""
        return {term for term, pval in term_to_pval.items() if pval < 0.05}

    def _top_terms(self, term_to_pval):
        """Returns the set of the `top_k` terms with the smallest p-values."""
        ranked = sorted(term_to_pval.items(), key=lambda x: x[1])
        return {term for term, _ in ranked[:self.params["top_k"]]}

    def process_disease(self, disease):
        """
        Computes the enrichment of the disease's proteins and of each
        method's top predictions, and compares them.
        Args:
            disease: object with `id` and `name` attributes; its proteins are
                looked up in `self.diseases_dict`
        Returns:
            (disease, results, output) where `results` is the row of summary
            metrics and `output` maps "disease" and every method name to the
            corresponding term -> p-value dict
        """
        output = {}
        # compute method scores for disease
        disease_proteins = set(self.diseases_dict[disease.id].proteins)

        # Enrichments stored for this disease in self.outputs are reused.
        if disease.id in self.outputs:
            disease_term_to_pval = self.outputs[disease.id]["disease"]
        else:
            disease_term_to_pval = self.run_study(disease_proteins)
        output["disease"] = disease_term_to_pval

        disease_terms = self._significant_terms(disease_term_to_pval)
        top_disease_terms = self._top_terms(disease_term_to_pval)

        results = {"disease_name": disease.name,
                   "disease_num_significant": len(disease_terms),
                   "disease_top_{}".format(self.params['top_k']): top_disease_terms}

        # number of predictions to be made; -1 means "as many as the disease
        # has proteins"
        num_preds = (len(disease_proteins)
                     if self.params["num_preds"] == -1
                     else self.params["num_preds"])

        for name, preds in self.method_to_preds.items():
            # Highest-scoring proteins predicted by this method.
            pred_proteins = set(map(int, preds.loc[disease.id]
                                              .sort_values(ascending=False)
                                              .index[:num_preds]))

            if disease.id in self.outputs:
                pred_term_to_pval = self.outputs[disease.id][name]
            else:
                pred_term_to_pval = self.run_study(pred_proteins)
            output[name] = pred_term_to_pval

            pred_terms = self._significant_terms(pred_term_to_pval)
            top_pred_terms = self._top_terms(pred_term_to_pval)

            # Jaccard similarity of the significant term sets; defined as 0
            # when neither set has any term.
            union = disease_terms | pred_terms
            jaccard = (len(disease_terms & pred_terms) / len(union)
                       if len(union) != 0 else 0)
            sp_corr, sp_pval = self.compute_spearman_correlation(disease_term_to_pval,
                                                                 pred_term_to_pval)

            results[f"{name}_num_significant"] = len(pred_terms)
            results[f"{name}_top_{self.params['top_k']}"] = top_pred_terms
            results[f"{name}_jaccard_sim"] = jaccard
            results[f"{name}_sp_corr"] = sp_corr
            results[f"{name}_sp_pval"] = sp_pval

        return disease, results, output

    def _run(self):
        """
        Run the experiment: process every disease (in parallel when
        `n_processes` > 1) and store the per-disease metrics in
        `self.results` and the raw enrichments in `self.outputs`.
        """
        results = []
        indices = []
        outputs = {}

        diseases = list(self.diseases_dict.values())
        diseases.sort(key=lambda x: x.split)
        if self.params["n_processes"] > 1:
            with tqdm(total=len(diseases)) as t:
                # The context manager shuts the worker pool down once all
                # results are consumed (or on error) instead of leaking the
                # worker processes.
                with Pool(self.params["n_processes"]) as p:
                    for disease, result, output in p.imap(process_disease_wrapper, diseases):
                        results.append(result)
                        indices.append(disease.id)
                        outputs[disease.id] = output
                        t.update()
        else:
            with tqdm(total=len(diseases)) as t:
                for disease in diseases:
                    disease, result, output = self.process_disease(disease)
                    results.append(result)
                    indices.append(disease.id)
                    outputs[disease.id] = output

                    t.update()

        self.outputs = outputs
        self.results = pd.DataFrame(results, index=indices)

    def save_results(self, summary=True):
        """
        Saves the results to a csv using a pandas Data Frame.
        Args:
            summary: not used by this implementation
        """
        print("Saving Results...")
        self.results.to_csv(os.path.join(self.dir, 'results.csv'))

    def load_results(self):
        """
        Loads the results from a csv to a pandas Data Frame.
        """
        print("Loading Results...")
        self.results = pd.read_csv(os.path.join(self.dir, 'results.csv'))
コード例 #24
0
ファイル: go.py プロジェクト: Shamir-Lab/EMP
def check_group_enrichment(tested_gene_file_name,
                           total_gene_file_name,
                           go_folder,
                           th=1):
    """Run a GO enrichment study of a tested gene set against a background.

    :param tested_gene_file_name: path of a gene-list file (loaded with
        ``load_gene_list``) or an already loaded sequence of genes.
    :param total_gene_file_name: background genes, given the same way.
    :param go_folder: folder expected to hold the GO obo file and the
        gene2go association file.
    :param th: not used; only the commented-out q-value filter referenced it.
    :return: list of dicts keyed by the ``HG_*`` constants, sorted by
        ascending uncorrected p-value; an empty list when either input is
        empty.
    """
    if len(tested_gene_file_name) == 0 or len(total_gene_file_name) == 0:
        return []

    # isinstance also accepts str subclasses, unlike the exact type check.
    if isinstance(total_gene_file_name, str):
        total_gene_list = load_gene_list(total_gene_file_name)
    else:
        total_gene_list = total_gene_file_name

    if isinstance(tested_gene_file_name, str):
        tested_gene_list = load_gene_list(tested_gene_file_name)
    else:
        tested_gene_list = tested_gene_file_name

    # NOTE(review): existence is checked under go_folder but downloads go to
    # constants.GO_DIR -- presumably the same directory; confirm with callers.
    obo_path = os.path.join(go_folder, constants.GO_FILE_NAME)
    if not os.path.exists(obo_path):
        download(constants.GO_OBO_URL, constants.GO_DIR)

    obo_dag = GODag(obo_path)

    assoc_path = os.path.join(go_folder, constants.GO_ASSOCIATION_FILE_NAME)
    if not os.path.exists(assoc_path):
        if not os.path.exists(assoc_path + ".gz"):
            download(constants.GO_ASSOCIATION_GENE2GEO_URL, constants.GO_DIR)
        # Decompress the downloaded archive next to it.
        archive_path = os.path.join(
            go_folder,
            os.path.basename(constants.GO_ASSOCIATION_GENE2GEO_URL))
        with gzip.open(archive_path, 'rb') as f_in:
            with open(assoc_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)

    # The associations are cached in a module-level variable so that they are
    # read only on the first call.
    global assoc
    if assoc is None:
        assoc = read_ncbi_gene2go(assoc_path, no_top=True)

    g = GOEnrichmentStudy(
        [int(cur) for cur in ensembl2entrez_convertor(total_gene_list)],
        assoc,
        obo_dag,
        log=None)  # "bonferroni", "fdr_bh"
    g_res = g.run_study(
        [int(cur) for cur in ensembl2entrez_convertor(tested_gene_list)])

    hg_report = [{
        HG_GO_ROOT: cur.NS,
        HG_GO_ID: cur.GO,
        HG_GO_NAME: cur.goterm.name,
        HG_VALUE: cur.pop_count,
        HG_PVAL: cur.p_uncorrected
    } for cur in g_res]
    hg_report.sort(key=lambda x: x[HG_PVAL])

    return hg_report
コード例 #25
0
ファイル: go_enrichment.py プロジェクト: gudeqing/biodev
def enrich(gene2go: str,
           study: str,
           obo: str,
           population: str = None,
           geneid2symbol: str = None,
           correct='fdr_bh',
           alpha=0.05,
           top=20,
           goea_out=None,
           dag_out=None,
           dpi=300,
           show_gene_limit=6,
           only_plot_sig=False):
    """
    Go enrichment based on goatools
    :param gene2go: a file with two columns: gene_id \t go_term_id
    :param study: a file with at least one column, first column contains gene id, second columns is regulation direction
    :param obo: go-basic file download from GeneOntology
    :param population: a file with each row contains one gene; default to use all genes in gene2go file as population
    :param geneid2symbol: file with two columns: gene_id \t gene_symbol, used for DAG plot
    :param correct: pvalue adjustment method:
        Method used for testing and adjustment of pvalues. Can be either the
        full name or initial letters. Available methods are:
        - `bonferroni` : one-step correction
        - `sidak` : one-step correction
        - `holm-sidak` : step down method using Sidak adjustments
        - `holm` : step-down method using Bonferroni adjustments
        - `simes-hochberg` : step-up method  (independent)
        - `hommel` : closed method based on Simes tests (non-negative)
        - `fdr_bh` : Benjamini/Hochberg  (non-negative)
        - `fdr_by` : Benjamini/Yekutieli (negative)
        - `fdr_tsbh` : two stage fdr correction (non-negative)
        - `fdr_tsbky` : two stage fdr correction (non-negative)
        The value 3 (or '3') is accepted as an alias of `fdr_bh`.
    :param alpha: fdr cutoff, default 0.05
    :param top: n top go terms to plot per namespace, taken in table order
    :param goea_out: output enrichment result file, default `study + '.goea.xls'`
    :param dag_out: dag figure file, default `study + '.goea.dag.svg'`; one
        file per namespace is written with the namespace inserted before the
        extension
    :param dpi: resolution of image, no effect for svg
    :param show_gene_limit: the max number of gene in a node to show
    :param only_plot_sig: only plot dag for significantly enriched terms
    :return: None
    """
    import os

    if str(correct) == '3':
        correct = 'fdr_bh'

    if geneid2symbol:
        with open(geneid2symbol) as handle:
            geneid2symbol = dict(
                line.split()[:2] for line in handle if line.strip())
    else:
        geneid2symbol = dict()

    obo = GODag(obo, optional_attrs=['relationship', 'is_a'])
    gene2go = read_associations(gene2go)

    # Read the study file once; blank lines are skipped so that they can no
    # longer raise an IndexError.
    with open(study) as handle:
        study_lines = [line.strip() for line in handle if line.strip()]
    study_genes = [line.split()[0] for line in study_lines]
    try:
        # gene id -> regulation direction; needs two columns on every line.
        reg_dict = dict(line.split()[:2] for line in study_lines)
    except ValueError:
        # At least one line has a single column: no regulation information.
        reg_dict = {line: '' for line in study_lines}

    if not population:
        population = gene2go.keys()
    else:
        with open(population) as handle:
            population = [
                line.split()[0] for line in handle if line.strip()
            ]

    goea_obj = GOEnrichmentStudy(population,
                                 gene2go,
                                 obo,
                                 propagate_counts=False,
                                 alpha=alpha,
                                 methods=('fdr_bh', ))

    def keep_if(record):
        # Drop the terms that no study gene is annotated with.
        return record.ratio_in_study[0] != 0

    goea_results_all = goea_obj.run_study(study_genes, keep_if=keep_if)
    goea_out = goea_out or study + '.goea.xls'
    goea_obj.wr_tsv(goea_out, goea_results_all)

    def func(y):
        # Turn "g1, g2" into "g1|direction|symbol;g2|..." using whatever
        # annotation is known for each gene.
        results = []
        genes = [x.strip() for x in y.split(',')]
        for gene in genes:
            tmp = [gene]
            if gene in reg_dict:
                tmp.append(reg_dict[gene])
            if gene in geneid2symbol:
                tmp.append(geneid2symbol[gene])
            results.append('|'.join(tmp))
        return ';'.join(results)

    table = pd.read_table(goea_out, header=0, index_col=0)
    # Re-correct the p-values with the requested method and update the table.
    corrected = multipletests(table['p_uncorrected'], method=correct)[1]
    table['p_fdr_bh'] = corrected
    # Update goea_results_all as well so that the plots below are coloured
    # with the re-corrected values.
    for record, p_corrected in zip(goea_results_all, corrected):
        record.p_fdr_bh = p_corrected
    table.columns = [
        x if x != 'p_fdr_bh' else 'p_corrected' for x in table.columns
    ]
    table['enrichment'] = [
        'e' if x <= alpha else 'p' for x in table['p_corrected']
    ]
    table['study_items'] = table.loc[:, 'study_items'].map(func)
    table.to_csv(goea_out, header=True, index=True, sep='\t')

    # -------------------plot dag------------------------
    dag_out = dag_out or study + '.goea.dag.svg'
    # splitext handles extensions of any length, unlike slicing off the last
    # four characters.
    dag_base, dag_ext = os.path.splitext(dag_out)
    for each in ['BP', 'MF', 'CC']:
        if only_plot_sig:
            goea_results_sig = table[table['enrichment'] == 'e']
        else:
            goea_results_sig = table.copy()
        goea_results_sig = goea_results_sig[goea_results_sig['NS'] == each]
        if not goea_results_sig.shape[0]:
            print(f"No significant term to plot for {each} ")
            # Skip only this namespace; the remaining ones are still plotted.
            continue
        goid_subset = list(goea_results_sig.iloc[:top].index)
        plot_gos(
            dag_base + '.' + each + dag_ext,
            # Source GO ids; a node missing from the analysis results would
            # be drawn pale green, but that cannot happen here.
            goid_subset,
            obo,
            # use pvals for coloring
            goea_results=goea_results_all,
            # We can further configure the plot...
            id2symbol=geneid2symbol,  # Print study gene Symbols, not GeneIDs
            study_items=show_gene_limit,  # Max number of gene Symbols on GO terms
            items_p_line=3,  # Print 3 genes per line
            dpi=0 if dag_out.endswith('svg') else dpi,
        )
コード例 #26
0
        'MEX3A'
    ]
    target_gene_prot_ids = get_prot_ids_of_genes(target_gene)
    gene_prot_id_dict = get_prot_ids_of_genes(gene_names)
    target_study = list(target_gene_prot_ids.values())
    target_study = [x for inside_list in target_study for x in inside_list]
    study = list(gene_prot_id_dict.values())
    study = [x for inside_list in study for x in inside_list]

    g = GOEnrichmentStudy(pop,
                          assoc,
                          go,
                          propagate_counts=True,
                          alpha=0.05,
                          methods=methods)
    g_res_target = g.run_study(target_study)
    pathways_of_target = [x.GO for x in g_res_target if x.study_count > 0]
    target_assoc = {}
    for key, entry in assoc.items():
        for go_id in pathways_of_target:
            if go_id in entry:
                target_assoc[key] = entry
    g = GOEnrichmentStudy(pop,
                          assoc,
                          go,
                          propagate_counts=True,
                          alpha=0.05,
                          methods=methods)
    g_res = g.run_study(study)
    g.prt_txt(sys.stdout, g_res)
    x = 0
コード例 #27
0
class FunctionalEnrichmentAnalysis(Experiment):
    """
    Experiment that runs a GO enrichment study on the network nodes with the
    largest degree-normalised `ci_weight` values.
    """

    def __init__(self, dir, params):
        """
        Loads the disease associations, the network, the `ci_weight` values
        of the stored models and the GO enrichment study.
        Args:
            dir (string) directory of the experiment
            params (dict) experiment parameters
        """
        super().__init__(dir, params)

        log_path = os.path.join(self.dir, 'experiment.log')
        set_logger(log_path, level=logging.INFO, console=True)

        logging.info("Loading disease associations...")
        self.diseases_dict = load_diseases(self.params["associations_path"],
                                           self.params["disease_subset"],
                                           exclude_splits=['none'])

        logging.info("Loading network...")
        self.network = Network(self.params["ppi_network"])
        node_to_degree = dict(self.network.nx.degree())
        self.degrees = np.array(list(node_to_degree.values()))

        logging.info("Loading weights...")
        models_path = os.path.join(params["model_path"], "models", "models.tar")
        with open(models_path, "rb") as models_file:
            split_to_model = pickle.load(models_file)

        # Average the weights over all the stored models, then scale them by
        # the square root of each node's degree.
        weights_per_model = [model['ci_weight'][0, 0].numpy()
                             for model in split_to_model.values()]
        self.ci_weights = np.mean(weights_per_model, axis=0)
        self.ci_weights_norm = self.ci_weights / np.sqrt(self.degrees)

        logging.info("Loading enrichment study...")
        geneid2go = read_ncbi_gene2go("data/go/gene2go.txt", taxids=[9606])
        obodag = GODag("data/go/go-basic.obo")
        self.go_study = GOEnrichmentStudy(
            self.network.get_names(),
            geneid2go,
            obodag,
            propagate_counts=True,
            alpha=0.05,
            methods=['fdr_bh'],
        )

    def run_study(self):
        """
        Runs the enrichment study on the `top_k` nodes with the largest
        normalised weights and stores the records in `self.raw_results`.
        """
        k = self.params["top_k"]
        ranked_nodes = np.argsort(self.ci_weights_norm)
        top_proteins = self.network.get_names(ranked_nodes[-k:])
        self.raw_results = self.go_study.run_study(set(top_proteins))

    def to_csv(self):
        """
        Stores the raw results, sorted by ascending p-value, in
        `self.results` and writes them to "all_terms.csv" in the experiment
        directory.
        """
        rows = [
            {
                "name": record.name,
                "pvalue": record.p_fdr_bh,
                "goterm_id": record.goterm.id,
            }
            for record in self.raw_results
        ]
        rows.sort(key=lambda row: row["pvalue"])
        self.results = rows

        out_path = os.path.join(self.dir, "all_terms.csv")
        pd.DataFrame(self.results).to_csv(out_path)
コード例 #28
0
class GeneOntology:
    """Run goatools GO enrichment against the annotated canonical ORFs."""

    def __init__(self, go_obo_path='data/go.obo'):
        """Load the GO DAG and the ORF annotations and build the study object.

        :param go_obo_path: path of the GO obo file.
        """
        import os

        canonical_orfs = paper_orfs

        self.obodag = GODag(go_obo_path)

        # read genes containing GO Ontology annotations
        orfs_with_go = read_sgd_orfs()

        # only use canonical orfs dataset: the inner join with a column-less
        # frame keeps the rows whose index is also in canonical_orfs
        self.orfs_with_go = orfs_with_go.join(canonical_orfs[[]], how='inner')

        # create mapping of gene names to set of GO annotations
        assoc = defaultdict(set)
        for _, gene in self.orfs_with_go.iterrows():
            assoc[gene['name']] = set(gene.ontology.split(','))
        self.assoc = assoc
        self.methods = ['fdr_bh', 'bonferroni']

        # Sink for the goatools log output. os.devnull is the portable
        # spelling of '/dev/null'. The handle stays open for the lifetime of
        # the object because the study object keeps writing to it.
        self.devnull = open(os.devnull, 'w')

        # create GO enrichment object to run GO
        self.goeaobj = GOEnrichmentStudy(
            assoc.keys(),  # List of protein-coding genes
            assoc,  # geneid/GO associations
            self.obodag,  # Ontologies
            propagate_counts=False,
            alpha=0.05,  # default significance cut-off
            methods=self.methods,
            log=self.devnull)

    def run_go(self, geneids, sig=0.001):
        """Run gene ontology against set of genes.

        Sets ``goea_results_all`` (every record), ``goea_results_sig``
        (records with ``p_fdr_bh < sig`` and at least one study gene),
        ``results_df`` (one row per record, sorted by ``fdr_bh``) and
        ``results_sig_df`` (the rows of ``results_df`` passing the same
        filter).

        :param geneids: study gene ids passed to ``run_study``.
        :param sig: threshold on the Benjamini-Hochberg corrected p-value.
        """
        self.goea_results_all = self.goeaobj.run_study(geneids)
        self.goea_results_sig = [
            r for r in self.goea_results_all
            if (r.p_fdr_bh < sig and r.study_count > 0)
        ]

        cols = [
            'id', 'name', 'pop_count', 'pop_n', 'study_count', 'study_n',
            'pop_items', 'study_items'
        ] + self.methods

        records = self.goea_results_all
        results_dic = {
            'id': [g.GO for g in records],
            'name': [g.name for g in records],
            'pop_count': [g.pop_count for g in records],
            'pop_n': [g.pop_n for g in records],
            'study_count': [g.study_count for g in records],
            'study_n': [g.study_n for g in records],
            'pop_items': [','.join(g.pop_items) for g in records],
            'study_items': [','.join(g.study_items) for g in records],
        }
        for method in self.methods:
            # The corrected p-value of a method is stored on the record as
            # attribute 'p_<method>', with '-' replaced by '_'.
            attr = 'p_' + method.replace('-', '_')
            results_dic[method] = [getattr(g, attr) for g in records]

        results_df = pandas.DataFrame(results_dic)
        self.results_df = results_df[cols].sort_values('fdr_bh').reset_index(
            drop=True)
        self.results_sig_df = self.results_df[
            (self.results_df.fdr_bh < sig) & (self.results_df.study_count > 0)]

    def plot_sig(self):
        """Plot the significant results of the last ``run_go`` call."""
        plot_results("test_{NS}.pdf", self.goea_results_sig)
コード例 #29
0
def goe(
    genelist,
    go_file,
    goa_file,
    bg=None,
    nmin=5,
    conversion=None,
    evidence_set={
        'EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'HTP', 'HDA', 'HMP', 'HGI', 'IBA',
        'IBD', 'IKR', 'IRD', 'ISS', 'ISO', 'ISA', 'ISM'
    }):
    """Finds GO enrichment with goatools (0.7.11 tested).

	**WARNING**\ : This method is inexact for multi-maps in gene name conversion. However, it has a negligible effect in top GO component removal in single-cell co-expression.

	Parameters
	------------
	genelist:	list of str
		Genes to search for enrichment.
	go_file:	str
		File path for GO DAG (downloadable at http://geneontology.org/docs/download-ontology/)).
	goa_file:	str
		File path for GO associations. See parameter **conversion**.
	bg:			list of str
		Background genes.
	nmin:		int
		Minimum number of principal genes required in GO.
	conversion:	tuple
		Conversion of `gene ID system <https://docs.mygene.info/en/latest/doc/data.html>`_ from gene list to the GO annotation.

		* name_from:	Gene naming system of genelist. For gene names, use 'symbol,alias'.
		* name_to:		Gene naming system of goa_file. Examples:

			* Human: use 'uniprot.Swiss-Prot' (for GO annotations downloded from http://geneontology.org/gene-associations/goa_human.gaf.gz).
			* Mouse: use 'MGI' (for GO annotations downloded from http://current.geneontology.org/annotations/mgi.gaf.gz).

		* species:		Species for gene name conversion. Examples: 'human', 'mouse'.

	evidence_set:	set of str
		`GO evidences <http://geneontology.org/docs/guide-go-evidence-codes/>`_ to include. Defaults to non-expression based results to avoid circular reasoning bias.

	Returns
	----------
	goe:		pandas.DataFrame
		GO enrichment.
	gotop:		str
		Top enriched GO ID
	genes:		list of str or None
		Intersection list of genes in gotop and also bg. None if bg is None.

	"""
    from tempfile import NamedTemporaryFile
    from os import linesep
    from goatools.go_enrichment import GOEnrichmentStudy
    from goatools.obo_parser import GODag
    from goatools.associations import read_gaf
    from collections import defaultdict
    import itertools
    from biothings_client import get_client
    import pandas as pd
    import logging
    assert type(genelist) is list and len(genelist) > 0
    if nmin < 1:
        nmin = 1

    bg0 = bg
    # Convert gene names
    if conversion is not None:
        assert len(conversion) == 3
        name_from, name_to, species = conversion
        mg = get_client('gene')
        ans = set(genelist)
        if bg is not None:
            t1 = set(bg)
            assert len(ans - t1) == 0
            ans |= t1
        ans = list(ans)
        ans = mg.querymany(ans,
                           scopes=name_from,
                           fields=name_to,
                           species=species)
        t1 = set(['query', '_score', name_to.split('.')[0]])
        ans = list(filter(lambda x: len(t1 - set(x)) == 0, ans))
        ans = sorted(ans, key=lambda x: x['_score'])
        convert = {x['query']: x for x in ans}
        for xi in name_to.split('.'):
            convert = filter(lambda x: xi in x[1], convert.items())
            convert = {x[0]: x[1][xi] for x in convert}
        convert = {
            x[0]: x[1] if type(x[1]) is str else x[1][0]
            for x in convert.items()
        }
        genelist2 = list(
            set([convert[x]
                 for x in filter(lambda x: x in convert, genelist)]))
        if bg is not None:
            bg = list(
                set([convert[x] for x in filter(lambda x: x in convert, bg)]))
        t1 = set(genelist)
        converti = list(filter(lambda x: x[0] in t1, convert.items()))
        t1 = defaultdict(list)
        for xi in converti:
            t1[xi[1]].append(xi[0])
        converti = dict(t1)
        t1 = defaultdict(list)
        for xi in convert.items():
            t1[xi[1]].append(xi[0])
        convertia = dict(t1)
    else:
        genelist2 = genelist

    # Load GO DAG and association files
    logging.debug('Reading GO DAG file ' + go_file)
    godag = GODag(go_file)
    logging.debug('Reading GO association file ' + goa_file)
    goa = read_gaf(goa_file, evidence_set=evidence_set)
    if bg is None:
        bg = list(goa.keys())

    # Compute enrichment
    goe = GOEnrichmentStudy(bg, goa, godag)
    ans = goe.run_study(genelist2)
    # Format output
    with NamedTemporaryFile() as f:
        goe.wr_tsv(f.name, ans)
        ans = f.read()
    ans = ans.decode()
    ans = [x.split('\t') for x in ans.split(linesep)]
    if len(ans[-1]) < 2:
        ans = ans[:-1]
    if len(ans) == 0 or len(ans[0]) == 0:
        raise ValueError('No enrichment found. Check your input ID type.')
    ans[0][0] = ans[0][0].strip('# ')
    ans = pd.DataFrame(ans[1:], columns=ans[0])
    ans.drop(['NS', 'enrichment', 'study_count', 'p_sidak', 'p_holm'],
             axis=1,
             inplace=True)
    for xj in ['p_uncorrected', 'p_bonferroni']:
        ans[xj] = pd.to_numeric(ans[xj], errors='raise')
    ans['depth'] = pd.to_numeric(ans['depth'],
                                 errors='raise',
                                 downcast='unsigned')
    # Odds ratio column and sort column
    ans['odds_ratio'] = toratio(ans['ratio_in_study']) / toratio(
        ans['ratio_in_pop'])
    ans = ans[[
        'name', 'depth', 'p_uncorrected', 'p_bonferroni', 'odds_ratio',
        'ratio_in_study', 'ratio_in_pop', 'GO', 'study_items'
    ]]
    ans['study_items'] = ans['study_items'].apply(lambda x: x.replace(' ', ''))
    # Convert back study_items
    if conversion is not None:
        ans['study_items'] = ans['study_items'].apply(lambda x: ','.join(
            list(
                itertools.chain.from_iterable(
                    [converti[y] for y in x.split(',')])))
                                                      if len(x) > 0 else x)
    ans.sort_values('p_uncorrected', inplace=True)

    # Get top enriched GO by P-value
    gotop = ans[
        (ans['odds_ratio'] > 1)
        & ans['ratio_in_study'].apply(lambda x: int(x.split('/')[0]) >= nmin)]
    if len(gotop) == 0:
        raise ValueError('No GO enrichment found for given criteria.')
    gotop = str(gotop.iloc[0]['GO'])
    if bg0 is not None:
        # Children GOs
        gos = set([gotop] + list(godag.query_term(gotop).get_all_children()))
        # Look for genes
        genes = list(
            filter(lambda x: len(list(filter(lambda y: y in gos, goa[x]))) > 0,
                   goa))
        if conversion is not None:
            genes = [
                convertia[x] for x in filter(lambda x: x in convertia, genes)
            ]
            genes = list(set(list(itertools.chain.from_iterable(genes))))
        genes = set(genes)
        genes = list(filter(lambda x: x in genes, bg0))
    else:
        genes = None
    return (ans, gotop, genes)
コード例 #30
0
        if overlap <= 0.7:
            exit("\nERROR: only {} of genes/proteins in the study are found in the "
                 "background population. Please check.\n".format(overlap))

    assoc = read_associations(assoc_fn)

    methods = args.method.split(",")

    obo_dag = GODag(obo_file=args.obo)
    propagate_counts = not args.no_propagate_counts
    g = GOEnrichmentStudy(pop, assoc, obo_dag,
                          propagate_counts=propagate_counts,
                          alpha=args.alpha,
                          pvalcalc=args.pvalcalc,
                          methods=methods)
    results = g.run_study(study)
    if args.outfile is None:
        g.print_summary(results, min_ratio=min_ratio, indent=args.indent, pval=args.pval)
    else:
        # Users can print to both tab-separated file and xlsx file in one run.
        outfiles = args.outfile.split(",")
        prt_if = None # Print all values
        if args.pval is not None:
            # Only print out when uncorrected p-value < this value.
            prt_if = lambda nt: nt.p_uncorrected < args.pval
        for outfile in outfiles:
            if outfile.endswith(".xlsx"):
                g.wr_xlsx(outfile, results, prt_if=prt_if)
            else:
                g.wr_tsv(outfile, results, prt_if=prt_if)
コード例 #31
0
def check_group_enrichment_goatools(tested_gene_file_name,
                                    total_gene_file_name,
                                    th=1):
    """Run a goatools GO enrichment of a tested gene set against a background.

    The GO DAG file and the gene-to-GO association file are downloaded into
    ``constants.GO_DIR`` on first use (the association archive is gunzipped
    next to it).

    Args:
        tested_gene_file_name: either a path (``str``) that is read with
            ``load_gene_list``, or an already loaded collection of gene ids.
            The ids are passed through ``ensembl2entrez_convertor`` and cast
            to ``int`` -- presumably Ensembl ids; verify against callers.
        total_gene_file_name: the background population, given the same way.
        th: only GO terms whose uncorrected p-value is ``<= th`` are reported.

    Returns:
        A list of dicts, sorted by ascending uncorrected p-value, keyed by
        ``HG_GO_ROOT``, ``HG_GO_ID``, ``HG_GO_NAME``, ``HG_VALUE``,
        ``HG_PVAL`` and ``HG_QVAL``. Returns ``[]`` when either input is
        empty.
    """
    if len(tested_gene_file_name) == 0 or len(total_gene_file_name) == 0:
        return []

    if isinstance(total_gene_file_name, str):
        total_gene_list = load_gene_list(total_gene_file_name)
    else:
        total_gene_list = total_gene_file_name

    if isinstance(tested_gene_file_name, str):
        tested_gene_list = load_gene_list(tested_gene_file_name)
    else:
        tested_gene_list = tested_gene_file_name

    # Fetch the ontology file on first use.
    go_path = os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)
    if not os.path.exists(go_path):
        download(constants.GO_OBO_URL, constants.GO_DIR)
    obo_dag = GODag(go_path)

    # Fetch and decompress the gene-to-GO association file on first use.
    assoc_path = os.path.join(constants.GO_DIR,
                              constants.GO_ASSOCIATION_FILE_NAME)
    if not os.path.exists(assoc_path):
        download(constants.GO_ASSOCIATION_GENE2GEO_URL, constants.GO_DIR)
        archive_path = os.path.join(
            constants.GO_DIR,
            os.path.basename(constants.GO_ASSOCIATION_GENE2GEO_URL))
        with gzip.open(archive_path, 'rb') as f_in, \
                open(assoc_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

    assoc = read_ncbi_gene2go(assoc_path, no_top=True)

    sw = Stopwatch()
    sw.start()
    # methods=[] disables every multiple-testing correction, so only the
    # uncorrected p-value is available on the result records.
    g = GOEnrichmentStudy(
        [int(cur) for cur in ensembl2entrez_convertor(total_gene_list)],
        assoc,
        obo_dag,
        methods=[],
        log=None)
    g_res = g.run_study(
        [int(cur) for cur in ensembl2entrez_convertor(tested_gene_list)])
    # Parenthesised single-argument form works as a Python 2 print statement
    # and as a Python 3 function call.
    print(sw.stop("done GO analysis in "))

    hg_report = [{
        HG_GO_ROOT: cur.NS,
        HG_GO_ID: cur.GO,
        HG_GO_NAME: cur.goterm.name,
        HG_VALUE: cur.pop_count,
        HG_PVAL: cur.p_uncorrected,
        # Placeholder: no corrected p-value is computed (see methods=[]).
        HG_QVAL: 1
    } for cur in g_res if cur.p_uncorrected <= th]
    hg_report.sort(key=lambda x: x[HG_PVAL])
    return hg_report
コード例 #32
0
def get_GO(gene_query, species='mouse'):
    """
    Get Gene Ontologies (GOs).

    Args:
        gene_query (array of str): gene list (gene symbols; they are looked
            up in the symbol column of the species cross-reference table).

        species (str): Select species. Either "mouse" or "human"

    Returns:
        pandas.dataframe: GO analysis results as dataframe. The columns are
        the default goatools print fields plus a "genes" column produced by
        ``_ids2symbols``. An empty dataframe is returned when the enrichment
        study yields no result records.

    Raises:
        ValueError: if ``species`` is neither "mouse" nor "human".
    """
    # Fail fast: an unsupported species used to surface much later as an
    # UnboundLocalError, after the data files had already been loaded.
    if species not in ('human', 'mouse'):
        raise ValueError(
            "species must be either 'mouse' or 'human', got {!r}".format(
                species))

    GOIs = gene_query

    # prepare files
    # check files
    _check_data_and_download_if_necessary(go_folder)

    obodag = GODag(os.path.join(go_folder, "go-basic.obo"))

    # go analysis

    if species == 'human':

        geneid2gos = read_ncbi_gene2go(os.path.join(go_folder, "gene2go.txt"),
                                       taxids=[9606])
        print("{N:,} annotated genes".format(N=len(geneid2gos)))

        # The human branch previously had no background population at all
        # (the only import bound the mouse gene table), so it always crashed.
        # NOTE(review): goatools' bundled human protein-coding table is used
        # as the counterpart of the mouse table -- confirm this is the
        # intended background.
        from goatools.test_data.genes_NCBI_9606_ProteinCoding import GENEID2NT as background_genes

        Xtable = pd.read_csv(os.path.join(go_folder, 'hg19_xref.txt'),
                             sep='\t')
        Xtable.index = Xtable['Approved Symbol']

    else:  # species == 'mouse'

        geneid2gos = read_ncbi_gene2go(os.path.join(go_folder, "gene2go.txt"),
                                       taxids=[10090])
        print("{N:,} annotated genes".format(N=len(geneid2gos)))

        from goatools.test_data.genes_NCBI_10090_ProteinCoding import GENEID2NT as background_genes

        Xtable = pd.read_csv(os.path.join(go_folder, 'biomart_xref.mm10.txt'),
                             sep='\t')
        Xtable = Xtable[['Associated Gene Name', 'EntrezGene ID']].dropna()
        Xtable.index = Xtable['Associated Gene Name']

    # Translate the queried symbols into unique Entrez gene ids.
    GOIs_entrez = [
        int(x)
        for x in np.unique(Xtable.loc[GOIs].dropna()['EntrezGene ID'])
    ]

    print("processing " + str(len(GOIs)) + " genes ...")

    goeaobj = GOEnrichmentStudy(
        background_genes.keys(),  # List of protein-coding genes
        geneid2gos,  # geneid/GO associations
        obodag,  # Ontologies
        propagate_counts=False,
        alpha=0.05,  # default significance cut-off
        methods=['fdr_bh'])  # default multipletest correction method

    goea_results = goeaobj.run_study(GOIs_entrez)

    # Indexing goea_results[0] on an empty result list used to raise
    # IndexError, which made the "no result" message unreachable.
    if not goea_results:
        print("Found No GO with significant p-value")
        return pd.DataFrame([])

    go_default_output = goea_results[0].get_prtflds_default()
    li = [i.get_field_values(go_default_output) for i in goea_results]

    df_GO = pd.DataFrame(li)
    df_GO.columns = go_default_output
    df_GO["genes"] = df_GO.study_items.apply(
        lambda x: _ids2symbols(x, species))

    return df_GO
コード例 #33
0
            genename_2_id[s[2]] = s[1]
# Reverse lookup: SGD id -> gene name.
id_2_genename = {sgd_id: gene for gene, sgd_id in genename_2_id.items()}
# Only looking at "biological process" GO terms
geneid2gos_yeast = read_gaf('../accessory_files/gene_association.sgd',
                            namespace='BP')
ids = list(geneid2gos_yeast.keys())
# Background population: every id present in the name -> id mapping.
background_set = list(genename_2_id.values())
goeaobj = GOEnrichmentStudy(
    background_set,  # List of all genes in analysis
    geneid2gos_yeast,  # geneid/GO associations
    obodag,  # Ontologies
    propagate_counts=False,
    alpha=0.05,  # default significance cut-off
    methods=['fdr_bh'])  # default multipletest correction method

# Keep only the terms with an uncorrected p-value below 0.05, then order
# them by their Benjamini-Hochberg corrected p-value.
goea_results_all = goeaobj.run_study(
    multi_hit_sgdids,
    keep_if=lambda rec: rec.p_uncorrected < 0.05)
go_results = sorted(goea_results_all, key=lambda rec: rec.p_fdr_bh)

cols = [
    'GO ID', 'GO term', 'pval_uncorrected', 'pval_benjamini_hochberg',
    'num_hits', 'num_in_group', 'hits'
]
# One output row per GO term; hit ids are translated back to gene names.
big_mat = [
    [
        hit.GO, hit.name, hit.p_uncorrected, hit.p_fdr_bh,
        hit.ratio_in_study[0], hit.ratio_in_pop[0],
        ';'.join([id_2_genename[sgd_id] for sgd_id in hit.study_items])
    ]
    for hit in go_results
]

pd.DataFrame(big_mat,
コード例 #34
0
ファイル: gsea.py プロジェクト: fransua/SerUtils
    out = open(
        'coord_{}_A-{}_B-{}.tsv'.format(strategy, species[key[0]]['vulgar'],
                                        species[key[1]]['vulgar']), 'w')
    out.write('\n'.join([
        '{}\t{}\t{}'.format(c, b * reso, b * reso + reso) for c, b in pos_list
    ]) + '\n')
    out.close()
    out = open(
        'genes_{}_A-{}_B-{}.tsv'.format(strategy, species[key[0]]['vulgar'],
                                        species[key[1]]['vulgar']), 'w')
    out.write('\n'.join(gene_list) + '\n')
    out.close()

    # run GSE test
    print(' - Repressed ({:5,d}): '.format(len(gene_list)))
    results_all = goeaobj.run_study(
        [reverse[k] for k in gene_list if k in reverse], log=None)
    results_sig = [r for r in results_all if r.p_fdr_bh < 0.05]
    GO_results[spe]['Repressed'] = results_all
    if results_sig:
        for r, s in sorted([(r.get_pvalue(), r.name) for r in results_sig]):
            print('   -> {:8.3g} {}'.format(r, s))
        plot_results("sign_{}_pv05_repressed.png".format(spe.replace(' ',
                                                                     ' ')),
                     results_sig,
                     log=open('/dev/null', 'w'))
    print('.' * 50)

    pos_list = [(c, b) for c, b in results[key]
                if test2(key, c, b) and np.isfinite(results[key][c, b][which])]
    gene_list = reduce(lambda x, y: x + y,
                       [genes.get((c, b), []) for c, b in pos_list])
コード例 #35
0
ファイル: validate.py プロジェクト: KyleTDavid/DeepBio
from goatools.obo_parser import GODag
obodag = GODag("go-basic.obo")

from goatools.go_enrichment import GOEnrichmentStudy
# Enrichment study over all UniProt entries, using FDR (Benjamini-Hochberg)
# correction with a significance cut-off of 0.001.
goeaobj = GOEnrichmentStudy(
    uniprot_df.Entry,
    ns2assoc,  # geneid/GO associations
    obodag,  # Ontologies
    propagate_counts=False,
    alpha=0.001,
    methods=['fdr_bh'])

# Collect one row per enriched GO term for every encoding whose 'n' is >= 2.
gos = []
for e in set(uniprot_df[uniprot_df['n'] >= 2]['Encoding']):
    entries_for_encoding = list(uniprot_df[uniprot_df['Encoding'] == e].Entry)
    goea_results = goeaobj.run_study(entries_for_encoding)
    gos.extend(
        [r.goterm.id, r.name, r.goterm.namespace, e, r.study_items,
         r.p_fdr_bh]
        for r in goea_results
        if r.enrichment == 'e')

godf = pd.DataFrame(
    gos, columns=['id', 'name', 'category', 'encoding', 'members', 'p'])
# A term is "unique" when its name occurs in exactly one row.
godf['unique?'] = ~godf['name'].duplicated(keep=False)
godf['member_count'] = godf.members.apply(len)
godf['representation'] = godf.apply(lambda row: row.member_count / int(
    np.unique(uniprot_df[uniprot_df.Encoding == row.encoding]['n'])),
コード例 #36
0
elif snakemake.wildcards.state_type == 'Enhancer':
    min_dist = 5000
    max_dist = 50000
else:
    sys.exit(-1)
# Collect, per cluster, the genes whose distance (last column) lies within
# [min_dist, max_dist]; every kept gene also joins the background set.
with open(snakemake.input.clusters) as f:
    for line in f:
        cols = line.strip().split()
        # Column 3 holds a numeric cluster index; 0 maps to 'A', 1 to 'B', ...
        cluster = chr(int(cols[3]) + 65)
        if min_dist <= int(cols[-1]) <= max_dist:
            genes[cluster].add(cols[7])
            background.add(cols[7])

obodag = GODag("go-basic.obo")
id2go = read_associations("sym2go.txt")
goeaobj = GOEnrichmentStudy(background, id2go, obodag, propagate_counts=False, alpha=0.05, methods=['fdr_bh'])
# The report file is opened in a `with` block so it is flushed and closed
# even if one of the enrichment runs raises.
with open(snakemake.output.txt, 'w') as outfile:
    for cluster, geneids in sorted(genes.items()):
        outfile.write("Cluster {}\n".format(cluster))
        goea_results_all = goeaobj.run_study(geneids)
        # Report terms with an FDR below 0.2, most significant first.
        for fdr, name, enrichment in sorted([(r.p_fdr_bh, r.name, r.enrichment) for r in goea_results_all if r.p_fdr_bh < 0.2]):
            outfile.write("\t{}\t{}\t{}\n".format(name, fdr, enrichment))
        outfile.write("\n")






コード例 #37
0
# Restrict the function annotations and the GO associations to the genes
# that are members of C_int_UP.
dictionary = dict((gene, funcs[gene]) for gene in funcs if gene in C_int_UP)

GO_IDs = dict((gene, assoc[gene]) for gene in assoc if gene in C_int_UP)

study = dictionary.keys()

# Enrichment of the study genes against the population, with propagated
# counts and four multiple-testing corrections.
g = GO_en(pop, assoc, go,
          propagate_counts=True,
          alpha=0.05,
          methods=["bonferroni", "sidak", "holm", "fdr"])
g_res = g.run_study(study)

# Select GO terms whose corrected p-value is at most 0.01, separately for
# the Bonferroni and the FDR correction.
s_bonferroni = [(rec.goterm.id, rec.p_bonferroni)
                for rec in g_res if rec.p_bonferroni <= 0.01]
s_fdr = [(rec.goterm.id, rec.p_fdr)
         for rec in g_res if rec.p_fdr <= 0.01]

enriched_GO_ID_bon = [entry[0] for entry in s_bonferroni]
enriched_GO_ID_fdr = [entry[0] for entry in s_fdr]

#Only select genes with GO terms that are enriched
コード例 #38
0
from goatools.obo_parser import GODag
obodag = GODag("go-basic.obo")

from goatools.go_enrichment import GOEnrichmentStudy
# Enrichment study over all entries of df, using FDR (Benjamini-Hochberg)
# correction with a significance cut-off of 0.001.
goeaobj = GOEnrichmentStudy(
    df.Entry,
    ns2assoc,  # geneid/GO associations
    obodag,  # Ontologies
    propagate_counts=False,
    alpha=0.001,
    methods=['fdr_bh'])

# Collect one row per significantly enriched GO term for every encoding
# whose 'n' is >= 2.
gos = []
for e in set(df[df['n'] >= 2]['Encoding']):
    entries_for_encoding = list(df[df['Encoding'] == e].Entry)
    goea_results = goeaobj.run_study(entries_for_encoding)
    gos.extend(
        [r.goterm.id, r.name, r.goterm.namespace, e, r.study_items]
        for r in goea_results
        if r.p_fdr_bh < 0.001 and r.enrichment == 'e')

godf = pd.DataFrame(
    gos, columns=['id', 'name', 'category', 'encoding', 'members'])
# A term is "unique" when its name occurs in exactly one row.
godf['unique?'] = ~godf['name'].duplicated(keep=False)
# Member count first, then divided by the 'n' value of the row's encoding.
godf['representation'] = godf.members.apply(len)
godf['representation'] = godf.apply(
    lambda row: row.representation / int(
        np.unique(df[df.Encoding == row.encoding]['n'])),
    axis=1)


#what % of encodings have at least one significant GO?