def run(path, f, output_path, dictionary, column='gene', **kwargs):
    """Run an enrichment analysis on one gene-list file and save the result.

    Parameters:
    -------------------
    path - prefix that is concatenated directly with ``f`` to locate the input
    f - name of the input file; also used to name the output file
    output_path - prefix that is concatenated directly with ``f + '.csv'``
    dictionary - annotation dictionary handed to ``tea.enrichment_analysis``
    column - name of the column in the input file that holds the genes
    kwargs - forwarded unchanged to ``pd.read_csv``

    Only rows whose ``Observed`` value is greater than 2 are written out.
    """
    source_file = path + f
    target_file = output_path + f + '.csv'

    gene_table = pd.read_csv(source_file, **kwargs)
    results = tea.enrichment_analysis(gene_table[column], dictionary,
                                      show=False)

    # keep only the terms that were observed more than twice
    well_supported = results[results.Observed > 2]
    well_supported.to_csv(target_file, index=False)
Exemple #2
0
    def run_tea(parent, sleuth, out, q_threshold=0.05):
        """
        Runs TEA on sleuth output.

        For every ``.csv`` file found in ``sleuth`` the genes with
        ``qval < q_threshold`` are selected (column ``ens_gene``) and a
        tissue, phenotype and GO enrichment analysis is run on them. Results
        and plots are saved inside ``out``.

        :param parent: directory to change into before doing any work;
                       ``sleuth`` and ``out`` are resolved relative to it
                       when they are relative paths.
        :param sleuth: directory containing the sleuth result csv files.
        :param out: output directory; created if it does not exist.
        :param q_threshold: q-value cutoff used to select genes.

        Exits the interpreter with status 0 if either
        ``tissue_enrichment_analysis`` or ``pandas`` cannot be imported.
        The original working directory is restored even when an analysis
        raises.
        """
        try:
            import tissue_enrichment_analysis as tea
        except ImportError:
            print_with_flush('# TEA is not installed...skipping')
            sys.exit(0)
        try:
            import pandas as pd
        except ImportError:
            print_with_flush('# pandas is not installed...skipping')
            sys.exit(0)

        analyses = ['tissue', 'phenotype', 'go']

        # Load sleuth results.
        wdir = os.getcwd()
        print_with_flush('# entering 3_diff_exp')
        os.chdir(parent)
        try:
            print_with_flush('# creating {} directory'.format(out))
            os.makedirs(out, exist_ok=True)
            for file in os.listdir(sleuth):
                if not file.endswith('.csv'):
                    continue

                df = pd.read_csv(os.path.join(sleuth, file), index_col=0)
                gene_list = df[df.qval < q_threshold].ens_gene
                name = os.path.splitext(os.path.basename(file))[0]

                if len(gene_list) == 0:
                    print_with_flush(('# there are no genes with q < {} in ' +
                                      '{}!').format(q_threshold, file))
                    print_with_flush(
                        '# this means there are no significantly ' +
                        'differentially-expressed genes for ' +
                        'this set of conditions.')
                    continue

                for analysis in analyses:
                    print_with_flush(('# performing {} enrichment analysis ' +
                                      'for {}').format(analysis, file))
                    fname = '{}_{}'.format(name.replace('betas_wt', out),
                                           analysis)
                    # path stem shared by the saved csv and the saved plot
                    title = os.path.join(out, fname)
                    df_dict = tea.fetch_dictionary(analysis)
                    df_results = tea.enrichment_analysis(gene_list,
                                                         df_dict,
                                                         aname=title + '.csv',
                                                         save=True,
                                                         show=False)
                    tea.plot_enrichment_results(df_results,
                                                analysis=analysis,
                                                title=title,
                                                save=True)
        finally:
            # always go back, otherwise a failed analysis would leave the
            # whole process stranded inside ``parent``
            os.chdir(wdir)
            print_with_flush('# returned to root')
def walker(tissue_df, directory, save=True):
    """Given the tissue dictionary and a directory to save to,
    open all the gene sets, analyze them and deposit the results in the
    specified directory.

    Gene sets are every file found by walking the module-level ``path_sets``
    folder, including files in its sub-directories. Each file is read as a
    csv and its ``gene`` column is analyzed.

    Parameters:
    -------------------
    tissue_df - pandas dataframe containing specified tissue dictionary
    directory - where to save to; it is concatenated directly with the file
                names, so it must end with a path separator
    save - boolean indicating whether to save results or not. When False the
           analyses still run but no per-set csv, plot or ``empty.txt`` entry
           is written.

    Note: ``directory + 'empty.txt'`` is (re)created with its header line on
    every call, regardless of ``save``.
    """
    empty_log = directory + 'empty.txt'
    with open(empty_log, 'w') as f:
        f.write('Genesets with no enrichment:\n')

    # go through each file in the folder
    for root, _dirs, files in os.walk(path_sets):
        for f_set in files:
            # open df; join with the folder actually being walked so that
            # files inside sub-directories are found as well
            df = pd.read_csv(os.path.join(root, f_set))

            # extract gene list and analyze
            short_name = f_set
            test = df.gene.values
            df_analysis, _ = tea.enrichment_analysis(test,
                                                     tissue_df,
                                                     show=False)

            # nothing below should touch the disk unless saving was requested
            if not save:
                continue

            # if it's empty, place it in the file called empty
            if df_analysis.empty:
                with open(empty_log, 'a+') as f:
                    f.write(short_name + '\n')
                continue

            # save without index
            csv_path = directory + short_name + '.csv'
            df_analysis.to_csv(csv_path, index=False)
            # add a comment
            line_prepender(csv_path, '#' + short_name + '\n')
            # plot
            tea.plot_enrichment_results(df_analysis,
                                        title=short_name,
                                        dirGraphs=directory,
                                        ftype='pdf')
            plt.close()
def walker(tissue_df, directory, save=True):
    """Given the tissue dictionary and a directory to save to,
    open all the gene sets, analyze them and deposit the results in the
    specified directory.

    Every file reachable by walking the module-level ``path_sets`` folder
    (sub-directories included) is read as a csv and its ``gene`` column is
    run through ``tea.enrichment_analysis``.

    Parameters:
    -------------------
    tissue_df - pandas dataframe containing specified tissue dictionary
    directory - where to save to; used as a plain string prefix, so it must
                end with a path separator
    save - boolean indicating whether to save results or not. With
           ``save=False`` no csv, plot or ``empty.txt`` entry is produced.

    Note: the ``empty.txt`` header is written on every call, independent of
    ``save``.
    """
    empty_log = directory+'empty.txt'
    with open(empty_log, 'w') as f:
        f.write('Genesets with no enrichment:\n')

    # go through each file in the folder
    for root, _dirs, files in os.walk(path_sets):
        for f_set in files:
            # open df (relative to the folder os.walk is currently in)
            df = pd.read_csv(os.path.join(root, f_set))

            # extract gene list and analyze
            short_name = f_set
            test = df.gene.values
            df_analysis, _ = tea.enrichment_analysis(test, tissue_df,
                                                     show=False)

            if save and df_analysis.empty:
                # no enrichment: just record the name of the gene set
                with open(empty_log, 'a+') as f:
                    f.write(short_name+'\n')
            elif save:
                # save without index
                csv_path = directory+short_name+'.csv'
                df_analysis.to_csv(csv_path, index=False)
                # add a comment
                line_prepender(csv_path, '#' + short_name+'\n')
                # plot
                tea.plot_enrichment_results(df_analysis, title=short_name,
                                            dirGraphs=directory, ftype='pdf')
                plt.close()
        thresh = float(thresh)  # typecasting
        method = f_dict[-7:-4]

        ntiss = len(tissue_df.columns)
        ngenes = tissue_df.shape[0]

        # open each enrichment set
        for fodder in os.walk(path_sets):
            for f_set in fodder[2]:
                df = pd.read_csv(path_sets + f_set)
                test = df.gene.values
                ntest = len(test)
                short_name = f_set[16:len(f_set)-16]

                df_analysis, unused = tea.enrichment_analysis(test, tissue_df,
                                                              alpha=0.05,
                                                              show=False)

                nana = len(df_analysis)  # len of results
                nun = len(unused)  # number of genes dropped
                avf = df_analysis['Fold Change'].mean()
                avq = df_analysis['Q value'].mean()
                s = '{0},{1},{2},{3},{4},{5},{6},{7},{8},{9},{10}'.format(
                        annot, thresh, method, f_set, ntiss, ntest, nana,
                        ntest-nun, avf, avq, ngenes)
                with open(dirSummaries+'ExecutiveSummary.csv', 'a+') as fSum:
                    fSum.write(s)
                    fSum.write('\n')

# Print summary to csv
df_summary = pd.read_csv(dirSummaries+'ExecutiveSummary.csv', comment='#')
# Run an enrichment analysis for every (dataframe, direction) pair and
# collect the non-empty results in one `receptacle` per dataset name.
# NOTE(review): Ldf, Lnames, Ldirection, names, g, receptacle, n_genes and
# Lreceptacles are all defined outside this chunk.
for i, df in enumerate(Ldf):
    # Lnames is indexed in lockstep with Ldf
    fname = Lnames[i]
    obj = receptacle(fname)

    for direction in Ldirection:
        # rows where this direction column equals 1.0 are handed to `g`
        # together with the 'SequenceNameGene' label; the result is used to
        # index `names` -- presumably a boolean mask, TODO confirm
        ind = g(df[df[direction] == 1.0], 'SequenceNameGene')
        x = names[ind].WBID

        print('---------')
        print(fname + ' ' + direction)
        print('Number of genes submitted for analysis ', len(x))
        # number of distinct wbid values in tissue_df that also appear in x
        y = tissue_df[tissue_df.wbid.isin(x)].wbid.unique().shape[0]
        print('Number of genes used for analysis ', y)
        print('\n')

        df_res, unused = tea.enrichment_analysis(x, tissue_df,
                                                 show=False, alpha=0.1)

        # prints True/False depending on whether the result frame is empty
        print(df_res.empty)
        # only non-empty results are stored on the receptacle
        if len(df_res) > 0:
            obj.add_result(direction, df_res)

        # bookkeeping row: label, genes used, number of entries in `unused`
        n_genes.append(['{0}, {1}'.format(fname, direction), y, len(unused)])

    Lreceptacles[fname] = obj


# make
for species in Lreceptacles:
    current = Lreceptacles[species]

    n = current.n  # number of dfs
def run(path, f, output_path, dictionary, column='gene', **kwargs):
    """Run an enrichment analysis on one gene-list file and save the result.

    Parameters:
    -------------------
    path - prefix concatenated directly with ``f`` to locate the input csv
    f - name of the input file; also used to build the output file name
    output_path - prefix concatenated directly with ``f + '.csv'``
    dictionary - annotation dictionary passed to ``tea.enrichment_analysis``
    column - name of the column holding the genes (defaults to ``'gene'``,
             the column that used to be hard-coded)
    kwargs - forwarded unchanged to ``pd.read_csv``

    ``tea.enrichment_analysis`` is expected to return a pair here; only the
    first element (the result dataframe) is written, without its index.
    """
    genes = pd.read_csv(path + f, **kwargs)
    df, _ = tea.enrichment_analysis(genes[column], dictionary,
                                    show=False)
    df.to_csv(output_path + f + '.csv', index=False)
            print('-----------------------------------')
            print('Genes with annotated phenotype terms: ',
                  get_n(pheno_traits, worm_genes))
            print('Genes with annotated tissue terms: ',
                  get_n(tissue_traits, worm_genes))
            print('Genes with annotated go terms: ',
                  get_n(go_traits, worm_genes))
            print('-----------------------------------\n')

        # one of the traits foolishly has a '/'
        if '/' in trait:
            # rename the trait with this character or it breaks your code
            trait = 'post bronchodilator fev1 fevc ratio'

        # enrichment analyses:
        df = tea.enrichment_analysis(worm_genes, phenotype_df, show=False)
        df = df[df.Observed > n_min_obs].copy()
        df.to_csv('../output/phenologues_2/pea_' + trait + '.csv', index=False)
        if 'lupus' in trait:
            print('Graphing PEA results for ', trait)
            fig, ax = plt.subplots()
            tea.plot_enrichment_results(df, title='../output/lupus_pea',
                                        save=True, analysis='phenotype')
            plt.close()

        df = tea.enrichment_analysis(worm_genes, tissue_df, show=False)
        df = df[df.Observed > n_min_obs].copy()
        df.to_csv('../output/disease_tissues_2/tea_' + trait + '.csv',
                  index=False)
        if 'lupus' in trait:
            print('Graphing TEA results for ', trait)
import tissue_enrichment_analysis as tea
import pandas as pd


# Annotation dictionaries: phenotype and GO come from local csv files, the
# tissue dictionary is fetched through TEA.
phenotype_df = pd.read_csv('../input/phenotype_ontology.csv')
go_df = pd.read_csv('../input/gene_ontology.csv')
tissue_df = tea.fetch_dictionary()

# SFARI gene table and the table used to convert between gene identifiers.
sfari = pd.read_excel('../input/sfari.xlsx')
name_df = pd.read_excel('../input/sfari_name_converter.xlsx')

sfari.head()

# Tissue, phenotype and GO enrichment of the SFARI genes, in that order;
# each result is written without its index.
for dictionary, out_csv in ((tissue_df, '../output/tea_sfari.csv'),
                            (phenotype_df, '../output/pea_sfari.csv'),
                            (go_df, '../output/goa_sfari.csv')):
    df, _ = tea.enrichment_analysis(sfari.Gene, dictionary, show=False)
    df.to_csv(out_csv, index=False)


# Long-format phenotype table, restricted to the rows whose value is 1.
melt_pheno = pd.melt(phenotype_df, id_vars='wbid', var_name='phenotype')
melt_pheno = melt_pheno.loc[melt_pheno['value'] == 1]

def convert(x):
    """Return the first ``gene_name`` in ``name_df`` whose ``wbid`` equals x.

    Raises IndexError when no row of ``name_df`` matches.
    """
    matching_names = name_df.loc[name_df.wbid == x, 'gene_name']
    return matching_names.values[0]
Exemple #10
0
    def go_enrichment(self,
                      mode: str = 'go',
                      alpha: float = 0.05,
                      save_csv: bool = False,
                      fname: str = None):
        """
        Analyzes GO, Tissue and/or Phenotype enrichment of the given group of features. \
        Uses the Anatomy, Phenotype and Gene Ontology annotations for C. elegans. \
        Corrected p-values are calculated using hypergeometric statistics. \
        For more details see GitHub page of the developers: https://github.com/dangeles/TissueEnrichmentAnalysis

        :param mode: the enrichment you wish to perform. 'go' for gene ontology enrichment, \
        'tissue' for tissue enrichment, 'phenotype' for phenotype enrichment, \
        'all' to run the three of them one after the other and combine the results.
        :param alpha: float. Significance threshold. Default is 0.05
        :param save_csv: bool. False by default. If True, save the result to a csv.
        :param fname: Name and path in which to save the results. Must be filled if save_csv is True.
        :return:
        a DataFrame which contains the significant enrichment terms
        :raises AssertionError: if alpha is not a float, mode is not a string, \
        or mode is not one of the supported values.

        .. figure::  go_en.png
           :align:   center
           :scale: 40 %

           Example plot of GO enrichment

        .. figure::  tissue_en.png
           :align:   center
           :scale: 40 %

           Example plot of Tissue enrichment
        """
        assert isinstance(alpha, float), "alpha must be a float!"
        assert isinstance(mode, str), "'mode' must be a string!"
        single_modes = ('go', 'tissue', 'phenotype')
        # reject bad modes up front, before any dictionary is fetched
        assert mode == 'all' or mode in single_modes, "Invalid mode!"

        def _dictionary_for(kind):
            # fetch each annotation dictionary once and reuse it afterwards
            cache = EnrichmentProcessing._go_dicts
            if kind not in cache:
                cache[kind] = tea.fetch_dictionary(kind)
            return cache[kind]

        if mode == 'all':
            frames = []
            for k, arg in enumerate(single_modes):
                print(f'Calculating... {100 * k / len(single_modes) :.2f}% done')
                df = tea.enrichment_analysis(self.gene_set,
                                             _dictionary_for(arg),
                                             alpha=alpha)
                frames.append(df)
                tea._plot_enrichment_results(
                    df,
                    title=f'{arg.capitalize()} Enrichment Analysis',
                    analysis=arg)
                plt.title(f'{arg.capitalize()} Enrichment Analysis',
                          fontsize=20)
                print(df)
            # DataFrame.append no longer exists in pandas >= 2.0
            df_comb = pd.concat(frames)

        else:
            # alpha is forwarded here too, so the threshold requested by the
            # caller is honoured in single-mode runs as well
            df_comb = tea.enrichment_analysis(self.gene_set,
                                              _dictionary_for(mode),
                                              alpha=alpha,
                                              show=True)
            tea._plot_enrichment_results(
                df_comb,
                title=f'{mode.capitalize()} Enrichment Analysis',
                analysis=mode)
            plt.title(f'{mode.capitalize()} Enrichment Analysis', fontsize=20)

        if save_csv:
            self._enrichment_save_csv(df_comb, fname)
        plt.show()
        return df_comb
        thresh = float(thresh)  # typecasting
        method = f_dict[-7:-4]

        ntiss = len(tissue_df.columns)
        ngenes = tissue_df.shape[0]

        # open each enrichment set
        for fodder in os.walk(path_sets):
            for f_set in fodder[2]:
                df = pd.read_csv(path_sets + f_set)
                test = df.gene.values
                ntest = len(test)
                short_name = f_set[16:len(f_set) - 16]

                df_analysis, unused = tea.enrichment_analysis(test,
                                                              tissue_df,
                                                              alpha=0.05,
                                                              show=False)

                # save the analysis to the relevant folder
                savepath = '../output/HGT' + annot + '_' + method + '_Results/'
                df_analysis.to_csv(savepath + f_set + '.csv', index=False)

                tea.plot_enrichment_results(df_analysis,
                                            save='savepath' + f_set + 'Graph',
                                            ftype='pdf')

                nana = len(df_analysis)  # len of results
                nun = len(unused)  # number of genes dropped
                avf = df_analysis['Enrichment Fold Change'].mean()
                avq = df_analysis['Q value'].mean()
                s = '{0},{1},{2},{3},{4},{5},{6},{7},{8},{9},{10}'.format(
Exemple #12
0
def run_post(project, code='post', requires='diff'):
    """Run the TEA post-analysis on the differential-expression output.

    Only projects whose first sample references C. elegans are processed.
    For every ``betas*.csv`` file in ``project.paths[requires]`` the genes
    below the configured q-value threshold are run through each configured
    enrichment type; csv files and plots are saved under
    ``project.paths[code]``. Afterwards the output is archived and the
    archive path is recorded in ``project.files[code]['archive']``.
    """
    print_with_flush('# starting post for project {}'.format(project.objectId))

    organism = project.relation('samples').query()[0].reference.organism
    is_c_elegans = (organism.genus == 'caenorhabditis'
                    and organism.species == 'elegans')
    if not is_c_elegans:
        print_with_flush('# Currently, post analysis is only supported for '
                         'C. elegans')
        return

    config = Config.get()
    q_threshold = config['qThreshold']
    tea_types = config['teaTypes']

    diff_path = project.paths[requires]
    post_path = project.paths[code]

    for file in os.listdir(diff_path):
        # only the betas csv files produced by the diff step are analyzed
        if not (file.startswith('betas') and file.endswith('.csv')):
            continue

        betas = pd.read_csv(os.path.join(diff_path, file), index_col=0)
        gene_list = betas[betas.qval < q_threshold].ens_gene

        # Skip if gene list is empty.
        if not len(gene_list):
            print_with_flush(
                '# there are no genes with q < {} in {}!'.format(
                    q_threshold, file))
            print_with_flush('# this means there are no significantly '
                             'differentially-expressed genes for '
                             'this set of conditions.')
            continue

        stem = os.path.splitext(os.path.basename(file))[0]
        renamed_stem = stem.replace('betas_wt', 'enrichment')

        for tea_type in tea_types:
            # path stem shared by the saved csv and the saved plot
            tea_title = os.path.join(
                post_path, '{}_{}'.format(renamed_stem, tea_type))
            print_with_flush(
                '# performing {} enrichment analysis for {}'.format(
                    tea_type, file))
            annotations = tea.fetch_dictionary(tea_type)
            enriched = tea.enrichment_analysis(gene_list,
                                               annotations,
                                               aname=tea_title + '.csv',
                                               save=True,
                                               show=False)
            tea.plot_enrichment_results(enriched,
                                        analysis=tea_type,
                                        title=tea_title,
                                        save=True)

    # Archive.
    archive_path = archive(project, code)

    project.files.setdefault(code, {})['archive'] = archive_path
    project.save()

    print_with_flush('# done')