Exemple #1
0
def add_gene_name_gprofiler(data_df: pd.DataFrame, col: str,
                            organism: str) -> pd.DataFrame:
    """Annotate ``data_df`` with gene names looked up through g:Profiler.

    The identifiers stored in ``data_df[col]`` are sent to
    ``GProfiler.convert`` with ``target_namespace='UNIPROTSWISSPROT'``. The
    ``name`` and ``namespaces`` columns of the answer are left-merged back
    onto ``data_df`` as ``gene_name`` and ``gene_name_bank``.

    Args:
        data_df: Table holding the identifiers to convert.
        col: Name of the column of ``data_df`` that holds the identifiers.
        organism: Organism code forwarded to ``GProfiler.convert``.

    Returns:
        A new DataFrame made of the columns of ``data_df`` plus ``gene_name``
        and ``gene_name_bank``, keeping only the first row for each distinct
        value of ``col``.
    """
    gp = GProfiler(return_dataframe=True)
    protein_list = data_df[col].tolist()

    # details of what returns the following function : https://pypi.org/project/gprofiler-official/
    res = gp.convert(organism=organism,
                     query=protein_list,
                     target_namespace='UNIPROTSWISSPROT')

    # Keep only the relevant columns. rename() returns a new frame, so the
    # column selection is never modified in place (an in-place rename on a
    # selection triggers pandas' SettingWithCopyWarning).
    res_f = res[['incoming', 'name', 'namespaces']].rename(columns={
        "incoming": col,
        "name": "gene_name",
        "namespaces": "gene_name_bank"
    })

    # Cells equal to one of these namespace strings get a readable label.
    res_f = res_f.replace({
        'UNIPROTSWISSPROT,UNIPROT_GN_ACC': 'Swiss-Prot',
        'UNIPROTSPTREMBL,UNIPROT_GN_ACC': 'TrEMBL'
    })

    df = data_df.merge(res_f, how='left', on=col)

    # gProfiler returns one line for each alias of the gene (as in alias
    # section in Uniprot): keep only the first one. The key is the merge
    # column itself rather than a hard-coded 'Accession' column, so the
    # function also works when the identifiers live under another name.
    return df.drop_duplicates(subset=col, keep='first')
def get_gene_names(geneList):
    """Look up names and short descriptions for Arabidopsis gene identifiers.

    Args:
        geneList: Identifiers passed as ``query`` to ``GProfiler.convert``
            (organism ``'athaliana'``).

    Returns:
        A DataFrame with the ``incoming``, ``name`` and ``description``
        columns of the conversion result. Each string description is cut at
        its first ``'['`` and then at its first ``';'``; non-string values
        are left unchanged.
    """
    gp = GProfiler(return_dataframe=True)
    converted = gp.convert(organism='athaliana', query=geneList)

    # Explicit copy: assigning a column on a bare column selection triggers
    # pandas' SettingWithCopyWarning.
    df = converted[['incoming', 'name', 'description']].copy()

    # Element-wise map instead of a row-wise apply: it also works on an empty
    # result and does not fail on missing (non-string) descriptions.
    df['description'] = df['description'].map(
        lambda text: text.split('[')[0].split(';')[0]
        if isinstance(text, str) else text)
    return df
Exemple #3
0
def add_gene_name_gprofiler(data_df, col, organism):
    """Annotate ``data_df`` with gene names looked up through g:Profiler.

    The identifiers stored in ``data_df[col]`` are sent to
    ``GProfiler.convert`` with ``target_namespace='UNIPROTSWISSPROT'``. The
    ``name`` and ``namespaces`` columns of the answer are left-merged back
    onto ``data_df`` as ``gene_name`` and ``gene_name_bank``.

    Args:
        data_df: Table holding the identifiers to convert.
        col: Name of the column of ``data_df`` that holds the identifiers.
        organism: Organism code forwarded to ``GProfiler.convert``.

    Returns:
        The merged DataFrame. It is also printed to stdout.
    """
    # Debug output, kept so the function's console output is unchanged.
    print(type(data_df[col].tolist()))
    gp = GProfiler(return_dataframe=True)

    # details of what returns the following function : caleydo.org/tools/
    res = gp.convert(organism=organism,
                     query=data_df[col].tolist(),
                     target_namespace='UNIPROTSWISSPROT')

    # Keep only the relevant columns. rename() returns a new frame, so the
    # column selection is never modified in place (an in-place rename on a
    # selection triggers pandas' SettingWithCopyWarning).
    res_f = res[['incoming', 'name', 'namespaces']].rename(
        columns={"incoming": col,
                 "name": "gene_name",
                 "namespaces": "gene_name_bank"})

    # Cells equal to one of these namespace strings get a readable label.
    res_f = res_f.replace({'UNIPROTSWISSPROT,UNIPROT_GN_ACC': 'Swiss-Prot',
                           'UNIPROTSPTREMBL,UNIPROT_GN_ACC': 'TrEMBL'})

    df = data_df.merge(res_f, how='left', on=col)

    # TODO check if concordant with description
    #df['OK'] = np.where(df['gene_name_PD'] == df['converted_gprofiler'], True, False)
    print(df)

    return df
Exemple #4
0
 def get_gene_list(self, samples_stat):
     """Run g:Profiler for each sample's genes and write the results to disk.

     For every key ``sample`` of ``samples_stat`` whose
     ``samples_stat[sample]['gene']`` is non-empty, files are written under
     ``{self.module}/{sample}/``:

     * ``GO_FuncTerm.csv`` / ``KEGG_FuncTerm.csv`` -- rows of the
       ``profile`` result whose ``native`` column contains ``GO`` / ``KEGG``;
       each subset is also handed to ``self.plot_go``.
     * ``Entrez_Gene_converted.csv`` -- result of ``convert`` with
       ``target_namespace='ENTREZGENE_ACC'``.
     * ``gene_list.txt`` -- the genes, one per line.

     Samples with an empty gene collection are skipped.
     """
     for sample in samples_stat:
         gene_ids = samples_stat[sample]['gene']
         if len(gene_ids) == 0:
             continue

         def out_path(template):
             # Fill the per-sample output location into a path template.
             return template.format(module=self.module, sample=sample)

         profiler = GProfiler(user_agent='ExampleTool', return_dataframe=True)
         enrichment = profiler.profile(organism='hsapiens', query=gene_ids)

         # Same treatment for both term families: filter, save, plot.
         for family, csv_template in (
                 ('GO', '{module}/{sample}/GO_FuncTerm.csv'),
                 ('KEGG', '{module}/{sample}/KEGG_FuncTerm.csv')):
             terms = enrichment[enrichment['native'].str.contains(family)]
             terms.to_csv(out_path(csv_template),
                          header=True,
                          index=False,
                          sep=',')
             self.plot_go(terms, sample, family)

         entrez = profiler.convert(organism='hsapiens',
                                   query=gene_ids,
                                   target_namespace='ENTREZGENE_ACC')
         entrez.to_csv(out_path('{module}/{sample}/Entrez_Gene_converted.csv'),
                       header=True,
                       index=False,
                       sep=',')

         with open(out_path('{module}/{sample}/gene_list.txt'), 'wt') as f:
             f.write('\n'.join(gene_ids))
Exemple #5
0
def gene_name_annotation_short(genes, organism='mmusculus'):
    """Return a compact annotation table for ``genes`` from g:Profiler.

    Args:
        genes: Identifiers passed as ``query`` to ``GProfiler.convert``.
        organism: Organism code forwarded to ``GProfiler.convert``. Defaults
            to ``'mmusculus'``, the value that used to be hard-coded.

    Returns:
        The conversion result (``target_namespace='ENTREZGENE_ACC'``) with a
        ``short_description`` column added and the ``description``, ``name``,
        ``converted``, ``n_incoming``, ``n_converted``, ``namespaces`` and
        ``query`` columns removed.
    """
    gp = GProfiler(return_dataframe=True)
    gene_annot = gp.convert(organism=organism,
                            query=genes,
                            target_namespace='ENTREZGENE_ACC')

    # Delete the extra text between []. The pattern is a raw string: in a
    # normal literal the backslash-bracket pair is an invalid escape sequence.
    gene_annot['short_description'] = gene_annot['description'].map(
        lambda text: re.sub(r'\[.+\]', '', text))

    return gene_annot.drop(
        columns=['description', 'name', 'converted', 'n_incoming',
                 'n_converted', 'namespaces', 'query'])
Exemple #6
0
def gene_name_annotation_long(genes, organism='mmusculus'):
    """Display the full, untruncated annotation table for ``genes``.

    Args:
        genes: Identifiers passed as ``query`` to ``GProfiler.convert``.
        organism: Organism code forwarded to ``GProfiler.convert``. Defaults
            to ``'mmusculus'``, the value that used to be hard-coded.

    Returns:
        Whatever ``display`` returns for the annotation table. The table is
        shown with pandas' row and column display limits lifted.
    """
    gp = GProfiler(return_dataframe=True)
    gene_annot = gp.convert(organism=organism,
                            query=genes,
                            target_namespace='ENTREZGENE_ACC')

    # Delete the extra text between []. The pattern is a raw string: in a
    # normal literal the backslash-bracket pair is an invalid escape sequence.
    gene_annot['short_description'] = gene_annot['description'].map(
        lambda text: re.sub(r'\[.+\]', '', text))
    gene_annot = gene_annot.drop(
        columns=['description', 'name', 'converted', 'n_incoming',
                 'n_converted', 'namespaces', 'query'])

    # Lift the display limits only while rendering, so every line is shown.
    with pd.option_context('display.max_rows', None,
                           'display.max_columns', None):
        return display(gene_annot)
Exemple #7
0
def make_tcga_gtex_id_mapping_file(tcga_gtex_id_df, tcga_gtex_id_addr):
    """Add gene symbols to ``tcga_gtex_id_df`` and write it as a TSV file.

    Two columns are added to ``tcga_gtex_id_df`` in place:

    * ``ensembl_gene`` -- the part of ``sample`` before its first ``'.'``.
    * ``gene_symbol`` -- the ``name`` that ``GProfiler.convert`` (organism
      ``'hsapiens'``, ``target_namespace='ENSG'``) reports for that id;
      missing when the id is absent from the conversion result.

    Args:
        tcga_gtex_id_df: DataFrame with a string column named ``sample``.
        tcga_gtex_id_addr: Destination passed to ``DataFrame.to_csv``.
    """
    # Take the text before the first dot. Indexing the split lists with
    # .str[0] also works on an empty column, unlike expand=True followed by
    # a lookup of column 0.
    tcga_gtex_id_df['ensembl_gene'] = (
        tcga_gtex_id_df['sample'].str.split(".", n=1).str[0])

    gp = GProfiler(return_dataframe=True)
    ensembl_2_symbol = gp.convert(
        organism='hsapiens',
        query=tcga_gtex_id_df['ensembl_gene'].tolist(),
        target_namespace='ENSG')

    # Build a one-to-one lookup (first hit per queried id) and map it onto
    # the column. Assigning the `name` column of a merge result instead
    # relies on index alignment, which shifts symbols onto the wrong rows as
    # soon as the conversion holds several rows for one id or the input
    # index is not 0..n-1.
    symbol_by_id = (ensembl_2_symbol
                    .drop_duplicates(subset='incoming', keep='first')
                    .set_index('incoming')['name'])
    tcga_gtex_id_df['gene_symbol'] = (
        tcga_gtex_id_df['ensembl_gene'].map(symbol_by_id))

    tcga_gtex_id_df.to_csv(tcga_gtex_id_addr, sep='\t', index=False)
# Interactive exploration history: bare expressions evaluated one by one to
# inspect their value. The lines ending in `?` look like IPython help
# lookups and are not valid plain Python.
# NOTE(review): `res`, `myres`, `gp` and `genes` are defined outside this
# excerpt -- presumably g:Profiler result frames, a GProfiler client and a
# gene list; confirm against the full session.
res
myres
# Compare the columns available on both result objects.
res.columns
myres.columns
# Attribute lookups trying out several candidate column names.
myres.goshv
res.pvalue
res.pval
res
res.columns
res.p_value
myres.p_value
res.significant
myres.significant
# Help lookup for the profile method.
gp.profile?
myres.columns
# Trying gp.convert with different target_namespace values.
gp.convert(organism='mmusculus', query=genes)
gp.convert(organism='mmusculus', query=genes, target_namespace='name')
# Help lookup for the convert method.
gp.convert?
gp.convert(organism='mmusculus', query=genes, target_namespace='name')
gp.convert(organism='mmusculus', query=genes, )
gp.convert(organism='mmusculus', query=genes, ).namespaces
gp.convert(organism='mmusculus', query=genes, target_namespace='ENSG' )
# Query the g:Profiler convert REST endpoint directly, asking for the 'UCSC'
# target for the mouse ('mmusculus') identifiers in `genes`.
r = requests.post(
    url='https://biit.cs.ut.ee/gprofiler/api/convert/convert/',
    json={
        'organism': 'mmusculus',
        'target': 'UCSC',
        'query': genes,
    },
    # Without a timeout, requests waits indefinitely on an unresponsive
    # server.
    timeout=60,
)
# Decoded JSON body of the response.
x = r.json()