def get_canonical_ensembl_names(taxon_id):
    """
    Collects the identifiers that NCBI gene info cross-references
    under the database prefix chosen for a taxon.

    Input:
        taxon_id    int; taxa listed in `beginners` use their own
                    dbXrefs prefix, every other taxon uses 'Ensembl:'

    Output:
        canonical_ensembl_names     set of str; the dbXrefs entries
                    that start with the chosen prefix, with that
                    prefix removed
    """
    # dbXrefs prefix to look for, keyed by taxon id.
    # NOTE(review): 'EcoGene' and 'SGD' lack the trailing ':' that the
    # other prefixes carry, so for those taxa whatever follows the bare
    # database name is kept in the returned names -- confirm that this
    # is intended before relying on it.
    beginners = {
        7227: 'FLYBASE:',
        6239: 'WormBase:',
        3702: 'Araport:',
        511145: 'EcoGene',
        559292: 'SGD'
    }
    beg = beginners.get(taxon_id, 'Ensembl:')

    gene_info = meta.gene_info(taxon_id, usecols=['gene_ncbi', 'dbXrefs'])
    # Raw string: same pattern value as before, without the invalid
    # escape sequence. Presumably yields one row per '|'-separated
    # cross-reference -- verify against the helper.
    gene_info = utils.split_text_to_multiple_rows(
        gene_info, 'dbXrefs', r'\|')

    f = gene_info['dbXrefs'].str.startswith(beg)
    gene_info = gene_info.loc[f, :].copy()

    # Every remaining entry starts with `beg`, so cutting off its length
    # removes exactly the prefix. This does not depend on the default of
    # the `regex` argument of Series.str.replace, which changed between
    # pandas versions and made the former '^' + beg pattern a literal.
    gene_info['dbXrefs'] = gene_info['dbXrefs'].str[len(beg):]

    gene_info = gene_info.drop_duplicates()
    canonical_ensembl_names = set(gene_info['dbXrefs'].values)
    return canonical_ensembl_names
def _get_gene_ncbi_2_ensembl():
    """
    Builds a table linking gene_ncbi to gene_ensembl for taxon 9606.

    The Ensembl identifier is the first 'ENSG...' entry found in the
    dbXrefs column of the gene info table. Note that the new
    gene_ensembl column is written onto the frame returned by
    meta.gene_info before the two columns are selected.

    Output:
        DataFrame with columns gene_ncbi and gene_ensembl; any
        gene_ensembl value occurring in more than one row is removed
        completely (rows without a match carry a missing value there
        and fall under the same rule), and only genes listed by
        get_ref_genes() are kept
    """
    ensembl_pattern = 'Ensembl:(ENSG[0-9]*)'

    genes = meta.gene_info(taxon_id=9606)
    has_ensembl = genes['dbXrefs'].str.contains('Ensembl:ENSG[0-9]*')

    # Only rows with a match receive a value; the others stay missing.
    extracted = genes.loc[has_ensembl, 'dbXrefs'].str.extract(
        ensembl_pattern, expand=False)
    genes.loc[has_ensembl, 'gene_ensembl'] = extracted

    pairs = genes[['gene_ncbi', 'gene_ensembl']].drop_duplicates()

    # keep=False discards every row of an ambiguous Ensembl identifier
    # rather than keeping one representative.
    unambiguous = pairs.drop_duplicates('gene_ensembl', keep=False)

    in_reference = unambiguous['gene_ncbi'].isin(get_ref_genes())
    return unambiguous[in_reference]
def reference_genes(taxon_id, ref_code):
    """
    Builds the sorted list of reference genes of one taxon.

    Input:
        taxon_id    int
        ref_code    str; every letter it contains adds one filter,
                    applied in the order listed here
                    l   -> gene is listed by medline.gene2pubmed
                    o   -> Nomenclature_status equals 'O'
                    p   -> type_of_gene equals 'protein-coding'
                    r   -> gene is listed by medline.gene2pubmed
                           called with paper_kind='research'

    Output:
        ref_genes   sorted list of gene_ncbi identifiers

    Raises:
        EnvironmentError    if there is no gene info for the taxon,
                    or if a requested filter leaves no gene
    """

    def _keep(frame, mask, complaint):
        # Apply one boolean filter; an empty result is an error.
        kept = frame.loc[mask, :]
        if len(kept) == 0:
            raise EnvironmentError(complaint)
        return kept

    genes = meta.gene_info(taxon_id)
    if len(genes) == 0:
        raise EnvironmentError(
            ' Did not find gene info for taxon {}'.format(int(taxon_id)))

    if 'l' in ref_code:
        with_paper = medline.gene2pubmed(taxon_id, ['gene_ncbi'])
        genes = _keep(
            genes,
            genes['gene_ncbi'].isin(with_paper['gene_ncbi']),
            ' After filtering for genes with at least one paper, '
            'no gene is left.')

    if 'o' in ref_code:
        genes = _keep(
            genes,
            genes['Nomenclature_status'] == 'O',
            ' After filtering for genes with official nomeclature, '
            'no gene is left.')

    if 'p' in ref_code:
        genes = _keep(
            genes,
            genes['type_of_gene'] == 'protein-coding',
            ' After filtering for protein-coding, no gene is left.')

    if 'r' in ref_code:
        with_research = medline.gene2pubmed(
            taxon_id, ['pubmed_id', 'gene_ncbi'], paper_kind='research')
        genes = _keep(
            genes,
            genes['gene_ncbi'].isin(with_research['gene_ncbi']),
            ' After filtering for genes with at least one research paper, '
            'no gene is left.')

    ref_genes = list(genes['gene_ncbi'].values)
    ref_genes.sort()
    return ref_genes
def any_gwas():
    """
    Summarises, per gene and trait, the share of GWAS studies of that
    trait in which the gene is reported.

    Records come from gwas_studies.ebi_gwas(). Rows whose MAPPED_GENE
    contains ';', ',' or '-' are discarded, the remaining symbols are
    matched to gene_ncbi through the gene info of taxon 9606, and only
    genes listed by get_ref_genes() are kept.

    Output:
        cl  DataFrame of bool; True where the fraction in dd is above
            zero, plus a column any_gwas that is True if any trait
            column of the row is True
        dd  DataFrame indexed by gene_ncbi with one column
            'gwas_any_<trait>' per trait; fraction of the studies of
            the trait that report the gene, 0 where there is none
        ge  sorted list of the gene_ncbi identifiers with a record
    """
    catalog = gwas_studies.ebi_gwas()

    # Comparing with True turns missing MAPPED_GENE entries into False,
    # so those rows are not flagged here.
    flagged = catalog['MAPPED_GENE'].str.contains('[;,-]').eq(True)

    new_names = {
        'MAPPED_GENE': 'symbol_ambiguous',
        'DISEASE/TRAIT': 'trait',
        'PVALUE_MLOG': 'log_pvalue'
    }
    wanted = ['MAPPED_GENE', 'DISEASE/TRAIT', 'PVALUE_MLOG', 'pubmed_id']
    records = catalog.loc[~flagged, wanted].rename(columns=new_names)

    # Translate symbols to gene_ncbi; the symbol columns are only
    # needed for the join itself.
    symbols = meta.gene_info(
        taxon_id=9606, usecols=['symbol_ncbi', 'gene_ncbi'])
    records = pd.merge(
        records,
        symbols,
        left_on='symbol_ambiguous',
        right_on='symbol_ncbi',
        how='inner')
    records = records.drop(['symbol_ambiguous', 'symbol_ncbi'], axis=1)

    in_reference = records['gene_ncbi'].isin(get_ref_genes())
    records = records[in_reference]
    genes = sorted(records['gene_ncbi'].unique())

    # One row per (trait, study, gene): the one with the highest
    # log_pvalue.
    records = records.sort_values('log_pvalue', ascending=False)
    records = records.drop_duplicates(
        ['trait', 'pubmed_id', 'gene_ncbi'], keep='first')

    # Number of distinct studies per trait.
    trait_studies = records[['pubmed_id', 'trait']].drop_duplicates()
    studies_per_phenotype = trait_studies['trait'].value_counts()

    required_studies = 1
    enough = studies_per_phenotype >= required_studies
    accepted_traits = studies_per_phenotype[enough].index

    important = records.loc[records['trait'].isin(accepted_traits), :]
    important = important[
        ['pubmed_id', 'trait', 'gene_ncbi']].drop_duplicates()

    # Studies reporting each (trait, gene) pair, next to the number of
    # studies of the trait.
    per_pair = important.groupby(['trait', 'gene_ncbi']).size()
    per_pair = per_pair.reset_index().rename(columns={0: 'records'})
    per_trait = studies_per_phenotype.to_frame('studies').reset_index()
    per_trait = per_trait.rename(columns={'index': 'trait'})
    shares = pd.merge(per_pair, per_trait)
    shares.loc[:, 'fraction_of_any_gwas_studies'] = (
        shares['records'] / shares['studies'])

    fractions = shares.pivot(
        index='gene_ncbi',
        columns='trait',
        values='fraction_of_any_gwas_studies')
    fractions.columns = [
        'gwas_any_{}'.format(trait) for trait in fractions.columns]
    # Restore genes that dropped out above and count them as zero.
    fractions = fractions.reindex(genes).fillna(0)

    present = fractions > 0.0
    present.loc[:, 'any_gwas'] = present.any(axis=1)

    return present, fractions, genes