def mappings():
    """Find single mapped entries."""
    df = get_xrefs_df()

    idx = (df['target_db'] == 'pr') & (df['type'] == 'protein')
    protein_pr_errors = get_single_mappings(df, idx)
    if protein_pr_errors:
        click.secho('Some entries only mapped to Protein Ontology', fg='red', bold=True)
        _p(protein_pr_errors)

    idx = (df['target_db'] == 'go') & (df['type'] == 'protein complex')
    go_complex_errors = get_single_mappings(df, idx)
    if go_complex_errors:
        click.secho('Some complexes only mapped to Gene Ontology', fg='red', bold=True)
        _p(go_complex_errors)

    idx = (df['target_db'] == 'mesh')
    mesh_errors = get_single_mappings(df, idx)
    if mesh_errors:
        click.secho('Some roles only mapped to MeSH', fg='red', bold=True)
        _p(mesh_errors)

    idx = (df['type'] == 'molecular function')
    mf_errors = get_single_mappings(df, idx)
    if mf_errors:
        click.secho('Some roles only mapped to molecular function', fg='red', bold=True)
        _p(mf_errors)

    if any([
        protein_pr_errors,
        go_complex_errors,
        mesh_errors,
        mf_errors,
    ]):
        sys.exit(1)
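# ``get_single_mappings`` and ``_p`` are repo-local helpers that are not shown
# here. A minimal sketch of the idea behind ``get_single_mappings``, assuming it
# returns the rows matching ``idx`` whose source term has no other xref in the
# sheet (the helper name, column names, and return shape are guesses):
def get_single_mappings_sketch(df, idx):
    """Return rows matching ``idx`` whose source entry has exactly one xref."""
    xref_counts = df.groupby(['source_db', 'source_id']).size()
    singletons = {key for key, count in xref_counts.items() if count == 1}
    return [
        tuple(row)
        for _, row in df[idx].iterrows()
        if (row['source_db'], row['source_id']) in singletons
    ]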
def main(): """Run the MeSH curation pipeline.""" xrefs_df = get_xrefs_df() mesh_xrefs_df = xrefs_df[xrefs_df['source_db'] == 'mesh'] curated_mesh_ids = set(mesh_xrefs_df['source_id']) terms = { identifier: (name, suffix.strip('s')) for identifier, name in pyobo.get_id_name_mapping('mesh').items() if identifier not in curated_mesh_ids and identifier not in BLACKLIST for suffix in SUFFIXES if name.lower().endswith(suffix) } for i, (identifier, (name, suffix)) in enumerate(sorted(terms.items(), key=lambda t: t[1][0]), start=1): print('mesh', identifier, name, suffix, '?', '?', '?', '?', sep='\t')
def main(show_ungrounded: bool, output: Optional[TextIO]):
    """Run the MeSH curation pipeline."""
    xrefs_df = get_xrefs_df()
    mesh_xrefs_df = xrefs_df[xrefs_df['source_db'] == 'mesh']
    curated_mesh_ids = set(mesh_xrefs_df['source_id'])
    terms = {
        identifier: (name, name[:-len(suffix)], suffix.strip('s'))
        for identifier, name in pyobo.get_id_name_mapping('mesh').items()
        if identifier not in curated_mesh_ids and identifier not in BLACKLIST
        for suffix in SUFFIXES
        if name.lower().endswith(suffix)
    }
    it = sorted(terms.items(), key=lambda t: t[1][0])
    it = tqdm(it, desc='making MeSH curation sheet')
    for identifier, (name, search_text, suffix) in it:
        for row in yield_gilda('mesh', identifier, name, suffix, search_text, show_ungrounded or output is not None):
            print(*row, sep='\t', file=output)
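# ``yield_gilda`` is a repo-local helper that is not shown here. A minimal
# sketch of the idea, assuming it grounds the suffix-stripped text with the
# ``gilda`` grounder and yields curation rows shaped like the blank '?' rows in
# the earlier version above (function name and row layout are assumptions):
from gilda import ground

def yield_gilda_sketch(prefix, identifier, name, suffix, search_text, show_ungrounded):
    """Yield pre-grounded curation rows for one MeSH term."""
    scored_matches = ground(search_text)
    for match in scored_matches:
        # each gilda match wraps a Term carrying db, id, and entry_name
        yield prefix, identifier, name, suffix, match.term.db.lower(), match.term.id, match.term.entry_name, round(match.score, 2)
    if not scored_matches and show_ungrounded:
        yield prefix, identifier, name, suffix, '?', '?', '?', '?'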
BIOCHEMICAL_ROLE_CHEBI_ID = '52206'
PATHWAY_INHIBITOR_CHEBI_ID = '76932'
ENZYME_INHIBITOR_CHEBI_ID = '23924'
AGONIST_CHEBI_ID = '48705'
INVERSE_AGONIST_CHEBI_ID = '90847'
INHIBITOR_CHEBI_ID = '35222'
ANTAGONIST_CHEBI_ID = '48706'

BLACKLIST = [
    '48001',  # protein synthesis inhibitor
    '64106',  # protein kinase agonist
]

chebi_obo = pyobo.get('chebi')
chebi_id_to_name = pyobo.get_id_name_mapping('chebi')

XREFS_DF = get_xrefs_df()

#: Bare ChEBI identifiers (``CHEBI:`` prefix stripped) of roles that have already been curated
CURATED_ROLE_CHEBI_IDS = {
    source_id[len('CHEBI:'):]
    for source_db, source_id in XREFS_DF[['source_db', 'source_id']].values
    if source_db == 'chebi'
}

#: Bare ChEBI identifiers of all descendants of the roles marked as irrelevant.
#: Note the full ``CHEBI:`` prefix (including the colon) must be stripped so the
#: identifiers match the bare form used in the hierarchy, as above.
IRRELEVANT_ROLE_CHEBI_IDS = set(itt.chain.from_iterable(
    chebi_obo.descendants(chebi_id[len('CHEBI:'):])
    for chebi_id in get_irrelevant_roles_df().identifier
    if chebi_id[len('CHEBI:'):] in chebi_obo.hierarchy
))


def _get_inhibitors_reclassification() -> pd.DataFrame:
    return pd.read_csv(RECLASSIFICATION_PATH, sep='\t', comment='#')
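# Hypothetical downstream usage of the constants above: a candidate filter that
# skips roles already curated, blacklisted, or descended from an irrelevant role
# (the function name is illustrative; all identifiers are bare, without 'CHEBI:'):
def _is_candidate_role(chebi_id: str) -> bool:
    return (
        chebi_id not in BLACKLIST
        and chebi_id not in CURATED_ROLE_CHEBI_IDS
        and chebi_id not in IRRELEVANT_ROLE_CHEBI_IDS
    )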
def get_relations_df() -> pd.DataFrame:
    """Assemble the relations dataframe."""
    xrefs_df = get_xrefs_df()

    logger.info('loading famplex mapping')
    famplex_id_to_members = defaultdict(list)
    famplex_relations_df = pd.read_csv(FAMPLEX_RELATIONS_URL)
    for source_id, source_name, rel, target_db, target_name in famplex_relations_df.values:
        if source_id.lower() == 'hgnc' and rel == 'isa' and target_db.lower() == 'fplx':
            try:
                hgnc_id = hgnc_name_to_id[source_name]
            except KeyError:
                logger.warning(f'Could not find {source_name} for fplx:{target_name}')
                continue
            famplex_id_to_members[target_name].append((hgnc_id, source_name))

    logger.info('getting enzyme classes')
    expasy_graph, ec_code_to_children = get_expasy_closure()
    logger.info('getting ec2go')
    ec2go = get_ec2go()

    logger.info('inferring over target hierarchies')
    x = defaultdict(list)
    for source_db, source_id, _, modulation, target_type, target_db, target_id, target_name in xrefs_df.values:
        if source_db != 'chebi':
            continue
        if target_db == 'hgnc':
            # Append original
            x[source_db, source_id].append((modulation, 'protein', 'hgnc', target_id, target_name))
            # Append inferred
            for uniprot_id, uniprot_name in get_uniprot_id_names(target_id):
                x[source_db, source_id].append((modulation, 'protein', 'uniprot', uniprot_id, uniprot_name))
        elif target_db == 'fplx':
            # Append original
            x[source_db, source_id].append((modulation, target_type, target_db, target_id, target_name))
            # Append inferred
            for hgnc_id, hgnc_symbol in famplex_id_to_members.get(target_id, []):
                x[source_db, source_id].append((modulation, 'protein', 'hgnc', hgnc_id, hgnc_symbol))
                for uniprot_id, uniprot_name in get_uniprot_id_names(hgnc_id):
                    x[source_db, source_id].append((modulation, 'protein', 'uniprot', uniprot_id, uniprot_name))
        elif target_db == 'ec-code':
            children_ec_codes = ec_code_to_children.get(target_id)
            if children_ec_codes is None:
                # this is the case for about 15 entries
                logger.info(f'could not find children of {target_db}:{target_id}')
                continue
            for sub_target_db, sub_target_id, sub_target_name in children_ec_codes:
                target_type = DB_TO_TYPE[sub_target_db]
                x[source_db, source_id].append((
                    modulation, target_type, sub_target_db, sub_target_id, sub_target_name,
                ))
            for go_id, go_name in ec2go.get(target_id, []):
                x[source_db, source_id].append((
                    modulation, 'molecular function', 'go', go_id, go_name,
                ))
        else:
            x[source_db, source_id].append((modulation, target_type, target_db, target_id, target_name))

    logger.info('inferring over role hierarchies')
    db_to_role_to_chemical_curies = {
        'chebi': get_chebi_role_to_children(),
    }
    db_to_id_mapping = {
        'chebi': get_id_name_mapping('chebi'),
    }
    #: A set of databases to remove the prefix from
    remove_prefix = {'chebi'}

    rows = []
    for (role_db, role_id), entries in x.items():
        if role_db in remove_prefix and role_id.lower().startswith(f'{role_db}:'.lower()):
            role_id = role_id[len(f'{role_db}:'):]
        # TODO map role_db, role_id to set of sub_role_db, sub_role_id
        sub_role_curies = {(role_db, role_id)}
        for modulation, target_type, target_db, target_id, target_name in entries:
            chemical_curies = set(itt.chain.from_iterable(
                db_to_role_to_chemical_curies[sub_role_db].get(sub_role_id, [])
                for sub_role_db, sub_role_id in sub_role_curies
            ))
            if not chemical_curies:
                logger.debug('no inference for %s:%s', role_db, role_id)
                continue
            for chemical_db, chemical_id in chemical_curies:
                rows.append((
                    chemical_db,
                    chemical_id,
                    db_to_id_mapping[chemical_db][chemical_id],
                    modulation,
                    target_type,
                    target_db,
                    target_id,
                    target_name,
                ))
    return pd.DataFrame(rows, columns=XREFS_COLUMNS)
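# A minimal usage sketch for the assembly step, assuming this module is run as
# a script (the output path here is illustrative, not taken from the repo):
if __name__ == '__main__':
    relations_df = get_relations_df()
    relations_df.to_csv('relations.tsv', sep='\t', index=False)
    logger.info('exported %d inferred relations', len(relations_df.index))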
def rewrite_repo_readme():
    """Automatically rewrite the summary of curated content in the repository's README."""
    df = get_xrefs_df()

    summary_df = df.groupby(['source_db', 'modulation', 'type', 'target_db']).size().reset_index()
    summary_df.columns = ['Source Database', 'Modulation', 'Target Type', 'Target Database', 'Count']
    summary_df.to_csv(os.path.join(EXPORT_DIRECTORY, 'curated_summary.tsv'), sep='\t', index=False)

    modulation_summary_df = df.groupby('modulation').size().reset_index()
    modulation_summary_df.columns = ['Modulation', 'Count']
    modulation_summary_df.to_csv(
        os.path.join(EXPORT_DIRECTORY, 'curated_summary_by_modulation.tsv'), sep='\t', index=False,
    )

    type_summary_df = df.groupby('type').size().reset_index()
    type_summary_df.columns = ['Target Type', 'Count']
    type_summary_df.to_csv(
        os.path.join(EXPORT_DIRECTORY, 'curated_summary_by_type.tsv'), sep='\t', index=False,
    )

    namespace_summary_df = df.groupby('target_db').size().reset_index()
    namespace_summary_df.columns = ['Target Database', 'Count']
    namespace_summary_df.to_csv(
        os.path.join(EXPORT_DIRECTORY, 'curated_summary_by_namespace.tsv'), sep='\t', index=False,
    )

    logger.info('Plotting modulation and target type summary')
    fig, (lax, rax) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
    g = sns.barplot(y='Modulation', x='Count', data=modulation_summary_df, ax=lax)
    g.set_xscale('log')
    g = sns.barplot(y='Target Type', x='Count', data=type_summary_df, ax=rax)
    g.set_xscale('log')
    plt.tight_layout()
    plt.savefig(os.path.join(EXPORT_DIRECTORY, 'curated_summary.png'), dpi=300)

    text = f'There are {len(df.index)} curated roles as of export on {time.asctime()}\n\n'
    text += tabulate(modulation_summary_df.values, ['Modulation', 'Count'], tablefmt='rst')
    text += '\n\n'
    text += tabulate(type_summary_df.values, ['Target Entity Type', 'Count'], tablefmt='rst')
    text += '\n\n'
    text += tabulate(namespace_summary_df.values, ['Target Database', 'Count'], tablefmt='rst')
    text += '\n'

    path = os.path.join(HERE, 'README.rst')
    with open(path) as file:
        readme = [line.rstrip() for line in file]

    for i, line in enumerate(readme):
        if line == 'Summary':
            start = i + 2  # skip the heading line and its RST underline
            break
    else:
        raise ValueError('could not find summary block')

    for i, line in enumerate(readme):
        if line == 'Axioms':
            end = i
            break
    else:
        raise ValueError('could not find end block')

    with open(path, 'w') as file:
        for line in readme[:start]:
            print(line, file=file)
        print(text, file=file)
        for line in readme[end:]:
            print(line, file=file)
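# For reference, a sketch of the README.rst layout that ``rewrite_repo_readme``
# assumes (the exact underline characters are a guess; ``start = i + 2`` skips
# the heading line and its underline). Everything between the 'Summary' heading
# and the 'Axioms' heading is regenerated on each export:
#
#     Summary
#     -------
#     <generated counts and RST tables are rewritten here>
#
#     Axioms
#     ------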