Example #1
import sys

import click


def mappings():
    """Check for single mapped entries, exiting with an error if any are found."""
    df = get_xrefs_df()

    idx = (df['target_db'] == 'pr') & (df['type'] == 'protein')
    protein_pr_errors = get_single_mappings(df, idx)
    if protein_pr_errors:
        click.secho('Some entries only mapped to Protein Ontology', fg='red', bold=True)
        _p(protein_pr_errors)

    idx = (df['target_db'] == 'go') & (df['type'] == 'protein complex')
    go_complex_errors = get_single_mappings(df, idx)
    if go_complex_errors:
        click.secho('Some complexes only mapped to Gene Ontology', fg='red', bold=True)
        _p(go_complex_errors)

    idx = (df['target_db'] == 'mesh')
    mesh_errors = get_single_mappings(df, idx)
    if mesh_errors:
        click.secho('Some roles only mapped to MeSH', fg='red', bold=True)
        _p(mesh_errors)

    idx = (df['type'] == 'molecular function')
    mf_errors = get_single_mappings(df, idx)
    if mf_errors:
        click.secho('Some roles only mapped to molecular function', fg='red', bold=True)
        _p(mf_errors)

    if any([
        protein_pr_errors,
        go_complex_errors,
        mesh_errors,
        mf_errors,
    ]):
        sys.exit(1)
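
The get_single_mappings helper is internal to this repository and not shown above. A minimal sketch of what it plausibly does, assuming the xrefs dataframe carries source_db and source_id columns: an entry is "single mapped" when it has exactly one xref and that xref matches the given boolean index.

import pandas as pd


def get_single_mappings(df: pd.DataFrame, idx: pd.Series) -> list:
    """Return (source_db, source_id) pairs whose only xref matches ``idx`` (a sketch)."""
    # Count how many xrefs each source entry has across the whole dataframe
    counts = df.groupby(['source_db', 'source_id']).size()
    singletons = {pair for pair, count in counts.items() if count == 1}
    # Keep the selected rows whose source has no other mapping
    return sorted(
        (source_db, source_id)
        for source_db, source_id in df.loc[idx, ['source_db', 'source_id']].values
        if (source_db, source_id) in singletons
    )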
Example #2
import pyobo


def main():
    """Run the MeSH curation pipeline."""
    xrefs_df = get_xrefs_df()
    mesh_xrefs_df = xrefs_df[xrefs_df['source_db'] == 'mesh']
    curated_mesh_ids = set(mesh_xrefs_df['source_id'])

    terms = {
        identifier: (name, suffix.rstrip('s'))  # rstrip singularizes, e.g. 'inhibitors' -> 'inhibitor'
        for identifier, name in pyobo.get_id_name_mapping('mesh').items()
        if identifier not in curated_mesh_ids and identifier not in BLACKLIST
        for suffix in SUFFIXES if name.lower().endswith(suffix)
    }

    it = sorted(terms.items(), key=lambda t: t[1][0])
    for i, (identifier, (name, suffix)) in enumerate(it, start=1):
        print('mesh', identifier, name, suffix, '?', '?', '?', '?', sep='\t')
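
To make the suffix matching in the comprehension concrete: each MeSH name is paired with the first configured suffix it ends with, singularized. A standalone illustration with made-up SUFFIXES values (the real list lives elsewhere in the repository):

SUFFIXES = ['inhibitors', 'agonists', 'antagonists']  # hypothetical values


def match_suffix(name):
    """Return the singularized suffix that ``name`` ends with, or None."""
    for suffix in SUFFIXES:
        if name.lower().endswith(suffix):
            return suffix.rstrip('s')  # e.g., 'inhibitors' -> 'inhibitor'
    return None


assert match_suffix('Protein Kinase Inhibitors') == 'inhibitor'
assert match_suffix('Water') is None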
Example #3
from typing import Optional, TextIO

import pyobo
from tqdm import tqdm


def main(show_ungrounded: bool, output: Optional[TextIO]):
    """Run the MeSH curation pipeline."""
    xrefs_df = get_xrefs_df()
    mesh_xrefs_df = xrefs_df[xrefs_df['source_db'] == 'mesh']
    curated_mesh_ids = set(mesh_xrefs_df['source_id'])

    terms = {
        identifier: (name, name[:-len(suffix)], suffix.rstrip('s'))  # (name, search text, singularized suffix)
        for identifier, name in pyobo.get_id_name_mapping('mesh').items()
        if identifier not in curated_mesh_ids and identifier not in BLACKLIST
        for suffix in SUFFIXES if name.lower().endswith(suffix)
    }

    it = sorted(terms.items(), key=lambda t: t[1][0])
    it = tqdm(it, desc='making MeSH curation sheet')
    for i, (identifier, (name, search_text, suffix)) in enumerate(it, start=1):
        for row in yield_gilda('mesh', identifier, name, suffix, search_text,
                               show_ungrounded or output is not None):
            print(*row, sep='\t', file=output)
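
The yield_gilda helper is likewise internal. A rough sketch of what it might do, built on Gilda's public ground() function; the exact row layout and the show_missing behavior are assumptions:

from gilda import ground


def yield_gilda(prefix, identifier, name, suffix, search_text, show_missing):
    """Yield curation rows for ``search_text``, grounded with Gilda (a sketch)."""
    matches = ground(search_text)
    for match in matches:
        # Each scored match carries the grounded term's database, identifier, and name
        yield prefix, identifier, name, suffix, match.term.db.lower(), match.term.id, match.term.entry_name
    if not matches and show_missing:
        yield prefix, identifier, name, suffix, '?', '?', '?'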
Example #4
import itertools as itt

import pandas as pd
import pyobo

BIOCHEMICAL_ROLE_CHEBI_ID = '52206'
PATHWAY_INHIBITOR_CHEBI_ID = '76932'
ENZYME_INHIBITOR_CHEBI_ID = '23924'
AGONIST_CHEBI_ID = '48705'
INVERSE_AGONIST_CHEBI_ID = '90847'
INHIBITOR_CHEBI_ID = '35222'
ANTAGONIST_CHEBI_ID = '48706'
BLACKLIST = [
    '48001',  # protein synthesis inhibitor
    '64106',  # protein kinase agonist
]

chebi_obo = pyobo.get('chebi')
chebi_id_to_name = pyobo.get_id_name_mapping('chebi')

XREFS_DF = get_xrefs_df()
CURATED_ROLE_CHEBI_IDS = {
    source_id[len('CHEBI:'):]
    for source_db, source_id in XREFS_DF[['source_db', 'source_id']].values
    if source_db == 'chebi'
}
IRRELEVANT_ROLE_CHEBI_IDS = set(
    itt.chain.from_iterable(
        chebi_obo.descendants(chebi_id[len('CHEBI:'):])
        for chebi_id in get_irrelevant_roles_df().identifier
        if chebi_id[len('CHEBI:'):] in chebi_obo.hierarchy))


def _get_inhibitors_reclassification() -> pd.DataFrame:
    return pd.read_csv(RECLASSIFICATION_PATH, sep='\t', comment='#')
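
A hypothetical helper showing how these module-level sets combine during curation; is_candidate_role is not part of the repository:

def is_candidate_role(chebi_id: str) -> bool:
    """Return True if a ChEBI role still needs curation (hypothetical helper)."""
    return (
        chebi_id not in CURATED_ROLE_CHEBI_IDS
        and chebi_id not in IRRELEVANT_ROLE_CHEBI_IDS
        and chebi_id not in BLACKLIST
    )


assert not is_candidate_role('48001')  # protein synthesis inhibitor is blacklisted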
Example #5
import itertools as itt
import logging
from collections import defaultdict

import pandas as pd
from pyobo import get_id_name_mapping

logger = logging.getLogger(__name__)


def get_relations_df() -> pd.DataFrame:
    """Assemble the relations dataframe."""
    xrefs_df = get_xrefs_df()

    logger.info('loading famplex mapping')
    famplex_id_to_members = defaultdict(list)
    famplex_relations_df = pd.read_csv(FAMPLEX_RELATIONS_URL)
    for source_db, source_name, rel, target_db, target_name in famplex_relations_df.values:
        if source_db.lower() == 'hgnc' and rel == 'isa' and target_db.lower() == 'fplx':
            try:
                hgnc_id = hgnc_name_to_id[source_name]
            except KeyError:
                logger.warning(f'Could not find {source_name} for fplx:{target_name}')
                continue
            famplex_id_to_members[target_name].append((hgnc_id, source_name))

    logger.info('getting enzyme classes')
    expasy_graph, ec_code_to_children = get_expasy_closure()
    logger.info('getting ec2go')
    ec2go = get_ec2go()

    logger.info('inferring over target hierarchies')
    #: Maps (source_db, source_id) to a list of (modulation, target_type, target_db, target_id, target_name) tuples
    x = defaultdict(list)
    for source_db, source_id, _, modulation, target_type, target_db, target_id, target_name in xrefs_df.values:
        if source_db != 'chebi':
            continue

        if target_db == 'hgnc':
            # Append original
            x[source_db, source_id].append((modulation, 'protein', 'hgnc', target_id, target_name))
            # Append inferred
            for uniprot_id, uniprot_name in get_uniprot_id_names(target_id):
                x[source_db, source_id].append((modulation, 'protein', 'uniprot', uniprot_id, uniprot_name))

        elif target_db == 'fplx':
            # Append original
            x[source_db, source_id].append((modulation, target_type, target_db, target_id, target_name))
            # Append inferred
            for hgnc_id, hgnc_symbol in famplex_id_to_members.get(target_id, []):
                x[source_db, source_id].append((modulation, 'protein', 'hgnc', hgnc_id, hgnc_symbol))
                for uniprot_id, uniprot_name in get_uniprot_id_names(hgnc_id):
                    x[source_db, source_id].append((modulation, 'protein', 'uniprot', uniprot_id, uniprot_name))

        elif target_db == 'ec-code':
            children_ec_codes = ec_code_to_children.get(target_id)
            if children_ec_codes is None:
                # this is the case for about 15 entries
                logger.info(f'could not find children of {target_db}:{target_id}')
                continue

            for sub_target_db, sub_target_id, sub_target_name in children_ec_codes:
                sub_target_type = DB_TO_TYPE[sub_target_db]
                x[source_db, source_id].append((
                    modulation, sub_target_type, sub_target_db, sub_target_id, sub_target_name,
                ))

            for go_id, go_name in ec2go.get(target_id, []):
                x[source_db, source_id].append((
                    modulation, 'molecular function', 'go', go_id, go_name,
                ))

        else:
            x[source_db, source_id].append((modulation, target_type, target_db, target_id, target_name))

    logger.info('inferring over role hierarchies')
    db_to_role_to_chemical_curies = {
        'chebi': get_chebi_role_to_children(),
    }
    db_to_id_mapping = {
        'chebi': get_id_name_mapping('chebi'),
    }
    #: A set of databases to remove the prefix from
    remove_prefix = {'chebi'}

    rows = []
    for (role_db, role_id), entries in x.items():
        # Normalize CURIEs like 'CHEBI:1234' down to the bare identifier
        if role_db in remove_prefix and role_id.lower().startswith(f'{role_db}:'):
            role_id = role_id[len(role_db) + 1:]

        # TODO map role_db, role_id to set of sub_role_db, sub_role_id
        sub_role_curies = {(role_db, role_id)}

        for modulation, target_type, target_db, target_id, target_name in entries:
            chemical_curies = set(itt.chain.from_iterable(
                db_to_role_to_chemical_curies[sub_role_db].get(sub_role_id, [])
                for sub_role_db, sub_role_id in sub_role_curies
            ))
            if not chemical_curies:
                logger.debug('no inference for %s:%s', role_db, role_id)
                continue
            for chemical_db, chemical_id in chemical_curies:
                rows.append((
                    chemical_db, chemical_id, db_to_id_mapping[chemical_db][chemical_id],
                    modulation, target_type, target_db, target_id, target_name,
                ))
    return pd.DataFrame(rows, columns=XREFS_COLUMNS)
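
Among the helpers this function assumes, get_uniprot_id_names maps an HGNC gene to its UniProt counterpart. A minimal sketch using PyOBO's filtered xrefs, with the caching strategy as an assumption:

from functools import lru_cache

import pyobo


@lru_cache(maxsize=1)
def _hgnc_to_uniprot():
    # PyOBO can extract the xrefs from HGNC that point into a single target namespace
    return pyobo.get_filtered_xrefs('hgnc', 'uniprot')


def get_uniprot_id_names(hgnc_id):
    """Yield (uniprot_id, name) pairs for an HGNC identifier (a sketch)."""
    uniprot_id = _hgnc_to_uniprot().get(hgnc_id)
    if uniprot_id is not None:
        yield uniprot_id, pyobo.get_name('uniprot', uniprot_id)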
Example #6
import logging
import os
import time

import matplotlib.pyplot as plt
import seaborn as sns
from tabulate import tabulate

logger = logging.getLogger(__name__)


def rewrite_repo_readme():
    """Rewrite the summary of curated content in the repository's readme, automatically."""
    df = get_xrefs_df()

    summary_df = df.groupby(['source_db', 'modulation', 'type', 'target_db']).size().reset_index()
    summary_df.columns = ['Source Database', 'Modulation', 'Target Type', 'Target Database', 'Count']
    summary_df.to_csv(os.path.join(EXPORT_DIRECTORY, 'curated_summary.tsv'), sep='\t', index=False)

    modulation_summary_df = df.groupby('modulation').size().reset_index()
    modulation_summary_df.columns = ['Modulation', 'Count']
    modulation_summary_df.to_csv(
        os.path.join(EXPORT_DIRECTORY, 'curated_summary_by_modulation.tsv'), sep='\t', index=False,
    )
    type_summary_df = df.groupby('type').size().reset_index()
    type_summary_df.columns = ['Target Type', 'Count']
    type_summary_df.to_csv(
        os.path.join(EXPORT_DIRECTORY, 'curated_summary_by_type.tsv'), sep='\t', index=False,
    )
    namespace_summary_df = df.groupby('target_db').size().reset_index()
    namespace_summary_df.columns = ['Target Database', 'Count']
    namespace_summary_df.to_csv(
        os.path.join(EXPORT_DIRECTORY, 'curated_summary_by_namespace.tsv'), sep='\t', index=False,
    )

    logger.info('Plotting modulation and target type summary')
    fig, (lax, rax) = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
    g = sns.barplot(y="Modulation", x='Count', data=modulation_summary_df, ax=lax)
    g.set_xscale("log")
    g = sns.barplot(y="Target Type", x='Count', data=type_summary_df, ax=rax)
    g.set_xscale("log")
    plt.tight_layout()
    plt.savefig(os.path.join(EXPORT_DIRECTORY, 'curated_summary.png'), dpi=300)

    text = f'There are {len(df.index)} curated roles as of export on {time.asctime()}\n\n'
    text += tabulate(modulation_summary_df.values, ['Modulation', 'Count'], tablefmt='rst')
    text += '\n\n'
    text += tabulate(type_summary_df.values, ['Target Entity Type', 'Count'], tablefmt='rst')
    text += '\n\n'
    text += tabulate(namespace_summary_df.values, ['Target Database', 'Count'], tablefmt='rst')
    text += '\n'

    path = os.path.join(HERE, 'README.rst')
    with open(path) as file:
        readme = [line.rstrip() for line in file]

    for i, line in enumerate(readme):
        if line == 'Summary':
            start = i + 2  # skip the section title and its RST underline
            break
    else:
        raise ValueError('could not find summary block')

    for i, line in enumerate(readme):
        if line == 'Axioms':
            end = i
            break
    else:
        raise ValueError('could not find the Axioms block that ends the summary')

    with open(path, 'w') as file:
        for line in readme[:start]:
            print(line, file=file)
        print(text, file=file)
        for line in readme[end:]:
            print(line, file=file)
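
For reference, the splice above assumes the README contains a literal 'Summary' title (underlined, RST-style) whose body is regenerated in place, terminated by a literal 'Axioms' title, roughly:

Summary
-------
<regenerated counts and tables go here>

Axioms
------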