Example #1
0
def get_df() -> pd.DataFrame:
    """Load the BioGRID identifiers mapping table for the current release.

    The release is resolved through ``bioversions`` and used both to build the
    download URL and as the ``version`` handed to ``ensure_df``. A
    ``taxonomy_id`` column is added by mapping ``_lookup`` over the
    ``ORGANISM_OFFICIAL_NAME`` column.

    :returns: The identifiers dataframe (all values read with ``dtype=str``)
        with the extra ``taxonomy_id`` column.
    """
    release = bioversions.get_version('biogrid')
    archive_url = f'{BASE_URL}/BIOGRID-{release}/BIOGRID-IDENTIFIERS-{release}.tab.zip'
    identifiers = ensure_df(
        PREFIX,
        url=archive_url,
        skiprows=28,  # presumably the preamble before the header row; confirm against the file
        dtype=str,
        version=release,
    )
    organism_names = identifiers['ORGANISM_OFFICIAL_NAME']
    identifiers['taxonomy_id'] = organism_names.map(_lookup)
    return identifiers
Example #2
0
def get_df(version: str) -> pd.DataFrame:
    """Get a combined ComplexPortal dataframe.

    One ``complextab`` TSV is loaded per entry in ``SPECIES`` for the given
    release, and the per-species frames are concatenated in that order.

    :param version: The ComplexPortal release, used in the FTP path and passed
        to ``ensure_df``.
    :returns: The concatenation of all per-species dataframes.
    """
    url_base = f'ftp://ftp.ebi.ac.uk/pub/databases/intact/complex/{version}/complextab'

    frames = []
    for species in SPECIES:
        species_url = f'{url_base}/{species}.tsv'
        frame = ensure_df(
            PREFIX,
            url=species_url,
            version=version,
            na_values={'-'},  # a lone dash is read as a missing value
            names=COLUMNS,
            header=0,
            dtype=DTYPE,
        )
        frames.append(frame)
    return pd.concat(frames)
Example #3
0
def get_df() -> pd.DataFrame:
    """Get a combined ComplexPortal dataframe.

    Every URL in ``URLS`` is loaded with ``ensure_df`` under the module-level
    ``VERSION``, and the resulting frames are concatenated in order.

    :returns: The concatenation of the dataframes for all URLs.
    """
    frames = []
    for url in URLS:
        frame = ensure_df(
            PREFIX,
            url,
            version=VERSION,
            na_values={'-'},  # a lone dash is read as a missing value
            names=COLUMNS,
            header=0,
            dtype=DTYPE,
        )
        frames.append(frame)
    return pd.concat(frames)
Example #4
0
def get_chembl_compound_equivalences_raw(usecols=None, version: Optional[str] = None) -> pd.DataFrame:
    """Get the raw ChEMBL chemical representations dataframe.

    :param usecols: Forwarded unchanged to ``ensure_df`` as ``usecols``.
    :param version: The ChEMBL release to download. If None, the current
        release is looked up with ``bioversions``.
    :returns: The tab-separated ``chemreps`` file as a dataframe.
    """
    release = bioversions.get_version('chembl') if version is None else version

    # NOTE(review): the release is only used to build the URL and is not
    # forwarded to ensure_df - confirm that is intended.
    url = (
        f'ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/chembl_{release}'
        f'/chembl_{release}_chemreps.txt.gz'
    )
    return ensure_df(CHEMBL_COMPOUND_PREFIX, url=url, sep='\t', usecols=usecols)
Example #5
0
def get_chembl_protein_equivalences(version: Optional[str] = None) -> pd.DataFrame:
    """Get ChEMBL protein equivalences.

    :param version: The ChEMBL release to download. If None, the current
        release is looked up with ``bioversions``.
    :returns: A dataframe restricted to ``XREF_COLUMNS`` in which the source
        prefix is ``chembl.target``, the target prefix is ``uniprot``, and the
        provenance is ``chembl`` followed by the release.
    """
    release = version if version is not None else bioversions.get_version('chembl')
    mapping_url = (
        f'ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/chembl_{release}'
        '/chembl_uniprot_mapping.txt'
    )

    # The first file column is read as the target and the second as the
    # source, i.e. the two columns are switched around relative to the file.
    column_names = [TARGET_ID, SOURCE_ID]
    mapping = ensure_df(
        CHEMBL_TARGET_PREFIX,
        url=mapping_url,
        sep='\t',
        usecols=[0, 1],
        names=column_names,
    )

    # Fill in the columns that are constant for every row.
    constant_columns = (
        (SOURCE_PREFIX, 'chembl.target'),
        (TARGET_PREFIX, 'uniprot'),
        (PROVENANCE, f'chembl{release}'),
    )
    for column, value in constant_columns:
        mapping.loc[:, column] = value

    return mapping[XREF_COLUMNS]
Example #6
0
def iter_terms(version: str) -> Iterable[Term]:
    """Iterate over DrugCentral terms.

    Each row of the DrugCentral table must unpack into SMILES, InChI, InChIKey,
    DrugCentral identifier, name, and CAS number. Rows missing any of the three
    structure fields are logged and skipped. A CAS cross-reference is attached
    only when the CAS value is present.

    :param version: The DrugCentral version passed to ``ensure_df``.
    :yields: One term per row that has complete structure information.
    """
    table = ensure_df(PREFIX, url=URL, version=version)
    for row in table.values:
        smiles, inchi, inchi_key, drugcentral_id, drugcentral_name, cas = row

        structure_fields = (
            ('smiles', smiles),
            ('inchi', inchi),
            ('inchikey', inchi_key),
        )
        if any(pd.isna(value) for _, value in structure_fields):
            logger.warning('missing data for drugcentral:%s', drugcentral_id)
            continue

        xrefs = [
            Reference(prefix=prefix, identifier=value)
            for prefix, value in structure_fields
        ]
        if not pd.isna(cas):
            xrefs.append(Reference(prefix='cas', identifier=cas))

        reference = Reference(
            prefix=PREFIX,
            identifier=drugcentral_id,
            name=drugcentral_name,
        )
        yield Term(reference=reference, xrefs=xrefs)
Example #7
0
def get_df() -> pd.DataFrame:
    """Load the BioGRID identifiers mapping table from the module-level ``URL``.

    A ``taxonomy_id`` column is added by mapping ``_lookup`` over the
    ``ORGANISM_OFFICIAL_NAME`` column.

    :returns: The identifiers dataframe (all values read with ``dtype=str``)
        with the extra ``taxonomy_id`` column.
    """
    identifiers = ensure_df(
        PREFIX,
        URL,
        skiprows=28,  # presumably the preamble before the header row; confirm against the file
        dtype=str,
    )
    organism_names = identifiers['ORGANISM_OFFICIAL_NAME']
    identifiers['taxonomy_id'] = organism_names.map(_lookup)
    return identifiers
Example #8
0
def _get_famplex_reference(namespace, name, hgnc_name_to_id):
    """Build a reference for one end of a FamPlex relation.

    :param namespace: The namespace column of the relation row (``HGNC``,
        ``FPLX``, or ``UP``).
    :param name: The entity name column of the relation row.
    :param hgnc_name_to_id: A mapping from HGNC gene names to HGNC identifiers.
    :returns: A reference for the entity, or None if the relation should be
        skipped (``UP`` entities, and HGNC names that cannot be resolved).
    :raises RuntimeError: If the namespace is not one of the handled values.
    """
    if namespace == 'FPLX':
        return Reference(prefix='fplx', identifier=name, name=name)
    if namespace == 'HGNC':
        identifier = hgnc_name_to_id.get(name)
        if identifier is None:
            logger.warning(
                '[%s] could not look up HGNC identifier for gene: %s',
                PREFIX, name)
            # Without an identifier there is nothing valid to point at.
            return None
        return Reference(prefix='hgnc', identifier=identifier, name=name)
    if namespace == 'UP':
        return None
    raise RuntimeError(f'[{PREFIX}] unhandled namespace in relations: {namespace}')


def get_terms(version: str) -> Iterable[Term]:
    """Get the FamPlex terms.

    The entities and relations tables are loaded for the given version. Each
    relation row is indexed by its head and by its tail, and one term is then
    yielded per entity with its outgoing and incoming relations attached.

    Relation rows are skipped when either end is in the ``UP`` namespace or is
    an HGNC name that can't be mapped to an identifier.

    :param version: The FamPlex git reference used in the raw GitHub URLs and
        passed to ``ensure_df``.
    :yields: One term per row of the entities table.
    :raises RuntimeError: If a relation row uses an unhandled namespace.
    """
    entities_url = f'https://raw.githubusercontent.com/sorgerlab/famplex/{version}/entities.csv'
    entities_df = ensure_df(PREFIX,
                            url=entities_url,
                            version=version,
                            dtype=str)

    relations_url = f'https://raw.githubusercontent.com/sorgerlab/famplex/{version}/relations.csv'
    relations_df = ensure_df(PREFIX,
                             url=relations_url,
                             version=version,
                             header=None,
                             sep=',',
                             dtype=str)

    # TODO add xrefs
    # xrefs_url = f'https://raw.githubusercontent.com/sorgerlab/famplex/{version}/equivalences.csv'
    # xrefs_df = ensure_df(PREFIX, url=xrefs_url, version=version, header=None, sep=',', dtype=str)

    hgnc_name_to_id = get_name_id_mapping('hgnc')
    in_edges = defaultdict(list)  # tail reference -> [(relation, head reference)]
    out_edges = defaultdict(list)  # head reference -> [(relation, tail reference)]
    for h_ns, h_name, r, t_ns, t_name in relations_df.values:
        h = _get_famplex_reference(h_ns, h_name, hgnc_name_to_id)
        if h is None:
            continue
        t = _get_famplex_reference(t_ns, t_name, hgnc_name_to_id)
        if t is None:
            continue

        out_edges[h].append((r, t))
        in_edges[t].append((r, h))

    # The entities table is unpacked as a single column holding the entity name.
    for entity, in entities_df.values:
        reference = Reference(prefix=PREFIX, identifier=entity, name=entity)
        term = Term(reference=reference)

        # .get() rather than indexing, so the defaultdicts don't grow on misses.
        for r, t in out_edges.get(reference, []):
            if r == 'isa' and t.prefix == 'fplx':
                term.append_parent(t)
            elif r == 'isa':
                term.append_relationship(is_a, t)
            elif r == 'partof':
                term.append_relationship(part_of, t)
            else:
                logger.warning('unhandled relation %s', r)

        # Incoming edges are recorded with the inverse relation.
        for r, h in in_edges.get(reference, []):
            if r == 'isa':
                term.append_relationship(has_member, h)
            elif r == 'partof':
                term.append_relationship(has_part, h)
            else:
                logger.warning('unhandled relation %s', r)
        yield term