def get_df() -> pd.DataFrame:
    """Load the BioGRID identifier mapping table for the current release.

    The release is obtained from ``bioversions.get_version('biogrid')`` and is
    used both to build the archive URL and as the ``version`` passed to
    ``ensure_df``.

    :returns: The identifier table with an additional ``taxonomy_id`` column,
        computed by mapping ``ORGANISM_OFFICIAL_NAME`` through ``_lookup``.
    """
    version = bioversions.get_version('biogrid')
    identifiers_df = ensure_df(
        PREFIX,
        url=f'{BASE_URL}/BIOGRID-{version}/BIOGRID-IDENTIFIERS-{version}.tab.zip',
        skiprows=28,
        dtype=str,
        version=version,
    )
    organism_names = identifiers_df['ORGANISM_OFFICIAL_NAME']
    identifiers_df['taxonomy_id'] = organism_names.map(_lookup)
    return identifiers_df
def get_df(version: str) -> pd.DataFrame:
    """Get a combined ComplexPortal dataframe.

    One ``complextab`` TSV is loaded per entry in ``SPECIES`` and the
    resulting frames are concatenated in that order.

    :param version: Release directory name used in the FTP path; also passed
        to ``ensure_df`` as ``version``.
    :returns: The concatenation of all per-species dataframes.
    """
    url_base = f'ftp://ftp.ebi.ac.uk/pub/databases/intact/complex/{version}/complextab'
    frames = []
    for species in SPECIES:
        species_df = ensure_df(
            PREFIX,
            url=f'{url_base}/{species}.tsv',
            version=version,
            na_values={'-'},
            names=COLUMNS,
            header=0,
            dtype=DTYPE,
        )
        frames.append(species_df)
    return pd.concat(frames)
def get_df() -> pd.DataFrame:
    """Get a combined ComplexPortal dataframe.

    Every URL in the module-level ``URLS`` is loaded with ``ensure_df`` (using
    the module-level ``VERSION``) and the resulting frames are concatenated
    in the order of ``URLS``.

    :returns: The concatenation of all loaded dataframes.
    """
    frames = []
    for source_url in URLS:
        frame = ensure_df(
            PREFIX,
            source_url,
            version=VERSION,
            na_values={'-'},
            names=COLUMNS,
            header=0,
            dtype=DTYPE,
        )
        frames.append(frame)
    return pd.concat(frames)
def get_chembl_compound_equivalences_raw(
    usecols=None,
    version: Optional[str] = None,
) -> pd.DataFrame:
    """Get the chemical representations raw dataframe.

    :param usecols: Column selection forwarded unchanged to ``ensure_df``.
    :param version: ChEMBL release to load. When ``None``, the release
        reported by ``bioversions.get_version('chembl')`` is used.
    :returns: The dataframe produced by ``ensure_df`` for the release's
        tab-separated ``chemreps`` file.
    """
    if version is None:
        version = bioversions.get_version('chembl')

    base_url = f'ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/chembl_{version}'
    url = f'{base_url}/chembl_{version}_chemreps.txt.gz'
    # Forward the resolved release, as the other versioned loaders do.
    # NOTE(review): presumably ``ensure_df`` keys its local copy on ``version`` - confirm.
    return ensure_df(
        CHEMBL_COMPOUND_PREFIX,
        url=url,
        version=version,
        sep='\t',
        usecols=usecols,
    )
def get_chembl_protein_equivalences(version: Optional[str] = None) -> pd.DataFrame:
    """Get ChEMBL protein equivalences.

    :param version: ChEMBL release to load. When ``None``, the release
        reported by ``bioversions.get_version('chembl')`` is used.
    :returns: A dataframe restricted to ``XREF_COLUMNS`` in which every row
        has source prefix ``chembl.target``, target prefix ``uniprot`` and
        provenance ``chembl<version>``.
    """
    if version is None:
        version = bioversions.get_version('chembl')

    url = f'ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/chembl_{version}/chembl_uniprot_mapping.txt'
    # The file name is the same for every release, so the release must be
    # forwarded explicitly to keep loads of different releases apart.
    # NOTE(review): presumably ``ensure_df`` keys its local copy on ``version`` - confirm.
    df = ensure_df(
        CHEMBL_TARGET_PREFIX,
        url=url,
        version=version,
        sep='\t',
        usecols=[0, 1],
        # Switch around: the first column is labelled as the target
        # identifier and the second as the source identifier.
        names=[TARGET_ID, SOURCE_ID],
    )
    df.loc[:, SOURCE_PREFIX] = 'chembl.target'
    df.loc[:, TARGET_PREFIX] = 'uniprot'
    df.loc[:, PROVENANCE] = f'chembl{version}'
    return df[XREF_COLUMNS]
def iter_terms(version: str) -> Iterable[Term]:
    """Iterate over DrugCentral terms.

    Each row of the loaded table is unpacked into SMILES, InChI, InChIKey,
    DrugCentral identifier, DrugCentral name and CAS number, in that order.
    Rows lacking any of the three structure fields are logged and skipped.

    :param version: Version passed to ``ensure_df`` when loading ``URL``.
    :returns: One ``Term`` per complete row, carrying ``smiles``, ``inchi``
        and ``inchikey`` cross-references plus a ``cas`` cross-reference when
        a CAS number is present.
    """
    structure_prefixes = ('smiles', 'inchi', 'inchikey')
    table = ensure_df(PREFIX, url=URL, version=version)
    for row in table.values:
        smiles, inchi, inchi_key, drugcentral_id, drugcentral_name, cas = row
        structures = (smiles, inchi, inchi_key)
        if any(pd.isna(value) for value in structures):
            logger.warning('missing data for drugcentral:%s', drugcentral_id)
            continue

        xrefs = [
            Reference(prefix=prefix, identifier=value)
            for prefix, value in zip(structure_prefixes, structures)
        ]
        if not pd.isna(cas):
            xrefs.append(Reference(prefix='cas', identifier=cas))

        term_reference = Reference(
            prefix=PREFIX,
            identifier=drugcentral_id,
            name=drugcentral_name,
        )
        yield Term(reference=term_reference, xrefs=xrefs)
def get_df() -> pd.DataFrame:
    """Load the BioGRID identifier mapping table from the module-level ``URL``.

    :returns: The identifier table with an additional ``taxonomy_id`` column,
        computed by mapping ``ORGANISM_OFFICIAL_NAME`` through ``_lookup``.
    """
    identifiers_df = ensure_df(PREFIX, URL, skiprows=28, dtype=str)
    organism_names = identifiers_df['ORGANISM_OFFICIAL_NAME']
    identifiers_df['taxonomy_id'] = organism_names.map(_lookup)
    return identifiers_df
def _famplex_reference(namespace, name, hgnc_name_to_id):
    """Build the reference for one side (head or tail) of a FamPlex relation.

    :param namespace: Namespace value from a relations row; ``HGNC``,
        ``FPLX`` and ``UP`` are handled.
    :param name: Entity name from the same side of the row.
    :param hgnc_name_to_id: Name-to-identifier mapping as returned by
        ``get_name_id_mapping('hgnc')``.
    :returns: A ``Reference``, or ``None`` when the relation should be
        skipped (a ``UP`` entry, or an HGNC name with no known identifier).
    :raises ValueError: If the namespace is not one of the handled values.
    """
    if namespace == 'FPLX':
        return Reference(prefix='fplx', identifier=name, name=name)
    if namespace == 'UP':
        return None
    if namespace == 'HGNC':
        identifier = hgnc_name_to_id.get(name)
        if identifier is None:
            logger.warning(
                '[%s] could not look up HGNC identifier for gene: %s', PREFIX, name,
            )
            # A reference without an identifier is unusable, so drop the relation.
            return None
        return Reference(prefix='hgnc', identifier=identifier, name=name)
    raise ValueError(f'[{PREFIX}] unhandled namespace in relations: {namespace!r}')


def get_terms(version: str) -> Iterable[Term]:
    """Get the FamPlex terms.

    Loads ``entities.csv`` and ``relations.csv`` for the given FamPlex
    revision, indexes the relations by head and by tail, and yields one term
    per entity with its parents and relationships attached.

    :param version: Revision of the ``sorgerlab/famplex`` repository used in
        the raw GitHub URLs; also passed to ``ensure_df`` as ``version``.
    :returns: An iterable of ``Term`` objects, one per row of the entities file.
    :raises ValueError: If a relation row uses a namespace other than
        ``HGNC``, ``FPLX`` or ``UP``.
    """
    entities_url = f'https://raw.githubusercontent.com/sorgerlab/famplex/{version}/entities.csv'
    entities_df = ensure_df(PREFIX, url=entities_url, version=version, dtype=str)

    relations_url = f'https://raw.githubusercontent.com/sorgerlab/famplex/{version}/relations.csv'
    relations_df = ensure_df(
        PREFIX, url=relations_url, version=version, header=None, sep=',', dtype=str,
    )

    # TODO add xrefs from equivalences.csv
    # xrefs_url = f'https://raw.githubusercontent.com/sorgerlab/famplex/{version}/equivalences.csv'
    # xrefs_df = ensure_df(PREFIX, url=xrefs_url, version=version, header=None, sep=',', dtype=str)

    hgnc_name_to_id = get_name_id_mapping('hgnc')

    # out_edges: head -> [(relation, tail)]; in_edges: tail -> [(relation, head)]
    in_edges = defaultdict(list)
    out_edges = defaultdict(list)
    for h_ns, h_name, r, t_ns, t_name in relations_df.values:
        h = _famplex_reference(h_ns, h_name, hgnc_name_to_id)
        if h is None:
            continue
        # The tail is resolved from its own namespace, not the head's.
        t = _famplex_reference(t_ns, t_name, hgnc_name_to_id)
        if t is None:
            continue
        out_edges[h].append((r, t))
        in_edges[t].append((r, h))

    for (entity,) in entities_df.values:
        reference = Reference(prefix=PREFIX, identifier=entity, name=entity)
        term = Term(reference=reference)

        # Relations in which this entity is the head.
        for r, t in out_edges.get(reference, []):
            if r == 'isa' and t.prefix == 'fplx':
                term.append_parent(t)
            elif r == 'isa':
                term.append_relationship(is_a, t)
            elif r == 'partof':
                term.append_relationship(part_of, t)
            else:
                logger.warning('unhandled relation %s', r)

        # Relations in which this entity is the tail, recorded as inverses.
        for r, h in in_edges.get(reference, []):
            if r == 'isa':
                term.append_relationship(has_member, h)
            elif r == 'partof':
                term.append_relationship(has_part, h)
            else:
                logger.warning('unhandled relation %s', r)

        yield term