def to_bel(self) -> BELGraph:
    """Export all stored pathways as a single BEL graph.

    The WikiPathways and HGNC namespaces returned by ``upload_bel_namespace()`` are
    registered on the graph, then every protein of every pathway is attached to its
    pathway with ``add_part_of``.

    Example Usage:

    >>> manager = Manager()
    >>> graph = manager.to_bel()

    :return: A BEL graph with one part-of edge per (protein, pathway) membership
    """
    graph = BELGraph(
        name='WikiPathways Associations',
        version='1.0.0',
    )

    wikipathways_namespace = self.upload_bel_namespace()
    graph.namespace_url[wikipathways_namespace.keyword] = wikipathways_namespace.url

    # Reuse this manager's engine and session for the HGNC manager.
    hgnc_manager = bio2bel_hgnc.Manager(engine=self.engine, session=self.session)
    hgnc_namespace = hgnc_manager.upload_bel_namespace()
    graph.namespace_url[hgnc_namespace.keyword] = hgnc_namespace.url

    for pathway in tqdm(self.get_all_pathways(), total=self._count_model(Pathway)):
        # The pathway node is identical for all of its proteins, so serialize it once
        # per pathway instead of once per protein.
        pathway_bel = pathway.serialize_to_pathway_node()
        for protein in pathway.proteins:
            graph.add_part_of(protein.serialize_to_protein_node(), pathway_bel)

    return graph
def to_bel(self, drug_namespace: Optional[str] = None, target_namespace: Optional[str] = None) -> BELGraph:
    """Build a BEL graph from the stored drug-protein interactions.

    :param drug_namespace: forwarded unchanged to each interaction's ``add_to_graph``
    :param target_namespace: forwarded unchanged to each interaction's ``add_to_graph``
    :return: A BEL graph named ``DrugBank`` to which every interaction has been added
    """
    rv = BELGraph(name='DrugBank', version='5.1.4')

    # Register this manager's namespace first, then the HGNC one.
    self.add_namespace_to_graph(rv)
    bio2bel_hgnc.Manager(engine=self.engine, session=self.session).add_namespace_to_graph(rv)

    interactions: Iterable[DrugProteinInteraction] = tqdm(
        self.list_drug_protein_interactions(),
        total=self.count_drug_protein_interactions(),
        desc='Mapping drug-protein interactions to BEL',
    )
    for interaction in interactions:
        interaction.add_to_graph(
            rv,
            drug_namespace=drug_namespace,
            target_namespace=target_namespace,
        )

    return rv
def populate(cls):
    """Load the HGNC, miRBase, and miRTarBase test data through the shared connection.

    Contents of the test Excel sheet:

    miRTarBase ID | miRNA | Species (miRNA) | Target Gene | Target Gene (Entrez Gene ID) | Species (Target Gene) | Experiments | Support Type | References (PMID)
    MIRT000002 | hsa-miR-20a-5p | Homo sapiens | HIF1A | 3091 | Homo sapiens | Luciferase reporter assay//Western blot//Northern blot//qRT-PCR | Functional MTI | 18632605
    MIRT000002 | hsa-miR-20a-5p | Homo sapiens | HIF1A | 3091 | Homo sapiens | Luciferase reporter assay//qRT-PCR//Western blot | Functional MTI | 23911400
    MIRT000002 | hsa-miR-20a-5p | Homo sapiens | HIF1A | 3091 | Homo sapiens | HITS-CLIP | Functional MTI (Weak) | 22473208
    MIRT000178 | hsa-miR-20a-5p | Homo sapiens | TCEAL1 | 9338 | Homo sapiens | Luciferase reporter assay//Microarray//Northern blot//qRT-PCR//Western blot | Functional MTI | 23059786
    MIRT000004 | dme-miR-8-3p | Drosophila melanogaster | ush | 33225 | Drosophila melanogaster | qRT-PCR//Luciferase reporter assay//Western blot | Functional MTI | 20005803
    MIRT000005 | mmu-miR-124-3p | Mus musculus | Itgb1 | 16412 | Mus musculus | Luciferase reporter assay//Microarray//qRT-PCR | Functional MTI | 18042700
    MIRT000005 | mmu-miR-124-3p | Mus musculus | Itgb1 | 16412 | Mus musculus | Luciferase reporter assay//qRT-PCR//Western blot//Reporter assay;Microarray | Functional MTI | 18619591
    MIRT000006 | hsa-miR-146a-5p | Homo sapiens | CXCR4 | 7852 | Homo sapiens | qRT-PCR//Luciferase reporter assay//Western blot | Functional MTI | 18568019
    MIRT000006 | hsa-miR-146a-5p | Homo sapiens | CXCR4 | 7852 | Homo sapiens | Microarray | Functional MTI (Weak) | 20375304
    MIRT000012 | hsa-miR-122-5p | Homo sapiens | CYP7A1 | 1581 | Homo sapiens | qRT-PCR//Luciferase reporter assay | Functional MTI | 20351063
    """
    # HGNC: create the tables, then insert the genes from the test JSON file.
    hgnc_manager = cls.hgnc_manager = bio2bel_hgnc.Manager(connection=cls.connection)
    hgnc_manager._create_tables()
    hgnc_manager.insert_hgnc(
        hgnc_dict=hgnc_manager.load_hgnc_json(hgnc_file_path=TEST_HGNC_JSON),
        silent=True,
        low_memory=False,
    )

    # miRBase: load the definitions from the test JSON file.
    mirbase_manager = cls.mirbase_manager = bio2bel_mirbase.Manager(connection=cls.connection)
    with open(TEST_MIRBASE_JSON) as mirbase_file:
        mirbase_entries = json.load(mirbase_file)
    mirbase_manager._populate_definitions_helper(mirbase_entries)

    # miRTarBase: populate the manager under test from the Excel sheet shown above.
    cls.manager.populate(TEST_MIRTARBASE_EXCEL)
def populate(self, url: Union[None, str, Iterable[str]] = None):
    """Load pathways and their member proteins from GMT files into the database.

    HGNC is populated first if its manager reports that it is not populated yet.
    Entrez gene identifiers without an HGNC symbol are reported and skipped.

    :param url: the location of one GMT file or an iterable of such locations;
        ``None`` is passed through to ``parse_gmt_file``
    :raises TypeError: if ``url`` is neither ``None``, a string, nor an iterable
    """
    hgnc_manager = bio2bel_hgnc.Manager(engine=self.engine, session=self.session)
    if not hgnc_manager.is_populated():
        hgnc_manager.populate()

    if url is None or isinstance(url, str):
        pathways = parse_gmt_file(url=url)
    elif isinstance(url, Iterable):
        pathways = []
        for single_url in url:
            pathways.extend(parse_gmt_file(url=single_url))
    else:
        raise TypeError(f'Invalid type for url: {type(url)} ({url})')

    # Lookup tables used to translate between identifier systems
    symbol_by_entrez_id = hgnc_manager.build_entrez_id_symbol_mapping()
    hgnc_id_by_symbol = hgnc_manager.build_hgnc_symbol_id_mapping()

    # Proteins already resolved during this run, keyed by Entrez identifier
    protein_by_entrez_id = {}
    unmapped_entrez_ids = set()

    progress = tqdm(pathways, desc='Loading WikiPathways')
    for pathway_name, species, wikipathways_id, gene_set in progress:
        pathway = self.get_or_create_pathway(
            wikipathways_id=wikipathways_id,
            name=pathway_name.strip(),
            species=species,
        )

        for entrez_id in gene_set:
            if entrez_id not in protein_by_entrez_id:
                hgnc_symbol = symbol_by_entrez_id.get(entrez_id)
                if not hgnc_symbol:
                    # Write through the progress bar so its rendering is not broken.
                    progress.write(
                        f"({species}) ncbigene:{entrez_id} has no HGNC symbol"
                    )
                    unmapped_entrez_ids.add(entrez_id)
                    continue

                protein_by_entrez_id[entrez_id] = self.get_or_create_protein(
                    entrez_id,
                    hgnc_symbol,
                    hgnc_id_by_symbol[hgnc_symbol],
                )

            protein = protein_by_entrez_id[entrez_id]
            if pathway not in protein.pathways:
                protein.pathways.append(pathway)

        self.session.commit()

    if unmapped_entrez_ids:
        log.warning("Total of {} missing ENTREZ".format(
            len(unmapped_entrez_ids)))
def prerender(graph: BELGraph, hgnc_manager=None) -> Mapping[str, Mapping[str, Any]]:
    """Generate the annotations JSON for Ideogram.

    :param graph: A BEL graph; only its ``CentralDogma`` nodes in the HGNC namespace are used
    :param hgnc_manager: A Bio2BEL HGNC manager. A default one is created if none is given.
    :return: A dictionary from HGNC symbol to a dictionary with the keys ``name``, ``start``,
        and ``stop`` (taken from the human RefSeq table) and, when HGNC has a location for
        the gene, ``chr``
    """
    import re

    import bio2bel_hgnc
    from bio2bel_entrez.parser import get_human_refseq_slim_df
    from bio2bel_hgnc.models import HumanGene

    if hgnc_manager is None:
        hgnc_manager = bio2bel_hgnc.Manager()

    hgnc_symbols = {
        node.name
        for node in graph
        if isinstance(node, CentralDogma) and node.namespace.lower() == 'hgnc'
    }

    refseq_df = get_human_refseq_slim_df()

    # Start/stop positions come from RefSeq; the first column of each row is not needed.
    result = {
        hgnc_symbol: dict(name=hgnc_symbol, start=start, stop=stop)
        for _, hgnc_symbol, start, stop in refseq_df[refseq_df['Symbol'].isin(hgnc_symbols)].values
    }

    human_genes = (
        hgnc_manager.session
        .query(HumanGene.symbol, HumanGene.location)
        .filter(HumanGene.symbol.in_(hgnc_symbols))
        .all()
    )
    for human_gene in human_genes:
        if human_gene.symbol not in result:
            continue  # the RefSeq table has no position for this symbol
        if human_gene.location is None:
            continue  # no location recorded, so there is no chromosome to report
        # The chromosome is everything before the first arm designator. Splitting on
        # whichever of "p"/"q" comes first also handles locations spanning both arms
        # (e.g. "1p13-q21"), which splitting on "q" alone would turn into "1p13-".
        result[human_gene.symbol]['chr'] = re.split('[pq]', human_gene.location, maxsplit=1)[0]

    return result
def main():
    """Write the Entrez-to-HGNC identifier and symbol mappings to JSON files.

    The HGNC database is populated first if its manager reports it is not populated.
    The files are written to the current working directory.
    """
    hgnc_manager = bio2bel_hgnc.Manager()
    if not hgnc_manager.is_populated():
        hgnc_manager.populate()

    # Each output file is paired with the manager method that builds its content.
    exports = (
        ('entrez_id_to_hgnc_id.json', hgnc_manager.build_entrez_id_to_hgnc_id_mapping),
        ('entrez_id_to_hgnc_symbol.json', hgnc_manager.build_entrez_id_to_hgnc_symbol_mapping),
    )
    for path, build_mapping in exports:
        mapping = build_mapping()
        with open(path, 'w') as file:
            json.dump(mapping, file, indent=2, sort_keys=True)
def __init__(self, hgnc_manager: Optional[bio2bel_hgnc.Manager] = None) -> None:
    """Build the lookup tables from genes to chromosomes and to sequence positions.

    :param hgnc_manager: A Bio2BEL HGNC manager. A default one is created if none is given.
        It is populated first if ``is_populated()`` reports that it is not populated.
    """
    if hgnc_manager is None:
        logger.info('getting Bio2BEL HGNC manager')
        hgnc_manager = bio2bel_hgnc.Manager()

    if not hgnc_manager.is_populated():
        logger.info('populating HGNC')
        hgnc_manager.populate()

    # The chromosome is the part of the location before the first match of
    # CHROMOSOME_SPLIT_RE; genes without a location are left out of both maps.
    logger.info('generating hgnc symbol to chromosome mapping')
    self.hgnc_symbol_to_chromosome = {
        str(symbol): CHROMOSOME_SPLIT_RE.split(location)[0]
        for symbol, location in hgnc_manager.session.query(HumanGene.symbol, HumanGene.location)
        if location is not None
    }

    logger.info('generating hgnc id to chromosome mapping')
    self.hgnc_id_to_chromosome = {
        str(hgnc_id): CHROMOSOME_SPLIT_RE.split(location)[0]
        for hgnc_id, location in hgnc_manager.session.query(HumanGene.identifier, HumanGene.location)
        if location is not None
    }

    self.hgnc_id_to_positions = {}
    self.hgnc_symbol_to_positions = {}

    # This message used to repeat the symbol-to-chromosome one, which made the log misleading.
    logger.info('generating entrez id to hgnc id mapping')
    self.entrez_id_to_hgnc_id = hgnc_manager.build_entrez_id_to_hgnc_id_mapping()

    logger.info('getting human refseq data')
    self.human_refseq_df = get_human_refseq_slim_df()

    logger.info('generating maps with refseq data')
    for entrez_id, symbol, start, end in self.human_refseq_df.values:
        # The mapping is looked up with string keys, hence the conversion.
        hgnc_id = self.entrez_id_to_hgnc_id.get(str(entrez_id))
        if hgnc_id is None:
            logger.debug(
                f'Could not find ncbigene:{entrez_id} in HGNC. May be withdrawn'
            )
            continue
        # Positions are only recorded, under either key, for genes that map to HGNC.
        self.hgnc_id_to_positions[hgnc_id] = start, end
        self.hgnc_symbol_to_positions[symbol] = start, end

    # These will get populated as graphs are added with update_chromosome_map()
    self.chromosome_to_edge_keys = defaultdict(list)
    self.cross_chromosome_to_edge_keys = defaultdict(list)
def _get_mappings():
    """Build the gene symbol mappings provided by the HGNC, MGI, and RGD managers.

    Each manager is populated first if it reports that it is not populated yet.

    :return: a triple with the results of the HGNC, MGI, and RGD mapping builders, in that order
    """
    def _populated(manager):
        """Populate the manager if needed and hand it back."""
        if not manager.is_populated():
            manager.populate()
        return manager

    hgnc_mapping = _populated(bio2bel_hgnc.Manager()).build_hgnc_symbol_id_mapping()
    mgi_mapping = _populated(bio2bel_mgi.Manager()).build_mgi_gene_symbol_to_mgi_id_mapping()
    rgd_mapping = _populated(bio2bel_rgd.Manager()).build_rgd_gene_symbol_to_rgd_id_mapping()

    return hgnc_mapping, mgi_mapping, rgd_mapping
def setUpClass(cls):
    """Prepare the HGNC, pathway, and PyBEL managers on one shared engine and session."""
    super().setUpClass()

    engine, session = build_engine_session(connection=cls.connection)
    cls.engine, cls.session = engine, session

    # All three managers below share the same engine and session.
    shared = dict(engine=engine, session=session)

    # HGNC manager, filled from the HGNC test file
    hgnc_manager = cls.hgnc_manager = bio2bel_hgnc.Manager(**shared)
    hgnc_manager.create_all()
    hgnc_manager.populate(hgnc_file_path=hgnc_test_path, use_hcop=False)

    # Manager under test, filled from the gene sets test file given as a file URI
    cls.manager = Manager(**shared)
    cls.manager.populate(url=pathlib.Path(gene_sets_path).as_uri())

    # PyBEL manager
    cls.pybel_manager = pybel.Manager(**shared)
    cls.pybel_manager.create_all()
def get_pathway_graph_by_id(self, wikipathways_id: str) -> Optional[BELGraph]:
    """Build a BEL graph for a single pathway.

    :param wikipathways_id: WikiPathways identifier
    :return: A BEL graph linking each protein of the pathway to the pathway node, or
        ``None`` if no pathway is found for the identifier

    Example Usage:

    >>> manager = Manager()
    >>> manager.get_pathway_graph_by_id('WP61')  # Notch signaling pathway
    """
    pathway = self.get_pathway_by_id(wikipathways_id)
    if pathway is None:
        return None

    rv = BELGraph(name=f'{pathway.name} ({pathway.species})', version='1.0.0')

    # Register the namespace of this manager, then the HGNC one.
    hgnc_manager = bio2bel_hgnc.Manager(engine=self.engine, session=self.session)
    for manager in (self, hgnc_manager):
        namespace = manager.upload_bel_namespace()
        rv.namespace_url[namespace.keyword] = namespace.url

    pathway_node = pathway.serialize_to_pathway_node()
    for protein in pathway.proteins:
        rv.add_part_of(protein.serialize_to_protein_node(), pathway_node)

    return rv
def get_drug_to_hgnc_symbols(self, cache=True, recalculate=False) -> Dict[str, List[str]]:
    """Map each drug to the HGNC gene symbols of its targets.

    :param cache: if true, read the result from the cache file when it exists and write
        the cache file after a fresh calculation
    :param recalculate: if true, ignore an existing cache file and calculate again
    :return: a dictionary from drug to a list of HGNC symbols; HGNC identifiers without a
        symbol are logged and left out
    """
    use_cached = cache and not recalculate and os.path.exists(_dti_symbols_cache_path)
    if use_cached:
        log.debug('loading cached DTIs with gene symbols')
        with open(_dti_symbols_cache_path) as file:
            return json.load(file)

    hgnc_manager = bio2bel_hgnc.Manager(engine=self.engine, session=self.session)
    if not hgnc_manager.is_populated():
        hgnc_manager.populate()

    symbol_by_hgnc_id = hgnc_manager.build_hgnc_id_symbol_mapping()
    hgnc_ids_by_drug = self.get_drug_to_hgnc_ids()

    symbols_by_drug = defaultdict(list)
    for drug, hgnc_ids in hgnc_ids_by_drug.items():
        for hgnc_id in hgnc_ids:
            hgnc_symbol = symbol_by_hgnc_id.get(hgnc_id)
            if hgnc_symbol is not None:
                symbols_by_drug[drug].append(hgnc_symbol)
            else:
                log.warning('could not map HGNC identifier: %s', hgnc_id)

    if cache:
        with open(_dti_symbols_cache_path, 'w') as file:
            log.info('dumping cached DTIs')
            json.dump(symbols_by_drug, file)

    # Hand back a plain dict so that missing keys do not silently create entries.
    return dict(symbols_by_drug)
def get_graph(
    force: bool = False,
    force_global: bool = False,
    names: Optional[NamesList] = None,
    resources_directory: Optional[str] = None,
) -> BELGraph:
    """Build, or load from the cache, the union of the graphs of all requested resources.

    After merging, nodes in an Entrez namespace whose identifier has an HGNC symbol are
    relabeled to HGNC nodes, the graph is enriched, and the result is pickled.

    :param force: Should cached files be overwritten? Passed to ``get_graph_by_manager``.
    :param force_global: Should the global cache file be overwritten?
    :param names: The name of the bio2bel packages to use and arguments
    :param resources_directory: A non-default place to store the resources
    :return: the merged graph
    """
    cache_path = os.path.join(resources_directory or RESOURCES, CACHE_NAME)
    if os.path.exists(cache_path) and not force_global:
        logger.info('Getting cached full graph')
        return from_pickle(cache_path)

    if names is None:
        names = DEFAULT_NAMES

    logger.info('Generating graphs')
    graphs = []
    for manager_name, to_bel_kwargs in names:
        manager_graph = get_graph_by_manager(manager_name, force=force, to_bel_kwargs=to_bel_kwargs)
        logger.info(manager_graph.summary_str())
        graphs.append(manager_graph)

    logger.info('Merging graphs')
    graph = pybel.union(graphs)
    graph.name = f'Graph from: {", ".join(g.name for g in graphs)}'
    graph.version = '0.0.1'
    logger.info('Finished merging graphs')

    logger.info('Preparing HGNC mappings')
    hgnc_manager = bio2bel_hgnc.Manager()
    hgnc_symbol_to_id = hgnc_manager.build_hgnc_symbol_id_mapping()
    entrez_id_to_hgnc_symbol = hgnc_manager.build_entrez_id_to_hgnc_symbol_mapping()

    logger.info('Generating namespace mapping for nodes')
    entrez_namespaces = {'ncbigene', 'egid'}
    mapping = {}
    for node in graph:
        namespace = node.get('namespace')
        if namespace is None or namespace.lower() not in entrez_namespaces:
            continue
        if node.identifier not in entrez_id_to_hgnc_symbol:
            continue
        hgnc_symbol = entrez_id_to_hgnc_symbol[node.identifier]
        # Keep the node's class and only swap its namespace, name, and identifier.
        mapping[node] = node.__class__(
            namespace='hgnc',
            name=hgnc_symbol,
            identifier=hgnc_symbol_to_id[hgnc_symbol],
        )

    logger.info('Relabeling nodes')
    nx.relabel_nodes(graph, mapping, copy=False)

    logger.info('Enriching central dogma')
    enrich_protein_and_rna_origins(graph)

    logger.info('Exporting snp2k pickle')
    to_pickle(graph, cache_path)

    return graph
def init_app(self, app: flask.Flask) -> None:
    """Initialize a Flask app.

    Registers this object under ``app.extensions['bio2bel']`` and then sets up a manager
    for every optional package that can be imported. Packages that are not installed are
    skipped silently. Finally, every non-``None`` attribute whose name ends in
    ``_manager`` is added to ``self.manager_dict``.

    :param app: the Flask application to attach to
    """
    import importlib

    self.app = app
    app.extensions['bio2bel'] = self

    # One row per optional Bio2BEL package, processed in order:
    # (module to import, attribute set on self, label for the log message,
    #  whether to call create_all(), manager methods to register as in-place transformations)
    optional_managers = (
        ('bio2bel_chebi', 'chebi_manager', 'ChEBI', True, ('enrich_chemical_hierarchy',)),
        ('bio2bel_hgnc', 'hgnc_manager', 'HGNC', True, (
            'enrich_genes_with_families',
            'enrich_families_with_genes',
        )),
        ('bio2bel_mirtarbase', 'mirtarbase_manager', 'miRTarBase', True, (
            'enrich_mirnas',
            'enrich_rnas',
        )),
        ('bio2bel_expasy', 'expasy_manager', 'ExPASy', True, (
            'enrich_proteins_with_enzyme_families',
            'enrich_enzymes',
        )),
        # create_all() is deliberately not called for GO, matching the previous behavior.
        ('bio2bel_go', 'go_manager', 'GO', False, ('enrich_bioprocesses',)),
        ('bio2bel_entrez', 'entrez_manager', 'Entrez', True, ()),
        ('bio2bel_interpro', 'interpro_manager', 'InterPro', True, ()),
        ('bio2bel_ctd', 'ctd_manager', 'CTD', True, ('enrich_graph_genes',)),
        ('bio2bel_hmdb', 'hmdb_manager', 'HMDB', True, ()),
        ('bio2bel_hmdd', 'hmdd_manager', 'HMDD', True, ()),
        ('bio2bel_mir2disease', 'mir2disease_manager', 'mir2disease', True, ()),
        ('bio2bel_drugbank', 'drugbank_manager', 'DrugBank', True, ()),
        ('bio2bel_phosphosite', 'phosphosite_manager', 'PhosphoSitePlus', True, ()),
        ('bio2bel_sider', 'sider_manager', 'SIDER', True, ()),
        ('bio2bel_mesh', 'mesh_manager', 'MeSH', True, ()),
        ('bio2bel_mgi', 'mgi_manager', 'MGI', True, ()),
        ('bio2bel_rgd', 'rgd_manager', 'RGD', True, ()),
    )

    for module_name, attribute, label, create_tables, enrichers in optional_managers:
        # Only the import itself is guarded: errors while setting up a manager of an
        # installed package still propagate.
        try:
            module = importlib.import_module(module_name)
        except ImportError:
            continue

        logger.debug('Using Bio2BEL %s', label)
        manager = module.Manager(connection=self.connection)
        setattr(self, attribute, manager)
        if create_tables:
            manager.create_all()
        for enricher in enrichers:
            in_place_transformation(getattr(manager, enricher))

    # CONSO is handled separately: its manager lives in a submodule, takes no
    # connection, and has no tables to create.
    try:
        conso_manager_module = importlib.import_module('conso.manager')
    except ImportError:
        pass
    else:
        logger.debug('Using Bio2BEL CONSO')
        self.conso_manager = conso_manager_module.Manager()

    self.manager_dict.update({
        name: manager
        for name, manager in self.__dict__.items()
        if name.endswith('_manager') and manager is not None
    })
def populate(self, source: Optional[str] = None, update: bool = False) -> None:
    """Load miRTarBase interactions and their evidences into the database.

    HGNC and miRBase are populated beforehand when their managers report they are not
    populated, or when ``update`` is set. Species, miRNAs, targets, and interactions are
    created once each and reused across rows; one evidence is created per row. Everything
    is committed in a single transaction at the end.

    :param source: path or link to data source needed for :func:`get_data`
    :param update: Should HGNC and miRBase be updated?
    """
    hgnc_manager = bio2bel_hgnc.Manager(connection=self.connection)
    if not hgnc_manager.is_populated() or update:
        hgnc_manager.populate()

    mirbase_manager = bio2bel_mirbase.Manager(connection=self.connection)
    if not mirbase_manager.is_populated() or update:
        mirbase_manager.populate()

    started = time.time()
    logger.info('getting data')
    df = get_data(source)
    logger.info('got data in %.2f seconds', time.time() - started)

    # Models created during this run, so that each one is only built and added once
    mirna_by_name = {}
    target_by_entrez_id = {}
    species_by_name = {}
    interaction_by_key = {}

    gene_by_entrez_id = _build_entrez_map(hgnc_manager)

    def _get_or_add_species(species_name):
        """Return the species with this name, creating and adding it on first use."""
        species = species_by_name.get(species_name)
        if species is None:
            species = species_by_name[species_name] = Species(name=species_name)
            self.session.add(species)
        return species

    def _get_or_add_mirna(mirna_name, species_name):
        """Return the miRNA with this name, creating and adding it on first use."""
        mirna = mirna_by_name.get(mirna_name)
        if mirna is None:
            mirna = mirna_by_name[mirna_name] = Mirna(
                name=mirna_name,
                species=_get_or_add_species(species_name),
            )
            self.session.add(mirna)
        return mirna

    def _get_or_add_target(entrez_id, gene_name, species_name):
        """Return the target with this Entrez identifier, creating and adding it on first use."""
        target = target_by_entrez_id.get(entrez_id)
        if target is None:
            target = target_by_entrez_id[entrez_id] = Target(
                entrez_id=entrez_id,
                species=_get_or_add_species(species_name),
                name=gene_name,
            )
            # Annotate the target with HGNC information when the Entrez map knows the gene.
            if entrez_id in gene_by_entrez_id:
                gene = gene_by_entrez_id[entrez_id]
                target.hgnc_symbol = gene.symbol
                target.hgnc_id = str(gene.identifier)
            self.session.add(target)
        return target

    logger.info('building models')
    started = time.time()
    rows = tqdm(df.values, total=len(df.index))
    for mirtarbase_id, mirna_name, mirna_species, gene_name, entrez_id, target_species, exp, sup_type, pubmed in rows:
        # Normalize the Entrez identifier to the string form of an integer.
        entrez_id = str(int(entrez_id))

        # An interaction is identified by its miRNA and its target gene.
        interaction_key = (mirna_name, entrez_id)
        interaction = interaction_by_key.get(interaction_key)
        if interaction is None:
            mirna = _get_or_add_mirna(mirna_name, mirna_species)
            target = _get_or_add_target(entrez_id, gene_name, target_species)
            interaction = interaction_by_key[interaction_key] = Interaction(
                mirtarbase_id=mirtarbase_id,
                mirna=mirna,
                target=target
            )
            self.session.add(interaction)

        # Every row contributes one evidence to its interaction.
        self.session.add(Evidence(
            experiment=exp,
            support=sup_type,
            reference=pubmed,
            interaction=interaction,
        ))

    logger.info('built models in %.2f seconds', time.time() - started)

    logger.info('committing models')
    started = time.time()
    self.session.commit()
    logger.info('committed after %.2f seconds', time.time() - started)