def get_bel() -> pybel.BELGraph:
    """Build a BEL graph out of the ComPath pathway mappings.

    Each row of the table returned by ``get_df()`` is turned into one edge
    between two biological process nodes: ``isPartOf`` rows become part-of
    relations and ``equivalentTo`` rows become equivalences.

    :return: The graph containing one edge per mapping row.
    :raises ValueError: If a row carries a relation other than ``isPartOf``
        or ``equivalentTo``.
    """
    summary = (
        'Hierarchical and equivalence relations between entries in KEGG, Reactome, PathBank,'
        ' and WikiPathways.'
    )
    graph = pybel.BELGraph(
        name='ComPath Mappings',
        version='1.1.0',
        description=summary,
    )

    for row in get_df().values:
        # Rows are seven columns wide: the source triple, the relation, the target triple.
        source_ns, source_id, source_name, relation, target_ns, target_id, target_name = row

        subject = BiologicalProcess(namespace=source_ns, identifier=source_id, name=source_name)
        obj = BiologicalProcess(namespace=target_ns, identifier=target_id, name=target_name)

        if relation == 'isPartOf':
            graph.add_part_of(subject, obj)
            continue
        if relation == 'equivalentTo':
            graph.add_equivalence(subject, obj)
            continue

        # Anything else is an unknown relation type in the mapping table.
        raise ValueError(f'invalid mapping with relation: {relation}')

    return graph
def gobp(name: str, identifier: str) -> BiologicalProcess:
    """Create a biological process node in the ``go`` namespace.

    :param name: The name given to the node.
    :param identifier: The identifier given to the node.
    :return: A ``BiologicalProcess`` built from the given name and identifier.
    """
    return BiologicalProcess(namespace='go', identifier=identifier, name=name)
def test_bioprocess(self):
    # A biological process given only a namespace and a name should
    # stringify to the bp(...) form below.
    expected = 'bp(GO:apoptosis)'
    apoptosis = BiologicalProcess(namespace='GO', name='apoptosis')
    self.assertEqual(expected, str(apoptosis))
Gene('HGNC', 'NCF1'), Protein('HGNC', 'HBP1') ]), Protein('HGNC', 'HBP1'), ComplexAbundance([Protein('HGNC', 'FOS'), Protein('HGNC', 'JUN')]), Protein('HGNC', 'FOS'), Protein('HGNC', 'JUN'), Rna('HGNC', 'CFTR', variants=Hgvs('r.1521_1523delcuu')), Rna('HGNC', 'CFTR'), Rna('HGNC', 'CFTR', variants=Hgvs('r.1653_1655delcuu')), CompositeAbundance([ interleukin_23_complex, il6 ]), il6, BiologicalProcess('GO', 'cell cycle arrest'), hydrogen_peroxide, Protein('HGNC', 'CAT'), Gene('HGNC', 'CAT'), Protein('HGNC', 'HMGCR'), BiologicalProcess('GO', 'cholesterol biosynthetic process'), Gene('HGNC', 'APP', variants=Hgvs('c.275341G>C')), Gene('HGNC', 'APP'), Pathology('MESHD', 'Alzheimer Disease'), ComplexAbundance([Protein('HGNC', 'F3'), Protein('HGNC', 'F7')]), Protein('HGNC', 'F3'), Protein('HGNC', 'F7'), Protein('HGNC', 'F9'), Protein('HGNC', 'GSK3B', variants=ProteinModification('Ph', 'Ser', 9)), Protein('HGNC', 'GSK3B'), Pathology('MESHD', 'Psoriasis'),
ComplexAbundance, NamedComplexAbundance, Pathology, Protein, ProteinModification, ) from pybel.language import activity_mapping from pybel.testing.constants import test_jgif_path from tests.constants import TestGraphMixin logging.getLogger('pybel.parser').setLevel(20) calcium = Abundance('SCHEM', 'Calcium') calcineurin_complex = NamedComplexAbundance('SCOMP', 'Calcineurin Complex') foxo3 = Protein('HGNC', 'FOXO3') tcell_proliferation = BiologicalProcess( 'GO', 'CD8-positive, alpha-beta T cell proliferation') il15 = Protein('HGNC', 'IL15') il2rg = Protein('MGI', 'Il2rg') jgif_expected_nodes = { calcium, calcineurin_complex, foxo3, tcell_proliferation, il15, il2rg, Protein('HGNC', 'CXCR6'), Protein('HGNC', 'IL15RA'), BiologicalProcess('GO', 'lymphocyte chemotaxis'), Protein('HGNC', 'IL2RG'), Protein('HGNC', 'ZAP70'), NamedComplexAbundance('SCOMP', 'T Cell Receptor Complex'),
return {RELATION: x} def _rela(x, y=None): return {RELATION: x, OBJECT: activity(y)} def _assoc(y): return {RELATION: ASSOCIATION, 'association_type': y} a1 = Abundance('CHEBI', '1') p1 = Protein('HGNC', '1') pf1 = Protein('INTERPRO', '1') d1 = Pathology('MESH', '1') b1 = BiologicalProcess('GO', '1') b2 = BiologicalProcess('GO', '2') m1 = MicroRna('MIRBASE', '1') r1 = Rna('HGNC', '1') r2 = Rna('HGNC', '2') nca1 = NamedComplexAbundance('FPLX', '1') pop1 = Population('taxonomy', '1') p2 = Protein('HGNC', identifier='9236') p3 = Protein('HGNC', identifier='9212') r3 = p3.get_rna() g3 = r3.get_gene() c1 = ComplexAbundance([p2, g3]) c2 = ComplexAbundance([p1, p2]) c3 = ComplexAbundance([a1, p2])
# PyBEL manager cls.pybel_manager = pybel.Manager(engine=cls.engine, session=cls.session) cls.pybel_manager.create_all() @classmethod def tearDownClass(cls): """Close the connection in the manager and deletes the temporary database.""" cls.session.close() super().tearDownClass() protein_a = Protein(namespace=HGNC, identifier='2976', name='DNMT1') protein_b = Protein(namespace=HGNC, identifier='9173', name='POLA1') gene_c = Gene(namespace=HGNC, identifier='8903', name='PGLS') pathway_a = BiologicalProcess(namespace=WIKIPATHWAYS, identifier='WP1604', name='Codeine and Morphine Metabolism') def get_enrichment_graph(): """Build a simple test graph with 2 proteins, one gene, and one pathway all contained in HGNC.""" graph = BELGraph( name='My test graph for enrichment', version='0.0.1', ) graph.add_increases(protein_a, protein_b, citation='1234', evidence='') graph.add_decreases(protein_b, gene_c, citation='1234', evidence='') graph.add_part_of(gene_c, pathway_a) return graph
def normalize_graph_names(graph: BELGraph, database: str) -> None:
    """Normalize the names of the nodes in a graph and relabel the graph accordingly.

    Every node of ``graph`` is visited once. Its name is lower-cased and stripped of
    surrounding double quotes and whitespace, then database-specific rules decide which
    replacement node(s) it should be relabeled to. Two tables are filled along the way:

    - a one-to-one table (node -> single replacement node), and
    - a one-to-many table (node -> set of replacement nodes), used when one entry is
      flattened into several.

    Both tables are finally passed to ``relabel_nodes`` and ``multi_relabel``.

    Nodes that are ``ListAbundance`` or ``Reaction`` instances, or whose ``name`` is
    falsy, are left out of both tables.

    :param graph: The BEL graph whose nodes are normalized. The return values of
        ``relabel_nodes`` / ``multi_relabel`` are discarded, so this presumably relies
        on them modifying ``graph`` in place -- TODO confirm.
    :param database: Name of the database the graph came from. It is compared against
        the ``REACTOME`` and ``WIKIPATHWAYS`` constants and the literal ``'wikipathways'``.
    """
    # Victim to Survivor (one to one node) mapping
    one_to_one_mapping = {}
    # Victim to Survivors (one to many nodes) mapping
    one_to_many_mapping = defaultdict(set)

    for node in graph.nodes():
        # Skip ListAbundances and Reactions since they do not have a name
        if isinstance(node, ListAbundance) or isinstance(
                node, Reaction) or not node.name:
            continue

        # Normalize names: Lower case name and strip quotes or white spaces
        lower_name = node.name.lower().strip('"').strip()

        # Dealing with Genes/miRNAs
        if isinstance(node, CentralDogma):
            ##################
            # miRNA entities #
            ##################
            if lower_name.startswith("mir"):
                # Reactome preprocessing to flat multiple identifiers
                if database == REACTOME:
                    reactome_cell = munge_reactome_gene(lower_name)
                    if isinstance(reactome_cell, list):
                        # NOTE(review): this loop rebinds ``lower_name``; after a non-empty
                        # list, the suffix stripping below operates on the list's last element.
                        for lower_name in reactome_cell:
                            one_to_many_mapping[node].add(
                                MicroRna(
                                    node.namespace,
                                    name=lower_name.replace("mir-", "mir"),
                                    identifier=node.identifier,
                                ),
                            )

                    # Drop a trailing " genes" / " gene" suffix from the name
                    if lower_name.endswith(' genes'):
                        lower_name = lower_name[:-len(' genes')]
                    elif lower_name.endswith(' gene'):
                        lower_name = lower_name[:-len(' gene')]

                    # NOTE(review): with the nesting as laid out here, a node whose munged
                    # value is a list ends up in BOTH mappings, and this replacement is
                    # built without an identifier -- confirm both are intended.
                    one_to_one_mapping[node] = MicroRna(
                        node.namespace,
                        name=lower_name.replace(
                            "mir-", "mir"),  # Special case for Reactome
                    )
                    continue

                # KEGG and Reactome
                # (uses the original ``node.name``, not the lower-cased ``lower_name``)
                one_to_one_mapping[node] = MicroRna(
                    node.namespace,
                    name=node.name.replace("mir-", "mir"),
                    identifier=node.identifier,
                )

            ##################
            # Genes entities #
            ##################
            else:
                # Reactome preprocessing to flat multiple identifiers
                if database == REACTOME:
                    reactome_cell = munge_reactome_gene(lower_name)
                    if isinstance(reactome_cell, list):
                        for lower_name in reactome_cell:
                            if lower_name in BLACK_LIST_REACTOME:  # Filter entities in black list
                                continue
                            elif lower_name.startswith(
                                    "("):  # remove redundant parentheses
                                lower_name = lower_name.strip("(").strip(")")
                            one_to_many_mapping[node].add(
                                Protein(node.namespace, name=lower_name,
                                        identifier=node.identifier),
                            )
                    else:
                        # Single (non-list) value: plain one-to-one replacement
                        one_to_one_mapping[node] = Protein(
                            node.namespace, name=lower_name, identifier=node.identifier)
                    continue

                # WikiPathways and KEGG do not require any processing of genes
                elif database == WIKIPATHWAYS and lower_name in WIKIPATHWAYS_BIOL_PROCESS:
                    # Names listed in WIKIPATHWAYS_BIOL_PROCESS are re-typed as biological processes
                    one_to_one_mapping[node] = BiologicalProcess(
                        node.namespace,
                        name=lower_name,
                        identifier=node.identifier,
                    )
                    continue

                # Default for the remaining gene-like nodes: relabel as a Protein
                one_to_one_mapping[node] = Protein(node.namespace, name=lower_name, identifier=node.identifier)

        #######################
        # Metabolite entities #
        #######################
        elif isinstance(node, Abundance):
            # NOTE(review): string literal here, while the gene branch above compares against
            # the WIKIPATHWAYS constant -- presumably the same value; confirm.
            if database == 'wikipathways':
                # Biological processes that are captured as abundance in
                # BEL since they were characterized wrong in WikiPathways
                if lower_name in WIKIPATHWAYS_BIOL_PROCESS:
                    one_to_one_mapping[node] = BiologicalProcess(
                        node.namespace,
                        name=lower_name,
                        identifier=node.identifier,
                    )
                    continue

                # Abundances to BiologicalProcesses
                elif (node.namespace in {'WIKIDATA', 'WIKIPATHWAYS', 'REACTOME'}
                      and lower_name not in WIKIPATHWAYS_METAB):
                    one_to_one_mapping[node] = BiologicalProcess(
                        node.namespace,
                        name=lower_name,
                        identifier=node.identifier,
                    )
                    continue

                # Fix naming in duplicate entity
                if lower_name in WIKIPATHWAYS_NAME_NORMALIZATION:
                    lower_name = WIKIPATHWAYS_NAME_NORMALIZATION[lower_name]

            elif database == REACTOME:
                # Curated proteins that were coded as metabolites
                if lower_name in REACTOME_PROT:
                    one_to_one_mapping[node] = Protein(
                        node.namespace,
                        name=lower_name,
                        identifier=node.identifier,
                    )
                    continue

                # Flat multiple identifiers (this is not trivial because most of ChEBI names contain commas,
                # so a clever way to fix some of the entities is to check that all identifiers contain letters)
                elif "," in lower_name and all(
                        string.isalpha() for string in lower_name.split(",")):
                    for string in lower_name.split(","):
                        one_to_many_mapping[node].add(
                            Abundance(node.namespace, name=string,
                                      identifier=node.identifier),
                        )
                    continue

            # Fall-through for abundances not handled above: keep the type, use the normalized name
            one_to_one_mapping[node] = Abundance(node.namespace, name=lower_name, identifier=node.identifier)

        #################################
        # Biological Processes entities #
        #################################
        elif isinstance(node, BiologicalProcess):
            # KEGG normalize name by removing the title prefix
            if lower_name.startswith('title:'):
                lower_name = lower_name[len('title:'):]

            one_to_one_mapping[node] = BiologicalProcess(
                node.namespace,
                name=lower_name,
                identifier=node.identifier,
            )

    # Apply the single replacements first, then the one-to-many expansions
    relabel_nodes(graph, one_to_one_mapping)
    multi_relabel(graph, one_to_many_mapping)
'AKT1', variants=[Hgvs('c.1521_1523delCTT'), Hgvs('p.Phe508del')]), Gene('HGNC', 'NCF1'), ComplexAbundance([Gene('HGNC', 'NCF1'), Protein('HGNC', 'HBP1')]), Protein('HGNC', 'HBP1'), ComplexAbundance([Protein('HGNC', 'FOS'), Protein('HGNC', 'JUN')]), Protein('HGNC', 'FOS'), Protein('HGNC', 'JUN'), Rna('HGNC', 'CFTR', variants=Hgvs('r.1521_1523delcuu')), Rna('HGNC', 'CFTR'), Rna('HGNC', 'CFTR', variants=Hgvs('r.1653_1655delcuu')), CompositeAbundance([interleukin_23_complex, il6]), il6, BiologicalProcess('GO', 'cell cycle arrest'), hydrogen_peroxide, Protein('HGNC', 'CAT'), Gene('HGNC', 'CAT'), Protein('HGNC', 'HMGCR'), BiologicalProcess('GO', 'cholesterol biosynthetic process'), Gene('HGNC', 'APP', variants=Hgvs('c.275341G>C')), Gene('HGNC', 'APP'), Pathology('MESHD', 'Alzheimer Disease'), ComplexAbundance([Protein('HGNC', 'F3'), Protein('HGNC', 'F7')]), Protein('HGNC', 'F3'), Protein('HGNC', 'F7'), Protein('HGNC', 'F9'), Protein('HGNC', 'GSK3B', variants=ProteinModification('Ph', 'Ser', 9)), Protein('HGNC', 'GSK3B'), Pathology('MESHD', 'Psoriasis'),