def to_bel(self) -> BELGraph:
    """Build a BEL graph of equivalences between miRBase sequences and their xrefs.

    Every ``Sequence`` returned by ``self._get_query`` is turned into a node
    via ``as_pybel()``; for each of its ``xrefs`` an equivalence is added
    between that node and a ``MicroRna`` whose namespace is the xref's
    ``database`` and whose identifier is the xref's ``database_id``.

    :return: the populated graph
    """
    graph = BELGraph()

    for sequence in self._get_query(Sequence):
        source = sequence.as_pybel()
        for xref in sequence.xrefs:
            target = MicroRna(namespace=xref.database, identifier=xref.database_id)
            graph.add_equivalence(source, target)

    return graph
def test_mirna_reference(self):
    """Check the string form of a ``MicroRna`` built from a namespace and a name."""
    node = MicroRna(namespace='HGNC', name='MIR1')
    expected = 'm(HGNC:MIR1)'
    self.assertEqual(expected, str(node))
CHEBI = 'CHEBI'

# Module-level node fixtures shared by the tests below. The trailing digit of
# each variable matches the ``name`` given to the node.
g1 = Gene(namespace=HGNC, name='1')
r1 = Rna(namespace=HGNC, name='1')
p1 = Protein(HGNC, name='1')

g2 = Gene(HGNC, name='2')
r2 = Rna(HGNC, name='2')
p2 = Protein(HGNC, name='2')

g3 = Gene(namespace=HGNC, name='3')
r3 = Rna(namespace=HGNC, name='3')
p3 = Protein(namespace=HGNC, name='3')

g4 = Gene(namespace=HGNC, name='4')
m4 = MicroRna(namespace=HGNC, name='4')

a5 = Abundance(namespace=CHEBI, name='5')
p5 = Pathology(namespace=GO, name='5')


class TestCollapseProteinInteractions(unittest.TestCase):
    """Test cases that operate on graphs built from the module-level fixtures."""

    def test_protein_interaction_1(self):
        """Build a graph holding p1, p2, a5 and p5 with one p1-p2 positive correlation."""
        graph = BELGraph()

        # Nodes are registered in the same order as the fixtures are listed.
        for node in (p1, p2, a5, p5):
            graph.add_node_from_data(node)

        graph.add_qualified_edge(
            p1,
            p2,
            relation=POSITIVE_CORRELATION,
            citation=n(),
            evidence=n(),
        )
) BEL_THOROUGH_NODES = { oxygen_atom, tmprss2_erg_rna_fusion, tmprss2_erg_rna_fusion_unspecified, akt_methylated, bcr_jak2_rna_fusion, chchd4_aifm1_rna_fusion, akt1_gene, akt1_phe_508_del, akt1, Gene('HGNC', 'AKT1', variants=Hgvs('c.308G>A')), tmprss2_erg_gene_fusion, Gene('HGNC', 'AKT1', variants=[Hgvs('c.1521_1523delCTT'), Hgvs('c.308G>A'), Hgvs('p.Phe508del')]), MicroRna('HGNC', 'MIR21'), bcr_jak2_gene_fusion, Gene('HGNC', 'CFTR', variants=Hgvs('c.1521_1523delCTT')), Gene('HGNC', 'CFTR'), Gene('HGNC', 'CFTR', variants=Hgvs('g.117199646_117199648delCTT')), Gene('HGNC', 'CFTR', variants=Hgvs('c.1521_1523delCTT')), Protein('HGNC', 'AKT1', variants=ProteinModification('Ph', 'Ser', 473)), MicroRna('HGNC', 'MIR21', variants=Hgvs('p.Phe508del')), Protein('HGNC', 'AKT1', variants=Hgvs('p.C40*')), Protein('HGNC', 'AKT1', variants=[Hgvs('p.Ala127Tyr'), ProteinModification('Ph', 'Ser')]), chchd4_aifm1_gene_fusion, tmprss2_erg_protein_fusion, Protein('HGNC', 'AKT1', variants=Hgvs('p.Arg1851*')), bcr_jak2_protein_fusion, Protein('HGNC', 'AKT1', variants=Hgvs('p.40*')), chchd4_aifm1_protein_fusion,
def _rela(x, y=None):
    """Return a dict mapping ``RELATION`` to ``x`` and ``OBJECT`` to ``activity(y)``."""
    return {RELATION: x, OBJECT: activity(y)}


def _assoc(y):
    """Return a dict mapping ``RELATION`` to ``ASSOCIATION`` and ``'association_type'`` to ``y``."""
    return {RELATION: ASSOCIATION, 'association_type': y}


# Node fixtures created with a namespace and a second positional argument.
a1 = Abundance('CHEBI', '1')
p1 = Protein('HGNC', '1')
pf1 = Protein('INTERPRO', '1')
d1 = Pathology('MESH', '1')
b1 = BiologicalProcess('GO', '1')
b2 = BiologicalProcess('GO', '2')
m1 = MicroRna('MIRBASE', '1')
r1 = Rna('HGNC', '1')
r2 = Rna('HGNC', '2')
nca1 = NamedComplexAbundance('FPLX', '1')
pop1 = Population('taxonomy', '1')

# Proteins addressed by identifier instead of by name.
p2 = Protein('HGNC', identifier='9236')
p3 = Protein('HGNC', identifier='9212')
# RNA derived from p3, and the gene derived from that RNA.
r3 = p3.get_rna()
g3 = r3.get_gene()

# Complexes assembled from the fixtures above.
c1 = ComplexAbundance([p2, g3])
c2 = ComplexAbundance([p1, p2])
c3 = ComplexAbundance([a1, p2])

converters_true_list = [
def normalize_graph_names(graph: BELGraph, database: str) -> None:
    """Normalize graph names.

    Walks every node of ``graph``, derives a normalized name (lower-cased,
    with double quotes and then whitespace stripped from both ends) and
    records a replacement node for it. Replacements are collected in two
    mappings, one-to-one and one-to-many, which are handed to
    ``relabel_nodes`` and ``multi_relabel`` together with ``graph`` once the
    walk is finished. Nothing is returned.

    :param graph: graph whose nodes are inspected and relabelled
    :param database: name of the source database; it is compared against
        ``REACTOME``, ``WIKIPATHWAYS`` and the literal ``'wikipathways'`` to
        select database-specific handling
    """
    # Victim to Survivor (one to one node) mapping
    one_to_one_mapping = {}
    # Victim to Survivors (one to many nodes) mapping
    one_to_many_mapping = defaultdict(set)

    for node in graph.nodes():

        # Skip ListAbundances and Reactions since they do not have a name,
        # as well as any node whose name is empty/falsy
        if isinstance(node, ListAbundance) or isinstance(node, Reaction) or not node.name:
            continue

        # Normalize names: lower-case the name and strip quotes and white space
        lower_name = node.name.lower().strip('"').strip()

        # Dealing with Genes/miRNAs
        if isinstance(node, CentralDogma):

            ##################
            # miRNA entities #
            ##################

            if lower_name.startswith("mir"):

                # Reactome preprocessing to flatten multiple identifiers
                if database == REACTOME:
                    reactome_cell = munge_reactome_gene(lower_name)
                    if isinstance(reactome_cell, list):
                        # The loop rebinds lower_name; once it finishes,
                        # lower_name holds the last element of the list.
                        for lower_name in reactome_cell:
                            one_to_many_mapping[node].add(
                                MicroRna(
                                    node.namespace,
                                    name=lower_name.replace("mir-", "mir"),
                                    identifier=node.identifier,
                                ),
                            )

                    # NOTE(review): with this nesting a node whose cell is a
                    # list ends up in both mappings - confirm this is intended.
                    # Drop a trailing ' genes' / ' gene' suffix from the name
                    if lower_name.endswith(' genes'):
                        lower_name = lower_name[:-len(' genes')]
                    elif lower_name.endswith(' gene'):
                        lower_name = lower_name[:-len(' gene')]

                    # No identifier is passed for this replacement node
                    one_to_one_mapping[node] = MicroRna(
                        node.namespace,
                        name=lower_name.replace("mir-", "mir"),  # Special case for Reactome
                    )
                    continue

                # KEGG and Reactome
                # Uses node.name (not the lower-cased name) as the base
                one_to_one_mapping[node] = MicroRna(
                    node.namespace,
                    name=node.name.replace("mir-", "mir"),
                    identifier=node.identifier,
                )

            ##################
            # Genes entities #
            ##################

            else:
                # Reactome preprocessing to flatten multiple identifiers
                if database == REACTOME:
                    reactome_cell = munge_reactome_gene(lower_name)
                    if isinstance(reactome_cell, list):
                        for lower_name in reactome_cell:
                            if lower_name in BLACK_LIST_REACTOME:  # Filter entities in black list
                                continue
                            elif lower_name.startswith("("):  # remove redundant parentheses
                                lower_name = lower_name.strip("(").strip(")")

                            one_to_many_mapping[node].add(
                                Protein(node.namespace, name=lower_name, identifier=node.identifier),
                            )
                    else:
                        # Not a list: map the node to a single Protein
                        one_to_one_mapping[node] = Protein(
                            node.namespace, name=lower_name, identifier=node.identifier)

                    continue

                # WikiPathways and KEGG do not require any processing of genes
                elif database == WIKIPATHWAYS and lower_name in WIKIPATHWAYS_BIOL_PROCESS:
                    one_to_one_mapping[node] = BiologicalProcess(
                        node.namespace,
                        name=lower_name,
                        identifier=node.identifier,
                    )
                    continue

                one_to_one_mapping[node] = Protein(node.namespace, name=lower_name, identifier=node.identifier)

        #######################
        # Metabolite entities #
        #######################

        elif isinstance(node, Abundance):

            # NOTE(review): this branch compares against the literal
            # 'wikipathways' while the gene branch uses the WIKIPATHWAYS
            # constant - confirm both hold the same value.
            if database == 'wikipathways':
                # Biological processes that are captured as abundance in
                # BEL since they were characterized wrong in WikiPathways
                if lower_name in WIKIPATHWAYS_BIOL_PROCESS:
                    one_to_one_mapping[node] = BiologicalProcess(
                        node.namespace,
                        name=lower_name,
                        identifier=node.identifier,
                    )
                    continue

                # Abundances to BiologicalProcesses
                elif (node.namespace in {'WIKIDATA', 'WIKIPATHWAYS', 'REACTOME'} and lower_name not in WIKIPATHWAYS_METAB):
                    one_to_one_mapping[node] = BiologicalProcess(
                        node.namespace,
                        name=lower_name,
                        identifier=node.identifier,
                    )
                    continue

                # Fix naming in duplicate entity
                if lower_name in WIKIPATHWAYS_NAME_NORMALIZATION:
                    lower_name = WIKIPATHWAYS_NAME_NORMALIZATION[lower_name]

            elif database == REACTOME:
                # Curated proteins that were coded as metabolites
                if lower_name in REACTOME_PROT:
                    one_to_one_mapping[node] = Protein(
                        node.namespace,
                        name=lower_name,
                        identifier=node.identifier,
                    )
                    continue

                # Flatten multiple identifiers (this is not trivial because most ChEBI names contain commas,
                # so a clever way to fix some of the entities is to check that all identifiers contain only letters)
                # NOTE: `string` below is a local loop variable, not the stdlib module
                elif "," in lower_name and all(string.isalpha() for string in lower_name.split(",")):
                    for string in lower_name.split(","):
                        one_to_many_mapping[node].add(
                            Abundance(node.namespace, name=string, identifier=node.identifier),
                        )
                    continue

            # Fall-through for abundances that were not handled above
            one_to_one_mapping[node] = Abundance(node.namespace, name=lower_name, identifier=node.identifier)

        #################################
        # Biological Processes entities #
        #################################

        elif isinstance(node, BiologicalProcess):
            # KEGG: normalize the name by removing the 'title:' prefix
            if lower_name.startswith('title:'):
                lower_name = lower_name[len('title:'):]

            one_to_one_mapping[node] = BiologicalProcess(
                node.namespace,
                name=lower_name,
                identifier=node.identifier,
            )

    # Apply the collected replacements: one-to-one first, then one-to-many
    relabel_nodes(graph, one_to_one_mapping)
    multi_relabel(graph, one_to_many_mapping)
'HGNC', 'TMPRSS2'), partner_3p=Rna('HGNC', 'ERG')) BEL_THOROUGH_NODES = { oxygen_atom, tmprss2_erg_rna_fusion, tmprss2_erg_rna_fusion_unspecified, akt_methylated, bcr_jak2_rna_fusion, chchd4_aifm1_rna_fusion, akt1_gene, akt1_phe_508_del, akt1, Gene('HGNC', 'AKT1', variants=Hgvs('c.308G>A')), tmprss2_erg_gene_fusion, Gene('HGNC', 'AKT1', variants=[ Hgvs('c.1521_1523delCTT'), Hgvs('c.308G>A'), Hgvs('p.Phe508del') ]), MicroRna('HGNC', 'MIR21'), bcr_jak2_gene_fusion, Gene('HGNC', 'CFTR', variants=Hgvs('c.1521_1523delCTT')), Gene('HGNC', 'CFTR'), Gene('HGNC', 'CFTR', variants=Hgvs('g.117199646_117199648delCTT')), Gene('HGNC', 'CFTR', variants=Hgvs('c.1521_1523delCTT')), Protein('HGNC', 'AKT1', variants=ProteinModification('Ph', 'Ser', 473)), MicroRna('HGNC', 'MIR21', variants=Hgvs('p.Phe508del')), Protein('HGNC', 'AKT1', variants=Hgvs('p.C40*')), Protein('HGNC', 'AKT1', variants=[Hgvs('p.Ala127Tyr'), ProteinModification('Ph', 'Ser')]), chchd4_aifm1_gene_fusion, tmprss2_erg_protein_fusion, Protein('HGNC', 'AKT1', variants=Hgvs('p.Arg1851*')), bcr_jak2_protein_fusion, Protein('HGNC', 'AKT1',