def test_regulate_amount4_subj_act():
    """Check that a subject activity modifier becomes an ActivityCondition.

    The same scenario is run for the ``tscript`` and the generic ``act``
    BEL activities. Each edge must yield exactly one IncreaseAmount
    statement whose subject carries an active ActivityCondition of the
    expected type, backed by a single piece of evidence.
    """
    mek = protein(name='MAP2K1', namespace='HGNC')
    erk = protein(name='MAPK1', namespace='HGNC')

    # (BEL activity name, activity type expected on the statement subject)
    cases = [
        ('tscript', 'transcription'),
        ('act', 'activity'),
    ]
    for bel_activity, expected_type in cases:
        g = pybel.BELGraph()
        g.add_qualified_edge(mek, erk, relation=pc.INCREASES,
                             subject_modifier=activity(name=bel_activity),
                             evidence="Some evidence.", citation='123456')
        pbp = bel.process_pybel_graph(g)
        assert pbp.statements
        assert len(pbp.statements) == 1
        stmt = pbp.statements[0]
        assert isinstance(stmt, IncreaseAmount)
        subj = stmt.subj
        assert subj.name == 'MAP2K1'
        assert isinstance(subj.activity, ActivityCondition)
        assert subj.activity.activity_type == expected_type
        # Identity check, matching the other tests in this file.
        assert subj.activity.is_active is True
        assert len(stmt.evidence) == 1
def test_phosphorylation_one_site_with_evidence():
    """Check a single-site phosphorylation edge and the evidence it produces.

    A DIRECTLY_INCREASES edge from MAP2K1 to MAPK1 phosphorylated at Thr185
    must give one Phosphorylation statement, and its Evidence must carry the
    edge hash, citation, text, BEL annotation and epistemics.
    """
    ev_text = 'Some evidence.'
    ev_pmid = '123456'
    kinase = protein(name='MAP2K1', namespace='HGNC')
    site = pmod('Ph', position=185, code='Thr')
    substrate = protein(name='MAPK1', namespace='HGNC', variants=[site])

    graph = pybel.BELGraph()
    edge_hash = graph.add_qualified_edge(
        kinase,
        substrate,
        relation=pc.DIRECTLY_INCREASES,
        evidence=ev_text,
        citation=ev_pmid,
        annotations={"TextLocation": 'Abstract'},
    )
    processor = bel.process_pybel_graph(graph)
    assert processor.statements
    assert len(processor.statements) == 1

    stmt = processor.statements[0]
    assert isinstance(stmt, Phosphorylation)
    assert stmt.residue == 'T'
    assert stmt.position == '185'
    enz = stmt.enz
    sub = stmt.sub
    assert enz.name == 'MAP2K1'
    assert enz.mods == []
    assert sub.name == 'MAPK1'
    assert sub.mods == []

    # Check evidence
    assert len(stmt.evidence) == 1
    evidence = stmt.evidence[0]
    assert evidence.source_api == 'bel'
    assert evidence.source_id == edge_hash
    assert evidence.pmid == ev_pmid
    assert evidence.text == ev_text
    expected_annotations = {
        'bel': 'p(HGNC:MAP2K1) directlyIncreases '
               'p(HGNC:MAPK1, pmod(Ph, Thr, 185))',
    }
    assert evidence.annotations == expected_annotations
    expected_epistemics = {'direct': True, 'section_type': 'abstract'}
    assert evidence.epistemics == expected_epistemics
def test_simple(self):
    """Generate mechanisms for a small graph with one bioprocess.

    Three proteins annotated under the ``DGXP`` key feed into bioprocess D;
    exactly one candidate mechanism, keyed by D, is expected.
    """
    score_key = 'DGXP'
    prot_a = protein('HGNC', 'A')
    prot_b = protein('HGNC', 'B')
    prot_c = protein('HGNC', 'c')
    process_d = bioprocess('GOBP', 'D')

    graph = pybel.BELGraph()
    for node in (prot_a, prot_b, prot_c, process_d):
        graph.add_node_from_data(node)

    # Only the proteins get a value under the data key.
    for node, score in ((prot_a, 2), (prot_b, -1), (prot_c, 1)):
        graph.nodes[node][score_key] = score

    graph.add_increases(prot_a, prot_b, citation=n(), evidence=n())
    graph.add_decreases(prot_b, process_d, citation=n(), evidence=n())
    graph.add_increases(prot_a, prot_c, citation=n(), evidence=n())
    graph.add_increases(prot_c, process_d, citation=n(), evidence=n())

    mechanisms = generate_bioprocess_mechanisms(graph, score_key)

    self.assertEqual(1, len(mechanisms))
    self.assertIn(process_d, mechanisms)
def test_conversion():
    """Check that an enzyme acting on a reaction node yields a Conversion.

    PLCG1 (with activity) directly increases a reaction with one reactant
    and two products; the single resulting statement must expose them as
    ``obj_from`` and ``obj_to``.
    """
    enzyme = protein(name='PLCG1', namespace='HGNC')
    substrate = abundance(
        'SCHEM', '1-Phosphatidyl-D-myo-inositol 4,5-bisphosphate')
    product_a = abundance('SCHEM', 'Diacylglycerol')
    product_b = abundance('SCHEM', 'Inositol 1,4,5-trisphosphate')
    rxn = reaction(reactants=[substrate], products=[product_a, product_b])

    graph = pybel.BELGraph()
    graph.add_qualified_edge(
        enzyme,
        rxn,
        relation=pc.DIRECTLY_INCREASES,
        subject_modifier=activity(name='activity'),
        evidence="Some evidence.",
        citation='123456',
    )
    processor = bel.process_pybel_graph(graph)
    assert processor.statements
    assert len(processor.statements) == 1

    stmt = processor.statements[0]
    assert isinstance(stmt, Conversion)
    assert stmt.subj.name == 'PLCG1'
    assert stmt.subj.activity.activity_type == 'activity'
    assert stmt.subj.activity.is_active is True

    assert len(stmt.obj_from) == 1
    reactant_agent = stmt.obj_from[0]
    assert isinstance(reactant_agent, Agent)
    assert reactant_agent.name == ('1-Phosphatidyl-D-myo-inositol '
                                   '4,5-bisphosphate')

    assert len(stmt.obj_to) == 2
    assert stmt.obj_to[0].name == 'Diacylglycerol'
    assert stmt.obj_to[1].name == 'Inositol 1,4,5-trisphosphate'
    assert len(stmt.evidence) == 1
def get_bel() -> pybel.BELGraph:
    """Get the Rhea data.

    The Rhea RDF for the current version is queried for every reaction
    that has an equation, an id and a bidirectional counterpart, and each
    result row is added as a reaction node to a new BEL graph.
    """
    version = bioversions.get_version('rhea')

    # Load the RDF dump for this version.
    rdf_graph = BIO2BEL_MODULE.ensure_rdf('rhea', version, url=URL)

    # List all reactions in the database. The bidirectionalReaction
    # criterion ensures that only the nondirectional version of a given
    # reaction is received.
    sparql = (
        " SELECT ?reaction ?id ?reactionEquation WHERE {"
        " ?reaction rh:equation ?reactionEquation ."
        " ?reaction rh:bidirectionalReaction ?bdr ."
        " ?reaction rh:id ?id } "
    )
    results = rdf_graph.query(sparql)

    rv = pybel.BELGraph(name='rhea', version=version)

    # Results are not de-duplicated up front (converting to a set is
    # time-consuming); the PyBEL graph handles the occasional duplicate.
    for row in results:
        reaction_uri, reaction_id, reaction_equation = row
        # Look up the reactants and products of this reaction.
        participants = _participants(rdf_graph, reaction_uri)
        reactants = participants['reactants']
        products = participants['products']
        node = dsl.Reaction(
            reactants,
            products,
            namespace='RHEA',
            name=reaction_equation,
            identifier=reaction_id,
        )
        rv.add_node_from_data(node)

    return rv
def test_simple(self):
    """Generate mechanisms for a small tuple-node graph with one bioprocess.

    Three protein tuples annotated under the ``DGXP`` key feed into
    bioprocess D; exactly one candidate mechanism, keyed by D, is expected.
    """
    score_key = 'DGXP'
    prot_a = (PROTEIN, 'HGNC', 'A')
    prot_b = (PROTEIN, 'HGNC', 'B')
    prot_c = (PROTEIN, 'HGNC', 'c')
    process_d = (BIOPROCESS, 'GOBP', 'D')

    graph = pybel.BELGraph()
    for node in (prot_a, prot_b, prot_c, process_d):
        graph.add_simple_node(*node)

    # Only the proteins get a value under the data key.
    for node, score in ((prot_a, 2), (prot_b, -1), (prot_c, 1)):
        graph.node[node][score_key] = score

    edges = (
        (prot_a, prot_b, INCREASES),
        (prot_b, process_d, DECREASES),
        (prot_a, prot_c, INCREASES),
        (prot_c, process_d, INCREASES),
    )
    for source, target, relation in edges:
        graph.add_edge(source, target, attr_dict={RELATION: relation})

    mechanisms = cmpa.generate_bioprocess_mechanisms(graph, score_key)

    self.assertEqual(1, len(mechanisms))
    self.assertIn(process_d, mechanisms)
def reset_bel_graph(self):
    """Replace ``self.bel_graph`` with a newly constructed ``pybel.BELGraph``.

    The graph is built with no arguments; whatever the attribute referenced
    before is no longer referenced by this object.
    """
    fresh_graph = pybel.BELGraph()
    self.bel_graph = fresh_graph
def get_bel() -> pybel.BELGraph:
    """Get the ComPath mappings as BEL.

    Each row of the mappings data frame becomes an edge between two
    biological-process nodes: ``isPartOf`` rows become part-of edges and
    ``equivalentTo`` rows become equivalence edges.

    :raises ValueError: if a row has any other relation
    """
    graph = pybel.BELGraph(
        name='ComPath Mappings',
        version='1.1.0',
        description=(
            'Hierarchical and equivalence relations between entries in KEGG, Reactome, PathBank,'
            ' and WikiPathways.'
        ),
    )
    # Map each supported relation name to the graph method that adds it.
    edge_adders = {
        'isPartOf': graph.add_part_of,
        'equivalentTo': graph.add_equivalence,
    }

    for row in get_df().values:
        source_ns, source_id, source_name, relation, target_ns, target_id, target_name = row
        source = BiologicalProcess(
            namespace=source_ns,
            identifier=source_id,
            name=source_name,
        )
        target = BiologicalProcess(
            namespace=target_ns,
            identifier=target_id,
            name=target_name,
        )
        add_edge = edge_adders.get(relation)
        if add_edge is None:
            raise ValueError(f'invalid mapping with relation: {relation}')
        add_edge(source, target)

    return graph
def __init__(self, stmts=None, name=None, description=None, version=None,
             authors=None, contact=None, license=None, copyright=None,
             disclaimer=None):
    """Store the statements and create the BEL graph that will hold them.

    Parameters
    ----------
    stmts : Optional[list]
        Statements to keep on ``self.statements``; an empty list is used
        when None is given.
    name : Optional[str]
        Graph name; defaults to ``'indra'``.
    description : Optional[str]
        Graph description, passed through to ``pybel.BELGraph``.
    version : Optional[str]
        Graph version; defaults to a freshly generated UUID4 string.
    authors, contact, license, copyright, disclaimer : Optional[str]
        Further graph metadata, passed through to ``pybel.BELGraph``.
    """
    if stmts is None:
        self.statements = []
    else:
        self.statements = stmts

    if name is None:
        name = 'indra'
    if version is None:
        version = str(uuid.uuid4())

    # Create the model and assign metadata
    self.model = pybel.BELGraph(
        name=name,
        description=description,
        version=version,
        authors=authors,
        contact=contact,
        license=license,
        copyright=copyright,
        disclaimer=disclaimer,
    )

    # Namespace keyword -> URL of the .belns file defining it.
    # NOTE: 'PFAM' and 'NXPFA' have no URL configured here.
    ns_dict = {
        'HGNC': 'https://arty.scai.fraunhofer.de/artifactory/bel/'
                'namespace/hgnc-human-genes/hgnc-human-genes-20170725.belns',
        'UP': 'https://arty.scai.fraunhofer.de/artifactory/bel/'
              'namespace/swissprot/swissprot-20170725.belns',
        'IP': 'https://arty.scai.fraunhofer.de/artifactory/bel/'
              'namespace/interpro/interpro-20170731.belns',
        'FPLX': 'https://raw.githubusercontent.com/sorgerlab/famplex/'
                '5f5b573fe26d7405dbccb711ae8e5697b6a3ec7e/export/famplex.belns',
        'CHEBI': 'https://arty.scai.fraunhofer.de/artifactory/bel/'
                 'namespace/chebi-ids/chebi-ids-20170725.belns',
        'GO': 'https://arty.scai.fraunhofer.de/artifactory/bel/'
              'namespace/go/go-20180109.belns',
        'MESH': 'https://arty.scai.fraunhofer.de/artifactory/bel/'
                'namespace/mesh-processes/mesh-processes-20170725.belns',
    }
    self.model.namespace_url.update(ns_dict)
    # Raw string: '\d' in a plain literal is an invalid escape sequence
    # and triggers a warning on recent Python versions. The value is the same.
    self.model.namespace_pattern['PUBCHEM'] = r'\d+'
def test_regulate_amount1_prot_obj():
    """An INCREASES edge between two plain proteins gives one IncreaseAmount."""
    source = protein(name='MAP2K1', namespace='HGNC')
    target = protein(name='MAPK1', namespace='HGNC')

    graph = pybel.BELGraph()
    graph.add_qualified_edge(
        source,
        target,
        relation=pc.INCREASES,
        evidence="Some evidence.",
        citation='123456',
    )

    processor = bel.process_pybel_graph(graph)
    assert processor.statements
    assert len(processor.statements) == 1
    stmt = processor.statements[0]
    assert isinstance(stmt, IncreaseAmount)
    assert len(stmt.evidence) == 1
def test_controlled_transloc_loc_cond():
    """Controlled translocations are currently not handled.

    An edge whose object modifier is a translocation must produce no
    statements at all.
    """
    source = protein(name='MAP2K1', namespace='HGNC')
    target = protein(name='MAPK1', namespace='HGNC')
    move = translocation(
        from_loc=entity('GOCC', 'intracellular'),
        to_loc=entity('GOCC', 'extracellular space'),
    )

    graph = pybel.BELGraph()
    graph.add_qualified_edge(
        source,
        target,
        relation=pc.INCREASES,
        object_modifier=move,
        evidence="Some evidence.",
        citation='123456',
    )

    processor = bel.process_pybel_graph(graph)
    assert not processor.statements
def test_regulate_amount3_deg():
    """Increasing the degradation of a protein gives one DecreaseAmount."""
    # FIXME: Create a stability-specific statement for p->deg(p(Foo))
    source = protein(name='MAP2K1', namespace='HGNC')
    target = protein(name='MAPK1', namespace='HGNC')

    graph = pybel.BELGraph()
    graph.add_qualified_edge(
        source,
        target,
        relation=pc.INCREASES,
        object_modifier=degradation(),
        evidence="Some evidence.",
        citation='123456',
    )

    processor = bel.process_pybel_graph(graph)
    assert processor.statements
    assert len(processor.statements) == 1
    stmt = processor.statements[0]
    assert isinstance(stmt, DecreaseAmount)
    assert len(stmt.evidence) == 1
def test_activation_bioprocess():
    """A protein increasing a bioprocess gives one Activation statement."""
    source = protein(name='BAX', namespace='HGNC')
    process = bioprocess(name='apoptosis', namespace='GOBP')

    graph = pybel.BELGraph()
    graph.add_qualified_edge(
        source,
        process,
        relation=pc.INCREASES,
        evidence="Some evidence.",
        citation='123456',
    )

    processor = bel.process_pybel_graph(graph)
    assert processor.statements
    assert len(processor.statements) == 1
    stmt = processor.statements[0]
    assert isinstance(stmt, Activation)
    assert stmt.subj.name == 'BAX'
    assert stmt.obj.name == 'apoptosis'
    assert stmt.obj.db_refs == {}  # FIXME: Update when GO lookup is implemented
    assert len(stmt.evidence) == 1
def __init__(self, stmts=None, name=None, description=None, version=None,
             **kwargs):
    """Store the statements and create the BEL graph that will hold them.

    ``stmts`` is kept on ``self.statements`` (an empty list when None).
    ``name``, ``version``, ``description`` and any extra keyword arguments
    are forwarded unchanged to ``pybel.BELGraph``.
    """
    self.statements = [] if stmts is None else stmts

    # Build the model with the supplied metadata.
    graph_metadata = dict(name=name, version=version, description=description)
    self.model = pybel.BELGraph(**graph_metadata, **kwargs)
def test_gap():
    """Active RASA1 directly decreasing KRAS gtp activity gives one Gap."""
    rasa1 = protein(name='RASA1', namespace='HGNC')
    kras = protein(name='KRAS', namespace='HGNC')

    graph = pybel.BELGraph()
    graph.add_qualified_edge(
        rasa1,
        kras,
        relation=pc.DIRECTLY_DECREASES,
        subject_modifier=activity(name='activity'),
        object_modifier=activity(name='gtp'),
        evidence="Some evidence.",
        citation='123456',
    )

    processor = bel.process_pybel_graph(graph)
    assert processor.statements
    assert len(processor.statements) == 1
    stmt = processor.statements[0]
    assert isinstance(stmt, Gap)
    assert stmt.gap.name == 'RASA1'
    assert stmt.ras.name == 'KRAS'
    gap_activity = stmt.gap.activity
    assert gap_activity.activity_type == 'activity'
    assert gap_activity.is_active is True
    assert stmt.ras.activity is None
    assert len(stmt.evidence) == 1
def test_subject_transloc_loc_cond():
    """A subject translocation becomes a location condition on the subject.

    The ``to_loc`` of the translocation is the location expected on the
    subject agent of the resulting IncreaseAmount.
    """
    source = protein(name='MAP2K1', namespace='HGNC')
    target = protein(name='MAPK1', namespace='HGNC')
    move = translocation(
        from_loc=entity('GOCC', 'intracellular'),
        to_loc=entity('GOCC', 'extracellular space'),
    )

    graph = pybel.BELGraph()
    graph.add_qualified_edge(
        source,
        target,
        relation=pc.INCREASES,
        subject_modifier=move,
        evidence="Some evidence.",
        citation='123456',
    )

    processor = bel.process_pybel_graph(graph)
    assert processor.statements
    assert len(processor.statements) == 1
    stmt = processor.statements[0]
    assert isinstance(stmt, IncreaseAmount)
    assert stmt.subj.name == 'MAP2K1'
    assert stmt.subj.location == 'extracellular space'
    assert stmt.obj.name == 'MAPK1'
def test_indirect_gef_is_activation():
    """An indirect (INCREASES) SOS1 -> KRAS gtp edge is a plain Activation."""
    sos1 = protein(name='SOS1', namespace='HGNC')
    kras = protein(name='KRAS', namespace='HGNC')

    graph = pybel.BELGraph()
    graph.add_qualified_edge(
        sos1,
        kras,
        relation=pc.INCREASES,
        subject_modifier=activity(name='activity'),
        object_modifier=activity(name='gtp'),
        evidence="Some evidence.",
        citation='123456',
    )

    processor = bel.process_pybel_graph(graph)
    assert processor.statements
    assert len(processor.statements) == 1
    stmt = processor.statements[0]
    assert isinstance(stmt, Activation)
    assert stmt.subj.name == 'SOS1'
    assert stmt.obj.name == 'KRAS'
    subj_activity = stmt.subj.activity
    assert subj_activity.activity_type == 'activity'
    assert subj_activity.is_active is True
    assert stmt.obj.activity is None
    assert stmt.obj_activity == 'gtpbound'
    assert len(stmt.evidence) == 1
def test_phosphorylation_two_sites():
    """A substrate with two pmod variants gives two statements, one per site."""
    kinase = protein(name='MAP2K1', namespace='HGNC')
    sites = [
        pmod('Ph', position=185, code='Thr'),
        pmod('Ph', position=187, code='Tyr'),
    ]
    substrate = protein(name='MAPK1', namespace='HGNC', variants=sites)

    graph = pybel.BELGraph()
    graph.add_qualified_edge(
        kinase,
        substrate,
        relation=pc.DIRECTLY_INCREASES,
        evidence="Some evidence.",
        citation='123456',
    )

    processor = bel.process_pybel_graph(graph)
    assert processor.statements
    assert len(processor.statements) == 2
    first = processor.statements[0]
    second = processor.statements[1]
    assert first.residue == 'T'
    assert first.position == '185'
    assert second.residue == 'Y'
    assert second.position == '187'
    assert first.sub.mods == []
    assert second.sub.mods == []
    assert len(first.evidence) == 1
def test_gtpactivation():
    """GTP-bound KRAS directly increasing BRAF kinase gives a GtpActivation."""
    kras = protein(name='KRAS', namespace='HGNC')
    braf = protein(name='BRAF', namespace='HGNC')

    graph = pybel.BELGraph()
    graph.add_qualified_edge(
        kras,
        braf,
        relation=pc.DIRECTLY_INCREASES,
        subject_modifier=activity(name='gtp'),
        object_modifier=activity(name='kin'),
        evidence="Some evidence.",
        citation='123456',
    )

    processor = bel.process_pybel_graph(graph)
    assert processor.statements
    assert len(processor.statements) == 1
    stmt = processor.statements[0]
    assert isinstance(stmt, GtpActivation)
    assert stmt.subj.name == 'KRAS'
    subj_activity = stmt.subj.activity
    assert subj_activity.activity_type == 'gtpbound'
    assert subj_activity.is_active is True
    assert stmt.obj.name == 'BRAF'
    assert stmt.obj.activity is None
    assert stmt.obj_activity == 'kinase'
    assert len(stmt.evidence) == 1
def process_bel_stmt(bel: str, squeeze: bool = False):
    """Process a single BEL statement string with the PyBEL processor.

    Parameters
    ----------
    bel : str
        A BEL statement. See example below.
    squeeze : Optional[bool]
        If True and processing produced exactly one INDRA Statement, that
        Statement is returned on its own instead of the processor.

    Returns
    -------
    Union[Statement, PybelProcessor]
        The processor holding the INDRA Statements derived from the BEL
        statement, or the single unpacked Statement when ``squeeze`` is
        True and there is exactly one.

    Examples
    --------
    >>> from indra.sources.bel import process_bel_stmt
    >>> bel_s = 'kin(p(FPLX:MEK)) -> kin(p(FPLX:ERK))'
    >>> process_bel_stmt(bel_s, squeeze=True)
    Activation(MEK(kinase), ERK(), kinase)
    """
    parsed = pybel.parse(bel)

    # Make sure activations are in the right place: any of these fields
    # found on the source/target entry is moved onto the matching
    # *_MODIFIER entry of the parse result.
    movable_fields = (pc.MODIFIER, pc.EFFECT, pc.FROM_LOC, pc.TO_LOC,
                      pc.LOCATION)
    entity_to_modifier = (
        (pc.SOURCE, pc.SOURCE_MODIFIER),
        (pc.TARGET, pc.TARGET_MODIFIER),
    )
    for entity_key, modifier_key in entity_to_modifier:
        entity_data = parsed[entity_key]
        for field in movable_fields:
            if field not in entity_data:
                continue
            moved = entity_data.pop(field)
            parsed.setdefault(modifier_key, {})[field] = moved

    graph = pybel.BELGraph()
    add_sbel_row(graph, parsed)
    processor = process_pybel_graph(graph)

    if not squeeze or len(processor.statements) != 1:
        return processor
    return processor.statements[0]
def test_regulate_activity():
    """Check that kin(MAP2K1) increasing kin(MAPK1) gives one Activation.

    The subject must carry an active ``kinase`` ActivityCondition, while
    the object agent has no activity condition and the statement's
    ``obj_activity`` is ``kinase``.
    """
    mek = protein(name='MAP2K1', namespace='HGNC')
    erk = protein(name='MAPK1', namespace='HGNC')
    g = pybel.BELGraph()
    g.add_qualified_edge(mek, erk, relation=pc.INCREASES,
                         subject_modifier=activity(name='kin'),
                         object_modifier=activity(name='kin'),
                         evidence="Some evidence.", citation='123456')
    pbp = bel.process_pybel_graph(g)
    assert pbp.statements
    assert len(pbp.statements) == 1
    stmt = pbp.statements[0]
    assert isinstance(stmt, Activation)

    subj = stmt.subj
    assert subj.name == 'MAP2K1'
    assert isinstance(subj.activity, ActivityCondition)
    assert subj.activity.activity_type == 'kinase'
    # Identity check, matching the other tests in this file.
    assert subj.activity.is_active is True

    obj = stmt.obj
    assert obj.name == 'MAPK1'
    assert obj.activity is None
    assert stmt.obj_activity == 'kinase'
    assert len(stmt.evidence) == 1
def test_subject_transloc_active_form():
    """ActiveForm from a translocated subject uses the to-location.

    The same protein is subject (translocated) and object (kinase
    activity); the resulting ActiveForm agent must be located at the
    translocation's ``to_loc``.
    """
    translocated = protein(name='MAP2K1', namespace='HGNC')
    activated = protein(name='MAP2K1', namespace='HGNC')
    move = translocation(
        from_loc=entity('GOCC', 'intracellular'),
        to_loc=entity('GOCC', 'extracellular space'),
    )

    graph = pybel.BELGraph()
    graph.add_qualified_edge(
        translocated,
        activated,
        relation=pc.INCREASES,
        subject_modifier=move,
        object_modifier=activity(name='kin'),
        evidence="Some evidence.",
        citation='123456',
    )

    processor = bel.process_pybel_graph(graph)
    assert processor.statements
    assert len(processor.statements) == 1
    stmt = processor.statements[0]
    assert isinstance(stmt, ActiveForm)
    agent = stmt.agent
    assert agent.name == 'MAP2K1'
    assert agent.location == 'extracellular space'
    assert agent.activity is None
    assert stmt.activity == 'kinase'
    assert stmt.is_active is True
def test_active_form():
    """Phosphorylated TP53 increasing TP53 tscript gives one ActiveForm.

    The agent of the ActiveForm must carry the Ser33 phosphorylation as
    its only modification condition.
    """
    site = pmod('Ph', position=33, code='Ser')
    modified_p53 = protein(name='TP53', namespace='HGNC', variants=[site])
    plain_p53 = protein(name='TP53', namespace='HGNC')

    graph = pybel.BELGraph()
    graph.add_qualified_edge(
        modified_p53,
        plain_p53,
        relation=pc.INCREASES,
        object_modifier=activity(name='tscript'),
        evidence="Some evidence.",
        citation='123456',
    )

    processor = bel.process_pybel_graph(graph)
    assert processor.statements
    assert len(processor.statements) == 1
    stmt = processor.statements[0]
    assert isinstance(stmt, ActiveForm)
    assert stmt.activity == 'transcription'
    assert stmt.is_active is True

    agent = stmt.agent
    assert agent.name == 'TP53'
    assert len(agent.mods) == 1
    mod_condition = agent.mods[0]
    assert mod_condition.mod_type == 'phosphorylation'
    assert mod_condition.residue == 'S'
    assert mod_condition.position == '33'
    assert len(stmt.evidence) == 1
def test_complex_stmt_with_activation():
    """A complex subject yields a Complex plus an Activation statement.

    The Complex lists BRAF and MAP2K1; the Activation has BRAF (bound to
    MAP2K1) as subject and MAPK1 kinase activity as object.
    """
    braf = protein(name='BRAF', namespace='HGNC')
    map2k1 = protein(name='MAP2K1', namespace='HGNC')
    mapk1 = protein(name='MAPK1', namespace='HGNC')
    braf_map2k1 = complex_abundance([braf, map2k1])

    graph = pybel.BELGraph()
    graph.add_qualified_edge(
        braf_map2k1,
        mapk1,
        relation=pc.DIRECTLY_INCREASES,
        object_modifier=activity(name='kin'),
        evidence="Some evidence.",
        citation='123456',
    )

    processor = bel.process_pybel_graph(graph)
    assert processor.statements
    assert len(processor.statements) == 2

    complex_stmt = processor.statements[0]
    assert isinstance(complex_stmt, Complex)
    assert len(complex_stmt.agent_list()) == 2
    member_names = sorted(ag.name for ag in complex_stmt.agent_list())
    assert member_names == ['BRAF', 'MAP2K1']
    assert complex_stmt.evidence

    activation_stmt = processor.statements[1]
    assert isinstance(activation_stmt, Activation)
    assert activation_stmt.subj.name == 'BRAF'
    assert activation_stmt.subj.bound_conditions[0].agent.name == 'MAP2K1'
    assert activation_stmt.obj.name == 'MAPK1'
    assert activation_stmt.obj.activity is None
    assert activation_stmt.obj_activity == 'kinase'
def get_bel() -> pybel.BELGraph:
    """Get the HMDD data.

    Every distinct miRNA and disease label in the table is grounded first.
    Rows whose miRNA and disease both grounded are then added as
    ``regulates`` edges, with the row's PMID as citation and its
    description as evidence.
    """
    # Columns: category, mir, disease, pmid, description
    df = pd.read_csv(
        ensure_path(PREFIX, URL),
        sep='\t',
        dtype=str,
        encoding="ISO-8859-1",
    )

    def _ground_disease(term):
        """Ground a disease label against the disease vocabularies."""
        return pyobo.ground(['mondo', 'doid', 'efo', 'hp', 'mesh'], term)

    # --- miRNAs ---------------------------------------------------------
    mirnas = df['mir'].unique()
    mirna_to_dsl = {}
    failed_mirnas = 0
    mirna_bar = tqdm(mirnas, desc='mapping miRNA names')
    for label in mirna_bar:
        _, identifier, name = pyobo.ground('mirbase', label)
        if identifier is None:
            mirna_bar.write(f'[mirbase] could not ground: {label}')
            failed_mirnas += 1
            continue
        mirna_to_dsl[label] = pybel.dsl.MicroRna(
            namespace='mirbase',
            identifier=identifier,
            name=name,
        )
    logger.info(f'failed on {failed_mirnas}/{len(mirnas)} miRNAs')

    # --- diseases -------------------------------------------------------
    diseases = df['disease'].unique()
    disease_to_dsl = {}
    failed_diseases = 0
    disease_bar = tqdm(diseases, desc='mapping disease names')
    for label in disease_bar:
        prefix, identifier, name = _ground_disease(label)
        if identifier is None and ', ' in label:
            # Retry with the text around the first ", " swapped.
            left, _, right = label.partition(', ')
            swapped = f'{right} {left}'
            prefix, identifier, name = _ground_disease(swapped)
            if identifier is None and ', ' in swapped:
                # Last resort: reverse all comma-separated parts.
                parts = label.split(',')[::-1]
                reversed_label = ' '.join(part.strip() for part in parts)
                prefix, identifier, name = _ground_disease(reversed_label)

        if identifier is None:
            disease_bar.write(f'could not ground {label}')
            failed_diseases += 1
            continue
        disease_to_dsl[label] = pybel.dsl.Pathology(
            namespace=prefix,
            identifier=identifier,
            name=name,
        )
    logger.info(f'failed on {failed_diseases}/{len(diseases)} diseases')

    # --- edges ----------------------------------------------------------
    rv = pybel.BELGraph(name='HMDD', version=VERSION)
    for _category, mir, disease, pmid, description in df.values:
        source = mirna_to_dsl.get(mir)
        target = disease_to_dsl.get(disease)
        # Rows with an ungrounded end are skipped.
        if source and target:
            rv.add_regulates(
                source,
                target,
                citation=pmid,
                evidence=description,
            )
    return rv
def get_graph(identifier: str, *, rows: Optional[int] = None) -> pybel.BELGraph:
    """Build a new BEL graph around a GO term and its descendants.

    A fresh graph is created, handed to ``enrich_graph`` together with
    ``identifier`` and ``rows``, and then returned.
    """
    rv = pybel.BELGraph()
    enrich_graph(rv, identifier, rows=rows)
    return rv
def get_similarity_graph(
    *,
    fullgraph=DEFAULT_FULLGRAPH_WITHOUT_CHEMSIM_PICKLE,
    rebuild: bool = False,
    mapping_file=DEFAULT_CHEMICALS_MAPPING_PATH,
    chemsim_graph_path=DEFAULT_CHEMSIM_PICKLE,
    clustered: bool = True,
    similarity=0.7,
    name='Chemical Similarity Graph',
    version='1.1.0',
    authors='',
    contact='',
    description='',
):
    """Create a BELGraph with chemicals as nodes, and similarity as edges.

    :param fullgraph: a BELGraph, or the path of a pickled BELGraph, whose
        ``pubchem.compound`` nodes are the chemicals to compare
    :param rebuild: if False and ``chemsim_graph_path`` already exists, the
        pickled graph stored there is loaded and returned
    :param mapping_file: an existing dataframe with pubchemIDs and Smiles;
        newly looked-up SMILES are appended to it
    :param chemsim_graph_path: where the resulting graph is pickled
    :param clustered: if True, connect chemicals that fall in the same
        cluster; otherwise connect pairs whose similarity reaches
        ``similarity``
    :param similarity: the percent in which the chemicals are similar
        (only used when ``clustered`` is False)
    :param name: name of the resulting graph
    :param version: version of the resulting graph
    :param authors: authors of the resulting graph
    :param contact: contact for the resulting graph
    :param description: description of the resulting graph
    :return: the chemical similarity BELGraph
    """
    # The graph is written below with pybel.to_pickle to chemsim_graph_path,
    # so the cached copy is read back the same way and from the same path.
    if not rebuild and os.path.exists(chemsim_graph_path):
        return pybel.from_pickle(chemsim_graph_path)

    if isinstance(fullgraph, pybel.struct.graph.BELGraph):
        fullgraph_without_chemsim = fullgraph
    else:
        fullgraph_without_chemsim = pybel.from_pickle(fullgraph)

    pubchem_ids = [
        node.identifier
        for node in fullgraph_without_chemsim.nodes()
        if node.namespace == 'pubchem.compound'
    ]

    if os.path.exists(mapping_file):
        chemicals_mapping = pd.read_csv(
            mapping_file,
            sep="\t",
            dtype={
                'PubchemID': str,
                'Smiles': str
            },
            index_col=False,
        )
        pubchem_id_to_smiles = {}
        new_chemicals = []
        smiles = []
        for pubchem_id in tqdm(pubchem_ids, desc="Getting SMILES"):
            known = chemicals_mapping.loc[
                chemicals_mapping["PubchemID"] == pubchem_id, "Smiles"]
            if known.empty:
                # Not in the mapping file yet: look it up and remember it.
                chemical_smiles = cid_to_smiles(pubchem_id)
                if not isinstance(chemical_smiles, str):
                    chemical_smiles = chemical_smiles.decode("utf-8")
                pubchem_id_to_smiles[pubchem_id] = chemical_smiles
                new_chemicals.append(pubchem_id)
                smiles.append(chemical_smiles)
            else:
                pubchem_id_to_smiles[pubchem_id] = known.iloc[0]
        new_df = pd.DataFrame({"PubchemID": new_chemicals, "Smiles": smiles})
        # DataFrame.append was removed in pandas 2.0; concat is equivalent.
        chemicals_mapping = pd.concat([chemicals_mapping, new_df])
        chemicals_mapping.to_csv(mapping_file, sep='\t', index=False)
    else:
        pubchem_id_to_smiles = get_smiles(pubchem_ids)

    pubchem_id_to_fingerprint = get_fingerprints(pubchem_id_to_smiles)

    chemsim_graph = pybel.BELGraph(
        name=name,
        version=version,
        description=description,
        authors=authors,
        contact=contact,
    )

    if clustered:
        clustered_df = cluster_chemicals(
            rebuild=True, chemicals_dict=pubchem_id_to_fingerprint)
        clusters = clustered_df['Cluster'].unique().tolist()
        for cluster in tqdm(clusters, desc='Creating similarity BELGraph'):
            chemicals = clustered_df.loc[clustered_df.Cluster == cluster]
            if len(chemicals) == 1:
                continue
            for ind, row in chemicals.iterrows():
                for ind1, row1 in chemicals.iterrows():
                    if row['PubchemID'] == row1['PubchemID']:
                        continue
                    chemical_01 = pybel.dsl.Abundance(
                        namespace='pubchem.compound',
                        identifier=row['PubchemID'])
                    chemical_02 = pybel.dsl.Abundance(
                        namespace='pubchem.compound',
                        identifier=row1['PubchemID'])
                    # One edge per unordered pair, whichever direction
                    # was added first.
                    if (chemsim_graph.has_edge(chemical_01, chemical_02)
                            or chemsim_graph.has_edge(chemical_02,
                                                      chemical_01)):
                        continue
                    chemsim_graph.add_unqualified_edge(
                        chemical_01, chemical_02, 'association')
    else:
        similarities = get_similarity(pubchem_id_to_fingerprint)
        for (source_pubchem_id, target_pubchem_id), sim in tqdm(
                similarities.items(), desc='Creating similarity BELGraph'):
            if sim < similarity:
                continue
            chemsim_graph.add_unqualified_edge(
                pybel.dsl.Abundance(namespace=PUBCHEM_NAMESPACE,
                                    identifier=source_pubchem_id),
                pybel.dsl.Abundance(namespace=PUBCHEM_NAMESPACE,
                                    identifier=target_pubchem_id),
                'association',
            )

    pybel.to_pickle(chemsim_graph, chemsim_graph_path)
    return chemsim_graph
CITATION_TYPE: evidence['citation']['type'], CITATION_REFERENCE: evidence['citation']['id'] } annotation_map = { 'tissue': 'Tissue', 'disease': 'Disease', 'species_common_name': 'Species' } species_map = {'human': '9606', 'rat': '10116', 'mouse': '10090'} annotation_value_map = {'Species': species_map} graph = pybel.BELGraph() parser = pybel.parser.BelParser(graph) for edge in res['graph']['edges']: for evidence in edge['metadata']['evidences']: if 'citation' not in evidence or not evidence['citation']: continue d = {} if 'biological_context' in evidence: annotations = evidence['biological_context'] if annotations['tissue']: d['Tissue'] = annotations['tissue']