def build_nodes(hetio: Dict, **kwargs) -> List[Node]:
    """Return the nodes described by ``hetio["nodes"]``, using a checkpoint when allowed.

    Recognised keyword arguments:
        force_rebuild: when falsy (default ``False``) and ``NODES_CHECKPOINT``
            exists on disk, the nodes are deserialized from it and returned
            immediately, without touching ``hetio``.
        save_checkpoint: when truthy (default ``True``) the freshly built
            nodes are serialized to ``NODES_CHECKPOINT`` before returning.
        umls: if not ``None``, forwarded to ``add_compound_metadata``.
        do: if not ``None``, forwarded to ``add_disease_metadata``.

    Each entry of ``hetio["nodes"]`` must provide the keys ``"identifier"``,
    ``"name"``, ``"kind"`` and ``"data"``; a missing key raises ``KeyError``.
    """
    rebuild_requested = kwargs.get("force_rebuild", False)
    should_checkpoint = kwargs.get("save_checkpoint", True)

    if not rebuild_requested:
        if os.path.exists(NODES_CHECKPOINT):
            return Node.deserialize_bunch(NODES_CHECKPOINT)
        log.info("Node checkpoint does not exist, building nodes.")

    nodes = []
    for record in hetio["nodes"]:
        identifier = record["identifier"]
        label = record["name"]
        category = record["kind"]
        extra = record["data"]
        # Prefer a truthy "source" entry; otherwise fall back to "sources"
        # (an empty list when neither is present).
        provenance = extra.get("source", None) or extra.get("sources", [])
        nodes.append(
            Node(
                identifier,
                label,
                category,
                provenance,
                extra.get("license", None),
                extra.get("url", None),
            )
        )
    assert len(nodes) == len(hetio["nodes"])

    umls_arg = kwargs.get("umls", None)
    do_arg = kwargs.get("do", None)
    if umls_arg is not None:
        nodes = add_compound_metadata(hetio, nodes, umls_arg)
    if do_arg is not None:
        nodes = add_disease_metadata(hetio, nodes, do_arg)

    if should_checkpoint:
        log.info("Checkpointing nodes...")
        Node.serialize_bunch(nodes, NODES_CHECKPOINT)

    return nodes
def _repodb_nodes(repodb: pd.DataFrame, id_col: str, name_col: str, kind: str) -> List[Node]:
    """Build one ``Node`` per distinct identifier found in ``repodb[id_col]``.

    Nodes are returned in order of each identifier's first appearance.

    Raises:
        AssertionError: if an identifier is missing (NaN/None) or is paired
            with more than one distinct value in ``name_col``.
    """
    # A single pass yields every distinct (identifier, name) pair in order of
    # first appearance, instead of re-filtering the whole frame per identifier.
    pairs = repodb[[id_col, name_col]].drop_duplicates()
    ids = pairs[id_col]

    # Raised explicitly (rather than via ``assert``) so the check survives
    # ``python -O``; AssertionError is kept so existing callers see the same type.
    if ids.isna().any():
        raise AssertionError(f"RepoDB column {id_col!r} contains missing identifiers")
    conflicting = ids[ids.duplicated()]
    if not conflicting.empty:
        raise AssertionError(
            f"RepoDB identifier {conflicting.iloc[0]!r} in column {id_col!r} "
            f"maps to more than one value in column {name_col!r}"
        )

    # A fresh ``sources`` list per node so nodes never share a mutable list.
    return [
        Node(node_id, node_name, kind=kind, sources=["RepoDB"])
        for node_id, node_name in zip(ids.to_numpy(), pairs[name_col].to_numpy())
    ]


def build_nodes(repodb: pd.DataFrame, **kwargs) -> List[Node]:
    """Return Compound and Disease nodes for a RepoDB table, using a checkpoint when allowed.

    ``repodb`` must have the columns ``drug_id``, ``drug_name``, ``ind_id``
    and ``ind_name``. One "Compound" node is created per distinct ``drug_id``
    and one "Disease" node per distinct ``ind_id``; all compound nodes come
    first, each group in order of first appearance.

    Recognised keyword arguments:
        force_rebuild: when falsy (default ``False``) and ``NODES_CHECKPOINT``
            exists on disk, the nodes are deserialized from it and returned
            immediately, without reading ``repodb``.
        save_checkpoint: when truthy (default ``True``) the freshly built
            nodes are serialized to ``NODES_CHECKPOINT`` before returning.

    Raises:
        AssertionError: if any identifier is missing or is associated with
            more than one distinct name.
    """
    force_rebuild = kwargs.get("force_rebuild", False)
    save_checkpoint = kwargs.get("save_checkpoint", True)

    if not force_rebuild:
        if os.path.exists(NODES_CHECKPOINT):
            return Node.deserialize_bunch(NODES_CHECKPOINT)
        log.info("Node checkpoint does not exist, building nodes.")

    nodes = _repodb_nodes(repodb, "drug_id", "drug_name", "Compound")
    nodes.extend(_repodb_nodes(repodb, "ind_id", "ind_name", "Disease"))
    log.info(f"Built {len(nodes)} nodes from RepoDB.")

    if save_checkpoint:
        log.info("Checkpointing nodes...")
        Node.serialize_bunch(nodes, NODES_CHECKPOINT)

    return nodes