Example #1
0
def build_nodes(hetio: Dict, **kwargs) -> List[Node]:
    """Create ``Node`` objects for every entry of ``hetio["nodes"]``.

    Unless ``force_rebuild`` is truthy, an existing checkpoint file at
    ``NODES_CHECKPOINT`` is deserialized and returned without building
    anything.

    Args:
        hetio: Mapping with a ``"nodes"`` list. Each entry must provide
            ``"identifier"``, ``"name"``, ``"kind"`` and a ``"data"`` mapping
            (presumably a parsed Hetionet dump -- confirm against caller).
        **kwargs: Optional settings read by key:
            ``force_rebuild`` (default ``False``) skips the checkpoint lookup;
            ``save_checkpoint`` (default ``True``) serializes the result to
            ``NODES_CHECKPOINT``;
            ``umls`` / ``do`` (default ``None``), when given, are handed to
            ``add_compound_metadata`` / ``add_disease_metadata`` respectively.

    Returns:
        The list of nodes, either loaded from the checkpoint or freshly
        built.
    """
    use_checkpoint = not kwargs.get("force_rebuild", False)
    if use_checkpoint and os.path.exists(NODES_CHECKPOINT):
        return Node.deserialize_bunch(NODES_CHECKPOINT)
    if use_checkpoint:
        log.info("Node checkpoint does not exist, building nodes.")

    records = hetio["nodes"]
    built = []
    for record in records:
        identifier, name, kind = (
            record["identifier"], record["name"], record["kind"]
        )
        meta = record["data"]
        # Prefer the singular "source" entry; fall back to "sources" (or an
        # empty list) when it is missing or falsy.
        provenance = meta.get("source", None) or meta.get("sources", [])
        built.append(
            Node(identifier, name, kind, provenance,
                 meta.get("license", None), meta.get("url", None))
        )

    assert len(built) == len(records)

    umls = kwargs.get("umls", None)
    if umls is not None:
        built = add_compound_metadata(hetio, built, umls)

    do = kwargs.get("do", None)
    if do is not None:
        built = add_disease_metadata(hetio, built, do)

    if kwargs.get("save_checkpoint", True):
        log.info("Checkpointing nodes...")
        Node.serialize_bunch(built, NODES_CHECKPOINT)

    return built
Example #2
0
def build_nodes(repodb: pd.DataFrame, **kwargs) -> List[Node]:
    """Build compound and disease nodes from a RepoDB table.

    Unless ``force_rebuild`` is truthy, an existing checkpoint file at
    ``NODES_CHECKPOINT`` is deserialized and returned without building
    anything.

    Args:
        repodb: Table with ``drug_id``, ``drug_name``, ``ind_id`` and
            ``ind_name`` columns.
        **kwargs: Optional settings read by key:
            ``force_rebuild`` (default ``False``) skips the checkpoint lookup;
            ``save_checkpoint`` (default ``True``) serializes the result to
            ``NODES_CHECKPOINT``.

    Returns:
        One ``"Compound"`` node per distinct ``drug_id`` followed by one
        ``"Disease"`` node per distinct ``ind_id``, each group in order of
        first appearance in ``repodb``.

    Raises:
        AssertionError: If an identifier is null or is associated with
            anything other than exactly one distinct name.
    """
    force_rebuild = kwargs.get("force_rebuild", False)
    save_checkpoint = kwargs.get("save_checkpoint", True)

    if not force_rebuild:
        if os.path.exists(NODES_CHECKPOINT):
            return Node.deserialize_bunch(NODES_CHECKPOINT)
        log.info("Node checkpoint does not exist, building nodes.")

    nodes = _repodb_nodes_for_kind(repodb, "drug_id", "drug_name", "Compound")
    nodes.extend(
        _repodb_nodes_for_kind(repodb, "ind_id", "ind_name", "Disease")
    )

    log.info(f"Built {len(nodes)} nodes from RepoDB.")

    if save_checkpoint:
        log.info("Checkpointing nodes...")
        Node.serialize_bunch(nodes, NODES_CHECKPOINT)

    return nodes


def _repodb_nodes_for_kind(repodb: pd.DataFrame, id_col: str, name_col: str,
                           kind: str) -> List[Node]:
    """Return one ``kind`` node per distinct ``id_col`` value, first-seen order."""
    # Collect the distinct names of every identifier in one grouped pass
    # instead of re-filtering the whole frame once per identifier.
    names_by_id = (
        repodb.groupby(id_col, sort=False)[name_col].unique().to_dict()
    )

    nodes = []
    for node_id in repodb[id_col].unique():
        # groupby drops null identifiers, so they fall through to the empty
        # default here and are rejected by the check below.
        names = names_by_id.get(node_id, ())
        if len(names) != 1:
            # Raised explicitly (not via `assert`) so the check still runs
            # under `python -O`; the exception type is kept for callers.
            raise AssertionError(
                f"Expected exactly one {name_col} for {id_col}={node_id!r}, "
                f"found {len(names)}."
            )
        nodes.append(Node(node_id, names[0], kind=kind, sources=["RepoDB"]))
    return nodes