Example #1
0
def sdf_to_pt_custom(src_root, dst_root, dataset_name, geometry="qm"):
    """
    Convert a custom SDF dataset into one collated PyTorch Geometric file.

    Reads ``<src_root>/<dataset_name>_extra_target.pt`` and
    ``<src_root>/<dataset_name>_target.csv``, builds one data object per CSV
    row from ``<src_root>/<dataset_name>_data/<f_name>.<geometry>.sdf`` and
    saves the collated result as
    ``<dst_root>/<dataset_name>_raw_<geometry>.pt``.

    :param src_root: folder holding the target CSV, the extra-target file and
        the ``<dataset_name>_data`` SDF folder
    :param dst_root: folder the collated ``.pt`` file is written to
    :param dataset_name: prefix shared by the input files and the output file
    :param geometry: tag embedded in the SDF file names and in the output name
    :return: None
    """
    extra_target = torch.load(
        osp.join(src_root, "{}_extra_target.pt".format(dataset_name)))
    target_csv = pd.read_csv(
        osp.join(src_root, "{}_target.csv".format(dataset_name)))

    # One SDF file per CSV row, named after the row's "f_name" entry.
    names = target_csv["f_name"].values.reshape(-1).tolist()
    sdf_dir = osp.join(src_root, "{}_data".format(dataset_name))

    # Edge-construction options passed unchanged to every molecule.
    edge_kwargs = dict(edge_version="cutoff",
                       do_sort_edge=True,
                       cal_efg=False,
                       cutoff=10.0,
                       boundary_factor=100.,
                       use_center=True,
                       mol=None,
                       cal_3body_term=False,
                       bond_atom_sep=False,
                       record_long_range=True)

    data_list = []
    for row in tqdm(range(target_csv.shape[0])):
        sdf_path = osp.join(sdf_dir,
                            "{}.{}.sdf".format(names[row], geometry))
        mol_info = Gauss16Info(qm_sdf=sdf_path,
                               dipole=extra_target["dipole"][row],
                               prop_dict_raw=target_csv.iloc[row].to_dict())
        data_list.append(
            my_pre_transform(mol_info.get_torch_data(), **edge_kwargs))

    out_file = osp.join(dst_root,
                        "{}_raw_{}.pt".format(dataset_name, geometry))
    torch.save(torch_geometric.data.InMemoryDataset.collate(data_list),
               out_file)
Example #2
0
def sdf_to_pt(n_heavy, src_root, dst_root, geometry="qm"):
    """
    Preprocess one Frag20 subset into PyTorch Geometric format.

    Reads ``Frag20_<n_heavy>_extra_target.pt`` and
    ``Frag20_<n_heavy>_target.csv`` from ``src_root``, builds one data object
    per CSV row and saves the collated result as
    ``<dst_root>/frag20_<n_heavy>_<geometry>_raw.pt``.

    :param n_heavy: heavy-atom label of the subset; values below 10 use the
        ``Frag20_<n_heavy>_index.csv`` file and per-source sub-folders
    :param src_root: folder holding the Frag20 CSV/PT files and SDF folders
    :param dst_root: folder the collated ``.pt`` file is written to
    :param geometry: ``"qm"`` selects the ``.opt.sdf`` files, anything else
        the plain ``.sdf`` files; also embedded in the output name
    :return: None
    """
    extra_target = torch.load(
        osp.join(src_root, "Frag20_{}_extra_target.pt".format(n_heavy)))
    target_csv = pd.read_csv(
        osp.join(src_root, "Frag20_{}_target.csv".format(n_heavy)))

    if geometry == "qm":
        suffix = ".opt"
    else:
        suffix = ""

    data_dir = osp.join(src_root, "Frag20_{}_data".format(n_heavy))
    if n_heavy < 10:
        # Small fragments: files live in one sub-folder per source, and the
        # names come from the separate index CSV.
        index_csv = pd.read_csv(
            osp.join(src_root, "Frag20_{}_index.csv".format(n_heavy)))
        indexes = index_csv["idx"].values.reshape(-1).tolist()
        sources = index_csv["source"].values.reshape(-1).tolist()
        sdf = [
            osp.join(data_dir, "{}".format(src),
                     "{}{}.sdf".format(idx, suffix))
            for idx, src in zip(indexes, sources)
        ]
    else:
        indexes = target_csv["idx_name"].values.reshape(-1).tolist()
        sdf = [
            osp.join(data_dir, "{}{}.sdf".format(idx, suffix))
            for idx in indexes
        ]

    # Edge-construction options passed unchanged to every molecule.
    edge_kwargs = dict(edge_version="cutoff",
                       do_sort_edge=True,
                       cal_efg=False,
                       cutoff=10.0,
                       boundary_factor=100.,
                       use_center=True,
                       mol=None,
                       cal_3body_term=False,
                       bond_atom_sep=False,
                       record_long_range=True)

    data_list = []
    progress = tqdm(range(target_csv.shape[0]),
                    "processing heavy: {}".format(n_heavy))
    for row in progress:
        mol_info = Gauss16Info(qm_sdf=sdf[row],
                               dipole=extra_target["dipole"][row],
                               prop_dict_raw=target_csv.iloc[row].to_dict())
        data_list.append(
            my_pre_transform(mol_info.get_torch_data(), **edge_kwargs))

    out_file = osp.join(dst_root,
                        "frag20_{}_{}_raw.pt".format(n_heavy, geometry))
    torch.save(torch_geometric.data.InMemoryDataset.collate(data_list),
               out_file)
Example #3
0
def infuse_energy(sol_csv, dataset_p, dataset_root):
    """
    Attach per-molecule values from a CSV to a dataset and save it with a split.

    Every dataset entry whose integer ``f_name`` appears in the CSV (indexed
    by its ``f_name`` column) receives the available energy/activity columns
    as tensor attributes, is passed through ``my_pre_transform`` and kept.
    Entries missing from the CSV are dropped. The collated dataset and the
    split dictionary are written to ``<dataset_root>/processed``.

    :param sol_csv: path of the CSV file holding the values to attach
    :param dataset_p: dataset file name; the part before the first ``.`` names
        the output files
    :param dataset_root: dataset root folder passed to ``DummyIMDataset``
    :return: None
    """
    dataset = DummyIMDataset(root=dataset_root, dataset_name=dataset_p)
    sol_df = pd.read_csv(sol_csv).set_index("f_name")

    wanted_keys = ("gasEnergy", "watEnergy", "octEnergy", "CalcSol",
                   "CalcOct", "watOct", "activity", "CalcLogP")
    # Edge-construction options passed unchanged to every molecule.
    edge_kwargs = dict(edge_version="cutoff",
                       do_sort_edge=True,
                       cal_efg=False,
                       cutoff=10.0,
                       boundary_factor=100.,
                       use_center=True,
                       mol=None,
                       cal_3body_term=False,
                       bond_atom_sep=False,
                       record_long_range=True)

    split = {"train_index": [], "valid_index": [], "test_index": []}
    already_warned = set()
    data_list = []
    for i in range(len(dataset)):
        mol = dataset[i]
        mol_id = int(mol["f_name"])
        if mol_id not in sol_df.index:
            continue

        row = sol_df.loc[mol_id]
        for key in wanted_keys:
            if key in row:
                setattr(mol, key, torch.as_tensor(row[key]))
            elif key not in already_warned:
                # Report each missing column only once.
                print(f"{key} not present in csv file!")
                already_warned.add(key)

        # Rows without a "group" entry default to the test set. The stored
        # index is the position the molecule will take in the output dataset.
        group = row["group"] if "group" in row else "test"
        split[f"{group}_index"].append(len(data_list))

        data_list.append(my_pre_transform(mol, **edge_kwargs))

    collated = torch_geometric.data.InMemoryDataset.collate(data_list)
    stem = dataset_p.split(".")[0]
    processed_dir = osp.join(dataset_root, "processed")
    torch.save(collated, osp.join(processed_dir, stem + ".pt"))
    torch.save(split, osp.join(processed_dir, "split_" + stem + ".pt"))
Example #4
0
def convert_pt():
    """
    Convert the lipophilicity MMFF SDF files into a collated dataset and split.

    Reads ``lipop.csv`` (columns ``idx_name``, ``activity``, ``group``), builds
    one data object per row from ``raw/lipop_sdfs/<idx_name>.mmff.sdf``, saves
    the collated dataset to ``processed/lipop_mmff.pt`` and the
    train/valid/test split to ``processed/lipop_split.pt``.

    The values of each row are paired positionally, so the activity and group
    always belong to the same row as the ``idx_name`` that selects the SDF
    file. The split stores row positions, which are the positions of the
    molecules in the collated dataset. Both give the same result as a lookup
    by ``idx_name`` when ``idx_name`` is simply 0..N-1.

    :return: None
    """
    sdf_folder = "raw/lipop_sdfs"
    target = pd.read_csv("lipop.csv")
    dst = "processed/lipop_mmff.pt"
    split_dst = "processed/lipop_split.pt"

    data_list = []
    # Iterate over the raw column arrays so each value keeps its numpy dtype.
    rows = zip(target["idx_name"].values, target["activity"].values,
               target["group"].values)
    for idx_name, activity, group_name in tqdm(rows, total=target.shape[0]):
        sdf_f = osp.join(sdf_folder, "{}.mmff.sdf".format(idx_name))
        more_target = {"activity": torch.as_tensor(activity).view(-1),
                       "group": np.array([group_name], dtype=object).reshape(-1)}
        info = Gauss16Info(qm_sdf=sdf_f, prop_dict_raw={"dd_target": more_target})
        data_edge = my_pre_transform(info.get_torch_data(), edge_version="cutoff", do_sort_edge=True, cal_efg=False,
                                     cutoff=10.0, boundary_factor=100., use_center=True, mol=None, cal_3body_term=False,
                                     bond_atom_sep=False, record_long_range=True)
        data_list.append(data_edge)

    torch.save(torch_geometric.data.InMemoryDataset.collate(data_list), dst)

    # data_list follows CSV row order, so a row position is a dataset index.
    # The split is kept as pandas Series, the type this function has always
    # stored, so existing readers of the file continue to work.
    group = target["group"].values
    row_pos = pd.Series(np.arange(target.shape[0]), index=target.index)
    split = {"train_index": row_pos[group == "train"],
             "valid_index": row_pos[group == "valid"],
             "test_index": row_pos[group == "test"]}
    torch.save(split, split_dst)
Example #5
0
def preprocess_frag20_sol():
    """
    Build the Frag20-Sol dataset file, its success map and its split file.

    For every row of the concatenated train/valid/test solvation tables, the
    matching target row and dipole are looked up in the CSD20 tables (source
    ``"ccdc"``) or in the Frag20 tables of the corresponding heavy-atom count.
    The row's SDF file is converted into a data object carrying the solvation
    values under ``"dd_target"``. Rows whose SDF file does not exist are
    skipped and recorded as 0 in the success map.

    Outputs: the collated dataset and the split file under ``save_root``, and
    ``success_map.pt`` in the current working directory.

    NOTE(review): the split indices span every row of the concatenated table,
    while the saved dataset only contains rows whose SDF file exists.
    Presumably consumers combine the split with ``success_map.pt``; confirm.

    :return: None
    :raises ValueError: if ``geometry`` is not one of the supported tags
    """
    geometry = "mmff_gen"
    dd_csv_folder = "/scratch/projects/yzlab/group/temp_dd/solvation/calculated/"
    # NOTE(review): the training table is read from "all.csv"; confirm that
    # this is the intended file.
    train_csv, valid_csv, test_csv = (
        pd.read_csv(osp.join(dd_csv_folder, csv_name))
        for csv_name in ("all.csv", "valid.csv", "test.csv"))
    # concatenate them in this order
    concat_csv = pd.concat([train_csv, valid_csv, test_csv], ignore_index=True)

    jl_root = "/ext3"
    heavy_counts = range(9, 21)
    extra_info_heavy = {}
    for n in heavy_counts:
        extra_info_heavy[n] = torch.load(
            osp.join(jl_root, "Frag20_{}_extra_target.pt".format(n)))
    tgt_info_heavy = {}
    for n in heavy_counts:
        tgt_info_heavy[n] = pd.read_csv(
            osp.join(jl_root, "Frag20_{}_target.csv".format(n)))

    # different naming for different geometries
    if geometry == "qm":
        frag20_ext = ".opt"
        cccd_ext = ".opt"
    else:
        frag20_ext = ""
        cccd_ext = "_min"

    ccdc_root = "/scratch/sx801/data/CSD20/CSD20/CSD20_data"
    ccdc_extra_target = torch.load("/ext3/CSD20_extra_target.pt")
    ccdc_target = pd.read_csv("/ext3/CSD20_target.csv")

    save_root = "/scratch/sx801/data/Frag20-Sol"
    os.makedirs(save_root, exist_ok=True)

    sol_names = ("gasEnergy", "watEnergy", "octEnergy", "CalcSol", "CalcOct",
                 "calcLogP")
    # Edge-construction options passed unchanged to every molecule.
    edge_kwargs = dict(edge_version="cutoff",
                       do_sort_edge=True,
                       cal_efg=False,
                       cutoff=10.0,
                       boundary_factor=100.,
                       use_center=True,
                       mol=None,
                       cal_3body_term=False,
                       bond_atom_sep=False,
                       record_long_range=True)

    data_list = []
    success_map = []
    for row in tqdm(range(concat_csv.shape[0])):
        mol_id = int(concat_csv["ID"].iloc[row])
        source = concat_csv["SourceFile"].iloc[row]
        if geometry not in ["qm", "mmff", "mmff_gen"]:
            raise ValueError("invalid geometry: " + geometry)

        if source == "ccdc":
            mask = (ccdc_target["idx_name"] == mol_id).values.reshape(-1)
            tgt_dict = ccdc_target.loc[mask].iloc[0].to_dict()
            sdf_file = osp.join(ccdc_root,
                                "{}{}.sdf".format(mol_id, cccd_ext))
            dipole = ccdc_extra_target["dipole"][mask]
        else:
            # "less10" rows are looked up in the tables loaded under key 9.
            n_heavy = 9 if source == "less10" else int(source)
            heavy_table = tgt_info_heavy[n_heavy]
            mask = (heavy_table["idx_name"] == mol_id).values.reshape(-1)
            tgt_dict = heavy_table.loc[mask].iloc[0].to_dict()
            path_parts = [jl_root, "Frag20_{}_data".format(n_heavy)]
            if n_heavy <= 9:
                # The key-9 files sit in an extra "pubchem" sub-folder.
                path_parts.append("pubchem")
            path_parts.append("{}{}.sdf".format(mol_id, frag20_ext))
            sdf_file = osp.join(*path_parts)
            dipole = extra_info_heavy[n_heavy]["dipole"][mask]

        if geometry == "mmff_gen":
            # Generated MMFF files are named by the row position in the
            # concatenated table, not by the molecule ID.
            sdf_file = "/ext3/mmff_sdfs/{}.mmff.sdf".format(row)

        if not osp.exists(sdf_file):
            success_map.append(0)
            continue

        tgt_dict["dd_target"] = {
            name: torch.as_tensor(concat_csv[name].iloc[row]).view(-1)
            for name in sol_names
        }

        mol_info = Gauss16Info(qm_sdf=sdf_file,
                               dipole=dipole,
                               prop_dict_raw=tgt_dict)
        data_list.append(
            my_pre_transform(mol_info.get_torch_data(), **edge_kwargs))
        success_map.append(1)

    print("collating and saving...")
    torch.save(
        torch_geometric.data.InMemoryDataset.collate(data_list),
        osp.join(save_root, "frag20_sol_{}_cutoff-10.pt".format(geometry)))

    success_map = torch.as_tensor(success_map).long().view(-1)
    print("Success: {}/{}".format(success_map.sum(), success_map.shape[0]))
    torch.save(success_map, "success_map.pt")

    # Consecutive index ranges in concatenation order: train, valid, test.
    train_end = train_csv.shape[0]
    valid_end = train_end + valid_csv.shape[0]
    test_end = valid_end + test_csv.shape[0]
    split = {
        "train_index": torch.arange(train_end),
        "valid_index": torch.arange(train_end, valid_end),
        "test_index": torch.arange(valid_end, test_end)
    }
    torch.save(
        split,
        osp.join(save_root,
                 "frag20_sol_split_{}_03222021.pt".format(geometry)))