def sdf_to_pt_custom(src_root, dst_root, dataset_name, geometry="qm"):
    """
    Convert the SDF files of a custom dataset into one collated PyTorch Geometric file.

    Reads ``<dataset_name>_target.csv`` and ``<dataset_name>_extra_target.pt`` from ``src_root``, builds one
    data object per CSV row from ``<dataset_name>_data/<f_name>.<geometry>.sdf`` and saves the collated
    result as ``<dataset_name>_raw_<geometry>.pt`` inside ``dst_root``.

    :param src_root: folder holding the target CSV, the extra-target file and the SDF sub-folder
    :param dst_root: folder the collated ``.pt`` file is written to
    :param dataset_name: prefix shared by the input files and the output file
    :param geometry: tag used in the SDF file names and in the output file name
    :return: None
    """
    # Fixed settings forwarded to my_pre_transform for every molecule.
    edge_kwargs = dict(edge_version="cutoff", do_sort_edge=True, cal_efg=False, cutoff=10.0,
                       boundary_factor=100., use_center=True, mol=None, cal_3body_term=False,
                       bond_atom_sep=False, record_long_range=True)

    extra_target = torch.load(osp.join(src_root, "{}_extra_target.pt".format(dataset_name)))
    target_csv = pd.read_csv(osp.join(src_root, "{}_target.csv".format(dataset_name)))

    sdf_dir = osp.join(src_root, "{}_data".format(dataset_name))
    f_names = target_csv["f_name"].values.reshape(-1).tolist()

    processed = []
    for row in tqdm(range(target_csv.shape[0])):
        # Row `row` of the CSV, entry `row` of the dipole container and the SDF named after
        # the row's f_name are treated as describing the same molecule.
        sdf_path = osp.join(sdf_dir, "{}.{}.sdf".format(f_names[row], geometry))
        mol_info = Gauss16Info(qm_sdf=sdf_path,
                               dipole=extra_target["dipole"][row],
                               prop_dict_raw=target_csv.iloc[row].to_dict())
        processed.append(my_pre_transform(mol_info.get_torch_data(), **edge_kwargs))

    out_file = osp.join(dst_root, "{}_raw_{}.pt".format(dataset_name, geometry))
    torch.save(torch_geometric.data.InMemoryDataset.collate(processed), out_file)
def sdf_to_pt(n_heavy, src_root, dst_root, geometry="qm"):
    """
    Preprocess one Frag20 subset into PyTorch Geometric format.

    Reads ``Frag20_<n_heavy>_target.csv`` and ``Frag20_<n_heavy>_extra_target.pt`` from ``src_root``,
    builds one data object per target row from the matching SDF file and saves the collated result as
    ``frag20_<n_heavy>_<geometry>_raw.pt`` inside ``dst_root``.

    :param n_heavy: number used in the Frag20 file names; values below 10 additionally read
        ``Frag20_<n_heavy>_index.csv`` and look for SDF files inside a per-source sub-folder
    :param src_root: folder holding the CSV files, the extra-target file and the SDF folder
    :param dst_root: folder the collated ``.pt`` file is written to
    :param geometry: ``"qm"`` selects ``<idx>.opt.sdf`` files, anything else selects ``<idx>.sdf``
    :return: None
    """
    # Fixed settings forwarded to my_pre_transform for every molecule.
    edge_kwargs = dict(edge_version="cutoff", do_sort_edge=True, cal_efg=False, cutoff=10.0,
                       boundary_factor=100., use_center=True, mol=None, cal_3body_term=False,
                       bond_atom_sep=False, record_long_range=True)

    extra_target = torch.load(osp.join(src_root, "Frag20_{}_extra_target.pt".format(n_heavy)))
    target_csv = pd.read_csv(osp.join(src_root, "Frag20_{}_target.csv".format(n_heavy)))

    suffix = ".opt" if geometry == "qm" else ""
    sdf_dir = osp.join(src_root, "Frag20_{}_data".format(n_heavy))

    if n_heavy < 10:
        # These subsets keep their SDF files in one sub-folder per source, listed in the index CSV.
        index_csv = pd.read_csv(osp.join(src_root, "Frag20_{}_index.csv".format(n_heavy)))
        mol_ids = index_csv["idx"].values.reshape(-1).tolist()
        mol_sources = index_csv["source"].values.reshape(-1).tolist()
        sdf_paths = []
        for mol_id, mol_source in zip(mol_ids, mol_sources):
            sdf_paths.append(
                osp.join(sdf_dir, "{}".format(mol_source), "{}{}.sdf".format(mol_id, suffix)))
    else:
        mol_ids = target_csv["idx_name"].values.reshape(-1).tolist()
        sdf_paths = [osp.join(sdf_dir, "{}{}.sdf".format(mol_id, suffix)) for mol_id in mol_ids]

    processed = []
    progress = tqdm(range(target_csv.shape[0]), desc="processing heavy: {}".format(n_heavy))
    for row in progress:
        mol_info = Gauss16Info(qm_sdf=sdf_paths[row],
                               dipole=extra_target["dipole"][row],
                               prop_dict_raw=target_csv.iloc[row].to_dict())
        processed.append(my_pre_transform(mol_info.get_torch_data(), **edge_kwargs))

    out_file = osp.join(dst_root, "frag20_{}_{}_raw.pt".format(n_heavy, geometry))
    torch.save(torch_geometric.data.InMemoryDataset.collate(processed), out_file)
def infuse_energy(sol_csv, dataset_p, dataset_root):
    """
    Attach per-molecule values from a CSV to an existing dataset and save the result with a split.

    Every dataset entry whose integer ``f_name`` appears in the CSV index gets the available energy
    columns set as tensor attributes, is passed through ``my_pre_transform`` and is kept; entries
    missing from the CSV are dropped. The kept entries are collated and written to
    ``<dataset_root>/processed/<stem>.pt`` and the split dictionary to
    ``<dataset_root>/processed/split_<stem>.pt``, where ``<stem>`` is ``dataset_p`` up to its first dot.

    :param sol_csv: CSV file with an ``f_name`` column, optional energy columns and an optional
        ``group`` column; rows without ``group`` are assigned to ``test``
    :param dataset_p: dataset file name handed to ``DummyIMDataset``
    :param dataset_root: dataset root folder handed to ``DummyIMDataset``
    :return: None
    """
    energy_keys = ("gasEnergy", "watEnergy", "octEnergy", "CalcSol", "CalcOct", "watOct", "activity",
                   "CalcLogP")
    # Fixed settings forwarded to my_pre_transform for every kept molecule.
    edge_kwargs = dict(edge_version="cutoff", do_sort_edge=True, cal_efg=False, cutoff=10.0,
                       boundary_factor=100., use_center=True, mol=None, cal_3body_term=False,
                       bond_atom_sep=False, record_long_range=True)

    dataset = DummyIMDataset(root=dataset_root, dataset_name=dataset_p)
    sol_df = pd.read_csv(sol_csv).set_index("f_name")

    kept = []
    split = {"train_index": [], "valid_index": [], "test_index": []}
    reported_missing = set()
    for position in range(len(dataset)):
        sample = dataset[position]
        sample_id = int(sample["f_name"])
        if sample_id not in sol_df.index:
            continue

        row = sol_df.loc[sample_id]
        for key in energy_keys:
            if key in row:
                setattr(sample, key, torch.as_tensor(row[key]))
            elif key not in reported_missing:
                # Report each absent column only once.
                print(f"{key} not present in csv file!")
                reported_missing.add(key)

        group = row["group"] if "group" in row else "test"
        # Split indices refer to positions in the filtered list, not in the source dataset.
        split[f"{group}_index"].append(len(kept))
        kept.append(my_pre_transform(sample, **edge_kwargs))

    collated = torch_geometric.data.InMemoryDataset.collate(kept)
    stem = dataset_p.split(".")[0]
    processed_dir = osp.join(dataset_root, "processed")
    torch.save(collated, osp.join(processed_dir, stem + ".pt"))
    torch.save(split, osp.join(processed_dir, "split_" + stem + ".pt"))
def convert_pt():
    """
    Convert the lipophilicity MMFF SDF files into a collated dataset plus a split file.

    Reads ``lipop.csv`` (columns ``idx_name``, ``activity`` and ``group``), loads
    ``raw/lipop_sdfs/<idx_name>.mmff.sdf`` for every row, attaches the row's activity and group under
    the ``dd_target`` property, and saves the collated data to ``processed/lipop_mmff.pt`` and the
    split dictionary to ``processed/lipop_split.pt``. All paths are relative to the working directory.

    :return: None
    """
    sdf_folder = "raw/lipop_sdfs"
    target = pd.read_csv("lipop.csv")
    dst = "processed/lipop_mmff.pt"
    split_dst = "processed/lipop_split.pt"

    # Fixed settings forwarded to my_pre_transform for every molecule.
    edge_kwargs = dict(edge_version="cutoff", do_sort_edge=True, cal_efg=False, cutoff=10.0,
                       boundary_factor=100., use_center=True, mol=None, cal_3body_term=False,
                       bond_atom_sep=False, record_long_range=True)

    # Take each molecule's labels from the row being iterated (positional access). Looking them up
    # with the idx_name value as a label only hits the right row when idx_name is exactly
    # 0..n-1 in file order. `.values` keeps NumPy scalars, so tensor dtypes stay as before.
    activities = target["activity"].values
    groups = target["group"].values

    data_list = []
    for row, idx_name in enumerate(tqdm(target.idx_name)):
        sdf_f = osp.join(sdf_folder, "{}.mmff.sdf".format(idx_name))
        more_target = {"activity": torch.as_tensor(activities[row]).view(-1),
                       "group": np.array([groups[row]], dtype=object).reshape(-1)}
        info = Gauss16Info(qm_sdf=sdf_f, prop_dict_raw={"dd_target": more_target})
        data_list.append(my_pre_transform(info.get_torch_data(), **edge_kwargs))
    torch.save(torch_geometric.data.InMemoryDataset.collate(data_list), dst)

    # NOTE(review): the split stores idx_name values (as pandas Series), unchanged from before.
    # They match positions in the collated dataset only if idx_name equals the row position -
    # confirm against the consumer of lipop_split.pt.
    split = {"train_index": target.idx_name[groups == "train"],
             "valid_index": target.idx_name[groups == "valid"],
             "test_index": target.idx_name[groups == "test"]}
    torch.save(split, split_dst)
def preprocess_frag20_sol():
    """
    Build the Frag20-Sol dataset file, a success map and a train/valid/test split file.

    Reads three CSV files of calculated solvation values, looks up every row's molecule in the
    Frag20 (9 to 20) or CSD20 target tables, loads the molecule's SDF file, attaches the solvation
    values under the ``dd_target`` property and collates everything into
    ``frag20_sol_<geometry>_cutoff-10.pt`` inside ``save_root``. Rows whose SDF file does not exist
    are skipped and recorded as 0 in the success map, which is saved as ``success_map.pt`` in the
    current working directory. All input/output locations and the geometry are hard-coded.

    :return: None
    :raises ValueError: if ``geometry`` is not one of ``"qm"``, ``"mmff"`` or ``"mmff_gen"``
    """
    geometry = "mmff_gen"
    dd_csv_folder = "/scratch/projects/yzlab/group/temp_dd/solvation/calculated/"
    # NOTE(review): the training table is read from "all.csv" - presumably it holds only the
    # training rows despite its name; confirm.
    train_csv = pd.read_csv(osp.join(dd_csv_folder, "all.csv"))
    valid_csv = pd.read_csv(osp.join(dd_csv_folder, "valid.csv"))
    test_csv = pd.read_csv(osp.join(dd_csv_folder, "test.csv"))
    # concatenate them in this order
    concat_csv = pd.concat([train_csv, valid_csv, test_csv], ignore_index=True)
    jl_root = "/ext3"
    # Per-subset extra targets and target tables, keyed by the number in the file name (9..20).
    extra_info_heavy = {
        i: torch.load(osp.join(jl_root, "Frag20_{}_extra_target.pt".format(i)))
        for i in range(9, 21)
    }
    tgt_info_heavy = {
        i: pd.read_csv(osp.join(jl_root, "Frag20_{}_target.csv".format(i)))
        for i in range(9, 21)
    }
    # different naming for different geometries
    frag20_ext = ".opt" if geometry == "qm" else ""
    cccd_ext = ".opt" if geometry == "qm" else "_min"
    ccdc_root = "/scratch/sx801/data/CSD20/CSD20/CSD20_data"
    ccdc_extra_target = torch.load("/ext3/CSD20_extra_target.pt")
    ccdc_target = pd.read_csv("/ext3/CSD20_target.csv")
    save_root = "/scratch/sx801/data/Frag20-Sol"
    os.makedirs(save_root, exist_ok=True)
    data_list = []
    # One entry per row of concat_csv: 1 if the row was converted, 0 if it was skipped.
    success_map = []
    for i in tqdm(range(concat_csv.shape[0])):
        this_id = int(concat_csv["ID"].iloc[i])
        this_source = concat_csv["SourceFile"].iloc[i]
        if geometry in ["qm", "mmff", "mmff_gen"]:
            if this_source == "ccdc":
                # Boolean row mask selecting this molecule in the CSD20 target table; the same
                # mask is reused to pick the matching dipole entry.
                mask = (ccdc_target["idx_name"] == this_id).values.reshape(-1)
                tgt_dict = ccdc_target.loc[mask].iloc[0].to_dict()
                sdf_file = osp.join(ccdc_root, "{}{}.sdf".format(this_id, cccd_ext))
                dipole = ccdc_extra_target["dipole"][mask]
            else:
                # Any non-"ccdc" source is either the literal "less10" (mapped to subset 9)
                # or a string that parses as the subset number.
                n_heavy = 9 if this_source == "less10" else int(this_source)
                mask = (tgt_info_heavy[n_heavy]["idx_name"] == this_id
                        ).values.reshape(-1)
                tgt_dict = tgt_info_heavy[n_heavy].loc[mask].iloc[0].to_dict()
                if n_heavy > 9:
                    sdf_file = osp.join(jl_root, "Frag20_{}_data".format(n_heavy),
                                        "{}{}.sdf".format(this_id, frag20_ext))
                else:
                    # Subset 9 files are looked up inside a "pubchem" sub-folder.
                    sdf_file = osp.join(jl_root, "Frag20_{}_data".format(n_heavy), "pubchem",
                                        "{}{}.sdf".format(this_id, frag20_ext))
                dipole = extra_info_heavy[n_heavy]["dipole"][mask]
        else:
            raise ValueError("invalid geometry: " + geometry)
        if geometry == "mmff_gen":
            # For "mmff_gen" the SDF path chosen above is replaced by a file named after the
            # row position `i` in concat_csv, not after `this_id`.
            # NOTE(review): confirm the files in /ext3/mmff_sdfs are numbered by concat_csv row.
            sdf_file = osp.join("/ext3/mmff_sdfs/{}.mmff.sdf".format(i))
            # NOTE(review): the source formatting was lost; the existence check is assumed to
            # belong to this branch. With geometry fixed to "mmff_gen" either nesting behaves
            # the same.
            if not osp.exists(sdf_file):
                success_map.append(0)
                continue
        tmp = {}
        # NOTE(review): "calcLogP" starts with a lower-case "c" while infuse_energy in this file
        # reads "CalcLogP" - presumably the two CSV schemas differ; confirm.
        for name in [
                "gasEnergy", "watEnergy", "octEnergy", "CalcSol", "CalcOct", "calcLogP"
        ]:
            tmp[name] = torch.as_tensor(concat_csv[name].iloc[i]).view(-1)
        tgt_dict["dd_target"] = tmp
        this_info = Gauss16Info(qm_sdf=sdf_file, dipole=dipole, prop_dict_raw=tgt_dict)
        data = this_info.get_torch_data()
        data_edge = my_pre_transform(data, edge_version="cutoff", do_sort_edge=True, cal_efg=False, cutoff=10.0,
                                     boundary_factor=100., use_center=True, mol=None, cal_3body_term=False,
                                     bond_atom_sep=False, record_long_range=True)
        data_list.append(data_edge)
        success_map.append(1)
    print("collating and saving...")
    torch.save(
        torch_geometric.data.InMemoryDataset.collate(data_list),
        osp.join(save_root, "frag20_sol_{}_cutoff-10.pt".format(geometry)))
    success_map = torch.as_tensor(success_map).long().view(-1)
    print("Success: {}/{}".format(success_map.sum(), success_map.shape[0]))
    # Written relative to the current working directory, unlike the other outputs.
    torch.save(success_map, "success_map.pt")
    train_size = train_csv.shape[0]
    valid_size = valid_csv.shape[0]
    test_size = test_csv.shape[0]
    # NOTE(review): the split ranges count rows of concat_csv, while the collated data only
    # contains rows with success_map == 1. They line up only if no row was skipped -
    # presumably success_map.pt is used downstream to reconcile them; confirm.
    torch.save(
        {
            "train_index": torch.arange(train_size),
            "valid_index": torch.arange(train_size, train_size + valid_size),
            "test_index": torch.arange(train_size + valid_size, train_size + valid_size + test_size)
        }, osp.join(save_root, "frag20_sol_split_{}_03222021.pt".format(geometry)))