def HIV_CV(DB, data_type, list_ID, list_y, list_SMILES, dict_id2smile, n_folds):
    """Assign the molecules of a binary-labelled dataset to cross-validation folds.

    Depending on ``data_type`` the fold assignment is computed by:

    * ``'kernel'``   -- ``Khierarchical_cluster`` applied, separately for the
      samples labelled 0 and the samples labelled 1, to the matching
      sub-block of the molecule kernel ``K``;
    * ``'features'`` -- ``Xkmeans_cluster`` applied, separately for each of the
      two labels, to the matching rows of the feature matrix ``X``;
    * ``'standard'`` -- a shuffled ``StratifiedKFold`` (``random_state=92``).

    ``K`` / ``X`` are cached on disk as ``data/<DB>/<DB>_K.npy`` /
    ``data/<DB>/<DB>_X.npy``; they are built from ``list_SMILES`` (and saved)
    only when the cache file is missing.

    Side effects: prints the fold sizes and the per-fold label counts, and
    writes the per-fold label counts to ``data/<DB>/<DB>_folds.txt``.

    Parameters
    ----------
    DB : str
        Dataset name, used to build every path under ``data/<DB>/``.
    data_type : str
        One of ``'kernel'``, ``'features'`` or ``'standard'``.
    list_ID, dict_id2smile
        Not used by this function.
    list_y
        Labels; compared elementwise with 0 / 1 and indexed with integer
        arrays, so a NumPy array is expected (presumably 1-D -- confirm
        against the caller).
    list_SMILES
        Passed to ``mol_build_K`` / ``mol_build_X`` when the cache is missing.
    n_folds : int
        Number of folds / clusters requested.

    Returns
    -------
    list of numpy.ndarray
        One array of sample indices per distinct assignment label, ordered by
        the first appearance of each label in the assignment vector (NOT
        sorted by label value).

    Raises
    ------
    ValueError
        If ``data_type`` is not one of the supported values.
    """
    # Fail early and explicitly: previously an unsupported data_type fell
    # through every branch and crashed later on an unbound local.
    if data_type not in ('kernel', 'features', 'standard'):
        raise ValueError("unsupported data_type: %r" % (data_type,))

    prefix = 'data/' + DB + '/' + DB

    if data_type == 'kernel':
        if not os.path.isfile(prefix + '_K.npy'):
            K = mol_build_K(list_SMILES)
            np.save(prefix + '_K', K)  # np.save appends the '.npy' suffix
        else:
            K = np.load(prefix + '_K.npy')
        list_assignment = np.zeros(K.shape[0])
        # Cluster each class on its own so every class is spread over folds.
        for y in [0, 1]:
            indices = np.where(list_y == y)[0]
            K_local = K[indices, :][:, indices]
            list_assignment[indices] = Khierarchical_cluster(K_local, n_folds)
    else:
        # 'features' and 'standard' share the same cached feature matrix.
        if not os.path.isfile(prefix + '_X.npy'):
            X = mol_build_X(list_SMILES)
            np.save(prefix + '_X', X)
        else:
            X = np.load(prefix + '_X.npy')
        list_assignment = np.zeros(X.shape[0])
        if data_type == 'features':
            for y in [0, 1]:
                indices = np.where(list_y == y)[0]
                list_assignment[indices] = Xkmeans_cluster(X[indices, :], n_folds)
        else:
            skf = model_selection.StratifiedKFold(n_folds, shuffle=True,
                                                  random_state=92)
            for ifold, (_, test_index) in enumerate(skf.split(X, list_y)):
                list_assignment[test_index] = ifold

    c = collections.Counter(list_assignment)
    print(c)
    # Counter keeps insertion order, so folds follow first appearance of labels.
    folds = [np.where(list_assignment == cle)[0] for cle in c]

    # The context manager guarantees the report is flushed and closed.
    with open(prefix + '_folds.txt', 'w') as fo:
        for ifold in range(n_folds):
            fold_counts = collections.Counter(list_y[folds[ifold]])
            fo.write("ifold" + str(ifold) + '\n')
            fo.write(str(fold_counts) + '\n')
            print(ifold, fold_counts)
            fo.write('\n')
    return folds
def AtomizationEnergy_CV(DB, data_type, list_ID, list_y, list_SMILES, dict_id2smile, n_folds):
    """Assign the molecules of a dataset to cross-validation folds (no stratification).

    Depending on ``data_type`` the fold assignment is:

    * ``'kernel'``   -- the output of ``Khierarchical_cluster`` on the full
      molecule kernel ``K``;
    * ``'features'`` -- the output of ``Xkmeans_cluster`` on the full feature
      matrix ``X``;
    * ``'standard'`` -- a shuffled ``KFold`` (``random_state=92``).

    ``K`` / ``X`` are cached on disk as ``data/<DB>/<DB>_K.npy`` /
    ``data/<DB>/<DB>_X.npy``; they are built from ``list_SMILES`` (and saved)
    only when the cache file is missing.

    Side effects: prints the fold sizes and writes them to
    ``data/<DB>/<DB>_folds.txt``.

    Parameters
    ----------
    DB : str
        Dataset name, used to build every path under ``data/<DB>/``.
    data_type : str
        One of ``'kernel'``, ``'features'`` or ``'standard'``.
    list_ID, list_y, dict_id2smile
        Not used by this function.
    list_SMILES
        Passed to ``mol_build_K`` / ``mol_build_X`` when the cache is missing.
    n_folds : int
        Number of folds / clusters requested.

    Returns
    -------
    list of numpy.ndarray
        One array of sample indices per distinct assignment label, ordered by
        the first appearance of each label in the assignment vector.

    Raises
    ------
    ValueError
        If ``data_type`` is not one of the supported values.
    """
    # Fail early and explicitly: previously an unsupported data_type fell
    # through every branch and crashed later on an unbound local.
    if data_type not in ('kernel', 'features', 'standard'):
        raise ValueError("unsupported data_type: %r" % (data_type,))

    prefix = 'data/' + DB + '/' + DB

    if data_type == 'kernel':
        if not os.path.isfile(prefix + '_K.npy'):
            K = mol_build_K(list_SMILES)
            np.save(prefix + '_K', K)  # np.save appends the '.npy' suffix
        else:
            K = np.load(prefix + '_K.npy')
        list_assignment = Khierarchical_cluster(K, n_folds)
    else:
        # 'features' and 'standard' share the same cached feature matrix.
        if not os.path.isfile(prefix + '_X.npy'):
            X = mol_build_X(list_SMILES)
            np.save(prefix + '_X', X)
        else:
            X = np.load(prefix + '_X.npy')
        if data_type == 'features':
            list_assignment = Xkmeans_cluster(X, n_folds)
        else:
            list_assignment = np.zeros(X.shape[0])
            skf = model_selection.KFold(n_folds, shuffle=True, random_state=92)
            for ifold, (_, test_index) in enumerate(skf.split(X)):
                list_assignment[test_index] = ifold

    c = collections.Counter(list_assignment)
    print(c)
    # Counter keeps insertion order, so folds follow first appearance of labels.
    folds = [np.where(list_assignment == cle)[0] for cle in c]

    # The context manager closes the report even if the write fails.
    with open(prefix + '_folds.txt', 'w') as fo:
        fo.write(str(c) + '\n')
    return folds
def PCBA_CV(DB, data_type, list_ID, list_y, list_SMILES, dict_id2smile, n_folds):
    """Assign the molecules of a PCBA-like dataset to cross-validation folds.

    Depending on ``data_type`` the fold assignment is computed by:

    * ``'kernel'``   -- ``Khierarchical_cluster`` on the molecule kernel ``K``:
      on the whole kernel when ``DB == 'PCBA'``, otherwise separately on the
      sub-blocks of the samples labelled 0 and labelled 1;
    * ``'features'`` -- ``Xkmeans_cluster`` on the feature matrix ``X``: on the
      whole matrix when ``DB == 'PCBA'``, otherwise separately per label;
    * ``'standard'`` -- ``list_ID`` and ``list_y`` are re-read from
      ``data/<DB>/<DB>_list_ID.data`` / ``..._list_y.data`` (the arguments are
      ignored), then a shuffled ``KFold`` is used when ``DB`` is one of
      ``'PCBA'``, ``'PCBA10'``, ``'PCBA100'`` and a shuffled
      ``StratifiedKFold`` otherwise (both with ``random_state=92``).

    ``K`` / ``X`` are cached on disk as ``data/<DB>/<DB>_K.npy`` /
    ``data/<DB>/<DB>_X.npy``; they are built from ``list_SMILES`` (and saved)
    only when the cache file is missing.

    Side effects: prints the fold sizes and per-fold label counts and writes
    the per-fold label counts to ``data/<DB>/<DB>_folds.txt``. For the three
    PCBA datasets the counts are reported per column of ``list_y``.

    Parameters
    ----------
    DB : str
        Dataset name, used to build every path under ``data/<DB>/``.
    data_type : str
        One of ``'kernel'``, ``'features'`` or ``'standard'``.
    list_ID
        Overwritten from disk in the ``'standard'`` branch; unused otherwise.
    list_y
        Labels as a NumPy array; indexed as 2-D (``list_y[rows, iclass]``)
        for the three PCBA datasets. Overwritten from disk in the
        ``'standard'`` branch.
    list_SMILES
        Passed to ``mol_build_K`` / ``mol_build_X`` when the cache is missing.
    dict_id2smile
        Not used by this function.
    n_folds : int
        Number of folds / clusters requested.

    Returns
    -------
    list of numpy.ndarray
        One array of sample indices per distinct assignment label, ordered by
        the first appearance of each label in the assignment vector.

    Raises
    ------
    ValueError
        If ``data_type`` is not one of the supported values.
    """
    # Fail early and explicitly: previously an unsupported data_type fell
    # through every branch and crashed later on an unbound local.
    if data_type not in ('kernel', 'features', 'standard'):
        raise ValueError("unsupported data_type: %r" % (data_type,))

    prefix = 'data/' + DB + '/' + DB
    is_pcba_family = DB in ['PCBA', 'PCBA10', 'PCBA100']

    if data_type == 'kernel':
        if not os.path.isfile(prefix + '_K.npy'):
            K = mol_build_K(list_SMILES)
            np.save(prefix + '_K', K)  # np.save appends the '.npy' suffix
        else:
            K = np.load(prefix + '_K.npy')
        # NOTE(review): only 'PCBA' (not 'PCBA10'/'PCBA100') takes the
        # unstratified path here, unlike the 'standard' branch -- confirm
        # this is intended.
        if DB == 'PCBA':
            # Agglomerative clustering on the whole kernel.
            list_assignment = Khierarchical_cluster(K, n_folds)
        else:
            list_assignment = np.zeros(K.shape[0])
            for y in [0, 1]:
                indices = np.where(list_y == y)[0]
                K_local = K[indices, :][:, indices]
                list_assignment[indices] = Khierarchical_cluster(K_local, n_folds)
    elif data_type == 'features':
        if not os.path.isfile(prefix + '_X.npy'):
            X = mol_build_X(list_SMILES)
            np.save(prefix + '_X', X)
        else:
            X = np.load(prefix + '_X.npy')
        # NOTE(review): same 'PCBA'-only test as in the kernel branch.
        if DB == 'PCBA':
            list_assignment = Xkmeans_cluster(X, n_folds)
        else:
            list_assignment = np.zeros(X.shape[0])
            for y in [0, 1]:
                indices = np.where(list_y == y)[0]
                list_assignment[indices] = Xkmeans_cluster(X[indices, :], n_folds)
    else:
        # NOTE: pickle can execute arbitrary code on load; these files must
        # come from a trusted source.
        with open(prefix + '_list_ID.data', 'rb') as f_id:
            list_ID = pickle.load(f_id)
        with open(prefix + '_list_y.data', 'rb') as f_y:
            list_y = np.array(pickle.load(f_y))
        # The splitters only need the number of samples, so a dummy
        # single-column matrix stands in for the real features.
        X = np.zeros((len(list_ID), 1))
        list_assignment = np.zeros(X.shape[0])
        if not is_pcba_family:
            skf = model_selection.StratifiedKFold(n_folds, shuffle=True,
                                                  random_state=92)
            splits = skf.split(X, list_y)
        else:
            skf = model_selection.KFold(n_folds, shuffle=True, random_state=92)
            splits = skf.split(X)
        for ifold, (_, test_index) in enumerate(splits):
            list_assignment[test_index] = ifold

    c = collections.Counter(list_assignment)
    print(c)
    # Counter keeps insertion order, so folds follow first appearance of labels.
    folds = [np.where(list_assignment == cle)[0] for cle in c]

    # The context manager guarantees the report is flushed and closed.
    with open(prefix + '_folds.txt', 'w') as fo:
        for ifold in range(n_folds):
            fo.write("ifold" + str(ifold) + '\n')
            if is_pcba_family:
                # One count line per label column.
                for iclass in range(list_y.shape[1]):
                    line = ("iclass " + str(iclass) + ' ' +
                            str(collections.Counter(list_y[folds[ifold], iclass])))
                    fo.write(line + '\n')
                    print(line)
            else:
                fold_counts = collections.Counter(list_y[folds[ifold]])
                fo.write(str(fold_counts) + '\n')
                print(ifold, fold_counts)
            fo.write('\n')
    return folds
open('data/' + DB + '/' + DB + '_Kprot.data', 'rb')) elif DB in LIST_PROT_DATASETS + LIST_AA_DATASETS: list_ID = pickle.load( open('data/' + DB + '/' + DB + '_list_ID.data', 'rb')) dict_prot2ind = {prot: ind for ind, prot in enumerate(list_ID)} Kprot = pickle.load( open('data/' + DB + '/' + DB + '_Kprot.data', 'rb')) Kmol, dict_mol2ind = None, None elif DB in LIST_MOL_DATASETS: if DB not in ['HIV']: Kmol = pickle.load( open('data/' + DB + '/' + DB + '_Kmol.data', 'rb')) else: list_SMILES = pickle.load( open('data/' + DB + '/' + DB + '_list_SMILES.data', 'rb')) Kmol = mol_build_K(list_SMILES) Kprot, dict_prot2ind = None, None list_ID = pickle.load( open('data/' + DB + '/' + DB + '_list_ID.data', 'rb')) dict_mol2ind = {mol: ind for ind, mol in enumerate(list_ID)} nfolds, seed = 5, 92 x_tr, y_tr, x_val, y_val, x_te, y_te = \ get_fold_data(DB, nfolds, fold_val, fold_te, setting, ratio_tr, ratio_te) # import pdb; pdb.Pdb().set_trace() K_tr, K_val, K_te = \ get_kernels(DB, x_tr, x_te, x_val, Kmol, Kprot, dict_prot2ind, dict_mol2ind) if args.cv_val: list_perf = cv(DB, K_tr, y_tr, K_val, y_val)