コード例 #1
0
def HIV_CV(DB, data_type, list_ID, list_y, list_SMILES, dict_id2smile, n_folds):
    """Split the samples of dataset ``DB`` into cross-validation folds.

    The strategy is selected by ``data_type``:

    * ``'kernel'``   -- for each label in ``{0, 1}``, the sub-kernel of the
      samples carrying that label is passed to ``Khierarchical_cluster`` and
      the result is used as the fold assignment of those samples.
    * ``'features'`` -- same per-label scheme, using ``Xkmeans_cluster`` on
      the rows of the feature matrix.
    * ``'standard'`` -- shuffled ``StratifiedKFold`` (``random_state=92``).

    The kernel / feature matrix is cached as ``data/<DB>/<DB>_K.npy`` or
    ``data/<DB>/<DB>_X.npy``; it is built from ``list_SMILES`` only when the
    cache file is missing. A per-fold label count is printed and written to
    ``data/<DB>/<DB>_folds.txt``.

    Args:
        DB: dataset name, used to build every file path.
        data_type: ``'kernel'``, ``'features'`` or ``'standard'``.
        list_ID: unused here; kept for interface compatibility.
        list_y: labels; must support elementwise ``==`` and indexing with an
            integer array (e.g. a 1-D numpy array).
        list_SMILES: SMILES strings handed to ``mol_build_K`` / ``mol_build_X``.
        dict_id2smile: unused here; kept for interface compatibility.
        n_folds: number of folds / clusters requested.

    Returns:
        A list of integer index arrays, one per distinct assignment value,
        ordered by first appearance of that value in the assignment vector
        (so ``folds[i]`` is not guaranteed to hold assignment value ``i``).

    Raises:
        ValueError: if ``data_type`` is not one of the supported modes.
    """
    if data_type not in ('kernel', 'features', 'standard'):
        raise ValueError(
            "data_type must be 'kernel', 'features' or 'standard', got %r"
            % (data_type,))

    prefix = 'data/' + DB + '/' + DB

    def cached_matrix(suffix, build):
        # Load the cached matrix if present, otherwise build it and cache it.
        path = prefix + suffix + '.npy'
        if os.path.isfile(path):
            return np.load(path)
        matrix = build()
        np.save(path, matrix)
        return matrix

    if data_type == 'kernel':
        K = cached_matrix('_K', lambda: mol_build_K(list_SMILES))
        list_assignment = np.zeros(K.shape[0])
        for y in [0, 1]:
            indices = np.where(list_y == y)[0]
            # Restrict the kernel to the samples of this class (rows, then columns).
            K_local = K[indices, :][:, indices]
            list_assignment[indices] = Khierarchical_cluster(K_local, n_folds)

    elif data_type == 'features':
        X = cached_matrix('_X', lambda: mol_build_X(list_SMILES))
        list_assignment = np.zeros(X.shape[0])
        for y in [0, 1]:
            indices = np.where(list_y == y)[0]
            list_assignment[indices] = Xkmeans_cluster(X[indices, :], n_folds)

    else:  # 'standard'
        X = cached_matrix('_X', lambda: mol_build_X(list_SMILES))
        list_assignment = np.zeros(X.shape[0])
        skf = model_selection.StratifiedKFold(n_folds, shuffle=True, random_state=92)
        for ifold, (_, test_index) in enumerate(skf.split(X, list_y)):
            list_assignment[test_index] = ifold

    c = collections.Counter(list_assignment)
    print(c)
    folds = [np.where(list_assignment == cle)[0] for cle in c]

    # The context manager guarantees the summary is flushed and closed even if
    # a fold lookup below fails.
    with open(prefix + '_folds.txt', 'w') as fo:
        for ifold in range(n_folds):
            label_counts = collections.Counter(list_y[folds[ifold]])
            fo.write("ifold" + str(ifold) + '\n')
            fo.write(str(label_counts) + '\n')
            print(ifold, label_counts)
            fo.write('\n')

    return folds
コード例 #2
0
def AtomizationEnergy_CV(DB, data_type, list_ID, list_y, list_SMILES,
                         dict_id2smile, n_folds):
    """Split the samples of dataset ``DB`` into cross-validation folds.

    Unlike the classification variants, no label stratification is done:

    * ``'kernel'``   -- ``Khierarchical_cluster`` on the full kernel matrix.
    * ``'features'`` -- ``Xkmeans_cluster`` on the full feature matrix.
    * ``'standard'`` -- shuffled ``KFold`` (``random_state=92``).

    The kernel / feature matrix is cached as ``data/<DB>/<DB>_K.npy`` or
    ``data/<DB>/<DB>_X.npy``; it is built from ``list_SMILES`` only when the
    cache file is missing. The fold-size counter is printed and written to
    ``data/<DB>/<DB>_folds.txt``.

    Args:
        DB: dataset name, used to build every file path.
        data_type: ``'kernel'``, ``'features'`` or ``'standard'``.
        list_ID: unused here; kept for interface compatibility.
        list_y: unused here; kept for interface compatibility.
        list_SMILES: SMILES strings handed to ``mol_build_K`` / ``mol_build_X``.
        dict_id2smile: unused here; kept for interface compatibility.
        n_folds: number of folds / clusters requested.

    Returns:
        A list of integer index arrays, one per distinct assignment value,
        ordered by first appearance of that value in the assignment vector.

    Raises:
        ValueError: if ``data_type`` is not one of the supported modes.
    """
    if data_type not in ('kernel', 'features', 'standard'):
        raise ValueError(
            "data_type must be 'kernel', 'features' or 'standard', got %r"
            % (data_type,))

    prefix = 'data/' + DB + '/' + DB

    def cached_matrix(suffix, build):
        # Load the cached matrix if present, otherwise build it and cache it.
        path = prefix + suffix + '.npy'
        if os.path.isfile(path):
            return np.load(path)
        matrix = build()
        np.save(path, matrix)
        return matrix

    if data_type == 'kernel':
        K = cached_matrix('_K', lambda: mol_build_K(list_SMILES))
        list_assignment = Khierarchical_cluster(K, n_folds)

    elif data_type == 'features':
        X = cached_matrix('_X', lambda: mol_build_X(list_SMILES))
        list_assignment = Xkmeans_cluster(X, n_folds)

    else:  # 'standard'
        X = cached_matrix('_X', lambda: mol_build_X(list_SMILES))
        list_assignment = np.zeros(X.shape[0])
        kf = model_selection.KFold(n_folds, shuffle=True, random_state=92)
        for ifold, (_, test_index) in enumerate(kf.split(X)):
            list_assignment[test_index] = ifold

    c = collections.Counter(list_assignment)
    print(c)
    folds = [np.where(list_assignment == cle)[0] for cle in c]

    with open(prefix + '_folds.txt', 'w') as fo:
        fo.write(str(c) + '\n')

    return folds
コード例 #3
0
ファイル: process_PCBA.py プロジェクト: jcheminform/NNk_DTI
def PCBA_CV(DB, data_type, list_ID, list_y, list_SMILES, dict_id2smile, n_folds):
    """Split the samples of dataset ``DB`` into cross-validation folds.

    The strategy is selected by ``data_type``:

    * ``'kernel'``   -- ``Khierarchical_cluster`` on the kernel matrix; on the
      whole matrix when ``DB == 'PCBA'``, otherwise separately on the samples
      of each label in ``{0, 1}``.
    * ``'features'`` -- same scheme with ``Xkmeans_cluster`` on the feature
      matrix.
    * ``'standard'`` -- ``list_ID`` and ``list_y`` are re-read from
      ``data/<DB>/<DB>_list_ID.data`` and ``data/<DB>/<DB>_list_y.data``
      (overriding the arguments); shuffled ``KFold`` is used for
      ``'PCBA'``, ``'PCBA10'`` and ``'PCBA100'``, shuffled
      ``StratifiedKFold`` otherwise (``random_state=92`` in both cases).

    The kernel / feature matrix is cached as ``data/<DB>/<DB>_K.npy`` or
    ``data/<DB>/<DB>_X.npy``; it is built from ``list_SMILES`` only when the
    cache file is missing. A per-fold label count (per label column for the
    three PCBA datasets) is printed and written to
    ``data/<DB>/<DB>_folds.txt``.

    Args:
        DB: dataset name, used to build every file path.
        data_type: ``'kernel'``, ``'features'`` or ``'standard'``.
        list_ID: sample identifiers; replaced by the pickled list in
            ``'standard'`` mode and otherwise unused.
        list_y: labels as a numpy array; indexed with two axes for the three
            PCBA datasets and with one axis otherwise. Replaced by the pickled
            labels in ``'standard'`` mode.
        list_SMILES: SMILES strings handed to ``mol_build_K`` / ``mol_build_X``.
        dict_id2smile: unused here; kept for interface compatibility.
        n_folds: number of folds / clusters requested.

    Returns:
        A list of integer index arrays, one per distinct assignment value,
        ordered by first appearance of that value in the assignment vector.

    Raises:
        ValueError: if ``data_type`` is not one of the supported modes.
    """
    if data_type not in ('kernel', 'features', 'standard'):
        raise ValueError(
            "data_type must be 'kernel', 'features' or 'standard', got %r"
            % (data_type,))

    prefix = 'data/' + DB + '/' + DB
    is_pcba_family = DB in ['PCBA', 'PCBA10', 'PCBA100']

    def cached_matrix(suffix, build):
        # Load the cached matrix if present, otherwise build it and cache it.
        path = prefix + suffix + '.npy'
        if os.path.isfile(path):
            return np.load(path)
        matrix = build()
        np.save(path, matrix)
        return matrix

    # NOTE(review): the clustering branches below special-case only
    # DB == 'PCBA', while the 'standard' branch and the summary treat
    # 'PCBA10' / 'PCBA100' the same way as 'PCBA'. Left unchanged; confirm
    # whether the per-label path is intended for those two datasets.
    if data_type == 'kernel':
        K = cached_matrix('_K', lambda: mol_build_K(list_SMILES))
        if DB == 'PCBA':
            list_assignment = Khierarchical_cluster(K, n_folds)
        else:
            list_assignment = np.zeros(K.shape[0])
            for y in [0, 1]:
                indices = np.where(list_y == y)[0]
                # Restrict the kernel to the samples of this class.
                K_local = K[indices, :][:, indices]
                list_assignment[indices] = Khierarchical_cluster(K_local, n_folds)

    elif data_type == 'features':
        X = cached_matrix('_X', lambda: mol_build_X(list_SMILES))
        if DB == 'PCBA':
            list_assignment = Xkmeans_cluster(X, n_folds)
        else:
            list_assignment = np.zeros(X.shape[0])
            for y in [0, 1]:
                indices = np.where(list_y == y)[0]
                list_assignment[indices] = Xkmeans_cluster(X[indices, :], n_folds)

    else:  # 'standard'
        # NOTE: pickle must only be used on trusted files; these are expected
        # to be produced by the project's own preprocessing.
        with open(prefix + '_list_ID.data', 'rb') as f_id:
            list_ID = pickle.load(f_id)
        with open(prefix + '_list_y.data', 'rb') as f_y:
            list_y = np.array(pickle.load(f_y))

        # Placeholder matrix with one row per sample; the real features are
        # not loaded in this mode.
        X = np.zeros((len(list_ID), 1))
        list_assignment = np.zeros(X.shape[0])
        if is_pcba_family:
            splitter = model_selection.KFold(n_folds, shuffle=True, random_state=92)
            splits = splitter.split(X)
        else:
            splitter = model_selection.StratifiedKFold(
                n_folds, shuffle=True, random_state=92)
            splits = splitter.split(X, list_y)
        for ifold, (_, test_index) in enumerate(splits):
            list_assignment[test_index] = ifold

    c = collections.Counter(list_assignment)
    print(c)
    folds = [np.where(list_assignment == cle)[0] for cle in c]

    # The context manager guarantees the summary is flushed and closed even if
    # a fold lookup below fails.
    with open(prefix + '_folds.txt', 'w') as fo:
        for ifold in range(n_folds):
            fo.write("ifold" + str(ifold) + '\n')
            if is_pcba_family:
                for iclass in range(list_y.shape[1]):
                    line = ("iclass " + str(iclass) + ' ' +
                            str(collections.Counter(list_y[folds[ifold], iclass])))
                    fo.write(line + '\n')
                    print(line)
            else:
                label_counts = collections.Counter(list_y[folds[ifold]])
                fo.write(str(label_counts) + '\n')
                print(ifold, label_counts)
            fo.write('\n')

    return folds
コード例 #4
0
                open('data/' + DB + '/' + DB + '_Kprot.data', 'rb'))
        elif DB in LIST_PROT_DATASETS + LIST_AA_DATASETS:
            list_ID = pickle.load(
                open('data/' + DB + '/' + DB + '_list_ID.data', 'rb'))
            dict_prot2ind = {prot: ind for ind, prot in enumerate(list_ID)}
            Kprot = pickle.load(
                open('data/' + DB + '/' + DB + '_Kprot.data', 'rb'))
            Kmol, dict_mol2ind = None, None
        elif DB in LIST_MOL_DATASETS:
            if DB not in ['HIV']:
                Kmol = pickle.load(
                    open('data/' + DB + '/' + DB + '_Kmol.data', 'rb'))
            else:
                list_SMILES = pickle.load(
                    open('data/' + DB + '/' + DB + '_list_SMILES.data', 'rb'))
                Kmol = mol_build_K(list_SMILES)
            Kprot, dict_prot2ind = None, None
            list_ID = pickle.load(
                open('data/' + DB + '/' + DB + '_list_ID.data', 'rb'))
            dict_mol2ind = {mol: ind for ind, mol in enumerate(list_ID)}

        nfolds, seed = 5, 92

        x_tr, y_tr, x_val, y_val, x_te, y_te = \
            get_fold_data(DB, nfolds, fold_val, fold_te, setting, ratio_tr, ratio_te)
        # import pdb; pdb.Pdb().set_trace()
        K_tr, K_val, K_te = \
            get_kernels(DB, x_tr, x_te, x_val, Kmol, Kprot, dict_prot2ind, dict_mol2ind)

        if args.cv_val:
            list_perf = cv(DB, K_tr, y_tr, K_val, y_val)