コード例 #1
0
def get_test_folds(DB_version, DB_type):
    """
    Load the test folds of the cross-validation.

    Parameters
    ----------
    DB_version : str
        string of the DrugBank version number
        format : "drugbank_vX.X.X", e.g. "drugbank_v5.1.1"
    DB_type : str
        string of the DrugBank type

    Returns
    -------
    test_folds : list
        one element per fold, each built by ``get_couples_from_array`` from
        the matching entry of the pickled test folds array
    """

    # data_dir already ends with '/', so the resulting path contains a
    # doubled separator before 'cross_validation'; it is kept unchanged.
    data_dir = 'data/' + DB_version + '/' + DB_type + '/'

    cv_dirname = root + data_dir + '/cross_validation/'

    test_folds_array_filename = cv_dirname + 'test_folds/' \
        + DB_type + '_test_folds_array_20200724.data'

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The context manager guarantees the file handle is closed after loading.
    with open(test_folds_array_filename, 'rb') as folds_file:
        test_folds_array = pickle.load(folds_file)

    return [get_couples_from_array(fold_array)
            for fold_array in test_folds_array]
コード例 #2
0
    # # Get the train datasets
    # train_folds = get_train_folds(args.DB_version, args.DB_type)

    # nb_folds = len(train_folds)
    # nb_clf = len(train_folds[0])

    # Get the nested folds
    nested_cv_dirname = root + data_dir + 'cross_validation/nested_folds/'
    nested_folds_array_filename = nested_cv_dirname + args.DB_type + '_nested_folds_array.data'

    nested_folds_array = pickle.load(open(nested_folds_array_filename, 'rb'))

    list_folds = []
    for ifold in range(len(nested_folds_array)):
        fold_dataset = get_couples_from_array(nested_folds_array[ifold])
        list_folds.append(fold_dataset)

    test_folds = [[list_folds[0],
                   list_folds[1]], [list_folds[2], list_folds[3]],
                  [list_folds[4], list_folds[5]]]
    train_folds = [
        [list_folds[2], list_folds[3], list_folds[4], list_folds[5]],
        [list_folds[0], list_folds[1], list_folds[4], list_folds[5]],
        [list_folds[0], list_folds[1], list_folds[2], list_folds[3]]
    ]

    nb_folds = len(test_folds)

    cv_list_clf = []
    # cv_list_couples_of_clf = []
コード例 #3
0
    # get the classifiers
    list_clf = pickle.load(open(clf_filename, 'rb'))
    nb_clf = len(list_clf)

    # Get the train datasets
    train_datasets_array_filename = train_datasets_dirname + pattern_name + \
        '_train_datasets_array.data'
    train_datasets_array = pickle.load(
        open(train_datasets_array_filename, 'rb'))

    nb_clf = len(train_datasets_array)

    list_train_datasets = []
    for iclf in range(nb_clf):
        train_dataset = get_couples_from_array(train_datasets_array[iclf])
        list_train_datasets.append(train_dataset)

    # Process the predictions
    pred = np.zeros((DB_proteins.nb, nb_clf))

    # If the drug is in the DrugBank database
    if check_drug(args.dbid, DB.drugs) == True:

        list_couples_predict = []
        for ind in range(DB.proteins.nb):
            list_couples_predict.append(
                (DB.proteins.dict_ind2prot[ind], args.dbid))

        X_pred = get_Xcouple(list_couples_predict, X_mol, X_prot,
                             DB.drugs.dict_mol2ind, DB.proteins.dict_prot2ind)
コード例 #4
0
    # for iclf in range(nb_clf):

    # train_dataset = list_train_datasets[iclf]

    # W is a binary matrix to indicate what are the train data (pairs that can be used to train)
    # W = np.zeros(intMat.shape)
    # for prot_id, mol_id in train_dataset.list_couples:
    #     W[DB.drugs.dict_mol2ind[mol_id], DB.proteins.dict_prot2ind[prot_id]] = 1

    train_dataset = pd.DataFrame(
        train_datasets_array[0],
        columns=['UniProt ID', 'DrugbankID', 'interaction_bool'])
    train_true_interactions = train_dataset[train_dataset['interaction_bool']
                                            == '1']
    train_true_interactions_np = train_true_interactions.to_numpy()
    train_true = get_couples_from_array(train_true_interactions_np)

    train_intMat = np.zeros(intMat_final.shape)
    for prot_id, mol_id in train_true.list_couples:
        train_intMat[DB_drugs_final.dict_mol2ind[mol_id],
                     DB.proteins.dict_prot2ind[prot_id]] = 1

    # Prepare the NRLMF classifier
    seed = 92
    best_param = {'c': 5, 'K1': 5, 'K2': 5, 'r': 50, 'lambda_d': 0.125, \
        'lambda_t': 0.125, 'alpha': 0.25, 'beta': 0.125, 'theta': 0.5, \
        'max_iter': 100}
    model = NRLMF(cfix=best_param['c'],
                  K1=best_param['K1'],
                  K2=best_param['K2'],
                  num_factors=best_param['r'],
コード例 #5
0
    # + '_nested_folds_double_balanced_' + str(args.nb_clf) \
    # + '_clf_array.data'

    # test_folds_array = pickle.load(open(test_folds_filename, 'rb'))

    nb_clf = args.nb_clf

    if nb_clf == 1:
        nb_folds = len(nested_folds_array)

        list_folds = []
        cv_list_clf = []
        test_folds = []
        for ifold in range(len(nested_folds_array)):
            # train folds
            fold_dataset = get_couples_from_array(nested_folds_array[ifold])
            list_folds.append([fold_dataset])

            # classifier
            cv_list_clf.append([list_clf[ifold]])

            # test folds
            test_fold_dataset = get_couples_from_array(
                nested_folds_array[ifold])
            test_folds.append(test_fold_dataset)

    else:
        nb_folds = len(nested_folds_array[0])

        # train folds
        list_folds = []
コード例 #6
0
def correct_interactions(protein_dbid, drug_dbid, corrected_interaction_bool,
                         DB):
    """
    Set the interaction value of one (protein, drug) couple of the database.

    If the couple is already listed in ``DB.couples``, its
    ``interaction_bool`` is replaced by ``corrected_interaction_bool`` (the
    change is written to a copy of the couples table). If the couple is not
    listed but both the protein and the drug are known to ``DB``, the couple
    is added with that value.

    Parameters
    ----------
    protein_dbid : str
        identifier of the protein, matched against the 'UniprotID' column
    drug_dbid : str
        identifier of the drug, matched against the 'DrugBankID' column
    corrected_interaction_bool : int
        value to store for the couple (compared with the stored value cast
        to int)
    DB : FormattedDB
        database exposing ``drugs``, ``proteins`` and ``couples``

    Returns
    -------
    corrected_DB : FormattedDB
        database with the same drugs and proteins as ``DB`` and the corrected
        couples

    Raises
    ------
    ValueError
        if the couple is not in ``DB`` and the protein or the drug is not in
        ``DB`` either (adding new proteins/drugs is not implemented)
    """

    # 1 - the interaction is already in DB
    if check_couple(protein_dbid, drug_dbid, DB.couples):

        couples_pd = pd.DataFrame(DB.couples.array)
        couples_pd.columns = ['UniprotID', 'DrugBankID', 'interaction_bool']

        is_couple = (couples_pd['UniprotID'] == protein_dbid) \
            & (couples_pd['DrugBankID'] == drug_dbid)
        couple_index = couples_pd[is_couple].index[0]

        initial_interaction_bool = int(couples_pd.at[couple_index,
                                                     'interaction_bool'])

        # Work on a copy so that the table built from DB.couples.array is
        # not written to.
        corrected_couples_pd = copy.deepcopy(couples_pd)
        if initial_interaction_bool != corrected_interaction_bool:
            corrected_couples_pd.at[
                couple_index, 'interaction_bool'] = corrected_interaction_bool

        corrected_couples = get_couples_from_array(
            corrected_couples_pd.to_numpy())

    # 2A - the interaction is not in DB, but both drug_dbid and protein_dbid
    # are known
    elif check_protein(protein_dbid, DB.proteins) \
            and check_drug(drug_dbid, DB.drugs):

        new_couple = Couples(list_couples=[(protein_dbid, drug_dbid)],
                             interaction_bool=np.array([
                                 corrected_interaction_bool
                             ]).reshape(-1, 1))

        corrected_couples = DB.couples + new_couple

    # 2B - drug_dbid is not in Drugs / 2C - protein_dbid is not in Proteins:
    # not implemented. Fail explicitly instead of hitting an unbound
    # ``corrected_couples`` further down.
    else:
        raise ValueError(
            "cannot correct interaction ({}, {}): the protein and/or the "
            "drug is not in DB".format(protein_dbid, drug_dbid))

    corrected_DB = FormattedDB(drugs=DB.drugs,
                               proteins=DB.proteins,
                               couples=corrected_couples)

    return corrected_DB
コード例 #7
0
def get_train_folds(DB_version, DB_type):
    """
    Load the train folds of the cross-validation.

    Parameters
    ----------
    DB_version : str
        string of the DrugBank version number
        format : "drugbank_vX.X.X", e.g. "drugbank_v5.1.1"
    DB_type : str
        string of the DrugBank type

    Returns
    -------
    train_folds : list of list
        ``train_folds[ifold][iclf]`` is the train dataset built by
        ``get_couples_from_array`` for fold ``ifold`` and classifier ``iclf``;
        empty if the pickled array holds no fold
    """

    # data_dir already ends with '/', so the resulting path contains a
    # doubled separator before 'cross_validation'; it is kept unchanged.
    data_dir = 'data/' + DB_version + '/' + DB_type + '/'

    cv_dirname = root + data_dir + '/cross_validation/'

    train_folds_array_filename = cv_dirname + 'train_folds/' \
        + DB_type + '_train_folds_array_20200724.data'

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    # The context manager guarantees the file handle is closed after loading.
    with open(train_folds_array_filename, 'rb') as folds_file:
        train_folds_array = pickle.load(folds_file)

    nb_folds = len(train_folds_array)
    # The number of classifiers is read from the first fold; with no fold at
    # all there is nothing to index, so fall back to 0 (empty result).
    nb_clf = len(train_folds_array[0]) if nb_folds > 0 else 0

    return [
        [get_couples_from_array(train_folds_array[ifold][iclf])
         for iclf in range(nb_clf)]
        for ifold in range(nb_folds)
    ]


# def get_train_folds(DB_version, DB_type, nb_clf):

#     """
#     Load the train folds

#     Parameters
#     ----------
#     DB_version : str
#         string of the DrugBank version number
#         format : "drugbank_vX.X.X" exemple : "drugbank_v5.1.1"
#     DB_type : str
#         string of the DrugBank type

#     Returns
#     -------
#     train_folds : InteractionsTrainDataset
#     """

#     # data_dir variable
#     data_dir = 'data/' + DB_version + '/' + DB_type + '/'
#     cv_dirname = root + data_dir + 'cross_validation/'

#     # True interactions

#     train_true_folds_array_filename = cv_dirname + 'train_folds/' \
#         + DB_type + '_train_true_folds_' + str(nb_clf) + '_clf_array.data'

#     train_true_folds_array = pickle.load(open(train_true_folds_array_filename, 'rb'))

#     nb_folds = len(train_true_folds_array)
#     train_true_folds = []

#     for ifold in range(nb_folds):

#         train_true_fold = get_couples_from_array(train_true_folds_array[ifold])
#         train_true_folds.append(train_true_fold)

#     # False interactions

#     train_false_folds_array_filename = cv_dirname + 'train_folds/' \
#         + DB_type + '_train_false_folds_' + str(nb_clf) + '_clf_array.data'

#     train_false_folds_array = pickle.load(open(train_false_folds_array_filename, 'rb'))

#     train_datasets = []
#     for ifold in range(nb_folds):

#         train_datasets_per_fold = []
#         for iclf in range(nb_clf):

#             train_false_fold = get_couples_from_array(train_false_folds_array[iclf])
#             train_dataset = InteractionsTrainDataset(true_inter=train_true_folds[ifold],
#                                                      false_inter=train_false_fold)

#             train_datasets_per_fold.append(train_dataset)

#         train_datasets.append(train_datasets_per_fold)

#     return train_datasets