def del_temp_K_prot(DB_version, DB_type, delete):
    """ 
    Check (and -optional- delete) LAkernel output files which are not completed.  

    Parameters
    ----------
    DB_version : str
        string of the DrugBank version number
        format : "drugbank_vX.X.X" exemple : "drugbank_v5.1.1"
    DB_type : str
        string of the DrugBank type exemple: "S0h"
    delete : boolean
        whether or not to delete the file 

    Returns
    -------
    None
    """

    # data_dir variable
    data_dir = 'data/' + DB_version + '/' + DB_type + '/'

    # kernels directory
    kernels_dir = root + data_dir + 'kernels/'

    # get the DataBase preprocessed
    preprocessed_DB = get_DB(DB_version, DB_type)
    dict_ind2prot = preprocessed_DB.proteins.dict_ind2prot

    nb_prot = preprocessed_DB.proteins.nb

    list_ = []
    for index in range(nb_prot):

        # output_filename
        dbid = dict_ind2prot[index]
        output_filename = kernels_dir + 'LAkernel/LA_' + DB_type + \
            '_' + dbid + '.txt'

        if os.path.isfile(output_filename):
            # a completed file holds one similarity value per remaining
            # protein, i.e. nb_prot - index lines
            with open(output_filename) as output_file:
                count_nb_line = sum(1 for _ in output_file)
            if count_nb_line != nb_prot - index:
                list_.append(output_filename)
        else:
            list_.append(output_filename)
    print("incomplete LAkernel output files:", list_)

    print('delete:', delete)

    if delete:
        for outf in list_:
            if os.path.isfile(outf):
                os.remove(outf)
def check_temp_K_prot(DB_version, DB_type):
    """ 
    Check and process make_temp_K_prot() for the proteins for which the \
    LAkernel has not been processed.

    See the description of make_temp_K_prot for more details 

    Parameters
    ----------
    DB_version : str
        string of the DrugBank version number
        format : "drugbank_vX.X.X" exemple : "drugbank_v5.1.1"
    DB_type : str
        string of the DrugBank type exemple: "S0h"

    Returns
    -------
    None
    """

    # data_dir variable
    data_dir = 'data/' + DB_version + '/' + DB_type + '/'

    # kernels directory
    kernels_dir = root + data_dir + 'kernels/'

    # get the DataBase preprocessed
    preprocessed_DB = get_DB(DB_version, DB_type)
    dict_ind2prot = preprocessed_DB.proteins.dict_ind2prot

    nb_prot = preprocessed_DB.proteins.nb

    list_ = []
    for index in range(nb_prot):

        # output_filename
        dbid = dict_ind2prot[index]
        output_filename = kernels_dir + 'LAkernel/LA_' + DB_type + \
            '_' + dbid + '.txt'

        if not os.path.isfile(output_filename):
            list_.append(dbid)
    print("list of uncompleted proteins", list_)
Example #3
    # data_dir variable
    data_dir = 'data/' + args.DB_version + '/' + args.DB_type + '/'

    if not os.path.exists(root + data_dir + 'cross_validation/kronSVM'):
        # makedirs also creates the cross_validation parent if needed
        os.makedirs(root + data_dir + 'cross_validation/kronSVM')
        print("kronSVM cross validation directory for", args.DB_type, ",",
              args.DB_version, "created.")
    else:
        print("kronSVM cross validation directory for", args.DB_type, ",",
              args.DB_version, "already exists.")

    cv_dirname = root + data_dir + 'cross_validation/'
    kronsvm_cv_dirname = root + data_dir + 'cross_validation/kronSVM/'

    preprocessed_DB = get_DB(args.DB_version, args.DB_type)

    kernels = get_K_mol_K_prot(args.DB_version, args.DB_type, args.center_norm,
                               args.norm)

    # # Get the train datasets
    # train_folds = get_train_folds(args.DB_version, args.DB_type)

    # nb_folds = len(train_folds)
    # nb_clf = len(train_folds[0])

    # Get the nested folds
    nested_cv_dirname = root + data_dir + 'cross_validation/nested_folds/'
    nested_folds_array_filename = nested_cv_dirname + args.DB_type + '_nested_folds_array.data'

    with open(nested_folds_array_filename, 'rb') as f:
        nested_folds_array = pickle.load(f)
Example #4
    # data_dir variable 
    data_dir = 'data/' + args.DB_version + '/' + args.DB_type + '/'

    if not os.path.exists(root + data_dir + 'cross_validation/NRLMF'):
        # makedirs also creates the cross_validation parent if needed
        os.makedirs(root + data_dir + 'cross_validation/NRLMF')
        print("NRLMF cross validation directory for", args.DB_type, ",",
              args.DB_version, "created.")
    else:
        print("NRLMF cross validation directory for", args.DB_type, ",",
              args.DB_version, "already exists.")

    cv_dirname = root + data_dir + 'cross_validation/'
    nrlmf_cv_dirname = root + data_dir + 'cross_validation/NRLMF/'

    DB = get_DB(args.DB_version, args.DB_type)

    kernels = get_K_mol_K_prot(args.DB_version, args.DB_type, center_norm=True, norm=False)
    DB_drugs_kernel = kernels[0]
    DB_proteins_kernel = kernels[1]

    # Get the nested folds
    nested_cv_dirname = root + data_dir + 'cross_validation/nested_folds/'

    if args.balanced_on_proteins:
        if args.balanced_on_drugs:
            nested_folds_array_filename = nested_cv_dirname \
                + args.DB_type + '_nested_folds_double_balanced_1_clf_array.data'
            output_filename = nrlmf_cv_dirname + args.DB_type + \
                '_NRLMF_cv_nested_double_balanced_1_clf_pred.data'
        else:
Example #5
def make_group_K_prot(DB_version, DB_type):
    """
    Compute the similarity between all protein pairs with the LAkernel

    Uses make_K_mol.center_and_normalise_kernel()

    Writes 2 files:
        - ..._K_prot.data : LA kernel
        - ..._K_prot_norm.data : centered and normalised kernel

    Parameters
    ----------
    DB_version : str
        DrugBank version number,
        format: "drugbank_vX.X.X", example: "drugbank_v5.1.1"
    DB_type : str
        DrugBank type, example: "S0h"

    Returns
    -------
    None
    """

    # data_dir variable 
    data_dir = 'data/' + DB_version + '/' + DB_type + '/'

    # kernels directory
    kernels_dir = root + data_dir + 'kernels/'

    # get the DataBase preprocessed
    preprocessed_DB = get_DB(DB_version, DB_type)
    dict_ind2prot = preprocessed_DB.proteins.dict_ind2prot

    nb_prot = preprocessed_DB.proteins.nb
    X = np.zeros((nb_prot, nb_prot))
    for i in range(nb_prot):

        # output_filename
        dbid = dict_ind2prot[i]
        output_filename = kernels_dir + 'LAkernel/LA_' + DB_type + \
            '_' + dbid + '.txt'

        # each file holds the similarities of protein i with proteins
        # i to nb_prot - 1; fill the symmetric matrix while reading
        j = i
        with open(output_filename, 'r') as output_file:
            for line in output_file:
                X[i, j] = float(line.rstrip())
                X[j, i] = X[i, j]
                j += 1
        if j != nb_prot:
            print(dbid, 'kernel, corresponding to protein nb', i,
                  ', is incomplete')

    # save the raw kernel and its centered and normalised version
    kernel_filename = kernels_dir + DB_type + '_K_prot.data'
    with open(kernel_filename, 'wb') as f:
        pickle.dump(X, f, protocol=2)

    K_norm = center_and_normalise_kernel(X)
    kernel_filename = kernels_dir + DB_type + '_K_prot_norm.data'
    with open(kernel_filename, 'wb') as f:
        pickle.dump(K_norm, f, protocol=2)

    # quick sanity check on a couple of kernel entries
    print(X[100, 100], K_norm[100, 100])
    print(X[100, :], K_norm[100, :])
    print(X[500, 500], K_norm[500, 500])
    print(X[500, :], K_norm[500, :])
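
# Usage sketch (the version/type values are assumptions): once every
# per-protein LAkernel file is complete, assemble the full kernel and
# reload the pickled matrix.
make_group_K_prot("drugbank_v5.1.5", "S0h")
with open(root + 'data/drugbank_v5.1.5/S0h/kernels/S0h_K_prot.data', 'rb') as f:
    K_prot = pickle.load(f)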
Example #6
def make_temp_K_prot(DB_version, DB_type, index):
    """ 
    Process the similarity of one particular protein with the others

    Process the similarity (thanks to the L(ocal)A(lignment) Kernel) of \
        the protein (with the key *index* in the dict_ind2prot dictionary, and \
        corresponding to the fasta FASTA_A) with the proteins between *index+1*\
        and *nb_prot* (corresponding to fasta FASTA_B) in the dict_ind2prot \
        dictionary, with the command: \
        'LAkernel_direct FASTA_A FASTA B'

    Then append the output to the file LA_kernel/LA_..._[dbid].txt, where dbid \
        is the value of the key *index* in the dict_ind2prot dictionary. 

    Parameters
    ----------
    DB_version : str
        string of the DrugBank version number
        format : "drugbank_vX.X.X" exemple : "drugbank_v5.1.1"
    DB_type : str
        string of the DrugBank type exemple: "S0h"
    index : int
        index of the protein in the dictionaries

    Returns
    -------
    None
    """   

    # data_dir variable 
    data_dir = 'data/' + DB_version + '/' + DB_type + '/'

    # kernels directory
    kernels_dir = root + data_dir + 'kernels/'

    # create the LAkernel directory if needed
    if not os.path.exists(kernels_dir + 'LAkernel/'):
        os.mkdir(kernels_dir + 'LAkernel/')
        print("LAkernel directory for", data_dir, "created")

    # get the DataBase preprocessed
    preprocessed_DB = get_DB(DB_version, DB_type)
    dict_protein = preprocessed_DB.proteins.dict_protein
    dict_ind2prot = preprocessed_DB.proteins.dict_ind2prot

    # output_filename
    dbid = dict_ind2prot[index]
    output_filename = kernels_dir + 'LAkernel/LA_' + DB_type + \
        '_' + dbid + '.txt'

    nb_prot = preprocessed_DB.proteins.nb
    FASTA1 = dict_protein[dbid]
    if not os.path.isfile(output_filename):
        print(index, ":", dbid)
        for j in range(index, nb_prot):
            dbid2 = dict_ind2prot[j]
            FASTA2 = dict_protein[dbid2]
            com = LAkernel_path + ' ' + FASTA1 + ' ' + FASTA2 + \
                ' >> ' + output_filename
            # reading the pipe blocks until LAkernel_direct finishes
            cmd = os.popen(com)
            cmd.read()
            cmd.close()
        print("completed")
Example #7
import copy

from DTI_prediction.process_dataset.process_DB import get_DB
from DTI_prediction.process_dataset.DB_utils import Drugs, Proteins, Couples, FormattedDB
from DTI_prediction.process_dataset.DB_utils import check_drug, check_protein, check_couple, get_couples_from_array

root = '../CFTR_PROJECT/'

DB_version = "drugbank_v5.1.5"
DB_type = "S0h"
process_name = "VMO"

# pattern_name variable
pattern_name = DB_type + '_' + process_name
# data_dir variable
data_dir = 'data/' + DB_version + '/' + DB_type + '/' + pattern_name + '/'

preprocessed_DB = get_DB(DB_version, DB_type)

from DTI_prediction.process_dataset.correct_interactions import get_orphan

ivacaftor_dbid = 'DB08820'
corrected_DB = copy.deepcopy(preprocessed_DB)
corrected_DB = get_orphan(DB=corrected_DB, dbid=ivacaftor_dbid)

# Get the kernels

from DTI_prediction.make_kernels.get_kernels import get_K_mol_K_prot
from DTI_prediction.make_kernels.make_K_mol import center_and_normalise_kernel

kernels = get_K_mol_K_prot(DB_version, DB_type, norm=False)
K_mol = kernels[0]
K_prot = kernels[1]
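
# The centered and normalised variants can then be derived with the
# imported helper (a sketch; make_group_K_prot above applies the same
# helper to the protein kernel):
K_mol_norm = center_and_normalise_kernel(K_mol)
K_prot_norm = center_and_normalise_kernel(K_prot)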