import os
import pickle
import copy

import numpy as np

from DTI_prediction.process_dataset.process_DB import get_DB
from DTI_prediction.make_kernels.make_K_mol import center_and_normalise_kernel

root = '../CFTR_PROJECT/'


def del_temp_K_prot(DB_version, DB_type, delete):
    """
    Check (and optionally delete) LAkernel output files which are incomplete.

    Parameters
    ----------
    DB_version : str
        string of the DrugBank version number
        format : "drugbank_vX.X.X"
        example : "drugbank_v5.1.1"
    DB_type : str
        string of the DrugBank type
        example : "S0h"
    delete : bool
        whether or not to delete the incomplete files

    Returns
    -------
    None
    """

    # data_dir variable
    data_dir = 'data/' + DB_version + '/' + DB_type + '/'
    # kernels directory
    kernels_dir = root + data_dir + 'kernels/'

    # get the preprocessed DataBase
    preprocessed_DB = get_DB(DB_version, DB_type)
    dict_ind2prot = preprocessed_DB.proteins.dict_ind2prot
    nb_prot = preprocessed_DB.proteins.nb

    list_ = []
    for index in range(nb_prot):
        # the output file for protein *index* holds its similarities with
        # proteins *index*..*nb_prot - 1*, i.e. exactly nb_prot - index lines
        dbid = dict_ind2prot[index]
        output_filename = kernels_dir + 'LAkernel/LA_' + DB_type + \
            '_' + dbid + '.txt'
        if os.path.isfile(output_filename):
            with open(output_filename) as output_file:
                count_nb_line = sum(1 for _ in output_file)
            if count_nb_line != nb_prot - index:
                list_.append(output_filename)
        else:
            list_.append(output_filename)

    print(list_)
    print('delete', delete)
    if delete:
        for outf in list_:
            if os.path.isfile(outf):
                os.remove(outf)
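# Usage sketch (assumes the imports and `root` defined above): run once with
# delete=False to review the list of incomplete LAkernel files, then with
# delete=True to remove them so they can be recomputed. The version/type
# strings are the docstring examples, not fixed values.
#
#     del_temp_K_prot("drugbank_v5.1.1", "S0h", delete=False)
#     del_temp_K_prot("drugbank_v5.1.1", "S0h", delete=True)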
def check_temp_K_prot(DB_version, DB_type):
    """
    Check for which proteins the LAkernel has not been processed, so that
    make_temp_K_prot() can be run again for them.

    See the description of make_temp_K_prot for more details.

    Parameters
    ----------
    DB_version : str
        string of the DrugBank version number
        format : "drugbank_vX.X.X"
        example : "drugbank_v5.1.1"
    DB_type : str
        string of the DrugBank type
        example : "S0h"

    Returns
    -------
    None
    """

    # data_dir variable
    data_dir = 'data/' + DB_version + '/' + DB_type + '/'
    # kernels directory
    kernels_dir = root + data_dir + 'kernels/'

    # get the preprocessed DataBase
    preprocessed_DB = get_DB(DB_version, DB_type)
    dict_ind2prot = preprocessed_DB.proteins.dict_ind2prot
    nb_prot = preprocessed_DB.proteins.nb

    list_ = []
    for index in range(nb_prot):
        dbid = dict_ind2prot[index]
        output_filename = kernels_dir + 'LAkernel/LA_' + DB_type + \
            '_' + dbid + '.txt'
        if not os.path.isfile(output_filename):
            list_.append(dbid)

    print("list of proteins without LAkernel output", list_)
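# Sketch of a follow-up helper (an assumption, not part of the original
# module): relaunch make_temp_K_prot (defined below) for every protein whose
# LAkernel output file is missing, mirroring the check above.
def relaunch_missing_K_prot(DB_version, DB_type):
    preprocessed_DB = get_DB(DB_version, DB_type)
    kernels_dir = root + 'data/' + DB_version + '/' + DB_type + '/kernels/'
    for index in range(preprocessed_DB.proteins.nb):
        dbid = preprocessed_DB.proteins.dict_ind2prot[index]
        output_filename = kernels_dir + 'LAkernel/LA_' + DB_type + \
            '_' + dbid + '.txt'
        if not os.path.isfile(output_filename):
            make_temp_K_prot(DB_version, DB_type, index)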
# data_dir variable
data_dir = 'data/' + args.DB_version + '/' + args.DB_type + '/'

if not os.path.exists(root + data_dir + 'cross_validation/kronSVM'):
    os.mkdir(root + data_dir + 'cross_validation/kronSVM')
    print("kronSVM cross validation directory for", args.DB_type, ",",
          args.DB_version, "created.")
else:
    print("kronSVM cross validation directory for", args.DB_type, ",",
          args.DB_version, "already exists.")

cv_dirname = root + data_dir + 'cross_validation/'
kronsvm_cv_dirname = root + data_dir + 'cross_validation/kronSVM/'

preprocessed_DB = get_DB(args.DB_version, args.DB_type)
kernels = get_K_mol_K_prot(args.DB_version, args.DB_type,
                           args.center_norm, args.norm)

# # Get the train datasets
# train_folds = get_train_folds(args.DB_version, args.DB_type)
# nb_folds = len(train_folds)
# nb_clf = len(train_folds[0])

# Get the nested folds
nested_cv_dirname = root + data_dir + 'cross_validation/nested_folds/'
nested_folds_array_filename = nested_cv_dirname + args.DB_type + \
    '_nested_folds_array.data'
nested_folds_array = pickle.load(open(nested_folds_array_filename, 'rb'))
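# For context: kronSVM operates on the Kronecker product of the two kernels,
# i.e. the similarity between two drug-protein pairs is the product of the
# drug-drug and protein-protein similarities. The function below is an
# illustrative sketch, not part of the original script.
def kron_kernel_entry(K_mol, K_prot, pair_a, pair_b):
    # K((i, j), (k, l)) = K_mol[i, k] * K_prot[j, l]
    (i, j), (k, l) = pair_a, pair_b
    return K_mol[i, k] * K_prot[j, l]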
# data_dir variable
data_dir = 'data/' + args.DB_version + '/' + args.DB_type + '/'

if not os.path.exists(root + data_dir + 'cross_validation/NRLMF'):
    os.mkdir(root + data_dir + 'cross_validation/NRLMF')
    print("NRLMF cross validation directory for", args.DB_type, ",",
          args.DB_version, "created.")
else:
    print("NRLMF cross validation directory for", args.DB_type, ",",
          args.DB_version, "already exists.")

cv_dirname = root + data_dir + 'cross_validation/'
nrlmf_cv_dirname = root + data_dir + 'cross_validation/NRLMF/'

DB = get_DB(args.DB_version, args.DB_type)

kernels = get_K_mol_K_prot(args.DB_version, args.DB_type,
                           center_norm=True, norm=False)
DB_drugs_kernel = kernels[0]
DB_proteins_kernel = kernels[1]

# Get the nested folds
nested_cv_dirname = root + data_dir + 'cross_validation/nested_folds/'

if args.balanced_on_proteins:
    if args.balanced_on_drugs:
        nested_folds_array_filename = nested_cv_dirname \
            + args.DB_type + '_nested_folds_double_balanced_1_clf_array.data'
        output_filename = nrlmf_cv_dirname + args.DB_type + \
            '_NRLMF_cv_nested_double_balanced_1_clf_pred.data'
    else:
def make_group_K_prot(DB_version, DB_type):
    """
    Process the similarity between all the proteins with the LA kernel

    Uses make_K_mol.center_and_normalise_kernel()

    Writes 2 files:
        - ..._K_prot.data : LA kernel
        - ..._K_prot_norm.data : centred and normalised kernel

    Parameters
    ----------
    DB_version : str
        string of the DrugBank version number
        format : "drugbank_vX.X.X"
        example : "drugbank_v5.1.1"
    DB_type : str
        string of the DrugBank type

    Returns
    -------
    None
    """

    # data_dir variable
    data_dir = 'data/' + DB_version + '/' + DB_type + '/'
    # kernels directory
    kernels_dir = root + data_dir + 'kernels/'

    # get the preprocessed DataBase
    preprocessed_DB = get_DB(DB_version, DB_type)
    dict_ind2prot = preprocessed_DB.proteins.dict_ind2prot
    nb_prot = preprocessed_DB.proteins.nb

    # each per-protein file holds one row of the upper triangle;
    # mirror it into the symmetric kernel matrix X
    X = np.zeros((nb_prot, nb_prot))
    for i in range(nb_prot):
        dbid = dict_ind2prot[i]
        output_filename = kernels_dir + 'LAkernel/LA_' + DB_type + \
            '_' + dbid + '.txt'
        j = i
        for line in open(output_filename, 'r'):
            r = float(line.rstrip())
            X[i, j] = r
            X[j, i] = X[i, j]
            j += 1
        if j != nb_prot:
            print(dbid, 'kernel, corresponding to the protein nb', i,
                  ', is incomplete')

    # save the raw kernel, then the centred and normalised kernel
    for norm_type in ['norm', 'unnorm']:
        if norm_type == 'unnorm':
            kernel_filename = kernels_dir + DB_type + '_K_prot.data'
            pickle.dump(X, open(kernel_filename, 'wb'), protocol=2)
        elif norm_type == 'norm':
            K_norm = center_and_normalise_kernel(X)
            kernel_filename = kernels_dir + DB_type + '_K_prot_norm.data'
            pickle.dump(K_norm, open(kernel_filename, 'wb'), protocol=2)

    print(X[100, 100], K_norm[100, 100])
    print(X[100, :], K_norm[100, :])
    print(X[500, 500], K_norm[500, 500])
    print(X[500, :], K_norm[500, :])
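# For reference, kernel centering and cosine normalisation are standardly
# implemented as below. This is a sketch of the usual formulas, not
# necessarily the exact code of make_K_mol.center_and_normalise_kernel.
def center_and_normalise_kernel_sketch(K):
    n = K.shape[0]
    H = np.eye(n) - np.ones((n, n)) / n   # centering matrix
    K_c = H @ K @ H                       # centre the kernel in feature space
    d = np.sqrt(np.diag(K_c))
    return K_c / np.outer(d, d)           # K_ij / sqrt(K_ii * K_jj)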
def make_temp_K_prot(DB_version, DB_type, index):
    """
    Process the similarity of one particular protein with the others

    Process the similarity (thanks to the L(ocal)A(lignment) kernel) of
    the protein with the key *index* in the dict_ind2prot dictionary
    (corresponding to the fasta FASTA_A) with the proteins between *index*
    and *nb_prot - 1* (corresponding to fasta FASTA_B) in the dict_ind2prot
    dictionary, with the command:

        LAkernel_direct FASTA_A FASTA_B

    Then append the output to the file LAkernel/LA_..._[dbid].txt, where
    dbid is the value of the key *index* in the dict_ind2prot dictionary.

    Parameters
    ----------
    DB_version : str
        string of the DrugBank version number
        format : "drugbank_vX.X.X"
        example : "drugbank_v5.1.1"
    DB_type : str
        string of the DrugBank type
        example : "S0h"
    index : int
        index of the protein in the dictionaries

    Returns
    -------
    None
    """

    # data_dir variable
    data_dir = 'data/' + DB_version + '/' + DB_type + '/'
    # kernels directory
    kernels_dir = root + data_dir + 'kernels/'

    # create LAkernel directory
    if not os.path.exists(kernels_dir + 'LAkernel/'):
        os.mkdir(kernels_dir + 'LAkernel/')
        print("LAkernel directory for", data_dir, "created")

    # get the preprocessed DataBase
    preprocessed_DB = get_DB(DB_version, DB_type)
    dict_protein = preprocessed_DB.proteins.dict_protein
    dict_ind2prot = preprocessed_DB.proteins.dict_ind2prot

    # output_filename
    dbid = dict_ind2prot[index]
    output_filename = kernels_dir + 'LAkernel/LA_' + DB_type + \
        '_' + dbid + '.txt'

    nb_prot = preprocessed_DB.proteins.nb
    FASTA1 = dict_protein[dbid]
    if not os.path.isfile(output_filename):
        print(index, ":", dbid)
        # one LAkernel_direct call per (FASTA1, FASTA2) pair, each score
        # appended to the output file; LAkernel_path (the path to the
        # LAkernel_direct binary) is assumed to be defined at module level
        for j in range(index, nb_prot):
            dbid2 = dict_ind2prot[j]
            FASTA2 = dict_protein[dbid2]
            com = LAkernel_path + ' ' + FASTA1 + ' ' + FASTA2 + \
                ' >> ' + output_filename
            cmd = os.popen(com)
            cmd.read()
        print("completed")
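# Usage sketch: the per-protein calls are independent of each other, so they
# can be run sequentially or dispatched as separate jobs (one per protein
# index), e.g.:
#
#     preprocessed_DB = get_DB("drugbank_v5.1.1", "S0h")
#     for index in range(preprocessed_DB.proteins.nb):
#         make_temp_K_prot("drugbank_v5.1.1", "S0h", index)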
import copy

from DTI_prediction.process_dataset.process_DB import get_DB
from DTI_prediction.process_dataset.DB_utils import Drugs, Proteins, Couples, FormattedDB
from DTI_prediction.process_dataset.DB_utils import check_drug, check_protein, check_couple, get_couples_from_array

root = '../CFTR_PROJECT/'

DB_version = "drugbank_v5.1.5"
DB_type = "S0h"
process_name = "VMO"

# pattern_name variable
pattern_name = DB_type + '_' + process_name
# data_dir variable
data_dir = 'data/' + DB_version + '/' + DB_type + '/' + pattern_name + '/'

preprocessed_DB = get_DB(DB_version, DB_type)

from DTI_prediction.process_dataset.correct_interactions import get_orphan

# treat ivacaftor as an orphan drug in the corrected database
ivacaftor_dbid = 'DB08820'
corrected_DB = copy.deepcopy(preprocessed_DB)
corrected_DB = get_orphan(DB=corrected_DB, dbid=ivacaftor_dbid)

# Get the kernels
from DTI_prediction.make_kernels.get_kernels import get_K_mol_K_prot
from DTI_prediction.make_kernels.make_K_mol import center_and_normalise_kernel

kernels = get_K_mol_K_prot(DB_version, DB_type, norm=False)
K_mol = kernels[0]
K_prot = kernels[1]
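# Quick sanity checks on the loaded kernels (illustrative additions, not in
# the original script): both kernels should be square similarity matrices.
assert K_mol.shape[0] == K_mol.shape[1]
assert K_prot.shape[0] == K_prot.shape[1]
print("K_mol:", K_mol.shape, "K_prot:", K_prot.shape)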