from HMF.code.cross_validation.cross_validation_hmf import CrossValidation from HMF.drug_sensitivity.load_dataset import load_data_without_empty, load_data_filter import numpy ''' Load datasets ''' location = project_location+"HMF/drug_sensitivity/data/overlap/" location_data = location+"data_row_01/" location_features_drugs = location+"features_drugs/" location_features_cell_lines = location+"features_cell_lines/" location_kernels = location+"kernels_features/" R_ccle_ec, M_ccle_ec, cell_lines, drugs = load_data_without_empty(location_data+"ccle_ec50_row_01.txt") R_ctrp, M_ctrp = load_data_filter(location_data+"ctrp_ec50_row_01.txt",cell_lines,drugs) R_gdsc, M_gdsc = load_data_filter(location_data+"gdsc_ic50_row_01.txt",cell_lines,drugs) R_ccle_ic, M_ccle_ic = load_data_filter(location_data+"ccle_ic50_row_01.txt",cell_lines,drugs) ''' Settings HMF ''' iterations, burn_in, thinning = 200, 150, 2 # 500, 400, 2 no_folds = 10 hyperparameters = { 'alphatau' : 1., 'betatau' : 1., 'alpha0' : 0.001, 'beta0' : 0.001, 'lambdaF' : 0.1, 'lambdaG' : 0.1,
from HMF.code.models.nmf_np import nmf_np
from HMF.code.cross_validation.multiple_nmf_nested_matrix_cross_validation import MultipleNMFNestedCrossValidation
from HMF.drug_sensitivity.load_dataset import load_data_without_empty, load_data_filter

import numpy
import random


''' Load datasets '''
# `project_location` is not assigned in this excerpt; it has to be bound
# before these lines run.
location = project_location + "HMF/drug_sensitivity/data/overlap/"
location_data = location + "data_row_01/"
location_features_drugs = location + "features_drugs/"
location_features_cell_lines = location + "features_cell_lines/"
location_kernels = location + "kernels_features/"


''' Concatenate the datasets by COLUMNS (axis=1). The other three datasets are
    loaded with rows=cell_lines taken from CCLE EC50 and columns=None '''
R_ccle_ec, M_ccle_ec, cell_lines, drugs = load_data_without_empty(
    location_data + "ccle_ec50_row_01.txt")
R_ctrp, M_ctrp = load_data_filter(
    location_data + "ctrp_ec50_row_01.txt", rows=cell_lines, columns=None)
R_gdsc, M_gdsc = load_data_filter(
    location_data + "gdsc_ic50_row_01.txt", rows=cell_lines, columns=None)
R_ccle_ic, M_ccle_ic = load_data_filter(
    location_data + "ccle_ic50_row_01.txt", rows=cell_lines, columns=None)

# CCLE EC50 goes first, so its columns are the leading block of the result.
# R and M are concatenated in the same dataset order.
R_concat = numpy.concatenate(
    (R_ccle_ec, R_gdsc, R_ctrp, R_ccle_ic), axis=1)  # columns
M_concat = numpy.concatenate(
    (M_ccle_ec, M_gdsc, M_ctrp, M_ccle_ic), axis=1)  # columns

# Number of columns contributed by CCLE EC50, i.e. the width of that leading
# block inside R_concat / M_concat.
_, no_columns = R_ccle_ec.shape


''' Remove entirely empty columns, due to the other three datasets that we concatenate '''
from HMF.code.models.nmf_np import nmf_np
from HMF.code.cross_validation.multiple_nmf_nested_matrix_cross_validation import MultipleNMFNestedCrossValidation
from HMF.drug_sensitivity.load_dataset import load_data_without_empty, load_data_filter

import numpy
import random


''' Load datasets '''
# `project_location` is not assigned in this excerpt; it has to be bound
# before these lines run.
location = project_location + "HMF/drug_sensitivity/data/overlap/"
location_data = location + "data_row_01/"
location_features_drugs = location + "features_drugs/"
location_features_cell_lines = location + "features_cell_lines/"
location_kernels = location + "kernels_features/"


''' Concatenate the datasets by ROWS. We remove the columns of the other datasets '''
# GDSC is loaded first; the drugs it returns are passed as columns= when
# loading the other three datasets.
R_gdsc, M_gdsc, cell_lines, drugs = load_data_without_empty(
    location_data + "gdsc_ic50_row_01.txt")
R_ctrp, M_ctrp = load_data_filter(
    location_data + "ctrp_ec50_row_01.txt", rows=None, columns=drugs)
R_ccle_ec, M_ccle_ec = load_data_filter(
    location_data + "ccle_ec50_row_01.txt", rows=None, columns=drugs)
R_ccle_ic, M_ccle_ic = load_data_filter(
    location_data + "ccle_ic50_row_01.txt", rows=None, columns=drugs)

# GDSC goes first, so its rows are the leading block of the result.
# R and M are concatenated in the same dataset order.
R_concat = numpy.concatenate(
    (R_gdsc, R_ctrp, R_ccle_ec, R_ccle_ic), axis=0)  # rows
M_concat = numpy.concatenate(
    (M_gdsc, M_ctrp, M_ccle_ec, M_ccle_ic), axis=0)  # rows

# Number of rows contributed by GDSC, i.e. the height of that leading block
# inside R_concat / M_concat.
no_rows, _ = R_gdsc.shape


''' Remove entirely empty rows, due to the other three datasets that we concatenate '''
def remove_empty_rows(R, M):
    """Drop every row whose entries in M sum to zero or less.

    The same rows are removed from R and from M, and the surviving rows keep
    their original order.

    Args:
        R: 2-D array-like of values.
        M: 2-D array-like with the same number of rows as R; a row is kept
            only when ``M.sum(axis=1)`` is strictly positive for it.

    Returns:
        A tuple ``(new_R, new_M)`` of numpy arrays holding the kept rows.
        When no row is kept the results are empty along axis 0 but still
        have the original number of columns.
    """
    R, M = numpy.asarray(R), numpy.asarray(M)
    # One boolean per row; indexing with it selects the kept rows in a single
    # step and returns new arrays rather than views.
    keep = M.sum(axis=1) > 0
    return R[keep], M[keep]
''' Model settings '''
n_estimators = 100  # number of trees
max_depth = None  # until what depth of feature splits we go


''' Load datasets '''
# `project_location`, `load_data_without_empty` and `load_data_filter` are not
# defined in this excerpt; they have to be in scope before these lines run.
location = project_location + "HMF/drug_sensitivity/data/overlap/"
location_data = location + "data_row_01/"
location_features_drugs = location + "features_drugs/"
location_features_cell_lines = location + "features_cell_lines/"
location_kernels = location + "kernels_features/"

# Main dataset; the cell lines and drugs it returns are passed to
# load_data_filter for the feature files below.
R_main, M_main, cell_lines, drugs = load_data_without_empty(
    location_data + "gdsc_ic50_row_01.txt")

# Cell line feature files. The commented-out calls are alternative feature
# files left disabled by the author.
R_cnv, M_cnv = load_data_filter(
    location_features_cell_lines + "cnv.txt", cell_lines)
#R_cnv_std, M_cnv_std = load_data_filter(location_features_cell_lines+"cnv_std.txt", cell_lines)
R_mutation, M_mutation = load_data_filter(
    location_features_cell_lines + "mutation.txt", cell_lines)
#R_ge, M_ge = load_data_filter(location_features_cell_lines+"gene_expression.txt", cell_lines)
#R_ge_std, M_ge_std = load_data_filter(location_features_cell_lines+"gene_expression_std.txt", cell_lines)

# Drug feature files, loaded with the drug list as the second argument.
R_fp, M_fp = load_data_filter(
    location_features_drugs + "drug_fingerprints.txt", drugs)
R_targets, M_targets = load_data_filter(
    location_features_drugs + "drug_targets.txt", drugs)
R_1d2d, M_1d2d = load_data_filter(
    location_features_drugs + "drug_1d2d.txt", drugs)
#R_1d2d_std, M_1d2d_std = load_data_filter(location_features_drugs+"drug_1d2d_std.txt", drugs)

# Only the R matrices are collected here; the M matrices returned alongside
# them are not put in these lists.
features_drugs = [R_fp, R_targets, R_1d2d]
features_cell_lines = [R_cnv, R_mutation]


''' Split the mask M into folds '''
from HMF.code.cross_validation.cross_validation_hmf import CrossValidation
from HMF.drug_sensitivity.load_dataset import load_data_without_empty, load_data_filter

import numpy
import random


''' Load datasets '''
# `project_location` is not assigned in this excerpt; it has to be bound
# before these lines run.
location = project_location + "HMF/drug_sensitivity/data/overlap/"
location_data = location + "data_row_01/"
location_features_drugs = location + "features_drugs/"
location_features_cell_lines = location + "features_cell_lines/"
location_kernels = location + "kernels_features/"

# Main dataset; the cell lines and drugs it returns are passed to
# load_data_filter for the kernel files below.
R_gdsc, M_gdsc, cell_lines, drugs = load_data_without_empty(
    location_data + "gdsc_ic50_row_01.txt")

# Files from the kernels directory for cell lines: the same cell line list is
# passed for both of the filter arguments.
C_cnv_std, M_cnv_std = load_data_filter(
    location_kernels + "cnv_std.txt", cell_lines, cell_lines)
C_ge_std, M_ge_std = load_data_filter(
    location_kernels + "gene_expression_std.txt", cell_lines, cell_lines)
C_mutation, M_mutation = load_data_filter(
    location_kernels + "mutation.txt", cell_lines, cell_lines)

# Files from the kernels directory for drugs: the same drug list is passed for
# both of the filter arguments.
C_1d2d_std, M_1d2d_std = load_data_filter(
    location_kernels + "drug_1d2d_std.txt", drugs, drugs)
C_fp, M_fp = load_data_filter(
    location_kernels + "drug_fingerprints.txt", drugs, drugs)
C_targets, M_targets = load_data_filter(
    location_kernels + "drug_targets.txt", drugs, drugs)


''' Settings HMF '''
iterations, burn_in, thinning = 100, 80, 2
no_folds = 10