sys.path.append(project_location) from HMF.code.cross_validation.cross_validation_hmf import CrossValidation from HMF.drug_sensitivity.load_dataset import load_data_without_empty, load_data_filter import numpy ''' Load datasets ''' location = project_location+"HMF/drug_sensitivity/data/overlap/" location_data = location+"data_row_01/" location_features_drugs = location+"features_drugs/" location_features_cell_lines = location+"features_cell_lines/" location_kernels = location+"kernels_features/" R_ccle_ec, M_ccle_ec, cell_lines, drugs = load_data_without_empty(location_data+"ccle_ec50_row_01.txt") R_ctrp, M_ctrp = load_data_filter(location_data+"ctrp_ec50_row_01.txt",cell_lines,drugs) R_gdsc, M_gdsc = load_data_filter(location_data+"gdsc_ic50_row_01.txt",cell_lines,drugs) R_ccle_ic, M_ccle_ic = load_data_filter(location_data+"ccle_ic50_row_01.txt",cell_lines,drugs) ''' Settings HMF ''' iterations, burn_in, thinning = 200, 150, 2 # 500, 400, 2 no_folds = 10 hyperparameters = { 'alphatau' : 1., 'betatau' : 1., 'alpha0' : 0.001, 'beta0' : 0.001, 'lambdaF' : 0.1,
# Project root: five directory levels above the folder containing this script.
project_location = os.path.dirname(__file__) + "/../../../../../"
sys.path.append(project_location)

from HMF.code.models.bnmf_gibbs import bnmf_gibbs
from HMF.code.cross_validation.nested_matrix_cross_validation import MatrixNestedCrossValidation
from HMF.drug_sensitivity.load_dataset import load_data_without_empty

import numpy
import random


''' Load datasets '''
# Directory layout of the overlap data, all relative to the project root.
location = project_location + "HMF/drug_sensitivity/data/overlap/"
location_data = location + "data_row_01/"
location_features_drugs = location + "features_drugs/"
location_features_cell_lines = location + "features_cell_lines/"
location_kernels = location + "kernels_features/"

# Each loader call returns four values; only the first two (R, M) are kept.
R_gdsc, M_gdsc, _, _ = load_data_without_empty(
    location_data + "gdsc_ic50_row_01.txt")
R_ctrp, M_ctrp, _, _ = load_data_without_empty(
    location_data + "ctrp_ec50_row_01.txt")
R_ccle_ec, M_ccle_ec, _, _ = load_data_without_empty(
    location_data + "ccle_ec50_row_01.txt")
R_ccle_ic, M_ccle_ic, _, _ = load_data_without_empty(
    location_data + "ccle_ic50_row_01.txt")

# CCLE EC50 is bound to the generic names R, M.
# NOTE(review): presumably the dataset the rest of the script runs on - confirm.
R, M = R_ccle_ec, M_ccle_ec


''' Settings BNMF '''
no_folds = 10
no_threads = 5
iterations, burn_in, thinning = 1000, 900, 2

init_UV = 'random'

# Candidate values 1, 2, 3 (upper bound written as 3 + 1 so that 3 is included).
K_range = range(1, 3 + 1)
import numpy
import random
import itertools


''' Model settings '''
n_estimators = 100  # number of trees
max_depth = None    # depth up to which feature splits are made


''' Load datasets '''
# Directory layout of the overlap data, all relative to the project root.
location = project_location+"HMF/drug_sensitivity/data/overlap/"
location_data = location+"data_row_01/"
location_features_drugs = location+"features_drugs/"
location_features_cell_lines = location+"features_cell_lines/"
location_kernels = location+"kernels_features/"

# GDSC IC50 is the main dataset; the cell_lines / drugs it returns are passed
# to load_data_filter for every feature file below.
R_main, M_main, cell_lines, drugs = load_data_without_empty(
    location_data+"gdsc_ic50_row_01.txt")

# Cell line feature files (loads that are switched off are kept as comments).
R_cnv, M_cnv = load_data_filter(
    location_features_cell_lines+"cnv.txt", cell_lines)
#R_cnv_std, M_cnv_std = load_data_filter(location_features_cell_lines+"cnv_std.txt", cell_lines)
R_mutation, M_mutation = load_data_filter(
    location_features_cell_lines+"mutation.txt", cell_lines)
#R_ge, M_ge = load_data_filter(location_features_cell_lines+"gene_expression.txt", cell_lines)
#R_ge_std, M_ge_std = load_data_filter(location_features_cell_lines+"gene_expression_std.txt", cell_lines)

# Drug feature files.
R_fp, M_fp = load_data_filter(
    location_features_drugs+"drug_fingerprints.txt", drugs)
R_targets, M_targets = load_data_filter(
    location_features_drugs+"drug_targets.txt", drugs)
R_1d2d, M_1d2d = load_data_filter(
    location_features_drugs+"drug_1d2d.txt", drugs)
#R_1d2d_std, M_1d2d_std = load_data_filter(location_features_drugs+"drug_1d2d_std.txt", drugs)

# Feature matrices grouped per entity type.
features_drugs = [R_fp, R_targets, R_1d2d]
features_cell_lines = [R_cnv, R_mutation]
''' Settings '''
# Fractions from 0.15 to 0.9 in steps of 0.05; one experiment series is generated per value.
fractions_unknown = [
    0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5,
    0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9,
]
repeats = 20        # number of (M_train, M_test) splits generated per fraction
iterations = 1000
init_UV = 'random'
expo_prior = 1.
K = 2
metrics = ['MSE', 'R^2', 'Rp']


''' Load data '''
location = project_location+"DI_MMTF/data/datasets_drug_sensitivity/overlap/"
location_data = location+"data_row_01/"
# The loader returns four values; the last two are not needed here.
R, M_original, _, _ = load_data_without_empty(
    location_data+"ctrp_ec50_row_01.txt")


#''' Seed all of the methods the same '''
#numpy.random.seed(0)
#random.seed(0)


''' Generate matrices M - one list of (M_train,M_test)'s for each fraction '''
M_attempts = 10000
# Outer list: one entry per fraction. Inner list: `repeats` generated splits.
all_Ms_train_test = [
    [
        try_generate_M_from_M(M=M_original, fraction=fraction, attempts=M_attempts)
        for r in range(0, repeats)
    ]
    for fraction in fractions_unknown
]


''' Make sure each M has no empty rows or columns '''
def check_empty_rows_columns(M,fraction):
    # Sum of M along axis 0.
    sums_columns = M.sum(axis=0)
    # NOTE(review): the remainder of this function lies outside the visible source.
location_features_cell_lines = location + "features_cell_lines/"
location_kernels = location + "kernels_features/"

# One input file per dataset.
file_gdsc = location_data + "gdsc_ic50_row_01.txt"
file_ctrp = location_data + "ctrp_ec50_row_01.txt"
file_ccle_ic = location_data + "ccle_ic50_row_01.txt"
file_ccle_ec = location_data + "ccle_ec50_row_01.txt"

cell_lines, drugs = load_names()


''' Datasets containing all drugs and cell lines. '''
# Each (R, M) pair is loaded from the file of the same dataset. Previously the
# CTRP, CCLE IC50 and CCLE EC50 names were bound to each other's files, which
# disagreed with the filtered loads below.
R_gdsc, M_gdsc = load_data(file_gdsc)
R_ctrp, M_ctrp = load_data(file_ctrp)
R_ccle_ic, M_ccle_ic = load_data(file_ccle_ic)
R_ccle_ec, M_ccle_ec = load_data(file_ccle_ec)


''' Datasets containing only drugs and cell lines with observed entries. '''
# Besides R and M, the loader returns two further values, used below as
# indices into the cell line and drug name lists.
R_gdsc_filtered, M_gdsc_filtered, i_cl_gdsc, i_drugs_gdsc = \
    load_data_without_empty(file_gdsc)
R_ctrp_filtered, M_ctrp_filtered, i_cl_ctrp, i_drugs_ctrp = \
    load_data_without_empty(file_ctrp)
R_ccle_ic_filtered, M_ccle_ic_filtered, i_cl_ccle_ic, i_drugs_ccle_ic = \
    load_data_without_empty(file_ccle_ic)
R_ccle_ec_filtered, M_ccle_ec_filtered, i_cl_ccle_ec, i_drugs_ccle_ec = \
    load_data_without_empty(file_ccle_ec)

# Names of the cell lines / drugs selected by those indices, per dataset.
cell_lines_gdsc_filtered = numpy.array(cell_lines)[i_cl_gdsc]
drugs_gdsc_filtered = numpy.array(drugs)[i_drugs_gdsc]

cell_lines_ctrp_filtered = numpy.array(cell_lines)[i_cl_ctrp]
drugs_ctrp_filtered = numpy.array(drugs)[i_drugs_ctrp]

cell_lines_ccle_ic_filtered = numpy.array(cell_lines)[i_cl_ccle_ic]
drugs_ccle_ic_filtered = numpy.array(drugs)[i_drugs_ccle_ic]

cell_lines_ccle_ec_filtered = numpy.array(cell_lines)[i_cl_ccle_ec]
drugs_ccle_ec_filtered = numpy.array(drugs)[i_drugs_ccle_ec]
# NOTE(review): the statements down to the last print() presumably form the
# tail of print_overlap(), whose header lies outside the visible source -
# re-indent them under that def when splicing this back into the full file.
fraction_overlap_2 = n_overlap_2 / float(n_cell_lines * n_drugs)

# Overlap with the third comparison mask: elementwise product of the two masks.
M_overlap_3 = M_main * M3
n_overlap_3 = M_overlap_3.sum()
fraction_overlap_3 = n_overlap_3 / float(n_cell_lines * n_drugs)

# Single-argument, parenthesised print works as a statement on Python 2 and as
# a function call on Python 3, and emits the same text on both.
print("Dataset %s." % names[0])
print("Number cell lines: %s. Number drugs: %s." % (n_cell_lines, n_drugs))
print("Number observed: %s. Fraction observed: %s." % (n_observed, fraction_observed))
print("%s. Number overlap: %s. Fraction overlap: %s." % (names[1], n_overlap_1, fraction_overlap_1))
print("%s. Number overlap: %s. Fraction overlap: %s." % (names[2], n_overlap_2, fraction_overlap_2))
print("%s. Number overlap: %s. Fraction overlap: %s." % (names[3], n_overlap_3, fraction_overlap_3))


''' GDSC IC50 as the main dataset '''
# The main dataset is loaded with load_data_without_empty; the cell_lines /
# drugs it returns are passed to load_data_filter for the other three.
R_gdsc, M_gdsc, cell_lines, drugs = load_data_without_empty(
    location_data+"gdsc_ic50_row_01.txt")
R_ctrp, M_ctrp = load_data_filter(
    location_data+"ctrp_ec50_row_01.txt", cell_lines, drugs)
R_ccle_ic, M_ccle_ic = load_data_filter(
    location_data+"ccle_ic50_row_01.txt", cell_lines, drugs)
R_ccle_ec, M_ccle_ec = load_data_filter(
    location_data+"ccle_ec50_row_01.txt", cell_lines, drugs)

print_overlap(M_gdsc, M_ctrp, M_ccle_ic, M_ccle_ec,
              ['GDSC', 'CTRP', 'CCLE IC', 'CCLE EC'])


''' CTRP EC50 as the main dataset '''
# Same procedure, with CTRP EC50 supplying the cell_lines / drugs.
R_ctrp, M_ctrp, cell_lines, drugs = load_data_without_empty(
    location_data+"ctrp_ec50_row_01.txt")
R_ccle_ec, M_ccle_ec = load_data_filter(
    location_data+"ccle_ec50_row_01.txt", cell_lines, drugs)
R_gdsc, M_gdsc = load_data_filter(
    location_data+"gdsc_ic50_row_01.txt", cell_lines, drugs)
R_ccle_ic, M_ccle_ic = load_data_filter(
    location_data+"ccle_ic50_row_01.txt", cell_lines, drugs)

print_overlap(M_ctrp, M_gdsc, M_ccle_ic, M_ccle_ec,
              ['CTRP', 'GDSC', 'CCLE IC', 'CCLE EC'])


''' CCLE IC50 as the main dataset '''