sys.path.append(project_location)

from HMF.code.cross_validation.cross_validation_hmf import CrossValidation
from HMF.drug_sensitivity.load_dataset import load_data_without_empty, load_data_filter

import numpy


''' Load datasets '''
# Every input folder hangs off the "overlap" data directory of the project.
location = project_location + "HMF/drug_sensitivity/data/overlap/"
location_data = location + "data_row_01/"
location_features_drugs = location + "features_drugs/"
location_features_cell_lines = location + "features_cell_lines/"
location_kernels = location + "kernels_features/"

# CCLE EC50 is loaded first; the cell_lines and drugs it returns are then
# handed to load_data_filter for the remaining three datasets.
R_ccle_ec, M_ccle_ec, cell_lines, drugs = load_data_without_empty(
    location_data + "ccle_ec50_row_01.txt")
R_ctrp, M_ctrp = load_data_filter(
    location_data + "ctrp_ec50_row_01.txt", cell_lines, drugs)
R_gdsc, M_gdsc = load_data_filter(
    location_data + "gdsc_ic50_row_01.txt", cell_lines, drugs)
R_ccle_ic, M_ccle_ic = load_data_filter(
    location_data + "ccle_ic50_row_01.txt", cell_lines, drugs)


''' Settings HMF '''
# Alternative values noted by the original author: 500, 400, 2.
iterations = 200
burn_in = 150
thinning = 2
no_folds = 10

hyperparameters = {
    'alphatau' : 1.,
    'betatau'  : 1.,
    'alpha0'   : 0.001,
    'beta0'    : 0.001,
    'lambdaF'  : 0.1,
Beispiel #2
0
# Locate the project root five directories above this script and make it
# importable. The script path is made absolute first: os.path.dirname() of a
# bare relative filename is '', which would turn the result into
# "/../../../../../" (anchored at the filesystem root, not at this script).
project_location = os.path.dirname(os.path.abspath(__file__)) + "/../../../../../"
sys.path.append(project_location)

from HMF.code.models.bnmf_gibbs import bnmf_gibbs
from HMF.code.cross_validation.nested_matrix_cross_validation import MatrixNestedCrossValidation
from HMF.drug_sensitivity.load_dataset import load_data_without_empty

import numpy, random
''' Load datasets '''
# Every input folder hangs off the "overlap" data directory of the project.
location = project_location + "HMF/drug_sensitivity/data/overlap/"
location_data = location + "data_row_01/"
location_features_drugs = location + "features_drugs/"
location_features_cell_lines = location + "features_cell_lines/"
location_kernels = location + "kernels_features/"

# Each dataset is loaded on its own; the third and fourth return values of
# load_data_without_empty are discarded here.
R_gdsc, M_gdsc, _, _ = load_data_without_empty(location_data + "gdsc_ic50_row_01.txt")
R_ctrp, M_ctrp, _, _ = load_data_without_empty(location_data + "ctrp_ec50_row_01.txt")
R_ccle_ec, M_ccle_ec, _, _ = load_data_without_empty(location_data + "ccle_ec50_row_01.txt")
R_ccle_ic, M_ccle_ic, _, _ = load_data_without_empty(location_data + "ccle_ic50_row_01.txt")

# The CCLE EC50 matrices are the ones selected as R and M.
R = R_ccle_ec
M = M_ccle_ec
''' Settings BNMF '''
no_folds = 10
no_threads = 5
iterations = 1000
burn_in = 900
thinning = 2
init_UV = 'random'

# Candidate values of K: 1, 2 and 3.
K_range = range(1, 4)
Beispiel #3
0
import numpy, random, itertools


''' Model settings '''
# Number of trees.
n_estimators = 100
# Depth limit for the feature splits; None leaves it unset.
max_depth = None


''' Load datasets '''
# Every input folder hangs off the "overlap" data directory of the project.
location = project_location + "HMF/drug_sensitivity/data/overlap/"
location_data = location + "data_row_01/"
location_features_drugs = location + "features_drugs/"
location_features_cell_lines = location + "features_cell_lines/"
location_kernels = location + "kernels_features/"

# GDSC IC50 is the main dataset; the cell_lines and drugs it returns are
# passed to load_data_filter for the feature files below.
R_main, M_main, cell_lines, drugs = load_data_without_empty(
    location_data + "gdsc_ic50_row_01.txt")

# Cell line features (the commented-out datasets are currently disabled).
R_cnv, M_cnv = load_data_filter(
    location_features_cell_lines + "cnv.txt", cell_lines)
#R_cnv_std,  M_cnv_std =  load_data_filter(location_features_cell_lines+"cnv_std.txt",             cell_lines)
R_mutation, M_mutation = load_data_filter(
    location_features_cell_lines + "mutation.txt", cell_lines)
#R_ge,       M_ge =       load_data_filter(location_features_cell_lines+"gene_expression.txt",     cell_lines)
#R_ge_std,   M_ge_std =   load_data_filter(location_features_cell_lines+"gene_expression_std.txt", cell_lines)

# Drug features (the commented-out dataset is currently disabled).
R_fp, M_fp = load_data_filter(
    location_features_drugs + "drug_fingerprints.txt", drugs)
R_targets, M_targets = load_data_filter(
    location_features_drugs + "drug_targets.txt", drugs)
R_1d2d, M_1d2d = load_data_filter(
    location_features_drugs + "drug_1d2d.txt", drugs)
#R_1d2d_std, M_1d2d_std = load_data_filter(location_features_drugs+"drug_1d2d_std.txt",     drugs)

# Only the R matrices are collected into the feature lists.
features_drugs = [R_fp, R_targets, R_1d2d]
features_cell_lines = [R_cnv, R_mutation]
Beispiel #4
0
''' Settings '''
# Fractions passed to try_generate_M_from_M below, from 0.15 up to 0.9.
fractions_unknown = [
    0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5,
    0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9,
]
repeats = 20
iterations = 1000

init_UV = 'random'
expo_prior = 1.
K = 2

metrics = ['MSE', 'R^2', 'Rp']

''' Load data '''
location = project_location + "DI_MMTF/data/datasets_drug_sensitivity/overlap/"
location_data = location + "data_row_01/"
R, M_original, _, _ = load_data_without_empty(
    location_data + "ctrp_ec50_row_01.txt")

#''' Seed all of the methods the same '''
#numpy.random.seed(0)
#random.seed(0)

''' Generate matrices M - one list of (M_train,M_test)'s for each fraction '''
M_attempts = 10000
# Outer list: one entry per fraction; inner list: `repeats` independent
# calls to try_generate_M_from_M for that fraction.
all_Ms_train_test = [
    [
        try_generate_M_from_M(
            M=M_original, fraction=fraction, attempts=M_attempts)
        for r in range(repeats)
    ]
    for fraction in fractions_unknown
]

''' Make sure each M has no empty rows or columns '''
def check_empty_rows_columns(M,fraction):
    sums_columns = M.sum(axis=0)
Beispiel #5
0
location_features_cell_lines = location + "features_cell_lines/"
location_kernels = location + "kernels_features/"

# Paths of the four drug sensitivity datasets.
file_gdsc = location_data + "gdsc_ic50_row_01.txt"
file_ctrp = location_data + "ctrp_ec50_row_01.txt"
file_ccle_ic = location_data + "ccle_ic50_row_01.txt"
file_ccle_ec = location_data + "ccle_ec50_row_01.txt"

cell_lines, drugs = load_names()
''' Datasets containing all drugs and cell lines. '''
# Each (R, M) pair is read from the file that matches its name. Previously
# the CTRP, CCLE IC50 and CCLE EC50 pairs were loaded from each other's
# files, so every later use of those variables saw the wrong dataset.
R_gdsc, M_gdsc = load_data(file_gdsc)
R_ctrp, M_ctrp = load_data(file_ctrp)
R_ccle_ic, M_ccle_ic = load_data(file_ccle_ic)
R_ccle_ec, M_ccle_ec = load_data(file_ccle_ec)
''' Datasets containing only drugs and cell lines with observed entries. '''
# load_data_without_empty returns, besides R and M, two index collections
# that are used below to select the matching cell line and drug names.
R_gdsc_filtered, M_gdsc_filtered, i_cl_gdsc, i_drugs_gdsc = \
    load_data_without_empty(file_gdsc)
R_ctrp_filtered, M_ctrp_filtered, i_cl_ctrp, i_drugs_ctrp = \
    load_data_without_empty(file_ctrp)
R_ccle_ic_filtered, M_ccle_ic_filtered, i_cl_ccle_ic, i_drugs_ccle_ic = \
    load_data_without_empty(file_ccle_ic)
R_ccle_ec_filtered, M_ccle_ec_filtered, i_cl_ccle_ec, i_drugs_ccle_ec = \
    load_data_without_empty(file_ccle_ec)

# Names of the cell lines and drugs kept for each filtered dataset.
cell_lines_gdsc_filtered = numpy.array(cell_lines)[i_cl_gdsc]
drugs_gdsc_filtered = numpy.array(drugs)[i_drugs_gdsc]

cell_lines_ctrp_filtered = numpy.array(cell_lines)[i_cl_ctrp]
drugs_ctrp_filtered = numpy.array(drugs)[i_drugs_ctrp]

cell_lines_ccle_ic_filtered = numpy.array(cell_lines)[i_cl_ccle_ic]
drugs_ccle_ic_filtered = numpy.array(drugs)[i_drugs_ccle_ic]

cell_lines_ccle_ec_filtered = numpy.array(cell_lines)[i_cl_ccle_ec]
drugs_ccle_ec_filtered = numpy.array(drugs)[i_drugs_ccle_ec]
    fraction_overlap_2 = n_overlap_2 / float(n_cell_lines * n_drugs)
    
    M_overlap_3 = M_main * M3
    n_overlap_3 = M_overlap_3.sum()
    fraction_overlap_3 = n_overlap_3 / float(n_cell_lines * n_drugs)
    
    print "Dataset %s." % names[0]
    print "Number cell lines: %s. Number drugs: %s." % (n_cell_lines,n_drugs)
    print "Number observed: %s. Fraction observed: %s." % (n_observed,fraction_observed)
    print "%s. Number overlap: %s. Fraction overlap: %s." % (names[1],n_overlap_1,fraction_overlap_1)
    print "%s. Number overlap: %s. Fraction overlap: %s." % (names[2],n_overlap_2,fraction_overlap_2)
    print "%s. Number overlap: %s. Fraction overlap: %s." % (names[3],n_overlap_3,fraction_overlap_3)


''' GDSC IC50 as the main dataset '''
# The main dataset is loaded with load_data_without_empty; the cell_lines and
# drugs it returns are passed to load_data_filter for the other datasets.
R_gdsc, M_gdsc, cell_lines, drugs = load_data_without_empty(
    location_data + "gdsc_ic50_row_01.txt")
R_ctrp, M_ctrp = load_data_filter(
    location_data + "ctrp_ec50_row_01.txt", cell_lines, drugs)
R_ccle_ic, M_ccle_ic = load_data_filter(
    location_data + "ccle_ic50_row_01.txt", cell_lines, drugs)
R_ccle_ec, M_ccle_ec = load_data_filter(
    location_data + "ccle_ec50_row_01.txt", cell_lines, drugs)

# First mask and first name belong to the main dataset.
print_overlap(
    M_gdsc, M_ctrp, M_ccle_ic, M_ccle_ec,
    ['GDSC', 'CTRP', 'CCLE IC', 'CCLE EC'])

''' CTRP EC50 as the main dataset '''
# Same procedure, now with CTRP supplying cell_lines and drugs.
R_ctrp, M_ctrp, cell_lines, drugs = load_data_without_empty(
    location_data + "ctrp_ec50_row_01.txt")
R_ccle_ec, M_ccle_ec = load_data_filter(
    location_data + "ccle_ec50_row_01.txt", cell_lines, drugs)
R_gdsc, M_gdsc = load_data_filter(
    location_data + "gdsc_ic50_row_01.txt", cell_lines, drugs)
R_ccle_ic, M_ccle_ic = load_data_filter(
    location_data + "ccle_ic50_row_01.txt", cell_lines, drugs)

print_overlap(
    M_ctrp, M_gdsc, M_ccle_ic, M_ccle_ec,
    ['CTRP', 'GDSC', 'CCLE IC', 'CCLE EC'])

''' CCLE IC50 as the main dataset '''