import logging
import os

import numpy as np
import pandas as pd
from keras import backend as K
from keras.callbacks import EarlyStopping
from keras.layers import Dense
from keras.models import Sequential
from keras.regularizers import l1, l2
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split

# Pearson correlation is the scoring function used throughout.
r = pearsonr

# retrieve_data, convert_to_individual_alleles, compile_model_cnn,
# compile_model_mlp and the shared early_stopper callback are defined
# elsewhere in the repository.


def lin_models(lasso=True, traits=['height', 'BMI', 'WHR', 'BHMD', 'SBP'],
               nbsnps=10000, verbose=0, hot=False, unif=False, reps=1):
    """Fit a penalised linear model (LASSO with l1, ridge with l2) per trait
    and report validation/test Pearson r."""
    alpha = [0.01]
    R = {}
    for t in traits:
        print(t)
        x_tr, x_tst, y_tr, y_tst = retrieve_data(t, nbsnps, unif=unif)
        x_tr, x_val, y_tr, y_val = train_test_split(x_tr, y_tr,
                                                    test_size=0.33)
        if hot:
            # One-hot encode genotypes as individual alleles.
            x_tr = convert_to_individual_alleles(x_tr)
            x_val = convert_to_individual_alleles(x_val)
            x_tst = convert_to_individual_alleles(x_tst)
        nb_snps = x_tr.shape[1]
        res = np.zeros((len(alpha), 2))  # columns: validation r, test r
        n = 0
        for a in alpha:
            print(a)
            for i in range(reps):
                # A single Dense unit with an l1/l2 kernel penalty is the
                # Keras formulation of LASSO/ridge regression.
                m = Sequential()
                if lasso:
                    m.add(Dense(1, input_dim=nb_snps,
                                kernel_regularizer=l1(a)))
                else:
                    m.add(Dense(1, input_dim=nb_snps,
                                kernel_regularizer=l2(a)))
                m.compile(loss='mse', optimizer='adam')
                m.fit(x_tr, y_tr, epochs=1000, callbacks=[EarlyStopping()],
                      validation_data=(x_val, y_val), verbose=verbose)
                # Keep the best of `reps` runs, judged on the validation set.
                r_val = r(m.predict(x_val).ravel(), y_val)[0]
                if r_val > res[n, 0]:
                    print(r_val)
                    print(i)
                    res[n, 0] = r_val
                    res[n, 1] = r(m.predict(x_tst).ravel(), y_tst)[0]
                K.clear_session()  # free the graph between runs
            print(res[n, 1])
            n += 1
        R[t + "val"] = res[:, 0]
        R[t + "tst"] = res[:, 1]
    R["alpha"] = alpha
    csv = pd.DataFrame(R).to_csv(float_format='%.3f', index=False)
    print(csv)
    logging.info(csv)
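# Hypothetical baseline (not part of the original repo): scikit-learn's
# coordinate-descent LASSO on the same split gives a quick sanity check of
# the Keras formulation above. Note the penalty scalings of the two
# libraries differ, so the alpha values are not directly comparable.
def lasso_baseline(x_tr, y_tr, x_val, y_val, alpha=0.01):
    from sklearn.linear_model import Lasso
    lasso = Lasso(alpha=alpha)
    lasso.fit(x_tr, y_tr)
    # Validation Pearson r, comparable to res[n, 0] in lin_models.
    return pearsonr(lasso.predict(x_val), y_val)[0]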
def train_and_score(geneparam, dataset):
    """Train a CNN with the given parameters and return its test Pearson r.

    Args:
        geneparam (dict): the parameters of the network
        dataset: descriptor with `trait`, `k` and `unif` attributes,
            selecting the dataset to use for training/evaluating
    """
    logging.info("Getting datasets")
    x_train, x_test, y_train, y_test = retrieve_data(dataset.trait, dataset.k,
                                                     unif=dataset.unif)
    # Conv1D expects a trailing channel axis: (samples, snps, 1).
    train_data = np.expand_dims(x_train, axis=2)
    test_data = np.expand_dims(x_test, axis=2)
    input_shape = (x_train.shape[1], 1)

    logging.info("Compiling Keras model")
    model = compile_model_cnn(geneparam, input_shape)
    # The held-out split doubles as the early-stopping validation set here.
    model.fit(train_data, y_train,
              epochs=1200,  # early stopping sets the real limit - don't
                            # waste time on horrible architectures
              verbose=1,
              validation_data=(test_data, y_test),
              callbacks=[early_stopper])

    score = model.evaluate(test_data, y_test, verbose=0)
    print('Test mse:', score[0])
    print('Test mae:', score[1])
    r = pearsonr(model.predict(test_data).ravel(), y_test)[0]
    print("Test r:", r)

    # We do not care about keeping any of this in memory -
    # we just need the final scores and the architecture.
    K.clear_session()

    # pearsonr returns NaN for degenerate (e.g. constant) predictions;
    # NaN != NaN, so penalise those runs instead of propagating NaN.
    if r != r:
        r = -1.0
    return r
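# train_and_score reads `trait`, `k` and `unif` off its `dataset` argument
# and relies on the module-level early_stopper callback. The sketch below is
# a hypothetical stand-in for both; the patience value is an assumption, not
# taken from the source.
from collections import namedtuple

Dataset = namedtuple('Dataset', ['trait', 'k', 'unif'])
# early_stopper = EarlyStopping(monitor='val_loss', patience=20)

# Example call, reusing the cnn1 parameters from CNN() below:
# train_and_score({'optimizer': 'nadam', 'size_window': 2,
#                  'activation': 'softplus', 'nb_neurons': 64,
#                  'stride': 'one', 'nb_cnn_layers': 1, 'filters': 16,
#                  'weight_decay': 0.0, 'nb_layers': 3, 'dropout': 0.01,
#                  'batch_norm': True},
#                 Dataset(trait='height', k=10000, unif=False))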
def CNN(traits=['height', 'BMI', 'WHR', 'BHMD', 'SBP'], verbose=0, unif=False,
        nbsnps=10000, p=None, reps=1):
    """Train the three pre-selected CNN architectures (or only the one at
    index `p`) on each trait and report validation/test Pearson r."""
    # cnn1
    param = [{
        'optimizer': 'nadam',
        'size_window': 2,
        'activation': 'softplus',
        'nb_neurons': 64,
        'stride': 'one',
        'nb_cnn_layers': 1,
        'filters': 16,
        'weight_decay': 0.0,
        'nb_layers': 3,
        'dropout': 0.01,
        'batch_norm': True
    }]
    # cnn2
    param.append({
        'optimizer': 'nadam',
        'size_window': 2,
        'activation': 'elu',
        'nb_neurons': 32,
        'stride': 'one',
        'nb_cnn_layers': 1,
        'filters': 32,
        'weight_decay': 0.0,
        'nb_layers': 3,
        'dropout': 0.01,
        'batch_norm': False
    })
    # cnn3
    param.append({
        'optimizer': 'rmsprop',
        'size_window': 3,
        'activation': 'linear',
        'nb_neurons': 32,
        'stride': 'one',
        'nb_cnn_layers': 1,
        'filters': 16,
        'weight_decay': 0.0,
        'nb_layers': 1,
        'dropout': 0.01,
        'batch_norm': False
    })
    R = {}
    for t in traits:
        best = 0
        print(t)
        x_tr, x_tst, y_tr, y_tst = retrieve_data(t, nbsnps, unif=unif)
        x_tr, x_val, y_tr, y_val = train_test_split(x_tr, y_tr,
                                                    test_size=0.33)
        n_snps = x_tr.shape[1]
        x_tr = np.expand_dims(x_tr, axis=2)
        x_val = np.expand_dims(x_val, axis=2)
        x_tst = np.expand_dims(x_tst, axis=2)
        f = os.path.join(
            os.path.expanduser("~"), 'Code/genomic_cnn/models',
            "Model_" + t + "_cnn_" + str(n_snps // 1000) + "k" +
            ("_unif" if unif else "_best") + ".h5")
        n = 0
        if p is None:
            # Try every architecture; keep the best of `reps` runs each and
            # save the best model per trait to disk.
            res = np.zeros((len(param), 2))  # columns: validation r, test r
            for g in param:
                print(g)
                for x in range(reps):
                    m = compile_model_cnn(g, (n_snps, 1))
                    m.fit(x_tr, y_tr, epochs=1200, verbose=verbose,
                          validation_data=(x_val, y_val),
                          callbacks=[early_stopper])
                    r_val = r(m.predict(x_val).ravel(), y_val)[0]
                    if r_val > res[n, 0]:
                        print(r_val)
                        print(x)
                        res[n, 0] = r_val
                        res[n, 1] = r(m.predict(x_tst).ravel(), y_tst)[0]
                        if res[n, 0] > best:
                            print("A better network was found with r: %.3f"
                                  % res[n, 0])
                            print(g)
                            m.save(f)
                            best = res[n, 0]
                    K.clear_session()  # free the graph between runs, as MLP does
                n = n + 1
        else:
            # Train only architecture `p`, once per repetition.
            res = np.zeros((reps, 2))
            g = param[p]
            for i in range(reps):
                m = compile_model_cnn(g, (n_snps, 1))
                m.fit(x_tr, y_tr, epochs=1200, verbose=verbose,
                      validation_data=(x_val, y_val),
                      callbacks=[early_stopper])
                res[i, :] = (r(m.predict(x_val).ravel(), y_val)[0],
                             r(m.predict(x_tst).ravel(), y_tst)[0])
        R[t + "_val"] = res[:, 0]  # column 0 is the validation r
        R[t + "_tst"] = res[:, 1]
    csv = pd.DataFrame(R).to_csv(float_format='%.3f', index=False)
    print(csv)
    logging.info(csv)
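# compile_model_cnn is defined elsewhere in the repository; the sketch below
# is a hypothetical reading of how the geneparam keys could map onto a Keras
# model. The layer ordering and the 'stride' encoding are assumptions; the
# mse/mae pair matches what train_and_score reports.
from keras.layers import BatchNormalization, Conv1D, Dropout, Flatten


def compile_model_cnn_sketch(geneparam, input_shape):
    # Assumed encoding: 'one' means stride 1, otherwise stride by the window.
    strides = 1 if geneparam['stride'] == 'one' else geneparam['size_window']
    m = Sequential()
    for i in range(geneparam['nb_cnn_layers']):
        kwargs = dict(kernel_size=geneparam['size_window'], strides=strides,
                      activation=geneparam['activation'],
                      kernel_regularizer=l2(geneparam['weight_decay']))
        if i == 0:
            kwargs['input_shape'] = input_shape
        m.add(Conv1D(geneparam['filters'], **kwargs))
        if geneparam['batch_norm']:
            m.add(BatchNormalization())
    m.add(Flatten())
    for _ in range(geneparam['nb_layers']):
        m.add(Dense(geneparam['nb_neurons'],
                    activation=geneparam['activation'],
                    kernel_regularizer=l2(geneparam['weight_decay'])))
        m.add(Dropout(geneparam['dropout']))
    m.add(Dense(1))  # single continuous output per trait
    m.compile(loss='mse', optimizer=geneparam['optimizer'], metrics=['mae'])
    return m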
def MLP(traits=['height', 'BMI', 'WHR', 'BHMD', 'SBP'], verbose=0, unif=False,
        nbsnps=10000, p=None, reps=1, hot=False):
    """Train the three pre-selected MLP architectures (or only the one at
    index `p`) on each trait and report validation/test Pearson r."""
    # mlp1
    geneparam = [{
        'optimizer': 'rmsprop',
        'activation': 'elu',
        'nb_neurons': 32,
        'weight_decay': 0.01,
        'nb_layers': 1,
        'dropout': 0.02
    }]
    # mlp2
    geneparam.append({
        'optimizer': 'adagrad',
        'activation': 'elu',
        'nb_neurons': 64,
        'weight_decay': 0.01,
        'nb_layers': 2,
        'dropout': 0.03
    })
    # mlp3
    geneparam.append({
        'optimizer': 'adam',
        'activation': 'softplus',
        'nb_neurons': 32,
        'weight_decay': 0.01,
        'nb_layers': 5,
        'dropout': 0.02
    })
    R = {}
    for t in traits:
        print(t)
        best = 0
        x_tr, x_tst, y_tr, y_tst = retrieve_data(t, nbsnps, unif=unif)
        x_tr, x_val, y_tr, y_val = train_test_split(x_tr, y_tr,
                                                    test_size=0.33)
        if hot:
            # One-hot encode genotypes as individual alleles.
            x_tr = convert_to_individual_alleles(x_tr)
            x_val = convert_to_individual_alleles(x_val)
            x_tst = convert_to_individual_alleles(x_tst)
            n_snps = x_tr.shape[1]
            f = os.path.join(
                os.path.expanduser("~"), 'Code/genomic_cnn/models',
                "Model_" + t + "_mlp_" + str(n_snps // 1000) + "kHot" +
                ("_unif" if unif else "_best") + ".h5")
        else:
            n_snps = x_tr.shape[1]
            f = os.path.join(
                os.path.expanduser("~"), 'Code/genomic_cnn/models',
                "Model_" + t + "_mlp_" + str(n_snps // 1000) + "k" +
                ("_unif" if unif else "_best") + ".h5")
        n = 0
        if p is None:
            # Try every architecture; keep the best of `reps` runs each and
            # save the best model per trait to disk.
            res = np.zeros((len(geneparam), 2))  # validation r, test r
            for g in geneparam:
                print(g)
                for x in range(reps):
                    m = compile_model_mlp(g, n_snps)
                    m.fit(x_tr, y_tr, epochs=1200,
                          validation_data=(x_val, y_val),
                          callbacks=[early_stopper], verbose=verbose)
                    r_val = r(m.predict(x_val).ravel(), y_val)[0]
                    if r_val > res[n, 0]:
                        print(r_val)
                        print(x)
                        res[n, 0] = r_val
                        res[n, 1] = r(m.predict(x_tst).ravel(), y_tst)[0]
                        if res[n, 0] > best:
                            print("A better network was found with r: %.3f"
                                  % res[n, 0])
                            print(g)
                            m.save(f)
                            best = res[n, 0]
                    K.clear_session()
                n = n + 1
        else:
            # Train only architecture `p`, once per repetition.
            res = np.zeros((reps, 2))
            g = geneparam[p]
            for i in range(reps):
                m = compile_model_mlp(g, n_snps)
                m.fit(x_tr, y_tr, epochs=1200, verbose=verbose,
                      validation_data=(x_val, y_val),
                      callbacks=[early_stopper])
                res[i, :] = (r(m.predict(x_val).ravel(), y_val)[0],
                             r(m.predict(x_tst).ravel(), y_tst)[0])
        R[t + "_val"] = res[:, 0]  # column 0 is the validation r
        R[t + "_tst"] = res[:, 1]
    csv = pd.DataFrame(R).to_csv(float_format='%.3f', index=False)
    print(csv)
    logging.info(csv)
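# Likewise, compile_model_mlp lives elsewhere in the repo; this hypothetical
# counterpart applies the same caveats as compile_model_cnn_sketch above.
def compile_model_mlp_sketch(geneparam, nb_snps):
    m = Sequential()
    for i in range(geneparam['nb_layers']):
        kwargs = dict(activation=geneparam['activation'],
                      kernel_regularizer=l2(geneparam['weight_decay']))
        if i == 0:
            kwargs['input_dim'] = nb_snps
        m.add(Dense(geneparam['nb_neurons'], **kwargs))
        m.add(Dropout(geneparam['dropout']))
    m.add(Dense(1))
    m.compile(loss='mse', optimizer=geneparam['optimizer'], metrics=['mae'])
    return m

# Example run: repeat only mlp2 (p=1) three times on height with one-hot
# allele encoding:
# MLP(traits=['height'], p=1, reps=3, hot=True)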