def filter_features_2bit(args):
    from utils import read_hdf5_dataset

    logger.info('read genotypes from: ' + args.genotype_file)
    genotypes = read_hdf5_dataset(args.genotype_file)
    indices = None
    if args.indices_file is not None:
        logger.info('read indices from: ' + args.indices_file)
        indices = read_hdf5_dataset(args.indices_file)
        genotypes = np.take(genotypes, indices, axis=0)
        logger.info('number of samples: %d' % indices.shape[0])
    pvalues = {}
    for phenotype_file in args.phenotype_file:
        logger.info('read phenotypes from: ' + phenotype_file)
        phenotypes, dataset = read_hdf5_dataset(phenotype_file, return_name=True)
        if indices is not None:
            phenotypes = np.take(phenotypes, indices)
        if args.metric == 'anova':
            logger.info('calculate ANOVA p-values')
            pvalues[dataset] = fast_anova_2bit(genotypes, phenotypes)
    logger.info('save p-values to file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with h5py.File(args.output_file, 'w') as f:
        for dataset in pvalues.keys():
            f.create_dataset(dataset, data=pvalues[dataset])
def random_cv_split(args):
    import numpy as np
    import h5py
    from utils import read_hdf5_single, cv_split_emaize, get_indices_table, prepare_output_file, read_hdf5_dataset

    logger.info('read training indices file: ' + args.train_index_file)
    train_indices_all = read_hdf5_dataset(args.train_index_file)
    logger.info('read parent table file: ' + args.parent_table_file)
    parent_table = read_hdf5_single(args.parent_table_file)
    indices_table, mask = get_indices_table(train_indices_all, parent_table)
    logger.info('create output file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with h5py.File(args.output_file, 'w') as f:
        for k in range(args.n_datasets):
            # hold out 5 random rows and 5 random columns of the parent table
            row_indices = np.random.choice(indices_table.shape[0], 5, replace=False)
            col_indices = np.random.choice(indices_table.shape[1], 5, replace=False)
            test_indices = np.union1d(indices_table[row_indices, :].reshape((-1,)),
                                      indices_table[:, col_indices].reshape((-1,)))
            train_indices = np.setdiff1d(train_indices_all, test_indices)
            test_indices = np.intersect1d(test_indices, train_indices_all)
            train_indices = np.intersect1d(train_indices, train_indices_all)
            g = f.create_group(str(k))
            g.create_dataset('train', data=train_indices)
            g.create_dataset('test', data=test_indices)
def test_single_snp(args):
    import fastlmm
    from pysnptools.snpreader import SnpData, Pheno, SnpReader
    from fastlmm.association import single_snp
    from utils import read_hdf5_dataset
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    import fastlmm.util.util as flutil

    logger.info('read phenotypes from file: ' + args.phenotype_file)
    phenotypes = pd.read_table(args.phenotype_file)
    iid = np.repeat(phenotypes['id'].values.astype('S')[:, np.newaxis], 2, axis=1)
    if args.sample_indices_file is not None:
        logger.info('read indices from file: ' + args.sample_indices_file)
        sample_indices = read_hdf5_dataset(args.sample_indices_file)
    else:
        sample_indices = np.nonzero((phenotypes['type'] == 'training').values)[0]
    logger.info('read SNP file (for test): ' + args.snp_file)
    test_snps = get_snpdata(iid, args.snp_file, sample_indices=sample_indices)
    logger.info('read SNP file (for K0): ' + args.k0_file)
    K0 = get_snpdata(iid, args.k0_file)
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    df_pheno = phenotypes[phenotypes['type'] == 'training'].copy()
    df_pheno['fid'] = df_pheno['id']
    df_pheno['iid'] = df_pheno['id']
    traits = ('trait1', 'trait2', 'trait3')
    for trait in traits:
        pheno_file = os.path.join(args.output_dir, 'pheno.%s.txt' % trait)
        logger.info('create Pheno file: ' + pheno_file)
        df_pheno[['fid', 'iid', trait]].to_csv(pheno_file, index=False, sep='\t', header=False)
        pheno = Pheno(pheno_file)
        logger.info('run FastLMM for single SNP test for %s' % trait)
        results_df = single_snp(test_snps, pheno, K0=K0, count_A1=True, GB_goal=args.GB_goal)
        result_file = os.path.join(args.output_dir, 'single_snp.' + trait)
        logger.info('save results to file: ' + result_file)
        results_df.to_hdf(result_file, trait)
        if args.manhattan:
            plot_file = os.path.join(args.output_dir, 'manhattan.%s.pdf' % trait)
            logger.info('create Manhattan plot: ' + plot_file)
            plt.clf()
            # DataFrame.as_matrix() was removed in recent pandas; use .values instead
            flutil.manhattan_plot(results_df[['Chr', 'ChrPos', 'PValue']].values,
                                  pvalue_line=1e-5, xaxis_unit_bp=False)
            plt.savefig(plot_file)
def anova_linregress(args):
    from utils import read_hdf5_dataset
    from tqdm import tqdm
    from statsmodels.sandbox.stats.multicomp import multipletests

    logger.info('read genotypes from: ' + args.genotype_file)
    genotypes = read_hdf5_dataset(args.genotype_file)
    indices = None
    if args.sample_indices_file is not None:
        logger.info('read indices from: ' + args.sample_indices_file)
        indices = read_hdf5_dataset(args.sample_indices_file)
        genotypes = np.take(genotypes, indices, axis=1)
        logger.info('number of samples: %d' % indices.shape[0])
    logger.info('read phenotypes from: ' + args.phenotype_file)
    phenotypes, dataset = read_hdf5_dataset(args.phenotype_file, return_name=True)
    logger.info('perform ANOVA for dataset: %s' % dataset)
    if indices is not None:
        phenotypes = np.take(phenotypes, indices)
    if args.batch_size is not None:
        # run the regression in batches of SNPs to limit memory usage
        slicegen = BatchSliceGenerator(genotypes.shape[0], batch_size=args.batch_size)
        outputs = []
        for start, stop in tqdm(slicegen(), total=slicegen.n_batches):
            outputs.append(fast_linregress(genotypes[start:stop], phenotypes))
        w, b, pvalues = [np.concatenate(a) for a in zip(*outputs)]
        del outputs
    else:
        w, b, pvalues = fast_linregress(genotypes, phenotypes)
    reject, qvalues, _, _ = multipletests(pvalues, alpha=args.alpha, method='fdr_bh')
    reject = np.nonzero(reject)[0]
    logger.info('save results to file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with h5py.File(args.output_file, 'w') as f:
        f.create_dataset('pvalue', data=pvalues)
        f.create_dataset('slope', data=w.astype('float32'))
        f.create_dataset('intercept', data=b.astype('float32'))
        f.create_dataset('qvalue', data=qvalues)
        f.create_dataset('reject', data=reject)
def normalize_genotypes(args):
    from utils import read_hdf5_dataset, prepare_output_file
    import numpy as np
    import h5py

    logger.info('read input file: ' + args.input_file)
    X, dataset = read_hdf5_dataset(args.input_file, return_name=True)
    n_snps, n_samples = X.shape
    # allele frequencies
    p = X.sum(axis=1).astype('float32') / n_samples
    multiplier = 1.0 / np.sqrt(2.0 * p * (1.0 - p))
    multiplier = multiplier.astype('float32')
    logger.info('save mean and multipliers to output file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with h5py.File(args.output_file, 'w') as f:
        f.create_dataset('mean', data=p)
        f.create_dataset('multiplier', data=multiplier)
def create_gsm(args):
    import h5py
    import numpy as np
    from utils import prepare_output_file, read_hdf5_dataset
    '''
    logger.info('read genomic positions from file: ' + args.genomic_pos_file)
    positions = {}
    with h5py.File(args.genomic_pos_file, 'r') as f:
        for i in range(1, 11):
            positions['chr%d' % i] = f['chr%d' % i][:]
    n_snps_per_chrom = {chrom: positions[chrom].shape[0] for chrom in positions.keys()}
    n_snps_total = sum(n_snps_per_chrom.values())
    X = []
    for chrom in positions.keys():
        genotype_file = os.path.join(args.input_dir, chrom)
        logger.info('read genotype file: ' + genotype_file)
        with h5py.File(genotype_file, 'r') as f:
            n_sel = int(np.round(args.n_snps * float(n_snps_per_chrom[chrom]) / n_snps_total))
            ind = np.random.choice(n_snps_per_chrom[chrom], size=n_sel)
            X.append(f['data'][:][ind])
    X = np.concatenate(X, axis=0).astype('float32')
    '''
    logger.info('read genotypes from file: ' + args.input_file)
    X = read_hdf5_dataset(args.input_file).astype('float64')
    logger.info('number of selected SNPs: %d' % X.shape[0])
    logger.info('calculate GSM')
    # center and scale each SNP before computing the genetic similarity matrix
    X -= X.mean(axis=1)[:, np.newaxis]
    X_std = np.sqrt(np.sum(X**2, axis=1))
    X_std[np.isclose(X_std, 0.0)] = 1.0
    X = X / X_std[:, np.newaxis]
    logger.info('calculate K')
    K = np.dot(X.T, X)
    logger.info('run SVD on X')
    U, S, V = np.linalg.svd(X.T, full_matrices=False)
    V = V.T
    logger.info('save GSM to file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with h5py.File(args.output_file, 'w') as f:
        f.create_dataset('K', data=K)
        f.create_dataset('U', data=U)
        f.create_dataset('S', data=S)
        f.create_dataset('V', data=V)
def convert_2bit_to_minor(args):
    from utils import read_hdf5_dataset, prepare_output_file
    import numpy as np
    import h5py
    import numba

    @numba.jit(nopython=True)
    def _2bit_to_minor(X_2bit, X_minor):
        n_snps = X_minor.shape[0]
        n_samples = X_minor.shape[1]
        max_freq = n_samples
        for i in range(n_snps):
            freq = 0
            for j in range(n_samples):
                count = X_2bit[i, 1, j] - X_2bit[i, 0, j] + 1
                freq += count
                X_minor[i, j] = count
            if freq > n_samples:
                # the second allele is actually the major allele; swap copy numbers
                for j in range(n_samples):
                    X_minor[i, j] = 2 - X_minor[i, j]

    logger.info('read input file: ' + args.input_file)
    X_2bit, dataset = read_hdf5_dataset(args.input_file, return_name=True)
    n_snps, n_samples = X_2bit.shape
    # each SNP occupies two rows in the 2-bit encoding; use integer division
    n_snps //= 2
    logger.info('number of SNPs: %d, number of samples: %d' % (n_snps, n_samples))
    X_2bit = X_2bit.reshape((n_snps, 2, n_samples))
    logger.info('convert from 2bit code to minor copy numbers')
    # assume that the second allele in the 2bit representation is the minor allele
    # 10 -> 0, 11 -> 1, 01 -> 2
    #X_minor = np.einsum('ijk,j->ik', X_2bit, np.array([-1, 1])) + 1
    # swap the two alleles to make sure that the number represents a minor allele
    #allele_freq = np.expand_dims(np.sum(X_2bit[:, 1, :], axis=1), axis=1)
    #X_minor = np.where(allele_freq <= n_samples/2, X_minor, 2 - X_minor)
    X_minor = np.empty((n_snps, n_samples), dtype='int8')
    _2bit_to_minor(X_2bit, X_minor)
    logger.info('save minor allele copy numbers to output file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with h5py.File(args.output_file, 'w') as f:
        f.create_dataset(dataset, data=X_minor)
def evaluate(args):
    import h5py
    from sklearn.metrics import r2_score, mean_squared_error
    from scipy.stats import pearsonr
    from utils import prepare_output_file, read_hdf5_dataset

    logger.info('read prediction file: ' + args.input_file)
    with h5py.File(args.input_file, 'r') as f:
        y_true = f['y_true'][:]
        y_pred = f['y_pred'][:]
    logger.info('read sample indices file: ' + args.sample_indices_file)
    indices = read_hdf5_dataset(args.sample_indices_file)
    y_true = y_true[indices]
    y_pred = y_pred[indices]
    logger.info('save metrics file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with open(args.output_file, 'w') as f:
        f.write('r2\tmse\tpcc\n')
        f.write('%f' % r2_score(y_true, y_pred))
        f.write('\t%f' % mean_squared_error(y_true, y_pred))
        f.write('\t%f' % pearsonr(y_true, y_pred)[0])
        f.write('\n')
def run_regression(args):
    import h5py
    import numpy as np
    from utils import read_hdf5_dataset, standardize_genotypes
    from sklearn.metrics import r2_score, mean_squared_error
    from scipy.stats import pearsonr

    if args.gsm_file is not None:
        logger.info('read GSM file: ' + args.gsm_file)
        with h5py.File(args.gsm_file, 'r') as f:
            U = f['U'][:]
            S = f['S'][:]
        U = U * S[np.newaxis, :]
        U = U[:, S**2 > 0.5]
        U = standardize_genotypes(U)
        X = U
    else:
        logger.info('read genotype file: ' + args.genotype_file)
        X = read_hdf5_dataset(args.genotype_file)
        if args.transpose_x:
            logger.info('transpose X')
            X = X.T
        X = standardize_genotypes(X)
    logger.info('read phenotype file: ' + args.phenotype_file)
    y = read_hdf5_dataset(args.phenotype_file)
    logger.info('read training indices file: ' + args.train_index_file)
    train_index = read_hdf5_dataset(args.train_index_file)
    logger.info('read test indices file: ' + args.test_index_file)
    test_index = read_hdf5_dataset(args.test_index_file)
    if not os.path.exists(args.output_dir):
        logger.info('create output directory: ' + args.output_dir)
        os.makedirs(args.output_dir)
    logger.info('use model: ' + args.model_name)
    if args.model_name == 'mlp':
        import keras
        from keras.models import Sequential
        from keras.layers import Dense, Activation
        from keras import backend as K
        if K.backend() == 'tensorflow':
            # replace the original get_session() function
            keras.backend.tensorflow_backend.get_session.func_code = _get_session.func_code
        logger.info('build the model')
        model = Sequential()
        # feed-forward network
        model.add(Dense(500, input_dim=X.shape[1]))
        model.add(Activation('tanh'))
        model.add(Dense(100))
        model.add(Activation('tanh'))
        model.add(Dense(1))
        optimizer = keras.optimizers.RMSprop()
        model.compile(loss='mean_squared_error', optimizer=optimizer)
        callbacks = [keras.callbacks.CSVLogger(os.path.join(args.output_dir, 'train_log.csv'))]
        logger.info('train the model')
        model.fit(X[train_index], y[train_index], epochs=args.max_epochs, callbacks=callbacks)
        '''
        logger.info('save the model')
        model.save(os.path.join(args.output_dir, 'model'))
        '''
    else:
        logger.info('build the model')
        import dill as pickle
        if args.model_name == 'gpr':
            from sklearn.gaussian_process import GaussianProcessRegressor
            from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
            kernel = DotProduct(sigma_0=1.0)**4 + WhiteKernel()
            model = GaussianProcessRegressor(kernel=kernel, optimizer=None)
        elif args.model_name == 'ridge':
            from sklearn.linear_model import Ridge
            model = Ridge(alpha=1)
        logger.info('train the model')
        model.fit(X[train_index], y[train_index])
        '''
        logger.info('save the model')
        with open(os.path.join(args.output_dir, 'model'), 'wb') as fout:
            pickle.dump(model, fout)
        '''
    logger.info('test the model')
    y_pred = np.ravel(model.predict(X))
    logger.info('save predictions on the test set')
    fout = h5py.File(os.path.join(args.output_dir, 'predictions'), 'w')
    fout.create_dataset('y_true', data=y)
    fout.create_dataset('y_pred', data=y_pred)
    fout.close()
    for phase in ('train', 'test'):
        if phase == 'train':
            y_ = y[train_index]
            y_pred_ = y_pred[train_index]
        else:
            y_ = y[test_index]
            y_pred_ = y_pred[test_index]
        metrics = {}
        metrics['mean_squared_error'] = mean_squared_error(y_, y_pred_)
        metrics['r2_score'] = r2_score(y_, y_pred_)
        metrics['pearsonr'] = pearsonr(y_, y_pred_)[0]
        for metric_name, metric_value in metrics.items():
            logger.info('%s.%s = %f' % (phase, metric_name, metric_value))
        logger.info('save metrics')
        with open(os.path.join(args.output_dir, 'metrics.%s.txt' % phase), 'w') as fout:
            # write every metric, not just the last one from the loop above
            for metric_name, metric_value in metrics.items():
                fout.write('%s\t%f\n' % (metric_name, metric_value))
def run_metric_regressor(args):
    from utils import read_hdf5_dataset
    import h5py
    from metric_regressor import MetricRegressor
    import dill as pickle
    import numpy as np
    from utils import read_hdf5_single, cv_split_emaize, standardize_genotypes, get_indices_table

    logger.info('read genotype file: ' + args.genotype_file)
    X = read_hdf5_dataset(args.genotype_file)
    if args.transpose_genotype:
        X = X.T
    X = standardize_genotypes(X)
    logger.info('read GSM file: ' + args.gsm_file)
    with h5py.File(args.gsm_file, 'r') as f:
        U = f['U'][:]
        S = f['S'][:]
    U = U[:, S**2 > 0.5]
    U = standardize_genotypes(U)
    logger.info('read phenotype file: ' + args.phenotype_file)
    y = read_hdf5_dataset(args.phenotype_file)
    logger.info('read parent table file: ' + args.parent_table_file)
    parent_table = read_hdf5_single(args.parent_table_file)
    logger.info('read training indices file: ' + args.train_index_file)
    train_index = read_hdf5_dataset(args.train_index_file)
    logger.info('read test indices file: ' + args.test_index_file)
    test_index = read_hdf5_dataset(args.test_index_file)
    indices_table, mask = get_indices_table(train_index, parent_table)
    if args.cv_type == 's1f':
        train_index_list, test_index_list, s0_index_list = cv_split_emaize(
            indices_table, mask, k=parent_table.shape[0], method='s1f')
    elif args.cv_type == 's0':
        train_index_list, test_index_list, s0_index_list = cv_split_emaize(
            indices_table, mask,
            k1=parent_table.shape[0] // 5, k2=parent_table.shape[1] // 5, method='s0')
    elif args.cv_type == 's1m':
        train_index_list, test_index_list, s0_index_list = cv_split_emaize(
            indices_table, mask, k=parent_table.shape[1], method='s1m')
    else:
        raise ValueError('unknown cross-validation type: %s' % args.cv_type)
    logger.info('%d rows and %d columns in the indices table' % (indices_table.shape[0], indices_table.shape[1]))
    logger.info('number of cross-validation folds: %d' % len(train_index_list))
    logger.info('number of principal components to use: %d' % U.shape[1])
    X = np.concatenate([X, U], axis=1)
    y = y.reshape((-1, 1))
    X_train, y_train = X[train_index], y[train_index]

    def cv_generator(batch_size=5):
        # endlessly yield (train, test) mini-batches drawn from the CV folds
        while True:
            for i in range(len(train_index_list)):
                train_index = train_index_list[i]
                if args.cv_type == 's0':
                    test_index = test_index_list[i][s0_index_list[i]]
                else:
                    test_index = test_index_list[i]
                if (len(train_index) < batch_size) or (len(test_index) < batch_size):
                    continue
                train_index = np.random.choice(train_index, size=batch_size, replace=False)
                test_index = np.random.choice(test_index, size=batch_size, replace=False)
                yield (X_train[train_index], X_train[test_index],
                       y_train[train_index], y_train[test_index])

    model = MetricRegressor(input_dim=X.shape[1], hidden_dim=args.n_hidden,
                            alpha=args.alpha, sparse_rate=args.sparse_rate, kernel=args.kernel)
    model.fit(X_train, y_train, data_generator=cv_generator(batch_size=args.batch_size),
              lr=args.lr, max_iter=args.max_iter, n_batches=len(train_index_list))
    y_pred = model.predict(X)
    logger.info('save results to output directory: ' + args.output_dir)
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    model_file = os.path.join(args.output_dir, 'model')
    #with open(model_file, 'wb') as f:
    #    pickle.dump(model, f)
    model.save(model_file)
    pred_file = os.path.join(args.output_dir, 'predictions')
    with h5py.File(pred_file, 'w') as f:
        f.create_dataset('y_true', data=y)
        f.create_dataset('y_pred', data=y_pred)
        f.create_dataset('mses', data=model.mses_)
        f.create_dataset('velocities', data=model.velocities_)
        f.create_dataset('mse_grads', data=model.mse_grads_)
def plot_predictions(args):
    import h5py
    from utils import read_hdf5_single, prepare_output_file, read_hdf5_dataset
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    from matplotlib.backends.backend_pdf import PdfPages
    plt.rcParams['font.family'] = 'Arial'
    plt.rcParams['font.size'] = 12
    plt.rcParams['legend.fontsize'] = 12
    import numpy as np

    def normalize_phenotype(x, range_pheno=4.0):
        return (np.clip(x, -range_pheno, range_pheno) + range_pheno) / 2.0 / range_pheno

    logger.info('read parent table file: ' + args.parent_table_file)
    parent_table = read_hdf5_single(args.parent_table_file)
    logger.info('read predictions from file: ' + args.input_file)
    with h5py.File(args.input_file, 'r') as f:
        y_true = f['y_true'][:]
        y_pred = f['y_pred'][:]
    logger.info('read training indices from file: ' + args.train_indices_file)
    train_index = read_hdf5_dataset(args.train_indices_file)
    logger.info('read test indices from file: ' + args.test_indices_file)
    test_index = read_hdf5_dataset(args.test_indices_file)
    y_pred_train = np.full(y_pred.shape, np.nan)
    y_pred_train[train_index] = y_pred[train_index]
    y_pred_test = np.full(y_pred.shape, np.nan)
    y_pred_test[test_index] = y_pred[test_index]
    logger.info('create output file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with PdfPages(args.output_file) as pdf:
        # page 1: heatmaps of phenotypes arranged by the parent table
        fig, axes = plt.subplots(4, 1, figsize=(10, 8))
        axes[0].matshow(np.take(np.ravel(normalize_phenotype(y_true)), parent_table), cmap=plt.cm.RdBu_r)
        axes[0].set_title('True phenotypes')
        axes[1].matshow(np.take(np.ravel(normalize_phenotype(y_pred)), parent_table), cmap=plt.cm.RdBu_r)
        axes[1].set_title('Predicted phenotypes')
        axes[2].matshow(np.take(np.ravel(normalize_phenotype(y_pred_train)), parent_table), cmap=plt.cm.RdBu_r)
        axes[2].set_title('Predicted phenotypes (train)')
        axes[3].matshow(np.take(np.ravel(normalize_phenotype(y_pred_test)), parent_table), cmap=plt.cm.RdBu_r)
        axes[3].set_title('Predicted phenotypes (test)')
        plt.tight_layout()
        pdf.savefig(fig)
        plt.clf()
        # page 2: histograms of true and predicted phenotypes
        fig, axes = plt.subplots(2, 3, figsize=(10, 6))
        axes[0, 0].hist(y_true[~np.isnan(y_true)], bins=50)
        axes[0, 0].set_title('True phenotypes')
        axes[0, 1].hist(y_true[train_index], bins=50)
        axes[0, 1].set_title('True phenotypes (train)')
        axes[0, 2].hist(y_true[test_index], bins=50)
        axes[0, 2].set_title('True phenotypes (test)')
        axes[1, 0].hist(y_pred, bins=50)
        axes[1, 0].set_title('Predicted phenotypes')
        axes[1, 1].hist(y_pred[train_index], bins=50)
        axes[1, 1].set_title('Predicted phenotypes (train)')
        axes[1, 2].hist(y_pred[test_index], bins=50)
        axes[1, 2].set_title('Predicted phenotypes (test)')
        for i in range(2):
            for j in range(3):
                axes[i, j].set_xlim(-5, 5)
        plt.tight_layout()
        pdf.savefig(fig)
        plt.clf()
        # page 3: scatter plots of predicted vs. true phenotypes
        fig, axes = plt.subplots(1, 3, figsize=(10, 4))
        axes[0].scatter(y_true[~np.isnan(y_true)], y_pred[~np.isnan(y_true)], s=3)
        axes[0].set_xlabel('True phenotypes')
        axes[0].set_ylabel('Predicted phenotypes')
        axes[0].set_title('All samples')
        axes[1].scatter(y_true[train_index], y_pred[train_index], s=3)
        axes[1].set_xlabel('True phenotypes')
        axes[1].set_ylabel('Predicted phenotypes')
        axes[1].set_title('Training samples')
        axes[2].scatter(y_true[test_index], y_pred[test_index], s=3)
        axes[2].set_xlabel('True phenotypes')
        axes[2].set_ylabel('Predicted phenotypes')
        axes[2].set_title('Test samples')
        plt.tight_layout()
        pdf.savefig(fig)
def select_best_subset(args):
    import h5py
    from tqdm import tqdm
    import pandas as pd
    from scipy.stats import pearsonr
    from sklearn.metrics import mean_squared_error
    from utils import prepare_output_file, read_hdf5_dataset

    logger.info('read sample indices of test dataset from ' + args.test_index_file)
    test_index = read_hdf5_dataset(args.test_index_file)
    traits = args.traits.split(',')

    def iterator():
        cv_type = 's1f'
        for gamma in args.gammas.split(','):
            for n_snps in [int(a) for a in args.n_snps.split(',')]:
                for trait in traits:
                    for snp_set in range(args.n_groups):
                        filename = args.input_dir + '/gamma={gamma}/{n_snps}/{trait}/{snp_set}/{cv_type}/predictions'.format(
                            n_snps=n_snps, trait=trait, snp_set=snp_set, cv_type=cv_type, gamma=gamma)
                        yield ('random_choice', gamma, trait, n_snps, snp_set, cv_type, filename)

    def query_dict(records, **kwargs):
        '''Search for records (dicts) that match the key-value pairs
        :param records: a list of dicts
        :param kwargs: key-value pairs
        :return: a list of records that match the query arguments
        '''
        results = []
        for record in records:
            val = True
            for key in kwargs:
                val = val and (record[key] == kwargs[key])
            if val:
                results.append(record)
        return results

    logger.info('read prediction results')
    predictions = []
    for method, gamma, trait, n_snps, snp_set, cv_type, filename in tqdm(list(iterator())):
        with h5py.File(filename, 'r') as f:
            predictions.append({
                'trait': trait,
                'gamma': f['best_gamma'][()],
                'alpha': f['best_alpha'][()],
                'method': method,
                'n_snps': n_snps,
                'snp_set': snp_set,
                'y_pred': f['y_pred'][:],
                'cv_type': cv_type,
                'mse_cv': np.ravel(f['mse_cv']),
                'pcc_cv': np.ravel(f['pcc_cv'])
            })
    logger.info('summarize cross-validation metrics')
    summary = []
    for pred in tqdm(predictions):
        summary.append((pred['method'], pred['gamma'], pred['alpha'], pred['trait'],
                        pred['n_snps'], pred['snp_set'], pred['cv_type'],
                        np.min(pred['pcc_cv']), np.min(pred['mse_cv']),
                        np.mean(pred['pcc_cv']), np.mean(pred['mse_cv']),
                        np.max(pred['pcc_cv']), np.max(pred['mse_cv']),
                        np.median(pred['pcc_cv']), np.median(pred['mse_cv'])))
    summary = pd.DataFrame.from_records(
        summary,
        columns=('method', 'gamma', 'alpha', 'trait', 'n_snps', 'snp_set', 'cv_type',
                 'pcc_cv_min', 'mse_cv_min', 'pcc_cv_mean', 'mse_cv_mean',
                 'pcc_cv_max', 'mse_cv_max', 'pcc_cv_median', 'mse_cv_median'))
    # sort ascending for MSE-based criteria, descending otherwise (e.g. PCC)
    ascending = args.by.startswith('mse')
    summary_best = summary.sort_values(['trait', args.by], ascending=ascending).groupby(['trait']).head(3)
    if not os.path.exists(args.output_dir):
        logger.info('create output directory: ' + args.output_dir)
        os.makedirs(args.output_dir)
    summary_file = os.path.join(args.output_dir, 'summary.txt')
    logger.info('write summary of all SNP subsets to file: ' + summary_file)
    summary.to_csv(summary_file, sep='\t', index=False)
    summary_best_file = os.path.join(args.output_dir, 'summary_best.txt')
    logger.info('write summary of best SNP subsets to file: ' + summary_best_file)
    summary_best.to_csv(summary_best_file, sep='\t', index=False)
    logger.info('extract predictions from best SNP subsets')
    rank = {}
    for trait in traits:
        rank[trait] = 0
    for index, record in summary_best.iterrows():
        pred_file = args.input_dir + '/gamma={gamma:.2f}/{n_snps}/{trait}/{snp_set}/{cv_type}/predictions'.format(
            **record.to_dict())
        with h5py.File(pred_file, 'r') as f:
            y_pred = f['y_pred'][:]
        trait = record['trait']
        test_pred_file = os.path.join(args.output_dir, 'prediction.%s.%d.txt' % (trait, rank[trait]))
        logger.info('save test predictions to file: ' + test_pred_file)
        np.savetxt(test_pred_file, y_pred[test_index])
        rank[trait] += 1
def single_model(args):
    import h5py
    import pandas as pd
    import numpy as np
    import dill as pickle
    from utils import read_hdf5_dataset, prepare_output_file, read_hdf5_single
    from sklearn.preprocessing import StandardScaler
    from sklearn.metrics import r2_score, mean_squared_error
    from tqdm import tqdm

    logger.info('read phenotypes from file: ' + args.phenotype_file)
    #phenotypes = pd.read_table(args.phenotype_file)
    phenotypes = read_hdf5_dataset(args.phenotype_file)
    logger.info('read genotypes from file: ' + args.genotype_file)
    X = read_hdf5_dataset(args.genotype_file)
    if args.transpose_x:
        logger.info('transpose X')
        X = X.T
    y = phenotypes
    if args.feature_indices_file:
        logger.info('read feature indices from: ' + args.feature_indices_file)
        feature_indices = read_hdf5_dataset(args.feature_indices_file)
        X = np.take(X, feature_indices, axis=1)
    if args.normalize_x:
        logger.info('normalize X')
        X = StandardScaler().fit_transform(X)
    if args.sample_indices_file:
        logger.info('read sample indices from: ' + args.sample_indices_file)
        sample_indices = read_hdf5_dataset(args.sample_indices_file)
    else:
        sample_indices = np.nonzero(~np.isnan(phenotypes))[0]
    X_train = X[sample_indices]
    y_train = y[sample_indices]
    logger.info('read parent table from file: ' + args.parent_table_file)
    parent_table = read_hdf5_single(args.parent_table_file)
    logger.info('use model ' + args.model_name)
    logger.info('X.shape = %s, y.shape = %s' % (repr(X.shape), repr(y.shape)))
    if args.model_name == 'ridge':
        from sklearn.linear_model import Ridge
        model = Ridge(alpha=10000)
        model.fit(X_train, y_train)
        y_pred = np.ravel(model.predict(X))
        y_pred_train = y_pred[sample_indices]
    elif args.model_name == 'ridge_cv':
        from sklearn.linear_model import Ridge
        alphas = 10.0**np.arange(1, 6)
        train_masks, test_masks = generate_cv_masks(sample_indices, parent_table, k_female=5, k_male=5)
        cv_metrics = {}
        cv_metrics['mse'] = np.zeros((len(alphas), train_masks.shape[0]))
        cv_metrics['r2'] = np.zeros((len(alphas), train_masks.shape[0]))
        pbar = tqdm(total=len(alphas) * train_masks.shape[0])
        for i, alpha in enumerate(alphas):
            for j in range(train_masks.shape[0]):
                model = Ridge(alpha=alpha)
                model.fit(X[train_masks[j]], y[train_masks[j]])
                y_pred = model.predict(X[test_masks[j]])
                cv_metrics['mse'][i, j] = mean_squared_error(y[test_masks[j]], y_pred)
                cv_metrics['r2'][i, j] = r2_score(y[test_masks[j]], y_pred)
                pbar.update(1)
        pbar.close()
        best_alpha = alphas[cv_metrics['r2'].mean(axis=1).argmax()]
        logger.info('optimized alpha = %f' % best_alpha)
        model = Ridge(alpha=best_alpha)
        model.fit(X_train, y_train)
        y_pred = np.ravel(model.predict(X))
        y_pred_train = y_pred[sample_indices]
    elif args.model_name == 'gpr':
        from sklearn.gaussian_process import GaussianProcessRegressor
        from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel, RBF
        kernel = RBF() + WhiteKernel()
        model = GaussianProcessRegressor(kernel=kernel)
        model.fit(X_train, y_train)
        logger.info('kernel params: %s' % repr(model.get_params()))
        y_pred_train = np.ravel(model.predict(X_train))
        y_pred = np.ravel(model.predict(X))
    elif args.model_name == 'gpy':
        from GPy.kern import Linear
        from GPy.models import GPRegression
        kernel = Linear(input_dim=2, name='linear')
        model = GPRegression(X_train, y_train, kernel=kernel)
        model.optimize()
    else:
        raise ValueError('unknown model name: ' + args.model_name)
    logger.info('r2 score = %f' % r2_score(y_train, y_pred_train))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    model_file = os.path.join(args.output_dir, 'model')
    logger.info('save model file: ' + model_file)
    with open(model_file, 'wb') as f:
        pickle.dump(model, f)
    pred_file = os.path.join(args.output_dir, 'predictions')
    logger.info('save predictions to file: ' + pred_file)
    with h5py.File(pred_file, 'w') as f:
        if args.output_residuals:
            f.create_dataset('residual', data=(y - y_pred))
        f.create_dataset('y_true', data=y)
        f.create_dataset('y_pred', data=y_pred)
        f.create_dataset('y_pred_train', data=y_pred_train)
        f.create_dataset('indices_train', data=sample_indices)
        if args.model_name == 'ridge_cv':
            f.create_dataset('alpha', data=alphas)
            g = f.create_group('cv_metrics')
            for key in cv_metrics.keys():
                g.create_dataset(key, data=cv_metrics[key])
def run_mixed_ridge(args):
    from utils import read_hdf5_dataset
    import h5py
    from models import MixedRidge
    import dill as pickle
    from utils import read_hdf5_single, cv_split_emaize, standardize_genotypes, get_indices_table

    logger.info('read genotype file: ' + args.genotype_file)
    X = read_hdf5_dataset(args.genotype_file)
    if args.transpose_genotype:
        X = X.T
    X = standardize_genotypes(X)
    logger.info('read GSM file: ' + args.gsm_file)
    with h5py.File(args.gsm_file, 'r') as f:
        U = f['U'][:]
        S = f['S'][:]
    #U = U*S[np.newaxis, :]
    U = U[:, S**2 > 0.5]
    U = standardize_genotypes(U)
    logger.info('read phenotype file: ' + args.phenotype_file)
    y = read_hdf5_dataset(args.phenotype_file)
    logger.info('read parent table file: ' + args.parent_table_file)
    parent_table = read_hdf5_single(args.parent_table_file)
    logger.info('read training indices file: ' + args.train_index_file)
    train_index = read_hdf5_dataset(args.train_index_file)
    logger.info('read test indices file: ' + args.test_index_file)
    test_index = read_hdf5_dataset(args.test_index_file)
    indices_table, mask = get_indices_table(train_index, parent_table)
    if args.cv_type == 's1f':
        k = parent_table.shape[0] if args.k is None else args.k
        train_index_list, test_index_list, s0_index_list = cv_split_emaize(
            indices_table, mask, k=k, method='s1f')
    elif args.cv_type == 's0':
        train_index_list, test_index_list, s0_index_list = cv_split_emaize(
            indices_table, mask,
            k1=parent_table.shape[0] // 5, k2=parent_table.shape[1] // 5, method='s0')
    elif args.cv_type == 's1m':
        k = parent_table.shape[1] if args.k is None else args.k
        train_index_list, test_index_list, s0_index_list = cv_split_emaize(
            indices_table, mask, k=k, method='s1m')
    else:
        raise ValueError('unknown cross-validation type: %s' % args.cv_type)
    logger.info('%d rows and %d columns in the indices table' % (indices_table.shape[0], indices_table.shape[1]))
    logger.info('number of cross-validation folds: %d' % len(train_index_list))
    logger.info('number of principal components to use: %d' % U.shape[1])
    alpha_list = [float(a) for a in args.alphas.split(',')]
    gamma_list = [float(a) for a in args.gammas.split(',')]
    metrics = {
        'pcc_cv': np.zeros((len(alpha_list), len(gamma_list), len(test_index_list))),
        'mse_cv': np.zeros((len(alpha_list), len(gamma_list), len(test_index_list)))
    }
    X_train, U_train, y_train = X[train_index], U[train_index], y[train_index]
    n_samples_total = np.prod(parent_table.shape)
    test_index_mask = np.zeros((len(test_index_list), n_samples_total), dtype='bool')
    if args.cv_type == 's0':
        for i in range(len(s0_index_list)):
            test_index_mask[i, train_index[s0_index_list[i]]] = True
    else:
        for i in range(len(test_index_list)):
            test_index_mask[i, train_index[test_index_list[i]]] = True
    for i, alpha in enumerate(alpha_list):
        #model.optimize_grid(X[train_index], U[train_index], y[train_index])
        '''
        mse_cv = np.zeros(len(train_index_list))
        pcc_cv = np.zeros(len(train_index_list))
        for j in range(len(train_index_list)):
            model.fit(X_train[train_index_list[j]], U_train[train_index_list[j]],
                      y_train[train_index_list[j]], gamma=0.1)
            y_pred_cv = model.predict(X_train[test_index_list[j]], U_train[test_index_list[j]])
            mse_cv[j] = mean_squared_error(y_train[test_index_list[j]], y_pred_cv)
            pcc_cv[j] = pearsonr(y_train[test_index_list[j]], y_pred_cv)[0]
        logger.info('cross-validation (real) MSE = %f, %f, %f' % (np.nanmin(mse_cv), np.nanmean(mse_cv), np.nanmax(mse_cv)))
        logger.info('cross-validation (real) PCC = %f, %f, %f' % (np.nanmin(pcc_cv), np.nanmean(pcc_cv), np.nanmax(pcc_cv)))
        '''
        for j, gamma in enumerate(gamma_list):
            model = MixedRidge(alphas=alpha)
            model.fit(X_train, U_train, y_train, gamma=gamma, cv=True)
            mse_cv = model.kfold(test_index_list, subset_indices=s0_index_list, return_mean=False)
            pcc_cv = model.pcc_cv
            metrics['pcc_cv'][i, j] = pcc_cv
            metrics['mse_cv'][i, j] = mse_cv
            logger.info('cross-validation (fast) MSE = %f, %f, %f' % (np.nanmin(mse_cv), np.nanmean(mse_cv), np.nanmax(mse_cv)))
            logger.info('cross-validation (fast) PCC = %f, %f, %f' % (np.nanmin(pcc_cv), np.nanmean(pcc_cv), np.nanmax(pcc_cv)))
            logger.info('alpha=%f, gamma=%f' % (alpha, gamma))
    # pick the (alpha, gamma) pair with the best mean cross-validation PCC
    pcc_cv_mean = np.nanmean(metrics['pcc_cv'], axis=2)
    i_best = np.argmax(np.max(pcc_cv_mean, axis=1))
    best_alpha = alpha_list[i_best]
    best_gamma = gamma_list[np.argmax(pcc_cv_mean[i_best])]
    logger.info('best model: alpha=%f, gamma=%f' % (best_alpha, best_gamma))
    best_model = MixedRidge(alphas=best_alpha)
    best_model.fit(X_train, U_train, y_train, gamma=best_gamma)
    y_pred_best = best_model.predict(X, U)
    '''
    logger.info('best model on test data: pcc=%f, mse=%f' % (
        pearsonr(y[test_index], y_pred_best[test_index])[0],
        mean_squared_error(y[test_index], y_pred_best[test_index])))
    '''
    logger.info('save results to output directory: ' + args.output_dir)
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    #model_file = os.path.join(args.output_dir, 'model')
    #with open(model_file, 'wb') as f:
    #    pickle.dump(best_model, f)
    pred_file = os.path.join(args.output_dir, 'predictions')
    with h5py.File(pred_file, 'w') as f:
        f.create_dataset('best_alpha', data=best_alpha)
        f.create_dataset('best_gamma', data=best_gamma)
        f.create_dataset('y_true', data=y)
        f.create_dataset('y_pred', data=y_pred_best)
        f.create_dataset('pcc_cv', data=metrics['pcc_cv'])
        f.create_dataset('mse_cv', data=metrics['mse_cv'])
        f.create_dataset('alpha_list', data=np.asarray(alpha_list))
        f.create_dataset('gamma_list', data=np.asarray(gamma_list))
        f.create_dataset('test_index_mask', data=test_index_mask)
def mixed_model(args):
    import h5py
    import numpy as np
    import dill as pickle
    from utils import read_hdf5_dataset, read_hdf5_single
    from sklearn.metrics import r2_score, mean_squared_error

    logger.info('read predictions of the first model: ' + args.input_file1)
    with h5py.File(args.input_file1, 'r') as f:
        X1 = f['y_pred'][:][:, np.newaxis]
    logger.info('read predictions of the second model: ' + args.input_file2)
    with h5py.File(args.input_file2, 'r') as f:
        X2 = f['y_pred'][:][:, np.newaxis]
    X = np.concatenate([X1, X2], axis=1)
    logger.info('read phenotypes from file: ' + args.phenotype_file)
    y = read_hdf5_dataset(args.phenotype_file)
    logger.info('read parent table from file: ' + args.parent_table_file)
    parent_table = read_hdf5_single(args.parent_table_file)
    if args.sample_indices_file:
        logger.info('read sample indices from: ' + args.sample_indices_file)
        sample_indices = read_hdf5_dataset(args.sample_indices_file)
    else:
        sample_indices = np.nonzero(~np.isnan(y))[0]
    X_train = X[sample_indices]
    y_train = y[sample_indices]
    logger.info('use model ' + args.model_name)
    logger.info('X.shape = %s, y.shape = %s' % (repr(X.shape), repr(y.shape)))
    if args.model_name == 'linear':
        from sklearn.linear_model import LinearRegression
        model = LinearRegression()
        model.fit(X_train, y_train)
        y_pred = model.predict(X)
        y_pred_train = y_pred[sample_indices]
        logger.info('coefficients: ' + ', '.join([str(a) for a in model.coef_]))
    elif args.model_name == 'linear_cv':
        from sklearn.linear_model import LinearRegression
        mix_factors, mse_train, mse_test = linear_cv(X, y, sample_indices, parent_table)
        best_mix_factor = mix_factors[np.argmin(mse_test.mean(axis=1))]
        logger.info('best mix factor: %f' % best_mix_factor)
        X_mixed = (X[:, 0] * (1 - best_mix_factor) + X[:, 1] * best_mix_factor)[:, np.newaxis]
        model = LinearRegression()
        model.fit(X_mixed[sample_indices], y_train)
        y_pred = model.predict(X_mixed)
        y_pred_train = y_pred[sample_indices]
    else:
        raise ValueError('unknown model name: ' + args.model_name)
    logger.info('r2 score = %f' % r2_score(y_train, y_pred_train))
    logger.info('mse = %f' % mean_squared_error(y_train, y_pred_train))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    model_file = os.path.join(args.output_dir, 'model')
    logger.info('save model file: ' + model_file)
    with open(model_file, 'wb') as f:
        pickle.dump(model, f)
    pred_file = os.path.join(args.output_dir, 'predictions')
    logger.info('save predictions to file: ' + pred_file)
    with h5py.File(pred_file, 'w') as f:
        f.create_dataset('y_true', data=y)
        f.create_dataset('y_pred', data=y_pred)
        f.create_dataset('y_pred_train', data=y_pred_train)
        f.create_dataset('indices_train', data=sample_indices)
        if args.model_name == 'linear_cv':
            f.create_dataset('mse_train', data=mse_train)
            f.create_dataset('mse_test', data=mse_test)
            f.create_dataset('mix_factors', data=mix_factors)