def main():
    df = get_data_set('idao_dataset/train', save_to_csv=False)
    create_folds(df, 5, config)
    run_training(1, config, mode='clf')
    run_training(1, config, mode='reg')
    clf_preds, reg_preds = predict(config)
    sub_df = pd.read_csv(sub_df_path)
    sub_df['classification_predictions'] = clf_preds
    sub_df['regression_predictions'] = reg_preds
    sub_df['regression_predictions'] = sub_df['regression_predictions'].apply(transform)
    sub_df.to_csv('Final_Submission.csv', index=False)
def gtv_cvlam(X, y, q, num_folds=5, num_lams=20):
    n = len(X)
    folds = create_folds(n, num_folds)
    scores = np.zeros(num_lams)
    lams = None
    for i, fold in enumerate(folds):
        mask = np.ones(n, dtype=bool)
        mask[fold] = False
        x_train, y_train = X[mask], y[mask]
        x_test, y_test = X[~mask], y[~mask]
        data, weights, grid = bucket_vals(x_train, y_train, q)
        results = solve_gfl(data, None, weights=weights,
                            full_path=True, minlam=0.1, maxlam=20.,
                            numlam=num_lams)
        fold_score = np.array([
            mse(y_test, predict(x_test, beta, grid))
            for beta in results['beta']
        ])
        scores += fold_score
        if i == 0:
            lams = results['lambda']
    scores /= float(num_folds)
    lam_best = lams[np.argmin(scores)]
    data, weights, grid = bucket_vals(X, y, q)
    beta = solve_gfl(data, None, weights=weights, lam=lam_best)
    return beta.reshape(q), grid
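# Several snippets in this collection call create_folds with a sample count and a
# number of folds and expect a list of disjoint index arrays back (e.g.
# folds = create_folds(n, num_folds) in gtv_cvlam above, or
# create_folds(self.nsamples, self.nfolds) below). The helper itself is not
# included in this section, so the following is only a minimal sketch of what
# that index-based variant plausibly does, not the original implementation.
import numpy as np

def create_folds(n, k, seed=None):
    '''Assumed sketch: shuffle the indices 0..n-1 and split them into k
    roughly equal, disjoint folds.'''
    rng = np.random.RandomState(seed)
    indices = rng.permutation(n)
    # np.array_split handles the case where n is not divisible by k
    return np.array_split(indices, k)

# Example usage mirroring the cross-validation loops in this section:
# folds = create_folds(len(X), 5)
# mask = np.ones(len(X), dtype=bool); mask[folds[0]] = False  # hold out fold 0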
def setup(self, model_path, cell_lines, drugs, drug_ids, features,
          X, Y, A, B, C, raw_index,
          lam_gridsize=100, nfolds=10, **kwargs):
    '''Initializes the model and caches certain statistics.'''
    self.model_path = model_path
    self.cell_lines = cell_lines
    self.drugs = drugs
    self.drug_ids = drug_ids
    self.features = features
    self.X = X
    self.A = A
    self.B = B
    self.C = C
    self.Y = Y
    self.raw_index = raw_index

    assert A.shape == Y.shape[:-1]
    assert B.shape == Y.shape[:-1]
    assert C.shape == Y.shape[:-1]

    self.Y_shape = Y.shape
    self.nsamples = Y.shape[0]
    self.ndrugs = Y.shape[1]
    self.ndoses = Y.shape[2]
    self.nfeatures = X.shape[1]

    # Cache which doses are missing and put in dummy values
    from scipy.stats import gamma
    self.obs_mask = (~np.isnan(Y)).astype(int)
    self.A = np.nan_to_num(self.A, nan=1)
    self.B = np.nan_to_num(self.B, nan=1)
    self.C = np.nan_to_num(self.C, nan=1)
    self.Y = np.nan_to_num(self.Y, nan=0)*self.obs_mask + (1-self.obs_mask)*2

    # We approximate the integral over lambda with a finite grid of lam_gridsize points
    print('Caching lambda integral approximation')
    self.lam_gridsize = lam_gridsize
    self.lam_grid = np.transpose(np.linspace(gamma.ppf(1e-3, self.A, scale=self.B),
                                             gamma.ppf(1-1e-3, self.A, scale=self.B),
                                             self.lam_gridsize), [1, 2, 0])
    self.lam_weights = gamma.pdf(self.lam_grid, self.A[..., None], scale=self.B[..., None])
    self.lam_weights = (self.lam_weights / self.lam_weights.sum(axis=-1, keepdims=True)).clip(1e-6, 1-1e-6)
    self.log_lam_weights = np.log(self.lam_weights)
    # np.save(os.path.join(self.model_path, 'lam_grid.npy'), self.lam_grid)
    # np.save(os.path.join(self.model_path, 'lam_weights.npy'), self.lam_weights)

    # Split the data into K folds
    self.nfolds = nfolds
    self.folds = create_folds(self.nsamples, self.nfolds)

    # The out-of-sample predictions
    self.mu = np.full(self.Y.shape, np.nan)
import numpy as np
import pandas as pd
from utils import standardize, onehot, create_folds, save_details

names = [
    'Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight',
    'Viscera weight', 'Shell weight', 'Rings'
]
df = pd.read_table('experiments/uci/data/abalone.data.txt',
                   header=None, sep=',', names=names)

# Preprocess the features
onehot(df, ['Sex'])
standardize(df, [
    'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight',
    'Viscera weight', 'Shell weight'
])

# Reorder columns to put the target column at the end
cols = df.columns.tolist()
cols = cols[:-4] + cols[-3:] + cols[-4:-3]
df = df[cols]

# Convert the ring counts to zero-based discrete integers
df['Rings'] = df['Rings'].apply(np.int32)
df['Rings'] -= df['Rings'].min()

print df.describe()

create_folds('experiments/uci/data/splits/abalone', df)
save_details('abalone', len(df), df.shape[1] - 1, df['Rings'].max() + 1)
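# The UCI preprocessing scripts (abalone above; energy efficiency, concrete,
# auto-mpg, housing, student performance, and Parkinsons below) call
# create_folds with a split directory and the preprocessed dataframe rather
# than a sample count. Their utils module is not shown in this section, so the
# sketch below is only an assumed illustration of such a path-based variant;
# the per-fold file names and format are guesses, not the original layout.
import os
import numpy as np

def create_folds(split_dir, df, k=10, seed=None):
    '''Assumed sketch: write k disjoint sets of shuffled row indices to disk
    so later experiments can reuse the same train/test splits.'''
    if not os.path.exists(split_dir):
        os.makedirs(split_dir)
    rng = np.random.RandomState(seed)
    indices = rng.permutation(len(df))
    for fold_idx, fold in enumerate(np.array_split(indices, k)):
        np.savetxt(os.path.join(split_dir, 'fold{}.csv'.format(fold_idx)),
                   fold, fmt='%d')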
def train(self, model_fn=None, lasso=0., l2=1e-4, lr=3e-4, num_epochs=250,
          batch_size=None, num_folds=3, val_pct=0.1, verbose=False, folds=None,
          weight_decay=0.01, random_restarts=1, save_dir='/tmp/', momentum=0.9,
          patience=3, clip_gradients=None):
    # Make sure we have a model of the prior
    if model_fn is None:
        model_fn = lambda nfeatures: DeepAdaptiveFDRModeler(nfeatures)

    # Lasso penalty (if any)
    lasso = autograd.Variable(torch.FloatTensor([lasso]), requires_grad=False)
    l2 = autograd.Variable(torch.FloatTensor([l2]), requires_grad=False)

    if batch_size is None:
        batch_size = int(max(10, min(100, np.round(self.X.shape[0] / 100.))))
        print('Batch size: {}'.format(batch_size))

    # Discrete approximation of a beta PDF support
    tbeta_grid = autograd.Variable(torch.FloatTensor(self.beta_grid), requires_grad=False)
    sys.stdout.flush()

    # Split the data into a bunch of cross-validation folds
    if folds is None:
        if verbose:
            print('\tCreating {} folds'.format(num_folds))
            sys.stdout.flush()
        folds = create_folds(self.X, k=num_folds)

    self.priors = np.zeros((self.nsamples, 2), dtype=float)
    self.models = []
    train_losses, val_losses = (np.zeros((len(folds), random_restarts, num_epochs)),
                                np.zeros((len(folds), random_restarts, num_epochs)))
    epochs_per_fold = np.zeros(len(folds))
    for fold_idx, test_indices in enumerate(folds):
        # Create train/validate splits
        mask = np.ones(self.nsamples, dtype=bool)
        mask[test_indices] = False
        indices = np.arange(self.nsamples, dtype=int)[mask]
        np.random.shuffle(indices)
        train_cutoff = int(np.round(len(indices) * (1 - val_pct)))
        train_indices = indices[:train_cutoff]
        validate_indices = indices[train_cutoff:]
        torch_test_indices = autograd.Variable(torch.LongTensor(test_indices), requires_grad=False)
        best_loss = None

        # Try re-initializing a few times
        for restart in range(random_restarts):
            model = model_fn(self.nfeatures)

            # Setup the optimizers
            # optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay, momentum=momentum)
            # scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=patience)
            optimizer = optim.RMSprop(model.parameters(), lr=lr, weight_decay=weight_decay)
            # optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

            # Train the model
            for epoch in range(num_epochs):
                if verbose:
                    print('\t\tRestart {} Fold {} Epoch {}'.format(restart + 1, fold_idx + 1, epoch + 1))
                    sys.stdout.flush()

                train_loss = torch.Tensor([0])
                for batch_idx, batch in enumerate(batches(train_indices, batch_size, shuffle=False)):
                    if verbose and (batch_idx % 100 == 0):
                        print('\t\t\tBatch {}'.format(batch_idx))
                    tidx = autograd.Variable(torch.LongTensor(batch), requires_grad=False)

                    # Set the model to training mode
                    model.train()

                    # Reset the gradient
                    model.zero_grad()

                    # Run the model and get the prior predictions
                    concentrations = model(self.tX[tidx])

                    # Calculate the loss as the negative log-likelihood of the data
                    # Use a beta prior for the treatment effect
                    prior_dist = torch.distributions.Beta(concentrations[:, 0:1], concentrations[:, 1:2])

                    # Discretize the (0,1) interval to approximate the beta PDF
                    prior_probs = prior_dist.log_prob(tbeta_grid).exp()
                    prior_probs = prior_probs / prior_probs.sum(dim=1, keepdim=True)

                    # Calculate the loss
                    posterior_probs = (((1 - tbeta_grid) * self.tP0[tidx]
                                        + tbeta_grid * self.tP1[tidx]) * prior_probs).sum(dim=1)
                    loss = -posterior_probs.log().mean()

                    # L1 penalty to shrink c and be more conservative
                    regularized_loss = (loss + lasso * concentrations.mean()
                                        + l2 * (concentrations**2).mean())

                    # Update the model with gradient clipping for stability
                    regularized_loss.backward()

                    # Clip the gradients if need-be
                    if clip_gradients is not None:
                        torch.nn.utils.clip_grad_norm(model.parameters(), clip_gradients)

                    # Apply the update
                    optimizer.step()

                    # Track the loss
                    train_loss += loss.data

                validate_loss = torch.Tensor([0])
                for batch_idx, batch in enumerate(batches(validate_indices, batch_size)):
                    if verbose and (batch_idx % 100 == 0):
                        print('\t\t\tValidation Batch {}'.format(batch_idx))
                    tidx = autograd.Variable(torch.LongTensor(batch), requires_grad=False)

                    # Set the model to test mode
                    model.eval()

                    # Reset the gradient
                    model.zero_grad()

                    # Run the model and get the prior predictions
                    concentrations = model(self.tX[tidx])

                    # Calculate the loss as the negative log-likelihood of the data
                    # Use a beta prior for the treatment effect
                    prior_dist = torch.distributions.Beta(concentrations[:, 0:1], concentrations[:, 1:2])

                    # Discretize the (0,1) interval to approximate the beta PDF
                    prior_probs = prior_dist.log_prob(tbeta_grid).exp()
                    prior_probs = (prior_probs / prior_probs.sum(dim=1, keepdim=True)).clamp(1e-8, 1 - 1e-8)

                    # Calculate the loss
                    posterior_probs = (((1 - tbeta_grid) * self.tP0[tidx]
                                        + tbeta_grid * self.tP1[tidx]) * prior_probs).sum(dim=1).clamp(1e-8, 1 - 1e-8)
                    loss = -posterior_probs.log().sum()

                    # Track the loss
                    validate_loss += loss.data

                train_losses[fold_idx, restart, epoch] = train_loss.numpy() / float(len(train_indices))
                val_losses[fold_idx, restart, epoch] = validate_loss.numpy() / float(len(validate_indices))

                # # Adjust the learning rate down if the validation performance is bad
                # scheduler.step(val_losses[fold_idx, epoch])

                # Check if we currently have the best held-out log-likelihood
                if verbose:
                    print('Validation loss: {} Best: {}'.format(val_losses[fold_idx, restart, epoch], best_loss))
                if (restart == 0 and epoch == 0) or val_losses[fold_idx, restart, epoch] <= best_loss:
                    if verbose:
                        print('\t\t\tSaving test set results. <----- New high water mark for fold {} on epoch {}'
                              .format(fold_idx + 1, epoch + 1))
                    # If so, use the current model on the test set
                    best_loss = val_losses[fold_idx, restart, epoch]
                    epochs_per_fold[fold_idx] = epoch + 1
                    self.priors[test_indices] = model(self.tX[torch_test_indices]).data.numpy()
                    torch.save(model, save_dir + '_fold{}.pt'.format(fold_idx))
                    if verbose:
                        means = self.priors[test_indices, 0] / self.priors[test_indices].sum(axis=1)
                        print('Prior range: [{},{}]'.format(means.min(), means.max()))
                        print('First 3:')
                        print(self.priors[test_indices][:3])

        # Reload the best model
        self.models.append(torch.load(save_dir + '_fold{}.pt'.format(fold_idx)))

    # Calculate the posterior probabilities
    if verbose:
        print('Calculating posteriors.')
        sys.stdout.flush()
    prior_grid = beta.pdf(self.beta_grid, self.priors[:, 0:1], self.priors[:, 1:2])
    prior_grid /= prior_grid.sum(axis=1, keepdims=True)
    post0 = self.P0 * (1 - self.beta_grid)
    post1 = self.P1 * self.beta_grid
    self.posteriors = ((post1 / (post0 + post1)) * prior_grid).sum(axis=1)
    self.posteriors = self.posteriors.clip(1e-8, 1 - 1e-8)

    if verbose:
        print('Calculating predictions at a {:.2f}% FDR threshold'.format(self.fdr * 100))
        sys.stdout.flush()
    self.predictions = calc_fdr(self.posteriors, self.fdr)

    if verbose:
        print('Finished training.')
        sys.stdout.flush()

    self.folds = folds
    return {
        'train_losses': train_losses,
        'validation_losses': val_losses,
        'priors': self.priors,
        'posteriors': self.posteriors,
        'predictions': self.predictions,
        'models': self.models,
        'folds': folds
    }
'''Preprocessing code for the Energy Efficiency data: https://archive.ics.uci.edu/ml/datasets/Energy+efficiency'''
import numpy as np
import pandas as pd
from utils import standardize, unitize, create_folds, save_details

df = pd.read_table('experiments/uci/data/ENB2012_data.csv', header=0, sep=',')

# Preprocess the features
standardize(df, ['X1', 'X2', 'X3', 'X4'])
unitize(df, ['X5', 'X6', 'X7', 'X8'])

# Convert from one significant decimal place to discrete integers
df['Y1'] = (df['Y1'].round()).apply(np.int32)
df['Y1'] -= df['Y1'].min()
df['Y2'] = (df['Y2'].round()).apply(np.int32)
df['Y2'] -= df['Y2'].min()

print df.describe()

create_folds('experiments/uci/data/splits/energy_efficiency', df)
save_details('energy_efficiency', len(df), df.shape[1] - 2,
             (df['Y1'].max() + 1, df['Y2'].max() + 1))
import numpy as np
import pandas as pd
from utils import standardize, create_folds, save_details

df = pd.read_table('experiments/uci/data/slump_test.data.txt', header=0, sep=',')

# Remove the ID column
del df['No']

# Preprocess the features
# unitize(df, ['age'])
standardize(df, ['Cement', 'Slag', 'Fly ash', 'Water', 'SP',
                 'Coarse Aggr.', 'Fine Aggr.'])

# Create discrete labels
df['SLUMP(cm)'] = (df['SLUMP(cm)'].round()).apply(np.int32)
df['SLUMP(cm)'] -= df['SLUMP(cm)'].min()
df['FLOW(cm)'] = (df['FLOW(cm)'].round()).apply(np.int32)
df['FLOW(cm)'] -= df['FLOW(cm)'].min()
df['Compressive Strength (28-day)(Mpa)'] = (
    df['Compressive Strength (28-day)(Mpa)'].round()).apply(np.int32)
df['Compressive Strength (28-day)(Mpa)'] -= df['Compressive Strength (28-day)(Mpa)'].min()

print df.describe()

create_folds('experiments/uci/data/splits/concrete', df)
save_details('concrete', len(df), df.shape[1] - 3,
             (df['SLUMP(cm)'].max() + 1,
              df['FLOW(cm)'].max() + 1,
              df['Compressive Strength (28-day)(Mpa)'].max() + 1))
DATADIR = '/baie/corpus/emoMusic/train/'
# DATADIR = './train/'
doNormalize = False

metadatafile = DATADIR + 'annotations/metadata.csv'
list_genres_of_interest_file = DATADIR + 'annotations/categories.lst'
severalGenresPerSong = True

song_data_dict = load_data_to_song_dict(metadatafile, list_genres_of_interest_file,
                                         DATADIR, severalGenresPerSong)

### plot arousal = f ( valence )
# songid = 732
# plot_valence_arousal(song_data_dict, songid)

num_folds = 10
folds = create_folds(song_data_dict, num_folds)
# print len(folds[0][0]), len(folds[0][1])

if doNormalize:
    print '... normalizing folds ...'
    normed_folds = standardize_folds(folds)
    # print '... writing folds to MAT files ...'
    # write_folds_to_mat_files(normed_folds, num_folds)
    print '... writing folds to pickle files ...'
    write_folds_to_pickle_files(normed_folds, num_folds, DATADIR, doNormalize)
else:
    print '... writing folds to pickle files ...'
    write_folds_to_pickle_files(folds, num_folds, DATADIR, doNormalize)
import numpy as np
import pandas as pd
from utils import standardize, unitize, create_folds, save_details

names = [
    'mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
    'acceleration', 'model year', 'origin', 'car name'
]
df = pd.read_table('experiments/uci/data/auto-mpg.data.txt',
                   header=None, delim_whitespace=True, names=names)

# Preprocess the features
unitize(df, ['cylinders', 'model year', 'origin'])
standardize(df, ['displacement', 'horsepower', 'weight', 'acceleration'])
del df['car name']

# Convert from one significant decimal place to discrete integers
df['mpg'] = (df['mpg'] * 10).apply(np.int32)
df['mpg'] -= df['mpg'].min()

# Reorder columns to put the target column at the end
cols = df.columns.tolist()
cols = cols[1:] + cols[0:1]
df = df[cols]

print df.describe()

create_folds('experiments/uci/data/splits/auto_mpg', df)
save_details('auto_mpg', len(df), df.shape[1] - 1, df['mpg'].max() + 1)
def train(self, model_fn, bandwidth=2., kernel_scale=0.35, variance=0.02,
          mvn_train_samples=5, mvn_validate_samples=105,
          validation_samples=1000, validation_burn=1000,
          validation_mcmc_samples=1000, validation_thin=1,
          lr=3e-4, num_epochs=10, batch_size=100, val_pct=0.1,
          nfolds=5, folds=None, learning_rate_decay=0.9, weight_decay=0.,
          clip=None, group_lasso_penalty=0.,
          save_dir='tmp/', checkpoint=False, target_fold=None):
    print('\tFitting model using {} folds and training for {} epochs each'.format(nfolds, num_epochs))

    torch_Y = autograd.Variable(torch.FloatTensor(self.Y), requires_grad=False)
    torch_lam_grid = autograd.Variable(torch.FloatTensor(self.lam_grid), requires_grad=False)
    torch_lam_weights = autograd.Variable(torch.FloatTensor(self.lam_weights), requires_grad=False)
    torch_c = autograd.Variable(torch.FloatTensor(self.c[:, np.newaxis, np.newaxis]), requires_grad=False)
    torch_obs = autograd.Variable(torch.FloatTensor(self.obs_mask), requires_grad=False)
    torch_dose_idxs = [autograd.Variable(torch.LongTensor(
        np.arange(d + (d**2 - d)//2, (d+1) + ((d+1)**2 - (d+1))//2)), requires_grad=False)
        for d in range(self.ndoses)]

    # Use a fixed squared exponential kernel
    Sigma = np.array([kernel_scale * (np.exp(-0.5 * (i - np.arange(self.ndoses))**2 / bandwidth**2))
                      for i in np.arange(self.ndoses)]) + variance * np.eye(self.ndoses)
    L = np.linalg.cholesky(Sigma)[np.newaxis, np.newaxis, :, :]

    # Use a fixed set of noise draws for validation
    Z = np.random.normal(size=(self.Y_shape[0], mvn_validate_samples, self.ndoses, 1))
    validate_noise = autograd.Variable(torch.FloatTensor(np.matmul(L, Z)[:, :, :, 0]), requires_grad=False)

    self.folds = folds if folds is not None else create_folds(self.Y_shape[0], nfolds)
    nfolds = len(self.folds)
    self.fold_validation_indices = []
    self.prior_mu = np.full(self.Y_shape, np.nan, dtype=float)
    self.prior_Sigma = np.zeros((nfolds, self.ndoses, self.ndoses))
    self.train_losses, self.val_losses = np.zeros((nfolds, num_epochs)), np.zeros((nfolds, num_epochs))
    self.epochs_per_fold = np.zeros(nfolds, dtype=int)
    self.models = [None for _ in range(nfolds)]
    for fold_idx, test_indices in enumerate(self.folds):
        # Create train/validate splits
        mask = np.ones(self.Y_shape[0], dtype=bool)
        mask[test_indices] = False
        indices = np.arange(self.Y_shape[0], dtype=int)[mask]
        np.random.shuffle(indices)
        train_cutoff = int(np.round(len(indices) * (1 - val_pct)))
        train_indices = indices[:train_cutoff]
        validate_indices = indices[train_cutoff:]

        torch_test_indices = autograd.Variable(torch.LongTensor(test_indices), requires_grad=False)
        self.fold_validation_indices.append(validate_indices)

        # If we are only training one specific fold, skip all the rest
        if target_fold is not None and target_fold != fold_idx:
            continue

        if checkpoint:
            self.load_checkpoint(save_dir, fold_idx)

        if self.models[fold_idx] is None:
            self.models[fold_idx] = model_fn()
        model = self.models[fold_idx]

        # Setup the optimizers
        # optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay, momentum=0.9)
        optimizer = optim.RMSprop(model.parameters(), lr=lr, weight_decay=weight_decay)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=3)

        for epoch in range(self.epochs_per_fold[fold_idx], num_epochs):
            print('\t\tFold {} Epoch {}'.format(fold_idx + 1, epoch + 1))
            train_loss = torch.Tensor([0])
            for batch_idx, batch in enumerate(batches(train_indices, batch_size)):
                if batch_idx % 100 == 0:
                    print('\t\t\tBatch {}'.format(batch_idx))
                    sys.stdout.flush()
                tidx = autograd.Variable(torch.LongTensor(batch), requires_grad=False)
                Z = np.random.normal(size=(len(batch), mvn_train_samples, self.ndoses, 1))
                noise = autograd.Variable(torch.FloatTensor(np.matmul(L, Z)[:, :, :, 0]), requires_grad=False)

                # Set the model to training mode
                model.train()

                # Reset the gradient
                model.zero_grad()

                # Run the model and get the prior predictions
                mu = model(batch, tidx)

                #### Calculate the loss as the negative log-likelihood of the data ####
                # Get the MVN draw as mu + L.T.dot(Z)
                beta = mu.view(-1, 1, self.ndoses) + noise

                # Logistic transform on the log-odds prior sample
                tau = 1 / (1. + (-beta).exp())

                # Poisson noise model for observations
                rates = tau[:, :, :, None] * torch_lam_grid[tidx, None, :, :] + torch_c[tidx, None, :, :]
                likelihoods = torch.distributions.Poisson(rates)

                # Get log probabilities of the data and filter out the missing observations
                loss = -(logsumexp(likelihoods.log_prob(torch_Y[tidx][:, None, :, None])
                                   + torch_lam_weights[tidx][:, None, :, :], dim=-1).mean(dim=1)
                         * torch_obs[tidx]).mean()
                if group_lasso_penalty > 0:
                    loss += group_lasso_penalty * torch.norm(model.cell_line_features.weight, 2, 0).mean()

                # Update the model
                loss.backward()
                if clip is not None:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
                    for p in model.parameters():
                        p.data.add_(-lr, p.grad.data)
                else:
                    optimizer.step()

                train_loss += loss.data

            validate_loss = torch.Tensor([0])
            for batch_idx, batch in enumerate(batches(validate_indices, batch_size, shuffle=False)):
                if batch_idx % 100 == 0:
                    print('\t\t\tValidation Batch {}'.format(batch_idx))
                    sys.stdout.flush()
                tidx = autograd.Variable(torch.LongTensor(batch), requires_grad=False)
                noise = validate_noise[tidx]

                # Set the model to evaluation mode
                model.eval()

                # Reset the gradient
                model.zero_grad()

                # Run the model and get the prior predictions
                mu = model(batch, tidx)

                #### Calculate the loss as the negative log-likelihood of the data ####
                # Get the MVN draw as mu + L.T.dot(Z)
                beta = mu.view(-1, 1, self.ndoses) + noise

                # Logistic transform on the log-odds prior sample
                tau = 1 / (1. + (-beta).exp())

                # Poisson noise model for observations
                rates = tau[:, :, :, None] * torch_lam_grid[tidx, None, :, :] + torch_c[tidx, None, :, :]
                likelihoods = torch.distributions.Poisson(rates)

                # Get log probabilities of the data and filter out the missing observations
                loss = -(logsumexp(likelihoods.log_prob(torch_Y[tidx][:, None, :, None])
                                   + torch_lam_weights[tidx][:, None, :, :], dim=-1).mean(dim=1)
                         * torch_obs[tidx]).sum()
                validate_loss += loss.data

            self.train_losses[fold_idx, epoch] = train_loss.numpy() / float(len(train_indices))
            self.val_losses[fold_idx, epoch] = validate_loss.numpy() / float(len(validate_indices))

            # Adjust the learning rate down if the validation performance is bad
            scheduler.step(self.val_losses[fold_idx, epoch])

            # Check if we currently have the best held-out log-likelihood
            if epoch == 0 or np.argmin(self.val_losses[fold_idx, :epoch+1]) == epoch:
                print('\t\t\tNew best score: {}'.format(self.val_losses[fold_idx, epoch]))
                print('\t\t\tSaving test set results.')
                # If so, use the current model on the test set
                mu = model(test_indices, torch_test_indices)
                self.prior_mu[test_indices] = mu.data.numpy()
                self.save_fold(save_dir, fold_idx)

            cur_mu = self.prior_mu[test_indices]
            print('First 10 data points: {}'.format(test_indices[:10]))
            print('First 10 prior means:')
            print(pretty_str(ilogit(cur_mu[:10])))
            print('Prior mean ranges:')
            for dose in range(self.ndoses):
                print('{}: {} [{}, {}]'.format(dose,
                                               ilogit(cur_mu[:, dose].mean()),
                                               np.percentile(ilogit(cur_mu[:, dose]), 5),
                                               np.percentile(ilogit(cur_mu[:, dose]), 95)))
            print('Best model score: {} (epoch {})'.format(np.min(self.val_losses[fold_idx, :epoch+1]),
                                                           np.argmin(self.val_losses[fold_idx, :epoch+1]) + 1))
            print('Current score: {}'.format(self.val_losses[fold_idx, epoch]))
            print('')

            self.epochs_per_fold[fold_idx] += 1

            # Update the save point if needed
            if checkpoint:
                self.save_checkpoint(save_dir, fold_idx, model)
            sys.stdout.flush()

        # Reload the best model
        tmp = model.cell_features
        self.load_fold(save_dir, fold_idx)
        self.models[fold_idx].cell_features = tmp

        print('Finished fold {}. Estimating covariance matrix using elliptical slice sampler with max {} samples.'
              .format(fold_idx + 1, validation_samples))
        validate_subset = (np.random.choice(validate_indices, validation_samples, replace=False)
                           if len(validate_indices) > validation_samples else validate_indices)
        tidx = autograd.Variable(torch.LongTensor(validate_subset), requires_grad=False)

        # Set the model to evaluation mode
        self.models[fold_idx].eval()

        # Reset the gradient
        self.models[fold_idx].zero_grad()

        # Run the model and get the prior predictions
        mu_validate = self.models[fold_idx](validate_subset, tidx).data.numpy()

        # Run the slice sampler to get the covariance and data log-likelihoods
        Y_validate = self.Y[validate_subset].astype(int)
        Y_validate[self.obs_mask[validate_subset] == 0] = -1
        (Beta_samples, Sigma_samples, Loglikelihood_samples) = posterior_ess_Sigma(
            Y_validate, mu_validate,
            self.a[validate_subset], self.b[validate_subset], self.c[validate_subset],
            Sigma=Sigma,
            nburn=validation_burn,
            nsamples=validation_mcmc_samples,
            nthin=validation_thin,
            print_freq=1)

        # Save the result
        self.prior_Sigma[fold_idx] = Sigma_samples.mean(axis=0)
        print('Last sample:')
        print(pretty_str(Sigma_samples[-1]))
        print('Mean:')
        print(pretty_str(self.prior_Sigma[fold_idx]))

        if checkpoint:
            self.clean_checkpoint(save_dir, fold_idx)

    print('Finished training.')
    return {'train_losses': self.train_losses,
            'validation_losses': self.val_losses,
            'mu': self.prior_mu,
            'Sigma': self.prior_Sigma,
            'models': self.models}
from drug_features_prior import DrugResponsePrior as DrugFeaturePrior

print('Loading drug features')
Z = load_dataset(args.drug_features, index_col=0).T
model_fn = lambda: DrugFeaturePrior(df,
                                    genomic_features=X,
                                    drug_features=Z,
                                    cell_embedding_size=args.cell_embedding_size,
                                    drug_embedding_size=args.drug_embedding_size)

print('Building optimizer')
ebo = EmpiricalBayesOptimizer(Y, a, b, c, lam_path=args.lam_path)

if args.cell_line_folds:
    print('Creating cell line folds using only those with features')
    cell_lines_with_features = list(set(X.columns) & set(df['CELL_LINE_NAME'].unique()))
    cell_line_folds = create_folds(len(cell_lines_with_features), args.nfolds)
    cell_line_to_fold = {}
    for fold_idx, fold_cell_lines in enumerate(cell_line_folds):
        for c in fold_cell_lines:
            cell_line_to_fold[cell_lines_with_features[c]] = fold_idx
    folds = [[] for _ in range(args.nfolds)]
    for idx, c in enumerate(df['CELL_LINE_NAME']):
        if c in cell_line_to_fold:
            folds[cell_line_to_fold[c]].append(idx)
    for fold_idx, fold in enumerate(folds):
        print('Fold {}: {}'.format(fold_idx, len(fold)))
else:
    folds = None

print('Training model')
results = ebo.train(model_fn, num_epochs=args.nepochs,
# One-hot encode the categorical features
onehot(df, [
    'school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob',
    'reason', 'guardian'
])
unitize(df, [
    'age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures_mat',
    'failures_por', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health'
])
standardize(df, ['absences_mat', 'absences_por'])

# Convert binary yes/no answers into binary 1/0
df.replace('yes', 1, inplace=True)
df.replace('no', 0, inplace=True)

# Create the target columns
df['G3_mat'] = df['G3']
df['G3_por'] = df_por['G3']
del df['G3']
del df['G2']
del df['G1']

with pd.option_context('display.max_columns', 1000):
    print df.describe()

create_folds('experiments/uci/data/splits/student_performance', df)
save_details('student_performance', len(df), df.shape[1] - 2,
             (df['G3_mat'].max() + 1, df['G3_por'].max() + 1))
'''Preprocessing code for the Housing data: https://archive.ics.uci.edu/ml/datasets/Housing'''
import numpy as np
import pandas as pd
from utils import standardize, unitize, create_folds, save_details

names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
    'PTRATIO', 'B', 'LSTAT', 'MEDV'
]
df = pd.read_table('experiments/uci/data/housing.data.txt',
                   header=None, delim_whitespace=True, names=names)

# Preprocess the features
standardize(df, [
    'CRIM', 'INDUS', 'NOX', 'RM', 'AGE', 'DIS', 'TAX', 'PTRATIO', 'B', 'LSTAT'
])
unitize(df, ['ZN', 'RAD'])

# Convert from one significant decimal place to discrete integers
df['MEDV'] = (df['MEDV'] * 10).apply(np.int32)
df['MEDV'] -= df['MEDV'].min()

print df.describe()

create_folds('experiments/uci/data/splits/housing', df)
save_details('housing', len(df), df.shape[1] - 1, df['MEDV'].max() + 1)
'''Preprocessing code for the Parkinsons Telemonitoring data: https://archive.ics.uci.edu/ml/datasets/Parkinsons+Telemonitoring'''
import numpy as np
import pandas as pd
from utils import standardize, unitize, onehot, create_folds, save_details

df = pd.read_table('experiments/uci/data/parkinsons_updrs.data.txt', header=0, sep=',')

# Create a one-hot encoding of the subject ID
onehot(df, ['subject#'])

# Move the target columns to the end
cols = df.columns.tolist()
cols = cols[:3] + cols[5:] + cols[3:5]
df = df[cols]

# Preprocess the features
unitize(df, ['age'])
standardize(df, cols[2:-2])

# Create discrete labels
df['motor_UPDRS'] = (df['motor_UPDRS'].round()).apply(np.int32)
df['motor_UPDRS'] -= df['motor_UPDRS'].min()
df['total_UPDRS'] = (df['total_UPDRS'].round()).apply(np.int32)
df['total_UPDRS'] -= df['total_UPDRS'].min()

print df.describe()

create_folds('experiments/uci/data/splits/parkinsons', df)
save_details('parkinsons', len(df), df.shape[1] - 2,
             (df['motor_UPDRS'].max() + 1, df['total_UPDRS'].max() + 1))
# Get command line argument for data location
argument_list = sys.argv[1:]
path = str(argument_list[0])
filename = os.path.basename(path)
filedir = path.replace(filename, '')
data = parse_c45(filename, filedir)

# Define epsilon value and type of noise
epsilon = float(argument_list[1])
noise_type = argument_list[2]

# Convert c45 data to DataFrame and create folds
unprocessed_df = data_to_dataframe(data)
attr_dict = create_attr_dict(data.schema)
df_whole, _ = process_data(unprocessed_df, attr_dict)
folds = create_folds(df_whole)

# Create a DataFrame to store important metrics
metrics_df = pd.DataFrame(columns=['fold', 'accuracy', 'precision', 'recall'])
metrics = []

# Perform 5-fold cross-validation
for i in range(len(folds)):
    print('Predicting Fold : ', i + 1)
    train, test = train_test_split(folds, i)
    plr = PrivLinReg(train, epsilon=epsilon, noise_type=noise_type)
    trained_model = plr.train_model(epochs=500, learning_rate=0.001)
    acc, precision, recall, auc = metrics_for_fold(test, model=trained_model)
    metrics.append([acc, precision, recall, auc])

metrics_df = pd.DataFrame(np.array(metrics),