def main():
    df = get_data_set('idao_dataset/train', save_to_csv=False)
    create_folds(df, 5, config)

    run_training(1, config, mode='clf')
    run_training(1, config, mode='reg')

    clf_preds, reg_preds = predict(config)

    sub_df = pd.read_csv(sub_df_path)
    sub_df['classification_predictions'] = clf_preds
    sub_df['regression_predictions'] = reg_preds
    sub_df['regression_predictions'] = sub_df['regression_predictions'].apply(
        transform)

    sub_df.to_csv('Final_Submission.csv', index=False)
Example #2
def gtv_cvlam(X, y, q, num_folds=5, num_lams=20):
    n = len(X)
    folds = create_folds(n, num_folds)
    scores = np.zeros(num_lams)
    lams = None
    for i, fold in enumerate(folds):
        mask = np.ones(n, dtype=bool)
        mask[fold] = False
        x_train, y_train = X[mask], y[mask]
        x_test, y_test = X[~mask], y[~mask]
        data, weights, grid = bucket_vals(x_train, y_train, q)
        results = solve_gfl(data,
                            None,
                            weights=weights,
                            full_path=True,
                            minlam=0.1,
                            maxlam=20.,
                            numlam=num_lams)
        fold_score = np.array([
            mse(y_test, predict(x_test, beta, grid))
            for beta in results['beta']
        ])
        scores += fold_score
        if i == 0:
            lams = results['lambda']
    scores /= float(num_folds)
    lam_best = lams[np.argmin(scores)]
    data, weights, grid = bucket_vals(X, y, q)
    beta = solve_gfl(data, None, weights=weights, lam=lam_best)
    return beta.reshape(q), grid
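
# NOTE: a sketch, not the original utility. The cross-validation loop above
# assumes a create_folds(n, num_folds) helper that partitions the sample
# indices into disjoint held-out folds; a minimal version could look like:
def create_folds(n, num_folds):
    # Shuffle the sample indices and split them into roughly equal folds;
    # each entry is the array of test indices for one fold.
    indices = np.arange(n)
    np.random.shuffle(indices)
    return np.array_split(indices, num_folds)
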
    def setup(self, model_path, cell_lines, drugs, drug_ids, features,
                    X, Y, A, B, C, raw_index,
                    lam_gridsize=100, nfolds=10, **kwargs):
        '''Initializes the model and caches certain statistics.'''
        self.model_path = model_path
        self.cell_lines = cell_lines
        self.drugs = drugs
        self.drug_ids = drug_ids
        self.features = features
        self.X = X
        self.A = A
        self.B = B
        self.C = C
        self.Y = Y
        self.raw_index = raw_index
        
        assert A.shape == Y.shape[:-1]
        assert B.shape == Y.shape[:-1]
        assert C.shape == Y.shape[:-1]

        self.Y_shape = Y.shape
        self.nsamples = Y.shape[0]
        self.ndrugs = Y.shape[1]
        self.ndoses = Y.shape[2]
        self.nfeatures = X.shape[1]

        # Cache which doses are missing and put in dummy values
        from scipy.stats import gamma
        self.obs_mask = (~np.isnan(Y)).astype(int)
        self.A = np.nan_to_num(self.A, nan=1)
        self.B = np.nan_to_num(self.B, nan=1)
        self.C = np.nan_to_num(self.C, nan=1)
        self.Y = np.nan_to_num(self.Y, nan=0)*self.obs_mask + (1-self.obs_mask)*2

        # We approximate the integral over lambda with a finite grid of lam_gridsize points
        print('Caching lambda integral approximation')
        self.lam_gridsize = lam_gridsize
        self.lam_grid = np.transpose(np.linspace(gamma.ppf(1e-3, self.A, scale=self.B),
                                                 gamma.ppf(1-1e-3, self.A, scale=self.B),
                                                 self.lam_gridsize), [1,2,0])
        self.lam_weights = gamma.pdf(self.lam_grid, self.A[...,None], scale=self.B[...,None])
        self.lam_weights = (self.lam_weights / self.lam_weights.sum(axis=-1, keepdims=True)).clip(1e-6, 1-1e-6)
        self.log_lam_weights = np.log(self.lam_weights)
        # np.save(os.path.join(self.model_path, 'lam_grid.npy'), self.lam_grid)
        # np.save(os.path.join(self.model_path, 'lam_weights.npy'), self.lam_weights)

        # Split the data into K folds
        self.nfolds = nfolds
        self.folds = create_folds(self.nsamples, self.nfolds)

        # The out-of-sample predictions
        self.mu = np.full(self.Y.shape, np.nan)
Example #4
names = [
    'Sex', 'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight',
    'Viscera weight', 'Shell weight', 'Rings'
]
df = pd.read_table('experiments/uci/data/abalone.data.txt',
                   header=None,
                   sep=',',
                   names=names)

# Preprocess the features
onehot(df, ['Sex'])
standardize(df, [
    'Length', 'Diameter', 'Height', 'Whole weight', 'Shucked weight',
    'Viscera weight', 'Shell weight'
])

# Reorder columns to put the target column at the end
cols = df.columns.tolist()
cols = cols[:-4] + cols[-3:] + cols[-4:-3]
df = df[cols]

# Convert the ring counts to zero-based integer labels
df['Rings'] = df['Rings'].apply(np.int32)
df['Rings'] -= df['Rings'].min()

print(df.describe())

create_folds('experiments/uci/data/splits/abalone', df)

save_details('abalone', len(df), df.shape[1] - 1, df['Rings'].max() + 1)
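
# NOTE: standardize, unitize, and onehot come from a local utils module that
# is not shown on this page. Rough sketches of the in-place column transforms,
# assuming they modify the DataFrame directly:
def standardize(df, cols):
    # Center each column to zero mean and unit variance.
    for c in cols:
        df[c] = (df[c] - df[c].mean()) / df[c].std()

def unitize(df, cols):
    # Rescale each column to the [0, 1] range.
    for c in cols:
        df[c] = (df[c] - df[c].min()) / (df[c].max() - df[c].min())

def onehot(df, cols):
    # Replace each categorical column with one-hot indicator columns,
    # appended at the end of the DataFrame.
    for c in cols:
        dummies = pd.get_dummies(df[c], prefix=c)
        for dummy_col in dummies.columns:
            df[dummy_col] = dummies[dummy_col]
        del df[c]
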
Example #5
    def train(self,
              model_fn=None,
              lasso=0.,
              l2=1e-4,
              lr=3e-4,
              num_epochs=250,
              batch_size=None,
              num_folds=3,
              val_pct=0.1,
              verbose=False,
              folds=None,
              weight_decay=0.01,
              random_restarts=1,
              save_dir='/tmp/',
              momentum=0.9,
              patience=3,
              clip_gradients=None):
        # Make sure we have a model of the prior
        if model_fn is None:
            model_fn = lambda nfeatures: DeepAdaptiveFDRModeler(nfeatures)

        # Lasso penalty (if any)
        lasso = autograd.Variable(torch.FloatTensor([lasso]),
                                  requires_grad=False)
        l2 = autograd.Variable(torch.FloatTensor([l2]), requires_grad=False)

        if batch_size is None:
            batch_size = int(
                max(10, min(100, np.round(self.X.shape[0] / 100.))))
            print('Batch size: {}'.format(batch_size))

        # Discrete approximation of a beta PDF support
        tbeta_grid = autograd.Variable(torch.FloatTensor(self.beta_grid),
                                       requires_grad=False)
        sys.stdout.flush()
        # Split the data into a bunch of cross-validation folds
        if folds is None:
            if verbose:
                print('\tCreating {} folds'.format(num_folds))
                sys.stdout.flush()
            folds = create_folds(self.X, k=num_folds)
        self.priors = np.zeros((self.nsamples, 2), dtype=float)
        self.models = []
        train_losses, val_losses = np.zeros(
            (len(folds), random_restarts, num_epochs)), np.zeros(
                (len(folds), random_restarts, num_epochs))
        epochs_per_fold = np.zeros(len(folds))
        for fold_idx, test_indices in enumerate(folds):
            # Create train/validate splits
            mask = np.ones(self.nsamples, dtype=bool)
            mask[test_indices] = False
            indices = np.arange(self.nsamples, dtype=int)[mask]
            np.random.shuffle(indices)
            train_cutoff = int(np.round(len(indices) * (1 - val_pct)))
            train_indices = indices[:train_cutoff]
            validate_indices = indices[train_cutoff:]
            torch_test_indices = autograd.Variable(
                torch.LongTensor(test_indices), requires_grad=False)
            best_loss = None

            # Try re-initializing a few times
            for restart in range(random_restarts):
                model = model_fn(self.nfeatures)

                # Setup the optimizers
                # optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay, momentum=momentum)
                # scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=patience)
                optimizer = optim.RMSprop(model.parameters(),
                                          lr=lr,
                                          weight_decay=weight_decay)
                # optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
                # Train the model
                for epoch in range(num_epochs):
                    if verbose:
                        print('\t\tRestart {} Fold {} Epoch {}'.format(
                            restart + 1, fold_idx + 1, epoch + 1))
                        sys.stdout.flush()

                    train_loss = torch.Tensor([0])
                    for batch_idx, batch in enumerate(
                            batches(train_indices, batch_size, shuffle=False)):
                        if verbose and (batch_idx % 100 == 0):
                            print('\t\t\tBatch {}'.format(batch_idx))
                        tidx = autograd.Variable(torch.LongTensor(batch),
                                                 requires_grad=False)

                        # Set the model to training mode
                        model.train()

                        # Reset the gradient
                        model.zero_grad()

                        # Run the model and get the prior predictions
                        concentrations = model(self.tX[tidx])

                        # Calculate the loss as the negative log-likelihood of the data
                        # Use a beta prior for the treatment effect
                        prior_dist = torch.distributions.Beta(
                            concentrations[:, 0:1], concentrations[:, 1:2])

                        # Discretize the (0,1) interval to approximate the beta PDF
                        prior_probs = prior_dist.log_prob(tbeta_grid).exp()
                        prior_probs = prior_probs / prior_probs.sum(
                            dim=1, keepdim=True)

                        # Calculate the loss
                        posterior_probs = (((1 - tbeta_grid) * self.tP0[tidx] +
                                            tbeta_grid * self.tP1[tidx]) *
                                           prior_probs).sum(dim=1)
                        loss = -posterior_probs.log().mean()

                        # L1 penalty to shrink c and be more conservative
                        regularized_loss = loss + lasso * concentrations.mean(
                        ) + l2 * (concentrations**2).mean()

                        # Update the model with gradient clipping for stability
                        regularized_loss.backward()

                        # Clip the gradients if needed
                        if clip_gradients is not None:
                            torch.nn.utils.clip_grad_norm_(
                                model.parameters(), clip_gradients)

                        # Apply the update
                        [p for p in model.parameters() if p.requires_grad]
                        optimizer.step()

                        # Track the loss
                        train_loss += loss.data

                    validate_loss = torch.Tensor([0])
                    for batch_idx, batch in enumerate(
                            batches(validate_indices, batch_size)):
                        if verbose and (batch_idx % 100 == 0):
                            print(
                                '\t\t\tValidation Batch {}'.format(batch_idx))
                        tidx = autograd.Variable(torch.LongTensor(batch),
                                                 requires_grad=False)

                        # Set the model to test mode
                        model.eval()

                        # Reset the gradient
                        model.zero_grad()

                        # Run the model and get the prior predictions
                        concentrations = model(self.tX[tidx])

                        # Calculate the loss as the negative log-likelihood of the data
                        # Use a beta prior for the treatment effect
                        prior_dist = torch.distributions.Beta(
                            concentrations[:, 0:1], concentrations[:, 1:2])

                        # Discretize the (0,1) interval to approximate the beta PDF
                        prior_probs = prior_dist.log_prob(tbeta_grid).exp()
                        prior_probs = (prior_probs / prior_probs.sum(
                            dim=1, keepdim=True)).clamp(1e-8, 1 - 1e-8)

                        # Calculate the loss
                        posterior_probs = (((1 - tbeta_grid) * self.tP0[tidx] +
                                            tbeta_grid * self.tP1[tidx]) *
                                           prior_probs).sum(dim=1).clamp(
                                               1e-8, 1 - 1e-8)
                        loss = -posterior_probs.log().sum()

                        # Track the loss
                        validate_loss += loss.data

                    train_losses[fold_idx, restart,
                                 epoch] = train_loss.numpy() / float(
                                     len(train_indices))
                    val_losses[fold_idx, restart,
                               epoch] = validate_loss.numpy() / float(
                                   len(validate_indices))

                    # # Adjust the learning rate down if the validation performance is bad
                    # scheduler.step(val_losses[fold_idx, epoch])

                    # Check if we currently have the best held-out log-likelihood
                    if verbose:
                        print('Validation loss: {} Best: {}'.format(
                            val_losses[fold_idx, restart, epoch], best_loss))
                    if (restart == 0 and epoch == 0
                        ) or val_losses[fold_idx, restart, epoch] <= best_loss:
                        if verbose:
                            print(
                                '\t\t\tSaving test set results.      <----- New high water mark for fold {} on epoch {}'
                                .format(fold_idx + 1, epoch + 1))
                        # If so, use the current model on the test set
                        best_loss = val_losses[fold_idx, restart, epoch]
                        epochs_per_fold[fold_idx] = epoch + 1
                        self.priors[test_indices] = model(
                            self.tX[torch_test_indices]).data.numpy()
                        torch.save(model,
                                   save_dir + '_fold{}.pt'.format(fold_idx))

                    if verbose:
                        means = self.priors[test_indices,
                                            0] / self.priors[test_indices].sum(
                                                axis=1)
                        print('Prior range: [{},{}]'.format(
                            means.min(), means.max()))
                        print('First 3:')
                        print(self.priors[test_indices][:3])

            # Reload the best model
            self.models.append(
                torch.load(save_dir + '_fold{}.pt'.format(fold_idx)))

        # Calculate the posterior probabilities
        if verbose:
            print('Calculating posteriors.')
            sys.stdout.flush()
        prior_grid = beta.pdf(self.beta_grid, self.priors[:, 0:1],
                              self.priors[:, 1:2])
        prior_grid /= prior_grid.sum(axis=1, keepdims=True)
        post0 = self.P0 * (1 - self.beta_grid)
        post1 = self.P1 * self.beta_grid
        self.posteriors = ((post1 / (post0 + post1)) * prior_grid).sum(axis=1)
        self.posteriors = self.posteriors.clip(1e-8, 1 - 1e-8)

        if verbose:
            print('Calculating predictions at a {:.2f}% FDR threshold'.format(
                self.fdr * 100))
            sys.stdout.flush()
        self.predictions = calc_fdr(self.posteriors, self.fdr)

        if verbose:
            print('Finished training.')
            sys.stdout.flush()

        self.folds = folds

        return {
            'train_losses': train_losses,
            'validation_losses': val_losses,
            'priors': self.priors,
            'posteriors': self.posteriors,
            'predictions': self.predictions,
            'models': self.models,
            'folds': folds
        }
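
# NOTE: a hypothetical usage sketch based only on the dictionary returned
# above; fdr_model and its constructor are placeholders, not part of the source.
results = fdr_model.train(num_folds=3, num_epochs=250, verbose=True)
# train_losses / validation_losses have shape (folds, restarts, epochs), so
# averaging over the first two axes gives per-epoch learning curves.
mean_val = results['validation_losses'].mean(axis=(0, 1))
print('Best mean validation loss {:.4f} at epoch {}'.format(
    mean_val.min(), int(np.argmin(mean_val)) + 1))
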
Example #6
'''Preprocessing code for the Energy Efficiency data: https://archive.ics.uci.edu/ml/datasets/Energy+efficiency'''
import numpy as np
import pandas as pd
from utils import standardize, unitize, create_folds, save_details

df = pd.read_table('experiments/uci/data/ENB2012_data.csv', header=0, sep=',')

# Preprocess the features
standardize(df, ['X1', 'X2', 'X3', 'X4'])
unitize(df, ['X5', 'X6', 'X7', 'X8'])

# Convert from one significant decimal place to discrete integers
df['Y1'] = (df['Y1'].round()).apply(np.int32)
df['Y1'] -= df['Y1'].min()
df['Y2'] = (df['Y2'].round()).apply(np.int32)
df['Y2'] -= df['Y2'].min()

print(df.describe())

create_folds('experiments/uci/data/splits/energy_efficiency', df)

save_details('energy_efficiency', len(df), df.shape[1] - 2,
             (df['Y1'].max() + 1, df['Y2'].max() + 1))
Example #7
df = pd.read_table('experiments/uci/data/slump_test.data.txt',
                   header=0,
                   sep=',')

# Remove the ID column
del df['No']

# Preprocess the features
# unitize(df, ['age'])
standardize(
    df,
    ['Cement', 'Slag', 'Fly ash', 'Water', 'SP', 'Coarse Aggr.', 'Fine Aggr.'])

# Create discrete labels
df['SLUMP(cm)'] = (df['SLUMP(cm)'].round()).apply(np.int32)
df['SLUMP(cm)'] -= df['SLUMP(cm)'].min()
df['FLOW(cm)'] = (df['FLOW(cm)'].round()).apply(np.int32)
df['FLOW(cm)'] -= df['FLOW(cm)'].min()
df['Compressive Strength (28-day)(Mpa)'] = (
    df['Compressive Strength (28-day)(Mpa)'].round()).apply(np.int32)
df['Compressive Strength (28-day)(Mpa)'] -= df[
    'Compressive Strength (28-day)(Mpa)'].min()

print(df.describe())

create_folds('experiments/uci/data/splits/concrete', df)

save_details('concrete', len(df), df.shape[1] - 3,
             (df['SLUMP(cm)'].max() + 1, df['FLOW(cm)'].max() + 1,
              df['Compressive Strength (28-day)(Mpa)'].max() + 1))
Example #8
    doNormalize = False

    metadatafile = DATADIR + 'annotations/metadata.csv'
    list_genres_of_interest_file = DATADIR + 'annotations/categories.lst'
    severalGenresPerSong = True

    song_data_dict = load_data_to_song_dict(metadatafile,
                                            list_genres_of_interest_file,
                                            DATADIR, severalGenresPerSong)

    ### plot arousal = f ( valence )
    # songid = 732
    # plot_valence_arousal(song_data_dict, songid)

    num_folds = 10
    folds = create_folds(song_data_dict, num_folds)
    # print len(folds[0][0]), len(folds[0][1])

    if doNormalize:
        print('... normalizing folds ...')
        normed_folds = standardize_folds(folds)

        # print '... writing folds to MAT files ...'
        # write_folds_to_mat_files(normed_folds, num_folds)

        print('... writing folds to pickle files ...')
        write_folds_to_pickle_files(normed_folds, num_folds, DATADIR,
                                    doNormalize)

    else:
        print('... writing folds to pickle files ...')
        write_folds_to_pickle_files(folds, num_folds, DATADIR, doNormalize)
Example #9
    DATADIR = '/baie/corpus/emoMusic/train/'
    # DATADIR = './train/'
    doNormalize = False

    metadatafile = DATADIR + 'annotations/metadata.csv'
    list_genres_of_interest_file = DATADIR + 'annotations/categories.lst'
    severalGenresPerSong = True

    song_data_dict = load_data_to_song_dict(metadatafile, list_genres_of_interest_file, DATADIR, severalGenresPerSong)

    ### plot arousal = f ( valence )
    # songid = 732
    # plot_valence_arousal(song_data_dict, songid)

    num_folds = 10
    folds = create_folds(song_data_dict, num_folds)
    # print len(folds[0][0]), len(folds[0][1])

    if doNormalize:
        print('... normalizing folds ...')
        normed_folds = standardize_folds(folds)

    # print '... writing folds to MAT files ...'
    # write_folds_to_mat_files(normed_folds, num_folds)

        print('... writing folds to pickle files ...')
        write_folds_to_pickle_files(normed_folds, num_folds, DATADIR, doNormalize)

    else:
        print('... writing folds to pickle files ...')
        write_folds_to_pickle_files(folds, num_folds, DATADIR, doNormalize)
Example #10
from utils import standardize, unitize, create_folds, save_details

names = [
    'mpg', 'cylinders', 'displacement', 'horsepower', 'weight', 'acceleration',
    'model year', 'origin', 'car name'
]
df = pd.read_table('experiments/uci/data/auto-mpg.data.txt',
                   header=None,
                   delim_whitespace=True,
                   names=names)

# Preprocess the features
unitize(df, ['cylinders', 'model year', 'origin'])
standardize(df, ['displacement', 'horsepower', 'weight', 'acceleration'])
del df['car name']

# Convert from one significant decimal place to discrete integers
df['mpg'] = (df['mpg'] * 10).apply(np.int32)
df['mpg'] -= df['mpg'].min()

# Reorder columns to put the target column at the end
cols = df.columns.tolist()
cols = cols[1:] + cols[0:1]
df = df[cols]

print(df.describe())

create_folds('experiments/uci/data/splits/auto_mpg', df)

save_details('auto_mpg', len(df), df.shape[1] - 1, df['mpg'].max() + 1)
    def train(self, model_fn,
                    bandwidth=2., kernel_scale=0.35, variance=0.02,
                    mvn_train_samples=5, mvn_validate_samples=105,
                    validation_samples=1000,
                    validation_burn=1000,
                    validation_mcmc_samples=1000,
                    validation_thin=1,
                    lr=3e-4, num_epochs=10, batch_size=100,
                    val_pct=0.1, nfolds=5, folds=None,
                    learning_rate_decay=0.9, weight_decay=0.,
                    clip=None, group_lasso_penalty=0.,
                    save_dir='tmp/',
                    checkpoint=False,
                    target_fold=None):
        print('\tFitting model using {} folds and training for {} epochs each'.format(nfolds, num_epochs))
        torch_Y = autograd.Variable(torch.FloatTensor(self.Y), requires_grad=False)
        torch_lam_grid = autograd.Variable(torch.FloatTensor(self.lam_grid), requires_grad=False)
        torch_lam_weights = autograd.Variable(torch.FloatTensor(self.lam_weights), requires_grad=False)
        torch_c = autograd.Variable(torch.FloatTensor(self.c[:,np.newaxis,np.newaxis]), requires_grad=False)
        torch_obs = autograd.Variable(torch.FloatTensor(self.obs_mask), requires_grad=False)
        torch_dose_idxs = [autograd.Variable(torch.LongTensor(
                                np.arange(d+(d**2 - d)//2, (d+1)+((d+1)**2 - (d+1))//2)), requires_grad=False)
                                for d in range(self.ndoses)]

        # Use a fixed kernel
        Sigma = np.array([kernel_scale*(np.exp(-0.5*(i - np.arange(self.ndoses))**2 / bandwidth**2)) for i in np.arange(self.ndoses)]) + variance*np.eye(self.ndoses) # squared exponential kernel
        L = np.linalg.cholesky(Sigma)[np.newaxis,np.newaxis,:,:]

        # Use a fixed set of noise draws for validation
        Z = np.random.normal(size=(self.Y_shape[0], mvn_validate_samples, self.ndoses, 1))
        validate_noise = autograd.Variable(torch.FloatTensor(np.matmul(L, Z)[:,:,:,0]), requires_grad=False)

        self.folds = folds if folds is not None else create_folds(self.Y_shape[0], nfolds)
        nfolds = len(self.folds)
        self.fold_validation_indices = []
        self.prior_mu = np.full(self.Y_shape, np.nan, dtype=float)
        self.prior_Sigma = np.zeros((nfolds, self.ndoses, self.ndoses))
        self.train_losses, self.val_losses = np.zeros((nfolds,num_epochs)), np.zeros((nfolds,num_epochs))
        self.epochs_per_fold = np.zeros(nfolds, dtype=int)
        self.models = [None for _ in range(nfolds)]
        for fold_idx, test_indices in enumerate(self.folds):
            # Create train/validate splits
            mask = np.ones(self.Y_shape[0], dtype=bool)
            mask[test_indices] = False
            indices = np.arange(self.Y_shape[0], dtype=int)[mask]
            np.random.shuffle(indices)
            train_cutoff = int(np.round(len(indices)*(1-val_pct)))
            train_indices = indices[:train_cutoff]
            validate_indices = indices[train_cutoff:]
            torch_test_indices = autograd.Variable(torch.LongTensor(test_indices), requires_grad=False)
            self.fold_validation_indices.append(validate_indices)

            # If we are only training one specific fold, skip all the rest
            if target_fold is not None and target_fold != fold_idx:
                continue

            if checkpoint:
                self.load_checkpoint(save_dir, fold_idx)

            if self.models[fold_idx] is None:
                self.models[fold_idx] = model_fn()

            model = self.models[fold_idx]

            # Setup the optimizers
            # optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay, momentum=0.9)
            optimizer = optim.RMSprop(model.parameters(), lr=lr, weight_decay=weight_decay)
            scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=3)
            for epoch in range(self.epochs_per_fold[fold_idx], num_epochs):
                print('\t\tFold {} Epoch {}'.format(fold_idx+1,epoch+1))
                train_loss = torch.Tensor([0])
                for batch_idx, batch in enumerate(batches(train_indices, batch_size)):
                    if batch_idx % 100 == 0:
                        print('\t\t\tBatch {}'.format(batch_idx))
                        sys.stdout.flush()

                    tidx = autograd.Variable(torch.LongTensor(batch), requires_grad=False)
                    Z = np.random.normal(size=(len(batch), mvn_train_samples, self.ndoses, 1))
                    noise = autograd.Variable(torch.FloatTensor(np.matmul(L, Z)[:,:,:,0]), requires_grad=False)

                    # Set the model to training mode
                    model.train()

                    # Reset the gradient
                    model.zero_grad()

                    # Run the model and get the prior predictions
                    mu = model(batch, tidx)

                    #### Calculate the loss as the negative log-likelihood of the data ####
                    # Get the MVN draw as mu + L.T.dot(Z)
                    beta = mu.view(-1,1,self.ndoses) + noise

                    # Logistic transform on the log-odds prior sample
                    tau = 1 / (1. + (-beta).exp())

                    # Poisson noise model for observations
                    rates = tau[:,:,:,None] * torch_lam_grid[tidx,None,:,:] + torch_c[tidx,None,:,:]
                    likelihoods = torch.distributions.Poisson(rates)

                    # Get log probabilities of the data and filter out the missing observations
                    loss = -(logsumexp(likelihoods.log_prob(torch_Y[tidx][:,None,:,None]) + torch_lam_weights[tidx][:,None,:,:], dim=-1).mean(dim=1) * torch_obs[tidx]).mean()

                    if group_lasso_penalty > 0:
                        loss += group_lasso_penalty * torch.norm(model.cell_line_features.weight, 2, 0).mean()

                    # Update the model
                    loss.backward()
                    if clip is not None:
                        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
                        for p in model.parameters():
                            p.data.add_(p.grad.data, alpha=-lr)
                    else:
                        optimizer.step()

                    train_loss += loss.data

                validate_loss = torch.Tensor([0])
                for batch_idx, batch in enumerate(batches(validate_indices, batch_size, shuffle=False)):
                    if batch_idx % 100 == 0:
                        print('\t\t\tValidation Batch {}'.format(batch_idx))
                        sys.stdout.flush()
                    
                    tidx = autograd.Variable(torch.LongTensor(batch), requires_grad=False)
                    noise = validate_noise[tidx]

                    # Set the model to evaluation mode
                    model.eval()

                    # Reset the gradient
                    model.zero_grad()

                    # Run the model and get the prior predictions
                    mu = model(batch, tidx)

                    #### Calculate the loss as the negative log-likelihood of the data ####
                    # Get the MVN draw as mu + L.T.dot(Z)
                    beta = mu.view(-1,1,self.ndoses) + noise

                    # Logistic transform on the log-odds prior sample
                    tau = 1 / (1. + (-beta).exp())

                    # Poisson noise model for observations
                    rates = tau[:,:,:,None] * torch_lam_grid[tidx,None,:,:] + torch_c[tidx,None,:,:]
                    likelihoods = torch.distributions.Poisson(rates)

                    # Get log probabilities of the data and filter out the missing observations
                    loss = -(logsumexp(likelihoods.log_prob(torch_Y[tidx][:,None,:,None]) + torch_lam_weights[tidx][:,None,:,:], dim=-1).mean(dim=1) * torch_obs[tidx]).sum()

                    validate_loss += loss.data

                self.train_losses[fold_idx, epoch] = train_loss.numpy() / float(len(train_indices))
                self.val_losses[fold_idx, epoch] = validate_loss.numpy() / float(len(validate_indices))

                # Adjust the learning rate down if the validation performance is bad
                scheduler.step(self.val_losses[fold_idx, epoch])

                # Check if we currently have the best held-out log-likelihood
                if epoch == 0 or np.argmin(self.val_losses[fold_idx, :epoch+1]) == epoch:
                    print('\t\t\tNew best score: {}'.format(self.val_losses[fold_idx,epoch]))
                    print('\t\t\tSaving test set results.')
                    # If so, use the current model on the test set
                    mu = model(test_indices, torch_test_indices)
                    self.prior_mu[test_indices] = mu.data.numpy()
                    self.save_fold(save_dir, fold_idx)
                
                cur_mu = self.prior_mu[test_indices]
                print('First 10 data points: {}'.format(test_indices[:10]))
                print('First 10 prior means:')
                print(pretty_str(ilogit(cur_mu[:10])))
                print('Prior mean ranges:')
                for dose in range(self.ndoses):
                    print('{}: {} [{}, {}]'.format(dose,
                                                   ilogit(cur_mu[:,dose].mean()),
                                                   np.percentile(ilogit(cur_mu[:,dose]), 5),
                                                   np.percentile(ilogit(cur_mu[:,dose]), 95)))
                print('Best model score: {} (epoch {})'.format(np.min(self.val_losses[fold_idx,:epoch+1]), np.argmin(self.val_losses[fold_idx, :epoch+1])+1))
                print('Current score: {}'.format(self.val_losses[fold_idx, epoch]))
                print('')

                self.epochs_per_fold[fold_idx] += 1
                
                # Update the save point if needed
                if checkpoint:
                    self.save_checkpoint(save_dir, fold_idx, model)
                    sys.stdout.flush()
                
            
            # Reload the best model
            tmp = model.cell_features
            self.load_fold(save_dir, fold_idx)
            self.models[fold_idx].cell_features = tmp

            print('Finished fold {}. Estimating covariance matrix using elliptical slice sampler with max {} samples.'.format(fold_idx+1, validation_samples))
            validate_subset = np.random.choice(validate_indices, validation_samples, replace=False) if len(validate_indices) > validation_samples else validate_indices
            tidx = autograd.Variable(torch.LongTensor(validate_subset), requires_grad=False)
                        
            # Set the model to evaluation mode
            self.models[fold_idx].eval()

            # Reset the gradient
            self.models[fold_idx].zero_grad()

            # Run the model and get the prior predictions
            mu_validate = self.models[fold_idx](validate_subset, tidx).data.numpy()
            
            # Run the slice sampler to get the covariance and data log-likelihoods
            Y_validate = self.Y[validate_subset].astype(int)
            Y_validate[self.obs_mask[validate_subset] == 0] = -1
            (Beta_samples,
                Sigma_samples,
                Loglikelihood_samples) = posterior_ess_Sigma(Y_validate,
                                                             mu_validate,
                                                             self.a[validate_subset],
                                                             self.b[validate_subset],
                                                             self.c[validate_subset],
                                                             Sigma=Sigma,
                                                             nburn=validation_burn,
                                                             nsamples=validation_mcmc_samples,
                                                             nthin=validation_thin,
                                                             print_freq=1)

            # Save the result
            self.prior_Sigma[fold_idx] = Sigma_samples.mean(axis=0)
            print('Last sample:')
            print(pretty_str(Sigma_samples[-1]))
            print('Mean:')
            print(pretty_str(self.prior_Sigma[fold_idx]))

            if checkpoint:
                self.clean_checkpoint(save_dir, fold_idx)

        print('Finished training.')
        
        return {'train_losses': self.train_losses,
                'validation_losses': self.val_losses,
                'mu': self.prior_mu,
                'Sigma': self.prior_Sigma,
                'models': self.models}
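
# NOTE: the training loop above relies on logsumexp(tensor, dim=...) and
# ilogit(x) helpers that are not shown here; minimal sketches under that
# assumption (recent PyTorch also provides torch.logsumexp directly):
def logsumexp(x, dim=-1):
    # Numerically stable log-sum-exp along the given dimension.
    m, _ = x.max(dim=dim, keepdim=True)
    return (x - m).exp().sum(dim=dim).log() + m.squeeze(dim)

def ilogit(x):
    # Inverse logit (sigmoid) for numpy arrays.
    return 1.0 / (1.0 + np.exp(-x))
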
        from drug_features_prior import DrugResponsePrior as DrugFeaturePrior
        print('Loading drug features')
        Z = load_dataset(args.drug_features, index_col=0).T
        model_fn = lambda: DrugFeaturePrior(df,
                                        genomic_features=X,
                                        drug_features=Z,
                                        cell_embedding_size=args.cell_embedding_size,
                                        drug_embedding_size=args.drug_embedding_size)

    print('Building optimizer')
    ebo = EmpiricalBayesOptimizer(Y, a, b, c, lam_path=args.lam_path)

    if args.cell_line_folds:
        print('Creating cell line folds using only those with features')
        cell_lines_with_features = list(set(X.columns) & set(df['CELL_LINE_NAME'].unique()))
        cell_line_folds = create_folds(len(cell_lines_with_features), args.nfolds)
        cell_line_to_fold = {}
        for fold_idx, fold_cell_lines in enumerate(cell_line_folds):
            for c in fold_cell_lines:
                cell_line_to_fold[cell_lines_with_features[c]] = fold_idx
        folds = [[] for _ in range(args.nfolds)]
        for idx, c in enumerate(df['CELL_LINE_NAME']):
            if c in cell_line_to_fold:
                folds[cell_line_to_fold[c]].append(idx)
        for fold_idx, fold in enumerate(folds):
            print('Fold {}: {}'.format(fold_idx, len(fold)))
    else:
        folds = None

    print('Training model')
    results = ebo.train(model_fn, num_epochs=args.nepochs,
# One-hot encode the categorical features
onehot(df, [
    'school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason',
    'guardian'
])
unitize(df, [
    'age', 'Medu', 'Fedu', 'traveltime', 'studytime', 'failures_mat',
    'failures_por', 'famrel', 'freetime', 'goout', 'Dalc', 'Walc', 'health'
])
standardize(df, ['absences_mat', 'absences_por'])

# Convert yes/no columns to binary 1/0
df.replace('yes', 1, inplace=True)
df.replace('no', 0, inplace=True)

# Create the target columns
df['G3_mat'] = df['G3']
df['G3_por'] = df_por['G3']
del df['G3']
del df['G2']
del df['G1']

with pd.option_context('display.max_columns', 1000):
    print(df.describe())

create_folds('experiments/uci/data/splits/student_performance', df)

save_details('student_performance', len(df), df.shape[1] - 2,
             (df['G3_mat'].max() + 1, df['G3_por'].max() + 1))
Example #14
'''Preprocessing code for the Housing data: https://archive.ics.uci.edu/ml/datasets/Housing'''
import numpy as np
import pandas as pd
from utils import standardize, unitize, create_folds, save_details

names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
    'PTRATIO', 'B', 'LSTAT', 'MEDV'
]
df = pd.read_table('experiments/uci/data/housing.data.txt',
                   header=None,
                   delim_whitespace=True,
                   names=names)

# Preprocess the features
standardize(df, [
    'CRIM', 'INDUS', 'NOX', 'RM', 'AGE', 'DIS', 'TAX', 'PTRATIO', 'B', 'LSTAT'
])
unitize(df, ['ZN', 'RAD'])

# Convert from one significant decimal place to discrete integers
df['MEDV'] = (df['MEDV'] * 10).apply(np.int32)
df['MEDV'] -= df['MEDV'].min()

print(df.describe())

create_folds('experiments/uci/data/splits/housing', df)

save_details('housing', len(df), df.shape[1] - 1, df['MEDV'].max() + 1)
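
# NOTE: in these UCI preprocessing scripts create_folds takes a split directory
# and a DataFrame rather than a sample count, and save_details records dataset
# dimensions. Sketches under that assumption (the on-disk layout is a guess):
import os

def create_folds(split_dir, df, k=10):
    # Shuffle the rows and write k train/test CSV splits to the given directory.
    if not os.path.exists(split_dir):
        os.makedirs(split_dir)
    indices = np.arange(len(df))
    np.random.shuffle(indices)
    for fold_idx, test_idx in enumerate(np.array_split(indices, k)):
        mask = np.zeros(len(df), dtype=bool)
        mask[test_idx] = True
        df[~mask].to_csv(os.path.join(split_dir, 'train_{}.csv'.format(fold_idx)), index=False)
        df[mask].to_csv(os.path.join(split_dir, 'test_{}.csv'.format(fold_idx)), index=False)

def save_details(name, nsamples, nfeatures, nlabels):
    # Record the dataset dimensions used by downstream experiment scripts.
    print('{}: {} samples, {} features, {} label values'.format(
        name, nsamples, nfeatures, nlabels))
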
Example #15
'''Preprocessing code for the Parkinsons Telemonitoring data: https://archive.ics.uci.edu/ml/datasets/Parkinsons+Telemonitoring'''
import numpy as np
import pandas as pd
from utils import standardize, unitize, onehot, create_folds, save_details

df = pd.read_table('experiments/uci/data/parkinsons_updrs.data.txt',  header=0, sep=',')

# Create a one-hot encoding of the subject ID
onehot(df, ['subject#'])

# Move the target columns to the end
cols = df.columns.tolist()
cols = cols[:3] + cols[5:] + cols[3:5]
df = df[cols]

# Preprocess the features
unitize(df, ['age'])
standardize(df, cols[2:-2])

# Create discrete labels
df['motor_UPDRS'] = (df['motor_UPDRS'].round()).apply(np.int32)
df['motor_UPDRS'] -= df['motor_UPDRS'].min()
df['total_UPDRS'] = (df['total_UPDRS'].round()).apply(np.int32)
df['total_UPDRS'] -= df['total_UPDRS'].min()

print(df.describe())

create_folds('experiments/uci/data/splits/parkinsons', df)

save_details('parkinsons', len(df), df.shape[1]-2, (df['motor_UPDRS'].max()+1, df['total_UPDRS'].max()+1))
Example #16
# Get command line argument for data location
argument_list = sys.argv[1:]
path = str(argument_list[0])
filename = os.path.basename(path)
filedir = path.replace(filename, '')
data = parse_c45(filename, filedir)

# Define epsilon value and type of noise
epsilon = float(argument_list[1])
noise_type = argument_list[2]

# Convert c45 data to DataFrame and create folds
unprocessed_df = data_to_dataframe(data)
attr_dict = create_attr_dict(data.schema)
df_whole, _ = process_data(unprocessed_df, attr_dict)
folds = create_folds(df_whole)

# Create a DataFrame to store important metrics
metrics_df = pd.DataFrame(columns=['fold', 'accuracy', 'precision', 'recall'])
metrics = []

# Perform 5-fold cross-validation
for i in range(len(folds)):
    print('Predicting Fold : ', i + 1)
    train, test = train_test_split(folds, i)
    plr = PrivLinReg(train, epsilon=epsilon, noise_type=noise_type)
    trained_model = plr.train_model(epochs=500, learning_rate=0.001)
    acc, precision, recall, auc = metrics_for_fold(test, model=trained_model)
    metrics.append([acc, precision, recall, auc])

metrics_df = pd.DataFrame(np.array(metrics),
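
# NOTE: train_test_split(folds, i) above is a project helper, not the sklearn
# function of the same name; a sketch consistent with its usage, assuming
# create_folds returns a list of DataFrames:
def train_test_split(folds, i):
    # Hold out fold i as the test set and concatenate the rest for training.
    test = folds[i]
    train = pd.concat([f for j, f in enumerate(folds) if j != i], ignore_index=True)
    return train, test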