Example #1
0
def main(shape, spacing, origin, nbl, space_order, xs, xr, tn, f0, npasses,
         batch_size, **kwargs):
    """Run an SFO-driven inversion and save the result and its convergence plot.

    The routine builds a "true" and a smooth model, forward-models one shot
    record per source with Ray (these records play the role of observed
    data), and then minimizes ``f_df_multi_shots`` with SFO starting from an
    all-zero perturbation.

    Args:
        shape, spacing, origin, nbl: forwarded unchanged to
            ``get_true_model`` and ``get_smooth_model``.
        space_order: forwarded to the model builders, to the forward
            modeling tasks and to the objective function.
        xs: array-like; only ``xs.shape[0]`` is used, as the source count.
        xr: array-like; only ``xr.shape[0]`` is used, as the receiver count.
        tn, f0: forwarded to ``set_geometry``.
        npasses: number of passes handed to ``SFO.optimize``.
        batch_size: forwarded to ``set_subreferences``.
        **kwargs: accepted for call-site convenience and ignored.

    Side effects:
        Writes ``output/dvel-final.bin`` (float32, C order) and
        ``output/history_sfo.png``. The ``output`` directory must already
        exist; it is not created here.
    """
    # Get true model
    true_model = get_true_model(shape, spacing, origin, nbl, space_order)

    # Get smooth model
    smooth_model = get_smooth_model(shape, spacing, origin, nbl, space_order)

    # Compute initial born perturbation from m - m0
    dm = (true_model.vp.data**(-2) - smooth_model.vp.data**(-2))

    # Geometry
    nsrc = xs.shape[0]
    nrec = xr.shape[0]
    geometry0 = set_geometry(smooth_model, nsrc, nrec, f0, tn, t0=0)

    # Compute observed data in parallel (inverse crime).
    # In real life we would read the SEG-Y data here.
    # All tasks are submitted before any result is fetched so they can
    # run concurrently.
    futures = [
        forward_modeling.remote(dm, i, smooth_model, geometry0, space_order)
        for i in range(geometry0.nsrc)
    ]
    dobs = np.zeros((geometry0.nt * geometry0.nrec, geometry0.nsrc),
                    dtype=np.float32)
    for i, future in enumerate(futures):
        dobs[:, i] = ray.get(future)

    # List containing an identifying element for each subfunction
    sub_refs = set_subreferences(dobs, geometry0, batch_size)

    # Initial guess
    theta_init = np.zeros(smooth_model.shape, dtype=np.float32)

    # Initialize the optimizer
    optimizer = SFO(f_df_multi_shots, theta_init, sub_refs,
                    [geometry0, smooth_model, space_order])

    # Run the optimizer for npasses passes through the data
    theta = optimizer.optimize(num_passes=npasses)

    # Write inverted reflectivity to disk. The context manager guarantees the
    # handle is flushed and closed even if the write fails.
    scopy = theta.reshape(smooth_model.shape).astype(
        np.float32).copy(order='C')
    with open('output/dvel-final.bin', "wb") as fh:
        fh.write(scopy)

    # Create a plot with the minibatch function values
    plt.plot(np.array(optimizer.hist_f_flat))
    plt.xlabel('Iteration')
    plt.ylabel('Minibatch Function Value')
    plt.title('Convergence Trace')
    plt.savefig('output/history_sfo.png')
    def init_optimizer(self, closure):
        """Build an SFO optimizer over ``self.params`` using minibatches.

        ``closure(x, y)`` is called once per subfunction evaluation and its
        return value is used as the loss; the gradients are then read from
        each parameter's ``.grad``, so the closure is expected to populate
        them (presumably by calling ``backward()`` -- confirm in the caller).

        Returns the constructed ``SFO`` instance; nothing is optimized here.
        """

        def f_df(newparams, data):
            # Wrap the minibatch stored in the subfunction reference.
            inputs = Variable(data['x'])
            targets = Variable(data['y'])

            # Clear stale gradients and load the parameter values proposed
            # by the optimizer into the live parameters.
            for idx, param in enumerate(self.params):
                if param.grad is not None:
                    param.grad.data.zero_()
                param.data = torch.from_numpy(newparams[idx]).float()

            loss = closure(inputs, targets)

            # One gradient array per parameter, in parameter order.
            gradients = [param.grad.data.numpy() for param in self.params]
            loss = loss.data.numpy()
            return loss, gradients

        # One subfunction reference per minibatch: consecutive slices of
        # batch_size samples along the first axis of data / target.
        batch = self.batch_size
        sub_refs = [
            {
                'x': self.data[k * batch:(k + 1) * batch, :, :, :],
                'y': self.target[k * batch:(k + 1) * batch],
            }
            for k in range(self.N)
        ]

        # Starting point: the current parameter values as numpy arrays.
        params_init = [param.data.numpy() for param in self.params]

        return SFO(f_df, params_init, sub_refs)
    def __init__(self, model, calculate_full_objective=True, num_projection_dims=5, full_objective_per_pass=4):
        """
        Prepare the bookkeeping needed to train `model` with several
        optimizers while recording the objective along the way.

        Stores the model, creates an empty per-learner history, builds an
        SFO instance that is used only to flatten/unflatten parameters,
        draws a random matrix for projecting the flattened parameters to
        `num_projection_dims` dimensions, and derives how many subfunction
        evaluations lie between two recordings from
        `full_objective_per_pass`.

        Recording is much slower than running the optimizers directly when
        `calculate_full_objective` is set.

        Designed to be used by figure_convergence.py.
        """

        self.model = model
        self.calculate_full_objective = calculate_full_objective

        # every recorded quantity keeps one list per learner name
        self.history = dict(
            (key, defaultdict(list))
            for key in ('f', 'x_projection', 'events', 'x'))

        # we use SFO to flatten/unflatten parameters for the other optimizers
        subfunction_refs = self.model.subfunction_references
        self.x_map = SFO(self.model.f_df, self.model.theta_init, subfunction_refs)
        self.xinit_flat = self.x_map.theta_original_to_flat(self.model.theta_init)

        # random projection, scaled by 1/sqrt(number of flat parameters)
        num_params = self.xinit_flat.shape[0]
        projection = np.random.randn(num_projection_dims, num_params)
        self.x_projection_matrix = projection/np.sqrt(num_params)

        self.num_subfunctions = len(subfunction_refs)
        evaluations_between_records = self.num_subfunctions/full_objective_per_pass
        self.full_objective_period = int(evaluations_between_records)
Example #4
0
    def SFO_variations(self, num_passes=20):
        """
        Run five SFO configurations back to back: standard, all subfunctions
        active from the start, rank-1 Hessian updates, random subfunction
        selection and cyclic subfunction selection.

        The first three runs reseed numpy's RNG with 0 beforehand; the last
        two continue from whatever RNG state the previous run left behind.
        """

        def run_variant(label, reseed, **sfo_options):
            # One labelled run with a freshly constructed optimizer.
            if reseed:
                np.random.seed(0)  # make experiments repeatable
            self.learner_name = label
            print("\n\n" + self.learner_name)
            self.optimizer = SFO(self.f_df_wrapper,
                                 self.model.theta_init,
                                 self.model.subfunction_references,
                                 **sfo_options)
            self.optimizer.optimize(num_passes=num_passes)

        run_variant('SFO standard', True)
        run_variant('SFO all active', True,
                    init_subf=len(self.model.subfunction_references))
        run_variant('SFO rank 1', True, hessian_algorithm='rank1')
        run_variant('SFO random', False, subfunction_selection='random')
        run_variant('SFO cyclic', False, subfunction_selection='cyclic')
def explore_MN(burnin_steps=2, test_steps=2):
    """Measure SFO overhead per pass across a sweep of problem sizes.

    Two sweeps are concatenated: the number of dimensions M varies over
    ``linspace(1, 1e6, 5)`` with N fixed at 100, then the number of
    subfunctions N varies over ``linspace(1, 200, 5)`` with M fixed at 1e6.
    For each (M, N) pair a toy model is optimized for ``burnin_steps``
    passes, then for ``test_steps`` further passes; the change in
    ``optimizer.time_pass - optimizer.time_func`` over the second stage is
    divided by the number of passes actually performed (function
    evaluations / N).

    Args:
        burnin_steps: passes run before timing starts.
        test_steps: passes requested for the timed stage.

    Returns:
        Tuple ``(M_arr, N_arr, T_arr)`` of numpy arrays with one entry per
        case: dimensions, subfunction count, and measured time per pass.
    """
    M_arr = []
    N_arr = []

    # sweep M at fixed N (for a quicker run: N = 50, np.linspace(1, 1e3, 4))
    N = 100
    for M in np.linspace(1, 1e6, 5):
        M_arr.append(int(M))
        N_arr.append(int(N))

    # sweep N at fixed M (for a quicker run: M = 1e3, np.linspace(1, 50, 4))
    M = 1e6
    for N in np.linspace(1, 200, 5):
        M_arr.append(int(M))
        N_arr.append(int(N))

    T_arr = []
    num_cases = len(M_arr)

    for ii, (M, N) in enumerate(zip(M_arr, N_arr)):
        # a parenthesized single argument prints identically on Python 2 and 3
        print("case %d of %d, M=%g, N=%g" % (ii + 1, num_cases, M, N))

        # make the model
        model = models.toy(num_subfunctions=N, num_dims=M)
        # initialize the optimizer
        optimizer = SFO(model.f_df, model.theta_init,
                        model.subfunction_references, display=1)
        # burn in the optimizer, to make sure the subspace has eg. reached
        # its full size
        optimizer.optimize(num_passes=burnin_steps)

        # time spent in the optimizer (excluding the objective) during burn in
        t0 = optimizer.time_pass - optimizer.time_func
        steps0 = np.sum(optimizer.eval_count)
        optimizer.optimize(num_passes=test_steps)
        t1 = optimizer.time_pass - optimizer.time_func
        t_diff = t1 - t0
        steps1 = np.sum(optimizer.eval_count)
        # passes actually performed in the timed stage
        actual_test_steps = float(steps1 - steps0)/float(N)
        T_arr.append(t_diff/actual_test_steps)
        print(T_arr[-1])

    return np.array(M_arr), np.array(N_arr), np.array(T_arr)
Example #6
0
    def SFO(self, num_passes=20, learner_name='SFO', **kwargs):
        """
        Run one SFO optimization of the model, recorded under `learner_name`.

        Any extra keyword arguments are handed to the SFO constructor. The
        optimizer is kept on `self.optimizer`; nothing is returned.
        """
        self.learner_name = learner_name
        banner = "\n\n" + self.learner_name
        print(banner)

        model = self.model
        self.optimizer = SFO(self.f_df_wrapper,
                             model.theta_init,
                             model.subfunction_references,
                             **kwargs)
        # to verify gradients first, call self.optimizer.check_grad() here
        self.optimizer.optimize(num_passes=num_passes)
Example #7
0
    def __init__(self,
                 model,
                 calculate_full_objective=True,
                 num_projection_dims=5,
                 full_objective_per_pass=4):
        """
        Set up state for training `model` with several optimizers while
        tracking the objective.

        The constructor keeps a reference to the model, starts an empty
        history keyed by learner name, creates an SFO instance that serves
        purely as a parameter flattener, samples a random projection of the
        flattened parameters onto `num_projection_dims` dimensions, and
        converts `full_objective_per_pass` into a recording period measured
        in subfunction evaluations.

        With `calculate_full_objective` enabled this is far slower than
        calling the optimizers directly.

        Designed to be used by figure_convergence.py.
        """

        self.model = model
        self.calculate_full_objective = calculate_full_objective

        # per-learner traces for each recorded quantity
        history = {}
        for key in ('f', 'x_projection', 'events', 'x'):
            history[key] = defaultdict(list)
        self.history = history

        # we use SFO to flatten/unflatten parameters for the other optimizers
        references = self.model.subfunction_references
        self.x_map = SFO(self.model.f_df, self.model.theta_init, references)
        self.xinit_flat = self.x_map.theta_original_to_flat(
            self.model.theta_init)

        # random projection scaled by 1/sqrt(flat parameter count)
        flat_size = self.xinit_flat.shape[0]
        scale = np.sqrt(flat_size)
        self.x_projection_matrix = np.random.randn(num_projection_dims,
                                                   flat_size) / scale

        self.num_subfunctions = len(references)
        period = self.num_subfunctions / full_objective_per_pass
        self.full_objective_period = int(period)
Example #8
0
def explore_MN(burnin_steps=2, test_steps=2):
    """Time SFO's per-pass overhead for a range of problem sizes.

    The cases are five values of M (dimensions) from ``linspace(1, 1e6, 5)``
    at N = 100 subfunctions, followed by five values of N from
    ``linspace(1, 200, 5)`` at M = 1e6. Each case optimizes a toy model for
    ``burnin_steps`` passes and then ``test_steps`` more; the growth of
    ``optimizer.time_pass - optimizer.time_func`` during the second stage is
    normalized by the passes actually completed (evaluations / N).

    Args:
        burnin_steps: number of untimed warm-up passes.
        test_steps: number of passes requested for the timed stage.

    Returns:
        ``(M_arr, N_arr, T_arr)`` as numpy arrays, one entry per case.
    """
    M_arr = []
    N_arr = []

    # vary M with N fixed (smaller alternative: N = 50, linspace(1, 1e3, 4))
    N = 100
    for M in np.linspace(1, 1e6, 5):
        M_arr.append(int(M))
        N_arr.append(int(N))

    # vary N with M fixed (smaller alternative: M = 1e3, linspace(1, 50, 4))
    M = 1e6
    for N in np.linspace(1, 200, 5):
        M_arr.append(int(M))
        N_arr.append(int(N))

    T_arr = []
    total_cases = len(M_arr)

    for case, (M, N) in enumerate(zip(M_arr, N_arr), 1):
        # single parenthesized argument: same output on Python 2 and 3
        print("case %d of %d, M=%g, N=%g" % (case, total_cases, M, N))

        # make the model
        model = models.toy(num_subfunctions=N, num_dims=M)
        # initialize the optimizer
        optimizer = SFO(model.f_df,
                        model.theta_init,
                        model.subfunction_references,
                        display=1)
        # burn in the optimizer, to make sure the subspace has eg. reached
        # its full size
        optimizer.optimize(num_passes=burnin_steps)

        # optimizer-only time (objective time excluded) accumulated so far
        t0 = optimizer.time_pass - optimizer.time_func
        steps0 = np.sum(optimizer.eval_count)
        optimizer.optimize(num_passes=test_steps)
        t1 = optimizer.time_pass - optimizer.time_func
        t_diff = t1 - t0
        steps1 = np.sum(optimizer.eval_count)
        # passes actually completed during the timed stage
        actual_test_steps = float(steps1 - steps0) / float(N)
        T_arr.append(t_diff / actual_test_steps)
        print(T_arr[-1])

    return np.array(M_arr), np.array(N_arr), np.array(T_arr)
# Problem size for the demo.
M = 20  # number visible units
J = 10  # number hidden units
D = 100000  # full data batch size
N = int(np.sqrt(D) / 10.0)  # number minibatches

# Random training data, one column per sample.
v = randn(M, D)

# Subfunction-specific arguments: minibatch k takes every N-th column of
# the data, starting at column k.
sub_refs = [v[:, start::N] for start in range(N)]

# Random starting parameters, drawn in the order W, b_h, b_v.
theta_init = dict(W=randn(J, M), b_h=randn(J, 1), b_v=randn(M, 1))

# Build the optimizer. To test the gradient of f_df, call
# optimizer.check_grad() right after this line.
optimizer = SFO(f_df, theta_init, sub_refs)

# One pass through the data first, then another 20 passes that continue
# from where the first call stopped.
theta = optimizer.optimize(num_passes=1)
theta = optimizer.optimize(num_passes=20)

# Plot the convergence trace.
trace = np.array(optimizer.hist_f_flat)
plt.plot(trace)
plt.xlabel("Iteration")
plt.ylabel("Minibatch Function Value")
plt.title("Convergence Trace")
plt.show()
class train:
    """
    Trains the model using a variety of optimization algorithms.
    This class also wraps the objective and gradient of the model,
    so that it can evaluate and store the full objective for each
    step in the optimization.

    This is WAY SLOWER than just calling the optimizers, because
    it evaluates the FULL objective and gradient instead of a single
    subfunction several times per pass.

    Designed to be used by figure_convergence.py.
    """

    def __init__(self, model, calculate_full_objective=True, num_projection_dims=5, full_objective_per_pass=4):
        """
        Store the model and prepare history tracking.

        model -- object providing f_df, theta_init and
            subfunction_references (and full_objective_references when
            calculate_full_objective is True).
        calculate_full_objective -- when True the recorded objective is the
            sum over model.full_objective_references; otherwise the value of
            the subfunction that was just evaluated is recorded.
        num_projection_dims -- number of rows of the random matrix used to
            project the flattened parameters for the history.
        full_objective_per_pass -- number of recordings per pass through the
            subfunctions; converted to a period in subfunction evaluations.
        """

        self.model = model
        # one list per learner name for every recorded quantity
        self.history = {'f':defaultdict(list), 'x_projection':defaultdict(list), 'events':defaultdict(list), 'x':defaultdict(list)}

        # we use SFO to flatten/unflatten parameters for the other optimizers
        self.x_map = SFO(self.model.f_df, self.model.theta_init, self.model.subfunction_references)
        self.xinit_flat = self.x_map.theta_original_to_flat(self.model.theta_init)
        self.calculate_full_objective = calculate_full_objective

        M = self.xinit_flat.shape[0]
        self.x_projection_matrix = np.random.randn(num_projection_dims, M)/np.sqrt(M)

        self.num_subfunctions = len(self.model.subfunction_references)
        self.full_objective_period = int(self.num_subfunctions/full_objective_per_pass)


    def f_df_wrapper(self, *args, **kwargs):
        """
        This (slightly hacky) function stands between the optimizer and the objective function.
        It evaluates the objective on the full function once every full_objective_period times a
        subfunction is evaluated, and stores the history of the full objective function value.

        args[0] is the parameter value and args[1] the subfunction reference;
        all arguments are passed through to model.f_df unchanged.
        Returns the (f, df) pair produced by model.f_df.
        """

        ## call the true subfunction objective function, passing through all parameters
        f, df = self.model.f_df(*args, **kwargs)

        if not self.history['f'][self.learner_name]:
            # this is the first time step for this learner
            self.last_f = np.inf
            self.last_idx = -1
            self.nsteps_this_learner = 0

        self.nsteps_this_learner += 1
        # only record the step once every self.full_objective_period steps.
        # The period is tested first so np.mod is never asked to divide by a
        # period of 0 (which happens when there are fewer subfunctions than
        # full_objective_per_pass).
        if self.full_objective_period > 1 and np.mod(self.nsteps_this_learner, self.full_objective_period) != 1:
            return f, df

        # the full objective function on all subfunctions
        if self.calculate_full_objective:
            new_f = 0.
            for ref in self.model.full_objective_references:
                new_f += self.model.f_df(args[0], ref)[0]
        else:
            new_f = f

        events = dict() # holds anything special about this step
        # a unique identifier for the current subfunction
        new_idx = id(args[1])
        if 'SFO' in self.learner_name:
            events = dict(self.optimizer.events)
        # append the full objective value, projections, etc to the history
        self.history['f'][self.learner_name].append(new_f)
        x_proj = np.dot(self.x_projection_matrix, self.x_map.theta_original_to_flat(args[0])).ravel()
        self.history['x_projection'][self.learner_name].append(x_proj)
        self.history['events'][self.learner_name].append(events)
        # only the most recent parameter value is kept (assigned, not appended)
        self.history['x'][self.learner_name] = args[0]
        print("full f %g"%(new_f))
        # store the prior values
        self.last_f = new_f
        self.last_idx = new_idx

        return f, df


    def f_df_wrapper_flattened(self, x_flat, subfunction_references, *args, **kwargs):
        """
        Calculate the subfunction objective and gradient, summed over all the
        entries of subfunction_references.
        Takes a 1d parameter vector, and returns a 1d gradient, even
        if the parameters for f_df are a list or a dictionary.
        x_flat should be the flattened version of the parameters.

        subfunction_references must contain at least one entry; with an empty
        sequence there is no gradient array to return.
        """

        x = self.x_map.theta_flat_to_original(x_flat)
        f = 0.
        df = 0.
        for sr in subfunction_references:
            fl, dfl = self.f_df_wrapper(x, sr, *args, **kwargs)
            dfl_flat = self.x_map.theta_original_to_flat(dfl)
            f += fl
            df += dfl_flat
        return f, df.ravel()


    def SGD(self, num_passes=20):
        """ Train model using SGD with various learning rates """

        # get the number of minibatches
        N = len(self.model.subfunction_references)
        # step through all the hyperparameters.  eta is step length.
        for eta in 10**np.linspace(-5,2,8):
            # label this convergence trace using the optimizer name and hyperparameter
            self.learner_name = "SGD %.4f"%eta
            print("\n\n" + self.learner_name)

            # initialize the parameters
            x = self.xinit_flat.copy()
            ## perform stochastic gradient descent
            for _ in range(num_passes*N): # number of minibatch evaluations
                # choose a minibatch at random
                idx = np.random.randint(N)
                sr = self.model.subfunction_references[idx]
                # evaluate the objective and gradient for that minibatch
                fl, dfl = self.f_df_wrapper_flattened(x.reshape((-1,1)), (sr,))
                # update the parameters
                x -= dfl.reshape(x.shape) * eta
                # if the objective has diverged, skip the rest of the run for this hyperparameter
                if not np.isfinite(fl):
                    print("Non-finite subfunction.")
                    break


    def LBFGS(self, num_passes=20):
        """ Train model using LBFGS on the sum of all subfunctions, capped at num_passes function evaluations """

        self.learner_name = "LBFGS"
        print("\n\n" + self.learner_name)
        # the result is discarded: the trace is recorded by f_df_wrapper
        fmin_l_bfgs_b(
            self.f_df_wrapper_flattened,
            self.xinit_flat.copy(),
            disp=1,
            args=(self.model.subfunction_references, ),
            maxfun=num_passes)


    def _run_sfo(self, learner_name, num_passes, **kwargs):
        """ Label a run, build a fresh SFO optimizer with kwargs, and optimize for num_passes passes. """
        self.learner_name = learner_name
        print("\n\n" + self.learner_name)

        self.optimizer = SFO(self.f_df_wrapper, self.model.theta_init, self.model.subfunction_references, **kwargs)
        # # check the gradients
        # self.optimizer.check_grad()
        self.optimizer.optimize(num_passes=num_passes)


    def SFO(self, num_passes=20, learner_name='SFO', **kwargs):
        """
        Train model using SFO.
        Extra keyword arguments are forwarded to the SFO constructor.
        """
        self._run_sfo(learner_name, num_passes, **kwargs)


    def SFO_variations(self, num_passes=20):
        """
        Train model using several variations on the standard SFO algorithm.
        """

        np.random.seed(0) # make experiments repeatable
        self._run_sfo('SFO standard', num_passes)

        np.random.seed(0) # make experiments repeatable
        self._run_sfo('SFO all active', num_passes,
            init_subf=len(self.model.subfunction_references))

        np.random.seed(0) # make experiments repeatable
        self._run_sfo('SFO rank 1', num_passes, hessian_algorithm='rank1')

        # the last two runs are not reseeded; they continue from the RNG
        # state left by the previous run
        self._run_sfo('SFO random', num_passes, subfunction_selection='random')

        self._run_sfo('SFO cyclic', num_passes, subfunction_selection='cyclic')


    def SAG(self, num_passes=20):
        """ Train model using SAG with line search, for various initial Lipschitz """

        # larger L is easier, so start large
        for L in 10**(-np.linspace(-3, 3, 7)):
            self.learner_name = "SAG %.4f"%L
            print("\n\n" + self.learner_name)
            self.optimizer = SAG(self.f_df_wrapper_flattened, self.xinit_flat.copy(), self.model.subfunction_references, L=L)
            self.optimizer.optimize(num_passes=num_passes)
            print(np.mean(self.optimizer.f), "average value at last evaluation")


    def LBFGS_minibatch(self, num_passes=20, data_fraction=0.1, num_steps=10):
        """ Perform LBFGS on minibatches of size data_fraction of the full dataset, and with num_steps LBFGS steps per minibatch."""

        self.learner_name = "LBFGS minibatch"
        print("\n\n" + self.learner_name)

        num_refs = len(self.model.subfunction_references)
        # always draw at least one subfunction: with an empty minibatch
        # f_df_wrapper_flattened has no gradient array to return and fails
        minibatch_size = max(1, int(data_fraction*num_refs))

        x = self.xinit_flat.copy()
        for epoch in range(num_passes):
            idx = random_choice(num_refs, minibatch_size, replace=False)
            sr = [self.model.subfunction_references[ii] for ii in idx]
            x, _, _ = fmin_l_bfgs_b(
                self.f_df_wrapper_flattened,
                x,
                args=(sr, ),
                disp=1,
                maxfun=num_steps)


    def SGD_momentum(self, num_passes=20):
        """ Train model using SGD with various learning rates and momentums"""

        learning_rates = 10**np.linspace(-5,2,8)
        momentums = np.array([0.5, 0.9, 0.95, 0.99])
        params = product(learning_rates, momentums)
        N = len(self.model.subfunction_references)
        for eta, momentum in params:
            self.learner_name = "SGD_momentum eta=%.5f, mu=%.2f" % (eta, momentum)
            print("\n\n" + self.learner_name)
            # most recent value seen for each subfunction
            f = np.ones((N))*np.nan
            x = self.xinit_flat.copy()
            # Previous step
            inc = 0.0
            diverged = False
            for epoch in range(num_passes):
                for minibatch in range(N):
                    idx = np.random.randint(N)
                    sr = self.model.subfunction_references[idx]
                    fl, dfl = self.f_df_wrapper_flattened(x.reshape((-1,1)), (sr,))
                    inc = momentum * inc - eta * dfl.reshape(x.shape)
                    x += inc
                    f[idx] = fl
                    if not np.isfinite(fl):
                        diverged = True
                        break
                if diverged:
                    # reported once per run, then move on to the next
                    # hyperparameter setting
                    print("Non-finite subfunction.  Ending run.")
                    break
            print(np.mean(f[np.isfinite(f)]), "average finite value at last evaluation")


    def ADA(self, num_passes=20):
        """ Train model using ADAgrad with various learning rates """

        for eta in 10**np.linspace(-3,1,5):
            self.learner_name = "ADAGrad %.4f"%eta
            print("\n\n" + self.learner_name)
            self.optimizer = ADAGrad(self.f_df_wrapper_flattened, self.xinit_flat.copy(), self.model.subfunction_references, learning_rate=eta)
            self.optimizer.optimize(num_passes=num_passes)
            print(np.mean(self.optimizer.f), "average value at last evaluation")
Example #11
0
    def train(self,
              images,
              batch_size=50,
              num_epochs=20,
              method='SGD',
              train_means=False,
              train_top_layer=False,
              momentum=0.9,
              learning_rate=1.,
              decay1=0.9,
              decay2=0.999,
              precondition=True):
        """
        Train the SLSTM layers and the MCGSM on the given images by
        minimizing the average negative log-likelihood over minibatches.

        @type  images: C{ndarray}/C{list}
        @param images: an array or a list of images

        @type  batch_size: C{int}
        @param batch_size: number of consecutive samples per minibatch; the
        SLSTMs are created with at most C{self.MAX_BATCH_SIZE}

        @type  num_epochs: C{int}
        @param num_epochs: number of passes through the minibatches

        @type  method: C{str}
        @param method: one of 'SFO', 'SGD' or 'ADAM' (case-insensitive)

        @type  train_means: C{bool}
        @param train_means: forwarded to the MCGSM parameter functions

        @type  train_top_layer: C{bool}
        @param train_top_layer: if true, only the top SLSTM layer is trained

        @type  momentum: C{float}
        @param momentum: momentum coefficient (SGD only)

        @type  learning_rate: C{float}
        @param learning_rate: step size (SGD and ADAM)

        @type  decay1: C{float}
        @param decay1: decay of the gradient average (ADAM only)

        @type  decay2: C{float}
        @param decay2: decay of the squared-gradient average (ADAM only)

        @type  precondition: C{bool}
        @param precondition: if true, inputs and outputs are passed through
        C{self._precondition} before training

        @rtype: C{list}
        @return: one loss value per minibatch evaluation (for SFO, the
        optimizer's C{hist_f_flat})

        @raise ValueError: if C{method} is not recognized
        """

        print('Preprocessing...')

        inputs, outputs = self._preprocess(images)

        if precondition:
            print('Preconditioning...')

            # remove correlations
            inputs, outputs = self._precondition(inputs, outputs)

        # indicates which layers will be trained
        train_layers = [self.num_layers -
                        1] if train_top_layer else range(self.num_layers)

        print('Creating SLSTMs...')

        # create SLSTMs
        for l in range(self.num_layers):
            self.slstm[l] = SLSTM(
                num_rows=inputs.shape[1],
                num_cols=inputs.shape[2],
                num_channels=inputs.shape[3] if l < 1 else self.num_hiddens,
                num_hiddens=self.num_hiddens,
                batch_size=min([batch_size, self.MAX_BATCH_SIZE]),
                nonlinearity=self.nonlinearity,
                extended=self.extended,
                slstm=self.slstm[l],
                verbosity=self.verbosity)

        # compute loss function and its gradient
        def f_df(params, idx):
            # set model parameters
            for l in train_layers:
                self.slstm[l].set_parameters(params['slstm'][l])
            self.mcgsm._set_parameters(params['mcgsm'],
                                       {'train_means': train_means})

            # select batch and compute hidden activations
            Y = outputs[idx:idx + batch_size]
            H = inputs[idx:idx + batch_size]

            for l in range(self.num_layers):
                H = self.slstm[l].forward(H)

            # form inputs to MCGSM
            H_flat = H.reshape(-1, self.num_hiddens).T
            Y_flat = Y.reshape(-1, self.num_channels).T

            # negative sample count: turns log-likelihood sums into an
            # average negative log-likelihood
            norm_const = -H_flat.shape[1]

            # compute gradients
            df_dh, _, loglik = self.mcgsm._data_gradient(H_flat, Y_flat)
            df_dh = df_dh.T.reshape(*H.shape) / norm_const

            # ignore bottom-right pixel (BSDS300)
            df_dh[:, -1, -1] = 0.

            # average negative log-likelihood
            f = sum(loglik) / norm_const

            df_dtheta = {}
            df_dtheta['slstm'] = [0.] * self.num_layers

            # backpropagate from the top layer down, stopping at the first
            # layer that is not being trained
            for l in reversed(range(self.num_layers)):
                if l not in train_layers:
                    break
                if l > min(train_layers):
                    # derivative with respect to inputs of layer l are derivatives
                    # of hidden states of layer l - 1
                    df_dtheta['slstm'][l] = self.slstm[l].backward(
                        df_dh, force_backward=True)
                    df_dh = df_dtheta['slstm'][l]['inputs']
                    del df_dtheta['slstm'][l]['inputs']

                else:
                    # no need to compute derivatives with respect to input units
                    df_dtheta['slstm'][l] = self.slstm[l].backward(df_dh)

            # compute gradient of MCGSM
            df_dtheta['mcgsm'] = self.mcgsm._parameter_gradient(
                H_flat, Y_flat, parameters={'train_means': train_means
                                            }) * log(2.) * self.mcgsm.dim_out

            return f, df_dtheta

        # collect current parameters; untrained layers keep a 0. placeholder
        params = {}
        params['slstm'] = [0.] * self.num_layers
        for l in reversed(range(self.num_layers)):
            if l not in train_layers:
                break
            params['slstm'][l] = self.slstm[l].parameters()
        params['mcgsm'] = self.mcgsm._parameters({'train_means': train_means})

        # a start index for each batch (a real list on Python 2 and 3 alike)
        start_indices = list(
            range(0, inputs.shape[0] - batch_size + 1, batch_size))

        print('Training...')

        if method.upper() == 'SFO':
            # built outside the try block so that `optimizer` is always bound
            # when its history is returned below
            optimizer = SFO(f_df,
                            params,
                            start_indices,
                            display=self.verbosity)

            try:
                # optimize using sum-of-functions optimizer
                params_opt = optimizer.optimize(num_passes=num_epochs)

                # set model parameters; only trained layers carry real
                # parameters, the others are 0. placeholders
                for l in train_layers:
                    self.slstm[l].set_parameters(params_opt['slstm'][l])
                self.mcgsm._set_parameters(params_opt['mcgsm'],
                                           {'train_means': train_means})

            except KeyboardInterrupt:
                # deliberate: Ctrl-C ends training early and the history
                # gathered so far is still returned
                pass

            return optimizer.hist_f_flat

        elif method.upper() == 'SGD':
            loss = []
            # running update direction (momentum term) per parameter
            diff = {
                'slstm': [0.] * self.num_layers,
                'mcgsm': zeros_like(params['mcgsm'])
            }

            for l in train_layers:
                diff['slstm'][l] = {}
                for key in params['slstm'][l]:
                    diff['slstm'][l][key] = zeros_like(params['slstm'][l][key])

            for n in range(num_epochs):
                for b in range(0, inputs.shape[0] - batch_size + 1,
                               batch_size):
                    # compute gradients
                    f, df = f_df(params, b)

                    loss.append(f)

                    # update SLSTM parameters
                    for l in train_layers:
                        for key in params['slstm'][l]:
                            diff['slstm'][l][key] = momentum * diff['slstm'][
                                l][key] - df['slstm'][l][key]
                            params['slstm'][l][key] = params['slstm'][l][
                                key] + learning_rate * diff['slstm'][l][key]

                    # update MCGSM parameters
                    diff['mcgsm'] = momentum * diff['mcgsm'] - df['mcgsm']
                    params['mcgsm'] = params[
                        'mcgsm'] + learning_rate * diff['mcgsm']

                    if self.verbosity > 0:
                        # epoch, latest loss, mean over the recent losses
                        print('{0:>5} {1:>10.4f} {2:>10.4f}'.format(
                            n, loss[-1],
                            mean(loss[-max([10, 20000 // batch_size]):])))

            return loss

        elif method.upper() == 'ADAM':
            loss = []
            # running averages of the gradient and of its square
            diff_mean = {
                'slstm': [0.] * self.num_layers,
                'mcgsm': zeros_like(params['mcgsm'])
            }
            diff_sqrd = {
                'slstm': [0.] * self.num_layers,
                'mcgsm': zeros_like(params['mcgsm'])
            }

            for l in train_layers:
                diff_mean['slstm'][l] = {}
                diff_sqrd['slstm'][l] = {}
                for key in params['slstm'][l]:
                    diff_mean['slstm'][l][key] = zeros_like(
                        params['slstm'][l][key])
                    diff_sqrd['slstm'][l][key] = zeros_like(
                        params['slstm'][l][key])

            # step counter
            t = 1

            for n in range(num_epochs):
                for b in range(0, inputs.shape[0] - batch_size + 1,
                               batch_size):
                    # compute gradients
                    f, df = f_df(params, b)

                    loss.append(f)

                    # include bias correction in step width
                    step_width = learning_rate / (
                        1. - power(decay1, t)) * sqrt(1. - power(decay2, t))
                    t += 1

                    # update SLSTM parameters
                    for l in train_layers:
                        for key in params['slstm'][l]:
                            diff_mean['slstm'][l][key] = decay1 * diff_mean['slstm'][l][key] \
                             + (1. - decay1) * df['slstm'][l][key]
                            diff_sqrd['slstm'][l][key] = decay2 * diff_sqrd['slstm'][l][key] \
                             + (1. - decay2) * square(df['slstm'][l][key])

                            params['slstm'][l][key] = params['slstm'][l][key] - \
                             step_width * diff_mean['slstm'][l][key] / (1e-8 + sqrt(diff_sqrd['slstm'][l][key]))

                    # update MCGSM parameters
                    diff_mean['mcgsm'] = decay1 * diff_mean['mcgsm'] + (
                        1. - decay1) * df['mcgsm']
                    diff_sqrd['mcgsm'] = decay2 * diff_sqrd['mcgsm'] + (
                        1. - decay2) * square(df['mcgsm'])
                    params['mcgsm'] = params['mcgsm'] - \
                     step_width * diff_mean['mcgsm'] / (1e-8 + sqrt(diff_sqrd['mcgsm']))

                    if self.verbosity > 0:
                        # epoch, latest loss, mean over the recent losses
                        print('{0:>5} {1:>10.4f} {2:>10.4f}'.format(
                            n, loss[-1],
                            mean(loss[-max([10, 20000 // batch_size]):])))

            return loss

        else:
            raise ValueError('Unknown method \'{0}\'.'.format(method))
# problem dimensions
M = 20  # number visible units
J = 10  # number hidden units
D = 100000  # full data batch size
N = int(np.sqrt(D) / 10.)  # number minibatches

# generate random training data
v = randn(M, D)

# the array of subfunction specific arguments: minibatch k is the strided
# column slice of the training data that starts at column k
sub_refs = [v[:, start::N] for start in range(N)]

# initialize parameters
theta_init = dict(W=randn(J, M), b_h=randn(J, 1), b_v=randn(M, 1))

# initialize the optimizer
optimizer = SFO(f_df, theta_init, sub_refs)
# # uncomment the following line to test the gradient of f_df
# optimizer.check_grad()

# run the optimizer for 1 pass through the data
theta = optimizer.optimize(num_passes=1)
# continue running the optimizer for another 20 passes through the data
theta = optimizer.optimize(num_passes=20)

# plot the convergence trace
plt.plot(np.array(optimizer.hist_f_flat))
plt.xlabel('Iteration')
plt.ylabel('Minibatch Function Value')
plt.title('Convergence Trace')
plt.show()
	pp.axes(xlim=(-xlm, xlm), ylim=(-ylm, ylm))
	pp.scatter(forward_data[-1,:,0],forward_data[-1,:,1],c='b',alpha=.2)
	#pp.figure(7)
	#pp.suptitle('Histogram: Model Density vs. Distance from Origin')
	#pp.axes(xlim=(0.25,2.25),ylim=(0,5),xlabel='Distance from Origin',ylabel='Probability Density')
	#pp.hist(np.sqrt(np.sum(samples[-1]**2,axis=1)),50,normed=True,color='r')
	#pp.figure(8)
	#pp.suptitle(r'Learned $\beta$ Schedule')
	#pp.axes(xlabel='t', ylabel=r'$\beta$')
	#pp.plot(np.arange(nsteps),(1.0/(1.0+np.exp(-opt_params[-1])))*beta_max)
	pp.show()

exit()

if automate_training:
	# Keep running SFO, two passes at a time, until the loss evaluated on
	# fdata is no longer above -2.50.
	optimizer = SFO(f_df, init_params, subfuncs)
	end_loss=99.0
	while end_loss>-2.50:
		linalgerror=False
		try:
			opt_params = optimizer.optimize(num_passes=2)
			end_loss = f_df(opt_params,fdata)[0]
		except np.linalg.LinAlgError:
			# LinAlgError is caught through its public name; the
			# np.linalg.linalg submodule path is deprecated in NumPy 2.0.
			linalgerror=True
		
		if np.isnan(end_loss) or linalgerror:
			# The run broke down (NaN loss or a linear algebra failure):
			# draw fresh float32 values for the mu_* parameters.
			# NOTE(review): nothing visible here hands the redrawn values back
			# to the optimizer; presumably the script continues past this
			# point -- confirm against the full source.
			mu_centers=(np.random.randn(nx, nhid_mu)*1.0).astype(np.float32)
			mu_spreads=(np.zeros((nx, nhid_mu))-1.0).astype(np.float32)
			mu_biases=np.zeros(nhid_mu).astype(np.float32)
			mu_M=(np.random.randn(nhid_mu, ntgates*nx)*0.01).astype(np.float32)
			mu_b=np.zeros((ntgates, nx)).astype(np.float32)
# Compiling the sampling function

# symbolic sampling graph built from the symbolic parameters
samplesT, tT, sample_updates = get_samps(nsamps, paramsT)

# symbolic inputs of the compiled sampler, in the positional order callers
# must supply the corresponding values
_sample_inputs = [
	mu_centersT, mu_spreadsT, mu_biasesT, mu_MT, mu_bT,
	mu_t_centersT, mu_t_spreadsT,
	cov_centersT, cov_spreadsT, cov_biasesT, cov_MT, cov_bT,
	cov_t_centersT, cov_t_spreadsT,
]

# NOTE(review): sample_updates is not handed to theano.function here --
# confirm that this is intended.
sample_T = theano.function(_sample_inputs, samplesT, allow_input_downcast=True)

def sample(params):
	"""
	Call the compiled sampler sample_T with params[0] ... params[13] as its
	positional arguments, in that order, and return its result.
	"""
	leading = [params[position] for position in range(14)]
	return sample_T(*leading)


if automate_training:
	# Keep running SFO, two passes at a time, until the loss evaluated on
	# fdata is no longer above -2.50.
	optimizer = SFO(f_df, init_params, subfuncs)
	end_loss=99.0
	while end_loss>-2.50:
		linalgerror=False
		try:
			opt_params = optimizer.optimize(num_passes=2)
			end_loss = f_df(opt_params,fdata)[0]
		except np.linalg.LinAlgError:
			# LinAlgError is caught through its public name; the
			# np.linalg.linalg submodule path is deprecated in NumPy 2.0.
			linalgerror=True
		
		if np.isnan(end_loss) or linalgerror:
			# The run broke down (NaN loss or a linear algebra failure):
			# draw fresh float32 values for the mu_* parameters.
			# NOTE(review): nothing visible here hands the redrawn values back
			# to the optimizer; presumably the script continues past this
			# point -- confirm against the full source.
			mu_centers=(np.random.randn(nx, nhid_mu)*1.0).astype(np.float32)
			mu_spreads=(np.zeros((nx, nhid_mu))-1.0).astype(np.float32)
			mu_biases=np.zeros(nhid_mu).astype(np.float32)
			mu_M=(np.random.randn(nhid_mu, ntgates*nx)*0.01).astype(np.float32)
			mu_b=np.zeros((ntgates, nx)).astype(np.float32)
Example #15
0
class train:
    """
    Trains the model using a variety of optimization algorithms.

    Every optimizer is driven through ``f_df_wrapper``, which sits between
    the optimizer and ``model.f_df`` and periodically stores the objective
    value, a random low-dimensional projection of the parameters, the
    optimizer events and the latest parameters in ``self.history``, keyed by
    ``self.learner_name``.

    This is WAY SLOWER than just calling the optimizers, because
    it evaluates the FULL objective instead of a single subfunction
    several times per pass.

    Designed to be used by figure_convergence.py.
    """
    def __init__(self,
                 model,
                 calculate_full_objective=True,
                 num_projection_dims=5,
                 full_objective_per_pass=4):
        """
        Set up the history containers and the parameter flattening helper.

        model -- object providing ``f_df``, ``theta_init`` and
            ``subfunction_references`` (and ``full_objective_references``
            when the full objective is recorded).
        calculate_full_objective -- if True, the recorded objective is the
            sum of ``model.f_df`` over ``model.full_objective_references``;
            otherwise the value of the current subfunction is recorded.
        num_projection_dims -- number of rows of the random matrix used to
            project the flattened parameters before storing them.
        full_objective_per_pass -- how many times per pass through the
            subfunctions a history entry is recorded.
        """

        self.model = model
        self.history = {
            'f': defaultdict(list),
            'x_projection': defaultdict(list),
            'events': defaultdict(list),
            'x': defaultdict(list)
        }

        # we use SFO to flatten/unflatten parameters for the other optimizers
        self.x_map = SFO(self.model.f_df, self.model.theta_init,
                         self.model.subfunction_references)
        self.xinit_flat = self.x_map.theta_original_to_flat(
            self.model.theta_init)
        self.calculate_full_objective = calculate_full_objective

        M = self.xinit_flat.shape[0]
        self.x_projection_matrix = np.random.randn(num_projection_dims,
                                                   M) / np.sqrt(M)

        self.num_subfunctions = len(self.model.subfunction_references)
        # Clamp to 1: with fewer subfunctions than recordings per pass the
        # quotient truncates to 0, and a period of 0 would be used as a
        # modulus in f_df_wrapper.  A period of 1 records every step, which
        # is also what a period of 0 ended up doing.
        self.full_objective_period = max(
            1, int(self.num_subfunctions / full_objective_per_pass))

    def f_df_wrapper(self, *args, **kwargs):
        """
        This (slightly hacky) function stands between the optimizer and the
        objective function.  It passes all arguments through to
        ``model.f_df`` and returns its ``(f, df)`` unchanged.  Once every
        ``full_objective_period`` calls it additionally appends an entry to
        the history of the current learner.

        ``args[0]`` must be the parameters and ``args[1]`` the subfunction
        reference.
        """

        ## call the true subfunction objective function, passing through all parameters
        f, df = self.model.f_df(*args, **kwargs)

        if len(self.history['f'][self.learner_name]) == 0:
            # this is the first time step for this learner
            self.last_f = np.inf
            self.last_idx = -1
            self.nsteps_this_learner = 0

        self.nsteps_this_learner += 1
        # only record the step once every self.full_objective_period steps
        if np.mod(self.nsteps_this_learner, self.full_objective_period
                  ) != 1 and self.full_objective_period > 1:
            return f, df

        # the full objective function on all subfunctions
        if self.calculate_full_objective:
            new_f = 0.
            for ref in self.model.full_objective_references:
                new_f += self.model.f_df(args[0], ref)[0]
        else:
            new_f = f

        events = dict()  # holds anything special about this step
        # a unique identifier for the current subfunction
        new_idx = id(args[1])
        if 'SFO' in self.learner_name:
            events = dict(self.optimizer.events)
        # append the full objective value, projections, etc to the history
        self.history['f'][self.learner_name].append(new_f)
        x_proj = np.dot(self.x_projection_matrix,
                        self.x_map.theta_original_to_flat(args[0])).ravel()
        self.history['x_projection'][self.learner_name].append(x_proj)
        self.history['events'][self.learner_name].append(events)
        # only the most recent parameters are kept (overwritten, not appended)
        self.history['x'][self.learner_name] = args[0]
        print("full f %g" % (new_f))
        # store the prior values
        self.last_f = new_f
        self.last_idx = new_idx

        return f, df

    def f_df_wrapper_flattened(self, x_flat, subfunction_references, *args,
                               **kwargs):
        """
        Calculate the summed objective and gradient over the given
        subfunction references.
        Takes a 1d parameter vector, and returns a 1d gradient, even
        if the parameters for f_df are a list or a dictionary.
        x_flat should be the flattened version of the parameters.

        With no subfunction references the objective is 0 and the gradient
        is a zero vector with one entry per element of x_flat.
        """

        x = self.x_map.theta_flat_to_original(x_flat)
        f = 0.
        df = None
        for sr in subfunction_references:
            fl, dfl = self.f_df_wrapper(x, sr, *args, **kwargs)
            dfl_flat = self.x_map.theta_original_to_flat(dfl)
            f += fl
            if df is None:
                # "0. +" creates a fresh array, so the in-place accumulation
                # below never writes into the array returned by the x_map
                df = 0. + dfl_flat
            else:
                df += dfl_flat
        if df is None:
            # nothing was evaluated: the empty sum has a zero gradient
            return f, np.zeros(np.size(x_flat))
        return f, df.ravel()

    def SGD(self, num_passes=20):
        """ Train model using SGD with various learning rates """

        # get the number of minibatches
        N = len(self.model.subfunction_references)
        # step through all the hyperparameters.  eta is step length.
        for eta in 10**np.linspace(-5, 2, 8):
            # label this convergence trace using the optimizer name and hyperparameter
            self.learner_name = "SGD %.4f" % eta
            print("\n\n" + self.learner_name)

            # initialize the parameters
            x = self.xinit_flat.copy()
            ## perform stochastic gradient descent
            for _ in range(num_passes * N):  # number of minibatch evaluations
                # choose a minibatch at random
                idx = np.random.randint(N)
                sr = self.model.subfunction_references[idx]
                # evaluate the objective and gradient for that minibatch
                fl, dfl = self.f_df_wrapper_flattened(x.reshape((-1, 1)),
                                                      (sr, ))
                # update the parameters
                x -= dfl.reshape(x.shape) * eta
                # if the objective has diverged, skip the rest of the run for this hyperparameter
                if not np.isfinite(fl):
                    print("Non-finite subfunction.")
                    break

    def LBFGS(self, num_passes=20):
        """
        Train model using LBFGS on all subfunctions at once.
        num_passes is used as the limit on function evaluations.
        """

        self.learner_name = "LBFGS"
        print("\n\n" + self.learner_name)
        fmin_l_bfgs_b(self.f_df_wrapper_flattened,
                      self.xinit_flat.copy(),
                      disp=1,
                      args=(self.model.subfunction_references, ),
                      maxfun=num_passes)

    def SFO(self, num_passes=20, learner_name='SFO', **kwargs):
        """
        Train model using SFO.
        Extra keyword arguments are passed on to the SFO constructor.
        """
        self.learner_name = learner_name
        print("\n\n" + self.learner_name)

        self.optimizer = SFO(self.f_df_wrapper, self.model.theta_init,
                             self.model.subfunction_references, **kwargs)
        # # check the gradients
        # self.optimizer.check_grad()
        self.optimizer.optimize(num_passes=num_passes)

    def SFO_variations(self, num_passes=20):
        """
        Train model using several variations on the standard SFO algorithm.
        """

        # (learner name, reseed numpy before the run?, extra SFO arguments)
        # NOTE(review): the last two variations have never been reseeded;
        # this is kept as is -- confirm whether it is deliberate.
        variations = (
            ('SFO standard', True, {}),
            ('SFO all active', True,
             {'init_subf': len(self.model.subfunction_references)}),
            ('SFO rank 1', True, {'hessian_algorithm': 'rank1'}),
            ('SFO random', False, {'subfunction_selection': 'random'}),
            ('SFO cyclic', False, {'subfunction_selection': 'cyclic'}),
        )
        for learner_name, reseed, sfo_kwargs in variations:
            if reseed:
                np.random.seed(0)  # make experiments repeatable
            self.SFO(num_passes=num_passes,
                     learner_name=learner_name,
                     **sfo_kwargs)

    def SAG(self, num_passes=20):
        """ Train model using SAG with line search, for various initial Lipschitz """

        # larger L is easier, so start large
        for L in 10**(-np.linspace(-3, 3, 7)):
            self.learner_name = "SAG %.4f" % L
            #learner_name = "SAG (diverges)"
            print("\n\n" + self.learner_name)
            self.optimizer = SAG(self.f_df_wrapper_flattened,
                                 self.xinit_flat.copy(),
                                 self.model.subfunction_references,
                                 L=L)
            self.optimizer.optimize(num_passes=num_passes)
            print(np.mean(self.optimizer.f),
                  "average value at last evaluation")

    def LBFGS_minibatch(self, num_passes=20, data_fraction=0.1, num_steps=10):
        """
        Perform LBFGS on random groups of minibatches.  Each of the
        num_passes rounds draws a fraction data_fraction of the subfunctions
        (at least one, without replacement) and runs LBFGS on them with at
        most num_steps function evaluations.
        """

        self.learner_name = "LBFGS minibatch"
        print("\n\n" + self.learner_name)

        num_refs = len(self.model.subfunction_references)
        # at least one subfunction, so LBFGS never optimizes an empty sum
        num_chosen = max(1, int(data_fraction * num_refs))

        x = self.xinit_flat.copy()
        for epoch in range(num_passes):
            idx = random_choice(num_refs, num_chosen, replace=False)
            sr = [self.model.subfunction_references[ii] for ii in idx]
            x, _, _ = fmin_l_bfgs_b(self.f_df_wrapper_flattened,
                                    x,
                                    args=(sr, ),
                                    disp=1,
                                    maxfun=num_steps)

    def SGD_momentum(self, num_passes=20):
        """ Train model using SGD with various learning rates and momentums"""

        learning_rates = 10**np.linspace(-5, 2, 8)
        momentums = np.array([0.5, 0.9, 0.95, 0.99])
        params = product(learning_rates, momentums)
        N = len(self.model.subfunction_references)
        for eta, momentum in params:
            self.learner_name = "SGD_momentum eta=%.5f, mu=%.2f" % (eta,
                                                                    momentum)
            print("\n\n" + self.learner_name)
            # most recent value of each subfunction (NaN until evaluated)
            f = np.ones((N)) * np.nan
            x = self.xinit_flat.copy()
            # Previous step
            inc = 0.0
            diverged = False
            for epoch in range(num_passes):
                for minibatch in range(N):
                    idx = np.random.randint(N)
                    sr = self.model.subfunction_references[idx]
                    fl, dfl = self.f_df_wrapper_flattened(
                        x.reshape((-1, 1)), (sr, ))
                    inc = momentum * inc - eta * dfl.reshape(x.shape)
                    x += inc
                    f[idx] = fl
                    if not np.isfinite(fl):
                        diverged = True
                        break
                if diverged:
                    # report the divergence once and give up on this
                    # hyperparameter setting
                    print("Non-finite subfunction.  Ending run.")
                    break
            print(np.mean(f[np.isfinite(f)]),
                  "average finite value at last evaluation")

    def ADA(self, num_passes=20):
        """ Train model using ADAgrad with various learning rates """

        for eta in 10**np.linspace(-3, 1, 5):
            self.learner_name = "ADAGrad %.4f" % eta
            print("\n\n" + self.learner_name)
            self.optimizer = ADAGrad(self.f_df_wrapper_flattened,
                                     self.xinit_flat.copy(),
                                     self.model.subfunction_references,
                                     learning_rate=eta)
            self.optimizer.optimize(num_passes=num_passes)
            print(np.mean(self.optimizer.f),
                  "average value at last evaluation")
Example #16
0
 def getOptimizer(self):
     """
     Store the batches returned by getSFOBatches() on self.batches and
     return an SFO optimizer built from the network cost/gradient function,
     the initial parameters and those batches.
     """
     self.batches = self.getSFOBatches()
     cost_and_gradient = self.Net._getCost_dCost
     optimizer = SFO(cost_and_gradient,
                     self.initial_p,
                     self.batches,
                     display=self.iprint)
     return optimizer
Example #17
0
    def train(
        self,
        images,
        batch_size=50,
        num_epochs=20,
        method="SGD",
        train_means=False,
        train_top_layer=False,
        momentum=0.9,
        learning_rate=1.0,
        decay1=0.9,
        decay2=0.999,
        precondition=True,
    ):
        """
        Train model via stochastic gradient descent (SGD), sum-of-functions
        optimizer (SFO), or ADAM.

        @type  images: C{ndarray}/C{list}
        @param images: an array or a list of training images (e.g., Nx32x32x3)

        @type  batch_size: C{int}
        @param batch_size: batch size used by SGD

        @type  num_epochs: C{int}
        @param num_epochs: number of passes through the training set

        @type  method: C{str}
        @param method: either 'SGD', 'SFO', or 'ADAM' (case-insensitive)

        @type  train_means: C{bool}
        @param train_means: whether or not to optimize the mean parameters of the MCGSM

        @type  train_top_layer: C{bool}
        @param train_top_layer: if true, only the MCGSM and spatial LSTM at the top layer is trained

        @type  momentum: C{float}
        @param momentum: momentum rate used by SGD

        @type  learning_rate: C{float}
        @param learning_rate: learning rate used by SGD and ADAM

        @type  decay1: C{float}
        @param decay1: hyperparameter used by ADAM (decay of the gradient average)

        @type  decay2: C{float}
        @param decay2: hyperparameter used by ADAM (decay of the squared gradient average)

        @type  precondition: C{bool}
        @param precondition: whether or not to perform conditional whitening

        @rtype: C{list}
        @return: evolution of negative log-likelihood (bits per pixel) over the
        training; for 'SFO' this is the optimizer's C{hist_f_flat}

        @raise ValueError: if the images are smaller than the input mask or
        C{method} is unknown
        """

        if images.shape[1] < self.input_mask.shape[0] or images.shape[2] < self.input_mask.shape[1]:
            raise ValueError("Images too small.")

        if self.verbosity > 0:
            print("Preprocessing...")

        inputs, outputs = self._preprocess(images)

        if precondition:
            if self.verbosity > 0:
                print("Preconditioning...")

            # remove correlations
            inputs, outputs = self._precondition(inputs, outputs)

        # indicates which layers will be trained
        train_layers = [self.num_layers - 1] if train_top_layer else range(self.num_layers)

        if self.verbosity > 0:
            print("Creating SLSTMs...")

        # create SLSTMs
        for l in range(self.num_layers):
            self.slstm[l] = SLSTM(
                num_rows=inputs.shape[1],
                num_cols=inputs.shape[2],
                num_channels=inputs.shape[3] if l < 1 else self.num_hiddens,
                num_hiddens=self.num_hiddens,
                batch_size=min([batch_size, self.MAX_BATCH_SIZE]),
                nonlinearity=self.nonlinearity,
                extended=self.extended,
                slstm=self.slstm[l],
                verbosity=self.verbosity,
            )

        # compute loss function and its gradient
        def f_df(params, idx):
            # set model parameters
            for l in train_layers:
                self.slstm[l].set_parameters(params["slstm"][l])
            self.mcgsm._set_parameters(params["mcgsm"], {"train_means": train_means})

            # select batch and compute hidden activations
            Y = outputs[idx : idx + batch_size]
            H = inputs[idx : idx + batch_size]

            for l in range(self.num_layers):
                H = self.slstm[l].forward(H)

            # form inputs to MCGSM
            H_flat = H.reshape(-1, self.num_hiddens).T
            Y_flat = Y.reshape(-1, self.num_channels).T

            # negative, so that dividing by it turns the log-likelihood into
            # an average negative log-likelihood
            norm_const = -H_flat.shape[1]

            # compute gradients
            df_dh, _, loglik = self.mcgsm._data_gradient(H_flat, Y_flat)
            df_dh = df_dh.T.reshape(*H.shape) / norm_const

            # average log-likelihood
            f = sum(loglik) / norm_const

            df_dtheta = {}
            df_dtheta["slstm"] = [0.0] * self.num_layers

            # backpropagate from the top layer down, stopping at the first
            # layer which is not trained
            for l in range(self.num_layers)[::-1]:
                if l not in train_layers:
                    break
                if l > min(train_layers):
                    # derivative with respect to inputs of layer l are derivatives
                    # of hidden states of layer l - 1
                    df_dtheta["slstm"][l] = self.slstm[l].backward(df_dh, force_backward=True)
                    df_dh = df_dtheta["slstm"][l]["inputs"]
                    del df_dtheta["slstm"][l]["inputs"]

                else:
                    # no need to compute derivatives with respect to input units
                    df_dtheta["slstm"][l] = self.slstm[l].backward(df_dh)

            # compute gradient of MCGSM
            df_dtheta["mcgsm"] = (
                self.mcgsm._parameter_gradient(H_flat, Y_flat, parameters={"train_means": train_means})
                * log(2.0)
                * self.mcgsm.dim_out
            )

            return f, df_dtheta

        # collect current parameters; layers which are not trained keep the
        # 0.0 placeholder
        params = {}
        params["slstm"] = [0.0] * self.num_layers
        for l in range(self.num_layers)[::-1]:
            if l not in train_layers:
                break
            params["slstm"][l] = self.slstm[l].parameters()
        params["mcgsm"] = self.mcgsm._parameters({"train_means": train_means})

        # a start index for each batch
        start_indices = range(0, inputs.shape[0] - batch_size + 1, batch_size)

        if self.verbosity > 0:
            print("Training...")

        if method.upper() == "SFO":
            # optimize using sum-of-functions optimizer; constructed outside of
            # the try block so that `optimizer` is always bound when the
            # history is returned below
            optimizer = SFO(f_df, params, start_indices, display=self.verbosity)

            try:
                params_opt = optimizer.optimize(num_passes=num_epochs)

                # set model parameters; only trained layers carry real
                # parameters, the others merely hold a placeholder
                for l in train_layers:
                    self.slstm[l].set_parameters(params_opt["slstm"][l])
                self.mcgsm._set_parameters(params_opt["mcgsm"], {"train_means": train_means})

            except KeyboardInterrupt:
                # interrupting the optimization ends training early; the
                # history collected so far is still returned
                pass

            return optimizer.hist_f_flat

        elif method.upper() == "SGD":
            loss = []
            # momentum terms, one per parameter
            diff = {"slstm": [0.0] * self.num_layers, "mcgsm": zeros_like(params["mcgsm"])}

            for l in train_layers:
                diff["slstm"][l] = {}
                for key in params["slstm"][l]:
                    diff["slstm"][l][key] = zeros_like(params["slstm"][l][key])

            for n in range(num_epochs):
                for b in range(0, inputs.shape[0] - batch_size + 1, batch_size):
                    # compute gradients
                    f, df = f_df(params, b)

                    loss.append(f / log(2.0) / self.num_channels)

                    # update SLSTM parameters
                    for l in train_layers:
                        for key in params["slstm"][l]:
                            diff["slstm"][l][key] = momentum * diff["slstm"][l][key] - df["slstm"][l][key]
                            params["slstm"][l][key] = params["slstm"][l][key] + learning_rate * diff["slstm"][l][key]

                    # update MCGSM parameters
                    diff["mcgsm"] = momentum * diff["mcgsm"] - df["mcgsm"]
                    params["mcgsm"] = params["mcgsm"] + learning_rate * diff["mcgsm"]

                    if self.verbosity > 0:
                        # epoch, current loss, mean over the most recent losses
                        print(
                            "{0:>5} {1:>10.4f} {2:>10.4f}".format(
                                n, loss[-1], mean(loss[-max([10, 20000 // batch_size]) :])
                            )
                        )

            return loss

        elif method.upper() == "ADAM":
            loss = []
            # running averages of the gradients and of the squared gradients
            diff_mean = {"slstm": [0.0] * self.num_layers, "mcgsm": zeros_like(params["mcgsm"])}
            diff_sqrd = {"slstm": [0.0] * self.num_layers, "mcgsm": zeros_like(params["mcgsm"])}

            for l in train_layers:
                diff_mean["slstm"][l] = {}
                diff_sqrd["slstm"][l] = {}
                for key in params["slstm"][l]:
                    diff_mean["slstm"][l][key] = zeros_like(params["slstm"][l][key])
                    diff_sqrd["slstm"][l][key] = zeros_like(params["slstm"][l][key])

            # step counter
            t = 1

            for n in range(num_epochs):
                for b in range(0, inputs.shape[0] - batch_size + 1, batch_size):
                    # compute gradients
                    f, df = f_df(params, b)

                    loss.append(f / log(2.0) / self.num_channels)

                    # include bias correction in step width
                    step_width = learning_rate / (1.0 - power(decay1, t)) * sqrt(1.0 - power(decay2, t))
                    t += 1

                    # update SLSTM parameters
                    for l in train_layers:
                        for key in params["slstm"][l]:
                            diff_mean["slstm"][l][key] = (
                                decay1 * diff_mean["slstm"][l][key] + (1.0 - decay1) * df["slstm"][l][key]
                            )
                            diff_sqrd["slstm"][l][key] = decay2 * diff_sqrd["slstm"][l][key] + (1.0 - decay2) * square(
                                df["slstm"][l][key]
                            )

                            params["slstm"][l][key] = params["slstm"][l][key] - step_width * diff_mean["slstm"][l][
                                key
                            ] / (1e-8 + sqrt(diff_sqrd["slstm"][l][key]))

                    # update MCGSM parameters
                    diff_mean["mcgsm"] = decay1 * diff_mean["mcgsm"] + (1.0 - decay1) * df["mcgsm"]
                    diff_sqrd["mcgsm"] = decay2 * diff_sqrd["mcgsm"] + (1.0 - decay2) * square(df["mcgsm"])
                    params["mcgsm"] = params["mcgsm"] - step_width * diff_mean["mcgsm"] / (
                        1e-8 + sqrt(diff_sqrd["mcgsm"])
                    )

                    if self.verbosity > 0:
                        # epoch, current loss, mean over the most recent losses
                        print(
                            "{0:>5} {1:>10.4f} {2:>10.4f}".format(
                                n, loss[-1], mean(loss[-max([10, 20000 // batch_size]) :])
                            )
                        )

            return loss

        else:
            raise ValueError("Unknown method '{0}'.".format(method))
Example #18
0
def optim_vae_sfo(model,
                  x,
                  v_init,
                  w_init,
                  n_batch,
                  n_passes,
                  hook,
                  n_resample=20,
                  resample_keepmem=False,
                  bernoulli_x=False,
                  display=0):
    """Optimize the parameters ``v``/``w`` of ``model`` with SFO.

    The dataset is cut into ``n_tot / n_batch`` minibatches; each minibatch
    (data columns plus a noise draw from ``model.gen_eps``) is registered as
    one SFO subfunction. After every pass over the data ``hook`` is called
    and the noise of a rotating subset of minibatches is redrawn.

    Parameters:
        model: object providing ``gen_eps(n_batch)``,
            ``dL_dw(v, w, x, eps)`` and ``dlogpw_dw(v, w)``.
        x: dict of arrays whose columns are datapoints (the datapoint count
            is read from ``shape[1]`` of one entry). It is handed to
            ``ndict.shuffleCols`` whose return value is ignored, so it is
            presumably shuffled in place.
        v_init, w_init: initial parameter dicts, passed to SFO as
            ``{'v': v_init, 'w': w_init}``.
        n_batch: number of columns per minibatch; must divide the number
            of datapoints.
        n_passes: number of single-pass SFO optimization rounds.
        hook: callable invoked as ``hook(i, v, w, LB)`` after pass ``i``,
            where ``LB`` is the mean of ``-f / n_batch`` (without the prior
            terms) over the ``f_df`` calls of that pass.
        n_resample: minibatch ``j`` gets fresh noise after pass ``i`` when
            ``i % n_resample == j % n_resample``; values <= 0 disable it.
        resample_keepmem: forwarded to ``optimizer.replace_subfunction``.
        bernoulli_x: if true, ``_x['x']`` of each minibatch is replaced by a
            binomial(1, p) sample with ``p`` taken from the data.
        display: forwarded to the SFO constructor.

    Raises:
        ValueError: if the number of datapoints is not a multiple of
            ``n_batch``.
    """

    # Shuffle columns of dataset x
    ndict.shuffleCols(x)

    # next(iter(...)) works on Python 2 and 3 alike (itervalues() does not).
    n_tot = next(iter(x.values())).shape[1]
    if n_tot % n_batch != 0:
        raise ValueError(
            "number of datapoints ({0}) is not a multiple of n_batch ({1})"
            .format(n_tot, n_batch))
    # Floor division keeps the count an int under true division as well.
    n_minibatches = n_tot // n_batch

    # Weight of the parameter prior per minibatch.
    prior_scale = float(n_batch) / n_tot

    def make_minibatch(i):
        """Return ``[index, data columns, fresh noise]`` for minibatch i."""
        _x = ndict.getCols(x, i * n_batch, (i + 1) * n_batch)
        _eps = model.gen_eps(n_batch)
        if bernoulli_x:
            _x['x'] = np.random.binomial(n=1, p=_x['x'])
        return [i, _x, _eps]

    minibatches = [make_minibatch(i) for i in range(n_minibatches)]

    # One-element lists so that f_df can update them without `nonlocal`.
    L = [0.]
    n_L = [0]

    def f_df(w, minibatch):
        """SFO subfunction: negated, per-datapoint objective and gradient."""
        _, x_minibatch, eps_minibatch = minibatch

        # Get gradient
        logpx, logpz, logqz, gv, gw = model.dL_dw(w['v'], w['w'], x_minibatch,
                                                  eps_minibatch)

        # Get gradient w.r.t. priors
        logpv, logpw, gv_prior, gw_prior = model.dlogpw_dw(w['v'], w['w'])
        gv = {i: gv[i] + prior_scale * gv_prior[i] for i in gv}
        gw = {i: gw[i] + prior_scale * gw_prior[i] for i in gw}

        f = (logpx.sum() + logpz.sum() - logqz.sum())
        # Running statistic reported through `hook` (prior terms excluded).
        L[0] += -f / (1. * n_batch)
        n_L[0] += 1
        f += prior_scale * logpv
        f += prior_scale * logpw

        # SFO minimizes, so flip the sign and normalize by the batch size.
        for i in gv:
            gv[i] *= -1. / n_batch
        for i in gw:
            gw[i] *= -1. / n_batch
        f *= -1. / n_batch

        return f, {'v': gv, 'w': gw}

    w_init = {'v': v_init, 'w': w_init}

    from sfo import SFO
    optimizer = SFO(f_df, w_init, minibatches, display=display)

    #optimizer.check_grad()

    for i in range(n_passes):
        w = optimizer.optimize(num_passes=1)
        LB = L[0] / (1. * n_L[0])
        hook(i, w['v'], w['w'], LB)
        L[0] = 0
        n_L[0] = 0
        # Reset noise epsilon of some minibatches
        for j in range(n_minibatches):
            if n_resample > 0 and i % n_resample == j % n_resample:
                minibatches[j] = make_minibatch(j)
                optimizer.replace_subfunction(j, resample_keepmem,
                                              minibatches[j])

    print("Finished!")
Example #19
0
    def fit(self, train_X, optimizer, param_init = None, sample_every=None):
		self.opt = optimizer
		n_train, n_vis = train_X.shape
		batch_size = self.batch_size

		if sample_every == None:
			sample_every = 10000000

		#theano.config.profile = True
		#theano.config.exception_verbosity='high'

		assert(n_vis == self.nv)

		train_X = self.shared_dataset(train_X)
		n_batches = np.ceil(n_train / float(batch_size)).astype('int')

		# theano variables for managing data (index minibatches, n examples in batch)
		index, n_ex = T.iscalars('batch_index', 'n_ex')
		batch_start = index*batch_size
		batch_stop = T.minimum(n_ex, (index + 1)*batch_size)
		effective_batch_size = batch_stop - batch_start

		# theano variables for learning
		lr = T.scalar('lr', dtype=theano.config.floatX)
		mom = T.scalar('mom', dtype=theano.config.floatX)

		if self.k == 1:
			# this one is for scaning over a batch and getting connectivity for each example
			# return grads too because T.grads through scan is awful
			# takes ~3x longer, but can experiment connectivity
			#K, grads = self.mpf.rbm_K2G(self.X, effective_batch_size)

			# this tiles out the minibatch matrix into a 3D tensor to compute connectivity
			#K, offs, y, y1, z= self.mpf.rbm_K(self.X, effective_batch_size)
			K = self.mpf.rbm_K(self.X, effective_batch_size)

		elif self.k == 2:
			if DEBUG:
				return_values = self.mpf.debug_rbm_K_2wise(self.X, effective_batch_size)	
				K = return_values[-1]
			else:
				K = self.mpf.rbm_K_2wise(self.X, effective_batch_size)
		else:
			raise('NotImplemented')

		reg = self.L1_reg * self.mpf.L1 + self.L2_reg * self.mpf.L2
		reg_grad = T.grad(reg, self.mpf.theta)

		# if not scan (tile out matrix into tensor)
		cost = K + reg
		grads = T.grad(cost, self.mpf.theta)

		# otherwise
		#grads = grads + reg_grad

		if param_init == None:
			self.mpf.theta.set_value(random_theta(D, DH, k=self.k))
		else:
			self.mpf.theta.set_value(np.asarray(np.concatenate(param_init), dtype=theano.config.floatX))

		if optimizer == 'sgd':
			updates = []
			theta = self.mpf.theta
			theta_update = self.mpf.theta_update

			upd = mom * theta_update - lr * grads
			updates.append((theta_update, upd))
			updates.append((theta, theta + upd))

			print 'compiling theano function'
			if DEBUG:
				return_values = list(return_values)
				return_values.append(cost)
				return_values.append(grads)
				train_model = theano.function(inputs=[index, n_ex, lr, mom], outputs=return_values, updates=updates, givens={self.X: train_X[batch_start:batch_stop]})
			else:
				train_model = theano.function(inputs=[index, n_ex, lr, mom], outputs=cost, updates=updates, givens={self.X: train_X[batch_start:batch_stop]})

			self.current_epoch = 0
			start = time.time()
			learning_rate_init = self.learning_rate
			while self.current_epoch < self.n_epochs:
				print 'epoch:', self.current_epoch
				self.current_epoch += 1
				effective_mom = self.final_momentum if self.current_epoch > self.momentum_switchover else self.initial_momentum

				avg_epoch_cost = 0
				last_debug = None
				for minibatch_idx in xrange(n_batches):
					avg_cost = train_model(minibatch_idx, n_train, self.learning_rate, effective_mom)
					#print '\t\t', np.isnan(gr).sum(), np.isnan(yy).sum(), np.isnan(yy1).sum(), np.isnan(zz).sum()
					if DEBUG:
						return_values, avg_cost, gradients = avg_cost[:-2], avg_cost[-2], avg_cost[-1]
						print_debug(return_values, last_debug)
						last_debug = return_values
					avg_epoch_cost += avg_cost
					#print '\t', minibatch_idx, avg_cost
				print '\t avg epoch cost:', avg_epoch_cost/n_batches
				self.learning_rate *= self.learning_rate_decay

				theta_fit = split_theta(self.mpf.theta.get_value(), self.mpf.n_visible, self.mpf.n_hidden, k=self.mpf.k)
				if (self.current_epoch % sample_every == 0):
					sample_and_save(theta_fit, self.mpf.n_hidden, self.current_epoch, learning_rate_init, self.mpf.k, self.opt)

			theta_opt = self.mpf.theta.get_value()
			end = time.time()

		elif optimizer == 'cg' or optimizer == 'bfgs':
			print "compiling theano functions"
			get_batch_size = theano.function([index, n_ex], effective_batch_size, name='get_batch_size')
			batch_cost_grads = theano.function([index, n_ex], [cost, grads], givens={self.X: train_X[batch_start:batch_stop, :]}, name='batch_cost')
			batch_cost = theano.function([index, n_ex], cost, givens={self.X: train_X[batch_start:batch_stop, :]}, name='batch_cost')
			batch_grads = theano.function([index, n_ex], grads, givens={self.X: train_X[batch_start:batch_stop, :]}, name='batch_cost')


			def train_fn_cost_grads(theta_value):
				print 'nbatches', n_batches

				self.mpf.theta.set_value(np.asarray(theta_value, dtype=theano.config.floatX), borrow=True)
				train_losses_grads = [batch_cost_gradst(i, n_train) for i in xrange(n_batches)]

				train_losses = [i[0] for i in train_losses_grads]
				train_grads = [i[1] for i in train_losses_grads]

				train_batch_sizes = [get_batch_size(i, n_train) for i in xrange(n_batches)]

				print len(train_losses), len(train_grads)
				print train_losses[0].shape, train_grads[0].shape
				returns = np.average(train_losses, weights=train_batch_sizes), np.average(train_grads, weights=train_batch_sizes, axis=0)
				return returns


			def train_fn_cost(theta_value):
				print 'nbatches', n_batches

				self.mpf.theta.set_value(np.asarray(theta_value, dtype=theano.config.floatX), borrow=True)
				train_costs = [batch_cost(i, n_train) for i in xrange(n_batches)]
				train_batch_sizes = [get_batch_size(i, n_train) for i in xrange(n_batches)]

				return np.average(train_costs, weights=train_batch_sizes)

			def train_fn_grads(theta_value):
				print 'nbatches', n_batches

				self.mpf.theta.set_value(np.asarray(theta_value, dtype=theano.config.floatX), borrow=True)
				train_grads = [batch_grads(i, n_train) for i in xrange(n_batches)]
				train_batch_sizes = [get_batch_size(i, n_train) for i in xrange(n_batches)]

				return np.average(train_grads, weights=train_batch_sizes, axis=0)


			###############
			# TRAIN MODEL #
			###############
			def my_callback():
				print 'wtf'

			from scipy.optimize import minimize
			from scipy.optimize import fmin_bfgs, fmin_l_bfgs_b
			if optimizer == 'cg':
				pass
			elif optimizer == 'bfgs':
				print 'using bfgs'
				#theta_opt, f_theta_opt, info = fmin_l_bfgs_b(train_fn, self.mpf.theta.get_value(), iprint=1, maxfun=self.n_epochs)
				start = time.time()
				disp = True
				print 'ready to minimize'
				#result_obj = minimize(train_fn, self.mpf.theta.get_value(), jac=True, method='BFGS', options={'maxiter':self.n_epochs, 'disp':disp}, callback=my_callback())
				#theta_opt = fmin_bfgs(f=train_fn_cost, x0=self.mpf.theta.get_value(), fprime=train_fn_grads, disp=1, maxiter=self.n_epochs)
				theta_opt, fff, ddd = fmin_l_bfgs_b(func=train_fn_cost, x0=self.mpf.theta.get_value(), fprime=train_fn_grads, disp=1, maxiter=self.n_epochs)
				print 'done minimize ya right'
				end = time.time()

		elif optimizer == 'sof':
			print "compiling theano functions"
			batch_cost_grads = theano.function([index, n_ex], [cost, grads], givens={self.X: train_X[batch_start:batch_stop, :]}, name='batch_cost')
			batch_cost = theano.function([index, n_ex], cost, givens={self.X: train_X[batch_start:batch_stop, :]}, name='batch_cost')
			batch_grads = theano.function([index, n_ex], grads, givens={self.X: train_X[batch_start:batch_stop, :]}, name='batch_cost')


			def train_fn(theta_value, i):
				self.mpf.theta.set_value(np.asarray(theta_value, dtype=theano.config.floatX), borrow=True)

				train_losses, train_grads = batch_cost_grads(i, n_train)
				
				return train_losses, train_grads

			###############
			# TRAIN MODEL #
			###############
			if param_init == None:
				theta.set_value(random_theta(D, DH))
			else:
				w0, bh0, bv0 = param_init
				self.mpf.theta.set_value(np.asarray(np.concatenate((w0, bh0, bv0)), dtype=theano.config.floatX))


			print 'using sof'
			sys.path.append('/export/mlrg/ebuchman/Programming/Sum-of-Functions-Optimizer')
			from sfo import SFO
			print 'n batches', n_batches
			print 'n epochs' , self.n_epochs
			optimizer = SFO(train_fn, self.mpf.theta.get_value(), np.arange(n_batches))
			start = time.time()
			theta_opt = optimizer.optimize(num_passes = self.n_epochs)
			end = time.time()

		
		self.mpf.theta.set_value(theta_opt.astype(theano.config.floatX), borrow=True)
		return end-start
# Build the symbolic sampler and compile it into a callable Theano function.
# NOTE(review): sample_updates is not passed to theano.function here; if
# get_samps relies on shared random state, confirm this is intended.
samplesT, tT, sample_updates = get_samps(nsamps, paramsT)
sample_T = theano.function(
	inputs=[
		mu_centersT, mu_spreadsT, mu_biasesT, mu_MT, mu_bT,
		cov_centersT, cov_spreadsT, cov_biasesT, cov_MT, cov_bT,
	],
	outputs=samplesT,
	allow_input_downcast=True,
)

def sample(params):
	"""Call the compiled sampler with params[0] .. params[9] as arguments."""
	# Explicit indexing (not a slice) so that a too-short params still fails
	# with the same lookup error as before.
	return sample_T(*[params[idx] for idx in range(10)])


# Unattended training: keep running two SFO passes at a time until the loss
# evaluated on fdata drops to -2.50 or below.
if automate_training:
	optimizer = SFO(f_df, init_params, subfuncs)
	end_loss=99.0
	while end_loss>-2.50:
		linalgerror=False
		try:
			opt_params = optimizer.optimize(num_passes=2)
			end_loss = f_df(opt_params,fdata)[0]
		# Public exception name; the np.linalg.linalg submodule path is a
		# private/deprecated location in current NumPy releases.
		except np.linalg.LinAlgError:
			linalgerror=True
		
		# On a NaN loss or a linear-algebra failure, draw fresh initial values.
		# NOTE(review): the new arrays are not handed to `optimizer` within
		# this block -- confirm the restart is wired up elsewhere.
		if np.isnan(end_loss) or linalgerror:
			mu_centers=(np.random.randn(nx, nhid_mu)*1.0).astype(np.float32)
			mu_spreads=(np.zeros((nx, nhid_mu))-1.0).astype(np.float32)
			mu_biases=np.zeros(nhid_mu).astype(np.float32)
			mu_M=(np.random.randn(nhid_mu, ntgates*nx)*0.01).astype(np.float32)
			mu_b=np.zeros((ntgates, nx)).astype(np.float32)
Example #21
0
	def train(self, images,
			batch_size=50,
			num_epochs=20,
			method='SGD',
			train_means=False,
			train_top_layer=False,
			momentum=0.9,
			learning_rate=1.,
			decay1=0.9,
			decay2=0.999,
			precondition=True):
		"""
		Train the SLSTM layers and the MCGSM on the given images.

		@type  images: C{ndarray}/C{list}
		@param images: an array or a list of images

		@type  batch_size: C{int}
		@param batch_size: number of consecutive training examples per gradient
		evaluation; the batch size given to each SLSTM is capped at
		C{self.MAX_BATCH_SIZE}

		@type  num_epochs: C{int}
		@param num_epochs: number of passes through the training set

		@type  method: C{str}
		@param method: one of 'SFO', 'SGD' or 'ADAM' (case-insensitive)

		@type  train_means: C{bool}
		@param train_means: forwarded to the MCGSM parameter/gradient routines

		@type  train_top_layer: C{bool}
		@param train_top_layer: if true, only the last SLSTM layer (and the
		MCGSM) is trained

		@type  momentum: C{float}
		@param momentum: momentum coefficient (SGD only)

		@type  learning_rate: C{float}
		@param learning_rate: step width (SGD and ADAM)

		@type  decay1: C{float}
		@param decay1: decay rate of the running gradient mean (ADAM only)

		@type  decay2: C{float}
		@param decay2: decay rate of the running squared gradient (ADAM only)

		@type  precondition: C{bool}
		@param precondition: if true, inputs and outputs are passed through
		C{self._precondition} before training

		@rtype: C{list}
		@return: for SGD and ADAM the loss of every minibatch; for SFO the
		optimizer's C{hist_f_flat}

		@raise ValueError: if C{method} is unknown
		"""

		print('Preprocessing...')

		inputs, outputs = self._preprocess(images)

		if precondition:
			print('Preconditioning...')

			# remove correlations
			inputs, outputs = self._precondition(inputs, outputs)

		# indicates which layers will be trained
		train_layers = [self.num_layers - 1] if train_top_layer else range(self.num_layers)

		print('Creating SLSTMs...')

		# create SLSTMs
		for l in range(self.num_layers):
			self.slstm[l] = SLSTM(
				num_rows=inputs.shape[1],
				num_cols=inputs.shape[2],
				num_channels=inputs.shape[3] if l < 1 else self.num_hiddens,
				num_hiddens=self.num_hiddens,
				batch_size=min([batch_size, self.MAX_BATCH_SIZE]),
				nonlinearity=self.nonlinearity,
				extended=self.extended,
				slstm=self.slstm[l],
				verbosity=self.verbosity)

		# compute loss function and its gradient
		def f_df(params, idx):
			# set model parameters
			for l in train_layers:
				self.slstm[l].set_parameters(params['slstm'][l])
			self.mcgsm._set_parameters(params['mcgsm'], {'train_means': train_means})

			# select batch and compute hidden activations
			Y = outputs[idx:idx + batch_size]
			H = inputs[idx:idx + batch_size]

			for l in range(self.num_layers):
				H = self.slstm[l].forward(H)

			# form inputs to MCGSM
			H_flat = H.reshape(-1, self.num_hiddens).T
			Y_flat = Y.reshape(-1, self.num_channels).T

			# negative count, so dividing by it negates and averages in one step
			norm_const = -H_flat.shape[1]

			# compute gradients
			df_dh, _, loglik = self.mcgsm._data_gradient(H_flat, Y_flat)
			df_dh = df_dh.T.reshape(*H.shape) / norm_const

			# ignore bottom-right pixel (BSDS300)
			df_dh[:, -1, -1] = 0.

			# average negative log-likelihood
			f = sum(loglik) / norm_const

			df_dtheta = {}
			df_dtheta['slstm'] = [0.] * self.num_layers

			# backpropagate from the top layer down, stopping at the first
			# layer which is not being trained
			for l in range(self.num_layers)[::-1]:
				if l not in train_layers:
					break
				if l > min(train_layers):
					# derivative with respect to inputs of layer l are derivatives
					# of hidden states of layer l - 1
					df_dtheta['slstm'][l] = self.slstm[l].backward(df_dh, force_backward=True)
					df_dh = df_dtheta['slstm'][l]['inputs']
					del df_dtheta['slstm'][l]['inputs']

				else:
					# no need to compute derivatives with respect to input units
					df_dtheta['slstm'][l] = self.slstm[l].backward(df_dh)

			# compute gradient of MCGSM
			df_dtheta['mcgsm'] = self.mcgsm._parameter_gradient(H_flat, Y_flat,
				parameters={'train_means': train_means}) * log(2.) * self.mcgsm.dim_out

			return f, df_dtheta

		# collect current parameters; untrained layers keep the 0. placeholder
		params = {}
		params['slstm'] = [0.] * self.num_layers
		for l in range(self.num_layers)[::-1]:
			if l not in train_layers:
				break
			params['slstm'][l] = self.slstm[l].parameters()
		params['mcgsm'] = self.mcgsm._parameters({'train_means': train_means})

		# a start index for each batch
		start_indices = range(
			0, inputs.shape[0] - batch_size + 1, batch_size)

		print('Training...')

		if method.upper() == 'SFO':
			# created outside the try block so that `optimizer` is always bound
			# when its history is returned below
			optimizer = SFO(f_df, params, start_indices, display=self.verbosity)

			try:
				# optimize using sum-of-functions optimizer
				params_opt = optimizer.optimize(num_passes=num_epochs)

				# set model parameters; only trained layers have real entries in
				# params_opt['slstm'], the others hold the 0. placeholder
				for l in train_layers:
					self.slstm[l].set_parameters(params_opt['slstm'][l])
				self.mcgsm._set_parameters(params_opt['mcgsm'], {'train_means': train_means})

			except KeyboardInterrupt:
				# allow the user to stop training early and still get the history
				pass

			return optimizer.hist_f_flat

		elif method.upper() == 'SGD':
			loss = []
			# momentum buffers, same structure as params
			diff = {
				'slstm': [0.] * self.num_layers,
				'mcgsm': zeros_like(params['mcgsm'])}

			for l in train_layers:
				diff['slstm'][l] = {}
				for key in params['slstm'][l]:
					diff['slstm'][l][key] = zeros_like(params['slstm'][l][key])

			for n in range(num_epochs):
				for b in start_indices:
					# compute gradients
					f, df = f_df(params, b)

					loss.append(f)

					# update SLSTM parameters
					for l in train_layers:
						for key in params['slstm'][l]:
							diff['slstm'][l][key] = momentum * diff['slstm'][l][key] - df['slstm'][l][key]
							params['slstm'][l][key] = params['slstm'][l][key] + learning_rate * diff['slstm'][l][key]

					# update MCGSM parameters
					diff['mcgsm'] = momentum * diff['mcgsm'] - df['mcgsm']
					params['mcgsm'] = params['mcgsm'] + learning_rate * diff['mcgsm']

					if self.verbosity > 0:
						# epoch, current loss, mean over the most recent batches
						print('{0:>5} {1:>10.4f} {2:>10.4f}'.format(
							n, loss[-1], mean(loss[-max([10, 20000 // batch_size]):])))

			return loss

		elif method.upper() == 'ADAM':
			loss = []
			# running averages of the gradient and the squared gradient
			diff_mean = {
				'slstm': [0.] * self.num_layers,
				'mcgsm': zeros_like(params['mcgsm'])}
			diff_sqrd = {
				'slstm': [0.] * self.num_layers,
				'mcgsm': zeros_like(params['mcgsm'])}

			for l in train_layers:
				diff_mean['slstm'][l] = {}
				diff_sqrd['slstm'][l] = {}
				for key in params['slstm'][l]:
					diff_mean['slstm'][l][key] = zeros_like(params['slstm'][l][key])
					diff_sqrd['slstm'][l][key] = zeros_like(params['slstm'][l][key])

			# step counter
			t = 1

			for n in range(num_epochs):
				for b in start_indices:
					# compute gradients
					f, df = f_df(params, b)

					loss.append(f)

					# include bias correction in step width
					step_width = learning_rate / (1. - power(decay1, t)) * sqrt(1. - power(decay2, t))
					t += 1

					# update SLSTM parameters
					for l in train_layers:
						for key in params['slstm'][l]:
							diff_mean['slstm'][l][key] = decay1 * diff_mean['slstm'][l][key] \
								+ (1. - decay1) * df['slstm'][l][key]
							diff_sqrd['slstm'][l][key] = decay2 * diff_sqrd['slstm'][l][key] \
								+ (1. - decay2) * square(df['slstm'][l][key])

							params['slstm'][l][key] = params['slstm'][l][key] - \
								step_width * diff_mean['slstm'][l][key] / (1e-8 + sqrt(diff_sqrd['slstm'][l][key]))

					# update MCGSM parameters
					diff_mean['mcgsm'] = decay1 * diff_mean['mcgsm'] + (1. - decay1) * df['mcgsm']
					diff_sqrd['mcgsm'] = decay2 * diff_sqrd['mcgsm'] + (1. - decay2) * square(df['mcgsm'])
					params['mcgsm'] = params['mcgsm'] - \
						step_width * diff_mean['mcgsm'] / (1e-8 + sqrt(diff_sqrd['mcgsm']))

					if self.verbosity > 0:
						# epoch, current loss, mean over the most recent batches
						print('{0:>5} {1:>10.4f} {2:>10.4f}'.format(
							n, loss[-1], mean(loss[-max([10, 20000 // batch_size]):])))

			return loss


		else:
			raise ValueError('Unknown method \'{0}\'.'.format(method))
Example #22
0
def optim_vae_sfo(model, x, v_init, w_init, n_batch, n_passes, hook, n_resample=20, resample_keepmem=False, bernoulli_x=False, display=0):
    """Optimize the parameters ``v``/``w`` of ``model`` with SFO.

    The dataset is cut into ``n_tot / n_batch`` minibatches; each minibatch
    (data columns plus a noise draw from ``model.gen_eps``) is registered as
    one SFO subfunction. After every pass over the data ``hook`` is called
    and the noise of a rotating subset of minibatches is redrawn.

    Parameters:
        model: object providing ``gen_eps(n_batch)``,
            ``dL_dw(v, w, x, eps)`` and ``dlogpw_dw(v, w)``.
        x: dict of arrays whose columns are datapoints (the datapoint count
            is read from ``shape[1]`` of one entry). It is handed to
            ``ndict.shuffleCols`` whose return value is ignored, so it is
            presumably shuffled in place.
        v_init, w_init: initial parameter dicts, passed to SFO as
            ``{'v': v_init, 'w': w_init}``.
        n_batch: number of columns per minibatch; must divide the number
            of datapoints.
        n_passes: number of single-pass SFO optimization rounds.
        hook: callable invoked as ``hook(i, v, w, LB)`` after pass ``i``,
            where ``LB`` is the mean of ``-f / n_batch`` (without the prior
            terms) over the ``f_df`` calls of that pass.
        n_resample: minibatch ``j`` gets fresh noise after pass ``i`` when
            ``i % n_resample == j % n_resample``; values <= 0 disable it.
        resample_keepmem: forwarded to ``optimizer.replace_subfunction``.
        bernoulli_x: if true, ``_x['x']`` of each minibatch is replaced by a
            binomial(1, p) sample with ``p`` taken from the data.
        display: forwarded to the SFO constructor.

    Raises:
        ValueError: if the number of datapoints is not a multiple of
            ``n_batch``.
    """

    # Shuffle columns of dataset x
    ndict.shuffleCols(x)

    # next(iter(...)) works on Python 2 and 3 alike (itervalues() does not).
    n_tot = next(iter(x.values())).shape[1]
    if n_tot % n_batch != 0:
        raise ValueError(
            "number of datapoints ({0}) is not a multiple of n_batch ({1})"
            .format(n_tot, n_batch))
    # Floor division keeps the count an int under true division as well.
    n_minibatches = n_tot // n_batch

    # Weight of the parameter prior per minibatch.
    prior_scale = float(n_batch) / n_tot

    def make_minibatch(i):
        """Return ``[index, data columns, fresh noise]`` for minibatch i."""
        _x = ndict.getCols(x, i * n_batch, (i + 1) * n_batch)
        _eps = model.gen_eps(n_batch)
        if bernoulli_x:
            _x['x'] = np.random.binomial(n=1, p=_x['x'])
        return [i, _x, _eps]

    minibatches = [make_minibatch(i) for i in range(n_minibatches)]

    # One-element lists so that f_df can update them without `nonlocal`.
    L = [0.]
    n_L = [0]

    def f_df(w, minibatch):
        """SFO subfunction: negated, per-datapoint objective and gradient."""
        _, x_minibatch, eps_minibatch = minibatch

        # Get gradient
        logpx, logpz, logqz, gv, gw = model.dL_dw(w['v'], w['w'], x_minibatch, eps_minibatch)

        # Get gradient w.r.t. priors
        logpv, logpw, gv_prior, gw_prior = model.dlogpw_dw(w['v'], w['w'])
        gv = {i: gv[i] + prior_scale * gv_prior[i] for i in gv}
        gw = {i: gw[i] + prior_scale * gw_prior[i] for i in gw}

        f = (logpx.sum() + logpz.sum() - logqz.sum())
        # Running statistic reported through `hook` (prior terms excluded).
        L[0] += -f/(1.*n_batch)
        n_L[0] += 1
        f += prior_scale * logpv
        f += prior_scale * logpw

        # SFO minimizes, so flip the sign and normalize by the batch size.
        for i in gv:
            gv[i] *= -1./n_batch
        for i in gw:
            gw[i] *= -1./n_batch
        f *= -1./n_batch

        return f, {'v':gv,'w':gw}

    w_init = {'v':v_init, 'w':w_init}

    from sfo import SFO
    optimizer = SFO(f_df, w_init, minibatches, display=display)

    #optimizer.check_grad()

    for i in range(n_passes):
        w = optimizer.optimize(num_passes=1)
        LB = L[0]/(1.*n_L[0])
        hook(i, w['v'], w['w'], LB)
        L[0] = 0
        n_L[0] = 0
        # Reset noise epsilon of some minibatches
        for j in range(n_minibatches):
            if n_resample > 0 and i%n_resample == j%n_resample:
                minibatches[j] = make_minibatch(j)
                optimizer.replace_subfunction(j, resample_keepmem, minibatches[j])

    print("Finished!")
# Build the symbolic sampler and compile it into a callable Theano function.
# NOTE(review): sample_updates is not passed to theano.function here; if
# get_samps relies on shared random state, confirm this is intended.
samplesT, tT, sample_updates = get_samps(nsamps, paramsT)
sample_T = theano.function(
	inputs=[
		muW0T, muW1T, muW2T, mub0T, mub1T, mub2T,
		covW0T, covW1T, covW2T, covb0T, covb1T, covb2T,
	],
	outputs=samplesT,
	allow_input_downcast=True,
)

def sample(params):
	"""Call the compiled sampler with params[0] .. params[11] as arguments."""
	# Explicit indexing (not a slice) so that a too-short params still fails
	# with the same lookup error as before.
	return sample_T(*[params[idx] for idx in range(12)])

# Creating the optimizer

optimizer = SFO(f_df, init_params, subfuncs)

# Running the optimization

init_loss = f_df(init_params,subfuncs[0])[0]
print init_loss

keyin=''
while keyin!='y':
	opt_params = optimizer.optimize(num_passes=24*4)
	end_loss = f_df(opt_params,subfuncs[0])[0]
	print 'Current loss: ', end_loss
	W=opt_params[0]
	pp.scatter(W[0,:],W[1,:]); pp.show()
	keyin=raw_input('End optimization? (y)')
# Build the symbolic sampler and compile it into a callable Theano function.
# NOTE(review): sample_updates is not passed to theano.function here; if
# get_samps relies on shared random state, confirm this is intended.
samplesT, tT, sample_updates = get_samps(nsamps, paramsT)
sample_T = theano.function(
	inputs=[
		muW0T, muW1T, muW2T, mub0T, mub1T, mub2T,
		covW0T, covW1T, covW2T, covb0T, covb1T, covb2T,
	],
	outputs=samplesT,
	allow_input_downcast=True,
)

def sample(params):
	"""Call the compiled sampler with params[0] .. params[11] as arguments."""
	# Explicit indexing (not a slice) so that a too-short params still fails
	# with the same lookup error as before.
	return sample_T(*[params[idx] for idx in range(12)])

# Creating the optimizer

optimizer = SFO(f_df, init_params, subfuncs)

# Running the optimization

init_loss = f_df(init_params,subfuncs[0])[0]
print init_loss

keyin=''
while keyin!='y':
	opt_params = optimizer.optimize(num_passes=12)
	end_loss = f_df(opt_params,subfuncs[0])[0]
	print 'Current loss: ', end_loss
	W=opt_params[0]
	pp.scatter(W[0,:],W[1,:]); pp.show()
	keyin=raw_input('End optimization? (y)')