Example 1
def sample_chain(x0,
                 N,
                 energy_difference,
                 proposal_stddev,
                 thinning_factor=1,
                 burn_in=0,
                 temperature=1.0):
    """
    Vanilla Markov Chain Monte Carlo sampler that proposes changes
    according to an isotropic Normal distribution N(0, proposal_stddev).

    This makes absolutely no use of the fact that we
    are dealing with an autoencoder apart from the
    fact that the energy_difference function is usually
    meant to be obtained from a DAE's reconstruction function.
    """

    if len(x0.shape) != 1:
        raise ValueError("Wrong dimension for x0. This function is not vectorial.")

    if thinning_factor < 1:
        raise ValueError(
            "You misunderstood the thinning_factor. It should be 1 for no thinning, and 32 if we want one out of every 32 samples."
        )

    proposal = lambda current_x: current_x + np.random.normal(
        size=current_x.shape, scale=proposal_stddev)

    def iterate_N_times(current_x, energy_difference, N):
        for _ in np.arange(N):
            proposed_x = proposal(current_x)
            loga = -energy_difference(proposed_x, current_x) / temperature
            if loga >= 0 or loga >= np.log(np.random.uniform(0, 1)):
                # accepted !
                current_x = proposed_x
                iterate_N_times.accepted_counter += 1
            else:
                iterate_N_times.rejected_counter += 1

        return current_x

    iterate_N_times.accepted_counter = 0
    iterate_N_times.rejected_counter = 0

    # Start with the burn-in iterations.
    current_x = x0
    current_x = iterate_N_times(current_x, energy_difference, burn_in)

    # Then we can think about collecting samples.
    samples_list = []
    # Start from the 'current_x' from the burn_in
    # and not from x0. Reset the acceptance counters.
    iterate_N_times.accepted_counter = 0
    iterate_N_times.rejected_counter = 0

    progress_logger = make_progress_logger("Sampling")

    for n in np.arange(0, N):
        current_x = iterate_N_times(current_x, energy_difference,
                                    thinning_factor)
        # collect sample after running through the thinning iterations
        samples_list.append(current_x)
        progress_logger(1.0 * n / N)

    samples = np.vstack(samples_list)
    acceptance_ratio = iterate_N_times.accepted_counter * 1.0 / (
        iterate_N_times.accepted_counter + iterate_N_times.rejected_counter)

    return (samples, acceptance_ratio)
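
For context, here is a minimal usage sketch of the sampler above. It assumes sample_chain (and the make_progress_logger helper it relies on) is in scope from this module; the quadratic energy is a toy choice corresponding to a standard Normal target.

import numpy as np

def toy_energy_difference(proposed_x, current_x):
    # E(x) = 0.5 * ||x||^2, so this returns E(proposed_x) - E(current_x).
    return 0.5 * (np.sum(proposed_x ** 2) - np.sum(current_x ** 2))

x0 = np.zeros((2,))
(samples, acceptance_ratio) = sample_chain(x0, 1000,
                                           toy_energy_difference,
                                           proposal_stddev=0.5,
                                           thinning_factor=10,
                                           burn_in=500)
# samples has shape (1000, 2); acceptance_ratio is the fraction of accepted proposals.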
Example 2
def sample_chain(x0, N,
                 energy_difference, proposal_stddev,
                 thinning_factor = 1, burn_in = 0, temperature = 1.0):
    """
    Vanilla Markov Chain Monte Carlo sampler that proposes changes
    according to an isotropic Normal distribution N(0, proposal_stddev).

    This makes absolutely no use of the fact that we
    are dealing with an autoencoder apart from the
    fact that the energy_difference function is usually
    meant to be obtained from a DAE's reconstruction function.
    """

    if len(x0.shape) != 1:
        raise ValueError("Wrong dimension for x0. This function is not vectorial.")

    if thinning_factor < 1:
        raise ValueError("You misunderstood the thinning_factor. It should be 1 for no thinning, and 32 if we want one out of every 32 samples.")

    proposal = lambda current_x: current_x + np.random.normal(size=current_x.shape, scale=proposal_stddev)


    def iterate_N_times(current_x, energy_difference, N):
        for _ in np.arange(N):
            proposed_x = proposal(current_x)
            loga = - energy_difference(proposed_x, current_x) / temperature
            if loga >= 0 or loga >= np.log(np.random.uniform(0,1)):
                # accepted !
                current_x = proposed_x
                iterate_N_times.accepted_counter += 1
            else:
                iterate_N_times.rejected_counter += 1

        return current_x

    iterate_N_times.accepted_counter = 0
    iterate_N_times.rejected_counter = 0


    # Start with the burn-in iterations.
    current_x = x0
    current_x = iterate_N_times(current_x, energy_difference, burn_in)

    # Then we can think about collecting samples.
    samples_list = []
    # Start from the 'current_x' from the burn_in
    # and not from x0. Reset the acceptance counters.
    iterate_N_times.accepted_counter = 0
    iterate_N_times.rejected_counter = 0

    progress_logger = make_progress_logger("Sampling")
        
    for n in np.arange(0,N):
        current_x = iterate_N_times(current_x, energy_difference, thinning_factor)
        # collect sample after running through the thinning iterations
        samples_list.append(current_x)
        progress_logger(1.0*n/N)

    samples = np.vstack(samples_list)
    acceptance_ratio = iterate_N_times.accepted_counter * 1.0 / (iterate_N_times.accepted_counter + iterate_N_times.rejected_counter)

    return (samples, acceptance_ratio)
Example 3
def sample_chain(x0, N,
                 energy_difference, noise_levels,
                 r, r_prime,
                 thinning_factor = 1, burn_in = 0,
                 accept_all_proposals = False, proposal_noise_scheme = 'merge_x', omit_asymmetric_proposal_factor = False):
    """
    Will sample N values for the chain starting with x0.

    noise_levels is a dict with keys 
    ["train_stddev"], ["train_stddev", "langevin_beta"] or ["train_stddev", "langevin_stddev"]

    """

    print(proposal_noise_scheme)

    assert len(x0.shape) == 1, "Wrong dimension for x0."

    assert thinning_factor >= 1, "You misunderstood the thinning_factor. It should be 1 for no thinning, and 32 if we want one out of every 32 samples."

    train_stddev    = noise_levels["train_stddev"]
    langevin_stddev = noise_levels["langevin_stddev"]
    langevin_beta   = noise_levels["langevin_beta"]
    temperature     = noise_levels["temperature"]

    def langevin_proposal(current_x, preimage_current_x):

        # We are using the term "preimage" here because it corresponds
        # to the preimage when langevin_beta=1.0.
        # Otherwise, it should be called the "noisy_ancestor" or something
        # like that to reflect the fact that it's more about
        #
        # x_{\textrm{noisy}}^{(t)}&=&x^{(t)}+\epsilon\hspace{1em}for\hspace{1em}\epsilon\sim\mathcal{N}(0,\sigma^{2})
        # x^{*}&=&\left(1-\beta\right)x_{\textrm{noisy}}^{(t)}+\beta r^{*}(x_{\textrm{noisy}}^{(t)})
        #
        # than about being the preimage. Latex the stuff above to read it properly.

        # This function accesses the variables from the "closure" : accept_all_proposals, proposal_noise_scheme

        d = current_x.shape[0]

        if proposal_noise_scheme == 'merge_x':
            preimage_proposed_x = current_x + np.random.normal(size=(d,), scale=langevin_stddev)
            proposed_x = (1-langevin_beta) * preimage_proposed_x + langevin_beta * r(preimage_proposed_x)
        elif proposal_noise_scheme == 'noise_E':
            preimage_proposed_x = current_x + np.random.normal(size=(d,), scale=langevin_stddev)
            proposed_x = current_x - langevin_beta * preimage_proposed_x + langevin_beta * r(preimage_proposed_x)
        elif proposal_noise_scheme == 'noise_r':
            preimage_proposed_x = current_x + np.random.normal(size=(d,), scale=langevin_stddev)
            proposed_x = (1-langevin_beta)*current_x  + langevin_beta * r(preimage_proposed_x)
        else:
            raise("Unrecognized proposal_noise_scheme : %s" % proposal_noise_scheme)

        if accept_all_proposals or omit_asymmetric_proposal_factor:
            asymmetric_correction_log_factor = 0.0
        else:
            # Now we need to compute
            # log q( current_x | proposed_x ) - log q( proposed_x | current_x )

            A = np.zeros((2,))
            B = np.zeros((2,))

            A[0] = - 0.5/langevin_stddev**2 * ((preimage_current_x - proposed_x)**2).sum()
            B[0] = - 0.5/langevin_stddev**2 * ((preimage_proposed_x - current_x)**2).sum()
            if proposal_noise_scheme == 'merge_x':
                A[1] = -1 * np.log( np.linalg.det( (1-langevin_beta) * np.eye(d) +  langevin_beta * r_prime(preimage_current_x)) )
                B[1] = -1 * np.log( np.linalg.det( (1-langevin_beta) * np.eye(d) +  langevin_beta * r_prime(preimage_proposed_x)) )
            elif proposal_noise_scheme == 'noise_E':
                # clueless
                A[1] = -1 * np.log( np.linalg.det( (-langevin_beta) * np.eye(d) + langevin_beta * r_prime(preimage_current_x)) )
                B[1] = -1 * np.log( np.linalg.det( (-langevin_beta) * np.eye(d) + langevin_beta * r_prime(preimage_proposed_x)) )
                #pass
            elif proposal_noise_scheme == 'noise_r':
                # clueless
                A[1] = -1 * np.log(  np.linalg.det( langevin_beta * r_prime(preimage_current_x)) )
                B[1] = -1 * np.log( np.linalg.det( langevin_beta * r_prime(preimage_proposed_x)) )
                #pass
            else:
                raise("Unrecognized proposal_noise_scheme : %s" % proposal_noise_scheme)

            asymmetric_correction_log_factor = A[0] + A[1] - B[0] - B[1]                


        return (proposed_x, preimage_proposed_x, asymmetric_correction_log_factor)


    def iterate_N_times(current_x, preimage_current_x, energy_difference, N):
        for _ in np.arange(N):
            (proposed_x, preimage_proposed_x, asymmetric_correction_log_factor) = langevin_proposal(current_x, preimage_current_x)

            if accept_all_proposals:
                loga = 0.0
            else:
                # This is a - in front of the energy difference because
                # log( p(proposed_x) / p(current_x) ) \approx -E(proposed_x) - -E(current_x) = - energy_difference(proposed_x, current_x)
                loga = - energy_difference(proposed_x, current_x) / temperature + asymmetric_correction_log_factor
                # loga = - energy_difference(proposed_x, current_x) + asymmetric_correction_log_factor

            if accept_all_proposals or loga >= 0.0 or loga >= np.log(np.random.uniform(0,1)):
                # accepted !
                current_x = proposed_x
                preimage_current_x = preimage_proposed_x
                iterate_N_times.accepted_counter += 1
            else:
                iterate_N_times.rejected_counter += 1

        return (current_x, preimage_current_x)

    iterate_N_times.accepted_counter = 0
    iterate_N_times.rejected_counter = 0


    # Start with the burn-in iterations.
    current_x = x0
    # not quite the actual pre-image, but it's just for initialization purposes
    preimage_current_x = current_x
    (current_x, preimage_current_x) = iterate_N_times(current_x, preimage_current_x, energy_difference, burn_in)

    # Then we can think about collecting samples.
    samples_list = []
    # Start from the 'current_x' from the burn_in
    # and not from x0. Reset the acceptance counters.
    iterate_N_times.accepted_counter = 0
    iterate_N_times.rejected_counter = 0

    progress_logger = make_progress_logger("Sampling")

    for n in np.arange(0,N):
        (current_x, preimage_current_x) = iterate_N_times(current_x, preimage_current_x, energy_difference, thinning_factor)
        # collect sample after running through the thinning iterations
        samples_list.append(current_x)
        progress_logger(1.0*n/N)

    samples = np.vstack(samples_list)
    acceptance_ratio = iterate_N_times.accepted_counter * 1.0 / (iterate_N_times.accepted_counter + iterate_N_times.rejected_counter)

    return (samples, acceptance_ratio, noise_levels)
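
In equations, the acceptance rule that iterate_N_times implements above is the standard Metropolis-Hastings one. This is only a sketch of the bookkeeping, matching the code's own comments rather than a derivation for any particular proposal_noise_scheme:

    \log a \;=\; -\,\frac{E(x^{*}) - E(x^{(t)})}{T} \;+\; \log q\big(x^{(t)} \mid x^{*}\big) \;-\; \log q\big(x^{*} \mid x^{(t)}\big)

The first term is - energy_difference(proposed_x, current_x) / temperature, the remaining two terms make up asymmetric_correction_log_factor, and the proposed x^{*} is accepted whenever \log a \ge \log u with u \sim \mathrm{Uniform}(0, 1).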
Example 4
    def fit_with_stddevs_sequence(self, X, X_valid, stddevs, optimization_args):
        """
        stddevs has fields 'train', 'valid' and any number of other variants on 'valid'.
        The special key is 'train', used for training.
        The validation errors are computed with all the other keys that contain
        information about the stddev. Obviously, we want to use one called 'valid', but
        we can also have different alternatives such as 'alt_valid' or 'valid2' with
        a different sequence of stddevs.

        stddevs is of the form {'train' : [{'target' : 1.0, 'sampled' : 4.0},
                                           {'target' : 0.8, 'sampled' : 3.0},
                                           ...
                                           ],
                                'valid' : [{'target' : 1.0, 'sampled' : 4.0},
                                           {'target' : 0.8, 'sampled' : 3.0},
                                           ...
                                           ],
                                ...
                                }

        X is an array of shape (n_train, d)
        X_valid is an array of shape (n_valid, d). It can be None.

        optimization_args is passed through to the method 'fit' of this class.
        Example of optimization_args :
                                     {'method' : 'fmin_l_bfgs_b',
                                      'maxiter' : maxiter,
                                      'm':lbfgs_rank}

        Returns the losses for all the stddevs. The variable 'best_q_mean_losses'.
        """

        validate_the_stddevs_argument(stddevs)

        # the walkback_vector_func is the function r(x) that
        # we have from this DAE
        walkback_vector_func = lambda X: self.encode_decode(X)

        progress_logger = make_progress_logger("Training")

        best_q_mean_losses = dict([(key, []) for key in stddevs.keys()])
        # Summary :
        #     Everything that follows is just a way to mutate the value of 'best_q'.
        #     That 'best_q' variable contains the learned parameters.
        #     We log various things based on the current value of 'best_q' and
        #     the datasets (X, X_valid).
        #     At the end of the day, we're left with 'best_q' and stuff logged
        #     in 'best_q_mean_losses' to make an informed decision about the
        #     usefulness of the model learned.

        M = len(stddevs["train"])
        for m in range(0, M):

            e = stddevs["train"][m]
            (noisy_X, importance_sampling_weights) = get_noisy_X_and_importance_weights(X, e, walkback_vector_func)

            (best_q, train_U_best_q) = self.fit(X, noisy_X, importance_sampling_weights, optimization_args)

            train_mean_U_best_q = train_U_best_q / X.shape[0]
            best_q_mean_losses["train"].append(train_mean_U_best_q)
            sys.stdout.write("        train mean loss is %f\n" % (train_mean_U_best_q,))

            if X_valid is not None:

                for key in stddevs.keys():
                    if key == "train":
                        continue

                    e = stddevs[key][m]
                    if e["sampled"] is None:
                        best_q_mean_losses[key].append(None)
                        continue

                    (noisy_X_valid, importance_sampling_weights) = get_noisy_X_and_importance_weights(
                        X_valid, e, walkback_vector_func
                    )

                    some_valid_U_best_q = self.q_loss(best_q, X_valid, noisy_X_valid, importance_sampling_weights).sum()

                    # Notice : Despite the importance_sampling_weights being used,
                    # I think that we are still doing the right thing by normalizing by
                    # X_valid.shape[0]. I was a bit afraid that we'd be throwing off everything
                    # by using these coefficients, but now I think that we won't find ourselves
                    # in a situation where the validation loss will be useless because of the
                    # wild importance sampling weights.

                    some_valid_mean_U_best_q = some_valid_U_best_q / X_valid.shape[0]
                    best_q_mean_losses[key].append(some_valid_mean_U_best_q)
                    sys.stdout.write("        %s mean loss is %f\n" % (str(key), some_valid_mean_U_best_q))

                    progress_logger(1.0 * (m + 1) / M)

        return best_q_mean_losses
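
For context, here is a hypothetical call sketch for fit_with_stddevs_sequence. The DAE class name, its constructor and the data below are placeholders; the stddevs dict follows the structure described in the docstring.

import numpy as np

X = np.random.normal(size=(500, 2))        # placeholder training data
X_valid = np.random.normal(size=(100, 2))  # placeholder validation data

stddevs = {'train': [{'target': 1.0, 'sampled': 4.0},
                     {'target': 0.8, 'sampled': 3.0}],
           'valid': [{'target': 1.0, 'sampled': 4.0},
                     {'target': 0.8, 'sampled': 3.0}]}

optimization_args = {'method': 'fmin_l_bfgs_b', 'maxiter': 500, 'm': 10}

dae = DAE()  # hypothetical constructor of the class that defines this method
best_q_mean_losses = dae.fit_with_stddevs_sequence(X, X_valid, stddevs, optimization_args)
# best_q_mean_losses['train'] and best_q_mean_losses['valid'] each hold one mean loss per stddev entry.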
Example 5
    def fit_with_decreasing_noise(
        self,
        X,
        list_of_train_stddev,
        optimization_args,
        early_termination_args={},
        X_valid=None,
        list_of_additional_valid_stddev=None,
    ):
        """
        The 'optimization_args' filters through to the 'fit' function almost unchanged.

        There is the option of adding a special provision
        for its 'maxiter' entry when we get a list.
        In such a situation, we use one value of maxiter
        from the list for each value of list_of_train_stddev.

        The 'early_termination_args' is optional. It provides a way to
        stop the training if we determine that we started in a state
        that was irredeemable and would only lead to a bad local minimum.
        We can keep in mind the r(x) = x solution as a benchmark and
        observe that, with r(x) = x we would have a loss function that
        roughly equals
            d * train_stddev**2, where d is the dimension of the data.

        The 'early_termination_args' dict has one key for now.
            early_termination_args['stop_if_loss_greater_than'] = [...]
                or
            early_termination_args['stop_if_loss_greater_than'] = "auto"

        If X_valid is not None, we will also return the values of the
        objective function evaluated with those validation samples.
        Those values will be the ones according to which we will
        decide whether or not to stop the descent with the train_stddev values.
        """

        # If we were passed the argument "auto", we have to replace the
        # value with an array of corresponding values.
        if (
            "stop_if_loss_greater_than" in early_termination_args
            and type(early_termination_args["stop_if_loss_greater_than"]) == str
        ):
            if early_termination_args["stop_if_loss_greater_than"] == "auto":
                early_termination_args["stop_if_loss_greater_than"] = [
                    X.shape[1] * train_stddev ** 2 for train_stddev in list_of_train_stddev
                ]
                print("early termination with losses : ")
                print(early_termination_args["stop_if_loss_greater_than"])
            else:
                print("Wrong value for early_termination_args. Only valid string is 'auto'.")
                print("Exiting.")
                quit()

        # at some point we might want to decide to
        # record all the best_q for the sequence
        seq_train_mean_best_U_q = []
        seq_valid_mean_best_U_q = []
        i = 0
        progress_logger = make_progress_logger("Training")

        for train_stddev in list_of_train_stddev:

            sys.stdout.write("    Using train_stddev %f, " % train_stddev)
            (noisy_X, importance_sampling_weights) = isotropic_gaussian_noise_and_importance_sampling_weights(
                X, 4.0 * train_stddev, train_stddev
            )
            # noisy_X = X + np.random.normal(size = X.shape, scale = train_stddev)

            if optimization_args.has_key("maxiter") and type(optimization_args["maxiter"]) in [list, np.array]:
                assert len(optimization_args["maxiter"]) == len(list_of_train_stddev)
                optimization_args0 = conj(optimization_args, "maxiter", optimization_args["maxiter"][i])
            else:
                optimization_args0 = optimization_args
            (best_q, train_U_best_q_) = self.fit(X, noisy_X, optimization_args0)
            # (best_q, train_U_best_q_) = self.fit(X, noisy_X, optimization_args)

            train_U_best_q = self.q_loss(best_q, X, noisy_X, importance_sampling_weights).sum()
            # sanity check to make sure that we're evaluating this right
            assert abs(train_U_best_q - train_U_best_q_) < 1e-8

            train_mean_U_best_q = train_U_best_q / X.shape[0]
            seq_train_mean_best_U_q.append(train_mean_U_best_q)
            sys.stdout.write("train mean loss is %f, " % (train_mean_U_best_q,))

            if X_valid is not None:
                (noisy_X_valid, importance_sampling_weights) = isotropic_gaussian_noise_and_importance_sampling_weights(
                    X_valid, 4.0 * train_stddev, train_stddev
                )

                # noisy_X_valid = X_valid + np.random.normal(size = X_valid.shape, scale = train_stddev)
                valid_U_best_q = self.q_loss(best_q, X_valid, noisy_X_valid, importance_sampling_weights).sum()
                valid_mean_U_best_q = valid_U_best_q / X_valid.shape[0]
                seq_valid_mean_best_U_q.append(valid_mean_U_best_q)
                sys.stdout.write("valid mean loss is %f." % (valid_mean_U_best_q,))

                # if we're dealing with a validation set, it will be the one used
                # to determine the stopping point
                if (
                    "stop_if_loss_greater_than" in early_termination_args
                    and early_termination_args["stop_if_loss_greater_than"][i] < valid_mean_U_best_q
                ):
                    break
            else:
                # if we don't have a validation set, then we'll use train_mean_U_best_q
                # for the termination condition

                if (
                    "stop_if_loss_greater_than" in early_termination_args
                    and early_termination_args["stop_if_loss_greater_than"][i] < train_mean_U_best_q
                ):
                    break

            print ""
            progress_logger(1.0 * i / len(list_of_train_stddev))
            i += 1
        # end for

        # might as well pad the rest of the list to
        # signify that we terminated early
        while len(seq_train_mean_best_U_q) < len(list_of_train_stddev):
            seq_train_mean_best_U_q.append(np.nan)
        while len(seq_valid_mean_best_U_q) < len(list_of_train_stddev):
            seq_valid_mean_best_U_q.append(np.nan)

        # Now we want to recompute the model losses for all the values of
        # the train_stddev, but using the final parameters best_q.
        # This will be used as an additional quality evaluation to determine
        # how the DAE treats data that's relatively far from the manifold
        # once it's done training.
        # It might be even more informative than the validation losses.

        seq_valid_mean_U_final_best_q = None
        seq_alt_valid_mean_U_final_best_q = None
        if X_valid is not None:
            nreps = 10
            # This thing doesn't work with the list comprehension. You need to generate the data every time.
            (noisy_X_valid, importance_sampling_weights) = isotropic_gaussian_noise_and_importance_sampling_weights(
                X_valid, 4.0 * train_stddev, train_stddev
            )
            seq_valid_mean_U_final_best_q = [
                np.array(
                    [
                        self.q_loss(best_q, X_valid, noisy_X_valid, importance_sampling_weights).sum()
                        / X_valid.shape[0]
                        for _ in range(nreps)
                    ]
                ).mean()
                for train_stddev in list_of_train_stddev
            ]

            if (list_of_additional_valid_stddev is not None) and len(list_of_additional_valid_stddev) > 0:
                # TODO : use some kind of tool to generate the importance_sampling_weights
                seq_alt_valid_mean_U_final_best_q = [
                    np.array(
                        [
                            self.q_loss(
                                best_q, X_valid, X_valid + np.random.normal(size=X_valid.shape, scale=alt_valid_stddev)
                            ).sum()
                            / X_valid.shape[0]
                            for _ in range(nreps)
                        ]
                    ).mean()
                    for alt_valid_stddev in list_of_additional_valid_stddev
                ]
        # end if

        return (
            seq_train_mean_best_U_q,
            seq_valid_mean_best_U_q,
            seq_valid_mean_U_final_best_q,
            seq_alt_valid_mean_U_final_best_q,
        )
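
For context, here is a hypothetical call sketch for fit_with_decreasing_noise. The DAE class name and the data are placeholders; the 'auto' setting and the per-stddev maxiter list exercise the provisions described in the docstring.

import numpy as np

X = np.random.normal(size=(500, 2))        # placeholder training data
X_valid = np.random.normal(size=(100, 2))  # placeholder validation data
list_of_train_stddev = [1.0, 0.5, 0.25, 0.1]

dae = DAE()  # hypothetical constructor of the class that defines this method
(seq_train, seq_valid,
 seq_valid_final, seq_alt_valid_final) = dae.fit_with_decreasing_noise(
    X, list_of_train_stddev,
    optimization_args={'method': 'fmin_l_bfgs_b',
                       'maxiter': [50, 50, 100, 200]},             # one maxiter per train_stddev
    early_termination_args={'stop_if_loss_greater_than': 'auto'},  # expands to d * train_stddev**2
    X_valid=X_valid)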
Example 6
def sample_chain(x0,
                 N,
                 energy_difference,
                 noise_levels,
                 r,
                 r_prime,
                 thinning_factor=1,
                 burn_in=0,
                 accept_all_proposals=False,
                 proposal_noise_scheme='merge_x',
                 omit_asymmetric_proposal_factor=False):
    """
    Will sample N values for the chain starting with x0.

    noise_levels is a dict with keys 
    ["train_stddev"], ["train_stddev", "langevin_beta"] or ["train_stddev", "langevin_stddev"]

    """

    print(proposal_noise_scheme)

    assert len(x0.shape) == 1, "Wrong dimension for x0."

    assert thinning_factor >= 1, "You misunderstood the thinning_factor. It should be 1 for no thinning, and 32 if we want one out of every 32 samples."

    train_stddev = noise_levels["train_stddev"]
    langevin_stddev = noise_levels["langevin_stddev"]
    langevin_beta = noise_levels["langevin_beta"]
    temperature = noise_levels["temperature"]

    def langevin_proposal(current_x, preimage_current_x):

        # We are using the term "preimage" here because it corresponds
        # to the preimage when langevin_beta=1.0.
        # Otherwise, it should be called the "noisy_ancestor" or something
        # like that to reflect the fact that it's more about
        #
        # x_{\textrm{noisy}}^{(t)}&=&x^{(t)}+\epsilon\hspace{1em}for\hspace{1em}\epsilon\sim\mathcal{N}(0,\sigma^{2})
        # x^{*}&=&\left(1-\beta\right)x_{\textrm{noisy}}^{(t)}+\beta r^{*}(x_{\textrm{noisy}}^{(t)})
        #
        # than about being the preimage. Latex the stuff above to read it properly.

        # This function accesses the variables from the "closure" : accept_all_proposals, proposal_noise_scheme

        d = current_x.shape[0]

        if proposal_noise_scheme == 'merge_x':
            preimage_proposed_x = current_x + np.random.normal(
                size=(d, ), scale=langevin_stddev)
            proposed_x = (
                1 - langevin_beta
            ) * preimage_proposed_x + langevin_beta * r(preimage_proposed_x)
        elif proposal_noise_scheme == 'noise_E':
            preimage_proposed_x = current_x + np.random.normal(
                size=(d, ), scale=langevin_stddev)
            proposed_x = current_x - langevin_beta * preimage_proposed_x + langevin_beta * r(
                preimage_proposed_x)
        elif proposal_noise_scheme == 'noise_r':
            preimage_proposed_x = current_x + np.random.normal(
                size=(d, ), scale=langevin_stddev)
            proposed_x = (1 - langevin_beta) * current_x + langevin_beta * r(
                preimage_proposed_x)
        else:
            raise ("Unrecognized proposal_noise_scheme : %s" %
                   proposal_noise_scheme)

        if accept_all_proposals or omit_asymmetric_proposal_factor:
            asymmetric_correction_log_factor = 0.0
        else:
            # Now we need to compute
            # log q( current_x | proposed_x ) - log q( proposed_x | current_x )

            A = np.zeros((2, ))
            B = np.zeros((2, ))

            A[0] = -0.5 / langevin_stddev**2 * (
                (preimage_current_x - proposed_x)**2).sum()
            B[0] = -0.5 / langevin_stddev**2 * (
                (preimage_proposed_x - current_x)**2).sum()
            if proposal_noise_scheme == 'merge_x':
                A[1] = -1 * np.log(
                    np.linalg.det((1 - langevin_beta) * np.eye(d) +
                                  langevin_beta * r_prime(preimage_current_x)))
                B[1] = -1 * np.log(
                    np.linalg.det(
                        (1 - langevin_beta) * np.eye(d) +
                        langevin_beta * r_prime(preimage_proposed_x)))
            elif proposal_noise_scheme == 'noise_E':
                # clueless
                A[1] = -1 * np.log(
                    np.linalg.det((-langevin_beta) * np.eye(d) +
                                  langevin_beta * r_prime(preimage_current_x)))
                B[1] = -1 * np.log(
                    np.linalg.det(
                        (-langevin_beta) * np.eye(d) +
                        langevin_beta * r_prime(preimage_proposed_x)))
                #pass
            elif proposal_noise_scheme == 'noise_r':
                # clueless
                A[1] = -1 * np.log(
                    np.linalg.det(langevin_beta * r_prime(preimage_current_x)))
                B[1] = -1 * np.log(
                    np.linalg.det(
                        langevin_beta * r_prime(preimage_proposed_x)))
                #pass
            else:
                raise ("Unrecognized proposal_noise_scheme : %s" %
                       proposal_noise_scheme)

            asymmetric_correction_log_factor = A[0] + A[1] - B[0] - B[1]

        return (proposed_x, preimage_proposed_x,
                asymmetric_correction_log_factor)

    def iterate_N_times(current_x, preimage_current_x, energy_difference, N):
        for _ in np.arange(N):
            (proposed_x, preimage_proposed_x,
             asymmetric_correction_log_factor) = langevin_proposal(
                 current_x, preimage_current_x)

            if accept_all_proposals:
                loga = 0.0
            else:
                # This is a - in front of the energy difference because
                # log( p(proposed_x) / p(current_x) ) \approx -E(proposed_x) - -E(current_x) = - energy_difference(proposed_x, current_x)
                loga = -energy_difference(
                    proposed_x,
                    current_x) / temperature + asymmetric_correction_log_factor
                # loga = - energy_difference(proposed_x, current_x) + asymmetric_correction_log_factor

            if accept_all_proposals or loga >= 0.0 or loga >= np.log(
                    np.random.uniform(0, 1)):
                # accepted !
                current_x = proposed_x
                preimage_current_x = preimage_proposed_x
                iterate_N_times.accepted_counter += 1
            else:
                iterate_N_times.rejected_counter += 1

        return (current_x, preimage_current_x)

    iterate_N_times.accepted_counter = 0
    iterate_N_times.rejected_counter = 0

    # Start with the burn-in iterations.
    current_x = x0
    # not quite the actual pre-image, but it's just for initialization purposes
    preimage_current_x = current_x
    (current_x,
     preimage_current_x) = iterate_N_times(current_x, preimage_current_x,
                                           energy_difference, burn_in)

    # Then we can think about collecting samples.
    samples_list = []
    # Start from the 'current_x' from the burn_in
    # and not from x0. Reset the acceptance counters.
    iterate_N_times.accepted_counter = 0
    iterate_N_times.rejected_counter = 0

    progress_logger = make_progress_logger("Sampling")

    for n in np.arange(0, N):
        (current_x,
         preimage_current_x) = iterate_N_times(current_x, preimage_current_x,
                                               energy_difference,
                                               thinning_factor)
        # collect sample after running through the thinning iterations
        samples_list.append(current_x)
        progress_logger(1.0 * n / N)

    samples = np.vstack(samples_list)
    acceptance_ratio = iterate_N_times.accepted_counter * 1.0 / (
        iterate_N_times.accepted_counter + iterate_N_times.rejected_counter)

    return (samples, acceptance_ratio, noise_levels)
Example 7
def sample_chain(x0,
                 N,
                 energy_difference,
                 noise_levels,
                 r,
                 r_prime,
                 f_prime,
                 thinning_factor=1,
                 burn_in=0,
                 accept_all_proposals=False,
                 proposal_noise_scheme='merge_x',
                 omit_asymmetric_proposal_factor=False):
    """
        f        g
    X -----> H -----> X

    dim(X) = m
    dim(H) = n
    r = g ∘ f   (i.e. r(x) = g(f(x)))

    In this implementation, we use the following shapes for the arguments.
    r       : R^m -> R^m
    r_prime : R^m -> R^{m x m}   (Jacobian of r)
    f_prime : R^m -> R^{n x m}   (Jacobian of f)
    
    energy_difference : (R^m, R^m) -> R
                        proposed_x, current_x   |->   E(proposed_x) - E(current_x)
                        (so -energy_difference approximates log(p(proposed_x)) - log(p(current_x)))

    noise_levels is a dict with keys 
    ["train_stddev"], ["train_stddev", "langevin_beta"] or ["train_stddev", "langevin_stddev"]
    """

    assert len(x0.shape) == 1, "Wrong dimension for x0."
    assert f_prime

    train_stddev = noise_levels["train_stddev"]
    langevin_stddev = noise_levels["langevin_stddev"]
    langevin_beta = noise_levels["langevin_beta"]
    temperature = noise_levels["temperature"]

    # TODO : use an equivalent to the 'proposal_noise_scheme'
    assert proposal_noise_scheme == "merge_x"

    def proposal(current_x, preimage_current_x):

        want_renormalization_of_J = False

        d = current_x.shape[0]

        if want_renormalization_of_J:
            M = f_prime(current_x)
            J = M / np.linalg.norm(M, 2) * langevin_stddev
            del M
        else:
            J = f_prime(current_x) * langevin_stddev

        det_JTJ = np.linalg.det(J.T.dot(J))
        z = np.random.normal(size=J.shape[0])
        preimage_proposed_x = current_x + J.T.dot(z)
        proposed_x = (1 -
                      langevin_beta) * preimage_proposed_x + langevin_beta * r(
                          preimage_proposed_x)

        if omit_asymmetric_proposal_factor:
            asymmetric_correction_log_factor = 0.0
        else:

            if want_renormalization_of_J:
                M = f_prime(proposed_x)
                proposed_J = M / np.linalg.norm(M, 2) * langevin_stddev
                del M
            else:
                proposed_J = f_prime(proposed_x) * langevin_stddev

            det_proposed_JTJ = np.linalg.det(proposed_J.T.dot(proposed_J))

            # Bear in mind that the covariance of the mvn stemming from current_x
            # will be J^T J and not just J.
            assert J.shape[1] == d
            assert proposed_J.shape[1] == d

            #print "======================"
            #print J.T.dot( J )
            #print proposed_J.T.dot( proposed_J )
            #print "======================"

            # We will essentially bypass the SVD decomposition by
            # using J^T J instead of V^T D^2 V from the SVD.
            # The two quantities are equivalent.
            # It would still be nice, in a way, to have access to the eigenvalues
            # in order to have more control (truncating ?) and be able to log them
            # as some kind of sanity check (to check Yoshua's fast decay intuition).

            # Now we need to compute
            # log q( current_x | proposed_x ) - log q( proposed_x | current_x )

            A = np.zeros((2, ))
            v = (preimage_current_x - proposed_x)
            A[0] = -0.5 * d * np.log(
                2 * np.pi) - 0.5 * np.log(det_proposed_JTJ) - 0.5 * v.dot(
                    np.linalg.inv(proposed_J.T.dot(proposed_J))).dot(v)
            A[1] = -1 * np.log(
                np.linalg.det((1 - langevin_beta) * np.eye(d) +
                              langevin_beta * r_prime(preimage_current_x)))

            B = np.zeros((2, ))
            v = (preimage_proposed_x - current_x)
            B[0] = -0.5 * d * np.log(2 * np.pi) - 0.5 * np.log(
                det_JTJ) - 0.5 * v.dot(np.linalg.inv(J.T.dot(J))).dot(v)
            B[1] = -1 * np.log(
                np.linalg.det((1 - langevin_beta) * np.eye(d) +
                              langevin_beta * r_prime(preimage_proposed_x)))

            asymmetric_correction_log_factor = A[0] + A[1] - B[0] - B[1]
        # end if omit_asymmetric_proposal_factor

        return (proposed_x, preimage_proposed_x,
                asymmetric_correction_log_factor)

    # end of proposal function

    def iterate_N_times(current_x, preimage_current_x, energy_difference, N):
        for _ in np.arange(N):
            (proposed_x, preimage_proposed_x,
             asymmetric_correction_log_factor) = proposal(
                 current_x, preimage_current_x)

            # This is a - in front of the energy difference because
            # log( p(proposed_x) / p(current_x) ) \approx -E(proposed_x) - -E(current_x) = - energy_difference(proposed_x, current_x)
            # loga = - energy_difference(proposed_x, current_x) + asymmetric_correction_log_factor
            loga = -energy_difference(
                proposed_x,
                current_x) / temperature + asymmetric_correction_log_factor
            if accept_all_proposals or loga >= 0 or loga >= np.log(
                    np.random.uniform(0, 1)):
                # accepted !
                current_x = proposed_x
                preimage_current_x = preimage_proposed_x
                iterate_N_times.accepted_counter += 1

                # DEBUG
                #print "Accepted transition with loga = %0.2f" % loga
                #print proposed_x
            else:
                iterate_N_times.rejected_counter += 1

                # DEBUG
                #print "Rejected transition with loga = %0.2f" % loga
                #print proposed_x

        return (current_x, preimage_current_x)

    iterate_N_times.accepted_counter = 0
    iterate_N_times.rejected_counter = 0

    # Start with the burn-in iterations.
    current_x = x0
    # not quite the actual pre-image, but it's just for initialization purposes
    preimage_current_x = current_x
    (current_x,
     preimage_current_x) = iterate_N_times(current_x, preimage_current_x,
                                           energy_difference, burn_in)

    # Then we can think about collecting samples.
    samples_list = []
    # Start from the 'current_x' from the burn_in
    # and not from x0. Reset the acceptance counters.
    iterate_N_times.accepted_counter = 0
    iterate_N_times.rejected_counter = 0

    progress_logger = make_progress_logger("Sampling")

    for n in np.arange(0, N):
        (current_x,
         preimage_current_x) = iterate_N_times(current_x, preimage_current_x,
                                               energy_difference,
                                               thinning_factor)
        # collect sample after running through the thinning iterations
        samples_list.append(current_x)
        progress_logger(1.0 * n / N)

    samples = np.vstack(samples_list)
    acceptance_ratio = iterate_N_times.accepted_counter * 1.0 / (
        iterate_N_times.accepted_counter + iterate_N_times.rejected_counter)

    return (samples, acceptance_ratio, noise_levels)
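
For reference, the A[0] and B[0] terms above are the log-density of the multivariate Normal proposal centred at the current point with covariance J^T J, where J = langevin_stddev * f_prime(x). This is only a sketch of the quantity being computed, in the same notation as the code:

    \log q(x' \mid x) \;=\; -\tfrac{d}{2}\log(2\pi) \;-\; \tfrac{1}{2}\log\det\!\big(J^{\top}J\big) \;-\; \tfrac{1}{2}\,(x'-x)^{\top}\big(J^{\top}J\big)^{-1}(x'-x)

In the code, v = preimage_proposed_x - current_x (respectively preimage_current_x - proposed_x) plays the role of x' - x, and the A[1] / B[1] terms add the log-determinant of the Jacobian of the map that takes a pre-image to its proposed point.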
Example 8
def sample_chain(x0, N,
                 energy_difference, noise_levels,
                 r, r_prime, f_prime,
                 thinning_factor = 1, burn_in = 0,
                 accept_all_proposals = False,
                 proposal_noise_scheme = 'merge_x',
                 omit_asymmetric_proposal_factor = False):
    """
        f        g
    X -----> H -----> X

    dim(X) = m
    dim(H) = n
    r = g ∘ f   (i.e. r(x) = g(f(x)))

    In this implementation, we use the following shapes for the arguments.
    r       : R^m -> R^m
    r_prime : R^m -> R^{m x m}   (Jacobian of r)
    f_prime : R^m -> R^{n x m}   (Jacobian of f)
    
    energy_difference : (R^m, R^m) -> R
                        proposed_x, current_x   |->   E(proposed_x) - E(current_x)
                        (so -energy_difference approximates log(p(proposed_x)) - log(p(current_x)))

    noise_levels is a dict with keys 
    ["train_stddev"], ["train_stddev", "langevin_beta"] or ["train_stddev", "langevin_stddev"]
    """

    assert len(x0.shape) == 1, "Wrong dimension for x0."
    assert f_prime

    train_stddev    = noise_levels["train_stddev"]
    langevin_stddev = noise_levels["langevin_stddev"]
    langevin_beta   = noise_levels["langevin_beta"]
    temperature     = noise_levels["temperature"]

    # TODO : use an equivalent to the 'proposal_noise_scheme'
    assert proposal_noise_scheme == "merge_x"

    def proposal(current_x, preimage_current_x):

        want_renormalization_of_J = False

        d = current_x.shape[0]

        if want_renormalization_of_J:
            M = f_prime(current_x)
            J = M / np.linalg.norm(M,2) * langevin_stddev
            del M
        else:
            J = f_prime(current_x) * langevin_stddev

        det_JTJ = np.linalg.det(J.T.dot(J))
        z = np.random.normal(size=J.shape[0])
        preimage_proposed_x = current_x + J.T.dot(z)
        proposed_x = (1-langevin_beta) * preimage_proposed_x + langevin_beta * r(preimage_proposed_x)

        if omit_asymmetric_proposal_factor:
            asymmetric_correction_log_factor = 0.0
        else:

            if want_renormalization_of_J:
                M = f_prime(proposed_x)
                proposed_J = M / np.linalg.norm(M,2) * langevin_stddev
                del M
            else:
                proposed_J = f_prime(proposed_x) * langevin_stddev

            det_proposed_JTJ = np.linalg.det(proposed_J.T.dot(proposed_J))

            # Bear in mind that the covariance of the mvn stemming from current_x
            # will be J^T J and not just J.
            assert J.shape[1] == d
            assert proposed_J.shape[1] == d

            #print "======================"
            #print J.T.dot( J )
            #print proposed_J.T.dot( proposed_J )
            #print "======================"

            # We will essentially bypass the SVD decomposition by
            # using J^T J instead of V^T D^2 V from the SVD.
            # The two quantities are equivalent.
            # It would still be nice, in a way, to have access to the eigenvalues
            # in order to have more control (truncating ?) and be able to log them
            # as some kind of sanity check (to check Yoshua's fast decay intuition).

            # Now we need to compute
            # log q( current_x | proposed_x ) - log q( proposed_x | current_x )

            A = np.zeros((2,))
            v = (preimage_current_x - proposed_x)
            A[0] = - 0.5 * d * np.log(2 * np.pi) - 0.5 * np.log(det_proposed_JTJ) - 0.5 * v.dot(np.linalg.inv(proposed_J.T.dot(proposed_J))).dot(v)
            A[1] = -1 * np.log( np.linalg.det( (1-langevin_beta) * np.eye(d) +  langevin_beta * r_prime(preimage_current_x)) )

            B = np.zeros((2,))
            v = (preimage_proposed_x - current_x)
            B[0] = - 0.5 * d * np.log(2 * np.pi) - 0.5 * np.log(det_JTJ) - 0.5 * v.dot(np.linalg.inv(J.T.dot(J))).dot(v)
            B[1] = -1 * np.log( np.linalg.det( (1-langevin_beta) * np.eye(d) +  langevin_beta * r_prime(preimage_proposed_x)) )

            asymmetric_correction_log_factor = A[0] + A[1] - B[0] - B[1]
        # end if omit_asymmetric_proposal_factor

        return (proposed_x, preimage_proposed_x, asymmetric_correction_log_factor)


    # end of proposal function

    def iterate_N_times(current_x, preimage_current_x, energy_difference, N):
        for _ in np.arange(N):
            (proposed_x, preimage_proposed_x, asymmetric_correction_log_factor) = proposal(current_x, preimage_current_x)

            # This is a - in front of the energy difference because
            # log( p(proposed_x) / p(current_x) ) \approx -E(proposed_x) - -E(current_x) = - energy_difference(proposed_x, current_x)
            # loga = - energy_difference(proposed_x, current_x) + asymmetric_correction_log_factor
            loga = - energy_difference(proposed_x, current_x) / temperature + asymmetric_correction_log_factor
            if accept_all_proposals or loga >= 0 or loga >= np.log(np.random.uniform(0,1)):
                # accepted !
                current_x = proposed_x
                preimage_current_x = preimage_proposed_x
                iterate_N_times.accepted_counter += 1

                # DEBUG
                #print "Accepted transition with loga = %0.2f" % loga
                #print proposed_x
            else:
                iterate_N_times.rejected_counter += 1

                # DEBUG
                #print "Rejected transition with loga = %0.2f" % loga
                #print proposed_x


        return (current_x, preimage_current_x)

    iterate_N_times.accepted_counter = 0
    iterate_N_times.rejected_counter = 0


    # Start with the burn-in iterations.
    current_x = x0
    # not quite the actual pre-image, but it's just for initialization purposes
    preimage_current_x = current_x
    (current_x, preimage_current_x) = iterate_N_times(current_x, preimage_current_x, energy_difference, burn_in)

    # Then we can think about collecting samples.
    samples_list = []
    # Start from the 'current_x' from the burn_in
    # and not from x0. Reset the acceptance counters.
    iterate_N_times.accepted_counter = 0
    iterate_N_times.rejected_counter = 0


    progress_logger = make_progress_logger("Sampling")

    for n in np.arange(0,N):
        (current_x, preimage_current_x) = iterate_N_times(current_x, preimage_current_x, energy_difference, thinning_factor)
        # collect sample after running through the thinning iterations
        samples_list.append(current_x)
        progress_logger(1.0*n/N)

    samples = np.vstack(samples_list)
    acceptance_ratio = iterate_N_times.accepted_counter * 1.0 / (iterate_N_times.accepted_counter + iterate_N_times.rejected_counter)

    return (samples, acceptance_ratio, noise_levels)
Example 9
    def fit_with_decreasing_noise(self,
                                  X,
                                  list_of_train_stddev,
                                  optimization_args,
                                  early_termination_args={},
                                  X_valid=None,
                                  list_of_additional_valid_stddev=None):
        """
        The 'optimization_args' filters through to the 'fit' function almost unchanged.

        There is the option of adding a special provision
        for its 'maxiter' entry when we get a list.
        In such a situation, we use one value of maxiter
        from the list for each value of list_of_train_stddev.

        The 'early_termination_args' is optional. It provides a way to
        stop the training if we determine that we started in a state
        that was irredeemable and would only lead to a bad local minimum.
        We can keep in mind the r(x) = x solution as a benchmark and
        observe that, with r(x) = x we would have a loss function that
        roughly equals
            d * train_stddev**2, where d is the dimension of the data.

        The 'early_termination_args' dict has one key for now.
            early_termination_args['stop_if_loss_greater_than'] = [...]
                or
            early_termination_args['stop_if_loss_greater_than'] = "auto"

        If X_valid is not None, we will also return the values of the
        objective function evaluated with those validation samples.
        Those values will be the ones according to which we will
        decide whether or not to stop the descent with the train_stddev values.
        """

        # If we were passed the argument "auto", we have to replace the
        # value with an array of corresponding values.
        if ('stop_if_loss_greater_than' in early_termination_args
                and type(early_termination_args['stop_if_loss_greater_than'])
                == str):
            if early_termination_args['stop_if_loss_greater_than'] == "auto":
                early_termination_args['stop_if_loss_greater_than'] = [
                    X.shape[1] * train_stddev**2
                    for train_stddev in list_of_train_stddev
                ]
                print("early termination with losses : ")
                print(early_termination_args['stop_if_loss_greater_than'])
            else:
                print("Wrong value for early_termination_args. Only valid string is 'auto'.")
                print("Exiting.")
                quit()

        # at some point we might want to decide to
        # record all the best_q for the sequence
        seq_train_mean_best_U_q = []
        seq_valid_mean_best_U_q = []
        i = 0
        progress_logger = make_progress_logger("Training")

        for train_stddev in list_of_train_stddev:

            sys.stdout.write("    Using train_stddev %f, " % train_stddev)
            (noisy_X, importance_sampling_weights
             ) = isotropic_gaussian_noise_and_importance_sampling_weights(
                 X, 4.0 * train_stddev, train_stddev)
            #noisy_X = X + np.random.normal(size = X.shape, scale = train_stddev)

            if 'maxiter' in optimization_args and type(
                    optimization_args['maxiter']) in [list, np.ndarray]:
                assert len(
                    optimization_args['maxiter']) == len(list_of_train_stddev)
                optimization_args0 = conj(optimization_args, "maxiter",
                                          optimization_args['maxiter'][i])
            else:
                optimization_args0 = optimization_args
            (best_q, train_U_best_q_) = self.fit(X, noisy_X,
                                                 optimization_args0)
            #(best_q, train_U_best_q_) = self.fit(X, noisy_X, optimization_args)

            train_U_best_q = self.q_loss(best_q, X, noisy_X,
                                         importance_sampling_weights).sum()
            # sanity check to make sure that we're evaluating this right
            assert (abs(train_U_best_q - train_U_best_q_) < 1e-8)

            train_mean_U_best_q = train_U_best_q / X.shape[0]
            seq_train_mean_best_U_q.append(train_mean_U_best_q)
            sys.stdout.write("train mean loss is %f, " %
                             (train_mean_U_best_q, ))

            if X_valid is not None:
                (noisy_X_valid, importance_sampling_weights
                 ) = isotropic_gaussian_noise_and_importance_sampling_weights(
                     X_valid, 4.0 * train_stddev, train_stddev)

                #noisy_X_valid = X_valid + np.random.normal(size = X_valid.shape, scale = train_stddev)
                valid_U_best_q = self.q_loss(
                    best_q, X_valid, noisy_X_valid,
                    importance_sampling_weights).sum()
                valid_mean_U_best_q = valid_U_best_q / X_valid.shape[0]
                seq_valid_mean_best_U_q.append(valid_mean_U_best_q)
                sys.stdout.write("valid mean loss is %f." %
                                 (valid_mean_U_best_q, ))

                # if we're dealing with a validation set, it will be the one used
                # to determine the stopping point
                if ('stop_if_loss_greater_than' in early_termination_args
                        and
                        early_termination_args['stop_if_loss_greater_than'][i]
                        < valid_mean_U_best_q):
                    break
            else:
                # if we don't have a validation set, then we'll use train_mean_U_best_q
                # for the termination condition

                if ('stop_if_loss_greater_than' in early_termination_args
                        and
                        early_termination_args['stop_if_loss_greater_than'][i]
                        < train_mean_U_best_q):
                    break

            print ""
            progress_logger(1.0 * i / len(list_of_train_stddev))
            i += 1
        # end for

        # might as well pad the rest of the list to
        # signify that we terminated early
        while len(seq_train_mean_best_U_q) < len(list_of_train_stddev):
            seq_train_mean_best_U_q.append(np.nan)
        while len(seq_valid_mean_best_U_q) < len(list_of_train_stddev):
            seq_valid_mean_best_U_q.append(np.nan)

        # Now we want to recompute the model losses for all the values of
        # the train_stddev, but using the final parameters best_q.
        # This will be used as an additional quality evaluation to determine
        # how the DAE treats data that's relatively far from the manifold
        # once it's done training.
        # It might be even more informative than the validation losses.

        seq_valid_mean_U_final_best_q = None
        seq_alt_valid_mean_U_final_best_q = None
        if X_valid is not None:
            nreps = 10
            # This thing doesn't work with the list comprehension. You need to generate the data every time.
            (noisy_X_valid, importance_sampling_weights
             ) = isotropic_gaussian_noise_and_importance_sampling_weights(
                 X_valid, 4.0 * train_stddev, train_stddev)
            seq_valid_mean_U_final_best_q = [
                np.array([
                    self.q_loss(best_q, X_valid, noisy_X_valid,
                                importance_sampling_weights).sum() /
                    X_valid.shape[0] for _ in range(nreps)
                ]).mean() for train_stddev in list_of_train_stddev
            ]

            if (list_of_additional_valid_stddev
                    is not None) and len(list_of_additional_valid_stddev) > 0:
                # TODO : use some kind of tool to generate the importance_sampling_weights
                seq_alt_valid_mean_U_final_best_q = [
                    np.array([
                        self.q_loss(
                            best_q, X_valid, X_valid +
                            np.random.normal(size=X_valid.shape,
                                             scale=alt_valid_stddev)).sum() /
                        X_valid.shape[0] for _ in range(nreps)
                    ]).mean()
                    for alt_valid_stddev in list_of_additional_valid_stddev
                ]
        # end if

        return (seq_train_mean_best_U_q, seq_valid_mean_best_U_q,
                seq_valid_mean_U_final_best_q,
                seq_alt_valid_mean_U_final_best_q)
Example 10
    def fit_with_stddevs_sequence(self, X, X_valid, stddevs,
                                  optimization_args):
        """
        stddevs has fields 'train', 'valid' and any number of other variants on 'valid'.
        The special key is 'train', used for training.
        The validation errors are computed with all the other keys that contain
        information about the stddev. Obviously, we want to use one called 'valid', but
        we can also have different alternatives such as 'alt_valid' or 'valid2' with
        a different sequence of stddevs.

        stddevs is of the form {'train' : [{'target' : 1.0, 'sampled' : 4.0},
                                           {'target' : 0.8, 'sampled' : 3.0},
                                           ...
                                           ],
                                'valid' : [{'target' : 1.0, 'sampled' : 4.0},
                                           {'target' : 0.8, 'sampled' : 3.0},
                                           ...
                                           ],
                                ...
                                }

        X is an array of shape (n_train, d)
        X_valid is an array of shape (n_valid, d). It can be None.

        optimization_args is passed through to the method 'fit' of this class.
        Example of optimization_args :
                                     {'method' : 'fmin_l_bfgs_b',
                                      'maxiter' : maxiter,
                                      'm':lbfgs_rank}

        Returns the losses for all the stddevs. The variable 'best_q_mean_losses'.
        """

        validate_the_stddevs_argument(stddevs)

        # the walkback_vector_func is the function r(x) that
        # we have from this DAE
        walkback_vector_func = lambda X: self.encode_decode(X)

        progress_logger = make_progress_logger("Training")

        best_q_mean_losses = dict([(key, []) for key in stddevs.keys()])
        # Summary :
        #     Everything that follows is just a way to mutate the value of 'best_q'.
        #     That 'best_q' variable contains the learned parameters.
        #     We log various things based on the current value of 'best_q' and
        #     the datasets (X, X_valid).
        #     At the end of the day, we're left with 'best_q' and stuff logged
        #     in 'best_q_mean_losses' to make an informed decision about the
        #     usefulness of the model learned.

        M = len(stddevs['train'])
        for m in range(0, M):

            e = stddevs['train'][m]
            (noisy_X,
             importance_sampling_weights) = get_noisy_X_and_importance_weights(
                 X, e, walkback_vector_func)

            (best_q, train_U_best_q) = self.fit(X, noisy_X,
                                                importance_sampling_weights,
                                                optimization_args)

            train_mean_U_best_q = train_U_best_q / X.shape[0]
            best_q_mean_losses['train'].append(train_mean_U_best_q)
            sys.stdout.write("        train mean loss is %f\n" %
                             (train_mean_U_best_q, ))

            if X_valid is not None:

                for key in stddevs.keys():
                    if key == 'train':
                        continue

                    e = stddevs[key][m]
                    if e['sampled'] is None:
                        best_q_mean_losses[key].append(None)
                        continue

                    (noisy_X_valid, importance_sampling_weights
                     ) = get_noisy_X_and_importance_weights(
                         X_valid, e, walkback_vector_func)

                    some_valid_U_best_q = self.q_loss(
                        best_q, X_valid, noisy_X_valid,
                        importance_sampling_weights).sum()

                    # Notice : Despite the importance_sampling_weights being used,
                    # I think that we are still doing the right thing by normalizing by
                    # X_valid.shape[0]. I was a bit afraid that we'd be throwing off everything
                    # by using these coefficients, but now I think that we won't find ourselves
                    # in a situation where the validation loss will be useless because of the
                    # wild importance sampling weights.

                    some_valid_mean_U_best_q = some_valid_U_best_q / X_valid.shape[
                        0]
                    best_q_mean_losses[key].append(some_valid_mean_U_best_q)
                    sys.stdout.write("        %s mean loss is %f\n" % (
                        str(key),
                        some_valid_mean_U_best_q,
                    ))

                    progress_logger(1.0 * (m + 1) / M)

        return best_q_mean_losses