def main(argv):
    """
       n_train
       n_valid
       n_test
       d is the dimension of the samples. Should be higher than 2 and preferable 10 or more.
       mixing_prop controls how much of the vector v_t we mix in with the proposal for v_{t+1}
       leading_eigenvalue
       n_components
       output_dir is the directory in which we'll write the results
    """

    import getopt
    import cPickle

    try:
        opts, args = getopt.getopt(sys.argv[1:], "hv", ["d=", "n_train=", "n_valid=", "n_test=", "ratio_eigvals=", "n_components=", "output_dir="])
    except getopt.GetoptError as err:
        # print help information and exit:
        print str(err) # will print something like "option -a not recognized"
        usage()
        sys.exit(2)

    n_train = None
    n_valid = None
    n_test = None
    d = None
    ratio_eigvals = 1.0
    n_components = None
    output_dir = None

    verbose = False
    for o, a in opts:
        if o == "-v":
            verbose = True
        elif o in ("-h", "--help"):
            usage()
            sys.exit()
        elif o in ("--n_train"):
            n_train = int(a)
        elif o in ("--n_valid"):
            n_valid = int(a)
        elif o in ("--n_test"):
            n_test = int(a)
        elif o in ("--d"):
            d = int(a)
        elif o in ("--ratio_eigvals"):
            ratio_eigvals = float(a)
        elif o in ("--n_components"):
            n_components = int(a)
        elif o in ("--output_dir"):
            output_dir = a
        else:
            assert False, "unhandled option"
 
    assert n_train
    assert n_valid
    assert n_test
    assert d
    assert n_components
    assert output_dir

    start_time = time.time()

    (component_means, component_covariances, f_parameters) = sample_manifold_components(d, n_components, ratio_eigvals)
    assert component_means is not None
    assert component_covariances is not None

    # updated method
    mixturemvn = gaussian_mixture_tools.MixtureMVN(component_means, component_covariances)
    (samples, component_indices) = mixturemvn.sample(n_train + n_valid + n_test, want_indices=True)

    # deprecated method
    #(samples, component_indices) = gaussian_mixture_tools.sample_from_mixture(component_means, component_covariances, n_train + n_valid + n_test)
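    # MixtureMVN.sample is assumed to draw a component index for every sample
    # and then sample from that component's multivariate normal;
    # want_indices=True makes it return those indices along with the samples.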
    
    end_time = time.time()
    computational_cost_in_seconds = int(end_time - start_time)
    print "Sampling took %d seconds." % computational_cost_in_seconds

    if not os.path.exists(output_dir):
        print "Creating directory %s" % output_dir
        os.makedirs(output_dir)

    extra_props = {'component_means':component_means,
                   'component_covariances':component_covariances,
                   #'n_train':n_train,
                   #'n_test':n_test,
                   'd':d,
                   'ratio_eigvals':ratio_eigvals,
                   'n_components':n_components,
                   'f_parameters':f_parameters,
                   'computational_cost_in_seconds':computational_cost_in_seconds}
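
    # 'conj' is not defined in this file. From the calls below it is assumed
    # to behave like Clojure's conj on a dict: return a copy with one extra
    # key/value pair, leaving the original untouched. A minimal sketch under
    # that assumption:
    #
    #     def conj(d, kv):
    #         d2 = dict(d)
    #         d2[kv[0]] = kv[1]
    #         return d2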

    ### TRAIN ###

    train_samples = samples[0:n_train,:]
    train_component_indices = component_indices[0:n_train]
    train_samples_filename = os.path.join(output_dir, "train_samples.pkl")
    train_samples_extra_filename = os.path.join(output_dir, "train_samples_extra.pkl")

    cPickle.dump(train_samples, open(train_samples_filename, "wb"))
    cPickle.dump(conj(conj(extra_props,
                           ('n', n_train)),
                      ('component_indices', train_component_indices)),
                 open(train_samples_extra_filename, "wb"))
    print "wrote " + train_samples_filename
    print "wrote " + train_samples_extra_filename

    ### VALID ###

    valid_samples = samples[n_train:(n_train + n_valid),:]
    valid_component_indices = component_indices[n_train:(n_train + n_valid)]
    valid_samples_filename = os.path.join(output_dir, "valid_samples.pkl")
    valid_samples_extra_filename = os.path.join(output_dir, "valid_samples_extra.pkl")

    cPickle.dump(valid_samples, open(valid_samples_filename, "wb"))
    cPickle.dump(conj(conj(extra_props,
                           ('n', n_valid)),
                      ('component_indices', valid_component_indices)),
                 open(valid_samples_extra_filename, "wb"))
    print "wrote " + valid_samples_filename
    print "wrote " + valid_samples_extra_filename

    ### TEST ###

    test_samples = samples[(n_train + n_valid):(n_train + n_valid + n_test),:]
    test_component_indices = component_indices[(n_train + n_valid):(n_train + n_valid + n_test)]
    test_samples_filename = os.path.join(output_dir, "test_samples.pkl")
    test_samples_extra_filename = os.path.join(output_dir, "test_samples_extra.pkl")

    cPickle.dump(test_samples, open(test_samples_filename, "wb"))
    cPickle.dump(conj(conj(extra_props,
                           ('n', n_test)),
                      ('component_indices', test_component_indices)),
                 open(test_samples_extra_filename, "wb"))
    print "wrote " + test_samples_filename
    print "wrote " + test_samples_extra_filename


    for i in range(0,d-1):
        output_image_file = os.path.join(output_dir,"overview_dimensions_%d_and_%d.png" % (i,i+1))
        if samples.shape[0] > 500:
            plot_the_overview(samples[0:500,:], i, i+1, output_image_file)
        else:
            plot_the_overview(samples, i, i+1, output_image_file)
        print "wrote " + output_image_file


    for i in range(0,d-1):
        output_image_file = os.path.join(output_dir,"component_means_%d_and_%d.png" % (i,i+1))
        plot_the_overview(component_means, i, i+1, output_image_file)
        print "wrote " + output_image_file
Example 3
    def fit_with_decreasing_noise(
        self,
        X,
        list_of_train_stddev,
        optimization_args,
        early_termination_args=None,
        X_valid=None,
        list_of_additional_valid_stddev=None,
    ):
        """
        The 'optimization_args' filters through to the 'fit' function almost unchanged.

        There is the option of adding a a special provision
        for it's 'maxiter' entry when we get a list.
        In such a situation, we use one value of maxiter
        from the list for each value of list_of_train_stddev.

        The 'early_termination_args' is optional. It provides a way to
        stop the training if we determine that we started in a state
        that was irredeemable and would only lead to a bad local minimum.
        We can keep in mind the r(x) = x solution as a benchmark and
        observe that, with r(x) = x we would have a loss function that
        roughly equals
            d * train_stddev**2, where d is the dimension of the data.

        The 'early_termination_args' dict has one key for now.
            early_termination_args['stop_if_loss_greater_than'] = [...]
                or
            early_termination_args['stop_if_loss_greater_than'] = "auto"

        If X_valid is not None, we will also return the values of the
        objective function evaluated with those validation samples.
        Those values will be the onces according to which we will
        decide to stop or not the descent with the train_stddev values.
        """

        if early_termination_args is None:
            early_termination_args = {}

        # If we were passed the argument "auto", we have to replace the
        # value with an array of corresponding values.
        if (
            early_termination_args.has_key("stop_if_loss_greater_than")
            and type(early_termination_args["stop_if_loss_greater_than"]) == str
        ):
            if early_termination_args["stop_if_loss_greater_than"] == "auto":
                early_termination_args["stop_if_loss_greater_than"] = [
                    X.shape[1] * train_stddev ** 2 for train_stddev in list_of_train_stddev
                ]
                print "early termination with losses : "
                print early_termination_args["stop_if_loss_greater_than"]
            else:
                print "Wrong value for early_termination_args. Only valid string is 'auto'."
                print "Exiting."
                quit()

        # at some point we might want to decide to
        # record all the best_q for the sequence
        seq_train_mean_best_U_q = []
        seq_valid_mean_best_U_q = []
        i = 0
        progress_logger = make_progress_logger("Training")

        for train_stddev in list_of_train_stddev:

            sys.stdout.write("    Using train_stddev %f, " % train_stddev)
            (noisy_X, importance_sampling_weights) = isotropic_gaussian_noise_and_importance_sampling_weights(
                X, 4.0 * train_stddev, train_stddev
            )
            # noisy_X = X + np.random.normal(size = X.shape, scale = train_stddev)
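            # The helper above is assumed (it is not defined in this file) to
            # draw the noise from a wider proposal N(0, (4*train_stddev)^2)
            # and return importance weights p_target / p_proposal, so that the
            # weighted loss estimates the loss under N(0, train_stddev^2).
            # A minimal sketch under that assumption:
            #
            #     def isotropic_gaussian_noise_and_importance_sampling_weights(X, proposal_stddev, target_stddev):
            #         import scipy.stats
            #         E = np.random.normal(size=X.shape, scale=proposal_stddev)
            #         log_w = (scipy.stats.norm.logpdf(E, scale=target_stddev).sum(axis=1)
            #                  - scipy.stats.norm.logpdf(E, scale=proposal_stddev).sum(axis=1))
            #         return (X + E, np.exp(log_w))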

            if optimization_args.has_key("maxiter") and type(optimization_args["maxiter"]) in [list, np.array]:
                assert len(optimization_args["maxiter"]) == len(list_of_train_stddev)
                optimization_args0 = conj(optimization_args, "maxiter", optimization_args["maxiter"][i])
            else:
                optimization_args0 = optimization_args
            (best_q, train_U_best_q_) = self.fit(X, noisy_X, optimization_args0)
            # (best_q, train_U_best_q_) = self.fit(X, noisy_X, optimization_args)

            train_U_best_q = self.q_loss(best_q, X, noisy_X, importance_sampling_weights).sum()
            # sanity check to make sure that we're evaluating this right
            assert abs(train_U_best_q - train_U_best_q_) < 1e-8

            train_mean_U_best_q = train_U_best_q / X.shape[0]
            seq_train_mean_best_U_q.append(train_mean_U_best_q)
            sys.stdout.write("train mean loss is %f, " % (train_mean_U_best_q,))

            if X_valid is not None:
                (noisy_X_valid, importance_sampling_weights) = isotropic_gaussian_noise_and_importance_sampling_weights(
                    X_valid, 4.0 * train_stddev, train_stddev
                )

                # noisy_X_valid = X_valid + np.random.normal(size = X_valid.shape, scale = train_stddev)
                valid_U_best_q = self.q_loss(best_q, X_valid, noisy_X_valid, importance_sampling_weights).sum()
                valid_mean_U_best_q = valid_U_best_q / X_valid.shape[0]
                seq_valid_mean_best_U_q.append(valid_mean_U_best_q)
                sys.stdout.write("valid mean loss is %f." % (valid_mean_U_best_q,))

                # if we're dealing with a validation set, it will be the one used
                # to determine the stopping point
                if (
                    early_termination_args.has_key("stop_if_loss_greater_than")
                    and early_termination_args["stop_if_loss_greater_than"][i] < valid_mean_U_best_q
                ):
                    break
            else:
                # if we don't have a validation set, then we'll use
                # train_mean_U_best_q for the termination condition

                if (
                    early_termination_args.has_key("stop_if_loss_greater_than")
                    and early_termination_args["stop_if_loss_greater_than"][i] < train_mean_U_best_q
                ):
                    break

            print ""
            progress_logger(1.0 * i / len(list_of_train_stddev))
            i += 1
        # end for

        # might as well pad the rest of the list to
        # signify that we terminated early
        while len(seq_train_mean_best_U_q) < len(list_of_train_stddev):
            seq_train_mean_best_U_q.append(np.nan)
        while len(seq_valid_mean_best_U_q) < len(list_of_train_stddev):
            seq_valid_mean_best_U_q.append(np.nan)

        # Now we want to recompute the model losses for all the values of
        # the train_stddev, but using the final parameters best_q.
        # This will be used as an addition quality evaluation to determine
        # how the DAE treats data that's relatively far from the manifold
        # once it's done training.
        # It might be even more informative than the validation losses.

        seq_valid_mean_U_final_best_q = None
        seq_alt_valid_mean_U_final_best_q = None
        if X_valid is not None:
            nreps = 10

            # The noisy data has to be regenerated for every repetition and
            # for every stddev; reusing a single draw of noisy_X_valid would
            # make all nreps repetitions identical.
            def mean_valid_loss(stddev):
                (noisy_X_valid, importance_sampling_weights) = isotropic_gaussian_noise_and_importance_sampling_weights(
                    X_valid, 4.0 * stddev, stddev
                )
                return (
                    self.q_loss(best_q, X_valid, noisy_X_valid, importance_sampling_weights).sum()
                    / X_valid.shape[0]
                )

            seq_valid_mean_U_final_best_q = [
                np.array([mean_valid_loss(train_stddev) for _ in range(nreps)]).mean()
                for train_stddev in list_of_train_stddev
            ]

            if (list_of_additional_valid_stddev is not None) and len(list_of_additional_valid_stddev) > 0:
                # TODO : use some kind of tool to generate the importance_sampling_weights
                seq_alt_valid_mean_U_final_best_q = [
                    np.array(
                        [
                            self.q_loss(
                                best_q, X_valid, X_valid + np.random.normal(size=X_valid.shape, scale=alt_valid_stddev)
                            ).sum()
                            / X_valid.shape[0]
                            for _ in range(nreps)
                        ]
                    ).mean()
                    for alt_valid_stddev in list_of_additional_valid_stddev
                ]
        # end if

        return (
            seq_train_mean_best_U_q,
            seq_valid_mean_best_U_q,
            seq_valid_mean_U_final_best_q,
            seq_alt_valid_mean_U_final_best_q,
        )
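
# Hypothetical usage (class and argument values assumed, not from this file):
#
#     dae = DAE(...)                      # some object exposing this method
#     (train_losses, valid_losses,
#      final_valid_losses, alt_losses) = dae.fit_with_decreasing_noise(
#         X, [1.0, 0.5, 0.25, 0.1],
#         optimization_args={'maxiter': [50, 50, 100, 100]},
#         early_termination_args={'stop_if_loss_greater_than': "auto"},
#         X_valid=X_valid)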