def main(argv):
    """Sample a Gaussian-mixture dataset and write train/valid/test splits.

    Command-line options (read from ``sys.argv[1:]``; the ``argv``
    parameter is accepted for interface compatibility but is not read):

    --n_train, --n_valid, --n_test
        Number of samples in each split (required, non-zero integers).
    --d
        Dimension of the samples (required).  Should be higher than 2 and
        preferably 10 or more.
    --n_components
        Number of mixture components (required).
    --ratio_eigvals
        Float forwarded to ``sample_manifold_components`` (default 1.0).
    --output_dir
        Directory in which the results are written (required); it is
        created if it does not exist.
    -v
        Accepted, currently has no effect.
    -h, --help
        Print the usage message and exit.

    Earlier notes for this function also mentioned ``mixing_prop`` and
    ``leading_eigenvalue``; neither is an accepted option here.

    Files written to ``output_dir``:

    * ``{train,valid,test}_samples.pkl``: the pickled slice of samples.
    * ``{train,valid,test}_samples_extra.pkl``: a pickled dict with the
      generation parameters plus ``n`` and ``component_indices``.
    * ``overview_dimensions_<i>_and_<i+1>.png`` and
      ``component_means_<i>_and_<i+1>.png`` for every pair of consecutive
      dimensions.
    """
    import getopt
    import cPickle

    try:
        # "help" is declared so that the "--help" branch below is reachable.
        opts, _ = getopt.getopt(
            sys.argv[1:], "hv",
            ["help", "d=", "n_train=", "n_valid=", "n_test=",
             "ratio_eigvals=", "n_components=", "output_dir="])
    except getopt.GetoptError as err:
        # Will print something like "option -a not recognized".
        print(str(err))
        usage()
        sys.exit(2)

    n_train = None
    n_valid = None
    n_test = None
    d = None
    # Initialised so that a missing --n_components triggers the assertion
    # below instead of an UnboundLocalError.
    n_components = None
    ratio_eigvals = 1.0
    output_dir = None

    for o, a in opts:
        if o == "-v":
            # Accepted for compatibility; nothing in this function uses it.
            pass
        elif o in ("-h", "--help"):
            usage()
            sys.exit()
        elif o == "--n_train":
            n_train = int(a)
        elif o == "--n_valid":
            n_valid = int(a)
        elif o == "--n_test":
            n_test = int(a)
        elif o == "--d":
            d = int(a)
        elif o == "--ratio_eigvals":
            ratio_eigvals = float(a)
        elif o == "--n_components":
            n_components = int(a)
        elif o == "--output_dir":
            output_dir = a
        else:
            raise AssertionError("unhandled option")

    assert n_train, "--n_train is required"
    assert n_valid, "--n_valid is required"
    assert n_test, "--n_test is required"
    assert d, "--d is required"
    assert n_components, "--n_components is required"
    assert output_dir, "--output_dir is required"

    n_total = n_train + n_valid + n_test

    start_time = time.time()

    (component_means, component_covariances, f_parameters) = \
        sample_manifold_components(d, n_components, ratio_eigvals)
    # Identity checks: "!= None" on a numpy array compares element-wise.
    assert component_means is not None
    assert component_covariances is not None

    mixturemvn = gaussian_mixture_tools.MixtureMVN(component_means,
                                                   component_covariances)
    (samples, component_indices) = mixturemvn.sample(n_total,
                                                     want_indices=True)

    end_time = time.time()
    computational_cost_in_seconds = int(end_time - start_time)
    print("Sampling took %d seconds." % computational_cost_in_seconds)

    if not os.path.exists(output_dir):
        print("Creating directory %s" % output_dir)
        os.makedirs(output_dir)

    extra_props = {'component_means': component_means,
                   'component_covariances': component_covariances,
                   'd': d,
                   'ratio_eigvals': ratio_eigvals,
                   'n_components': n_components,
                   'f_parameters': f_parameters,
                   'computational_cost_in_seconds':
                       computational_cost_in_seconds}

    # The samples are split consecutively: train, then valid, then test.
    splits = [("train", 0, n_train),
              ("valid", n_train, n_train + n_valid),
              ("test", n_train + n_valid, n_total)]

    for (split_name, start, stop) in splits:
        samples_filename = os.path.join(
            output_dir, "%s_samples.pkl" % split_name)
        samples_extra_filename = os.path.join(
            output_dir, "%s_samples_extra.pkl" % split_name)

        # NOTE(review): conj is defined elsewhere; it is called here as
        # conj(dict, (key, value)) and presumably returns a new dict with
        # the pair added -- confirm.
        split_extra = conj(conj(extra_props, ('n', stop - start)),
                           ('component_indices',
                            component_indices[start:stop]))

        # Binary mode plus a context manager: the handle is always closed
        # and the pickle bytes are not subject to newline translation.
        with open(samples_filename, "wb") as samples_file:
            cPickle.dump(samples[start:stop, :], samples_file)
        with open(samples_extra_filename, "wb") as extra_file:
            cPickle.dump(split_extra, extra_file)

        print("wrote " + samples_filename)
        print("wrote " + samples_extra_filename)

    # At most the first 500 samples are plotted; the slice is a no-op when
    # there are fewer samples than that.
    samples_to_plot = samples[0:500, :]
    for i in range(0, d - 1):
        output_image_file = os.path.join(
            output_dir, "overview_dimensions_%d_and_%d.png" % (i, i + 1))
        plot_the_overview(samples_to_plot, i, i + 1, output_image_file)
        print("wrote " + output_image_file)

    for i in range(0, d - 1):
        output_image_file = os.path.join(
            output_dir, "component_means_%d_and_%d.png" % (i, i + 1))
        plot_the_overview(component_means, i, i + 1, output_image_file)
        print("wrote " + output_image_file)
def main(argv):
    """Sample a Gaussian-mixture dataset and write train/valid/test splits.

    Command-line options (read from ``sys.argv[1:]``; the ``argv``
    parameter is accepted for interface compatibility but is not read):

    --n_train, --n_valid, --n_test
        Number of samples in each split (required, non-zero integers).
    --d
        Dimension of the samples (required).  Should be higher than 2 and
        preferably 10 or more.
    --n_components
        Number of mixture components (required).
    --ratio_eigvals
        Float forwarded to ``sample_manifold_components`` (default 1.0).
    --output_dir
        Directory in which the results are written (required); it is
        created if it does not exist.
    -v
        Accepted, currently has no effect.
    -h, --help
        Print the usage message and exit.

    Earlier notes for this function also mentioned ``mixing_prop`` and
    ``leading_eigenvalue``; neither is an accepted option here.

    Files written to ``output_dir``:

    * ``{train,valid,test}_samples.pkl``: the pickled slice of samples.
    * ``{train,valid,test}_samples_extra.pkl``: a pickled dict with the
      generation parameters plus ``n`` and ``component_indices``.
    * ``overview_dimensions_<i>_and_<i+1>.png`` and
      ``component_means_<i>_and_<i+1>.png`` for every pair of consecutive
      dimensions.
    """
    import getopt
    import cPickle

    try:
        # "help" is declared so that the "--help" branch below is reachable.
        opts, _ = getopt.getopt(
            sys.argv[1:], "hv",
            ["help", "d=", "n_train=", "n_valid=", "n_test=",
             "ratio_eigvals=", "n_components=", "output_dir="])
    except getopt.GetoptError as err:
        # Will print something like "option -a not recognized".
        print(str(err))
        usage()
        sys.exit(2)

    n_train = None
    n_valid = None
    n_test = None
    d = None
    # Initialised so that a missing --n_components triggers the assertion
    # below instead of an UnboundLocalError.
    n_components = None
    ratio_eigvals = 1.0
    output_dir = None

    for o, a in opts:
        if o == "-v":
            # Accepted for compatibility; nothing in this function uses it.
            pass
        elif o in ("-h", "--help"):
            usage()
            sys.exit()
        elif o == "--n_train":
            n_train = int(a)
        elif o == "--n_valid":
            n_valid = int(a)
        elif o == "--n_test":
            n_test = int(a)
        elif o == "--d":
            d = int(a)
        elif o == "--ratio_eigvals":
            ratio_eigvals = float(a)
        elif o == "--n_components":
            n_components = int(a)
        elif o == "--output_dir":
            output_dir = a
        else:
            raise AssertionError("unhandled option")

    assert n_train, "--n_train is required"
    assert n_valid, "--n_valid is required"
    assert n_test, "--n_test is required"
    assert d, "--d is required"
    assert n_components, "--n_components is required"
    assert output_dir, "--output_dir is required"

    n_total = n_train + n_valid + n_test

    start_time = time.time()

    (component_means, component_covariances, f_parameters) = \
        sample_manifold_components(d, n_components, ratio_eigvals)
    # Identity checks: "!= None" on a numpy array compares element-wise.
    assert component_means is not None
    assert component_covariances is not None

    mixturemvn = gaussian_mixture_tools.MixtureMVN(component_means,
                                                   component_covariances)
    (samples, component_indices) = mixturemvn.sample(n_total,
                                                     want_indices=True)

    end_time = time.time()
    computational_cost_in_seconds = int(end_time - start_time)
    print("Sampling took %d seconds." % computational_cost_in_seconds)

    if not os.path.exists(output_dir):
        print("Creating directory %s" % output_dir)
        os.makedirs(output_dir)

    extra_props = {'component_means': component_means,
                   'component_covariances': component_covariances,
                   'd': d,
                   'ratio_eigvals': ratio_eigvals,
                   'n_components': n_components,
                   'f_parameters': f_parameters,
                   'computational_cost_in_seconds':
                       computational_cost_in_seconds}

    # The samples are split consecutively: train, then valid, then test.
    splits = [("train", 0, n_train),
              ("valid", n_train, n_train + n_valid),
              ("test", n_train + n_valid, n_total)]

    for (split_name, start, stop) in splits:
        samples_filename = os.path.join(
            output_dir, "%s_samples.pkl" % split_name)
        samples_extra_filename = os.path.join(
            output_dir, "%s_samples_extra.pkl" % split_name)

        # NOTE(review): conj is defined elsewhere; it is called here as
        # conj(dict, (key, value)) and presumably returns a new dict with
        # the pair added -- confirm.
        split_extra = conj(conj(extra_props, ('n', stop - start)),
                           ('component_indices',
                            component_indices[start:stop]))

        # Binary mode plus a context manager: the handle is always closed
        # and the pickle bytes are not subject to newline translation.
        with open(samples_filename, "wb") as samples_file:
            cPickle.dump(samples[start:stop, :], samples_file)
        with open(samples_extra_filename, "wb") as extra_file:
            cPickle.dump(split_extra, extra_file)

        print("wrote " + samples_filename)
        print("wrote " + samples_extra_filename)

    # At most the first 500 samples are plotted; the slice is a no-op when
    # there are fewer samples than that.
    samples_to_plot = samples[0:500, :]
    for i in range(0, d - 1):
        output_image_file = os.path.join(
            output_dir, "overview_dimensions_%d_and_%d.png" % (i, i + 1))
        plot_the_overview(samples_to_plot, i, i + 1, output_image_file)
        print("wrote " + output_image_file)

    for i in range(0, d - 1):
        output_image_file = os.path.join(
            output_dir, "component_means_%d_and_%d.png" % (i, i + 1))
        plot_the_overview(component_means, i, i + 1, output_image_file)
        print("wrote " + output_image_file)
def fit_with_decreasing_noise(self, X, list_of_train_stddev,
                              optimization_args,
                              early_termination_args=None,
                              X_valid=None,
                              list_of_additional_valid_stddev=None):
    """Fit the model once per noise level in ``list_of_train_stddev``.

    For every ``train_stddev`` a noisy copy of ``X`` (with importance
    sampling weights) is drawn, ``self.fit`` is called, and the mean loss
    per sample is recorded.

    Parameters:

    X
        Training samples; ``X.shape[0]`` is used as the number of samples
        and ``X.shape[1]`` as the data dimension d.
    list_of_train_stddev
        Noise levels, used in the given order.
    optimization_args
        Dict that filters through to ``self.fit`` almost unchanged.  If
        its 'maxiter' entry is a list or a numpy array, it must have one
        entry per value of ``list_of_train_stddev`` and the matching
        entry is used at each stage.
    early_termination_args
        Optional dict.  It provides a way to stop the training if we
        determine that we started in a state that was irredeemable and
        would only lead to a bad local minimum.  It has one key for now:
            early_termination_args['stop_if_loss_greater_than'] = [...]
        or
            early_termination_args['stop_if_loss_greater_than'] = "auto"
        With "auto", the threshold for each stage is d * train_stddev**2,
        which is roughly the loss of the r(x) = x solution.  The dict
        passed by the caller is never modified.
    X_valid
        Optional validation samples.  When given, the validation loss
        (not the training loss) decides whether to stop early.
    list_of_additional_valid_stddev
        Optional extra noise levels at which the final parameters are
        evaluated on ``X_valid`` (plain Gaussian noise, no importance
        weights).

    Returns a 4-tuple:

    seq_train_mean_best_U_q
        Mean training loss per stage, padded with nan after an early stop.
    seq_valid_mean_best_U_q
        Mean validation loss per stage, padded with nan (all nan when
        ``X_valid`` is None).
    seq_valid_mean_U_final_best_q
        For each train_stddev, the validation loss of the final
        parameters averaged over 10 fresh noise draws, or None when
        ``X_valid`` is None.
    seq_alt_valid_mean_U_final_best_q
        Same for ``list_of_additional_valid_stddev``, or None when
        ``X_valid`` is None or that list is None or empty.

    Raises ValueError if 'stop_if_loss_greater_than' is a string other
    than "auto", or if a 'maxiter' sequence does not have one entry per
    noise level.
    """
    if early_termination_args is None:
        early_termination_args = {}

    # Resolve the thresholds into a local variable instead of writing the
    # result back into the caller's dict.
    loss_thresholds = early_termination_args.get('stop_if_loss_greater_than')
    if isinstance(loss_thresholds, str):
        if loss_thresholds != "auto":
            raise ValueError("Wrong value for early_termination_args. "
                             "Only valid string is 'auto'.")
        loss_thresholds = [X.shape[1] * train_stddev ** 2
                           for train_stddev in list_of_train_stddev]
        print("early termination with losses : ")
        print(loss_thresholds)

    n_stages = len(list_of_train_stddev)

    # np.ndarray is the array type; np.array is only a factory function and
    # never matches a type check.
    maxiter = optimization_args.get('maxiter')
    maxiter_per_stage = isinstance(maxiter, (list, np.ndarray))
    if maxiter_per_stage and len(maxiter) != n_stages:
        raise ValueError("optimization_args['maxiter'] must have one entry "
                         "per value of list_of_train_stddev")

    # At some point we might want to record all the best_q for the sequence.
    seq_train_mean_best_U_q = []
    seq_valid_mean_best_U_q = []

    progress_logger = make_progress_logger("Training")

    for (i, train_stddev) in enumerate(list_of_train_stddev):

        sys.stdout.write(" Using train_stddev %f, " % train_stddev)

        (noisy_X, importance_sampling_weights) = \
            isotropic_gaussian_noise_and_importance_sampling_weights(
                X, 4.0 * train_stddev, train_stddev)

        if maxiter_per_stage:
            # NOTE(review): conj is defined elsewhere; it is called here as
            # conj(dict, key, value) and presumably returns a copy of the
            # dict with that entry replaced -- confirm.
            optimization_args0 = conj(optimization_args, "maxiter",
                                      maxiter[i])
        else:
            optimization_args0 = optimization_args

        (best_q, train_U_best_q_) = self.fit(X, noisy_X, optimization_args0)

        train_U_best_q = self.q_loss(best_q, X, noisy_X,
                                     importance_sampling_weights).sum()
        # Sanity check to make sure that we're evaluating this right.
        assert abs(train_U_best_q - train_U_best_q_) < 1e-8

        train_mean_U_best_q = train_U_best_q / X.shape[0]
        seq_train_mean_best_U_q.append(train_mean_U_best_q)
        sys.stdout.write("train mean loss is %f, " % (train_mean_U_best_q,))

        if X_valid is not None:
            (noisy_X_valid, valid_weights) = \
                isotropic_gaussian_noise_and_importance_sampling_weights(
                    X_valid, 4.0 * train_stddev, train_stddev)
            valid_U_best_q = self.q_loss(best_q, X_valid, noisy_X_valid,
                                         valid_weights).sum()
            valid_mean_U_best_q = valid_U_best_q / X_valid.shape[0]
            seq_valid_mean_best_U_q.append(valid_mean_U_best_q)
            sys.stdout.write("valid mean loss is %f." %
                             (valid_mean_U_best_q,))
            # With a validation set, it is the one used to determine the
            # stopping point.
            monitored_loss = valid_mean_U_best_q
        else:
            # Without a validation set, fall back on the training loss.
            monitored_loss = train_mean_U_best_q

        if loss_thresholds is not None and loss_thresholds[i] < monitored_loss:
            break

        print("")
        progress_logger(1.0 * i / n_stages)

    # Might as well pad the rest of the lists to signify that we terminated
    # early (a non-positive repeat count yields an empty list).
    seq_train_mean_best_U_q.extend(
        [np.nan] * (n_stages - len(seq_train_mean_best_U_q)))
    seq_valid_mean_best_U_q.extend(
        [np.nan] * (n_stages - len(seq_valid_mean_best_U_q)))

    # Now we want to recompute the model losses for all the values of the
    # train_stddev, but using the final parameters best_q.  This is an
    # additional quality evaluation to determine how the DAE treats data
    # that's relatively far from the manifold once it's done training.
    # It might be even more informative than the validation losses.
    seq_valid_mean_U_final_best_q = None
    seq_alt_valid_mean_U_final_best_q = None
    if X_valid is not None:
        nreps = 10

        # The noisy data has to be generated again for every noise level and
        # every repetition; drawing it once outside the loop would evaluate
        # the same data every time.
        seq_valid_mean_U_final_best_q = []
        for final_stddev in list_of_train_stddev:
            rep_losses = []
            for _ in range(nreps):
                (noisy_X_valid, valid_weights) = \
                    isotropic_gaussian_noise_and_importance_sampling_weights(
                        X_valid, 4.0 * final_stddev, final_stddev)
                rep_losses.append(
                    self.q_loss(best_q, X_valid, noisy_X_valid,
                                valid_weights).sum() / X_valid.shape[0])
            seq_valid_mean_U_final_best_q.append(np.array(rep_losses).mean())

        if (list_of_additional_valid_stddev is not None
                and len(list_of_additional_valid_stddev) > 0):
            # TODO : use some kind of tool to generate the
            # importance_sampling_weights
            seq_alt_valid_mean_U_final_best_q = [
                np.array([
                    self.q_loss(
                        best_q, X_valid,
                        X_valid + np.random.normal(size=X_valid.shape,
                                                   scale=alt_valid_stddev)
                    ).sum() / X_valid.shape[0]
                    for _ in range(nreps)
                ]).mean()
                for alt_valid_stddev in list_of_additional_valid_stddev
            ]

    return (seq_train_mean_best_U_q,
            seq_valid_mean_best_U_q,
            seq_valid_mean_U_final_best_q,
            seq_alt_valid_mean_U_final_best_q)
def fit_with_decreasing_noise(self, X, list_of_train_stddev,
                              optimization_args,
                              early_termination_args=None,
                              X_valid=None,
                              list_of_additional_valid_stddev=None):
    """Fit the model once per noise level in ``list_of_train_stddev``.

    For every ``train_stddev`` a noisy copy of ``X`` (with importance
    sampling weights) is drawn, ``self.fit`` is called, and the mean loss
    per sample is recorded.

    Parameters:

    X
        Training samples; ``X.shape[0]`` is used as the number of samples
        and ``X.shape[1]`` as the data dimension d.
    list_of_train_stddev
        Noise levels, used in the given order.
    optimization_args
        Dict that filters through to ``self.fit`` almost unchanged.  If
        its 'maxiter' entry is a list or a numpy array, it must have one
        entry per value of ``list_of_train_stddev`` and the matching
        entry is used at each stage.
    early_termination_args
        Optional dict.  It provides a way to stop the training if we
        determine that we started in a state that was irredeemable and
        would only lead to a bad local minimum.  It has one key for now:
            early_termination_args['stop_if_loss_greater_than'] = [...]
        or
            early_termination_args['stop_if_loss_greater_than'] = "auto"
        With "auto", the threshold for each stage is d * train_stddev**2,
        which is roughly the loss of the r(x) = x solution.  The dict
        passed by the caller is never modified.
    X_valid
        Optional validation samples.  When given, the validation loss
        (not the training loss) decides whether to stop early.
    list_of_additional_valid_stddev
        Optional extra noise levels at which the final parameters are
        evaluated on ``X_valid`` (plain Gaussian noise, no importance
        weights).

    Returns a 4-tuple:

    seq_train_mean_best_U_q
        Mean training loss per stage, padded with nan after an early stop.
    seq_valid_mean_best_U_q
        Mean validation loss per stage, padded with nan (all nan when
        ``X_valid`` is None).
    seq_valid_mean_U_final_best_q
        For each train_stddev, the validation loss of the final
        parameters averaged over 10 fresh noise draws, or None when
        ``X_valid`` is None.
    seq_alt_valid_mean_U_final_best_q
        Same for ``list_of_additional_valid_stddev``, or None when
        ``X_valid`` is None or that list is None or empty.

    Raises ValueError if 'stop_if_loss_greater_than' is a string other
    than "auto", or if a 'maxiter' sequence does not have one entry per
    noise level.
    """
    if early_termination_args is None:
        early_termination_args = {}

    # Resolve the thresholds into a local variable instead of writing the
    # result back into the caller's dict.
    loss_thresholds = early_termination_args.get('stop_if_loss_greater_than')
    if isinstance(loss_thresholds, str):
        if loss_thresholds != "auto":
            raise ValueError("Wrong value for early_termination_args. "
                             "Only valid string is 'auto'.")
        loss_thresholds = [X.shape[1] * train_stddev ** 2
                           for train_stddev in list_of_train_stddev]
        print("early termination with losses : ")
        print(loss_thresholds)

    n_stages = len(list_of_train_stddev)

    # np.ndarray is the array type; np.array is only a factory function and
    # never matches a type check.
    maxiter = optimization_args.get('maxiter')
    maxiter_per_stage = isinstance(maxiter, (list, np.ndarray))
    if maxiter_per_stage and len(maxiter) != n_stages:
        raise ValueError("optimization_args['maxiter'] must have one entry "
                         "per value of list_of_train_stddev")

    # At some point we might want to record all the best_q for the sequence.
    seq_train_mean_best_U_q = []
    seq_valid_mean_best_U_q = []

    progress_logger = make_progress_logger("Training")

    for (i, train_stddev) in enumerate(list_of_train_stddev):

        sys.stdout.write(" Using train_stddev %f, " % train_stddev)

        (noisy_X, importance_sampling_weights) = \
            isotropic_gaussian_noise_and_importance_sampling_weights(
                X, 4.0 * train_stddev, train_stddev)

        if maxiter_per_stage:
            # NOTE(review): conj is defined elsewhere; it is called here as
            # conj(dict, key, value) and presumably returns a copy of the
            # dict with that entry replaced -- confirm.
            optimization_args0 = conj(optimization_args, "maxiter",
                                      maxiter[i])
        else:
            optimization_args0 = optimization_args

        (best_q, train_U_best_q_) = self.fit(X, noisy_X, optimization_args0)

        train_U_best_q = self.q_loss(best_q, X, noisy_X,
                                     importance_sampling_weights).sum()
        # Sanity check to make sure that we're evaluating this right.
        assert abs(train_U_best_q - train_U_best_q_) < 1e-8

        train_mean_U_best_q = train_U_best_q / X.shape[0]
        seq_train_mean_best_U_q.append(train_mean_U_best_q)
        sys.stdout.write("train mean loss is %f, " % (train_mean_U_best_q,))

        if X_valid is not None:
            (noisy_X_valid, valid_weights) = \
                isotropic_gaussian_noise_and_importance_sampling_weights(
                    X_valid, 4.0 * train_stddev, train_stddev)
            valid_U_best_q = self.q_loss(best_q, X_valid, noisy_X_valid,
                                         valid_weights).sum()
            valid_mean_U_best_q = valid_U_best_q / X_valid.shape[0]
            seq_valid_mean_best_U_q.append(valid_mean_U_best_q)
            sys.stdout.write("valid mean loss is %f." %
                             (valid_mean_U_best_q,))
            # With a validation set, it is the one used to determine the
            # stopping point.
            monitored_loss = valid_mean_U_best_q
        else:
            # Without a validation set, fall back on the training loss.
            monitored_loss = train_mean_U_best_q

        if loss_thresholds is not None and loss_thresholds[i] < monitored_loss:
            break

        print("")
        progress_logger(1.0 * i / n_stages)

    # Might as well pad the rest of the lists to signify that we terminated
    # early (a non-positive repeat count yields an empty list).
    seq_train_mean_best_U_q.extend(
        [np.nan] * (n_stages - len(seq_train_mean_best_U_q)))
    seq_valid_mean_best_U_q.extend(
        [np.nan] * (n_stages - len(seq_valid_mean_best_U_q)))

    # Now we want to recompute the model losses for all the values of the
    # train_stddev, but using the final parameters best_q.  This is an
    # additional quality evaluation to determine how the DAE treats data
    # that's relatively far from the manifold once it's done training.
    # It might be even more informative than the validation losses.
    seq_valid_mean_U_final_best_q = None
    seq_alt_valid_mean_U_final_best_q = None
    if X_valid is not None:
        nreps = 10

        # The noisy data has to be generated again for every noise level and
        # every repetition; drawing it once outside the loop would evaluate
        # the same data every time.
        seq_valid_mean_U_final_best_q = []
        for final_stddev in list_of_train_stddev:
            rep_losses = []
            for _ in range(nreps):
                (noisy_X_valid, valid_weights) = \
                    isotropic_gaussian_noise_and_importance_sampling_weights(
                        X_valid, 4.0 * final_stddev, final_stddev)
                rep_losses.append(
                    self.q_loss(best_q, X_valid, noisy_X_valid,
                                valid_weights).sum() / X_valid.shape[0])
            seq_valid_mean_U_final_best_q.append(np.array(rep_losses).mean())

        if (list_of_additional_valid_stddev is not None
                and len(list_of_additional_valid_stddev) > 0):
            # TODO : use some kind of tool to generate the
            # importance_sampling_weights
            seq_alt_valid_mean_U_final_best_q = [
                np.array([
                    self.q_loss(
                        best_q, X_valid,
                        X_valid + np.random.normal(size=X_valid.shape,
                                                   scale=alt_valid_stddev)
                    ).sum() / X_valid.shape[0]
                    for _ in range(nreps)
                ]).mean()
                for alt_valid_stddev in list_of_additional_valid_stddev
            ]

    return (seq_train_mean_best_U_q,
            seq_valid_mean_best_U_q,
            seq_valid_mean_U_final_best_q,
            seq_alt_valid_mean_U_final_best_q)