def init_params(options):
    feadim = options['featureMaps']
    ctxdim = 512
    hdim = options['hdim']
    actionNum = options['actions']
    fdim = 64
    params = OrderedDict()
    # params=Saliency_init(params,ctxdim,prefix="recog");
    # params=ff_init(params,ctxdim,512,prefix="recog",name='ctx_pre');
    params = Linger_init(params, feadim, ctxdim, prefix='recog', name='linger')
    # FNN channel
    params = ff_init(params, ctxdim, fdim, prefix="recog", name='highway')
    # LSTM
    params = LSTM_init(params, ctxdim, hdim, prefix="recog", name='lstm')
    params = ff_init(params, hdim, fdim, prefix="recog", name='fullconn')
    params = ff_init(params, fdim, actionNum, prefix="recog", name='output')
    tparams = share_params(params)
    # loading params if need
    loadfrom = options['loadfrom']
    if options['load'] and os.path.exists(loadfrom):
        print "loading model parameters from ", loadfrom
        tparams = load_params(loadfrom, tparams, strict=False)
    return tparams
def init_params(options):
    ctxdim = options['featureMaps']
    hdim = options['hdim']
    actionNum = options['actions']
    fdim = options['fdim']
    # print actionNum;
    params = OrderedDict()
    params = SaliencyLSTM_init(params, ctxdim, prefix="recog", name='saliencyLSTM')
    params = ff_init(params, ctxdim, fdim, prefix="recog", name='channel0')
    params = LSTM_init(params, ctxdim, hdim, prefix="recog", name='lstm')
    params = ff_init(params, hdim, fdim, prefix="recog", name='channel1')
    # output
    params = ff_init(params, fdim, actionNum, prefix="recog", name='output')
    tparams = share_params(params)
    # loading params if need
    loadfrom = options['loadfrom']
    if options['load'] and os.path.exists(loadfrom):
        print "loading model parameters from ", loadfrom
        tparams = load_params(loadfrom, tparams, strict=False)
    return tparams
def __init__(self, load=None, **kwargs):
    if load is None:
        args = {}
    else:
        args = util.load_params(load, 'train')

    util.update(args, mode=RL.Mode.TRAIN, **kwargs)
    print(args)
    Default.__init__(self, **args)

    if self.init:
        self.model.init()
        self.model.save()
    else:
        self.model.restore()

    context = zmq.Context.instance()

    self.experience_socket = context.socket(zmq.PULL)
    experience_addr = "tcp://%s:%d" % (self.dump, util.port(self.model.name + "/experience"))
    self.experience_socket.bind(experience_addr)

    self.params_socket = context.socket(zmq.PUB)
    params_addr = "tcp://%s:%d" % (self.dump, util.port(self.model.name + "/params"))
    print("Binding params socket to", params_addr)
    self.params_socket.bind(params_addr)

    self.sweep_size = self.batches * self.batch_size
    print("Sweep size", self.sweep_size)

    self.buffer = util.CircularQueue(self.sweep_size)
    self.last_save = time.time()
def init_params(options):
    ctxdim = options['featureMaps']
    hdim = options['hdim']
    actionNum = options['actions']
    fdim = options['fdim']
    params = OrderedDict()
    params = Saliency_init(params, ctxdim, prefix="recog", name='saliency')
    # FNN channel
    params = ff_init(params, ctxdim, fdim, prefix="recog", name='fullconn')
    # LSTM channel
    params = LSTM_init(params, ctxdim, hdim, prefix="recog", name='lstm')
    params = ff_init(params, hdim, fdim, prefix="recog", name='fullconn_lstm')
    # output
    params = ff_init(params, fdim, actionNum, prefix="recog", name='output')
    tparams = share_params(params)
    # loading params if need
    loadfrom = options['loadfrom']
    if options['load'] and os.path.exists(loadfrom):
        print "loading model parameters from ", loadfrom
        tparams = load_params(loadfrom, tparams, strict=False)
    return tparams
def gen_model(idx, context, model, options, k, normalize, word_idict, sampling):
    import theano
    from theano import tensor
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

    trng = RandomStreams(1234)
    # this is zero indicate we are not using dropout in the graph
    use_noise = theano.shared(numpy.float32(0.), name='use_noise')

    # get the parameters
    params = init_params(options)
    params = load_params(model, params)
    tparams = init_tparams(params)

    # build the sampling computational graph
    # see capgen.py for more detailed explanations
    f_init, f_next = build_sampler(tparams, options, use_noise, trng, sampling=sampling)

    def _gencap(cc0):
        sample, score = gen_sample(tparams, f_init, f_next, cc0, options,
                                   trng=trng, k=k, maxlen=200, stochastic=False)
        # adjust for length bias
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths
        sidx = numpy.argmin(score)
        return sample[sidx]

    seq = _gencap(context)
    return (idx, seq)
def init_params(options):
    ctxdim = options['featureMaps']
    hdim = options['hdim']
    actionNum = options['actions']
    fdim = options['fdim']
    # print actionNum;
    params = OrderedDict()
    params = SaliencyFgbg_init(params, options['locations'], options['featureMaps'],
                               prefix="recog", name='saliencyFgbg')
    params = ff_init(params, ctxdim, fdim, prefix="recog", name='channel0')
    params = LSTM_init(params, ctxdim, hdim, prefix="recog", name='lstm1')
    params = ff_init(params, hdim, fdim, prefix="recog", name='channel1')
    # output
    params = ff_init(params, fdim, actionNum, prefix="recog", name='output')
    # params['recog/saliencyFgbg_w']=theano.gradient.grad_clip(params['recog/saliencyFgbg_w'],-0.1,0.1);
    tparams = share_params(params)
    # loading params if need
    loadfrom = options['loadfrom']
    if options['load']:
        if os.path.exists(loadfrom):
            print "loading model parameters from ", loadfrom
            tparams = load_params(loadfrom, tparams, strict=False)
        else:
            print "Not exist ", loadfrom
    return tparams
def get_model():
    model_params = load_params()["model"]

    if model_params["name"].lower() == "mlp":
        p = model_params["mlp"]
        model = mlp(p["units"], p["activation"])
    elif model_params["name"].lower() == "cnn":
        p = model_params["cnn"]
        model = cnn(dense_units=p["dense_units"],
                    conv_kernel=(p["conv_kernel_size"], p["conv_kernel_size"]),
                    conv_units=p["conv_units"],
                    dropout=p["dropout"],
                    activation=p["activation"])
    else:
        raise Exception(
            f"No Model with the name {model_params['name']} is defined")

    if model_params["optimizer"].lower() == "adam":
        optimizer = tf.keras.optimizers.Adam()
    elif model_params["optimizer"].lower() == "sgd":
        optimizer = tf.keras.optimizers.SGD()
    elif model_params["optimizer"].lower() == "rmsprop":
        optimizer = tf.keras.optimizers.RMSprop()
    elif model_params["optimizer"].lower() == "adadelta":
        optimizer = tf.keras.optimizers.Adadelta()
    elif model_params["optimizer"].lower() == "adagrad":
        optimizer = tf.keras.optimizers.Adagrad()
    elif model_params["optimizer"].lower() == "adamax":
        optimizer = tf.keras.optimizers.Adamax()
    elif model_params["optimizer"].lower() == "nadam":
        optimizer = tf.keras.optimizers.Nadam()
    elif model_params["optimizer"].lower() == "ftrl":
        optimizer = tf.keras.optimizers.Ftrl()
    else:
        raise Exception(
            f"No optimizer with the name {model_params['optimizer']} is defined"
        )

    loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)

    metrics = [
        tf.keras.metrics.CategoricalAccuracy(),
        tf.keras.metrics.Precision(),
        tf.keras.metrics.Recall(),
        tf.keras.metrics.AUC(curve="ROC", name="ROC", multi_label=True),
        tf.keras.metrics.AUC(curve="PR", name="PR", multi_label=True),
        tf.keras.metrics.TruePositives(),
        tf.keras.metrics.TrueNegatives(),
        tf.keras.metrics.FalsePositives(),
        tf.keras.metrics.FalseNegatives()
    ]

    model.compile(
        optimizer=optimizer,
        loss=loss,
        metrics=metrics,
    )

    return model
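# Design note (not part of the original script): the optimizer if/elif chain
# above can also be written as a table lookup. This is a hedged sketch assuming
# the same lower-cased names coming from the params file; `make_optimizer` and
# `_OPTIMIZERS` are hypothetical helpers, not existing names in this codebase.
import tensorflow as tf

_OPTIMIZERS = {
    "adam": tf.keras.optimizers.Adam,
    "sgd": tf.keras.optimizers.SGD,
    "rmsprop": tf.keras.optimizers.RMSprop,
    "adadelta": tf.keras.optimizers.Adadelta,
    "adagrad": tf.keras.optimizers.Adagrad,
    "adamax": tf.keras.optimizers.Adamax,
    "nadam": tf.keras.optimizers.Nadam,
    "ftrl": tf.keras.optimizers.Ftrl,
}


def make_optimizer(name):
    # look the optimizer class up by its lower-cased name and instantiate it with defaults
    try:
        return _OPTIMIZERS[name.lower()]()
    except KeyError:
        raise Exception(f"No optimizer with the name {name} is defined")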
def gen_model(queue, rqueue, pid, model, options, k, normalize, word_idict, sampling):
    import theano
    from theano import tensor
    from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams

    trng = RandomStreams(1234)
    # this is zero indicate we are not using dropout in the graph
    use_noise = theano.shared(numpy.float32(0.), name='use_noise')

    # get the parameters
    params = init_params(options)
    params = load_params(model, params)
    tparams = init_tparams(params)

    # build the sampling computational graph
    # see capgen.py for more detailed explanations
    f_init, f_next = build_sampler(tparams, options, use_noise, trng, sampling=sampling)

    def _gencap(cc0):
        sample, score = gen_sample(tparams, f_init, f_next, cc0, options,
                                   trng=trng, k=k, maxlen=200, stochastic=False)
        # adjust for length bias
        if normalize:
            lengths = numpy.array([len(s) for s in sample])
            score = score / lengths
        sidx = numpy.argmin(score)
        return sample[sidx]

    while True:
        req = queue.get()
        # exit signal
        if req is None:
            break

        idx, context = req[0], req[1]
        print "Processing example %d in process # %d" % (idx, pid)
        seq = _gencap(context)
        print seq
        rqueue.put((idx, seq))
        print "Added example %d to the result queue" % idx

    print "gen_model process w/ pid %d has returned..." % pid
    return
def gen_model(model, options):
    # this is zero indicate we are not using dropout in the graph
    use_noise = theano.shared(numpy.float32(0.), name='use_noise')

    # get the parameters
    params = init_params(options)
    params = load_params(model, params)
    tparams = init_tparams(params)

    # build the sampling computational graph
    # see capgen.py for more detailed explanations
    f_init, f_next = build_sampler(tparams, options, use_noise, trng)

    return f_init, f_next
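# Hedged sketch of the `load_params(path, params)` helper that the Theano
# snippets above rely on; it normally lives in the project's util module and is
# not included in this collection. The .npz layout, the warn-vs-raise behaviour
# and the optional `strict` flag are assumptions, not the original code.
import warnings
import numpy


def load_params(path, params, strict=True):
    # overwrite the values in `params` with the arrays stored in the .npz file
    pp = numpy.load(path)
    for kk in params.keys():
        if kk not in pp:
            if strict:
                raise KeyError('%s is not in the saved archive' % kk)
            warnings.warn('%s is not in the saved archive' % kk)
            continue
        params[kk] = pp[kk]
    return params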
def main():
    params = load_params()
    test_img, test_labels = load_npz_data(
        "data/fashion-mnist/preprocessed/mnist-test.npz")
    model = tf.keras.models.load_model("models/fashion-mnist/model.h5")
    metrics_dict = model.evaluate(
        test_img,
        test_labels,
        batch_size=params["train"]["batch_size"],
        return_dict=True,
    )
    metrics_file = "metrics.json"
    with open(metrics_file, "w") as f:
        f.write(json.dumps(metrics_dict))
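# The DVC-style pipeline scripts above call a project-local `load_params()`
# that is not shown here. A minimal sketch, assuming the parameters live in a
# YAML file named "params.yaml" with one section per stage (e.g. "train",
# "prepare", "preprocess"); the real helper may differ.
import yaml


def load_params(path="params.yaml"):
    with open(path) as f:
        return yaml.safe_load(f)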
def __init__(self, param_folder, data_folder, overlap_thresh=0.5, n=100, use_cuda=False):
    """
    data_folder: the folder of a sequence that has ground-truth
                 (can be BU-RU dataset, look into this)
    param_folder: folder in which all params files are populated already
                  using hog_multi_trainer
    """
    self.imgs, self.pos_rects = util.read_imgs(data_folder)
    # self.params contains all the combinations of [BB,bsize,csize,nbins,weights]
    # list in list structure basically
    self.sign, self.params = util.load_params(param_folder)
    self.overlap_thresh = overlap_thresh
    self.n = n
    self.use_cuda = use_cuda
def main():
    params = load_params()["train"]

    if params["resume"] and os.path.exists(MODEL_FILE):
        m = tf.keras.models.load_model(MODEL_FILE)
    else:
        m = models.get_model()
    m.summary()

    whole_train_img, whole_train_labels = load_npz_data(
        "data/preprocessed/mnist-train.npz")
    test_img, test_labels = load_npz_data("data/preprocessed/mnist-test.npz")

    validation_split_index = int(
        (1 - params["validation_split"]) * whole_train_img.shape[0])
    if validation_split_index == whole_train_img.shape[0]:
        x_train = whole_train_img
        x_valid = test_img
        y_train = whole_train_labels
        y_valid = test_labels
    else:
        x_train = whole_train_img[:validation_split_index]
        x_valid = whole_train_img[validation_split_index:]
        y_train = whole_train_labels[:validation_split_index]
        y_valid = whole_train_labels[validation_split_index:]

    print(f"x_train: {x_train.shape}")
    print(f"x_valid: {x_valid.shape}")
    print(f"y_train: {y_train.shape}")
    print(f"y_valid: {y_valid.shape}")

    dvclive.init("training_metrics")
    # keep the History object returned by fit(); it is needed below for logs.csv
    history = m.fit(x_train,
                    y_train,
                    batch_size=params["batch_size"],
                    epochs=params["epochs"],
                    verbose=1,
                    validation_data=(x_valid, y_valid),
                    callbacks=[DVCLiveCallback()])

    with open("logs.csv", "w") as f:
        f.write(history_to_csv(history))

    m.save(MODEL_FILE)
def main():
    params = load_params()["train"]
    m = models.get_model()
    m.summary()

    whole_train_img, whole_train_labels = load_npz_data(
        "data/fashion-mnist/preprocessed/mnist-train.npz"
    )
    test_img, test_labels = load_npz_data(
        "data/fashion-mnist/preprocessed/mnist-test.npz"
    )

    validation_split_index = int(
        (1 - params["validation_split"]) * whole_train_img.shape[0]
    )
    if validation_split_index == whole_train_img.shape[0]:
        x_train = whole_train_img
        x_valid = test_img
        y_train = whole_train_labels
        y_valid = test_labels
    else:
        x_train = whole_train_img[:validation_split_index]
        x_valid = whole_train_img[validation_split_index:]
        y_train = whole_train_labels[:validation_split_index]
        y_valid = whole_train_labels[validation_split_index:]

    print(f"x_train: {x_train.shape}")
    print(f"x_valid: {x_valid.shape}")
    print(f"y_train: {y_train.shape}")
    print(f"y_valid: {y_valid.shape}")

    history = m.fit(
        x_train,
        y_train,
        batch_size=params["batch_size"],
        epochs=params["epochs"],
        verbose=1,
        validation_data=(x_valid, y_valid),
    )

    with open("logs.csv", "w") as f:
        f.write(history_to_csv(history))

    m.save("models/fashion-mnist/model.h5")
def main():
    params = load_params()["prepare"]
    print(params)

    training_images = mnist_images_idx_to_array(
        "data/fashion-mnist/raw/train-images-idx3-ubyte.gz")
    # print(f"Read training data: {training_images}")
    training_labels = mnist_labels_idx_to_array(
        "data/fashion-mnist/raw/train-labels-idx1-ubyte.gz")
    # print(f"Read training labels: {training_labels}")
    testing_images = mnist_images_idx_to_array(
        "data/fashion-mnist/raw/t10k-images-idx3-ubyte.gz")
    # print(f"Read testing data: {testing_images}")
    testing_labels = mnist_labels_idx_to_array(
        "data/fashion-mnist/raw/t10k-labels-idx1-ubyte.gz")
    # print(f"Read testing labels: {testing_labels}")

    if params["remix"]:
        training_images, testing_images, training_labels, testing_labels = remix(
            images1=training_images,
            images2=testing_images,
            labels1=training_labels,
            labels2=testing_labels,
            seed=params["seed"],
            split=params["remix_split"])

    assert training_images.shape[0] + testing_images.shape[0] == 70000
    assert training_labels.shape[0] + testing_labels.shape[0] == 70000

    print(f"Training Dataset Shape: {training_images.shape}")
    print(f"Testing Dataset Shape: {testing_images.shape}")
    print(f"Training Labels: {training_labels}")
    print(f"Testing Labels: {testing_labels}")

    os.makedirs("data/fashion-mnist/prepared")
    np.savez("data/fashion-mnist/prepared/mnist-train.npz",
             images=training_images,
             labels=training_labels)
    np.savez("data/fashion-mnist/prepared/mnist-test.npz",
             images=testing_images,
             labels=testing_labels)
def main():
    params = load_params()["preprocess"]
    print(params)

    training_images, training_labels = load_npz_data(
        "data/fashion-mnist/prepared/mnist-train.npz")
    testing_images, testing_labels = load_npz_data(
        "data/fashion-mnist/prepared/mnist-test.npz")

    seed = params["seed"]

    if params["normalize"]:
        training_images = normalize(training_images)
        testing_images = normalize(testing_images)

    if params["shuffle"]:
        training_images, training_labels = shuffle_in_parallel(
            seed, training_images, training_labels)
        testing_images, testing_labels = shuffle_in_parallel(
            seed, testing_images, testing_labels)

    training_labels = tf.keras.utils.to_categorical(training_labels,
                                                    num_classes=10,
                                                    dtype="float32")
    testing_labels = tf.keras.utils.to_categorical(testing_labels,
                                                   num_classes=10,
                                                   dtype="float32")

    print(
        f"Training Images: {training_images.shape} - {training_images.dtype}")
    print(f"Testing Images: {testing_images.shape} - {testing_images.dtype}")

    if not os.path.exists("data/fashion-mnist/preprocessed"):
        os.makedirs("data/fashion-mnist/preprocessed")

    np.savez("data/fashion-mnist/preprocessed/mnist-train.npz",
             images=training_images,
             labels=training_labels)
    np.savez("data/fashion-mnist/preprocessed/mnist-test.npz",
             images=testing_images,
             labels=testing_labels)
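# Sketch of the two preprocessing helpers assumed above but not shown in this
# collection: `normalize` rescales pixel values to [0, 1] and
# `shuffle_in_parallel` applies one seeded permutation to both arrays so images
# and labels stay aligned. The originals may differ; this is illustrative only.
import numpy as np


def normalize(images):
    return images.astype("float32") / 255.0


def shuffle_in_parallel(seed, images, labels):
    rng = np.random.default_rng(seed)
    perm = rng.permutation(images.shape[0])
    return images[perm], labels[perm]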
def init_params(options):
    ctxdim = 256
    hdim = options['hdim']
    actionNum = options['actions']
    fdim = options['fdim']
    params = OrderedDict()
    # params=Saliency_init(params,ctxdim,prefix="recog",name='saliency');
    # params=Linger_init(params,ctxdim,ctxdim,prefix='recog',name='linger');
    # LSTM
    # params=LSTM_init(params,ctxdim,hdim,prefix="recog",name='lstm1');
    # params=LSTM_init(params,hdim,512,prefix="recog",name='lstm2');
    # params=LSTM_init(params,512,256,prefix="recog",name='lstm3');
    # params=LSTM_init(params,256,128,prefix="recog",name='lstm4');
    # multiChannel
    params = ff_init(params, ctxdim, fdim, prefix="recog", name='channel0')
    # params=ff_init(params,hdim,fdim,prefix="recog",name='channel1');
    # params=ff_init(params,512,fdim,prefix="recog",name='channel2');
    # params=ff_init(params,256,fdim,prefix="recog",name='channel3');
    # params=ff_init(params,128,fdim,prefix="recog",name='channel4');
    # output
    params = ff_init(params, fdim, actionNum, prefix="recog", name='output')
    tparams = share_params(params)
    tparams = cnn_init(tparams, cnn_net, prefix='recog', name='cnn')
    # loading params if need
    loadfrom = options['loadfrom']
    if options['load'] and os.path.exists(loadfrom):
        print "loading model parameters from ", loadfrom
        tparams = load_params(loadfrom, tparams, strict=False)
    return tparams
def init_params(options):
    featureMaps = options['featureMaps']
    actionNum = options['actions']
    fdim = options['fdim']
    params = OrderedDict()
    params = ff_init(params, featureMaps, fdim, prefix="recog", name='fc0')
    params = ff_init(params, fdim, fdim, prefix="recog", name='fc1')
    # output
    params = ff_init(params, fdim, actionNum, prefix="recog", name='output')
    tparams = share_params(params)
    tparams = lasagne_net_init(tparams, CNN_NET, CNN_outputLayerName, prefix='recog', name='cnn')
    # loading params if need
    loadfrom = options['loadfrom']
    if options['load'] and os.path.exists(loadfrom):
        print "loading model parameters from ", loadfrom
        tparams = load_params(loadfrom, tparams, strict=False)
    return tparams
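# Hedged sketch of the small helpers the init_params variants above rely on.
# The "<prefix>/<name>_w" key convention is inferred from the commented-out
# grad_clip line ('recog/saliencyFgbg_w'); the initialization scheme and the
# behaviour of `share_params` are assumptions, not the original code.
from collections import OrderedDict
import numpy
import theano


def ff_init(params, nin, nout, prefix='recog', name='ff'):
    # one fully connected layer: weight matrix plus bias
    params['%s/%s_w' % (prefix, name)] = \
        0.01 * numpy.random.randn(nin, nout).astype('float32')
    params['%s/%s_b' % (prefix, name)] = numpy.zeros((nout,), dtype='float32')
    return params


def share_params(params):
    # wrap every numpy array in a theano shared variable, keyed by the same name
    tparams = OrderedDict()
    for kk, vv in params.items():
        tparams[kk] = theano.shared(vv, name=kk)
    return tparams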
def train(args, model_args): model_id = '/data/lisatmp4/anirudhg/spiral_walk_back/walkback_' model_dir = create_log_dir(args, model_id) model_id2 = 'logs/walkback_' model_dir2 = create_log_dir(args, model_id2) print model_dir print model_dir2 + '/' + 'log.jsonl.gz' logger = mimir.Logger(filename=model_dir2 + '/log.jsonl.gz', formatter=None) # TODO batches_per_epoch should not be hard coded lrate = args.lr import sys sys.setrecursionlimit(10000000) args, model_args = parse_args() #trng = RandomStreams(1234) if args.resume_file is not None: print "Resuming training from " + args.resume_file from blocks.scripts import continue_training continue_training(args.resume_file) ## load the training data if args.dataset == 'MNIST': print 'loading MNIST' from fuel.datasets import MNIST dataset_train = MNIST(['train'], sources=('features', )) dataset_test = MNIST(['test'], sources=('features', )) n_colors = 1 spatial_width = 28 elif args.dataset == 'CIFAR10': from fuel.datasets import CIFAR10 dataset_train = CIFAR10(['train'], sources=('features', )) dataset_test = CIFAR10(['test'], sources=('features', )) n_colors = 3 spatial_width = 32 elif args.dataset == "lsun" or args.dataset == "lsunsmall": print "loading lsun class!" from load_lsun import load_lsun print "loading lsun data!" if args.dataset == "lsunsmall": dataset_train, dataset_test = load_lsun(args.batch_size, downsample=True) spatial_width = 32 else: dataset_train, dataset_test = load_lsun(args.batch_size, downsample=False) spatial_width = 64 n_colors = 3 elif args.dataset == "celeba": print "loading celeba data" from fuel.datasets.celeba import CelebA dataset_train = CelebA(which_sets=['train'], which_format="64", sources=('features', ), load_in_memory=False) dataset_test = CelebA(which_sets=['test'], which_format="64", sources=('features', ), load_in_memory=False) spatial_width = 64 n_colors = 3 tr_scheme = SequentialScheme(examples=dataset_train.num_examples, batch_size=args.batch_size) ts_scheme = SequentialScheme(examples=dataset_test.num_examples, batch_size=args.batch_size) train_stream = DataStream.default_stream(dataset_train, iteration_scheme=tr_scheme) test_stream = DataStream.default_stream(dataset_test, iteration_scheme=ts_scheme) dataset_train = train_stream dataset_test = test_stream #epoch_it = train_stream.get_epoch_iterator() elif args.dataset == 'Spiral': print 'loading SPIRAL' train_set = Spiral(num_examples=20000, classes=1, cycles=1., noise=0.01, sources=('features', )) dataset_train = DataStream.default_stream( train_set, iteration_scheme=ShuffledScheme(train_set.num_examples, args.batch_size)) elif args.dataset == 'Circle': print 'loading Circle' train_set = Circle(num_examples=20000, classes=1, cycles=1., noise=0.0, sources=('features', )) dataset_train = DataStream.default_stream( train_set, iteration_scheme=ShuffledScheme(train_set.num_examples, args.batch_size)) iter_per_epoch = train_set.num_examples else: raise ValueError("Unknown dataset %s." % args.dataset) model_options = locals().copy() train_stream = dataset_train shp = next(train_stream.get_epoch_iterator())[0].shape print "got epoch iterator" # make the training data 0 mean and variance 1 # TODO compute mean and variance on full dataset, not minibatch Xbatch = next(train_stream.get_epoch_iterator())[0] scl = 1. 
/ np.sqrt(np.mean((Xbatch - np.mean(Xbatch))**2)) shft = -np.mean(Xbatch * scl) # scale is applied before shift #train_stream = ScaleAndShift(train_stream, scl, shft) #test_stream = ScaleAndShift(test_stream, scl, shft) print 'Building model' params = init_params(model_options) if args.reload_: print "Trying to reload parameters" if os.path.exists(args.saveto_filename): print 'Reloading Parameters' print args.saveto_filename params = load_params(args.saveto_filename, params) tparams = init_tparams(params) print tparams x, cost, start_temperature = build_model(tparams, model_options) inps = [x, start_temperature] x_Data = T.matrix('x_Data', dtype='float32') temperature = T.scalar('temperature', dtype='float32') forward_diffusion = one_step_diffusion(x_Data, model_options, tparams, temperature) #print 'Building f_cost...', #f_cost = theano.function(inps, cost) #print 'Done' print tparams grads = T.grad(cost, wrt=itemlist(tparams)) #get_grads = theano.function(inps, grads) for j in range(0, len(grads)): grads[j] = T.switch(T.isnan(grads[j]), T.zeros_like(grads[j]), grads[j]) # compile the optimizer, the actual computational graph is compiled here lr = T.scalar(name='lr') print 'Building optimizers...', optimizer = args.optimizer f_grad_shared, f_update = getattr(optimizers, optimizer)(lr, tparams, grads, inps, cost) print 'Done' print 'Buiding Sampler....' f_sample = sample(tparams, model_options) print 'Done' uidx = 0 estop = False bad_counter = 0 max_epochs = 4000 batch_index = 0 print 'Number of steps....', args.num_steps print 'Done' count_sample = 1 batch_index = 0 for eidx in xrange(max_epochs): if eidx % 20 == 0: params = unzip(tparams) save_params(params, model_dir + '/' + 'params_' + str(eidx) + '.npz') if eidx == 30: ipdb.set_trace() n_samples = 0 print 'Starting Next Epoch ', eidx for data in train_stream.get_epoch_iterator(): batch_index += 1 n_samples += len(data[0]) uidx += 1 if data[0] is None: print 'No data ' uidx -= 1 continue data_run = data[0] temperature_forward = args.temperature meta_cost = [] for meta_step in range(0, args.meta_steps): meta_cost.append(f_grad_shared(data_run, temperature_forward)) f_update(lrate) if args.meta_steps > 1: data_run, sigma, _, _ = forward_diffusion( data_run, temperature_forward) temperature_forward *= args.temperature_factor cost = sum(meta_cost) / len(meta_cost) if np.isnan(cost) or np.isinf(cost): print 'NaN detected' return 1. 
logger.log({ 'epoch': eidx, 'batch_index': batch_index, 'uidx': uidx, 'training_error': cost }) empty = [] spiral_x = [empty for i in range(args.num_steps)] spiral_corrupted = [] spiral_sampled = [] grad_forward = [] grad_back = [] x_data_time = [] x_tilt_time = [] if batch_index % 8 == 0: count_sample += 1 temperature = args.temperature * (args.temperature_factor **(args.num_steps - 1)) temperature_forward = args.temperature for num_step in range(args.num_steps): if num_step == 0: x_data_time.append(data[0]) plot_images( data[0], model_dir + '/' + 'orig_' + 'epoch_' + str(count_sample) + '_batch_' + str(batch_index)) x_data, mu_data, _, _ = forward_diffusion( data[0], temperature_forward) plot_images( x_data, model_dir + '/' + 'corrupted_' + 'epoch_' + str(count_sample) + '_batch_' + str(batch_index) + '_time_step_' + str(num_step)) x_data_time.append(x_data) temp_grad = np.concatenate( (x_data_time[-2], x_data_time[-1]), axis=1) grad_forward.append(temp_grad) x_data = np.asarray(x_data).astype('float32').reshape( args.batch_size, INPUT_SIZE) spiral_corrupted.append(x_data) mu_data = np.asarray(mu_data).astype( 'float32').reshape(args.batch_size, INPUT_SIZE) mu_data = mu_data.reshape(args.batch_size, 2) else: x_data_time.append(x_data) x_data, mu_data, _, _ = forward_diffusion( x_data, temperature_forward) plot_images( x_data, model_dir + '/' + 'corrupted_' + 'epoch_' + str(count_sample) + '_batch_' + str(batch_index) + '_time_step_' + str(num_step)) x_data = np.asarray(x_data).astype('float32').reshape( args.batch_size, INPUT_SIZE) spiral_corrupted.append(x_data) mu_data = np.asarray(mu_data).astype( 'float32').reshape(args.batch_size, INPUT_SIZE) mu_data = mu_data.reshape(args.batch_size, 2) x_data_time.append(x_data) temp_grad = np.concatenate( (x_data_time[-2], x_data_time[-1]), axis=1) grad_forward.append(temp_grad) temperature_forward = temperature_forward * args.temperature_factor mean_sampled = x_data.mean() var_sampled = x_data.var() x_temp2 = data[0].reshape(args.batch_size, 2) plot_2D( spiral_corrupted, args.num_steps, model_dir + '/' + 'corrupted_' + 'epoch_' + str(count_sample) + '_batch_' + str(batch_index)) plot_2D( x_temp2, 1, model_dir + '/' + 'orig_' + 'epoch_' + str(count_sample) + '_batch_index_' + str(batch_index)) plot_grad( grad_forward, model_dir + '/' + 'grad_forward_' + 'epoch_' + str(count_sample) + '_batch_' + str(batch_index)) for i in range(args.num_steps + args.extra_steps): x_tilt_time.append(x_data) x_data, sampled_mean = f_sample(x_data, temperature) plot_images( x_data, model_dir + '/' + 'sampled_' + 'epoch_' + str(count_sample) + '_batch_' + str(batch_index) + '_time_step_' + str(i)) x_tilt_time.append(x_data) temp_grad = np.concatenate( (x_tilt_time[-2], x_tilt_time[-1]), axis=1) grad_back.append(temp_grad) ###print 'Recons, On step number, using temperature', i, temperature x_data = np.asarray(x_data).astype('float32') x_data = x_data.reshape(args.batch_size, INPUT_SIZE) if temperature == args.temperature: temperature = temperature else: temperature /= args.temperature_factor plot_grad( grad_back, model_dir + '/' + 'grad_back_' + 'epoch_' + str(count_sample) + '_batch_' + str(batch_index)) plot_2D( x_tilt_time, args.num_steps, model_dir + '/' + 'sampled_' + 'epoch_' + str(count_sample) + '_batch_' + str(batch_index)) s = np.random.normal(mean_sampled, var_sampled, [args.batch_size, 2]) x_sampled = s temperature = args.temperature * (args.temperature_factor **(args.num_steps - 1)) x_data = np.asarray(x_sampled).astype('float32') for i in 
range(args.num_steps + args.extra_steps): x_data, sampled_mean = f_sample(x_data, temperature) spiral_sampled.append(x_data) x_data = np.asarray(x_data).astype('float32') x_data = x_data.reshape(args.batch_size, INPUT_SIZE) if temperature == args.temperature: temperature = temperature else: temperature /= args.temperature_factor plot_2D( spiral_sampled, args.num_steps, model_dir + '/' + 'inference_' + 'epoch_' + str(count_sample) + '_batch_' + str(batch_index)) ipdb.set_trace()
def train(dim_word=100, # word vector dimensionality ctx_dim=512, # context vector dimensionality dim=1000, # the number of LSTM units attn_type='stochastic', # [see section 4 from paper] n_layers_att=1, # number of layers used to compute the attention weights n_layers_out=1, # number of layers used to compute logit n_layers_lstm=1, # number of lstm layers n_layers_init=1, # number of layers to initialize LSTM at time 0 lstm_encoder=False, # if True, run bidirectional LSTM on input units prev2out=False, # Feed previous word into logit ctx2out=False, # Feed attention weighted ctx into logit alpha_entropy_c=0.002, # hard attn param RL_sumCost=True, # hard attn param semi_sampling_p=0.5, # hard attn param temperature=1., # hard attn param patience=10, max_epochs=5000, dispFreq=100, decay_c=0., # weight decay coeff alpha_c=0., # doubly stochastic coeff lrate=0.01, # used only for SGD selector=False, # selector (see paper) n_words=10000, # vocab size maxlen=100, # maximum length of the description optimizer='rmsprop', batch_size = 16, valid_batch_size = 16, saveto='model.npz', # relative path of saved model file validFreq=1000, saveFreq=1000, # save the parameters after every saveFreq updates sampleFreq=100, # generate some samples after every sampleFreq updates data_path='./data', # path to find data dataset='flickr8k', dictionary=None, # word dictionary use_dropout=False, # setting this true turns on dropout at various points use_dropout_lstm=False, # dropout on lstm gates reload_=False, save_per_epoch=False): # this saves down the model every epoch # hyperparam dict model_options = locals().copy() model_options = validate_options(model_options) # reload options if reload_ and os.path.exists(saveto): print "Reloading options" with open('%s.pkl'%saveto, 'rb') as f: model_options = pkl.load(f) print "Using the following parameters:" print model_options print 'Loading data' load_data, prepare_data = get_dataset(dataset) train, valid, test, worddict = load_data(path=data_path) if dataset == 'coco': valid, _ = valid # the second one contains all the validation data # index 0 and 1 always code for the end of sentence and unknown token word_idict = dict() for kk, vv in worddict.iteritems(): word_idict[vv] = kk word_idict[0] = '<eos>' word_idict[1] = 'UNK' # Initialize (or reload) the parameters using 'model_options' # then build the Theano graph print 'Building model' params = init_params(model_options) if reload_ and os.path.exists(saveto): print "Reloading model" params = load_params(saveto, params) # numpy arrays -> theano shared variables tparams = init_tparams(params) # In order, we get: # 1) trng - theano random number generator # 2) use_noise - flag that turns on dropout # 3) inps - inputs for f_grad_shared # 4) cost - log likelihood for each sentence # 5) opts_out - optional outputs (e.g selector) trng, use_noise, \ inps, alphas, alphas_sample,\ cost, \ opt_outs = \ build_model(tparams, model_options) # To sample, we use beam search: 1) f_init is a function that initializes # the LSTM at time 0 [see top right of page 4], 2) f_next returns the distribution over # words and also the new "initial state/memory" see equation print 'Building sampler' f_init, f_next = build_sampler(tparams, model_options, use_noise, trng) # we want the cost without any the regularizers # define the log probability f_log_probs = theano.function(inps, -cost, profile=False, updates=opt_outs['attn_updates'] if model_options['attn_type']=='stochastic' else None, allow_input_downcast=True) # Define the cost function + 
Regularization cost = cost.mean() # add L2 regularization costs if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): weight_decay += (vv ** 2).sum() weight_decay *= decay_c cost += weight_decay # Doubly stochastic regularization if alpha_c > 0.: alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c') alpha_reg = alpha_c * ((1.-alphas.sum(0))**2).sum(0).mean() cost += alpha_reg hard_attn_updates = [] # Backprop! if model_options['attn_type'] == 'deterministic': grads = tensor.grad(cost, wrt=itemlist(tparams)) else: # shared variables for hard attention baseline_time = theano.shared(numpy.float32(0.), name='baseline_time') opt_outs['baseline_time'] = baseline_time alpha_entropy_c = theano.shared(numpy.float32(alpha_entropy_c), name='alpha_entropy_c') alpha_entropy_reg = alpha_entropy_c * (alphas*tensor.log(alphas)).mean() # [see Section 4.1: Stochastic "Hard" Attention for derivation of this learning rule] if model_options['RL_sumCost']: grads = tensor.grad(cost, wrt=itemlist(tparams), disconnected_inputs='raise', known_grads={alphas:(baseline_time-opt_outs['masked_cost'].mean(0))[None,:,None]/10.* (-alphas_sample/alphas) + alpha_entropy_c*(tensor.log(alphas) + 1)}) else: grads = tensor.grad(cost, wrt=itemlist(tparams), disconnected_inputs='raise', known_grads={alphas:opt_outs['masked_cost'][:,:,None]/10.* (alphas_sample/alphas) + alpha_entropy_c*(tensor.log(alphas) + 1)}) # [equation on bottom left of page 5] hard_attn_updates += [(baseline_time, baseline_time * 0.9 + 0.1 * opt_outs['masked_cost'].mean())] # updates from scan hard_attn_updates += opt_outs['attn_updates'] # to getthe cost after regularization or the gradients, use this # f_cost = theano.function([x, mask, ctx], cost, profile=False) # f_grad = theano.function([x, mask, ctx], grads, profile=False) # f_grad_shared computes the cost and updates adaptive learning rate variables # f_update updates the weights of the model lr = tensor.scalar(name='lr') f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost, hard_attn_updates) print 'Optimization' # [See note in section 4.3 of paper] train_iter = HomogeneousData(train, batch_size=batch_size, maxlen=maxlen) if valid: kf_valid = KFold(len(valid[0]), n_folds=len(valid[0])/valid_batch_size, shuffle=False) if test: kf_test = KFold(len(test[0]), n_folds=len(test[0])/valid_batch_size, shuffle=False) # history_errs is a bare-bones training log that holds the validation and test error history_errs = [] # reload history if reload_ and os.path.exists(saveto): history_errs = numpy.load(saveto)['history_errs'].tolist() best_p = None bad_counter = 0 if validFreq == -1: validFreq = len(train[0])/batch_size if saveFreq == -1: saveFreq = len(train[0])/batch_size if sampleFreq == -1: sampleFreq = len(train[0])/batch_size uidx = 0 estop = False for eidx in xrange(max_epochs): n_samples = 0 print 'Epoch ', eidx for caps in train_iter: n_samples += len(caps) uidx += 1 # turn on dropout use_noise.set_value(1.) 
# preprocess the caption, recording the # time spent to help detect bottlenecks pd_start = time.time() x, mask, ctx = prepare_data(caps, train[1], worddict, maxlen=maxlen, n_words=n_words) pd_duration = time.time() - pd_start if x is None: print 'Minibatch with zero sample under length ', maxlen continue # get the cost for the minibatch, and update the weights ud_start = time.time() cost = f_grad_shared(x, mask, ctx) f_update(lrate) ud_duration = time.time() - ud_start # some monitoring for each mini-batch # Numerical stability check if numpy.isnan(cost) or numpy.isinf(cost): print 'NaN detected' return 1., 1., 1. if numpy.mod(uidx, dispFreq) == 0: print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'PD ', pd_duration, 'UD ', ud_duration # Checkpoint if numpy.mod(uidx, saveFreq) == 0: print 'Saving...', if best_p is not None: params = copy.copy(best_p) else: params = unzip(tparams) numpy.savez(saveto, history_errs=history_errs, **params) pkl.dump(model_options, open('%s.pkl'%saveto, 'wb')) print 'Done' # Print a generated sample as a sanity check if numpy.mod(uidx, sampleFreq) == 0: # turn off dropout first use_noise.set_value(0.) x_s = x mask_s = mask ctx_s = ctx # generate and decode the a subset of the current training batch for jj in xrange(numpy.minimum(10, len(caps))): sample, score = gen_sample(tparams, f_init, f_next, ctx_s[jj], model_options, trng=trng, k=5, maxlen=30, stochastic=False) # Decode the sample from encoding back to words print 'Truth ',jj,': ', for vv in x_s[:,jj]: if vv == 0: break if vv in word_idict: print word_idict[vv], else: print 'UNK', print for kk, ss in enumerate([sample[0]]): print 'Sample (', kk,') ', jj, ': ', for vv in ss: if vv == 0: break if vv in word_idict: print word_idict[vv], else: print 'UNK', print # Log validation loss + checkpoint the model with the best validation log likelihood if numpy.mod(uidx, validFreq) == 0: use_noise.set_value(0.) train_err = 0 valid_err = 0 test_err = 0 if valid: valid_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, valid, kf_valid).mean() if test: test_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, test, kf_test).mean() history_errs.append([valid_err, test_err]) # the model with the best validation long likelihood is saved seperately with a different name if uidx == 0 or valid_err <= numpy.array(history_errs)[:,0].min(): best_p = unzip(tparams) print 'Saving model with best validation ll' params = copy.copy(best_p) params = unzip(tparams) numpy.savez(saveto+'_bestll', history_errs=history_errs, **params) bad_counter = 0 # abort training if perplexity has been increasing for too long if eidx > patience and len(history_errs) > patience and valid_err >= numpy.array(history_errs)[:-patience,0].min(): bad_counter += 1 if bad_counter > patience: print 'Early Stop!' estop = True break print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err print 'Seen %d samples' % n_samples if estop: break if save_per_epoch: numpy.savez(saveto + '_epoch_' + str(eidx + 1), history_errs=history_errs, **unzip(tparams)) # use the best nll parameters for final checkpoint (if they exist) if best_p is not None: zipp(best_p, tparams) use_noise.set_value(0.) 
train_err = 0 valid_err = 0 test_err = 0 if valid: valid_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, valid, kf_valid) if test: test_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, test, kf_test) print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err params = copy.copy(best_p) numpy.savez(saveto, zipped_params=best_p, train_err=train_err, valid_err=valid_err, test_err=test_err, history_errs=history_errs, **params) return train_err, valid_err, test_err
decoding_embedding_size = 64
# RNN Size
rnn_size = 64
# Number of Layers
num_layers = 2
# Learning Rate
learning_rate = 0.0001
# Dropout Keep Probability
keep_probability = 0.8
display_step = 10

save_path = 'checkpoints/sup/actor/dev'
save_path_critic = 'checkpoints/sup/critic/dev'

(source_int_text, target_int_text), (source_vocab_to_int, target_vocab_to_int), (
    source_int_to_vocab, target_int_to_vocab) = util.load_preprocess('preprocess.p')
load_path_actor = util.load_params('params_actor_sup.p')

source_train = util.read_list('source_train.npy')
target_train = util.read_list('target_train.npy')

valid_size = batch_size * 10
train_source = source_train[valid_size:]
train_target = target_train[valid_size:]
valid_source = source_train[:valid_size]
valid_target = target_train[:valid_size]

test_acc_list = []

train_graph = tf.Graph()
critic_graph = tf.Graph()
actor_graph = tf.Graph()
def train(args, model_args, lrate): model_id = '/data/lisatmp4/anirudhg/minst_walk_back/walkback_' model_dir = create_log_dir(args, model_id) model_id2 = 'walkback_' model_dir2 = create_log_dir(args, model_id2) print model_dir logger = mimir.Logger(filename=model_dir2 + '/' + model_id2 + 'log.jsonl.gz', formatter=None) # TODO batches_per_epoch should not be hard coded lrate = args.lr import sys sys.setrecursionlimit(10000000) args, model_args = parse_args() #trng = RandomStreams(1234) if args.resume_file is not None: print "Resuming training from " + args.resume_file from blocks.scripts import continue_training continue_training(args.resume_file) ## load the training data if args.dataset == 'MNIST': print 'loading MNIST' from fuel.datasets import MNIST dataset_train = MNIST(['train'], sources=('features', )) dataset_test = MNIST(['test'], sources=('features', )) n_colors = 1 spatial_width = 28 elif args.dataset == 'CIFAR10': from fuel.datasets import CIFAR10 dataset_train = CIFAR10(['train'], sources=('features', )) dataset_test = CIFAR10(['test'], sources=('features', )) n_colors = 3 spatial_width = 32 elif args.dataset == 'Spiral': print 'loading SPIRAL' train_set = Spiral(num_examples=100000, classes=1, cycles=2., noise=0.01, sources=('features', )) dataset_train = DataStream.default_stream( train_set, iteration_scheme=ShuffledScheme(train_set.num_examples, args.batch_size)) else: raise ValueError("Unknown dataset %s." % args.dataset) model_options = locals().copy() train_stream = Flatten( DataStream.default_stream(dataset_train, iteration_scheme=ShuffledScheme( examples=dataset_train.num_examples, batch_size=args.batch_size))) shp = next(train_stream.get_epoch_iterator())[0].shape # make the training data 0 mean and variance 1 # TODO compute mean and variance on full dataset, not minibatch Xbatch = next(train_stream.get_epoch_iterator())[0] scl = 1. / np.sqrt(np.mean((Xbatch - np.mean(Xbatch))**2)) shft = -np.mean(Xbatch * scl) # scale is applied before shift #train_stream = ScaleAndShift(train_stream, scl, shft) #test_stream = ScaleAndShift(test_stream, scl, shft) print 'Building model' params = init_params(model_options) if args.reload_ and os.path.exists(args.saveto_filename): print 'Reloading Parameters' print args.saveto_filename params = load_params(args.saveto_filename, params) tparams = init_tparams(params) ''' x = T.matrix('x', dtype='float32') f=transition_operator(tparams, model_options, x, 1) for data in train_stream.get_epoch_iterator(): print data[0] a = f(data[0]) print a ipdb.set_trace() ''' x, cost = build_model(tparams, model_options) inps = [x] x_Data = T.matrix('x_Data', dtype='float32') temperature = T.scalar('temperature', dtype='float32') forward_diffusion = one_step_diffusion(x_Data, model_options, tparams, temperature) print 'Building f_cost...', f_cost = theano.function(inps, cost) print 'Done' print tparams grads = T.grad(cost, wrt=itemlist(tparams)) get_grads = theano.function(inps, grads) for j in range(0, len(grads)): grads[j] = T.switch(T.isnan(grads[j]), T.zeros_like(grads[j]), grads[j]) # compile the optimizer, the actual computational graph is compiled here lr = T.scalar(name='lr') print 'Building optimizers...', optimizer = args.optimizer f_grad_shared, f_update = getattr(optimizers, optimizer)(lr, tparams, grads, inps, cost) print 'Done' print 'Buiding Sampler....' f_sample = sample(tparams, model_options) print 'Done' uidx = 0 estop = False bad_counter = 0 max_epochs = 4000 batch_index = 0 print 'Number of steps....' 
print args.num_steps print 'Done' count_sample = 1 for eidx in xrange(max_epochs): n_samples = 0 print 'Starting Next Epoch ', eidx for data in train_stream.get_epoch_iterator(): batch_index += 1 n_samples += len(data[0]) uidx += 1 if data[0] is None: print 'No data ' uidx -= 1 continue ud_start = time.time() cost = f_grad_shared(data[0]) f_update(lrate) ud = time.time() - ud_start if batch_index % 1 == 0: print 'Cost is this', cost count_sample += 1 from impainting import change_image, inpainting train_temp = data[0] print data[0].shape change_image(train_temp.reshape(args.batch_size, 1, 28, 28), 3) train_temp = train_temp.reshape(args.batch_size, 784) output = inpainting(train_temp) change_image(output.reshape(args.batch_size, 1, 28, 28), 1) reverse_time( scl, shft, output, model_dir + '/' + 'impainting_orig_' + 'epoch_' + str(count_sample) + '_batch_index_' + str(batch_index)) x_data = np.asarray(output).astype('float32') temperature = args.temperature * (args.temperature_factor **(args.num_steps - 1)) temperature = args.temperature #* (args.temperature_factor ** (args.num_steps -1 )) orig_impainted_data = np.asarray(data[0]).astype('float32') for i in range(args.num_steps + args.extra_steps + 5): x_data, sampled, sampled_activation, sampled_preactivation = f_sample( x_data, temperature) print 'Impainting using temperature', i, temperature x_data = do_half_image(x_data, orig_impainted_data) reverse_time( scl, shft, x_data, model_dir + '/' + 'impainting_orig_' + 'epoch_' + str(count_sample) + '_batch_index_' + str(batch_index) + 'step_' + str(i)) x_data = np.asarray(x_data).astype('float32') x_data = x_data.reshape(args.batch_size, INPUT_SIZE) if temperature == args.temperature: temperature = temperature else: temperature = temperature #temperature /= args.temperature_factor ipdb.set_trace()
opts = parser.parse_args()
print(opts, file=sys.stderr)

assert opts.model_type in ['single', 'multi-res']

FITNESS_EVAL_BATCHES = 10
if opts.popn_size % FITNESS_EVAL_BATCHES != 0:
    raise Exception("only support population size that's"
                    " a multiple of %d" % FITNESS_EVAL_BATCHES)

dataset = data.dataset(split=opts.split, batch_size=opts.num_examples)
for x, y_true in dataset:
    break

params = u.load_params(opts.params)


@jit
def inv_mean_loss(member):
    if opts.model_type == 'single':
        # member denotes a channel mask we want to apply to
        # entire x batch
        model = models.construct_single_trunk_model()
        mask_tile_shape = list(x.shape)
        mask_tile_shape[-1] = 1
        mask = jnp.tile(member, mask_tile_shape)
        logits = model.apply(params, x * mask)
    else:  # multi-res
        # member denotes channel selection handled in model
        model = models.construct_multires_model()
# Read the command line arguments
parser = argparse.ArgumentParser()
parser.add_argument(
    '--mode', choices=['train', 'test', 'render'], default='train')
parser.add_argument(
    '--params', type=str, default='params.yaml',
    help='Source for experiment parameters (will be copied to log directory).')
parser.add_argument(
    '--episodes', type=int, default=1,
    help='Number of episodes during testing.')
args = parser.parse_args()

# Set the experiment parameters
PARAMS_FILE = str(Path(os.path.join(PARAMS_DIR, args.params)))
params = load_params(PARAMS_FILE)
params['mode'] = args.mode
params['test_episodes'] = args.episodes
params['random_seed'] = seed_generator(params['random_seed'], params['runs'])
params['start_time'] = start_time

# Set up directory structure if training
# if params['mode'] == 'train':
# Create experiment dir
params['exp_dir'] = create_dir(
    Path(os.path.join(
        LOG_DIR, params['env_type'], params['env_name'],
        str(time.strftime("%Y-%m-%d_%H-%M")))))
# Save experiment parameters to log dir
def main():
    params = load_params()
    m = get_model(conv_units=params['model']['conv_units'])
    m.summary()

    training_images, training_labels, testing_images, testing_labels = read_dataset(
        DATASET_FILE)

    assert training_images.shape[0] + testing_images.shape[0] == 70000
    assert training_labels.shape[0] + testing_labels.shape[0] == 70000

    training_images = normalize(training_images)
    testing_images = normalize(testing_images)

    training_labels = tf.keras.utils.to_categorical(training_labels,
                                                    num_classes=10,
                                                    dtype="float32")
    testing_labels = tf.keras.utils.to_categorical(testing_labels,
                                                   num_classes=10,
                                                   dtype="float32")

    # We use the test set as validation for simplicity
    x_train = training_images
    x_valid = testing_images
    y_train = training_labels
    y_valid = testing_labels

    history = m.fit(
        x_train,
        y_train,
        batch_size=BATCH_SIZE,
        epochs=params["train"]["epochs"],
        verbose=1,
        validation_data=(x_valid, y_valid),
        callbacks=[DvcLiveCallback(model_file=f"{OUTPUT_DIR}/model.h5")],
    )

    metrics_dict = m.evaluate(
        testing_images,
        testing_labels,
        batch_size=BATCH_SIZE,
        return_dict=True,
    )

    with open(METRICS_FILE, "w") as f:
        f.write(json.dumps(metrics_dict))

    misclassified = {}

    # predictions for the confusion matrix
    y_prob = m.predict(x_valid)
    y_pred = y_prob.argmax(axis=-1)

    os.makedirs("plots")
    with open("plots/confusion.csv", "w") as f:
        f.write("actual,predicted\n")
        sx = y_valid.shape[0]
        for i in range(sx):
            actual = y_valid[i].argmax()
            predicted = y_pred[i]
            f.write(f"{actual},{predicted}\n")
            misclassified[(actual, predicted)] = x_valid[i]

    # find misclassified examples and generate a confusion table image
    confusion_out = create_image_matrix(misclassified)
    imageio.imwrite("plots/confusion.png", confusion_out)
import tensorflow as tf
import numpy as np
import util

_, vocab_to_int, int_to_vocab, token_dict = util.load_preprocess()
seq_length, load_dir = util.load_params()


def get_tensors(loaded_graph):
    """
    Get input, initial state, final state, and probabilities tensor from <loaded_graph>
    :param loaded_graph: TensorFlow graph loaded from file
    :return: Tuple (InputTensor, InitialStateTensor, FinalStateTensor, ProbsTensor)
    """
    with loaded_graph.as_default():
        inputs = tf.get_default_graph().get_tensor_by_name('input:0')
        initial_state = tf.get_default_graph().get_tensor_by_name(
            'initial_state:0')
        final_state = tf.get_default_graph().get_tensor_by_name(
            'final_state:0')
        prob = tf.get_default_graph().get_tensor_by_name('probs:0')
    return inputs, initial_state, final_state, prob


def pick_word(probabilities, int_to_vocab):
    """
    Pick the next word in the generated text
    :param probabilities: Probabilities of the next word
    :param int_to_vocab: Dictionary of word ids as the keys and words as the values
    :return: String of the predicted word
    """
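    # The body of pick_word is not included in the snippet above; a minimal
    # completion (an assumption, not the original code) is to sample a word id
    # from the predicted distribution and map it back to its string:
    word_id = np.random.choice(len(probabilities), p=probabilities)
    return int_to_vocab[word_id]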
import tensorflow as tf
import numpy as np
import util

# Batch Size
batch_size = 128

_, (source_vocab_to_int, target_vocab_to_int), (
    source_int_to_vocab, target_int_to_vocab) = util.load_preprocess('preprocess.p')
load_path_sup = util.load_params('params_actor_sup.p')
load_path_actor = util.load_params('params_actor_reinforce.p')

source_test = util.read_list('source_test.npy')
target_test = util.read_list('target_test.npy')

test_acc_list = []

loaded_graph_sup = tf.Graph()
loaded_graph_actor = tf.Graph()

sup_sess = tf.Session(graph=loaded_graph_sup)
actor_sess = tf.Session(graph=loaded_graph_actor)

with sup_sess.as_default():
    with loaded_graph_sup.as_default():
        # Load saved model and restore the saved variables
        loader = tf.train.import_meta_graph(load_path_sup + '.meta')
        loader.restore(sup_sess, load_path_sup)
        input_data = loaded_graph_sup.get_tensor_by_name('input:0')
        logits = loaded_graph_sup.get_tensor_by_name('predictions:0')
        target_sequence_length = loaded_graph_sup.get_tensor_by_name(
def train(dim_word=100,  # word vector dimensionality
          ctx_dim=512,  # context vector dimensionality
          dim=1000,  # the number of LSTM units
          attn_type='deterministic',  # [see section 4 from paper]
          n_layers_att=1,  # number of layers used to compute the attention weights
          n_layers_out=1,  # number of layers used to compute logit
          n_layers_lstm=1,  # number of lstm layers
          n_layers_init=1,  # number of layers to initialize LSTM at time 0
          lstm_encoder=False,  # if True, run bidirectional LSTM on input units
          prev2out=False,  # feed previous word into logit
          ctx2out=False,  # feed attention weighted ctx into logit
          alpha_entropy_c=0.002,  # hard attn param
          RL_sumCost=False,  # hard attn param
          semi_sampling_p=0.5,  # hard attn param
          temperature=1.,  # hard attn param
          patience=10,
          max_epochs=5000,
          dispFreq=100,
          decay_c=0.,  # weight decay coeff
          alpha_c=0.,  # doubly stochastic coeff
          lrate=0.01,  # used only for SGD
          selector=False,  # selector (see paper)
          n_words=10000,  # vocab size
          maxlen=100,  # maximum length of the description
          optimizer='rmsprop',
          batch_size=16,
          valid_batch_size=2,  # changed from 16
          saveto='model.npz',  # relative path of saved model file
          validFreq=1000,
          saveFreq=1000,  # save the parameters after every saveFreq updates
          sampleFreq=5,  # generate some samples after every sampleFreq updates
          data_path='./data',  # path to find data
          dataset='flickr30k',
          dictionary=None,  # word dictionary
          use_dropout=False,  # setting this true turns on dropout at various points
          use_dropout_lstm=False,  # dropout on lstm gates
          reload_=False,
          save_per_epoch=False):  # this saves down the model every epoch

    # hyperparam dict
    model_options = locals().copy()
    model_options = validate_options(model_options)

    # reload options
    if reload_ and os.path.exists(saveto):
        print "Reloading options"
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    print "Using the following parameters:"
    print model_options

    print 'Loading data'
    load_data, prepare_data = get_dataset(dataset)
    train, valid, test, worddict = load_data(path=data_path)

    # index 0 and 1 always code for the end of sentence and unknown token
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # Initialize (or reload) the parameters using 'model_options'
    # then build the Theano graph
    print 'Building model'
    params = init_params(model_options)
    if reload_ and os.path.exists(saveto):
        print "Reloading model"
        params = load_params(saveto, params)

    # numpy arrays -> theano shared variables
    tparams = init_tparams(params)

    # In order, we get:
    #   1) trng - theano random number generator
    #   2) use_noise - flag that turns on dropout
    #   3) inps - inputs for f_grad_shared
    #   4) cost - log likelihood for each sentence
    #   5) opt_outs - optional outputs (e.g. selector)
    trng, use_noise, \
        inps, alphas, alphas_sample, \
        cost, \
        opt_outs = \
        build_model(tparams, model_options)

    # To sample, we use beam search: 1) f_init is a function that initializes
    # the LSTM at time 0 [see top right of page 4], 2) f_next returns the distribution over
    # words and also the new "initial state/memory" see equation
    print 'Building sampler'
    f_init, f_next = build_sampler(tparams, model_options, use_noise, trng)

    # we want the cost without any of the regularizers
    # define the log probability
    f_log_probs = theano.function(inps, -cost, profile=False,
                                  updates=opt_outs['attn_updates']
                                  if model_options['attn_type'] == 'stochastic'
                                  else None,
                                  allow_input_downcast=True)

    # Define the cost function + regularization
    cost = cost.mean()
    # add L2 regularization costs
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # Doubly stochastic regularization
    if alpha_c > 0.:
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = alpha_c * ((1.-alphas.sum(0))**2).sum(0).mean()
        cost += alpha_reg

    hard_attn_updates = []
    # Backprop!
    if model_options['attn_type'] == 'deterministic':
        grads = tensor.grad(cost, wrt=itemlist(tparams))
    else:
        # shared variables for hard attention
        baseline_time = theano.shared(numpy.float32(0.), name='baseline_time')
        opt_outs['baseline_time'] = baseline_time
        alpha_entropy_c = theano.shared(numpy.float32(alpha_entropy_c), name='alpha_entropy_c')
        alpha_entropy_reg = alpha_entropy_c * (alphas*tensor.log(alphas)).mean()
        # [see Section 4.1: Stochastic "Hard" Attention for derivation of this learning rule]
        if model_options['RL_sumCost']:
            grads = tensor.grad(cost, wrt=itemlist(tparams),
                                disconnected_inputs='raise',
                                known_grads={alphas: (baseline_time-opt_outs['masked_cost'].mean(0))[None, :, None]/10. *
                                             (-alphas_sample/alphas) + alpha_entropy_c*(tensor.log(alphas) + 1)})
        else:
            grads = tensor.grad(cost, wrt=itemlist(tparams),
                                disconnected_inputs='raise',
                                known_grads={alphas: opt_outs['masked_cost'][:, :, None]/10. *
                                             (alphas_sample/alphas) + alpha_entropy_c*(tensor.log(alphas) + 1)})
        # [equation on bottom left of page 5]
        hard_attn_updates += [(baseline_time, baseline_time * 0.9 + 0.1 * opt_outs['masked_cost'].mean())]
        # updates from scan
        hard_attn_updates += opt_outs['attn_updates']

    # to get the cost after regularization or the gradients, use this
    # f_cost = theano.function([x, mask, ctx], cost, profile=False)
    # f_grad = theano.function([x, mask, ctx], grads, profile=False)

    # f_grad_shared computes the cost and updates adaptive learning rate variables
    # f_update updates the weights of the model
    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads, inps, cost, hard_attn_updates)

    print 'Optimization'

    # [See note in section 4.3 of paper]
    train_iter = HomogeneousData(train, batch_size=batch_size, maxlen=maxlen)

    if valid:
        kf_valid = KFold(len(valid[0]), n_folds=len(valid[0])/valid_batch_size, shuffle=False)
    if test:
        kf_test = KFold(len(test[0]), n_folds=len(test[0])/valid_batch_size, shuffle=False)

    # history_errs is a bare-bones training log that holds the validation and test error
    history_errs = []
    # reload history
    if reload_ and os.path.exists(saveto):
        history_errs = numpy.load(saveto)['history_errs'].tolist()
    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0])/batch_size
    if saveFreq == -1:
        saveFreq = len(train[0])/batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0])/batch_size

    uidx = 0
    estop = False
    for eidx in xrange(max_epochs):
        n_samples = 0

        print 'Epoch ', eidx

        for caps in train_iter:
            n_samples += len(caps)
            uidx += 1
            # turn on dropout
            use_noise.set_value(1.)

            # preprocess the caption, recording the
            # time spent to help detect bottlenecks
            pd_start = time.time()
            x, mask, ctx = prepare_data(caps,
                                        train[1],
                                        worddict,
                                        maxlen=maxlen,
                                        n_words=n_words)
            pd_duration = time.time() - pd_start

            if x is None:
                print 'Minibatch with zero sample under length ', maxlen
                continue

            # get the cost for the minibatch, and update the weights
            ud_start = time.time()
            cost = f_grad_shared(x, mask, ctx)
            f_update(lrate)
            ud_duration = time.time() - ud_start  # some monitoring for each mini-batch

            # Numerical stability check
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'PD ', pd_duration, 'UD ', ud_duration

            # Checkpoint
            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',
                if best_p is not None:
                    params = copy.copy(best_p)
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_errs=history_errs, **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print 'Done'

            # Print a generated sample as a sanity check
            if numpy.mod(uidx, sampleFreq) == 0:
                # turn off dropout first
                use_noise.set_value(0.)
                x_s = x
                mask_s = mask
                ctx_s = ctx
                # generate and decode a subset of the current training batch
                for jj in xrange(numpy.minimum(10, len(caps))):
                    sample, score = gen_sample(tparams, f_init, f_next, ctx_s[jj], model_options,
                                               trng=trng, k=5, maxlen=30, stochastic=False)
                    # Decode the sample from encoding back to words
                    print 'Truth ', jj, ': ',
                    for vv in x_s[:, jj]:
                        if vv == 0:
                            break
                        if vv in word_idict:
                            print word_idict[vv],
                        else:
                            print 'UNK',
                    print
                    for kk, ss in enumerate([sample[0]]):
                        print 'Sample (', kk, ') ', jj, ': ',
                        for vv in ss:
                            if vv == 0:
                                break
                            if vv in word_idict:
                                print word_idict[vv],
                            else:
                                print 'UNK',
                        print

            # Log validation loss + checkpoint the model with the best validation log likelihood
            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                train_err = 0
                valid_err = 0
                test_err = 0

                if valid:
                    valid_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, valid, kf_valid).mean()
                if test:
                    test_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, test, kf_test).mean()

                history_errs.append([valid_err, test_err])

                # the model with the best validation log likelihood is saved separately with a different name
                if uidx == 0 or valid_err <= numpy.array(history_errs)[:, 0].min():
                    best_p = unzip(tparams)
                    print 'Saving model with best validation ll'
                    params = copy.copy(best_p)
                    params = unzip(tparams)
                    numpy.savez(saveto+'_bestll', history_errs=history_errs, **params)
                    bad_counter = 0

                # abort training if perplexity has been increasing for too long
                if eidx > patience and len(history_errs) > patience and valid_err >= numpy.array(history_errs)[:-patience, 0].min():
                    bad_counter += 1
                    if bad_counter > patience:
                        print 'Early Stop!'
                        estop = True
                        break

                print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err

        print 'Seen %d samples' % n_samples

        if estop:
            break

        if save_per_epoch:
            numpy.savez(saveto + '_epoch_' + str(eidx + 1), history_errs=history_errs, **unzip(tparams))

    # use the best nll parameters for final checkpoint (if they exist)
    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    train_err = 0
    valid_err = 0
    test_err = 0
    if valid:
        valid_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, valid, kf_valid)
    if test:
        test_err = -pred_probs(f_log_probs, model_options, worddict, prepare_data, test, kf_test)

    print 'Train ', train_err, 'Valid ', valid_err, 'Test ', test_err

    params = copy.copy(best_p)
    numpy.savez(saveto, zipped_params=best_p, train_err=train_err,
                valid_err=valid_err, test_err=test_err, history_errs=history_errs,
                **params)

    return train_err, valid_err, test_err
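# The doubly stochastic regularizer used above, alpha_c * ((1. - alphas.sum(0))**2).sum(0).mean(),
# pushes the attention weights to sum to roughly 1 over time at each annotation location, so every
# location gets attended to at some point. A minimal numpy sketch of the same quantity, assuming
# alphas has shape (timesteps, batch, locations); the names here are illustrative only and not part
# of the training pipeline.
import numpy as np

def doubly_stochastic_penalty(alphas, alpha_c=1.0):
    # alphas: (T, B, L) attention weights; each timestep's weights over L locations sum to 1
    per_location_mass = alphas.sum(axis=0)  # (B, L): total attention each location received over time
    return alpha_c * ((1. - per_location_mass) ** 2).sum(axis=0).mean()

# toy check: uniform attention over 4 locations for 4 timesteps gives zero penalty
toy = np.full((4, 2, 4), 0.25, dtype='float32')
assert abs(doubly_stochastic_penalty(toy)) < 1e-6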
import tensorflow as tf
import numpy as np
import util

_, (source_vocab_to_int, target_vocab_to_int), (source_int_to_vocab, target_int_to_vocab) = util.load_preprocess()
load_path = util.load_params()


# To feed a sentence into the model for translation, you first need to preprocess it
def sentence_to_seq(sentence, vocab_to_int):
    """
    Convert a sentence to a sequence of ids
    :param sentence: String
    :param vocab_to_int: Dictionary to go from the words to an id
    :return: List of word ids
    """
    sentence = sentence.lower()
    sentence_list = sentence.split()
    word_ids = []
    for val in sentence_list:
        word_ids.append(vocab_to_int.get(val, vocab_to_int['<UNK>']))
    return word_ids


# Translate
translate_sentence = 'he saw a old yellow truck .'
translate_sentence = sentence_to_seq(translate_sentence, source_vocab_to_int)
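# Quick sanity check of sentence_to_seq with a toy vocabulary (the ids below are illustrative only,
# not the real preprocessed vocab): the input is lower-cased first, and any word missing from
# vocab_to_int falls back to the '<UNK>' id.
toy_vocab = {'<UNK>': 0, 'he': 1, 'saw': 2, 'a': 3, 'truck': 4, '.': 5}
print(sentence_to_seq('He saw a RED truck .', toy_vocab))  # -> [1, 2, 3, 0, 4, 5]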
def train(dim_word=300,  # word vector dimensionality
          ctx_dim=300,  # context vector dimensionality
          semantic_dim=300,
          dim=1000,  # the number of LSTM units
          cnn_dim=4096,  # CNN feature dimension
          n_layers_att=1,  # number of layers used to compute the attention weights
          n_layers_out=1,  # number of layers used to compute logit
          n_layers_lstm=1,  # number of lstm layers
          n_layers_init=1,  # number of layers to initialize LSTM at time 0
          lstm_encoder=True,  # if True, run bidirectional LSTM on input units
          prev2out=False,  # feed previous word into logit
          ctx2out=False,  # feed attention weighted ctx into logit
          cutoff=10,
          patience=5,
          max_epochs=30,
          dispFreq=500,
          decay_c=0.,  # weight decay coeff
          alpha_c=0.,  # doubly stochastic coeff
          lrate=1e-4,  # used only for SGD
          selector=False,  # selector (see paper)
          maxlen=30,  # maximum length of the description
          optimizer='rmsprop',
          pretrained='',
          batch_size=256,
          saveto='model',  # relative path of saved model file
          saveFreq=1000,  # save the parameters after every saveFreq updates
          sampleFreq=100,  # generate some samples after every sampleFreq updates
          embedding='../Data/GloVe/vocab_glove.pkl',
          cnn_type='vgg',
          prefix='../Data',  # path to find data
          dataset='coco',
          criterion='Bleu_4',
          switch_test_val=False,
          use_cnninit=True,
          use_dropout=True,  # setting this true turns on dropout at various points
          use_dropout_lstm=False,  # dropout on lstm gates
          save_per_epoch=False):  # this saves down the model every epoch

    # hyperparam dict
    model_options = locals().copy()
    model_options = validate_options(model_options)

    # reload options
    if os.path.exists('%s.pkl' % saveto):
        print "Reloading options"
        with open('%s.pkl' % saveto, 'rb') as f:
            model_options = pkl.load(f)

    print "Using the following parameters:"
    print model_options

    print 'Loading data'
    load_data, prepare_data = get_dataset(model_options['dataset'])

    # Load data from data path
    if 'switch_test_val' in model_options and model_options['switch_test_val']:
        train, valid, worddict = load_data(path=osp.join(model_options['prefix'], model_options['dataset']),
                                           options=model_options, load_train=True, load_test=True)
    else:
        train, valid, worddict = load_data(path=osp.join(model_options['prefix'], model_options['dataset']),
                                           options=model_options, load_train=True, load_val=True)

    # Automatically calculate the update frequency
    validFreq = len(train[0]) / model_options['batch_size']
    print "Validation frequency is %d" % validFreq

    word_idict = {vv: kk for kk, vv in worddict.iteritems()}
    model_options['n_words'] = len(worddict)

    # Initialize (or reload) the parameters using 'model_options'
    # then build the Theano graph
    print 'Building model'
    params = init_params(model_options)

    # Initialize it with glove
    if 'VCemb' in params:
        params['VCemb'] = read_pkl(model_options['embedding']).astype('float32')

    # If there is a same experiment, don't use pretrained weights
    if os.path.exists('%s.npz' % saveto):
        print "Reloading model"
        params = load_params('%s.npz' % saveto, params)
    elif pretrained != '':
        params = load_params(pretrained, params, False)  # only pretrain the language model

    # numpy arrays -> theano shared variables
    tparams = init_tparams(params)

    # In order, we get:
    #   1) trng - theano random number generator
    #   2) use_noise - flag that turns on dropout
    #   3) inps - inputs for f_grad_shared
    #   4) cost - log likelihood for each sentence
    #   5) opt_outs - optional outputs (e.g. selector)
    trng, use_noise, \
        inps, alphas, \
        cost, \
        opt_outs = \
        build_model(tparams, model_options)

    # Load evaluator to calculate bleu score
    evaluator = cocoEvaluation(model_options['dataset'])

    # To sample, we use beam search: 1) f_init is a function that initializes
    # the LSTM at time 0 [see top right of page 4], 2) f_next returns the distribution over
    # words and also the new "initial state/memory" see equation
    print 'Building sampler'
    f_init, f_next = build_sampler(tparams, model_options, use_noise, trng)

    # we want the cost without any of the regularizers
    # define the log probability
    f_log_probs = theano.function(inps, -cost, profile=False,
                                  updates=None, allow_input_downcast=True)

    # Define the cost function + regularization
    cost = cost.mean()
    # add L2 regularization costs
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            weight_decay += (vv**2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # Doubly stochastic regularization
    if alpha_c > 0.:
        alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c')
        alpha_reg = sum([alpha_c * ((1. - alpha.sum(0))**2).sum(0).mean() for alpha in alphas])
        cost += alpha_reg

    # Backprop!
    grads = tensor.grad(cost, wrt=itemlist(tparams))

    # to get the cost after regularization or the gradients, use this
    # f_grad_shared computes the cost and updates adaptive learning rate variables
    # f_update updates the weights of the model
    lr = tensor.scalar(name='lr')
    f_grad_shared, f_update = eval(model_options['optimizer'])(lr, tparams, grads, inps, cost)

    print 'Optimization'

    train_iter = HomogeneousData(train, batch_size=batch_size, maxlen=model_options['maxlen'])

    # history_bleu is a bare-bones training log, reload history
    history_bleu = []
    if os.path.exists('%s.npz' % saveto):
        history_bleu = numpy.load('%s.npz' % saveto)['history_bleu'].tolist()
    start_epochs = len(history_bleu)
    best_p = None
    bad_counter = 0

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size
    if sampleFreq == -1:
        sampleFreq = len(train[0]) / batch_size

    uidx = 0
    estop = False
    for eidx in xrange(start_epochs, model_options['max_epochs']):
        n_samples = 0

        print 'Epoch ', eidx

        for caps in train_iter:
            n_samples += len(caps)
            uidx += 1
            # turn on dropout
            use_noise.set_value(1.)

            # preprocess the caption, recording the
            # time spent to help detect bottlenecks
            pd_start = time.time()
            x, mask, ctx, cnn_feats = prepare_data(caps, train[1], train[2], worddict, model_options)
            pd_duration = time.time() - pd_start

            if x is None:
                print 'Minibatch with zero sample under length ', model_options['maxlen']
                continue

            # get the cost for the minibatch, and update the weights
            ud_start = time.time()
            cost = f_grad_shared(x, mask, ctx, cnn_feats)
            print "Epoch %d, Updates: %d, Cost is: %f" % (eidx, uidx, cost)
            f_update(model_options['lrate'])
            ud_duration = time.time() - ud_start  # some monitoring for each mini-batch

            # Numerical stability check
            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                return 1., 1., 1.

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, 'Update ', uidx, 'Cost ', cost, 'PD ', pd_duration, 'UD ', ud_duration

            # Print a generated sample as a sanity check
            if numpy.mod(uidx, model_options['sampleFreq']) == 0:
                # turn off dropout first
                use_noise.set_value(0.)
                x_s = x
                mask_s = mask
                ctx_s = ctx
                # generate and decode a subset of the current training batch
                for jj in xrange(numpy.minimum(10, len(caps))):
                    sample, score, alphas = gen_sample(f_init, f_next, ctx_s[jj], cnn_feats[jj],
                                                       model_options, trng=trng, maxlen=model_options['maxlen'])
                    # Decode the sample from encoding back to words
                    print 'Truth ', jj, ': ',
                    print seqs2words(x_s[:, jj], word_idict)
                    for kk, ss in enumerate([sample[0]]):
                        print 'Sample (', kk, ') ', jj, ': ',
                        print seqs2words(ss, word_idict)

            # Log validation loss + checkpoint the model with the best validation log likelihood
            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)

                # Do evaluation on validation set
                imgid = collapse([elem[-1] for elem in valid[0]])
                caps = process_examples([f_init], [f_next], imgid, valid[1], valid[2], word_idict, model_options)
                folder = osp.join('../output', '%s_%s' % (saveto, 'val'))
                if not osp.exists(folder):
                    os.mkdir(folder)
                with open(osp.join(folder, 'captions_val2014_results.json'), 'w') as f:
                    json.dump(caps, f)
                eva_result = evaluator.evaluate(folder, False)
                if model_options['criterion'] == 'combine':
                    history_bleu.append(eva_result['Bleu_4'] + eva_result['CIDEr'])
                else:
                    history_bleu.append(eva_result[model_options['criterion']])

                # the model with the best validation log likelihood is saved separately with a different name
                if uidx == 0 or history_bleu[-1] == max(history_bleu):
                    best_p = unzip(tparams)
                    print 'Saving model with best validation ll'
                    params = copy.copy(best_p)
                    params = unzip(tparams)
                    numpy.savez(saveto + '_bestll', history_bleu=history_bleu, **params)
                    bad_counter = 0

                # abort training if perplexity has been increasing for too long
                if len(history_bleu) > model_options['patience'] and history_bleu[-1] <= max(history_bleu[:-model_options['patience']]):
                    bad_counter += 1
                    if bad_counter > model_options['patience']:
                        print 'Early Stop!'
                        estop = True
                        break

                print ' BLEU-4 score ', history_bleu[-1]

            # Checkpoint
            if numpy.mod(uidx, model_options['saveFreq']) == 0:
                print 'Saving...',
                if best_p is not None:
                    params = copy.copy(best_p)
                else:
                    params = unzip(tparams)
                numpy.savez(saveto, history_bleu=history_bleu, **params)
                pkl.dump(model_options, open('%s.pkl' % saveto, 'wb'))
                print 'Done'

        print 'Seen %d samples' % n_samples

        if estop:
            break

        if model_options['save_per_epoch']:
            numpy.savez(saveto + '_epoch_' + str(eidx + 1), history_bleu=history_bleu, **unzip(tparams))

    # use the best nll parameters for final checkpoint (if they exist)
    if best_p is not None:
        zipp(best_p, tparams)

    params = copy.copy(best_p)
    numpy.savez(saveto, zipped_params=best_p, history_bleu=history_bleu, **params)
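# The early-stopping bookkeeping above increments bad_counter whenever the newest validation score
# fails to beat the best score from more than `patience` evaluations ago, and aborts once
# bad_counter exceeds `patience`. A standalone sketch of that per-evaluation check, assuming higher
# scores (e.g. BLEU-4) are better; the function name is illustrative only.
def is_bad_evaluation(history, patience):
    """history: list of validation scores, newest last; True means bad_counter should grow."""
    if len(history) <= patience:
        return False
    return history[-1] <= max(history[:-patience])

# scores still improving -> not a bad evaluation; stagnating scores -> counts against patience
assert not is_bad_evaluation([0.20, 0.22, 0.25], patience=2)
assert is_bad_evaluation([0.20, 0.22, 0.25, 0.24, 0.23], patience=2)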
def main(model, saveto, k=5, normalize=False, zero_pad=False, datasets='dev,test', sampling=False, pkl_name=None):
    # load model model_options
    if pkl_name is None:
        pkl_name = model
    with open('%s.pkl' % pkl_name, 'rb') as f:
        options = pkl.load(f)

    # fetch data, skip ones we aren't using to save time
    load_data, prepare_data = get_dataset(options['dataset'])
    train, valid, test, worddict = load_data(path='./data/coco/',
                                             load_train=True if 'train' in datasets else False,
                                             load_dev=True if 'dev' in datasets else False,
                                             load_test=True if 'test' in datasets else False)
    # import pdb; pdb.set_trace()

    # <eos> means end of sequence (aka periods), UNK means unknown
    word_idict = dict()
    for kk, vv in worddict.iteritems():
        word_idict[vv] = kk
    word_idict[0] = '<eos>'
    word_idict[1] = 'UNK'

    # get the parameters
    params = init_params(options)
    params = load_params(model, params)
    tparams = init_tparams(params)

    # index -> words
    def _seqs2words(caps):
        capsw = []
        for cc in caps:
            ww = []
            for w in cc:
                if w == 0:
                    break
                ww.append(word_idict[w])
            capsw.append(' '.join(ww))
        return capsw

    # process all dev examples
    def _process_examples(contexts, params, tparams):
        caps = [None] * contexts.shape[0]
        for idx, ctx in enumerate(contexts):
            cc = ctx.todense().reshape([5, 4122])
            # cc = ctx.todense().reshape([14*14, 512])
            if zero_pad:
                cc0 = numpy.zeros((cc.shape[0]+1, cc.shape[1])).astype('float32')
                cc0[:-1, :] = cc
            else:
                cc0 = cc
            resp = gen_model(idx, cc0, model, options, k, normalize, word_idict, sampling, params, tparams)
            caps[resp[0]] = resp[1]
            print 'Sample ', (idx+1), '/', contexts.shape[0], ' Done'
            print resp[1]
            sys.stdout.flush()
        return caps

    ds = datasets.strip().split(',')

    # send all the features for the various datasets
    for dd in ds:
        if dd == 'train':
            print 'Training Set...',
            new_train = train[1][:2000]
            caps = _seqs2words(_process_examples(new_train, params, tparams))
            # import pdb; pdb.set_trace()
            with open(saveto+'.train.txt', 'w') as f:
                print >>f, '\n'.join(caps)
            print 'Done'
        if dd == 'dev':
            print 'Development Set...',
            caps = _seqs2words(_process_examples(valid[1], params, tparams))
            # import pdb; pdb.set_trace()
            with open(saveto+'.dev.txt', 'w') as f:
                print >>f, '\n'.join(caps)
            print 'Done'
        if dd == 'test':
            print 'Test Set...',
            caps = _seqs2words(_process_examples(test[1][:1000], params, tparams))  # subset
            with open(saveto+'.test.txt', 'w') as f:
                print >>f, '\n'.join(caps)
            print 'Done'
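# The zero_pad branch in _process_examples above appends one extra all-zero annotation row to each
# context matrix before it is fed to the sampler. A minimal numpy illustration of just that padding
# step; the shapes below are illustrative (the real contexts here are 5x4122 dense matrices).
import numpy as np

def zero_pad_context(cc):
    # copy cc into a matrix with one additional row of zeros at the bottom
    cc0 = np.zeros((cc.shape[0] + 1, cc.shape[1]), dtype='float32')
    cc0[:-1, :] = cc
    return cc0

ctx = np.ones((5, 7), dtype='float32')
padded = zero_pad_context(ctx)
assert padded.shape == (6, 7) and not padded[-1].any()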
def train(args, model_args):

    #model_id = '/data/lisatmp4/lambalex/lsun_walkback/walkback_'
    model_id = '/data/lisatmp4/anirudhg/cifar_walk_back/walkback_'
    model_dir = create_log_dir(args, model_id)
    model_id2 = 'logs/walkback_'
    model_dir2 = create_log_dir(args, model_id2)
    print model_dir
    print model_dir2 + '/' + 'log.jsonl.gz'
    logger = mimir.Logger(filename=model_dir2 + '/log.jsonl.gz', formatter=None)

    # TODO batches_per_epoch should not be hard coded
    lrate = args.lr
    import sys
    sys.setrecursionlimit(10000000)
    args, model_args = parse_args()

    #trng = RandomStreams(1234)

    if args.resume_file is not None:
        print "Resuming training from " + args.resume_file
        from blocks.scripts import continue_training
        continue_training(args.resume_file)

    ## load the training data
    if args.dataset == 'MNIST':
        print 'loading MNIST'
        from fuel.datasets import MNIST
        dataset_train = MNIST(['train'], sources=('features',))
        dataset_test = MNIST(['test'], sources=('features',))
        n_colors = 1
        spatial_width = 28
    elif args.dataset == 'CIFAR10':
        from fuel.datasets import CIFAR10
        dataset_train = CIFAR10(['train'], sources=('features',))
        dataset_test = CIFAR10(['test'], sources=('features',))
        n_colors = 3
        spatial_width = 32
    elif args.dataset == "lsun" or args.dataset == "lsunsmall":
        print "loading lsun class!"
        from load_lsun import load_lsun
        print "loading lsun data!"
        if args.dataset == "lsunsmall":
            dataset_train, dataset_test = load_lsun(args.batch_size, downsample=True)
            spatial_width = 32
        else:
            dataset_train, dataset_test = load_lsun(args.batch_size, downsample=False)
            spatial_width = 64
        n_colors = 3
    elif args.dataset == "celeba":
        print "loading celeba data"
        from fuel.datasets.celeba import CelebA
        dataset_train = CelebA(which_sets=['train'], which_format="64", sources=('features',), load_in_memory=False)
        dataset_test = CelebA(which_sets=['test'], which_format="64", sources=('features',), load_in_memory=False)
        spatial_width = 64
        n_colors = 3

        tr_scheme = SequentialScheme(examples=dataset_train.num_examples, batch_size=args.batch_size)
        ts_scheme = SequentialScheme(examples=dataset_test.num_examples, batch_size=args.batch_size)

        train_stream = DataStream.default_stream(dataset_train, iteration_scheme=tr_scheme)
        test_stream = DataStream.default_stream(dataset_test, iteration_scheme=ts_scheme)

        dataset_train = train_stream
        dataset_test = test_stream

        #epoch_it = train_stream.get_epoch_iterator()
    elif args.dataset == 'Spiral':
        print 'loading SPIRAL'
        train_set = Spiral(num_examples=100000, classes=1, cycles=2., noise=0.01, sources=('features',))
        dataset_train = DataStream.default_stream(train_set,
                                                  iteration_scheme=ShuffledScheme(train_set.num_examples, args.batch_size))
    else:
        raise ValueError("Unknown dataset %s." % args.dataset)

    model_options = locals().copy()

    if args.dataset != 'lsun' and args.dataset != 'celeba':
        train_stream = Flatten(
            DataStream.default_stream(dataset_train,
                                      iteration_scheme=ShuffledScheme(
                                          examples=dataset_train.num_examples - (dataset_train.num_examples % args.batch_size),
                                          batch_size=args.batch_size)))
    else:
        train_stream = dataset_train
        test_stream = dataset_test

    print "Width", WIDTH, spatial_width

    shp = next(train_stream.get_epoch_iterator())[0].shape
    print "got epoch iterator"

    # make the training data 0 mean and variance 1
    # TODO compute mean and variance on full dataset, not minibatch
    Xbatch = next(train_stream.get_epoch_iterator())[0]
    scl = 1. / np.sqrt(np.mean((Xbatch - np.mean(Xbatch))**2))
    shft = -np.mean(Xbatch * scl)
    # scale is applied before shift
    #train_stream = ScaleAndShift(train_stream, scl, shft)
    #test_stream = ScaleAndShift(test_stream, scl, shft)

    print 'Building model'
    params = init_params(model_options)
    if args.reload_:
        print "Trying to reload parameters"
        if os.path.exists(args.saveto_filename):
            print 'Reloading Parameters'
            print args.saveto_filename
            params = load_params(args.saveto_filename, params)
    tparams = init_tparams(params)
    print tparams
    '''
    x = T.matrix('x', dtype='float32')
    temp = T.scalar('temp', dtype='float32')
    f = transition_operator(tparams, model_options, x, temp)

    for data in train_stream.get_epoch_iterator():
        print data[0]
        a = f([data[0], 1.0, 1])
        #ipdb.set_trace()
    '''
    x, cost, start_temperature = build_model(tparams, model_options)
    inps = [x, start_temperature]

    x_Data = T.matrix('x_Data', dtype='float32')
    temperature = T.scalar('temperature', dtype='float32')
    forward_diffusion = one_step_diffusion(x_Data, model_options, tparams, temperature)

    #print 'Building f_cost...',
    #f_cost = theano.function(inps, cost)
    #print 'Done'
    print tparams
    grads = T.grad(cost, wrt=itemlist(tparams))
    #get_grads = theano.function(inps, grads)
    for j in range(0, len(grads)):
        grads[j] = T.switch(T.isnan(grads[j]), T.zeros_like(grads[j]), grads[j])

    # compile the optimizer, the actual computational graph is compiled here
    lr = T.scalar(name='lr')
    print 'Building optimizers...',
    optimizer = args.optimizer
    f_grad_shared, f_update = getattr(optimizers, optimizer)(lr, tparams, grads, inps, cost)
    print 'Done'

    for param in tparams:
        print param
        print tparams[param].get_value().shape

    print 'Building Sampler....'
    f_sample = sample(tparams, model_options)
    print 'Done'

    uidx = 0
    estop = False
    bad_counter = 0
    max_epochs = 4000
    batch_index = 1
    print 'Number of steps....'
    print args.num_steps
    print "Number of metasteps...."
    print args.meta_steps
    print 'Done'
    count_sample = 1
    for eidx in xrange(max_epochs):
        if eidx % 20 == 0:
            params = unzip(tparams)
            save_params(params, model_dir + '/' + 'params_' + str(eidx) + '.npz')
        n_samples = 0
        print 'Starting Next Epoch ', eidx
        for data in train_stream.get_epoch_iterator():
            if args.dataset == 'CIFAR10':
                if data[0].shape[0] == args.batch_size:
                    data_use = (data[0].reshape(args.batch_size, 3 * 32 * 32),)
                else:
                    continue
            t0 = time.time()
            batch_index += 1
            n_samples += len(data_use[0])
            uidx += 1
            if data_use[0] is None:
                print 'No data '
                uidx -= 1
                continue
            ud_start = time.time()

            t1 = time.time()

            data_run = data_use[0]
            temperature_forward = args.temperature
            meta_cost = []
            for meta_step in range(0, args.meta_steps):
                meta_cost.append(f_grad_shared(data_run, temperature_forward))
                f_update(lrate)
                if args.meta_steps > 1:
                    data_run, sigma, _, _ = forward_diffusion([data_run, temperature_forward, 1])
                    temperature_forward *= args.temperature_factor
            cost = sum(meta_cost) / len(meta_cost)

            ud = time.time() - ud_start

            #gradient_updates_ = get_grads(data_use[0], args.temperature)
            if np.isnan(cost) or np.isinf(cost):
                print 'NaN detected'
                return 1.
            t1 = time.time()
            #print time.time() - t1, "time to get grads"
            t1 = time.time()
            logger.log({'epoch': eidx,
                        'batch_index': batch_index,
                        'uidx': uidx,
                        'training_error': cost})
            #'Norm_1': np.linalg.norm(gradient_updates_[0]),
            #'Norm_2': np.linalg.norm(gradient_updates_[1]),
            #'Norm_3': np.linalg.norm(gradient_updates_[2]),
            #'Norm_4': np.linalg.norm(gradient_updates_[3])})
            #print time.time() - t1, "time to log"

            #print time.time() - t0, "total time in batch"
            t5 = time.time()

            if batch_index % 20 == 0:
                print batch_index, "cost", cost

            if batch_index % 200 == 0:
                count_sample += 1
                temperature = args.temperature * (args.temperature_factor**(args.num_steps * args.meta_steps - 1))
                temperature_forward = args.temperature

                for num_step in range(args.num_steps * args.meta_steps):
                    print "Forward temperature", temperature_forward
                    if num_step == 0:
                        x_data, sampled, sampled_activation, sampled_preactivation = forward_diffusion([data_use[0], temperature_forward, 1])
                        x_data = np.asarray(x_data).astype('float32').reshape(args.batch_size, INPUT_SIZE)
                        x_temp = x_data.reshape(args.batch_size, n_colors, WIDTH, WIDTH)
                        plot_images(x_temp, model_dir + '/' + "batch_" + str(batch_index) + '_corrupted' + 'epoch_' + str(count_sample) + '_time_step_' + str(num_step))
                    else:
                        x_data, sampled, sampled_activation, sampled_preactivation = forward_diffusion([x_data, temperature_forward, 1])
                        x_data = np.asarray(x_data).astype('float32').reshape(args.batch_size, INPUT_SIZE)
                        x_temp = x_data.reshape(args.batch_size, n_colors, WIDTH, WIDTH)
                        plot_images(x_temp, model_dir + '/batch_' + str(batch_index) + '_corrupted' + '_epoch_' + str(count_sample) + '_time_step_' + str(num_step))

                    temperature_forward = temperature_forward * args.temperature_factor

                x_temp2 = data_use[0].reshape(args.batch_size, n_colors, WIDTH, WIDTH)
                plot_images(x_temp2, model_dir + '/' + 'orig_' + 'epoch_' + str(eidx) + '_batch_index_' + str(batch_index))

                temperature = args.temperature * (args.temperature_factor**(args.num_steps * args.meta_steps - 1))
                for i in range(args.num_steps * args.meta_steps + args.extra_steps):
                    x_data, sampled, sampled_activation, sampled_preactivation = f_sample([x_data, temperature, 0])
                    print 'On backward step number, using temperature', i, temperature
                    reverse_time(scl, shft, x_data, model_dir + '/' + "batch_" + str(batch_index) + '_samples_backward_' + 'epoch_' + str(count_sample) + '_time_step_' + str(i))
                    x_data = np.asarray(x_data).astype('float32')
                    x_data = x_data.reshape(args.batch_size, INPUT_SIZE)
                    if temperature == args.temperature:
                        temperature = temperature
                    else:
                        temperature /= args.temperature_factor

                if args.noise == "gaussian":
                    x_sampled = np.random.normal(0.5, 2.0, size=(args.batch_size, INPUT_SIZE)).clip(0.0, 1.0)
                else:
                    s = np.random.binomial(1, 0.5, INPUT_SIZE)

                temperature = args.temperature * (args.temperature_factor**(args.num_steps * args.meta_steps - 1))
                x_data = np.asarray(x_sampled).astype('float32')
                for i in range(args.num_steps * args.meta_steps + args.extra_steps):
                    x_data, sampled, sampled_activation, sampled_preactivation = f_sample([x_data, temperature, 0])
                    print 'On step number, using temperature', i, temperature
                    reverse_time(scl, shft, x_data, model_dir + '/batch_index_' + str(batch_index) + '_inference_' + 'epoch_' + str(count_sample) + '_step_' + str(i))
                    x_data = np.asarray(x_data).astype('float32')
                    x_data = x_data.reshape(args.batch_size, INPUT_SIZE)
                    if temperature == args.temperature:
                        temperature = temperature
                    else:
                        temperature /= args.temperature_factor

                ipdb.set_trace()
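# The sampling blocks above raise the temperature by `temperature_factor` after every forward
# (corruption) step, start the reverse walk at the hottest forward temperature, and divide it back
# down after each reverse step, never going below the base temperature. A small standalone sketch
# of that schedule, assuming num_steps * meta_steps total forward steps; the function names are
# illustrative only.
def forward_temperatures(base, factor, n_steps):
    temps, t = [], base
    for _ in range(n_steps):
        temps.append(t)
        t *= factor
    return temps

def reverse_temperatures(base, factor, n_steps, extra_steps=0):
    # start from the hottest forward temperature and cool down, clamping at `base`
    t = base * factor ** (n_steps - 1)
    temps = []
    for _ in range(n_steps + extra_steps):
        temps.append(t)
        if t != base:  # mirrors the `temperature == args.temperature` check above
            t /= factor
    return temps

print(forward_temperatures(1.0, 2.0, 4))      # [1.0, 2.0, 4.0, 8.0]
print(reverse_temperatures(1.0, 2.0, 4, 1))   # [8.0, 4.0, 2.0, 1.0, 1.0]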