# Assumed module header (not included in this excerpt): these snippets use the
# aliases below throughout, and a module-level DISPLAY flag, here defaulted to
# False as an assumption.
import itertools as it
import numpy as np
import numpy.random as nr
import cgt
from cgt import core, utils
from cgt.numeric_diff import numeric_grad

DISPLAY = False

def check_scalar_grads(precision, backend):
    cgt.reset_config()
    np.random.seed(0)
    cgt.set_precision(precision)
    cgt.core.update_config(backend=backend)
    x = cgt.scalar('x')
    y = cgt.scalar('y')
    z = cgt.scalar('z')
    vars = [x, y, z] #pylint: disable=W0622
    vals = nr.rand(len(vars)) + 1
    PROB2RESULT = {}
    for ((key, _), cls) in it.chain(
            it.izip(core.UNARY_INFO.items(), it.repeat(core.ElwiseUnary)),
            it.izip(core.BINARY_INFO.items(), it.repeat(core.ElwiseBinary))):
        if key == "conj":
            print "skipping conj"
            continue
        utils.colorprint(utils.Color.YELLOW, "Testing %s\n" % key)
        if cls == core.ElwiseUnary:
            n_in = 1
            op = cls(key)
        else:
            n_in = 2
            op = cls(key, (True, True))
        inputvars = vars[0:n_in]
        inputvals = vals[0:n_in]
        out = core.Result(op, inputvars)
        f = cgt.function(inputvars, out)
        try:
            grads = cgt.grad(out, inputvars)
        except core.NonDifferentiable:
            print "nondiff"
            continue
        if DISPLAY:
            print "Function:"
            cgt.print_tree(out)
            print "Gradient original:"
            cgt.print_tree(grads)
            print "Gradient simplified:"
        grads_simple = core.simplify(grads)
        if DISPLAY:
            cgt.print_tree(grads_simple)
        gradf = cgt.function(inputvars, grads)
        eps = {"single": 1e-4, "double": 1e-9}[precision]
        nugrad = numeric_grad(lambda li: f(*li), inputvals, eps=eps) #pylint: disable=W0640
        cgtgrad = gradf(*inputvals)
        np.testing.assert_almost_equal(nugrad, cgtgrad,
            decimal={"single": 3, "double": 6}[precision])
        grad_count = core.count_nodes(grads_simple)
        PROB2RESULT[key] = {}
        PROB2RESULT[key]["grad"] = grad_count
    if DISPLAY:
        from thirdparty.tabulate import tabulate
        print tabulate([[key, val["grad"]] for (key, val) in PROB2RESULT.iteritems()],
                       headers=["funcname", "gradcount"])
def test_scalars():
    np.random.seed(0)
    x = cgt.scalar('x')
    y = cgt.scalar('y')
    z = cgt.scalar('z')
    vars = [x, y, z] #pylint: disable=W0622
    vals = nr.rand(len(vars)) + 1
    PROB2RESULT = {}
    for ((key, _), cls) in it.chain(
            it.izip(core.UNARY_INFO.items(), it.repeat(core.ElwiseUnary)),
            it.izip(core.BINARY_INFO.items(), it.repeat(core.ElwiseBinary))):
        if key == "conj":
            print "skipping conj"
            continue
        utils.colorprint(utils.Color.YELLOW, "Testing %s\n" % key)
        if cls == core.ElwiseUnary:
            n_in = 1
            op = cls(key)
        else:
            n_in = 2
            op = cls(key, (True, True))
        inputvars = vars[0:n_in]
        inputvals = vals[0:n_in]
        out = core.Result(op, inputvars)
        f = cgt.function(inputvars, out)
        try:
            grads = cgt.grad(out, inputvars)
        except core.NonDifferentiable:
            print "nondiff"
            continue
        if DISPLAY:
            print "Function:"
            cgt.print_tree(out)
            print "Gradient original:"
            cgt.print_tree(grads)
            print "Gradient simplified:"
        grads_simple = core.simplify(grads)
        if DISPLAY:
            cgt.print_tree(grads_simple)
        gradf = cgt.function(inputvars, grads)
        eps = {"single": 1e-4, "double": 1e-9}[cgt.get_precision()]
        nugrad = numeric_grad(lambda li: f(*li), inputvals, eps=eps) #pylint: disable=W0640
        cgtgrad = gradf(*inputvals)
        np.testing.assert_almost_equal(nugrad, cgtgrad,
            decimal={"single": 3, "double": 6}[cgt.get_precision()])
        grad_count = core.count_nodes(grads_simple)
        PROB2RESULT[key] = {}
        PROB2RESULT[key]["grad"] = grad_count
    if DISPLAY:
        from thirdparty.tabulate import tabulate
        print tabulate([[key, val["grad"]] for (key, val) in PROB2RESULT.iteritems()],
                       headers=["funcname", "gradcount"])
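# Hypothetical driver (not in the original source): sweeps check_scalar_grads
# over the precision/backend grid. The config values "single"/"double" and
# "python"/"native" follow the usages elsewhere in these files; treat this as
# a sketch of how the parameterized checker above might be invoked, not the
# repo's actual test entry point.
def test_scalar_grads_all_configs():
    for precision in ("single", "double"):
        for backend in ("python", "native"):
            check_scalar_grads(precision, backend)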
def test_im2col():
    for settings in [((4, 4), (0, 0), (1, 1)),
                     ((3, 3), (1, 1), (2, 2)),
                     ((3, 3), (1, 1), (3, 3))]:
        xval = np.arange(2 * 1 * 28 * 28).reshape(2, 1, 28, 28).astype(cgt.floatX)
        x = cgt.tensor4("x", fixed_shape=xval.shape)
        y = im2col(x, *settings)
        h = cgt.constant(np.random.randn(*cgt.infer_shape(y)))
        cost = (y * h).sum()
        fcost = cgt.function([x], cost)
        fgrad = cgt.function([x], cgt.grad(cost, [x])[0])
        from cgt.numeric_diff import numeric_grad
        gnum = numeric_grad(fcost, xval, eps=1e-5)
        gana = fgrad(xval)
        assert np.allclose(gnum, gana)
def test_pool(**kwargs):
    np.random.seed(0)
    x = cgt.tensor4("x", fixed_shape=(2, 3, 5, 7))
    y = max_pool_2d(x, (4, 4), (0, 0), (1, 1))
    xval = np.random.randn(2, 3, 5, 7)
    hval = np.random.randn(*cgt.infer_shape(y))
    h = cgt.constant(hval)
    cost = (y * h).sum()
    fcost = cgt.function([x], cost)
    fgrad = cgt.function([x], cgt.grad(cost, [x])[0])
    from cgt.numeric_diff import numeric_grad
    gnum = numeric_grad(fcost, xval)
    gana = fgrad(xval)
    assert np.allclose(gnum, gana)
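# test_pool above and test_cpu_pool below share one finite-difference recipe:
# reduce the op's output to a scalar cost (y*h).sum() against a fixed random
# h, then compare numeric_grad of the cost with the cgt.grad-derived gradient.
# A minimal standalone sketch of that recipe as a helper (hypothetical name
# and defaults, not part of the original files):
def check_cost_grad(fcost, fgrad, xval, eps=1e-4, atol=1e-8):
    from cgt.numeric_diff import numeric_grad
    gnum = numeric_grad(fcost, xval, eps=eps)  # finite-difference gradient
    gana = fgrad(xval)                         # analytic gradient from cgt.grad
    assert np.allclose(gnum, gana, atol=atol)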
def test_cpu_pool():
    with cgt.scoped_update_config(precision="quad", backend="native"):
        print cgt.get_precision()
        ci = get_compile_info()
        np.random.seed(0)
        x = cgt.tensor4("x", fixed_shape=(2, 3, 5, 7))
        y = max_pool_2d(x, (4, 4), (0, 0), (1, 1))
        xval = np.random.randn(2, 3, 5, 7)
        hval = np.random.randn(*cgt.infer_shape(y))
        h = cgt.constant(hval)
        cost = (y * h).sum()
        fcost = cgt.function([x], cost)
        fgrad = cgt.function([x], cgt.grad(cost, [x])[0])
        from cgt.numeric_diff import numeric_grad
        gnum = numeric_grad(fcost, xval)
        gana = fgrad(xval)
        assert np.allclose(gnum, gana)
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--grad_check", action="store_true")
    parser.add_argument("--n_batches", type=int, default=1000000)
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--unittest", action="store_true")
    args = parser.parse_args()
    np.seterr("raise")
    cgt.set_precision("quad" if args.grad_check else "double")
    np.random.seed(0)

    # model parameters
    if args.grad_check:
        opt = NTMOpts(
            b=1,  # batch size
            h=1,  # number of heads
            n=2,  # number of memory sites
            m=3,  # dimension at each memory site
            k=4,  # dimension of input
            p=2,  # dimension of output
            ff_hid_sizes=[])
        seq_length = 2
    else:
        opt = NTMOpts(
            b=64,   # batch size
            h=3,    # number of heads
            n=128,  # number of memory sites
            m=20,   # dimension at each memory site
            k=3,    # dimension of input
            p=1,    # dimension of output
            ff_hid_sizes=[128, 128])
        seq_length = 10

    if args.unittest:
        seq_length = 3
        args.n_batches = 3

    tstart = time.time()
    ntm = make_ntm(opt)
    task = CopyTask(opt.b, seq_length, opt.p)
    f_loss, f_loss_and_grad, params = make_funcs(opt, ntm, task.total_time(), task.loss_timesteps())
    print "graph construction and compilation took %g seconds" % (time.time() - tstart)

    pc = ParamCollection(params)
    pc.set_value_flat(nr.uniform(-.1, .1, size=(pc.get_total_size(),)))

    if args.grad_check:
        x, y = task.gen_batch()
        th = pc.get_value_flat()  # this assignment was missing in the original; `th` was otherwise undefined
        def f(thnew):
            thold = th.copy()
            pc.set_value_flat(thnew)
            loss = f_loss(x, y)
            pc.set_value_flat(thold)
            return loss
        from cgt.numeric_diff import numeric_grad
        g_num = numeric_grad(f, th, eps=1e-8)
        _, _, g_anal = f_loss_and_grad(x, y)
        assert np.allclose(g_num, g_anal, atol=1e-8)
        print "Gradient check succeeded!"
        print "%i/%i elts of grad are nonzero" % ((g_anal != 0).sum(), g_anal.size)
        return

    seq_num = 0
    state = make_rmsprop_state(pc.get_value_flat(), .01, .95)
    print fmt_row(13, ["seq num", "CE (bits)", "accuracy", "|g|_inf"], header=True)
    if args.profile: cgt.profiler.start()
    for i in xrange(args.n_batches):
        x, y = task.gen_batch()
        seq_num += x.shape[1]
        l, l01, g = f_loss_and_grad(x, y)
        print fmt_row(13, [seq_num, l, l01, np.abs(g).max()])
        rmsprop_update(g, state)
        pc.set_value_flat(state.theta)
        if not np.isfinite(l):
            break
    if args.profile: cgt.profiler.print_stats()
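# Every grad_check branch in these demos repeats the same closure: swap in a
# candidate flat parameter vector, evaluate the loss, then restore the old
# parameters. A hedged sketch of that pattern as a reusable factory
# (hypothetical helper, not in the original scripts):
def make_param_loss_fn(pc, f_loss, *inputs):
    def f(thnew):
        thold = pc.get_value_flat()   # save current parameters
        pc.set_value_flat(thnew)      # evaluate loss at the candidate point
        loss = f_loss(*inputs)
        pc.set_value_flat(thold)      # restore original parameters
        return loss
    return f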
def main():
    nr.seed(0)
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", type=str, default="alice")
    parser.add_argument("--size_mem", type=int, default=64)
    parser.add_argument("--size_batch", type=int, default=64)
    parser.add_argument("--n_layers", type=int, default=2)
    parser.add_argument("--n_unroll", type=int, default=16)
    parser.add_argument("--k_in", type=int, default=3)
    parser.add_argument("--k_h", type=int, default=5)
    parser.add_argument("--step_size", type=float, default=.01)
    parser.add_argument("--decay_rate", type=float, default=0.95)
    parser.add_argument("--n_epochs", type=int, default=20)
    parser.add_argument("--arch", choices=["lstm", "gru"], default="gru")
    parser.add_argument("--grad_check", action="store_true")
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--unittest", action="store_true")
    args = parser.parse_args()

    cgt.set_precision("quad" if args.grad_check else "single")
    assert args.n_unroll > 1
    loader = Loader(args.data_dir, args.size_batch, args.n_unroll, (.8, .1, .1))
    network, f_loss, f_loss_and_grad, f_step = make_loss_and_grad_and_step(
        args.arch, loader.size_vocab, loader.size_vocab, args.size_mem,
        args.size_batch, args.n_layers, args.n_unroll, args.k_in, args.k_h)

    if args.profile: profiler.start()

    params = network.get_parameters()
    pc = ParamCollection(params)
    pc.set_value_flat(nr.uniform(-0.01, 0.01, size=(pc.get_total_size(),)))

    # Initialize each "rotation" parameter as pairs of nearly-parallel unit
    # vectors: row 2j is a random unit vector, row 2j+1 is a slightly
    # perturbed, renormalized copy of it.
    for i, param in enumerate(pc.params):
        if "is_rotation" in param.props:
            shape = pc.get_shapes()[i]
            num_vec = int(shape[0] / 2)
            size_vec = int(shape[1])
            gauss = nr.normal(size=(num_vec * size_vec))
            gauss = np.reshape(gauss, (num_vec, size_vec))
            gauss_mag = norm(gauss, axis=1, keepdims=True)
            gauss_normed = gauss / gauss_mag
            gauss_perturb = nr.normal(scale=0.01, size=(num_vec * size_vec))
            gauss_perturb = np.reshape(gauss_perturb, (num_vec, size_vec))
            second_vec = gauss_normed + gauss_perturb
            second_vec_mag = norm(second_vec, axis=1, keepdims=True)
            second_vec_normed = second_vec / second_vec_mag
            new_param_value = np.zeros(shape)
            for j in xrange(num_vec):
                new_param_value[2 * j, :] = gauss_normed[j, :]
                new_param_value[2 * j + 1, :] = second_vec_normed[j, :]
            param.op.set_value(new_param_value)

    def initialize_hiddens(n):
        return [np.ones((n, args.size_mem), cgt.floatX) / float(args.size_mem)
                for _ in xrange(get_num_hiddens(args.arch, args.n_layers))]

    if args.grad_check:
        x, y = loader.train_batches_iter().next()
        prev_hiddens = initialize_hiddens(args.size_batch)
        def f(thnew):
            thold = pc.get_value_flat()
            pc.set_value_flat(thnew)
            loss = f_loss(x, y, *prev_hiddens)
            pc.set_value_flat(thold)
            return loss
        from cgt.numeric_diff import numeric_grad
        print "Beginning grad check"
        g_num = numeric_grad(f, pc.get_value_flat(), eps=1e-10)
        print "Ending grad check"
        result = f_loss_and_grad(x, y, *prev_hiddens)
        g_anal = result[1]
        # Debugging aid: report any large numeric/analytic mismatches, then
        # drop into an interactive shell for inspection.
        diff = g_num - g_anal
        abs_diff = np.abs(diff)
        print np.where(abs_diff > 1e-4)
        print diff[np.where(abs_diff > 1e-4)]
        from IPython import embed  # import was missing in the original
        embed()
        assert np.allclose(g_num, g_anal, atol=1e-4)
        print "Gradient check succeeded!"
        return

    optim_state = make_rmsprop_state(theta=pc.get_value_flat(),
                                     step_size=args.step_size,
                                     decay_rate=args.decay_rate)
    for iepoch in xrange(args.n_epochs):
        losses = []
        tstart = time()
        print "starting epoch", iepoch
        cur_hiddens = initialize_hiddens(args.size_batch)
        for (x, y) in loader.train_batches_iter():
            out = f_loss_and_grad(x, y, *cur_hiddens)
            loss = out[0]
            grad = out[1]
            cur_hiddens = out[2:]
            rmsprop_update(grad, optim_state)
            pc.set_value_flat(optim_state.theta)
            losses.append(loss)
            if args.unittest:
                return
        print "%.3f s/batch. avg loss = %.3f" % ((time() - tstart) / len(losses), np.mean(losses))
        optim_state.step_size *= .98 #pylint: disable=E1101
        sample(f_step, initialize_hiddens(1), char2ind=loader.char2ind,
               n_steps=300, temp=1.0, seed_text="")

    if args.profile: profiler.print_stats()
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--grad_check", action="store_true")
    parser.add_argument("--n_batches", type=int, default=1000000)
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--unittest", action="store_true")
    parser.add_argument("--task", choices=["copy", "reverse_copy", "repeat_copy"], default="copy")
    args = parser.parse_args()
    np.seterr("raise")
    cgt.set_precision("quad" if args.grad_check else "double")
    np.random.seed(0)

    # model parameters
    if args.grad_check:
        opt = NTMOpts(
            b=1,  # batch size
            h=1,  # number of heads
            n=2,  # number of memory sites
            m=3,  # dimension at each memory site
            k=4,  # dimension of input
            p=2,  # dimension of output
            ff_hid_sizes=[])
        seq_length = 2
    else:
        opt = NTMOpts(
            b=64,   # batch size
            h=3,    # number of heads
            n=128,  # number of memory sites
            m=20,   # dimension at each memory site
            k=3,    # dimension of input
            p=1,    # dimension of output
            ff_hid_sizes=[128, 128])
        seq_length = 10

    if args.unittest:
        seq_length = 3
        args.n_batches = 3

    tstart = time.time()
    ntm = make_ntm(opt)
    if args.task == "copy":
        task = CopyTask(opt.b, seq_length, opt.p)
    elif args.task == "reverse_copy":
        task = ReverseCopyTask(opt.b, seq_length, opt.p)
    elif args.task == "repeat_copy":
        n_copies = 4
        task = RepeatCopyTask(opt.b, seq_length, opt.p, n_copies)
    f_loss, f_loss_and_grad, params = make_funcs(opt, ntm, task.total_time(), task.loss_timesteps())
    print "graph construction and compilation took %g seconds" % (time.time() - tstart)

    pc = ParamCollection(params)
    pc.set_value_flat(nr.uniform(-.1, .1, size=(pc.get_total_size(),)))

    if args.grad_check:
        x, y = task.gen_batch()
        th = pc.get_value_flat()  # this assignment was missing in the original; `th` was otherwise undefined
        def f(thnew):
            thold = th.copy()
            pc.set_value_flat(thnew)
            loss = f_loss(x, y)
            pc.set_value_flat(thold)
            return loss
        from cgt.numeric_diff import numeric_grad
        g_num = numeric_grad(f, th, eps=1e-8)
        _, _, g_anal = f_loss_and_grad(x, y)
        assert np.allclose(g_num, g_anal, atol=1e-8)
        print "Gradient check succeeded!"
        print "%i/%i elts of grad are nonzero" % ((g_anal != 0).sum(), g_anal.size)
        return

    seq_num = 0
    state = make_rmsprop_state(pc.get_value_flat(), .01, .95)
    print fmt_row(13, ["seq num", "CE (bits)", "accuracy", "|g|_inf"], header=True)
    if args.profile: cgt.profiler.start()
    for i in xrange(args.n_batches):
        x, y = task.gen_batch()
        seq_num += x.shape[1]
        l, l01, g = f_loss_and_grad(x, y)
        print fmt_row(13, [seq_num, l, l01, np.abs(g).max()])
        rmsprop_update(g, state)
        pc.set_value_flat(state.theta)
        if not np.isfinite(l):
            break
    if args.profile: cgt.profiler.print_stats()
def main():
    nr.seed(0)
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", type=str, default="alice")
    parser.add_argument("--size_mem", type=int, default=64)
    parser.add_argument("--size_batch", type=int, default=64)
    parser.add_argument("--n_layers", type=int, default=2)
    parser.add_argument("--n_unroll", type=int, default=16)
    parser.add_argument("--step_size", type=float, default=.01)
    parser.add_argument("--decay_rate", type=float, default=0.95)
    parser.add_argument("--n_epochs", type=int, default=20)
    parser.add_argument("--arch", choices=["lstm", "gru"], default="lstm")
    parser.add_argument("--grad_check", action="store_true")
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--unittest", action="store_true")
    args = parser.parse_args()

    cgt.set_precision("quad" if args.grad_check else "single")
    assert args.n_unroll > 1
    loader = Loader(args.data_dir, args.size_batch, args.n_unroll, (.8, .1, .1))
    network, f_loss, f_loss_and_grad, f_step = make_loss_and_grad_and_step(
        args.arch, loader.size_vocab, loader.size_vocab, args.size_mem,
        args.size_batch, args.n_layers, args.n_unroll)

    if args.profile: profiler.start()

    params = network.get_parameters()
    pc = ParamCollection(params)
    pc.set_value_flat(nr.uniform(-.1, .1, size=(pc.get_total_size(),)))

    def initialize_hiddens(n):
        return [np.zeros((n, args.size_mem), cgt.floatX)
                for _ in xrange(get_num_hiddens(args.arch, args.n_layers))]

    if args.grad_check:
        x, y = loader.train_batches_iter().next()
        prev_hiddens = initialize_hiddens(args.size_batch)
        def f(thnew):
            thold = pc.get_value_flat()
            pc.set_value_flat(thnew)
            loss = f_loss(x, y, *prev_hiddens)
            pc.set_value_flat(thold)
            return loss
        from cgt.numeric_diff import numeric_grad
        g_num = numeric_grad(f, pc.get_value_flat(), eps=1e-10)
        result = f_loss_and_grad(x, y, *prev_hiddens)
        g_anal = result[1]
        assert np.allclose(g_num, g_anal, atol=1e-4)
        print "Gradient check succeeded!"
        return

    optim_state = make_rmsprop_state(theta=pc.get_value_flat(),
                                     step_size=args.step_size,
                                     decay_rate=args.decay_rate)
    for iepoch in xrange(args.n_epochs):
        losses = []
        tstart = time()
        print "starting epoch", iepoch
        cur_hiddens = initialize_hiddens(args.size_batch)
        for (x, y) in loader.train_batches_iter():
            out = f_loss_and_grad(x, y, *cur_hiddens)
            loss = out[0]
            grad = out[1]
            cur_hiddens = out[2:]
            rmsprop_update(grad, optim_state)
            pc.set_value_flat(optim_state.theta)
            losses.append(loss)
            if args.unittest:
                return
        print "%.3f s/batch. avg loss = %.3f" % ((time() - tstart) / len(losses), np.mean(losses))
        optim_state.step_size *= .98 #pylint: disable=E1101
        sample(f_step, initialize_hiddens(1), char2ind=loader.char2ind,
               n_steps=300, temp=1.0, seed_text="")

    if args.profile: profiler.print_stats()
def main():
    nr.seed(0)
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", type=str, default="alice")
    parser.add_argument("--size_mem", type=int, default=64)
    parser.add_argument("--size_batch", type=int, default=64)
    parser.add_argument("--n_layers", type=int, default=2)
    parser.add_argument("--n_unroll", type=int, default=16)
    parser.add_argument("--step_size", type=float, default=.01)
    parser.add_argument("--decay_rate", type=float, default=0.95)
    parser.add_argument("--n_epochs", type=int, default=20)
    parser.add_argument("--arch", choices=["lstm", "gru"], default="lstm")
    parser.add_argument("--grad_check", action="store_true")
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--unittest", action="store_true")
    parser.add_argument("--temperature", type=float, default=1)
    args = parser.parse_args()

    cgt.set_precision("quad" if args.grad_check else "single")
    assert args.n_unroll > 1
    loader = Loader(args.data_dir, args.size_batch, args.n_unroll, (1.0, 0, 0))
    network, f_loss, f_loss_and_grad, f_step = make_loss_and_grad_and_step(
        args.arch, loader.size_vocab, loader.size_vocab, args.size_mem,
        args.size_batch, args.n_layers, args.n_unroll)

    if args.profile: profiler.start()

    params = network.get_parameters()
    pc = ParamCollection(params)
    pc.set_value_flat(nr.uniform(-.1, .1, size=(pc.get_total_size(),)))

    def initialize_hiddens(n):
        return [np.zeros((n, args.size_mem), cgt.floatX)
                for _ in xrange(get_num_hiddens(args.arch, args.n_layers))]

    if args.grad_check:
        x, y = loader.train_batches_iter().next()
        prev_hiddens = initialize_hiddens(args.size_batch)
        def f(thnew):
            thold = pc.get_value_flat()
            pc.set_value_flat(thnew)
            loss = f_loss(x, y, *prev_hiddens)
            pc.set_value_flat(thold)
            return loss
        from cgt.numeric_diff import numeric_grad
        g_num = numeric_grad(f, pc.get_value_flat(), eps=1e-10)
        result = f_loss_and_grad(x, y, *prev_hiddens)
        g_anal = result[1]
        assert np.allclose(g_num, g_anal, atol=1e-4)
        print "Gradient check succeeded!"
        return

    optim_state = make_rmsprop_state(theta=pc.get_value_flat(),
                                     step_size=args.step_size,
                                     decay_rate=args.decay_rate)
    for iepoch in xrange(args.n_epochs):
        losses = []
        tstart = time()
        print "starting epoch", iepoch
        cur_hiddens = initialize_hiddens(args.size_batch)
        for (x, y) in loader.train_batches_iter():
            out = f_loss_and_grad(x, y, *cur_hiddens)
            loss = out[0]
            grad = out[1]
            cur_hiddens = out[2:]
            rmsprop_update(grad, optim_state)
            pc.set_value_flat(optim_state.theta)
            losses.append(loss)
            if args.unittest:
                return
        print "%.3f s/batch. avg loss = %.3f" % ((time() - tstart) / len(losses), np.mean(losses))
        optim_state.step_size *= .98 #pylint: disable=E1101
        sample(f_step, initialize_hiddens(1), char2ind=loader.char2ind,
               n_steps=1000, temperature=args.temperature, seed_text="")

    if args.profile: profiler.print_stats()
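# The demo scripts these main() functions come from would normally end with the
# standard entry-point guard; shown here as an assumption for completeness:
if __name__ == "__main__":
    main()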