def search_model_adam(state, channel, reload_model=False):
    # NOTE: relies on the surrounding module for its imports (numpy as np, theano,
    # theano.tensor as TT, RandomStreams, pprint as pp, and the project-level
    # classes such as NTMModel, FBbABIDataIteratorSingleQ, WeightInitializer, ...).
    pp.pprint(state)

    def NReLU(x, rng=None, use_noise=False):
        assert rng is not None
        if use_noise:
            stds = Sigmoid(x)
            x = x + rng.normal(x.shape, avg=0.0, std=stds, dtype=x.dtype)
        return Trect(x)

    def NRect(x, rng=None, use_noise=False, std=0.05):
        assert rng is not None
        if use_noise:
            x = x + rng.normal(x.shape, avg=0.0, std=std, dtype=x.dtype)
        return Trect(x)

    def get_inps(use_mask=True, vgen=None, use_bow_out=False, debug=False,
                 output_map=None):
        if use_mask:
            X, y, mask, cmask = TT.itensor3("X"), TT.imatrix("y"), \
                TT.fmatrix("mask"), TT.fmatrix("cost_mask")
            qmask = TT.fmatrix("qmask")
            bow_out = TT.ftensor3("bow_out")

            if debug:
                theano.config.compute_test_value = "warn"
                batch = vgen.next()
                X.tag.test_value = batch['x'].astype("int32")
                y.tag.test_value = batch['y'].astype("int32")
                mask.tag.test_value = batch['mask'].astype("float32")
                cmask.tag.test_value = batch['cmask'].astype("float32")
                qmask.tag.test_value = batch["qmask"].astype("float32")
                if use_bow_out:
                    bow_out.tag.test_value = batch['bow_out'].astype("float32")

            if output_map:
                outs = {"X": X, "y": y, "mask": mask, "cmask": cmask}
                if use_bow_out:
                    outs["bow_out"] = bow_out
                outs["qmask"] = qmask
            else:
                outs = [X, y, mask, cmask]
                if use_bow_out:
                    outs += [bow_out]
            return outs
        else:
            X, y = TT.itensor3("X"), TT.itensor3("y")
            if debug:
                theano.config.compute_test_value = "warn"
                batch = vgen.next()
                X.tag.test_value = batch['x']
                y.tag.test_value = batch['y']
            return [X, y]

    lr = state.lr
    batch_size = state.batch_size

    seed = state.get("seed", 3)
    seed_path = "{0}/seed_{1}.txt".format(state.save_path, str(seed))
    replace_seed(seed_path, seed)
    seed_setter = SEEDSetter(seed_path)
    print "seed is", seed_setter

    # No of els in the cols of the content for the memory
    mem_size = state.mem_size

    # No of rows in M
    mem_nel = state.mem_nel
    std = state.std
    renormalization_scale = state.renormalization_scale
    sub_mb_size = state.sub_mb_size
    smoothed_diff_weights = state.get('smoothed_diff_weights', True)

    # No of hids for controller
    n_hids = state.n_hids

    # Not using deep out
    deep_out_size = 100

    # Size of the bow embeddings
    bow_size = state.get('bow_size', 80)

    # ff controller
    use_ff_controller = state.use_ff_controller

    # For RNN controller:
    learn_h0 = state.get('learn_h0', False)
    use_nogru_mem2q = False

    # Use loc based addressing:
    use_loc_based_addressing = state.get('use_loc_based_addressing', False)
    bowout = state.get('bowout', True)
    use_reinforce = state.get('use_reinforce', False)

    max_seq_len = state.max_seq_len
    max_fact_len = state.max_fact_len

    n_read_heads = state.n_read_heads
    n_write_heads = 1
    n_reading_steps = state.n_reading_steps

    lambda1_rein = state.get('lambda1_rein', 4e-5)
    lambda2_rein = state.get('lambda2_rein', 1e-5)
    base_reg = 2e-5

    # Size of the address in the memory:
    address_size = state.address_size
    w2v_embed_scale = 0.05
    use_layer_norm = state.get('use_layer_norm', False)

    rng = np.random.RandomState(int(seed))
    trng = RandomStreams(int(seed))

    # Bind the rng/std into the noisy rectifier; capture the helper first so the
    # lambda does not shadow (and recursively call) itself.
    _nrect = NRect
    NRect = lambda x, use_noise=False: _nrect(x, rng=trng, use_noise=use_noise,
                                              std=std)
    use_noise = False

    emb_scale = state.get('emb_scale', 0.32)
    use_quad_interactions = state.get('use_quad_interactions', True)
    mode = state.get('theano_function_mode', None)

    import sys
    sys.setrecursionlimit(50000)

    learning_rule = Adam(gradient_clipping=state.get('gradient_clip', 10))

    task_id = state.task_id
    print "Task id is ", task_id

    cont_act = Tanh
    mem_gater_activ = Sigmoid
    erase_activ = Sigmoid
    content_activ = Tanh

    use_gru_inp = state.get('use_gru_inp', True)
    use_bow_inp = state.get('use_bow_inp', False)

    w2v_embed_path = None
    use_reinforce_baseline = state.use_reinforce_baseline
    use_reinforce = state.get('use_reinforce', False)

    l1_pen = state.get('l1_pen', 1e-4)
    l2_pen = state.get('l2_pen', 1e-3)
    hybrid_att = state.get('hybrid_att', False)
    use_dice_val = state.get('use_dice_val', False)
    debug = state.get('debug', False)
    correlation_ws = state.get('correlation_ws', 6e-4)
    anticorr = state.get('anticorr', None)

    path = state.path
    prfx = (
        "ntm_on_fb_BABI_task_%(task_id)d_seed_%(seed)s_learn_h0_l1_no_n_hids_%(n_hids)s_bsize_%(batch_size)d"
        "_std_%(std)f_mem_nel_%(mem_nel)d_mem_size_%(mem_size)f_lr_%(lr)f"
    ) % locals()
    prfx = state.save_path + prfx

    tdata_gen = FBbABIDataIteratorSingleQ(
        task_file='all_tasks_train_ngram_False.pkl',
        randomize=True, max_seq_len=max_seq_len, max_fact_len=max_fact_len,
        task_id=task_id, task_path=path, mode='train',
        fact_vocab="../all_tasks_test_ngram_False_dict.pkl",
        batch_size=batch_size)

    vdata_gen = FBbABIDataIteratorSingleQ(
        task_file='all_tasks_valid_ngram_False.pkl',
        max_fact_len=tdata_gen.max_fact_len, max_seq_len=max_seq_len,
        randomize=False, task_id=task_id, mode="valid", task_path=path,
        fact_vocab="../all_tasks_test_ngram_False_dict.pkl",
        batch_size=batch_size)

    tst_data_gen = FBbABIDataIteratorSingleQ(
        task_file='../all_tasks_test_ngram_False.pkl',
        max_fact_len=tdata_gen.max_fact_len, max_seq_len=max_seq_len,
        randomize=False, task_id=task_id, mode="valid", task_path=path,
        fact_vocab="../all_tasks_test_ngram_False_dict.pkl",
        batch_size=batch_size)

    use_mask = True
    n_layers = state.get('n_layers', 1)
    inps = get_inps(vgen=vdata_gen, debug=debug, use_bow_out=bowout,
                    output_map=True)

    wi = WeightInitializer(sparsity=-1, scale=std, rng=rng,
                           init_method=InitMethods.Adaptive, center=0.0)
    bi = BiasInitializer(sparsity=-1, scale=std, rng=rng,
                         init_method=BiasInitMethods.Constant, center=0.0)

    print "Length of the vocabulary, ", len(tdata_gen.vocab.items())

    ntm = NTMModel(
        n_in=len(tdata_gen.vocab.items()), n_hids=n_hids, bow_size=bow_size,
        n_out=len(tdata_gen.vocab.items()), predict_bow_out=bowout,
        mem_size=mem_size, mem_nel=mem_nel, use_ff_controller=use_ff_controller,
        sub_mb_size=sub_mb_size, deep_out_size=deep_out_size, inps=inps,
        n_layers=n_layers, hybrid_att=hybrid_att,
        smoothed_diff_weights=smoothed_diff_weights, baseline_reg=base_reg,
        w2v_embed_path=w2v_embed_path,
        renormalization_scale=renormalization_scale,
        w2v_embed_scale=w2v_embed_scale, emb_scale=emb_scale,
        n_read_heads=n_read_heads, n_write_heads=n_write_heads,
        use_layer_norm=use_layer_norm, use_last_hidden_state=False,
        use_loc_based_addressing=use_loc_based_addressing,
        use_simple_rnn_inp_rep=False, use_gru_inp_rep=use_gru_inp,
        use_bow_input=use_bow_inp, anticorr=anticorr, erase_activ=erase_activ,
        use_gate_quad_interactions=use_quad_interactions,
        content_activ=content_activ, use_multiscale_shifts=True,
        correlation_ws=correlation_ws, learning_rule=learning_rule,
        lambda1_rein=lambda1_rein, lambda2_rein=lambda2_rein,
        n_reading_steps=n_reading_steps, use_deepout=False,
        use_reinforce=use_reinforce, use_nogru_mem2q=use_nogru_mem2q,
        use_reinforce_baseline=use_reinforce_baseline,
        controller_activ=cont_act, use_adv_indexing=False, use_out_mem=False,
        unroll_recurrence=False, address_size=address_size,
        reinforce_decay=0.9, learn_h0=learn_h0, theano_function_mode=mode,
        l1_pen=l1_pen, debug=debug, mem_gater_activ=mem_gater_activ,
        tie_read_write_gates=False, weight_initializer=wi, bias_initializer=bi,
        use_cost_mask=True, use_noise=use_noise, max_fact_len=max_fact_len,
        softmax=True, use_mask=use_mask, batch_size=batch_size)

    bow_weight_stop = state.get('bow_weight_stop', 1.2 * 1e-1)
    bow_weight_anneal_start = state.get('bow_weight_anneal_start', 320)
    bow_weight_start = state.get("bow_weight_start", 0.74)
    bow_out_anneal_rate = state.get("bow_out_anneal_rate", 2 * 1e-4)
    save_freq = state.get("save_freq", 1000)

    main_loop = FBaBIMainLoop(
        ntm, print_every=50, checkpoint_every=save_freq, validate_every=500,
        bow_out_anneal_rate=bow_out_anneal_rate,
        bow_weight_start=bow_weight_start, bow_weight_stop=bow_weight_stop,
        bow_weight_anneal_start=bow_weight_anneal_start,
        train_data_gen=tdata_gen, valid_data_gen=vdata_gen,
        test_data_gen=tst_data_gen, learning_rate=lr,
        reload_model=reload_model, valid_iters=None, linear_start=False,
        use_qmask=True, max_iters=state.max_iters, state=state, prefix=prfx)

    main_loop.run()

    if channel is None:
        return None
    return channel.COMPLETE
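# For reference, search_model_adam above reads most of its hyperparameters from
# `state` by attribute access (with .get fallbacks for the optional ones). The
# sketch below shows a minimal, hypothetical configuration object: the AttrDict
# helper, the concrete values, and the paths are illustrative assumptions, not
# the settings used in the original experiments.
class AttrDict(dict):
    """Dict that also supports attribute access, like the `state` used above."""
    __getattr__ = dict.__getitem__
    __setattr__ = dict.__setitem__

example_state = AttrDict(
    lr=3e-3, batch_size=160, seed=3,
    mem_size=28, mem_nel=102, address_size=20,       # memory geometry
    n_hids=180, std=0.05, renormalization_scale=5.0,
    sub_mb_size=160, n_read_heads=1, n_reading_steps=1,
    use_ff_controller=True, use_reinforce_baseline=False,
    max_seq_len=100, max_fact_len=12, max_iters=80000,
    task_id=1,
    path="/path/to/babi/splitted_trainval/",         # placeholder dataset path
    save_path="./models/")
# search_model_adam(example_state, channel=None) would then build and train the model.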
def search_model_adam(state, channel, reload_model=False):
    pp.pprint(state)

    def NReLU(x, rng=None, use_noise=False):
        assert rng is not None
        if use_noise:
            stds = Sigmoid(x)
            x = x + rng.normal(x.shape, avg=0.0, std=stds, dtype=x.dtype)
        return Trect(x)

    def NRect(x, rng=None, use_noise=False, std=0.05):
        # Noisy rectifier (same helper as in the bABI script above); defined here
        # because the lambda below expects it.
        assert rng is not None
        if use_noise:
            x = x + rng.normal(x.shape, avg=0.0, std=std, dtype=x.dtype)
        return Trect(x)

    def get_inps(use_mask=True, vgen=None, debug=False, output_map=None):
        if use_mask:
            X, y, mask, cmask = TT.ftensor3("X"), TT.ftensor3("y"), \
                TT.fmatrix("mask"), TT.ftensor3("cost_mask")
            if debug:
                theano.config.compute_test_value = "warn"
                batch = vgen.next()
                X.tag.test_value = batch['x'].astype("float32")
                y.tag.test_value = batch['y'].astype("float32")
                mask.tag.test_value = batch['mask'].astype("float32")
                cmask.tag.test_value = batch['cmask'].astype("float32")
            if output_map:
                outs = {"X": X, "y": y, "mask": mask, "cmask": cmask}
            else:
                outs = [X, y, mask, cmask]
            return outs
        else:
            X, y = TT.tensor3("X"), TT.tensor3("y")
            if debug:
                theano.config.compute_test_value = "warn"
                batch = vgen.next()
                X.tag.test_value = batch['x']
                y.tag.test_value = batch['y']
            return [X, y]

    lr = state['lr']
    batch_size = state['batch_size']

    # No of els in the cols of the content for the memory
    mem_size = state['mem_size']

    # No of rows in M
    mem_nel = state['mem_nel']
    std = state['std']
    renormalization_scale = state['renormalization_scale']
    sub_mb_size = state['sub_mb_size']
    smoothed_diff_weights = state.get('smoothed_diff_weights', True)

    max_len = 10
    inp_size = 10

    # No of hids for controller
    n_hids = state['n_hids']

    # Not using deep out
    deep_out_size = 100

    # Size of the bow embeddings
    bow_size = state.get('bow_size', 80)

    # ff controller
    use_ff_controller = state['use_ff_controller']

    # For RNN controller:
    learn_h0 = state.get('learn_h0', False)
    use_nogru_mem2q = False

    # Use loc based addressing:
    use_loc_based_addressing = state.get('use_loc_based_addressing', False)
    bowout = state.get('bowout', False)
    use_reinforce = state.get('use_reinforce', False)

    seed = 7
    n_read_heads = state['n_read_heads']
    n_write_heads = 1
    n_reading_steps = state['n_reading_steps']

    lambda1_rein = state.get('lambda1_rein', 4e-5)
    lambda2_rein = state.get('lambda2_rein', 1e-5)
    base_reg = 2e-5

    # Size of the address in the memory:
    address_size = state['address_size']
    w2v_embed_scale = 0.05

    rng = np.random.RandomState(seed)
    trng = RandomStreams(seed)

    # Bind the rng/std into the noisy rectifier without shadowing it recursively.
    _nrect = NRect
    NRect = lambda x, use_noise=False: _nrect(x, rng=trng, use_noise=use_noise,
                                              std=std)
    use_noise = False

    use_quad_interactions = state.get('use_quad_interactions', True)
    mode = state.get('theano_function_mode', None)

    import sys
    sys.setrecursionlimit(50000)

    learning_rule = Adam(gradient_clipping=state.get('gradient_clip', 10))

    cont_act = Tanh
    mem_gater_activ = Sigmoid
    erase_activ = Sigmoid
    content_activ = Tanh

    use_gru_inp = state.get('use_gru_inp', False)
    use_bow_inp = state.get('use_bow_inp', False)

    w2v_embed_path = None
    use_reinforce_baseline = state['use_reinforce_baseline']
    use_reinforce = state.get('use_reinforce', False)

    l1_pen = state.get('l1_pen', 1e-4)
    l2_pen = state.get('l2_pen', 1e-3)
    hybrid_att = state.get('hybrid_att', False)
    use_dice_val = state.get('use_dice_val', False)
    debug = state.get('debug', False)
    correlation_ws = state.get('correlation_ws', False)
    anticorr = state.get('anticorr', None)

    prfx = (
        "ntm_copy_learn_h0_l1_no_n_hids_%(n_hids)s_bsize_%(batch_size)d_rs_%(renormalization_scale)f"
        "_std_%(std)f_mem_nel_%(mem_nel)d_mem_size_%(mem_size)d_lr_%(lr)f_%(address_size)d"
    ) % locals()
    save_path = state.get("save_path", ".")
    prfx = save_path + prfx

    tdata_gen = CopyDataGen(batch_size, max_len, inp_size, rng=rng,
                            seed=seed, rnd_len=True)
    vdata_gen = CopyDataGen(batch_size, max_len, inp_size, rng=rng,
                            seed=2, rnd_len=False)
    tst_data_gen = CopyDataGen(batch_size, max_len, inp_size, rng=rng,
                               seed=3, rnd_len=False)

    n_layers = state.get('n_layers', 1)
    inps = get_inps(vgen=vdata_gen, debug=debug, output_map=True)

    wi = WeightInitializer(sparsity=-1, scale=std, rng=rng,
                           init_method=InitMethods.Adaptive, center=0.0)
    bi = BiasInitializer(sparsity=-1, scale=std, rng=rng,
                         init_method=BiasInitMethods.Constant, center=0.0)

    ntm = NTMModel(
        n_in=inp_size, n_hids=n_hids, bow_size=bow_size, n_out=inp_size,
        predict_bow_out=bowout, mem_size=mem_size, mem_nel=mem_nel,
        use_ff_controller=use_ff_controller, sub_mb_size=sub_mb_size,
        deep_out_size=deep_out_size, inps=inps, n_layers=n_layers,
        hybrid_att=hybrid_att, smoothed_diff_weights=smoothed_diff_weights,
        baseline_reg=base_reg, w2v_embed_path=w2v_embed_path,
        renormalization_scale=renormalization_scale,
        w2v_embed_scale=w2v_embed_scale, n_read_heads=n_read_heads,
        n_write_heads=n_write_heads, use_last_hidden_state=False,
        use_loc_based_addressing=use_loc_based_addressing,
        use_simple_rnn_inp_rep=False, use_gru_inp_rep=use_gru_inp,
        use_bow_input=use_bow_inp, anticorr=anticorr, erase_activ=erase_activ,
        use_gate_quad_interactions=use_quad_interactions,
        content_activ=content_activ, use_multiscale_shifts=True,
        correlation_ws=correlation_ws, learning_rule=learning_rule,
        lambda1_rein=lambda1_rein, lambda2_rein=lambda2_rein,
        n_reading_steps=n_reading_steps, use_deepout=False,
        use_reinforce=use_reinforce, use_nogru_mem2q=use_nogru_mem2q,
        use_reinforce_baseline=use_reinforce_baseline,
        controller_activ=cont_act, use_adv_indexing=False, use_out_mem=False,
        unroll_recurrence=False, address_size=address_size,
        reinforce_decay=0.9, learn_h0=learn_h0, theano_function_mode=mode,
        l1_pen=l1_pen, debug=debug, mem_gater_activ=mem_gater_activ,
        tie_read_write_gates=False, weight_initializer=wi, bias_initializer=bi,
        use_cost_mask=True, use_noise=use_noise, max_fact_len=max_len,
        softmax=False, batch_size=batch_size)

    save_freq = state.get("save_freq", 1000)

    main_loop = NTMToyMainLoop(
        ntm, print_every=50, checkpoint_every=save_freq, validate_every=500,
        train_data_gen=tdata_gen, valid_data_gen=vdata_gen,
        test_data_gen=tst_data_gen, learning_rate=lr,
        reload_model=reload_model, valid_iters=200,
        max_iters=state['max_iters'], state=state, prefix=prfx)

    main_loop.run()
def search_model_adam(state, channel, reload_model=False):
    pp.pprint(state)

    def NReLU(x, rng=None, use_noise=False):
        assert rng is not None
        if use_noise:
            stds = Sigmoid(x)
            x = x + rng.normal(x.shape, avg=0.0, std=stds, dtype=x.dtype)
        return Trect(x)

    def NRect(x, rng=None, use_noise=False, std=0.05):
        # Noisy rectifier (same helper as in the scripts above); defined here
        # because the lambda below expects it.
        assert rng is not None
        if use_noise:
            x = x + rng.normal(x.shape, avg=0.0, std=std, dtype=x.dtype)
        return Trect(x)

    def get_inps(vgen=None, debug=False, output_map=None):
        X, y = TT.fmatrix("X"), TT.vector("y", dtype="uint8")
        if debug:
            theano.config.compute_test_value = "warn"
            batch = vgen.get_epoch_iterator().next()
            X.tag.test_value = batch[0].reshape((batch[0].shape[0], -1))
            y.tag.test_value = batch[1].flatten()
        return [X, y]

    lr = state['lr']
    batch_size = state['batch_size']

    # No of els in the cols of the content for the memory
    mem_size = state['mem_size']

    # No of rows in M
    mem_nel = state['mem_nel']
    std = state['std']
    renormalization_scale = state['renormalization_scale']
    sub_mb_size = state['sub_mb_size']
    smoothed_diff_weights = state.get('smoothed_diff_weights', False)

    max_len = 784
    inp_size = 1

    # No of hids for controller
    n_hids = state['n_hids']

    # Not using deep out
    deep_out_size = 100

    # Size of the bow embeddings
    bow_size = state.get('bow_size', 80)

    # ff controller
    use_ff_controller = state['use_ff_controller']

    # For RNN controller:
    learn_h0 = state.get('learn_h0', False)
    use_nogru_mem2q = False

    # Use loc based addressing:
    use_loc_based_addressing = state.get('use_loc_based_addressing', False)
    bowout = state.get('bowout', False)
    use_reinforce = state.get('use_reinforce', False)
    permute_order = state.get('permute_order', True)

    seed = 7
    n_read_heads = state['n_read_heads']
    n_write_heads = 1
    n_reading_steps = state['n_reading_steps']

    lambda1_rein = state.get('lambda1_rein', 4e-5)
    lambda2_rein = state.get('lambda2_rein', 1e-5)
    base_reg = 2e-5

    # Size of the address in the memory:
    address_size = state["address_size"]
    w2v_embed_scale = 0.05
    n_out = 10

    rng = np.random.RandomState(seed)
    trng = RandomStreams(seed)

    # Bind the rng/std into the noisy rectifier without shadowing it recursively.
    _nrect = NRect
    NRect = lambda x, use_noise=False: _nrect(x, rng=trng, use_noise=use_noise,
                                              std=std)
    use_noise = False

    use_quad_interactions = state.get('use_quad_interactions', True)
    mode = state.get('theano_function_mode', None)

    import sys
    sys.setrecursionlimit(50000)

    learning_rule = Adam(gradient_clipping=state.get('gradient_clip', 10))

    cont_act = Tanh
    mem_gater_activ = Sigmoid
    erase_activ = Sigmoid
    content_activ = Tanh

    use_gru_inp = state.get('use_gru_inp', False)
    use_bow_inp = state.get('use_bow_inp', False)

    w2v_embed_path = None
    use_reinforce_baseline = state['use_reinforce_baseline']
    use_reinforce = state.get('use_reinforce', False)

    l1_pen = state.get('l1_pen', 1e-4)
    l2_pen = state.get('l2_pen', 1e-3)
    hybrid_att = state.get('hybrid_att', False)
    use_dice_val = state.get('use_dice_val', False)
    debug = state.get('debug', False)
    correlation_ws = state.get('correlation_ws', False)

    # Fixed pixel permutation for the permuted sequential-MNIST setting.
    idxs = np.arange(max_len)
    np.random.shuffle(idxs)

    use_batch_norm = state.get("use_batch_norm", False)
    anticorr = state.get('anticorr', None)

    prfx = (
        "ntm_on_fb_copy_task_all_learn_h0_l1_no_n_hids_%(n_hids)s_bsize_%(batch_size)d"
        "_std_%(std)f_mem_nel_%(mem_nel)d_mem_size_%(mem_size)f_lr_%(lr)f_use_bn_%(use_batch_norm)d_hard2"
    ) % locals()

    tdata_gen = get_stream(which_set="train", batch_size=batch_size)
    vdata_gen = get_stream(which_set="valid", batch_size=batch_size)
    tst_data_gen = get_stream(which_set="test", batch_size=batch_size)

    n_layers = state.get('n_layers', 1)
    inps = get_inps(vgen=vdata_gen, debug=debug, output_map=True)

    wi = WeightInitializer(sparsity=-1, scale=std, rng=rng,
                           init_method=InitMethods.Adaptive, center=0.0)
    bi = BiasInitializer(sparsity=-1, scale=1e-3, rng=rng,
                         init_method=BiasInitMethods.Random, center=0.0)

    ntm = NTMModel(
        n_in=inp_size, n_hids=n_hids, bow_size=bow_size, n_out=n_out,
        predict_bow_out=bowout, mem_size=mem_size, mem_nel=mem_nel,
        use_ff_controller=use_ff_controller, sub_mb_size=sub_mb_size,
        deep_out_size=deep_out_size, inps=inps, n_layers=n_layers,
        hybrid_att=hybrid_att, smoothed_diff_weights=smoothed_diff_weights,
        baseline_reg=base_reg, w2v_embed_path=w2v_embed_path,
        renormalization_scale=renormalization_scale,
        use_batch_norm=use_batch_norm, w2v_embed_scale=w2v_embed_scale,
        n_read_heads=n_read_heads, n_write_heads=n_write_heads,
        use_last_hidden_state=True,
        use_loc_based_addressing=use_loc_based_addressing,
        use_simple_rnn_inp_rep=False, use_gru_inp_rep=use_gru_inp,
        use_bow_input=use_bow_inp, use_inp_content=False, anticorr=anticorr,
        erase_activ=erase_activ,
        use_gate_quad_interactions=use_quad_interactions,
        content_activ=content_activ, use_multiscale_shifts=True,
        correlation_ws=correlation_ws, learning_rule=learning_rule,
        lambda1_rein=lambda1_rein, lambda2_rein=lambda2_rein,
        n_reading_steps=n_reading_steps, use_deepout=False,
        use_reinforce=use_reinforce, use_nogru_mem2q=use_nogru_mem2q,
        use_reinforce_baseline=use_reinforce_baseline,
        controller_activ=cont_act, use_adv_indexing=False, use_out_mem=False,
        unroll_recurrence=False, address_size=address_size,
        reinforce_decay=0.9, learn_h0=learn_h0, theano_function_mode=mode,
        l1_pen=l1_pen, debug=debug, mem_gater_activ=mem_gater_activ,
        tie_read_write_gates=False, weight_initializer=wi, bias_initializer=bi,
        use_cost_mask=False, use_noise=use_noise, rnd_indxs=idxs,
        permute_order=permute_order, max_fact_len=max_len, softmax=True,
        batch_size=None)

    save_freq = state.get("save_freq", 1000)

    main_loop = SeqMNISTMainLoop(
        ntm, print_every=50, checkpoint_every=save_freq, validate_every=500,
        train_data_gen=tdata_gen, valid_data_gen=vdata_gen,
        test_data_gen=tst_data_gen, learning_rate=lr,
        reload_model=reload_model, num_epochs=250, state=state, prefix=prfx)

    main_loop.run()
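# The copy-task and sequential-MNIST variants above index `state` as a plain
# dictionary (plus .get for optional keys). A hypothetical configuration
# covering the keys they read without defaults might look like the dict below;
# the values are placeholders, not the original experiment settings.
example_state = {
    "lr": 1e-3,
    "batch_size": 128,
    "mem_size": 28,                 # columns of the memory content
    "mem_nel": 102,                 # rows of M
    "address_size": 20,
    "n_hids": 100,
    "std": 0.01,
    "renormalization_scale": 5.0,
    "sub_mb_size": 128,
    "n_read_heads": 1,
    "n_reading_steps": 1,
    "use_ff_controller": False,
    "use_reinforce_baseline": False,
    "max_iters": 80000,             # read by the copy-task script only
}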
def train(dim_word_desc=400,  # word vector dimensionality
          dim_word_q=400, dim_word_ans=600, dim_proj=300,
          dim=400,  # the number of LSTM units
          encoder_desc='lstm', encoder_desc_word='lstm',
          encoder_desc_sent='lstm', use_dq_sims=False, eyem=None,
          learn_h0=False, use_desc_skip_c_g=False, debug=False,
          encoder_q='lstm', patience=10, max_epochs=5000, dispFreq=100,
          decay_c=0., alpha_c=0., clip_c=-1., lrate=0.01,
          n_words_q=49145, n_words_desc=115425, n_words_ans=409,
          pkl_train_files=None, pkl_valid_files=None,
          maxlen=2000,  # maximum length of the description
          optimizer='rmsprop', batch_size=2, vocab=None, valid_batch_size=16,
          use_elu_g=False, saveto='model.npz', model_dir=None, ms_nlayers=3,
          validFreq=1000,
          saveFreq=1000,  # save the parameters after every saveFreq updates
          datasets=[None], truncate=400, momentum=0.9, use_bidir=False,
          cost_mask=None,
          valid_datasets=['/u/yyu/stor/caglar/rc-data/cnn/cnn_test_data.h5',
                          '/u/yyu/stor/caglar/rc-data/cnn/cnn_valid_data.h5'],
          dropout_rate=0.5, use_dropout=True, reload_=True,
          **opt_ds):

    ensure_dir_exists(model_dir)
    mpath = os.path.join(model_dir, saveto)
    mpath_best = os.path.join(model_dir, prfx("best", saveto))
    mpath_last = os.path.join(model_dir, prfx("last", saveto))
    mpath_stats = os.path.join(model_dir, prfx("stats", saveto))

    # Model options
    model_options = locals().copy()
    model_options['use_sent_reps'] = opt_ds['use_sent_reps']
    stats = defaultdict(list)

    del model_options['eyem']
    del model_options['cost_mask']

    if cost_mask is not None:
        cost_mask = sharedX(cost_mask)

    # Reload options and parameters.
    if reload_:
        print "Reloading the model."
        if os.path.exists(mpath_best):
            print "Reloading the best model from %s." % mpath_best
            with open(os.path.join(mpath_best, '%s.pkl' % mpath_best), 'rb') as f:
                models_options = pkl.load(f)
            params = init_params(model_options)
            params = load_params(mpath_best, params)
        elif os.path.exists(mpath):
            print "Reloading the model from %s." % mpath
            with open(os.path.join(mpath, '%s.pkl' % mpath), 'rb') as f:
                models_options = pkl.load(f)
            params = init_params(model_options)
            params = load_params(mpath, params)
        else:
            raise IOError("Couldn't open the file.")
    else:
        print "Not reloading; initializing the model from scratch."
        params = init_params(model_options)

    if datasets[0]:
        print "Short dataset", datasets[0]

    print 'Loading data'
    print 'Building model'

    if pkl_train_files is None or pkl_valid_files is None:
        train, valid, test = load_data(path=datasets[0],
                                       valid_path=valid_datasets[0],
                                       test_path=valid_datasets[1],
                                       batch_size=batch_size,
                                       **opt_ds)
    else:
        train, valid, test = load_pkl_data(train_file_paths=pkl_train_files,
                                           valid_file_paths=pkl_valid_files,
                                           batch_size=batch_size,
                                           vocab=vocab, eyem=eyem, **opt_ds)

    tparams = init_tparams(params)
    trng, use_noise, inps_d, \
        opt_ret, \
        cost, errors, ent_errors, ent_derrors, probs = \
        build_model(tparams, model_options,
                    prepare_data if not opt_ds['use_sent_reps']
                    else prepare_data_sents,
                    valid, cost_mask=cost_mask)

    alphas = opt_ret['dec_alphas']

    if opt_ds['use_sent_reps']:
        inps = [inps_d["desc"], inps_d["word_mask"], inps_d["q"],
                inps_d['q_mask'], inps_d['ans'], inps_d['wlen'],
                inps_d['slen'], inps_d['qlen'], inps_d['ent_mask']]
    else:
        inps = [inps_d["desc"], inps_d["word_mask"], inps_d["q"],
                inps_d['q_mask'], inps_d['ans'], inps_d['wlen'],
                inps_d['qlen'], inps_d['ent_mask']]

    outs = [cost, errors, probs, alphas]
    if ent_errors:
        outs += [ent_errors]
    if ent_derrors:
        outs += [ent_derrors]

    # Before any regularizer.
    print 'Building f_log_probs...',
    f_log_probs = theano.function(inps, outs, profile=profile)
    print 'Done'

    # Apply weight decay on the feed-forward connections.
    if decay_c > 0.:
        decay_c = theano.shared(numpy.float32(decay_c), name='decay_c')
        weight_decay = 0.
        for kk, vv in tparams.iteritems():
            if "logit" in kk or "ff" in kk:
                weight_decay += (vv ** 2).sum()
        weight_decay *= decay_c
        cost += weight_decay

    # After any regularizer.
    print 'Computing gradient...',
    grads = safe_grad(cost, itemlist(tparams))
    print 'Done'

    # Gradient clipping by the global norm of the gradients.
    if clip_c > 0.:
        g2 = get_norms(grads)
        for p, g in grads.iteritems():
            grads[p] = tensor.switch(g2 > (clip_c ** 2),
                                     (g / tensor.sqrt(g2 + 1e-8)) * clip_c,
                                     g)

    inps.pop()

    if optimizer.lower() == "adasecant":
        learning_rule = Adasecant(delta_clip=25.0, use_adagrad=True,
                                  grad_clip=0.25, gamma_clip=0.)
    elif optimizer.lower() == "rmsprop":
        learning_rule = RMSPropMomentum(init_momentum=momentum)
    elif optimizer.lower() == "adam":
        learning_rule = Adam()
    elif optimizer.lower() == "adadelta":
        learning_rule = AdaDelta()

    lr = tensor.scalar(name='lr')
    print 'Building optimizers...',
    # NOTE: setting learning_rule to None here bypasses the learning-rule objects
    # selected above and always falls back to the eval(optimizer) branch.
    learning_rule = None

    if learning_rule:
        f_grad_shared, f_update = learning_rule.get_funcs(learning_rate=lr,
                                                          grads=grads,
                                                          inp=inps,
                                                          cost=cost,
                                                          errors=errors)
    else:
        f_grad_shared, f_update = eval(optimizer)(lr, tparams, grads,
                                                  inps, cost, errors)
    print 'Done'
    print 'Optimization'

    history_errs = []
    # Reload history.
    if reload_ and os.path.exists(mpath):
        history_errs = list(numpy.load(mpath)['history_errs'])

    best_p = None
    bad_count = 0

    if validFreq == -1:
        validFreq = len(train[0]) / batch_size
    if saveFreq == -1:
        saveFreq = len(train[0]) / batch_size

    best_found = False
    uidx = 0
    estop = False

    train_cost_ave, train_err_ave, \
        train_gnorm_ave = reset_train_vals()

    for eidx in xrange(max_epochs):
        n_samples = 0

        if train.done:
            train.reset()

        for d_, q_, a, em in train:
            n_samples += len(a)
            uidx += 1
            use_noise.set_value(1.)

            if opt_ds['use_sent_reps']:
                # To mask the description and the question.
                d, d_mask, q, q_mask, dlen, slen, qlen = prepare_data_sents(d_, q_)

                if d is None:
                    print 'Minibatch with zero sample under length ', maxlen
                    uidx -= 1
                    continue

                ud_start = time.time()
                cost, errors, gnorm, pnorm = f_grad_shared(d, d_mask, q, q_mask,
                                                           a, dlen, slen, qlen)
            else:
                d, d_mask, q, q_mask, dlen, qlen = prepare_data(d_, q_)

                if d is None:
                    print 'Minibatch with zero sample under length ', maxlen
                    uidx -= 1
                    continue

                ud_start = time.time()
                cost, errors, gnorm, pnorm = f_grad_shared(d, d_mask, q, q_mask,
                                                           a, dlen, qlen)

            upnorm = f_update(lrate)
            ud = time.time() - ud_start

            # Collect the running-average train stats.
            train_cost_ave = running_ave(train_cost_ave, cost)
            train_err_ave = running_ave(train_err_ave, errors)
            train_gnorm_ave = running_ave(train_gnorm_ave, gnorm)

            if numpy.isnan(cost) or numpy.isinf(cost):
                print 'NaN detected'
                import ipdb
                ipdb.set_trace()

            if numpy.mod(uidx, dispFreq) == 0:
                print 'Epoch ', eidx, ' Update ', uidx, \
                    ' Cost ', cost, ' UD ', ud, \
                    ' UpNorm ', upnorm[0].tolist(), \
                    ' GNorm ', gnorm, \
                    ' Pnorm ', pnorm, 'Terrors ', errors

            if numpy.mod(uidx, saveFreq) == 0:
                print 'Saving...',
                if best_p is not None and best_found:
                    numpy.savez(mpath_best, history_errs=history_errs, **best_p)
                    pkl.dump(model_options, open('%s.pkl' % mpath_best, 'wb'))
                else:
                    params = unzip(tparams)
                    numpy.savez(mpath, history_errs=history_errs, **params)
                    pkl.dump(model_options, open('%s.pkl' % mpath, 'wb'))
                    pkl.dump(stats, open("%s.pkl" % mpath_stats, 'wb'))
                print 'Done'
                print_param_norms(tparams)

            if numpy.mod(uidx, validFreq) == 0:
                use_noise.set_value(0.)
                if valid.done:
                    valid.reset()

                valid_costs, valid_errs, valid_probs, \
                    valid_alphas, error_ent, error_dent = \
                    eval_model(f_log_probs,
                               prepare_data if not opt_ds['use_sent_reps']
                               else prepare_data_sents,
                               model_options, valid,
                               use_sent_rep=opt_ds['use_sent_reps'])

                valid_alphas_ = numpy.concatenate(
                    [va.argmax(0) for va in valid_alphas.tolist()], axis=0)
                valid_err = valid_errs.mean()
                valid_cost = valid_costs.mean()
                valid_alpha_ent = -negentropy(valid_alphas)

                mean_valid_alphas = valid_alphas_.mean()
                std_valid_alphas = valid_alphas_.std()

                mean_valid_probs = valid_probs.argmax(1).mean()
                std_valid_probs = valid_probs.argmax(1).std()

                history_errs.append([valid_cost, valid_err])

                stats['train_err_ave'].append(train_err_ave)
                stats['train_cost_ave'].append(train_cost_ave)
                stats['train_gnorm_ave'].append(train_gnorm_ave)
                stats['valid_errs'].append(valid_err)
                stats['valid_costs'].append(valid_cost)
                stats['valid_err_ent'].append(error_ent)
                stats['valid_err_desc_ent'].append(error_dent)
                stats['valid_alphas_mean'].append(mean_valid_alphas)
                stats['valid_alphas_std'].append(std_valid_alphas)
                stats['valid_alphas_ent'].append(valid_alpha_ent)
                stats['valid_probs_mean'].append(mean_valid_probs)
                stats['valid_probs_std'].append(std_valid_probs)

                if uidx == 0 or valid_err <= numpy.array(history_errs)[:, 1].min():
                    best_p = unzip(tparams)
                    bad_count = 0
                    best_found = True
                else:
                    best_found = False

                if numpy.isnan(valid_err):
                    import ipdb
                    ipdb.set_trace()

                print "============================"
                print '\t>>>Valid error: ', valid_err, \
                    ' Valid cost: ', valid_cost
                print '\t>>>Valid pred mean: ', mean_valid_probs, \
                    ' Valid pred std: ', std_valid_probs
                print '\t>>>Valid alphas mean: ', mean_valid_alphas, \
                    ' Valid alphas std: ', std_valid_alphas, \
                    ' Valid alpha negent: ', valid_alpha_ent, \
                    ' Valid error ent: ', error_ent, \
                    ' Valid error desc ent: ', error_dent
                print "============================"
                print "Running average train stats "
                print '\t>>>Train error: ', train_err_ave, \
                    ' Train cost: ', train_cost_ave, \
                    ' Train grad norm: ', train_gnorm_ave
                print "============================"

                train_cost_ave, train_err_ave, \
                    train_gnorm_ave = reset_train_vals()

        print 'Seen %d samples' % n_samples

        if estop:
            break

    if best_p is not None:
        zipp(best_p, tparams)

    use_noise.set_value(0.)
    valid.reset()
    valid_cost, valid_error, valid_probs, \
        valid_alphas, error_ent, error_dent = \
        eval_model(f_log_probs,
                   prepare_data if not opt_ds['use_sent_reps']
                   else prepare_data_sents,
                   model_options, valid,
                   use_sent_rep=opt_ds['use_sent_reps'])

    print " Final eval results: "
    print 'Valid error: ', valid_error.mean()
    print 'Valid cost: ', valid_cost.mean()
    print '\t>>>Valid pred mean: ', valid_probs.mean(), \
        ' Valid pred std: ', valid_probs.std(), \
        ' Valid error ent: ', error_ent

    params = copy.copy(best_p)
    numpy.savez(mpath_last, zipped_params=best_p,
                history_errs=history_errs, **params)

    return valid_err, valid_cost
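# The gradient-clipping block in train() rescales every gradient when the global
# norm of all gradients exceeds clip_c. The small numpy sketch below mirrors that
# tensor.switch rule; it assumes get_norms returns the squared global norm (sum of
# squared entries over all gradients), which is what the clip_c ** 2 comparison
# above suggests. Names here are illustrative, not part of the original code.
import numpy as np

def clip_by_global_norm(grads, clip_c):
    # If the squared global norm exceeds clip_c**2, scale every gradient so the
    # global norm becomes (approximately) clip_c; otherwise leave them unchanged.
    g2 = sum(np.sum(g ** 2) for g in grads.values())
    if g2 > clip_c ** 2:
        scale = clip_c / np.sqrt(g2 + 1e-8)
        grads = dict((p, g * scale) for p, g in grads.items())
    return grads

# Toy example: the global norm is 5.0 and clip_c is 1.0, so both gradients
# shrink by roughly a factor of 5.
toy_grads = {"W": np.array([3.0, 0.0]), "b": np.array([0.0, 4.0])}
clipped = clip_by_global_norm(toy_grads, clip_c=1.0)
print clipped["W"], clipped["b"]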
def search_model_adam_gru_soft(state, channel):

    def NReLU(x, rng=None, use_noise=False):
        assert rng is not None
        if use_noise:
            stds = Sigmoid(x)
            x = x + rng.normal(x.shape, avg=0.0, std=stds, dtype=x.dtype)
        return Trect(x)

    def NRect(x, rng=None, use_noise=False, std=0.05):
        assert rng is not None
        if use_noise:
            x = x + rng.normal(x.shape, avg=0.0, std=std, dtype=x.dtype)
        return Trect(x)

    def get_inps(use_mask=True, vgen=None, debug=False):
        if use_mask:
            X, y, mask, cmask = TT.itensor3("X"), TT.imatrix("y"), \
                TT.fmatrix("mask"), TT.fmatrix("cost_mask")
            if debug:
                theano.config.compute_test_value = "warn"
                batch = vgen.next()
                X.tag.test_value = batch['x'].astype("int32")
                y.tag.test_value = batch['y'].astype("int32")
                mask.tag.test_value = batch['mask'].astype("float32")
                cmask.tag.test_value = batch['cmask'].astype("float32")
            return [X, y, mask, cmask]
        else:
            X, y = TT.itensor3("X"), TT.itensor3("y")
            if debug:
                theano.config.compute_test_value = "warn"
                batch = vgen.next()
                X.tag.test_value = batch['x']
                y.tag.test_value = batch['y']
            return [X, y]

    lr = state.lr
    batch_size = state.batch_size
    seed = state.get("seed", 3)

    # No of els in the cols of the content for the memory
    mem_size = state.mem_size

    # No of rows in M
    mem_nel = state.mem_nel
    std = state.std
    renormalization_scale = state.renormalization_scale
    sub_mb_size = state.sub_mb_size

    # No of hids for controller
    n_hids = state.n_hids

    # Not using deep out
    deep_out_size = 100

    # Size of the bow embeddings
    bow_size = state.n_hids

    # ff controller
    use_ff_controller = True

    # For RNN controller:
    learn_h0 = True
    use_nogru_mem2q = False

    # Use loc based addressing:
    use_loc_based_addressing = False

    # NOTE: this fixed seed overrides the value read from the state above.
    seed = 7
    max_seq_len = 100
    max_fact_len = 12

    n_read_heads = 1
    n_write_heads = 1
    n_reading_steps = 1

    lambda1_rein = 4e-5
    lambda2_rein = 1e-5
    base_reg = 3e-5

    # Size of the address in the memory:
    address_size = 20
    w2v_embed_scale = 0.05

    rng = np.random.RandomState(seed)
    trng = RandomStreams(seed)

    # Bind the rng/std into the noisy rectifier without shadowing it recursively.
    _nrect = NRect
    NRect = lambda x, use_noise=False: _nrect(x, rng=trng, use_noise=use_noise,
                                              std=std)
    use_noise = False

    use_quad_interactions = True
    mode = None

    import sys
    sys.setrecursionlimit(50000)

    learning_rule = Adam(gradient_clipping=10)
    task_id = state.task_id

    cont_act = Tanh
    mem_gater_activ = Sigmoid
    erase_activ = Sigmoid
    content_activ = Tanh
    w2v_embed_path = None
    use_reinforce_baseline = False

    l1_pen = 7e-4
    l2_pen = 9e-4
    debug = False

    path = "/data/lisatmp3/gulcehrc/data/tasks_1-20_v1-2/en-10k/splitted_trainval/"
    prfx = (
        "ntm_on_fb_BABI_task_all__learn_h0_l1_no_n_hids_%(n_hids)s_bsize_%(batch_size)d"
        "_std_%(std)f_mem_nel_%(mem_nel)d_mem_size_%(mem_size)f_lr_%(lr)f"
    ) % locals()

    tdata_gen = FBbABIDataIteratorSingleQ(
        task_file='all_tasks_train_ngram_False.pkl',
        randomize=True, max_seq_len=max_seq_len, max_fact_len=max_fact_len,
        task_id=task_id, task_path=path, mode='train',
        fact_vocab="all_tasks_train_ngram_False_dict.pkl",
        batch_size=batch_size)

    vdata_gen = FBbABIDataIteratorSingleQ(
        task_file='all_tasks_valid_ngram_False.pkl',
        max_fact_len=tdata_gen.max_fact_len, max_seq_len=max_seq_len,
        randomize=False, task_id=task_id, mode="valid", task_path=path,
        fact_vocab="all_tasks_train_ngram_False_dict.pkl",
        batch_size=batch_size)

    inps = get_inps(vgen=vdata_gen, debug=debug)

    wi = WeightInitializer(sparsity=-1, scale=std, rng=rng,
                           init_method=InitMethods.Adaptive, center=0.0)
    bi = BiasInitializer(sparsity=-1, scale=std, rng=rng,
                         init_method=BiasInitMethods.Constant, center=0.0)

    print "Length of the vocabulary, ", len(tdata_gen.vocab.items())

    ntm = NTMModel(
        n_in=len(tdata_gen.vocab.items()), n_hids=n_hids, bow_size=bow_size,
        n_out=len(tdata_gen.vocab.items()), mem_size=mem_size, mem_nel=mem_nel,
        use_ff_controller=use_ff_controller, sub_mb_size=sub_mb_size,
        deep_out_size=deep_out_size, inps=inps, baseline_reg=base_reg,
        w2v_embed_path=w2v_embed_path,
        renormalization_scale=renormalization_scale,
        w2v_embed_scale=w2v_embed_scale, n_read_heads=n_read_heads,
        n_write_heads=n_write_heads, use_last_hidden_state=False,
        use_loc_based_addressing=use_loc_based_addressing,
        use_gru_inp_rep=False, use_bow_input=True, erase_activ=erase_activ,
        use_gate_quad_interactions=use_quad_interactions,
        content_activ=content_activ, use_multiscale_shifts=True,
        learning_rule=learning_rule, lambda1_rein=lambda1_rein,
        lambda2_rein=lambda2_rein, n_reading_steps=n_reading_steps,
        use_deepout=False, use_reinforce=False,
        use_nogru_mem2q=use_nogru_mem2q,
        use_reinforce_baseline=use_reinforce_baseline,
        controller_activ=cont_act, use_adv_indexing=False, use_out_mem=False,
        unroll_recurrence=False, address_size=address_size,
        reinforce_decay=0.9, learn_h0=learn_h0, theano_function_mode=mode,
        l1_pen=l1_pen, mem_gater_activ=mem_gater_activ,
        tie_read_write_gates=False, weight_initializer=wi, bias_initializer=bi,
        use_cost_mask=True, use_noise=use_noise, max_fact_len=max_fact_len,
        softmax=True, batch_size=batch_size)

    main_loop = FBaBIMainLoop(
        ntm, print_every=40, checkpoint_every=400, validate_every=100,
        train_data_gen=tdata_gen, valid_data_gen=vdata_gen, learning_rate=lr,
        reload_model=False, valid_iters=None, linear_start=False,
        max_iters=state.max_iters, prefix=prfx)

    main_loop.run()
    return channel.COMPLETE
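# Every script above defines the same pair of noisy rectifiers (NReLU / NRect)
# but never calls them directly in the code shown: both add Gaussian noise to the
# pre-activation before rectifying. The numpy sketch below illustrates the two
# variants under the assumption that Trect is a plain rectifier (np.maximum is
# used as a stand-in); the helper names and values here are illustrative only.
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def nrelu(x, rng, use_noise=True):
    # NReLU above: the noise std depends on the input through a sigmoid.
    if use_noise:
        x = x + rng.normal(loc=0.0, scale=sigmoid(x))
    return np.maximum(x, 0.0)  # stands in for Trect

def nrect(x, rng, use_noise=True, std=0.05):
    # NRect above: fixed noise std.
    if use_noise:
        x = x + rng.normal(loc=0.0, scale=std, size=x.shape)
    return np.maximum(x, 0.0)

demo_rng = np.random.RandomState(0)
print nrect(np.linspace(-1.0, 1.0, 5), demo_rng)
print nrelu(np.linspace(-1.0, 1.0, 5), demo_rng)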