def test1():
    n_types = 10
    n_labels = 4

    print()
    print('# test sequence labeler on mod data with LOLS')
    data = macarico.util.make_sequence_mod_data(20, 6, n_types, n_labels)
    data = [Example(x, y, n_labels) for x, y in data]

    tRNN = TransitionRNN([RNNFeatures(n_types)], [AttendAt()], n_labels)
    policy = LinearPolicy(tRNN, n_labels)
    optimizer = torch.optim.Adam(policy.parameters(), lr=0.01)

    p_rollin_ref = stochastic(ExponentialAnnealing(0.9))
    p_rollout_ref = stochastic(ExponentialAnnealing(0.9))

    # train on the first half of the data, evaluate on the second half
    macarico.util.trainloop(
        training_data=data[:len(data) // 2],
        dev_data=data[len(data) // 2:],
        policy=policy,
        learning_alg=lambda ex: LOLS.lols(ex, HammingLoss, HammingLossReference(),
                                          policy, p_rollin_ref, p_rollout_ref),
        losses=HammingLoss(),
        optimizer=optimizer,
        run_per_epoch=[p_rollin_ref.step, p_rollout_ref.step],
        train_eval_skip=1,
    )

def test2():
    # AggreVaTe
    print()
    print('# test sequence labeler on mod data with AggreVaTe')
    n_types = 10
    n_labels = 4

    data = macarico.util.make_sequence_mod_data(100, 5, n_types, n_labels)
    data = [Example(x, y, n_labels) for x, y in data]

    tRNN = TransitionRNN([RNNFeatures(n_types)], [AttendAt()], n_labels)
    policy = LinearPolicy(tRNN, n_labels)
    p_rollin_ref = stochastic(ExponentialAnnealing(0.99))
    optimizer = torch.optim.Adam(policy.parameters(), lr=0.01)

    macarico.util.trainloop(
        training_data=data[:len(data) // 2],
        dev_data=data[len(data) // 2:],
        policy=policy,
        Learner=lambda: AggreVaTe(HammingLossReference(), policy, p_rollin_ref),
        losses=HammingLoss(),
        optimizer=optimizer,
        run_per_epoch=[p_rollin_ref.step],
        n_epochs=4,
        train_eval_skip=1,
    )

def test0():
    print()
    print('# test sequence labeler on mod data with DAgger')
    n_types = 10
    n_labels = 4

    data = [Example(x, y, n_labels)
            for x, y in macarico.util.make_sequence_mod_data(100, 5, n_types, n_labels)]

    tRNN = Actor([RNNFeatures(n_types, output_field='mytok_rnn')],
                 [AttendAt(field='mytok_rnn')],
                 n_labels)
    policy = LinearPolicy(tRNN, n_labels)
    p_rollin_ref = stochastic(ExponentialAnnealing(0.99))
    optimizer = torch.optim.Adam(policy.parameters(), lr=0.01)

    macarico.util.trainloop(
        training_data=data[:len(data) // 2],
        dev_data=data[len(data) // 2:],
        policy=policy,
        Learner=lambda: DAgger(HammingLossReference(), policy, p_rollin_ref),
        losses=HammingLoss(),
        optimizer=optimizer,
        run_per_epoch=[p_rollin_ref.step],
        n_epochs=4,
        train_eval_skip=1,
    )

def test_wsj():
    print()
    print('# test on wsj subset')
    from macarico.data import nlp_data
    tr, de, te, vocab, label_id = \
        nlp_data.read_wsj_pos('data/wsj.pos', n_tr=50, n_de=50, n_te=0)

    n_types = len(vocab)
    n_labels = len(label_id)

    print('n_train: %s, n_dev: %s, n_test: %s' % (len(tr), len(de), len(te)))
    print('n_types: %s, n_labels: %s' % (n_types, n_labels))

    tRNN = TransitionRNN([RNNFeatures(n_types, rnn_type='RNN')], [AttendAt()], n_labels)
    policy = LinearPolicy(tRNN, n_labels)
    p_rollin_ref = stochastic(ExponentialAnnealing(0.9))
    optimizer = torch.optim.Adam(policy.parameters(), lr=0.01)

    macarico.util.trainloop(
        training_data=tr,
        dev_data=de,
        policy=policy,
        Learner=lambda: DAgger(HammingLossReference(), policy, p_rollin_ref),
        # Learner=lambda: MaximumLikelihood(HammingLossReference(), policy),
        losses=HammingLoss(),
        optimizer=optimizer,
        run_per_epoch=[p_rollin_ref.step],
        n_epochs=10,
        # train_eval_skip=None,
    )

def test1(learning_method, exploration):
    print()
    print('# testing learning_method=%d exploration=%d' % (learning_method, exploration))
    print()
    n_types = 10
    n_labels = 4

    data = macarico.util.make_sequence_mod_data(100, 6, n_types, n_labels)
    data = [Example(x, y, n_labels) for x, y in data]

    tRNN = TransitionRNN([RNNFeatures(n_types)], [AttendAt()], n_labels)
    policy = LinearPolicy(tRNN, n_labels)
    optimizer = torch.optim.Adam(policy.parameters(), lr=0.001)

    p_rollin_ref = stochastic(ExponentialAnnealing(0.9))
    p_rollout_ref = stochastic(ExponentialAnnealing(0.99999))

    macarico.util.trainloop(
        training_data=data[:len(data) // 2],
        dev_data=data[len(data) // 2:],
        policy=policy,
        Learner=lambda: BanditLOLS(HammingLossReference(),
                                   policy,
                                   p_rollin_ref,
                                   p_rollout_ref,
                                   learning_method,  # LEARN_IPS, LEARN_DR, LEARN_BIASED
                                   exploration),
        losses=HammingLoss(),
        optimizer=optimizer,
        run_per_epoch=[p_rollin_ref.step, p_rollout_ref.step],
        train_eval_skip=10,
    )

def test_restore(n_types, n_labels, data, model):
    # build a fresh (untrained) policy, then load the saved parameters into it
    actor = TransitionRNN([RNNFeatures(n_types)], [AttendAt()], n_labels)
    policy = LinearPolicy(actor, n_labels)
    print('evaluating new model: %g' %
          macarico.util.evaluate(data, policy, HammingLoss()))
    policy.load_state_dict(model)
    print('evaluating restored model: %g' %
          macarico.util.evaluate(data, policy, HammingLoss()))

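# Minimal sketch of the state_dict save/restore pattern that test_restore
# exercises, in plain PyTorch rather than macarico's LinearPolicy. TinyPolicy
# and _demo_state_dict_roundtrip are illustrative names only, not part of the
# library.
import torch
import torch.nn as nn

class TinyPolicy(nn.Module):
    def __init__(self, n_features=8, n_labels=4):
        super().__init__()
        self.scorer = nn.Linear(n_features, n_labels)

    def forward(self, x):
        return self.scorer(x)

def _demo_state_dict_roundtrip():
    trained = TinyPolicy()
    snapshot = trained.state_dict()   # the kind of object run_train returns as `model`
    fresh = TinyPolicy()              # freshly initialized, like `policy` in test_restore
    fresh.load_state_dict(snapshot)
    x = torch.randn(1, 8)
    assert torch.allclose(trained(x), fresh(x))  # restored model scores identically
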
def test1(LEARNER=LearnerOpts.DAGGER):
    print()
    print('Running test 1 with learner=%s' % LEARNER)
    print('=======================================================')

    n_states = 3
    n_actions = 2

    tRNN = TransitionRNN([mdp.MDPFeatures(n_states, noise_rate=0.5)],
                         [AttendAt(lambda _: 0, 's')],
                         n_actions)
    policy = LinearPolicy(tRNN, n_actions)

    p_rollin_ref = stochastic(ExponentialAnnealing(0.99))
    p_rollout_ref = stochastic(ExponentialAnnealing(1))
    optimizer = torch.optim.Adam(policy.parameters(), lr=0.01)

    test_mdp, pi_ref = make_ross_mdp()

    if LEARNER == LearnerOpts.DAGGER:
        learner = lambda: DAgger(pi_ref, policy, p_rollin_ref)
    elif LEARNER == LearnerOpts.TWISTED:
        learner = lambda: TwistedDAgger(pi_ref, policy, p_rollin_ref)
    elif LEARNER == LearnerOpts.MAXLIK:
        learner = lambda: MaximumLikelihood(pi_ref, policy)
    elif LEARNER == LearnerOpts.AGGREVATE:
        learner = lambda: AggreVaTe(pi_ref, policy, p_rollin_ref)
    elif LEARNER == LearnerOpts.LOLS:
        learner = None

    losses = []
    for epoch in range(101):
        optimizer.zero_grad()
        if learner is not None:
            l = learner()
            env = test_mdp.mk_env()
            res = env.run_episode(l)
            loss = mdp.MDPLoss()(test_mdp, env)
            l.update(loss)
        elif LEARNER == LearnerOpts.LOLS:
            lols(test_mdp, mdp.MDPLoss, pi_ref, policy, p_rollin_ref, p_rollout_ref)

        optimizer.step()
        p_rollin_ref.step()
        p_rollout_ref.step()

        # evaluate the current policy on a fresh episode
        env = test_mdp.mk_env()
        res = env.run_episode(policy)
        loss = mdp.MDPLoss()(test_mdp, env)
        losses.append(loss)
        if epoch % 20 == 0:
            print(epoch, sum(losses[-100:]) / len(losses[-100:]), '\t', res)

def run_gridworld(ex, actor):
    policy = LinearPolicy(actor(), 4)
    baseline = EWMA(0.8)
    optimizer = torch.optim.Adam(policy.parameters(), lr=0.01)

    losses = []
    for epoch in range(2001):
        optimizer.zero_grad()
        learner = Reinforce(policy, baseline)
        env = ex.mk_env()
        res = env.run_episode(learner)
        loss = GridLoss()(ex, env)
        losses.append(loss)
        if epoch % 100 == 0:
            print(sum(losses[-10:]) / len(losses[-10:]), '\t', res)
        learner.update(loss)
        optimizer.step()

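# Rough sketch of what a REINFORCE-style learner does per episode, in plain
# PyTorch rather than macarico's Reinforce/EWMA classes: accumulate the
# log-probs of the sampled actions, then scale their gradient by
# (episode loss - baseline). The function name and arguments here are
# illustrative only.
import torch
from torch.distributions import Categorical

def reinforce_episode_surrogate(logits_per_step, episode_loss, baseline_value):
    # logits_per_step: list of 1-D tensors of per-action scores, one per time step
    log_probs = []
    for logits in logits_per_step:
        dist = Categorical(logits=logits)
        a = dist.sample()                 # exploratory action
        log_probs.append(dist.log_prob(a))
    # lower loss is better, so we descend on (loss - baseline) * sum(log pi(a))
    advantage = episode_loss - baseline_value
    return advantage * torch.stack(log_probs).sum()

# usage: surrogate = reinforce_episode_surrogate(logits, loss, baseline); surrogate.backward()
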
def build_policy_bag(features_bag, n_actions, loss_fn, n_layers, hidden_dim):
    # one independently parameterized policy per feature extractor in the bag
    return [LinearPolicy(features,
                         n_actions,
                         loss_fn=loss_fn,
                         n_layers=n_layers,
                         hidden_dim=hidden_dim)
            for features in features_bag]

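# Illustrative sketch (not macarico's BootstrapPolicy) of why a bag of
# independently parameterized policies is useful: act greedily on one sampled
# member (Thompson-style) and treat disagreement across members as an
# exploration/uncertainty signal. All names below are hypothetical.
import random
import torch
import torch.nn as nn

def bag_act(policy_bag, features):
    # policy_bag: list of nn.Module scorers mapping features -> per-action scores
    scores = [p(features) for p in policy_bag]
    chosen = random.choice(scores).argmax().item()   # act on one sampled member
    votes = [s.argmax().item() for s in scores]
    disagreement = len(set(votes)) / len(votes)      # crude uncertainty estimate
    return chosen, disagreement

# usage with a toy bag of linear scorers over 8 features and 4 actions:
#   bag = [nn.Linear(8, 4) for _ in range(5)]
#   action, uncertainty = bag_act(bag, torch.randn(8))
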
def run_environment(ex, actor, lossfn, rl_alg=None, n_epochs=201, lr=0.01):
    if rl_alg is None:
        baseline = EWMA(0.8)
        rl_alg = lambda policy: Reinforce(policy, baseline)

    policy = LinearPolicy(actor(), ex.n_actions, n_layers=1)
    optimizer = torch.optim.Adam(policy.parameters(), lr=lr)

    losses = []
    for epoch in range(n_epochs):
        optimizer.zero_grad()
        learner = rl_alg(policy)
        # learner = AdvantageActorCritic(policy, baseline)
        env = ex.mk_env()
        res = env.run_episode(learner)  # , epoch % 5000 == 0)
        loss = lossfn(ex, env)
        losses.append(loss)
        if epoch % 20 == 0:
            print(epoch, '\t', sum(losses[-500:]) / len(losses[-500:]), '\t', res)
        learner.update(loss)
        optimizer.step()

def run_train(n_types, n_labels, data):
    actor = TransitionRNN([RNNFeatures(n_types)], [AttendAt()], n_labels)
    policy = LinearPolicy(actor, n_labels)

    print('training')
    _, model = macarico.util.trainloop(
        training_data=data[:len(data) // 2],
        dev_data=data[len(data) // 2:],
        policy=policy,
        Learner=lambda: MaximumLikelihood(HammingLossReference(), policy),
        losses=HammingLoss(),
        optimizer=torch.optim.Adam(policy.parameters(), lr=0.01),
        n_epochs=2,
        train_eval_skip=1,
        returned_parameters='best',
    )

    # `policy` currently holds the final-epoch parameters; `model` holds the
    # best parameters returned by trainloop
    print('evaluating final model: %g' %
          macarico.util.evaluate(data, policy, HammingLoss()))
    policy.load_state_dict(model)
    print('evaluating best model: %g' %
          macarico.util.evaluate(data, policy, HammingLoss()))
    return model

def test1(use_bootstrap):
    n_types = 10
    n_labels = 4

    print()
    print('# test sequence labeler on mod data with Reslope and',
          'bootstrap' if use_bootstrap else 'boltzmann', 'exploration')
    data = macarico.util.make_sequence_mod_data(3000, 6, n_types, n_labels)
    data = [Example(x, y, n_labels) for x, y in data]

    if not use_bootstrap:
        tRNN = TransitionRNN([RNNFeatures(n_types)], [AttendAt()], n_labels)
        policy = LinearPolicy(tRNN, n_labels)
    else:
        rnns = [TransitionRNN([RNNFeatures(n_types)], [AttendAt()], n_labels,
                              h_name='h%d' % i)
                for i in range(5)]
        policy = BootstrapPolicy(rnns, n_labels)

    optimizer = torch.optim.Adam(policy.parameters(), lr=0.01)
    p_ref = stochastic(ExponentialAnnealing(0.9))

    macarico.util.trainloop(
        training_data=data[:2048],
        dev_data=data[2048:],
        policy=policy,
        Learner=lambda: Reslope(HammingLossReference(), policy, p_ref,
                                exploration=BanditLOLS.EXPLORE_BOOTSTRAP if use_bootstrap
                                            else BanditLOLS.EXPLORE_BOLTZMANN),
        losses=HammingLoss(),
        optimizer=optimizer,
        run_per_epoch=[p_ref.step],
        train_eval_skip=1,
        bandit_evaluation=True,
        n_epochs=1,
    )

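# Sketch of the two exploration strategies being compared, in plain PyTorch
# (not macarico's internals): Boltzmann exploration samples an action from a
# softmax over action scores, while bootstrap exploration acts on one member
# of a policy bag (see the bag_act sketch after build_policy_bag above). The
# helper name is illustrative only.
import torch
from torch.distributions import Categorical

def boltzmann_action(scores, temperature=1.0):
    # scores: 1-D tensor of per-action scores, higher is better
    # (if the policy outputs per-action costs, negate them first);
    # lower temperature => greedier sampling
    return Categorical(logits=scores / temperature).sample().item()
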
def run_ppo(ex, actor, loss_fn, eps, learner_type):
    print(learner_type)
    print('Eps: ', eps)
    policy = LinearPolicy(actor(), ex.n_actions, n_layers=1)
    baseline = EWMA(0.8)
    optimizer = torch.optim.Adam(policy.parameters(), lr=0.01)

    losses = []
    n_episodes = 10000
    for episode in range(n_episodes):
        optimizer.zero_grad()
        if learner_type == 'ppo':
            learner = PPO(policy, baseline, eps)
        elif learner_type == 'reinforce':
            learner = Reinforce(policy, baseline)
        env = ex.mk_env()
        env.run_episode(learner)
        loss = loss_fn(ex, env)
        losses.append(loss)
        if episode % 5 == 0:
            print('episode: ', episode, 'loss:', sum(losses[-500:]) / len(losses[-500:]))
        learner.update(loss)
        optimizer.step()

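# Sketch of the clipped surrogate that `eps` controls in PPO, in plain PyTorch
# rather than macarico's PPO class. With losses instead of rewards, a natural
# advantage is (baseline - loss); we descend on the negated clipped objective.
# The function below is illustrative, not the library's implementation.
import torch

def ppo_clip_surrogate(new_log_probs, old_log_probs, advantages, eps):
    ratio = torch.exp(new_log_probs - old_log_probs.detach())
    unclipped = ratio * advantages
    clipped = torch.clamp(ratio, 1.0 - eps, 1.0 + eps) * advantages
    # maximize the pessimistic (element-wise min) objective => minimize its negation
    return -torch.min(unclipped, clipped).mean()
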
def test1(task=0, LEARNER=LearnerOpts.DAGGER):
    print()
    print('Running test 1 (v%d) with learner=%s' % (task, LEARNER))
    print('=======================================================')

    if task == 0:
        print('Sequence reversal task, easy version')
        data = macarico.util.make_sequence_reversal_data(100, 5, 5)
        foci = [AttendAt(lambda s: s.N - s.n - 1)]
    elif task == 1:
        print('Sequence reversal task, hard version')
        data = macarico.util.make_sequence_reversal_data(1000, 5, 5)
        foci = [AttendAt()]
    elif task == 2:
        print('Sequence reversal task, multi-focus version')
        data = macarico.util.make_sequence_reversal_data(100, 5, 5)
        foci = [AttendAt(), AttendAt(lambda s: s.N - s.n - 1)]
    elif task == 3:
        print('Memoryless task, add-one mod K')
        data = macarico.util.make_sequence_mod_data(50, 5, 10, 3)
        foci = [AttendAt()]
    elif task == 4:
        print('Matti-style data')
        data = make_matti_data(1000, 20, 2, 0.05)
        foci = [AttendAt()]

    n_types = 1 + max({x for X, _ in data for x in X})
    n_labels = 1 + max({y for _, Y in data for y in Y})

    data = [Example(x, y, n_labels) for x, y in data]
    random.shuffle(data)
    m = len(data) // 2
    train = data[:m]
    dev = data[m:]

    print('n_train: %s, n_dev: %s' % (len(train), len(dev)))
    print('n_types: %s, n_labels: %s' % (n_types, n_labels))
    print('learner:', LEARNER)
    print()

    tRNN = Actor([RNNFeatures(n_types)], foci, n_labels)
    policy = LinearPolicy(tRNN, n_labels)
    baseline = EWMA(0.8)
    p_rollin_ref = stochastic(ExponentialAnnealing(0.5))
    p_rollout_ref = stochastic(ExponentialAnnealing(0.5))

    if LEARNER == LearnerOpts.AC:
        from macarico.lts.reinforce import AdvantageActorCritic, LinearValueFn
        baseline = LinearValueFn(policy.features)
        policy.vfa = baseline  # adds the value-function params to policy via nn.Module

    optimizer = torch.optim.Adam(policy.parameters(), lr=0.01)

    if LEARNER == LearnerOpts.DAGGER:
        learner = lambda: DAgger(HammingLossReference(), policy, p_rollin_ref)
    elif LEARNER == LearnerOpts.TWISTED:
        learner = lambda: TwistedDAgger(HammingLossReference(), policy, p_rollin_ref)
    elif LEARNER == LearnerOpts.MAXLIK:
        learner = lambda: MaximumLikelihood(HammingLossReference(), policy)
    elif LEARNER == LearnerOpts.AC:
        learner = lambda: AdvantageActorCritic(policy, baseline)
    elif LEARNER == LearnerOpts.REINFORCE:
        learner = lambda: Reinforce(policy, baseline)
    elif LEARNER == LearnerOpts.BANDITLOLS:
        learner = lambda: BanditLOLS(HammingLossReference(), policy,
                                     p_rollin_ref, p_rollout_ref,
                                     BanditLOLS.LEARN_DR,
                                     BanditLOLS.EXPLORE_UNIFORM,
                                     baseline)

    macarico.util.trainloop(
        training_data=train,
        dev_data=dev,
        policy=policy,
        Learner=learner,
        losses=HammingLoss(),
        optimizer=optimizer,
        run_per_epoch=[p_rollin_ref.step, p_rollout_ref.step],
        n_epochs=10,
        train_eval_skip=1,
    )