def test_molecule_mcts(self):
    d_model = 8
    hidden_size = 16
    num_layers = 2
    # Embeddings provider and reward RNN compose the reward network.
    encoder = Encoder(gen_data.n_characters, d_model,
                      gen_data.char2idx[gen_data.pad_symbol],
                      return_tuple=False)
    rnn = RewardNetRNN(d_model, hidden_size, num_layers,
                       bidirectional=True, unit_type='gru')
    env = MoleculeEnv(
        gen_data,
        RewardFunction(reward_net=torch.nn.Sequential(encoder, rnn),
                       policy=lambda x: gen_data.all_characters[
                           np.random.randint(gen_data.n_characters)],
                       actions=gen_data.all_characters))
    # Roll out a few random actions and collect the per-step rewards.
    rewards = []
    for i in range(5):
        env.render()
        action = env.action_space.sample()
        s_prime, reward, done, info = env.step(action)
        rewards.append(reward)
        if done:
            env.reset()
            break
    print(f'rewards: {rewards}')
def test_mol_env(self):
    d_model = 8
    hidden_size = 16
    num_layers = 1
    encoder = Encoder(gen_data.n_characters, d_model,
                      gen_data.char2idx[gen_data.pad_symbol],
                      return_tuple=True)
    rnn = RewardNetRNN(d_model, hidden_size, num_layers,
                       bidirectional=True, unit_type='gru')
    reward_net = torch.nn.Sequential(encoder, rnn)
    env = MoleculeEnv(
        gen_data,
        RewardFunction(reward_net=reward_net,
                       policy=lambda x: gen_data.all_characters[
                           np.random.randint(gen_data.n_characters)],
                       actions=gen_data.all_characters))
    # Sanity-check the Gym-style action/observation spaces before stepping.
    print(f'sample action: {env.action_space.sample()}')
    print(f'sample observation: {env.observation_space.sample()}')
    s = env.reset()
    for i in range(5):
        env.render()
        action = env.action_space.sample()
        print(f'action = {action}')
        s_prime, reward, done, info = env.step(action)
        if done:
            env.reset()
            break
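# A minimal rollout sketch (not part of the original tests), assuming only the
# Gym-style reset/step/action_space API exercised above. The helper name
# `rollout_episode` and the `max_steps` cap are illustrative.
def rollout_episode(env, max_steps=100):
    """Sample random actions until the episode ends; return collected rewards."""
    env.reset()
    rewards = []
    for _ in range(max_steps):
        action = env.action_space.sample()
        _, reward, done, _ = env.step(action)
        rewards.append(reward)
        if done:
            break
    return rewards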
def test_reward_rnn(self):
    x, y = gen_data.random_training_set(batch_size=bz)
    d_model = 8
    hidden_size = 16
    num_layers = 2
    encoder = Encoder(gen_data.n_characters, d_model,
                      gen_data.char2idx[gen_data.pad_symbol],
                      return_tuple=False)
    # Wrap the batch in a list as expected by the encoder's forward.
    x = encoder([x])
    rnn = RewardNetRNN(d_model, hidden_size, num_layers,
                       bidirectional=True, unit_type='lstm')
    r = rnn(x)
    print(f'reward: {r}')
def initialize(hparams, demo_data_gen, unbiased_data_gen, prior_data_gen, *args, **kwargs):
    # Embeddings provider
    encoder = Encoder(vocab_size=demo_data_gen.n_characters,
                      d_model=hparams['d_model'],
                      padding_idx=demo_data_gen.char2idx[demo_data_gen.pad_symbol],
                      dropout=hparams['dropout'],
                      return_tuple=True)

    # Agent entities
    rnn_layers = []
    has_stack = True
    for i in range(1, hparams['agent_params']['num_layers'] + 1):
        rnn_layers.append(StackRNN(layer_index=i,
                                   input_size=hparams['d_model'],
                                   hidden_size=hparams['d_model'],
                                   has_stack=has_stack,
                                   unit_type=hparams['agent_params']['unit_type'],
                                   stack_width=hparams['agent_params']['stack_width'],
                                   stack_depth=hparams['agent_params']['stack_depth'],
                                   k_mask_func=encoder.k_padding_mask))
        if hparams['agent_params']['num_layers'] > 1:
            rnn_layers.append(StackedRNNDropout(hparams['dropout']))
            rnn_layers.append(StackedRNNLayerNorm(hparams['d_model']))
    agent_net = nn.Sequential(encoder,
                              *rnn_layers,
                              RNNLinearOut(out_dim=demo_data_gen.n_characters,
                                           hidden_size=hparams['d_model'],
                                           bidirectional=False,
                                           bias=True))
    agent_net = agent_net.to(device)
    optimizer_agent_net = parse_optimizer(hparams['agent_params'], agent_net)
    selector = MolEnvProbabilityActionSelector(actions=demo_data_gen.all_characters)
    probs_reg = StateActionProbRegistry()
    init_state_args = {'num_layers': hparams['agent_params']['num_layers'],
                       'hidden_size': hparams['d_model'],
                       'stack_depth': hparams['agent_params']['stack_depth'],
                       'stack_width': hparams['agent_params']['stack_width'],
                       'unit_type': hparams['agent_params']['unit_type']}
    agent = PolicyAgent(model=agent_net,
                        action_selector=selector,
                        states_preprocessor=seq2tensor,
                        initial_state=agent_net_hidden_states_func,
                        initial_state_args=init_state_args,
                        apply_softmax=True,
                        probs_registry=probs_reg,
                        device=device)
    drl_alg = REINFORCE(model=agent_net,
                        optimizer=optimizer_agent_net,
                        initial_states_func=agent_net_hidden_states_func,
                        initial_states_args=init_state_args,
                        prior_data_gen=prior_data_gen,
                        device=device,
                        xent_lambda=hparams['xent_lambda'],
                        gamma=hparams['gamma'],
                        grad_clipping=hparams['reinforce_max_norm'],
                        lr_decay_gamma=hparams['lr_decay_gamma'],
                        lr_decay_step=hparams['lr_decay_step_size'],
                        delayed_reward=not hparams['use_monte_carlo_sim'])

    # Reward function entities
    reward_net = nn.Sequential(encoder,
                               RewardNetRNN(input_size=hparams['d_model'],
                                            hidden_size=hparams['reward_params']['d_model'],
                                            num_layers=hparams['reward_params']['num_layers'],
                                            bidirectional=hparams['reward_params']['bidirectional'],
                                            use_attention=hparams['reward_params']['use_attention'],
                                            dropout=hparams['dropout'],
                                            unit_type=hparams['reward_params']['unit_type'],
                                            use_smiles_validity_flag=hparams['reward_params']['use_validity_flag']))
    reward_net = reward_net.to(device)
    expert_model = XGBPredictor(hparams['expert_model_dir'])
    true_reward_func = get_jak2_max_reward if hparams['bias_mode'] == 'max' else get_jak2_min_reward
    reward_function = RewardFunction(reward_net,
                                     mc_policy=agent,
                                     actions=demo_data_gen.all_characters,
                                     device=device,
                                     use_mc=hparams['use_monte_carlo_sim'],
                                     mc_max_sims=hparams['monte_carlo_N'],
                                     expert_func=expert_model,
                                     no_mc_fill_val=hparams['no_mc_fill_val'],
                                     true_reward_func=true_reward_func,
                                     use_true_reward=hparams['use_true_reward'])
    optimizer_reward_net = parse_optimizer(hparams['reward_params'], reward_net)
    demo_data_gen.set_batch_size(hparams['reward_params']['demo_batch_size'])
    irl_alg = GuidedRewardLearningIRL(reward_net,
                                      optimizer_reward_net,
                                      demo_data_gen,
                                      k=hparams['reward_params']['irl_alg_num_iter'],
                                      agent_net=agent_net,
                                      agent_net_init_func=agent_net_hidden_states_func,
                                      agent_net_init_func_args=init_state_args,
                                      device=device)
    init_args = {'agent': agent,
                 'probs_reg': probs_reg,
                 'drl_alg': drl_alg,
                 'irl_alg': irl_alg,
                 'reward_func': reward_function,
                 'gamma': hparams['gamma'],
                 'episodes_to_train': hparams['episodes_to_train'],
                 'expert_model': expert_model,
                 'demo_data_gen': demo_data_gen,
                 'unbiased_data_gen': unbiased_data_gen,
                 'gen_args': {'num_layers': hparams['agent_params']['num_layers'],
                              'hidden_size': hparams['d_model'],
                              'num_dir': 1,
                              'stack_depth': hparams['agent_params']['stack_depth'],
                              'stack_width': hparams['agent_params']['stack_width'],
                              'has_stack': has_stack,
                              'has_cell': hparams['agent_params']['unit_type'] == 'lstm',
                              'device': device}}
    return init_args
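# A sketch of the hyper-parameter dictionary the `initialize` above reads from.
# Every key below is dereferenced somewhere in the function body; the concrete
# values are illustrative assumptions only, and parse_optimizer may require
# additional optimizer-specific keys not shown here.
example_hparams = {
    'd_model': 128,
    'dropout': 0.2,
    'gamma': 0.97,
    'xent_lambda': 0.3,
    'reinforce_max_norm': 5.0,
    'lr_decay_gamma': 0.1,
    'lr_decay_step_size': 1000,
    'use_monte_carlo_sim': True,
    'monte_carlo_N': 5,
    'no_mc_fill_val': 0.0,
    'use_true_reward': False,
    'bias_mode': 'max',
    'expert_model_dir': './expert_model_dir',  # placeholder path
    'episodes_to_train': 10,
    'agent_params': {'num_layers': 2, 'unit_type': 'gru',
                     'stack_width': 256, 'stack_depth': 200},
    'reward_params': {'d_model': 256, 'num_layers': 2, 'bidirectional': True,
                      'use_attention': False, 'unit_type': 'gru',
                      'use_validity_flag': True, 'demo_batch_size': 32,
                      'irl_alg_num_iter': 5},
}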
def initialize(hparams, demo_data_gen, unbiased_data_gen, has_critic):
    # Embeddings provider
    encoder = Encoder(vocab_size=demo_data_gen.n_characters,
                      d_model=hparams['d_model'],
                      padding_idx=demo_data_gen.char2idx[demo_data_gen.pad_symbol],
                      dropout=hparams['dropout'],
                      return_tuple=True).eval()

    # Agent entities
    rnn_layers = []
    has_stack = True
    for i in range(1, hparams['agent_params']['num_layers'] + 1):
        rnn_layers.append(StackRNN(layer_index=i,
                                   input_size=hparams['d_model'],
                                   hidden_size=hparams['d_model'],
                                   has_stack=has_stack,
                                   unit_type=hparams['agent_params']['unit_type'],
                                   stack_width=hparams['agent_params']['stack_width'],
                                   stack_depth=hparams['agent_params']['stack_depth'],
                                   k_mask_func=encoder.k_padding_mask))
        if hparams['agent_params']['num_layers'] > 1:
            rnn_layers.append(StackedRNNDropout(hparams['dropout']))
            rnn_layers.append(StackedRNNLayerNorm(hparams['d_model']))
    agent_net = nn.Sequential(encoder,
                              *rnn_layers,
                              RNNLinearOut(out_dim=demo_data_gen.n_characters,
                                           hidden_size=hparams['d_model'],
                                           bidirectional=False,
                                           bias=True))
    agent_net = agent_net.to(device).eval()
    init_state_args = {'num_layers': hparams['agent_params']['num_layers'],
                       'hidden_size': hparams['d_model'],
                       'stack_depth': hparams['agent_params']['stack_depth'],
                       'stack_width': hparams['agent_params']['stack_width'],
                       'unit_type': hparams['agent_params']['unit_type']}
    if has_critic:
        critic = nn.Sequential(encoder,
                               CriticRNN(hparams['d_model'],
                                         hparams['critic_params']['d_model'],
                                         unit_type=hparams['critic_params']['unit_type'],
                                         dropout=hparams['critic_params']['dropout'],
                                         num_layers=hparams['critic_params']['num_layers']))
        critic = critic.to(device).eval()
    else:
        critic = None

    # Reward function entities
    reward_net_rnn = RewardNetRNN(input_size=hparams['d_model'],
                                  hidden_size=hparams['reward_params']['d_model'],
                                  num_layers=hparams['reward_params']['num_layers'],
                                  bidirectional=hparams['reward_params']['bidirectional'],
                                  use_attention=hparams['reward_params']['use_attention'],
                                  dropout=hparams['reward_params']['dropout'],
                                  unit_type=hparams['reward_params']['unit_type'],
                                  use_smiles_validity_flag=hparams['reward_params']['use_validity_flag'])
    reward_net = nn.Sequential(encoder, reward_net_rnn)
    reward_net = reward_net.to(device)
    # expert_model = RNNPredictor(hparams['expert_model_params'], device)
    demo_data_gen.set_batch_size(hparams['reward_params']['demo_batch_size'])
    init_args = {'agent_net': agent_net,
                 'critic_net': critic,
                 'reward_net': reward_net,
                 'reward_net_rnn': reward_net_rnn,
                 'encoder': encoder.eval(),
                 'gamma': hparams['gamma'],
                 # 'expert_model': expert_model,
                 'demo_data_gen': demo_data_gen,
                 'unbiased_data_gen': unbiased_data_gen,
                 'init_hidden_states_args': init_state_args,
                 'gen_args': {'num_layers': hparams['agent_params']['num_layers'],
                              'hidden_size': hparams['d_model'],
                              'num_dir': 1,
                              'stack_depth': hparams['agent_params']['stack_depth'],
                              'stack_width': hparams['agent_params']['stack_width'],
                              'has_stack': has_stack,
                              'has_cell': hparams['agent_params']['unit_type'] == 'lstm',
                              'device': device}}
    return init_args
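# A hedged usage sketch (not in the original source) of the eval-mode
# `initialize` above; `hparams` and the data generators are assumed to exist
# with the keys this function reads, and the helper name is illustrative.
def example_build_eval_entities(hparams, demo_data_gen, unbiased_data_gen):
    init_args = initialize(hparams, demo_data_gen, unbiased_data_gen, has_critic=True)
    # With has_critic=False, 'critic_net' would be None instead.
    assert init_args['critic_net'] is not None
    # reward_net is Sequential(encoder, reward_net_rnn), so index 1 is the RNN head.
    assert init_args['reward_net_rnn'] is init_args['reward_net'][1]
    return init_args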
def test_policy_net(self):
    d_model = 8
    hidden_size = 16
    num_layers = 1
    stack_width = 10
    stack_depth = 20
    unit_type = 'lstm'

    # Create a function to provide initial hidden states
    def hidden_states_func(batch_size=1):
        return [get_initial_states(batch_size, hidden_size, 1, stack_depth,
                                   stack_width, unit_type)
                for _ in range(num_layers)]

    # Encoder to map character indices to embeddings
    encoder = Encoder(gen_data.n_characters, d_model,
                      gen_data.char2idx[gen_data.pad_symbol],
                      return_tuple=True)

    # Create agent network
    stack_rnn = StackRNN(1, d_model, hidden_size, True, 'lstm', stack_width,
                         stack_depth, k_mask_func=encoder.k_padding_mask)
    stack_linear = RNNLinearOut(gen_data.n_characters, hidden_size,
                                bidirectional=False)
    agent_net = torch.nn.Sequential(encoder, stack_rnn, stack_linear)

    # Create agent
    selector = MolEnvProbabilityActionSelector(actions=gen_data.all_characters)
    probs_reg = StateActionProbRegistry()
    agent = PolicyAgent(model=agent_net,
                        action_selector=selector,
                        states_preprocessor=seq2tensor,
                        initial_state=hidden_states_func,
                        apply_softmax=True,
                        probs_registry=probs_reg,
                        device='cpu')

    # Reward function model
    rnn = RewardNetRNN(d_model, hidden_size, num_layers,
                       bidirectional=True, unit_type='gru')
    reward_net = torch.nn.Sequential(encoder, rnn)
    reward_function = RewardFunction(reward_net=reward_net, mc_policy=agent,
                                     actions=gen_data.all_characters)

    # Create molecule generation environment
    env = MoleculeEnv(gen_data.all_characters, reward_function)

    # Ptan ops for aggregating experiences
    exp_source = ExperienceSourceFirstLast(env, agent, gamma=0.97)
    rl_alg = REINFORCE(agent_net, torch.optim.Adam(agent_net.parameters()),
                       hidden_states_func)
    gen_data.set_batch_size(1)
    irl_alg = GuidedRewardLearningIRL(reward_net,
                                      torch.optim.Adam(reward_net.parameters()),
                                      demo_gen_data=gen_data)

    # Begin simulation and training
    batch_states, batch_actions, batch_qvals = [], [], []
    traj_prob = 1.
    for step_idx, exp in enumerate(exp_source):
        batch_states.append(exp.state)
        batch_actions.append(exp.action)
        batch_qvals.append(exp.reward)
        traj_prob *= probs_reg.get(list(exp.state), exp.action)
        print(f'state = {exp.state}, action = {exp.action}, '
              f'reward = {exp.reward}, next_state = {exp.last_state}')
        if step_idx == 5:
            break
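# Illustrative sketch (an assumption, not ptan's internal logic): turning raw
# per-step rewards like the batch_qvals gathered above into discounted
# reward-to-go values Q_t = r_t + gamma * r_{t+1} + gamma^2 * r_{t+2} + ...,
# using the same gamma=0.97 passed to ExperienceSourceFirstLast.
def discounted_returns(rewards, gamma=0.97):
    returns = []
    running = 0.0
    for r in reversed(rewards):
        running = r + gamma * running
        returns.append(running)
    return list(reversed(returns))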