def make_reward_table(options):
    """Make a .csv table with rewards for all contracts."""
    print 'Generating reward table'
    locations_dict = {loc.name: loc for loc in LOCATIONS}
    rows = [['Class', 'Departure', 'Destination', 'Distance',
             'Min reward', 'Max reward']]
    for route, contract in ROUTES.iteritems():
        contract.set_locations(locations_dict[route[0]],
                               locations_dict[route[1]])
        advance_funds, reward_funds, _, _ = contract.get_rewards()
        if options.verbose > 0:
            print 'Calculating reward for {}'.format(contract)
        reward_str = '{} + ({} + {}) * Random(1.0, 1.15)'.format(
            advance_funds,
            reward_funds,
            contract.refund_amount,
        )
        min_reward = utils.calculate_reward(contract, reward_str, calc_min=True)
        max_reward = utils.calculate_reward(contract, reward_str, calc_min=False)
        rows.append([
            contract.__class__.__name__,
            contract.from_loc.name,
            contract.to_loc.name,
            str(round(utils.loc_distance(contract.from_loc, contract.to_loc), 2)),
            str(min_reward),
            str(max_reward),
        ])
    if options.verbose > 1:
        print 'Writing file Rewards.csv'
    with open('Rewards.csv', 'w') as out:
        out.write('\n'.join([','.join(row) for row in rows]) + '\n')
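# utils.calculate_reward is not shown above. A minimal, hypothetical sketch of how the
# reward formula string could be evaluated at its lower or upper bound, assuming
# Random(a, b) only ever appears as a multiplier range; the real helper presumably also
# applies contract-specific modifiers, which this sketch ignores.
import re

def calculate_reward(contract, reward_str, calc_min=True):
    """Evaluate a formula like '100 + (200 + 50) * Random(1.0, 1.15)' at a bound."""
    def pick_bound(match):
        low, high = match.group(1), match.group(2)
        return low if calc_min else high

    # Replace every Random(low, high) with the chosen bound; the remaining
    # expression contains only numbers and arithmetic operators.
    expression = re.sub(r'Random\(([\d.]+),\s*([\d.]+)\)', pick_bound, reward_str)
    return int(eval(expression))  # `contract` is accepted only to mirror the call site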
def run_test_rewards(self):
    if not self.prop_id:
        # run the proposal scenario first
        self.run_test_proposal()
    debate_secs = 15
    self.create_js_file(
        'rewards',
        {
            "dao_abi": self.dao_abi,
            "dao_address": self.dao_addr,
            "total_rewards": self.args.total_rewards,
            "proposal_deposit": self.args.proposal_deposit,
            "transaction_bytecode": '0x0',  # fallback function
            "debating_period": debate_secs,
            "prop_id": self.next_proposal_id()
        }
    )
    print(
        "Notice: Debate period is {} seconds, so the test will wait "
        "that long".format(debate_secs)
    )
    output = self.run_script('rewards.js')
    results = eval_test('rewards', output, {
        "provider_reward_portion": calculate_reward(
            self.token_amounts[0],
            self.total_supply,
            self.args.total_rewards)
    })
    self.dao_balance_after_rewards = results['DAO_balance']
    self.dao_rewardToken_after_rewards = results['DAO_rewardToken']
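# calculate_reward is imported from the test utilities and not defined in this snippet.
# Given how it is called, a plausible sketch is a pro-rata split of the total rewards by
# token holdings (an assumption, not necessarily the test suite's exact implementation):
def calculate_reward(token_amount, total_supply, total_rewards):
    """Return the share of total_rewards owed to a holder of token_amount tokens."""
    return (token_amount * total_rewards) / total_supply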
def process_graph(self, graph_path, batch_loss):
    """
    Read a graph and do a forward pass on it with a time budget.
    :param graph_path: Location of the graph to process.
    :param batch_loss: Loss on the graphs processed so far in the batch.
    :return batch_loss: Incremented loss on the current batch being processed.
    """
    data = json.load(open(graph_path))
    graph, features = create_features(data, self.model.identifiers)
    node = random.choice(list(graph.nodes()))
    attention_loss = 0
    for t in range(self.args.time):
        predictions, node, attention_score = self.model(data, graph, features, node)
        target, prediction_loss = calculate_predictive_loss(data, predictions)
        batch_loss = batch_loss + prediction_loss
        if t < self.args.time - 2:
            attention_loss += (self.args.gamma ** (self.args.time - t)) * torch.log(attention_score)
    reward = calculate_reward(target, predictions)
    batch_loss = batch_loss - reward * attention_loss
    self.model.reset_attention()
    return batch_loss
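# calculate_reward above is external to this snippet. A minimal sketch consistent with
# REINFORCE-style attention training (an assumption, not necessarily the original code):
# reward +1 when the class prediction is correct and -1 otherwise, so the accumulated
# log-attention term is either rewarded or penalised.
import torch

def calculate_reward(target, predictions):
    """Return +1 for a correct argmax prediction, -1 otherwise."""
    correct = (predictions.argmax(dim=-1) == target).float()
    return 2 * correct - 1  # maps {0, 1} to {-1, +1}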
def ppo_train(model_name, load_model=False, actor_filename=None,
              critic_filename=None, optimizer_filename=None):
    print("PPO -- Training")

    env = make('hungry_geese')
    trainer = env.train(['greedy', None, 'agents/boilergoose.py', 'agents/handy_rl.py'])

    agent = PPOAgent(rows=11, columns=11, num_actions=3)
    memory = Memory()
    if load_model:
        agent.load_model_weights(actor_filename, critic_filename)
        agent.load_optimizer_weights(optimizer_filename)

    episode = 0
    start_episode = 0
    end_episode = 50000
    reward_threshold = None
    threshold_reached = False
    epochs = 4
    batch_size = 128
    current_frame = 0

    training_rewards = []
    evaluation_rewards = []
    last_1000_ep_reward = []

    for episode in range(start_episode + 1, end_episode + 1):
        obs_dict = trainer.reset()
        ep_reward, ep_steps, done = 0, 0, False
        prev_direction = 0

        while not done:
            current_frame += 1
            ep_steps += 1

            state = preprocess_state(obs_dict, prev_direction)
            action = agent.select_action(state, training=True)
            direction = get_direction(prev_direction, action)
            next_obs_dict, _, done, _ = trainer.step(env.specification.action.enum[direction])
            reward = calculate_reward(obs_dict, next_obs_dict)
            next_state = preprocess_state(next_obs_dict, direction)
            memory.add(state, action, reward, next_state, float(done))

            obs_dict = next_obs_dict
            prev_direction = direction
            ep_reward += reward

            if current_frame % batch_size == 0:
                for _ in range(epochs):
                    states, actions, rewards, next_states, dones = memory.get_all_samples()
                    agent.fit(states, actions, rewards, next_states, dones)
                memory.clear()
                agent.update_networks()

        print("EPISODE " + str(episode) + " - REWARD: " + str(ep_reward) + " - STEPS: " + str(ep_steps))

        if len(last_1000_ep_reward) == 1000:
            last_1000_ep_reward = last_1000_ep_reward[1:]
        last_1000_ep_reward.append(ep_reward)

        if reward_threshold:
            if len(last_1000_ep_reward) == 1000:
                if np.mean(last_1000_ep_reward) >= reward_threshold:
                    print("You solved the task after " + str(episode) + " episodes")
                    agent.save_model_weights('models/ppo_actor_' + model_name + '_' + str(episode) + '.h5',
                                             'models/ppo_critic_' + model_name + '_' + str(episode) + '.h5')
                    threshold_reached = True
                    break

        if episode % 1000 == 0:
            print('Episode ' + str(episode) + '/' + str(end_episode))
            last_1000_ep_reward_mean = np.mean(last_1000_ep_reward).round(3)
            training_rewards.append(last_1000_ep_reward_mean)
            print('Average reward in last 1000 episodes: ' + str(last_1000_ep_reward_mean))
            print()

        if episode % 1000 == 0:
            eval_reward = 0
            for i in range(100):
                obs_dict = trainer.reset()
                done = False
                prev_direction = 0
                while not done:
                    state = preprocess_state(obs_dict, prev_direction)
                    action = agent.select_action(state)
                    direction = get_direction(prev_direction, action)
                    next_obs_dict, _, done, _ = trainer.step(env.specification.action.enum[direction])
                    reward = calculate_reward(obs_dict, next_obs_dict)
                    obs_dict = next_obs_dict
                    prev_direction = direction
                    eval_reward += reward
            eval_reward /= 100
            evaluation_rewards.append(eval_reward)
            print("Evaluation reward: " + str(eval_reward))
            print()

        if episode % 5000 == 0:
            agent.save_model_weights('models/ppo_actor_' + model_name + '_' + str(episode) + '.h5',
                                     'models/ppo_critic_' + model_name + '_' + str(episode) + '.h5')
            agent.save_optimizer_weights('models/ppo_' + model_name + '_' + str(episode) + '_optimizer.npy')

    agent.save_model_weights('models/ppo_actor_' + model_name + '_' + str(end_episode) + '.h5',
                             'models/ppo_critic_' + model_name + '_' + str(end_episode) + '.h5')
    agent.save_optimizer_weights('models/ppo_' + model_name + '_' + str(end_episode) + '_optimizer.npy')

    if threshold_reached:
        plt.plot([i for i in range(start_episode + 1000, episode, 1000)], training_rewards)
    else:
        plt.plot([i for i in range(start_episode + 1000, end_episode + 1, 1000)], training_rewards)
    plt.title("Reward")
    plt.show()

    plt.plot([i for i in range(start_episode + 1000, end_episode + 1, 1000)], evaluation_rewards)
    plt.title('Evaluation rewards')
    plt.show()
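# calculate_reward(obs_dict, next_obs_dict) is used by both the PPO trainer above and the
# DDQN trainer below but is not part of this snippet. A hedged sketch of one common
# shaping scheme for hungry_geese (an assumed helper, not the author's exact rule): a
# small per-step penalty, a bonus for growing, and a large penalty for dying.
def calculate_reward(obs_dict, next_obs_dict):
    """Step reward for the controlled goose: survive and prefer growing."""
    index = obs_dict['index']
    old_length = len(obs_dict['geese'][index])
    new_length = len(next_obs_dict['geese'][index])
    if new_length == 0:
        return -10.0             # the goose died on this step
    reward = -0.1                # mild step penalty so the agent seeks food
    if new_length > old_length:
        reward += 1.0            # ate food and grew
    return reward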
def ddqn_train(model_name, load_model=False, model_filename=None, optimizer_filename=None):
    print("DDQN -- Training")

    env = make('hungry_geese')
    trainer = env.train(['greedy', None, 'agents/boilergoose.py', 'agents/handy_rl.py'])

    agent = DDQNAgent(rows=11, columns=11, num_actions=3)
    buffer = ReplayBuffer()
    strategy = EpsilonGreedyStrategy(start=0.5, end=0.0, decay=0.00001)
    if load_model:
        agent.load_model_weights(model_filename)
        agent.load_optimizer_weights(optimizer_filename)

    start_episode = 0
    end_episode = 50000
    epochs = 32
    batch_size = 128

    training_rewards = []
    evaluation_rewards = []
    last_1000_ep_reward = []

    for episode in range(start_episode + 1, end_episode + 1):
        obs_dict = trainer.reset()
        epsilon = strategy.get_epsilon(episode - start_episode)
        ep_reward, ep_steps, done = 0, 0, False
        prev_direction = 0

        while not done:
            ep_steps += 1

            state = preprocess_state(obs_dict, prev_direction)
            action = agent.select_epsilon_greedy_action(state, epsilon)
            direction = get_direction(prev_direction, action)
            next_obs_dict, _, done, _ = trainer.step(env.specification.action.enum[direction])
            reward = calculate_reward(obs_dict, next_obs_dict)
            next_state = preprocess_state(next_obs_dict, direction)
            buffer.add(state, action, reward, next_state, done)

            obs_dict = next_obs_dict
            prev_direction = direction
            ep_reward += reward

        if len(buffer) >= batch_size:
            for _ in range(epochs):
                states, actions, rewards, next_states, dones = buffer.get_samples(batch_size)
                agent.fit(states, actions, rewards, next_states, dones)

        print("EPISODE " + str(episode) + " - REWARD: " + str(ep_reward) + " - STEPS: " + str(ep_steps))

        if len(last_1000_ep_reward) == 1000:
            last_1000_ep_reward = last_1000_ep_reward[1:]
        last_1000_ep_reward.append(ep_reward)

        if episode % 10 == 0:
            agent.update_target_network()

        if episode % 1000 == 0:
            print('Episode ' + str(episode) + '/' + str(end_episode))
            print('Epsilon: ' + str(round(epsilon, 3)))
            last_1000_ep_reward_mean = np.mean(last_1000_ep_reward).round(3)
            training_rewards.append(last_1000_ep_reward_mean)
            print('Average reward in last 1000 episodes: ' + str(last_1000_ep_reward_mean))
            print()

        if episode % 1000 == 0:
            eval_reward = 0
            for i in range(100):
                obs_dict = trainer.reset()
                epsilon = 0
                done = False
                prev_direction = 0
                while not done:
                    state = preprocess_state(obs_dict, prev_direction)
                    action = agent.select_epsilon_greedy_action(state, epsilon)
                    direction = get_direction(prev_direction, action)
                    next_obs_dict, _, done, _ = trainer.step(env.specification.action.enum[direction])
                    reward = calculate_reward(obs_dict, next_obs_dict)
                    obs_dict = next_obs_dict
                    prev_direction = direction
                    eval_reward += reward
            eval_reward /= 100
            evaluation_rewards.append(eval_reward)
            print("Evaluation reward: " + str(eval_reward))
            print()

        if episode % 5000 == 0:
            agent.save_model_weights('models/ddqn_' + model_name + '_' + str(episode) + '.h5')
            agent.save_optimizer_weights('models/ddqn_' + model_name + '_' + str(episode) + '_optimizer.npy')

    agent.save_model_weights('models/ddqn_' + model_name + '_' + str(end_episode) + '.h5')
    agent.save_optimizer_weights('models/ddqn_' + model_name + '_' + str(end_episode) + '_optimizer.npy')

    plt.plot([i for i in range(start_episode + 1000, end_episode + 1, 1000)], training_rewards)
    plt.title('Reward')
    plt.show()

    plt.plot([i for i in range(start_episode + 1000, end_episode + 1, 1000)], evaluation_rewards)
    plt.title('Evaluation rewards')
    plt.show()
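# EpsilonGreedyStrategy is instantiated in ddqn_train but not defined in this snippet.
# A minimal sketch of an exponentially decaying schedule matching the (start, end, decay)
# constructor used above; this is an assumed implementation, not the original class.
import math

class EpsilonGreedyStrategy:
    """Exploration rate that decays from `start` towards `end` over training."""

    def __init__(self, start, end, decay):
        self.start = start
        self.end = end
        self.decay = decay

    def get_epsilon(self, current_step):
        # Exponential interpolation between start and end.
        return self.end + (self.start - self.end) * math.exp(-self.decay * current_step)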