def main():
    """
    This function will be called for training phase.

    Replays one recorded demonstration in an interactive environment and
    then, for as long as the episode is still running, sends actions whose
    64-element "vector" has a single component set to -0.5 (cycling through
    the components), printing the index being probed.
    """
    # How to sample minerl data is documented here:
    # http://minerl.io/docs/tutorials/data_sampling.html
    data = minerl.data.make(MINERL_GYM_ENV, data_dir=MINERL_DATA_ROOT)

    # Sample code for illustration, add your training code below
    env = gym.make(MINERL_GYM_ENV)
    try:
        env.make_interactive(port=6666, realtime=True)

        aicrowd_helper.training_start()

        episodes = 1024
        for episode in range(episodes):
            env.reset()
            done = False

            with alive_bar(title=f"episode: {episode}") as bar:
                bar.text("replaying trajectory")
                # Load the demonstration for every episode: load_data yields
                # the trajectory lazily, so an iterator created once outside
                # the loop would be exhausted after the first episode.
                trajectory = data.load_data(
                    "v3_excellent_pluot_behemoth-4_3461-4804")
                for _, action, _, _, _ in trajectory:
                    # Only the recorded action is replayed; termination is
                    # taken from the live environment, not from the recording.
                    _, _, done, _ = env.step(action)
                    bar()
                    if done:
                        # Stepping a finished episode is undefined; stop here.
                        break

                bar.text("testing inputs")
                i = 0
                while not done:
                    print(i % 64)
                    action = env.action_space.noop()
                    vec = np.zeros((64,))
                    vec[i % 64] = -0.5
                    action["vector"] = vec
                    _, _, done, _ = env.step(action)
                    bar()
                    i += 1

            aicrowd_helper.register_progress(episode / episodes)

        # Save trained model to train/ directory
        # Training 100% Completed
        aicrowd_helper.register_progress(1)
        aicrowd_helper.training_end()
    finally:
        # Release the environment even if an episode raised.
        env.close()
def main():
    """
    This function will be called for training phase.

    Pretrains ``model`` on batches from the MineRL dataset (or loads
    ``pretrain.h5`` from ``checkpoint_dir`` when it exists), copies the
    weights into ``model_target``, then runs an epsilon-greedy training loop
    in the environment. A checkpoint ``episode-<n>.h5`` is written after every
    episode; episodes that already have a checkpoint are skipped so an
    interrupted run can resume.

    Reads and updates the module-level ``epsilon`` and appends transitions to
    the module-level ``memory`` replay buffer.
    """
    global epsilon
    global memory

    # How to sample minerl data is documented here:
    # http://minerl.io/docs/tutorials/data_sampling.html
    data = minerl.data.make(MINERL_GYM_ENV, data_dir=MINERL_DATA_ROOT)

    # Sample code for illustration, add your training code below
    env = gym.make(MINERL_GYM_ENV)
    try:
        # pre train
        pretrain_path = checkpoint_dir / "pretrain.h5"
        if pretrain_path.exists():
            print("Loading pretrain weights")
            model.load_weights(pretrain_path)
        else:
            with alive_bar(title="pretrain", calibrate=120) as bar:
                batches = data.batch_iter(batch_size=2, num_epochs=5, seq_len=32)
                for current_state, action, _, _, _ in batches:
                    # Fit the network to reproduce the recorded action vector
                    # from the recorded observation.
                    loss = model.train_on_batch(
                        [current_state["pov"].reshape(-1, 64, 64, 3),
                         current_state["vector"].reshape(-1, 64)],
                        action["vector"].reshape(-1, 64))
                    bar.text(f"loss: {loss}")
                    bar()
            model.save_weights(pretrain_path)

        model_target.set_weights(model.get_weights())

        env.make_interactive(port=6666)

        aicrowd_helper.training_start()

        frame_count = 0
        episodes = 1024
        # Most recent training loss; stays None until the first training step
        # so the per-frame status line never reads an unbound name.
        last_loss = None

        for episode in range(episodes):
            episode_path = checkpoint_dir / f"episode-{episode}.h5"
            if episode_path.exists():
                # Resume: this episode was already trained. Only the newest
                # checkpoint (the one with no successor) needs to be loaded.
                if not (checkpoint_dir / f"episode-{episode + 1}.h5").exists():
                    model.load_weights(episode_path)
                if epsilon > epsilon_min:
                    epsilon -= (epsilon_start - epsilon_min) / explore_ts
                # NOTE(review): 6000 is presumably the per-episode step
                # budget -- confirm against the environment's step limit.
                frame_count += 6000
                continue

            obs = env.reset()
            done = False
            netr = 0

            with alive_bar(title=f"episode: {episode}") as bar:
                while not done:
                    explore = np.random.rand() < epsilon
                    if explore:
                        bar.text("perform action: explore")
                        action = env.action_space.sample()
                    else:
                        bar.text("perform action: predict")
                        action = env.action_space.noop()
                        action["vector"] = model.predict(
                            [obs["pov"].reshape(-1, 64, 64, 3),
                             obs["vector"].reshape(-1, 64)])[0]

                    new_obs, reward, done, info = env.step(action)
                    netr += reward

                    memory.append((obs, action, reward, new_obs, done))
                    # Make sure we restrict memory size to specified limit
                    if len(memory) > memory_size:
                        memory.pop(0)

                    if frame_count % train_interval == 0:
                        bar.text("training: build replay")
                        replay = random.sample(
                            memory, min(batch_size, len(memory)))
                        states_pov = np.array(
                            [t[0]["pov"] for t in replay]
                        ).reshape(-1, 64, 64, 3)
                        states_vector = np.array(
                            [t[0]["vector"] for t in replay]
                        ).reshape(-1, 64)

                        # NOTE(review): the bootstrap term below is computed
                        # from the sampled *current* states (t[0]) and the
                        # stored done flag (t[4]) is not used. A standard TD
                        # target would use the next states (t[3]) and mask
                        # terminal transitions. Left as-is; confirm intent.
                        bar.text("training: predict Q")
                        Q = model_target.predict([states_pov, states_vector])
                        Q_new = [t[2] for t in replay] + gamma * tf.reduce_max(
                            Q, axis=1
                        )

                        bar.text("training: backprop")
                        with tf.GradientTape() as tape:
                            # Train the model on the states and updated Q-values
                            q_values = model([states_pov, states_vector])
                            # NOTE(review): sums over every output instead of
                            # selecting the taken action's value -- confirm.
                            q_action = tf.reduce_sum(q_values, axis=1)
                            # Calculate loss between new Q-value and old Q-value
                            last_loss = loss_function(Q_new, q_action)

                        grads = tape.gradient(
                            last_loss, model.trainable_variables)
                        optimizer.apply_gradients(
                            zip(grads, model.trainable_variables))

                    if epsilon > epsilon_min:
                        epsilon -= (epsilon_start - epsilon_min) / explore_ts

                    print("explore:", explore, "net reward:", netr,
                          "loss:", last_loss, "epsilon:", epsilon)
                    bar()

                    obs = new_obs

                    if frame_count % target_update_interval == 0:
                        print("updated target model")
                        model_target.set_weights(model.get_weights())

                    frame_count += 1

            model.save_weights(episode_path)
            aicrowd_helper.register_progress(episode / episodes)

        # Save trained model to train/ directory
        # Training 100% Completed
        aicrowd_helper.register_progress(1)
        aicrowd_helper.training_end()
    finally:
        # Release the environment even if pretraining or an episode raised.
        env.close()
import aicrowd_helper import train_submission_code import test_framework import os EVALUATION_RUNNING_ON = os.getenv('EVALUATION_RUNNING_ON', None) EVALUATION_STAGE = os.getenv('EVALUATION_STAGE', 'all') EXITED_SIGNAL_PATH = os.getenv('EXITED_SIGNAL_PATH', 'shared/exited') # Training Phase if EVALUATION_STAGE in ['all', 'training']: aicrowd_helper.training_start() try: train_submission_code.main() aicrowd_helper.training_end() except Exception as e: aicrowd_helper.training_error() print(e) # Testing Phase if EVALUATION_STAGE in ['all', 'testing']: if EVALUATION_RUNNING_ON in ['local']: try: os.remove(EXITED_SIGNAL_PATH) except FileNotFoundError: pass aicrowd_helper.inference_start() try: test_framework.main() aicrowd_helper.inference_end() except Exception as e:
def main():
    """
    Configure Malmo and the agent factory from FLAGS, then run whichever
    stage(s) EVALUATION_STAGE selects: training, testing, or the instance
    manager.

    Exceptions raised by the training or testing entry points are reported
    through aicrowd_helper and printed; they are not re-raised.
    """
    base_port = FLAGS.malmo_base_port
    os.environ["CUDA_VISIBLE_DEVICES"] = FLAGS.gpus
    malmo.InstanceManager.configure_malmo_base_port(base_port)

    observation_space = CustomObservationSpace(
        pov_resolution=FLAGS.pov_resolution,
        pov_color_space=FLAGS.pov_color_space,
    )
    action_space = CustomActionSpace(
        num_camera_actions=FLAGS.num_camera_actions,
        camera_max_angle=FLAGS.camera_max_angle,
    )

    def combined_actor_critic_agent():
        # One network serving as both actor and critic.
        return ResnetLSTMAgent(
            observation_space=observation_space,
            action_space=action_space,
            max_step_mul=FLAGS.max_step_mul,
            core_hidden_size=FLAGS.lstm_hidden_size,
            use_prev_actions=FLAGS.use_prev_actions,
            action_embed_type=FLAGS.action_embed_type,
            action_embed_size=FLAGS.action_embed_size,
        )

    def separate_actor_critic_agent():
        # Two independently built networks wrapped as actor and critic.
        return SeparateActorCriticWrapperAgent(
            actor=combined_actor_critic_agent(),
            critic=combined_actor_critic_agent(),
        )

    agent_fn = (
        separate_actor_critic_agent
        if FLAGS.separate_actor_critic
        else combined_actor_critic_agent
    )

    log_dir = FLAGS.logdir

    # Training Phase
    if EVALUATION_STAGE in ('all', 'training'):
        # only write out flags when training
        pathlib.Path(log_dir).mkdir(parents=True, exist_ok=True)
        stamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        FLAGS.append_flags_into_file(f'{log_dir}/flags_{stamp}.cfg')

        aicrowd_helper.training_start()
        try:
            # Arguments are gathered inside the try so that a failing flag
            # lookup is reported as a training error as well.
            train_args = dict(
                log_dir=log_dir,
                load_dir=FLAGS.loaddir,
                observation_space=observation_space,
                action_space=action_space,
                max_step_mul=FLAGS.max_step_mul,
                fixed_step_mul=FLAGS.fixed_step_mul,
                step_mul=FLAGS.step_mul,
                agent_fn=agent_fn,
                seed=FLAGS.train_seed,
                malmo_base_port=base_port,
            )
            train.main(**train_args)
            aicrowd_helper.training_end()
        except Exception as err:
            aicrowd_helper.training_error()
            print(traceback.format_exc())
            print(err)

    # Testing Phase
    if EVALUATION_STAGE in ('all', 'testing'):
        running_locally = EVALUATION_RUNNING_ON == 'local'

        if running_locally:
            # Clear a stale exit marker left over from a previous run.
            try:
                os.remove(EXITED_SIGNAL_PATH)
            except FileNotFoundError:
                pass

        aicrowd_helper.inference_start()
        try:
            test_args = dict(
                log_dir=log_dir,
                test_model=FLAGS.test_model,
                observation_space=observation_space,
                action_space=action_space,
                fixed_step_mul=FLAGS.fixed_step_mul,
                step_mul=FLAGS.step_mul,
                agent_fn=agent_fn,
            )
            test.main(**test_args)
            aicrowd_helper.inference_end()
        except Exception as err:
            aicrowd_helper.inference_error()
            print(traceback.format_exc())
            print(err)

        if running_locally:
            # Create the exit marker once inference has finished or failed.
            from pathlib import Path
            Path(EXITED_SIGNAL_PATH).touch()

    # Launch instance manager
    if EVALUATION_STAGE in ('manager',):
        from minerl.env.malmo import launch_instance_manager
        launch_instance_manager()