train_policy = EpsilonGreedyPolicy(learning_algo, env.nActions(), rng, 1.)
test_policy = EpsilonGreedyPolicy(learning_algo, env.nActions(), rng, 0.1)

# --- Instantiate agent ---
agent = NeuralAgent(
    env,
    learning_algo,
    parameters.replay_memory_size,
    max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))),
    parameters.batch_size,
    rng,
    train_policy=train_policy,
    test_policy=test_policy)

# --- Load saved network and test it ---
# agent.setNetwork("test_4165747fe50541da92a5ea2698b190b90bc006d5.epoch=97")
agent.setNetwork(input_nnet)

# Reward counters before the test run:
avg = agent._total_mode_reward
print(" _total_mode_reward: ", agent._total_mode_reward,
      ", number of episodes: ", agent._totalModeNbrEpisode,
      ", average per episode: ", avg)

Epoch_length = 500
mode = parameters.mode  # mode 3 has planning depth 6; mode 2 has planning depth 3
agent.startMode(mode, Epoch_length)
agent.run(1, Epoch_length)

avg = agent._total_mode_reward / agent._totalModeNbrEpisode
print(" _total_mode_reward: ", agent._total_mode_reward,
      ", number of episodes: ", agent._totalModeNbrEpisode,
      ", average per episode: ", avg)
# Just testing the saved network (possibly by visualizing the actions in the env).
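# The evaluation pattern above recurs in several of these scripts. A minimal helper sketch
# (not part of the original code) that wraps it, assuming only the DeeR NeuralAgent API
# already used here (startMode, run, _total_mode_reward, _totalModeNbrEpisode):
def evaluate_network(agent, mode, epoch_length, n_epochs=1):
    """Run test epochs in the given mode and return the average reward per episode."""
    agent.startMode(mode, epoch_length)
    agent.run(n_epochs, epoch_length)
    if agent._totalModeNbrEpisode == 0:
        return 0.0
    return agent._total_mode_reward / agent._totalModeNbrEpisode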
print("The parameters are: {}".format(parameters)) # As for the discount factor and the learning rate, one can update periodically the parameter of the epsilon-greedy # policy implemented by the agent. This controllers has a bit more capabilities, as it allows one to choose more # precisely when to update epsilon: after every X action, episode or epoch. This parameter can also be reset every # episode or epoch (or never, hence the resetEvery='none'). agent.attach( bc.EpsilonController(initial_e=parameters.epsilon_start, e_decays=parameters.epsilon_decay, e_min=parameters.epsilon_min, evaluate_on='action', periodicity=1, reset_every='none')) agent.setNetwork( "./backup_maze_lowdim/test_70460bbfb88bb08e2c4c9f4352805f62760b7d2d.epoch=48" ) agent._learning_algo.freezeAllLayersExceptEncoder() agent._learning_algo.resetEncoder() #TODO compare transfer training time with for instance relearning agent.run(10, 500) #10 epochs with 500 steps, so 5000 random steps print("end gathering data") # --- Bind controllers to the agent --- # Before every training epoch (periodicity=1), we want to print a summary of the agent's epsilon, discount and # learning rate as well as the training epoch number. agent.attach(bc.VerboseController(evaluate_on='epoch', periodicity=1)) # Every epoch end, one has the possibility to modify the learning rate using a LearningRateController. Here we # wish to update the learning rate after every training epoch (periodicity=1), according to the parameters given.
    show_game=True)

# --- Instantiate learning algorithm ---
learning_algo = CRAR(
    env,
    rng,
    double_Q=True,
    high_int_dim=HIGH_INT_DIM,
    internal_dim=3)

test_policy = EpsilonGreedyPolicy(learning_algo, env.nActions(), rng, 0.1)  # alternatively 1.

# --- Instantiate agent ---
agent = NeuralAgent(
    env,
    learning_algo,
    parameters.replay_memory_size,
    max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))),
    parameters.batch_size,
    rng,
    test_policy=test_policy)

# Set the name of the saved network and the planning depth:
agent.setNetwork("test_71c8fc5b085cd8aa090e8e8e63d0a9450a3b7a27.epoch=35")
# agent.setNetwork("test_964ccb7a9490cf3c3309a90d07485a77c3ec6486")

# Just running to check its behaviour:
Epoch_length = 200
mode = 3  # mode 3 has planning depth 6
agent.startMode(mode, Epoch_length)
agent.run(1, Epoch_length)
elif args.network == 'DDPG':
    network = MyACNetwork(environment=env,
                          batch_size=32,
                          random_state=rng)
    agent = NeuralAgent(env,
                        network,
                        train_policy=EpsilonGreedyPolicy(network, env.nActions(), rng, 0.0),
                        replay_memory_size=1000,
                        batch_size=32,
                        random_state=rng)
# agent.attach(bc.VerboseController())

if args.fname == 'baseline':
    agent = EmpiricalTreatmentAgent(env)
else:
    agent.setNetwork(args.fname)

# Run k episodes and collect statistics on the successful ones.
count = 0
length_success = []
avg_rad = []
avg_h_cell_killed = []
avg_percentage = []
avg_doses = []
k = 1000
for i in range(k):
    # print(i)
    agent._runEpisode(100000)
    if env.end_type == 'W':
        count += 1
        length_success.append(env.get_tick() - 350)
        avg_rad.append(env.total_dose)
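# The rest of the loop body is not shown in this excerpt. A sketch (not part of the original
# script) of how the gathered statistics could be summarized once the loop finishes, assuming
# the lists are filled as shown above:
success_rate = count / k
print("Success rate over {} episodes: {:.1%}".format(k, success_rate))
if length_success:
    print("Mean episode length of successes: {:.1f}".format(
        sum(length_success) / len(length_success)))
if avg_rad:
    print("Mean total dose on successes: {:.2f}".format(sum(avg_rad) / len(avg_rad)))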
        controllers_to_disable=[0, 1, 2, 3, 4],
        periodicity=2,
        show_score=True,
        summarize_every=1))

# --- Run the experiment ---
try:
    os.mkdir("params")
except Exception:
    pass

# Handle loading / saving of the network weights.
savedPath = fname + "_final"
if os.path.exists("nnets/" + savedPath):  # ugly, but mirrors the path layout used by dumpNetwork
    print("Loading saved net: " + savedPath)
    agent.setNetwork(savedPath)

if test:
    agent.startMode(PLE_env.VALIDATION_MODE, 10000)

dump(vars(parameters), "params/" + fname + ".jldump")
agent.run(parameters.epochs, parameters.steps_per_epoch)

if test:
    agent.summarizeTestPerformance()
else:
    # -- Save the network.
    agent.dumpNetwork(savedPath)

# --- Show results ---
basename = "scores/" + fname
scores = load(basename + "_scores.jldump")
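# A sketch of plotting the loaded scores, mirroring the standard DeeR examples (assumption: the
# dumped dict stores the per-epoch validation scores under the key 'vs'):
import matplotlib.pyplot as plt
plt.plot(range(1, len(scores['vs']) + 1), scores['vs'], label="VS", color='b')
plt.legend()
plt.xlabel("Number of epochs")
plt.ylabel("Score")
plt.savefig(basename + "_scores.pdf")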
    parameters.replay_memory_size,
    max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))),
    parameters.batch_size,
    rng,
    test_policy=test_policy)

# --- Create unique filename for FindBestController ---
h = hash(vars(parameters), hash_name="sha1")
fname = "test_" + h
print("The parameters hash is: {}".format(h))
print("The parameters are: {}".format(parameters))

# --- Load saved network and test it ---
agent.setNetwork("test_4165747fe50541da92a5ea2698b190b90bc006d5.epoch=97")

# Reward counters before the test run:
avg = agent._total_mode_reward
print(" _total_mode_reward: ", agent._total_mode_reward,
      ", number of episodes: ", agent._totalModeNbrEpisode,
      ", average per episode: ", avg)

Epoch_length = 200
mode = 3  # mode 3 has planning depth 6
agent.startMode(mode, Epoch_length)
agent.run(1, Epoch_length)

avg = agent._total_mode_reward / agent._totalModeNbrEpisode
print(" _total_mode_reward: ", agent._total_mode_reward,
      ", number of episodes: ", agent._totalModeNbrEpisode,
      ", average per episode: ", avg)
print("The parameters are: {}".format(parameters)) # As for the discount factor and the learning rate, one can update periodically the parameter of the epsilon-greedy # policy implemented by the agent. This controllers has a bit more capabilities, as it allows one to choose more # precisely when to update epsilon: after every X action, episode or epoch. This parameter can also be reset every # episode or epoch (or never, hence the resetEvery='none'). agent.attach( bc.EpsilonController(initial_e=parameters.epsilon_start, e_decays=parameters.epsilon_decay, e_min=parameters.epsilon_min, evaluate_on='action', periodicity=1, reset_every='none')) # input_nnet = "backup_maze_lowdim/test_70460bbfb88bb08e2c4c9f4352805f62760b7d2d.epoch=48" agent.setNetwork(input_nnet) agent._learning_algo.freezeAllLayersExceptEncoder() if parameters.mode == 1: agent._learning_algo.resetEncoder() if parameters.mode == 2: agent._learning_algo.freezeAllLayersExceptEncoderPartially() # if parameters.mode == 3: # #TODO dont freeze but very small lr for others models agent.run(10, 500) #10 epochs with 500 steps, so 5000 random steps print("end gathering data") # --- Bind controllers to the agent --- # Before every training epoch (periodicity=1), we want to print a summary of the agent's epsilon, discount and # learning rate as well as the training epoch number. agent.attach(bc.VerboseController(evaluate_on='epoch', periodicity=1))
        discount_factor_max=parameters.discount_max,
        periodicity=1))

agent.attach(
    bc.FindBestController(validationID=catcher_env.VALIDATION_MODE,
                          testID=None,
                          unique_fname=fname))

# All previous controllers control the agent during the epochs it goes through. However, we want
# to interleave a "validation epoch" between training epochs (controlled by the periodicity
# argument). We do not want these validation epochs to interfere with the training of the agent,
# which is well established by the TrainerController, EpsilonController and the like. Therefore,
# we disable these controllers for the whole duration of the validation epochs interleaved this
# way, using the controllers_to_disable argument of the InterleavedTestEpochController. For each
# validation epoch, we also want to display the sum of all rewards obtained, hence
# show_score=True. Finally, we want to call the summarizePerformance method of ALE_env every
# [parameters.period_btw_summary_perfs] *validation* epochs.
agent.attach(
    bc.InterleavedTestEpochController(
        id=catcher_env.VALIDATION_MODE,
        epoch_length=parameters.steps_per_test,
        periodicity=1,
        show_score=True,
        summarize_every=1))

# agent.setNetwork("test_71c8fc5b085cd8aa090e8e8e63d0a9450a3b7a27.epoch=35")
agent.setNetwork("test_964ccb7a9490cf3c3309a90d07485a77c3ec6486")

# Freeze the network except for the encoder.
agent._learning_algo.freezeAllLayersExceptEncoder()
# agent.gathering_data = False
agent.run(parameters.epochs, parameters.steps_per_epoch)
    env,
    learning_algo2,
    parameters.replay_memory_size,
    max(env.inputDimensions()[i][0] for i in range(len(env.inputDimensions()))),
    parameters.batch_size,
    rng,
    train_policy=train_policy2,
    test_policy=test_policy2)

print("The parameters are: {}".format(parameters))

# 1. Load the normal and the transferred network.
# agent.setNetwork("test_4165747fe50541da92a5ea2698b190b90bc006d5.epoch=97")
# agent2.setNetwork("nnet.epoch=40")
agent.setNetwork(input_normal)        # IMPORTANT: the first agent gets the normal visuals.
agent2.setNetwork(input_transferred)  # IMPORTANT: this one gets the inverted visuals.
# So if you want to compare two normal networks, remove the True in agent2.getAbstractState()!
# agent.setNetwork("test31")
# agent2.setNetwork("test36")

iterations = 40
totaldiff = 0
for i in range(iterations):
    # 2. Generate an environment and get the abstract states from both agents.
    agent.resetEnv()
    abstract_state = agent.getAbstractState()
    # print("abstract_state= ", abstract_state)
    abstract_state2 = agent2.getAbstractState(True)
    # abstract_state2 = agent2.getAbstractState()
    # print("abstract_state2= ", abstract_state2)
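    # The rest of the loop body is not shown in this excerpt. A hedged sketch of how the two
    # abstract states could be compared and accumulated into totaldiff (assuming
    # `import numpy as np` at the top of the script and that getAbstractState returns an
    # array-like):
    totaldiff += np.linalg.norm(np.asarray(abstract_state) - np.asarray(abstract_state2))
# After the loop, totaldiff / iterations would give the average abstract-state distance.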