def main():
    epsilon = 1
    env = MetaEnvMulti()  # TODO
    EPISODES = args['episodes']
    a = str(datetime.now()).split('.')[0]
    MetaAgent = DQNAgent(state_size=META_STATE_SIZE,
                         action_size=META_OPTION_SIZE,
                         hiddenLayers=[75],
                         dropout=args['dropout'],
                         activation='relu',
                         loadname=None,
                         saveIn=False,
                         learningRate=args['learning_rate'],
                         discountFactor=args['discount_factor'])

    filename = args['save_folder']
    if 'meta_weights' not in args:
        filename = "{}{}_Meta_HiddenLayers_{}_Dropout_{}_LearningRate_{}_Gamma_{}_Activation_{}_Episode_{}_single_nn_policy{}.h5".format(
            filename, a, str(MetaAgent.hiddenLayers), str(MetaAgent.dropout),
            str(MetaAgent.learning_rate), str(MetaAgent.gamma),
            MetaAgent.activation, str(EPISODES), args['note_file'])
    else:
        filename = filename + args['meta_weights']

    # See if the user has given a hidden layer configuration for the option agent
    nodes_hidden = [75]  # default value
    if 'controller_hidden' in args:
        controller_hidden_config = args['controller_hidden']
        # extract the list of layer sizes from the underscore-separated string
        nodes_hidden = [int(node) for node in controller_hidden_config.split('_')]

    # Load the controller agent, which is a single agent in this case.
    # No agent is created for the user-based actions.
    option_agent: DQNAgent = DQNAgent(state_size=CONTROLLER_STATE_SIZE,
                                      action_size=CONTROLLER_ACTION_SIZE,
                                      hiddenLayers=nodes_hidden,
                                      dropout=0.000,
                                      activation='relu',
                                      loadname=None,
                                      saveIn=False,
                                      learningRate=0.05,
                                      discountFactor=0.7,
                                      epsilon=0.0)
    option_agent.load(args['controller_weights'])  # Load the weights for all the controller policies

    visits = np.zeros([META_OPTION_SIZE])  # Store the number of visits to each intent type
    batch_size = 64
    track = []
    i = 0
    no_controller_breaks = 0

    # Configuration file holding all the details of the experiment along with the file names
    config_file = '{}{}.txt'.format(args['config_folder'], a)
    with open(config_file, 'w') as fil:
        fil.write(str(args))
        fil.write('\n')
        fil.write("meta_policy_file : {}".format(filename))

    for episode in range(EPISODES):  # Episode loop
        running_meta_reward = 0
        [confidence_state, intent_state] = env.reset()
        done = False

        # Running the meta policy
        while not done:  # Meta policy episode loop
            # print("Round Meta : {}".format(episode))  # Probably not required
            all_options = env.constrain_options()
            state = np.concatenate([confidence_state, intent_state])
            state = state.reshape([1, META_STATE_SIZE])  # Converted to the appropriate shape
            meta_start_state = state.copy()
            option = MetaAgent.act(state, all_options, epsilon=epsilon)
            next_confidence_state = env.meta_step_start(option)  # get the reward at the sub-policy level
            meta_reward = 0
            print("The state : {}\nThe option : {}".format(meta_start_state, option))

            if option == 5:  # the user agent option
                pass
            else:
                #############################################################
                # Controller execution for the chosen option
                option_completed = False
                # make a one-hot goal vector for the chosen option
                goal_vector = utils.one_hot(option, NO_INTENTS)
                i_ = 0
                controller_state = np.concatenate([next_confidence_state, goal_vector])
                controller_state = controller_state.reshape(1, CONTROLLER_STATE_SIZE)
                while not option_completed:
                    # Currently this is the whole possible action space
                    opt_actions = range(CONTROLLER_ACTION_SIZE)
                    # epsilon=0 gives a fully greedy controller policy
                    action = option_agent.act(controller_state, all_act=opt_actions, epsilon=0)
                    next_confidence_state, _, option_completed = env.controller_step(option, action)
                    next_controller_state = np.concatenate([next_confidence_state, goal_vector])
                    next_controller_state = np.reshape(next_controller_state, [1, CONTROLLER_STATE_SIZE])
                    # We don't need to store experience in the replay memory for the controller policy
                    controller_state = next_controller_state
                    i_ += 1
                    if i_ > args['break_controller_loop']:
                        no_controller_breaks += 1
                        break
                ###############################################

            confidence_state, next_confidence_state, intent_state, meta_reward, done = env.meta_step_end2(option)
            meta_end_state = np.concatenate([next_confidence_state, intent_state])
            meta_end_state = meta_end_state.reshape([1, META_STATE_SIZE])
            epsilon = MetaAgent.observe(
                (meta_start_state, option, meta_reward, meta_end_state, done),
                epsilon=epsilon)
            print("The next meta state : {}\nThe reward : {}\nEpsilon : {}".format(
                meta_end_state, meta_reward, epsilon))

            if MetaAgent.memory.tree.total() > batch_size:
                MetaAgent.replay()
            MetaAgent.rem_rew(meta_reward)
            i += 1
            running_meta_reward = running_meta_reward + meta_reward

            if i % 100 == 0:
                # Log the running averages every 100 time steps
                avr_rew = MetaAgent.avg_rew()
                track.append([str(i) + " " + str(avr_rew) + " " + str(episode) + " " + str(epsilon)])
                with open("results_" + a + "_.txt", 'w') as fi:
                    for j in range(0, len(track)):
                        line = track[j]
                        fi.write(str(line).strip("[]''") + "\n")
                # print(track)

            if done:
                print("episode: {}/{}, score: {}, e's: {}\nNumber of Controller breaks : {}".format(
                    episode, EPISODES, running_meta_reward, epsilon, no_controller_breaks))
                print("The state is : ", meta_end_state)
                break
            confidence_state = next_confidence_state

        if episode % 200 == 0:
            print("Episodes : {}".format(episode))

    # Saving the progress
    print("Saving")
    MetaAgent.save(filename)  # convert this to save a model for each policy
    # agent.saveController(fileController)
    sleep(0.2)
    print("Done Saving You can Now Quit")
    sleep(1)
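

# ---------------------------------------------------------------------------
# The meta loop above feeds the controller a state of the form
# [confidence_state, goal_vector], where the goal vector one-hot encodes the
# selected option. `utils.one_hot` is defined elsewhere in this repo; the
# sketch below only illustrates the assumed behaviour (a float vector of
# length `size` with a single 1.0 at `index`) and is not the project's actual
# implementation. It reuses the module-level numpy import (`np`) used above.
def one_hot_sketch(index, size):
    """Minimal sketch of the assumed utils.one_hot behaviour."""
    vec = np.zeros(size, dtype=np.float32)  # all zeros ...
    vec[index] = 1.0                         # ... except the chosen goal/intent
    return vec
# Example: one_hot_sketch(2, 5) -> array([0., 0., 1., 0., 0.], dtype=float32),
# so np.concatenate([next_confidence_state, goal_vector]) yields the
# CONTROLLER_STATE_SIZE-dimensional controller observation.
# ---------------------------------------------------------------------------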
def main():
    epsilon = args['epsilon']
    env = FlatEnv()
    EPISODES = args['episodes']
    a = str(datetime.now()).split('.')[0]
    hidden_layers = [int(i) for i in args['hidden_layers'].split('_')]
    Agent = DQNAgent(state_size=STATES_SIZE,
                     action_size=ACTION_SIZE,
                     hiddenLayers=hidden_layers,
                     dropout=args['dropout'],
                     activation='relu',
                     loadname=None,
                     saveIn=False,
                     learningRate=args['learning_rate'],
                     discountFactor=args['discount_factor'])

    filename = args['save_folder']
    filename = "{}{}_Flat_HiddenLayers_{}_Dropout_{}_LearningRate_{}_Gamma_{}_Activation_{}_Episode_{}_Flat_rl_policy_{}.h5".format(
        filename, a, str(Agent.hiddenLayers), str(Agent.dropout),
        str(Agent.learning_rate), str(Agent.gamma), Agent.activation,
        str(EPISODES), args['note_file'])

    batch_size = args['batch_size']
    track = []
    i = 0

    # Configuration file holding all the details of the experiment along with the file names
    config_file = '{}{}.txt'.format(args['config_folder'], a)
    with open(config_file, 'w') as fil:
        fil.write(str(args))
        fil.write('\n')
        fil.write("Flat Policy File : {}".format(filename))

    for episode in range(EPISODES):  # Episode loop
        running_reward = 0
        [confidence_state, intent_state] = env.reset()
        done = False

        while not done:
            state = np.concatenate([confidence_state, intent_state])
            state = state.reshape([1, STATES_SIZE])  # Converted to the appropriate shape
            bcolors.printblue("The State : {}".format(state))
            intent_set_completed = False
            # Here an "option" means the consolidated intent space of the iteration
            i_ = 0
            while not intent_set_completed:
                all_actions = env.constrain_actions()
                action = Agent.act(state, all_act=all_actions, epsilon=epsilon)  # epsilon-greedy action selection
                confidence_state, intent_state, reward, intent_set_completed, done = env.step(action)  # a normal environment step
                next_state = np.concatenate([confidence_state, intent_state])
                next_state = np.reshape(next_state, [1, STATES_SIZE])
                epsilon = Agent.observe((state, action, reward, next_state, intent_set_completed),
                                        epsilon=epsilon)
                if Agent.memory.tree.total() > batch_size:
                    Agent.replay()
                Agent.rem_rew(reward)
                running_reward += reward
                i += 1

                if i % 100 == 0:
                    avr_rew = Agent.avg_rew()
                    track.append([str(i) + " " + str(avr_rew) + " " + str(episode) + " " + str(epsilon)])
                    with open("results_" + a + "_.txt", 'w') as fi:
                        for j in range(0, len(track)):
                            line = track[j]
                            fi.write(str(line).strip("[]''") + "\n")
                    # print(track)

                if intent_set_completed:
                    print("Moving to the next set of intents : {}".format(intent_state))
                    break
                state = next_state
            ##############################################
            if done:
                bcolors.printgreen("episode: {}/{}, Reward: {}, e's: {}".format(
                    episode, EPISODES, running_reward, epsilon))
                print("The state is : ", state)
                break

        if episode % 200 == 0:
            print("Episodes : {}".format(episode))

    # Saving the progress
    print("Saving")
    Agent.save(filename)  # convert this to save a model for each policy
    print("Done Saving You can Now Quit")
    sleep(0.5)
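

# ---------------------------------------------------------------------------
# Both training scripts read hyperparameters from a module-level `args` dict
# and print with the `bcolors` helper, neither of which appears in this
# section. The sketches below only illustrate one plausible way to provide
# equivalent pieces with the keys the flat policy consumes ('epsilon',
# 'episodes', 'hidden_layers', ...); the flag names, types and defaults are
# assumptions, not the repo's actual CLI or helper module.
import argparse


def parse_args_sketch():
    """Sketch of an argparse setup producing the `args` dict used by main()."""
    parser = argparse.ArgumentParser(description="Flat DQN dialogue policy (sketch)")
    parser.add_argument('--epsilon', type=float, default=1.0)        # initial exploration rate
    parser.add_argument('--episodes', type=int, default=10000)       # number of training episodes
    parser.add_argument('--hidden_layers', type=str, default='75')   # e.g. '100_50' -> [100, 50]
    parser.add_argument('--dropout', type=float, default=0.0)
    parser.add_argument('--learning_rate', type=float, default=0.05)
    parser.add_argument('--discount_factor', type=float, default=0.7)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--save_folder', type=str, default='weights/')
    parser.add_argument('--config_folder', type=str, default='configs/')
    parser.add_argument('--note_file', type=str, default='')         # free-form tag appended to filenames
    return vars(parser.parse_args())                                  # dict-style access, as in main()


class BcolorsSketch:
    """Minimal sketch of the assumed bcolors helpers (ANSI-coloured prints)."""

    @staticmethod
    def printblue(text):
        print('\033[94m' + str(text) + '\033[0m')

    @staticmethod
    def printgreen(text):
        print('\033[92m' + str(text) + '\033[0m')
# ---------------------------------------------------------------------------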