memory_size=5000, batch_size=320, # output_graph=True ) step = 0 # total train episode for episode in range(4): # for each episode train all the user onece episode_start_time = time.time() for currentUserId in trainUserIdRange + 1: #current_Env.numUser currentSeqIndex = 0 observation = current_Env.generateInputVector( currentUserId, currentSeqIndex ) while True: # RL choose action based on observation which its a index number flag, actionIndex = RL.choose_action(observation) action = current_Env.actionTransform(actionIndex) # RL take action and get next observation and reward ifTerminal, currentSeqIndex, observation_, reward = current_Env.update(currentUserId, currentSeqIndex, action) # Experience replay RL.store_transition(observation, actionIndex, reward, observation_) if (step > 200) and (step % 5 == 0): RL.learn() # swap observation observation = observation_ step += 1 # break while loop when end of this episode if ifTerminal: print("User: "******" Done") break