import csv

from keras import backend as K

# NOTE: Controller, NetworkManager, state_space, policy_sess and the
# hyperparameter constants (NUM_LAYERS, MAX_TRIALS, MAX_EPOCHS, CHILD_BATCHSIZE,
# CLIP_REWARDS, ACCURACY_BETA, REGULARIZATION, EXPLORATION, CONTROLLER_CELLS,
# EMBEDDING_DIM) are expected to be defined at module level by the surrounding script.


def train(dataset1, dataset2, initial_state, if_restore):
    total_reward = 0.0

    with policy_sess.as_default():
        # create the Controller and build the internal policy network
        controller = Controller(policy_sess, NUM_LAYERS, state_space,
                                reg_param=REGULARIZATION,
                                exploration=EXPLORATION,
                                controller_cells=CONTROLLER_CELLS,
                                embedding_dim=EMBEDDING_DIM,
                                restore_controller=if_restore)
        # clear the files left over from a previous run
        controller.remove_files()

    # create one Network Manager per dataset: manager1 serves the
    # local-search phase, manager2 the controller-driven phase
    manager1 = NetworkManager(dataset1, epochs=MAX_EPOCHS,
                              child_batchsize=CHILD_BATCHSIZE,
                              clip_rewards=CLIP_REWARDS,
                              acc_beta=ACCURACY_BETA)
    manager2 = NetworkManager(dataset2, epochs=MAX_EPOCHS,
                              child_batchsize=CHILD_BATCHSIZE,
                              clip_rewards=CLIP_REWARDS,
                              acc_beta=ACCURACY_BETA)

    result_reward = []
    result_total_reward = []
    result_acc = []
    result_moving_acc = []
    result_explore_acc = []
    result_exploit_acc = []

    flag = None
    manager = None
    state = initial_state  # the previous state fed to the controller; updated each trial

    for trial in range(MAX_TRIALS):
        print("\nTrial %d:" % (trial + 1))

        if 2 * trial < MAX_TRIALS:
            # first half of the trials: local search around the initial state,
            # alternating between the two local neighbourhood generators
            manager = manager1
            if trial % 2 == 0:
                actions = state_space.get_local_state_space_add(trial // 2, initial_state)
            else:
                actions = state_space.get_local_state_space(trial // 2, initial_state)
        else:
            # second half: let the controller propose an action for the previous state
            manager = manager2
            with policy_sess.as_default():
                K.set_session(policy_sess)
                flag, actions = controller.get_action(state)

        # print the action probabilities
        # state_space.print_actions(actions)
        print("Actions : ", state_space.parse_state_space_list(actions))

        # build a model, train it, and get the reward and accuracy from the network manager
        reward, previous_acc, moving_acc = manager.get_rewards(
            state_space.parse_state_space_list(actions))
        print("Rewards : ", reward)
        print("Accuracy : ", previous_acc)
        print("Moving acc : ", moving_acc)

        with policy_sess.as_default():
            K.set_session(policy_sess)

            total_reward += reward
            print("Total reward : ", total_reward)

            # actions and states are equivalent; save the state and reward
            state = actions
            controller.store_rollout(state, reward)

            # train the controller on the saved states and the discounted rewards
            loss = controller.train_step()
            print("Controller loss : %0.6f" % (loss))

            # append the results of this trial to a file
            with open('train_history.csv', mode='a+') as f:
                data = [previous_acc, reward]
                data.extend(state_space.parse_state_space_list(state))
                writer = csv.writer(f)
                writer.writerow(data)
        print()

        result_reward.append(reward)
        result_total_reward.append(total_reward)
        result_acc.append(previous_acc)
        result_moving_acc.append(moving_acc)

        # in the controller phase, split accuracies by whether the action
        # came from exploration (flag falsy) or exploitation
        if 2 * trial >= MAX_TRIALS:
            if not flag:
                result_explore_acc.append(previous_acc)
            else:
                result_exploit_acc.append(previous_acc)

    print("Rewards : ", result_reward)
    print("Total Rewards :", result_total_reward)
    print("Accuracy : ", result_acc)
    print("Moving acc : ", result_moving_acc)
    print("Explore acc :", result_explore_acc)
    print("Exploit acc : ", result_exploit_acc)
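
# --- Usage sketch (illustrative, not part of the original script) ---
# A minimal example of how train() might be driven. `make_dataset` is a
# hypothetical placeholder for the caller's own data loading, and
# state_space.get_random_state_space is assumed to behave as in the
# accompanying StateSpace implementation; if_restore=False starts the
# controller from scratch rather than restoring a checkpoint.
if __name__ == '__main__':
    def make_dataset():
        # placeholder: should return the dataset tuple expected by NetworkManager,
        # e.g. (x_train, y_train, x_val, y_val) for the child networks
        raise NotImplementedError('supply your own data loading here')

    dataset1 = make_dataset()  # data for the local-search phase
    dataset2 = make_dataset()  # data for the controller-driven phase

    # a starting architecture encoded in the state space
    initial_state = state_space.get_random_state_space(NUM_LAYERS)

    train(dataset1, dataset2, initial_state, if_restore=False)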
print("Predicted actions : ", state_space.parse_state_space_list(actions)) # build a model, train and get reward and accuracy from the network manager reward, previous_acc = manager.get_rewards( model_fn, state_space.parse_state_space_list(actions)) print("Rewards : ", reward, "Accuracy : ", previous_acc) with policy_sess.as_default(): K.set_session(policy_sess) total_reward += reward print("Total reward : ", total_reward) # actions and states are equivalent, save the state and reward state = actions controller.store_rollout(state, reward) # train the controller on the saved state and the discounted rewards loss = controller.train_step() print("Trial %d: Controller loss : %0.6f" % (trial + 1, loss)) # write the results of this trial into a file with open('train_history.csv', mode='a+') as f: data = [previous_acc, reward] data.extend(state_space.parse_state_space_list(state)) writer = csv.writer(f) writer.writerow(data) print() print("Total Reward : ", total_reward)