def objective(self, params):
    """
    Computes the mean squared error between the network's predictions
    and the true Q-values. The network is trained based on the input
    parameters.

    Arguments
    ---------
    :param params : dict
        Dictionary containing values for the hyperparameters to be optimized.

    Returns
    -------
    :returns : dict
        Dictionary containing the mean squared error between the true and
        estimated (using the parameter configuration from params) Q-values
        and the status of the optimization.
    """
    a = params["lr"]
    b = params["lr decay"]
    c = params["batch size"]
    d = params["target update"]

    # initialize the RL agent:
    agent = AgentDQN(dim_state=self.dim_state,
                     dim_actions=self.dim_actions,
                     hidden_dims=self.hidden_dims,
                     optimizer=Adam(lr=a, decay=b),
                     gamma=self.gamma,
                     eps=self.eps,
                     eps_decay=self.eps_decay,
                     frozen=self.frozen,
                     pretrained=self.pretrained)

    # initialize the environment:
    env = Env(start=self.start,
              tcost=self.tcost,
              horizon=self.horizon,
              w=self.w,
              theta=self.theta,
              regimes=self.regimes)

    # train the agent and evaluate its Q-value predictions:
    trained_agent, _, _, _, _, _, _ = train_dqn(agent, env,
                                                self.train_episodes, c,
                                                self.init_d_size,
                                                self.max_d_size, d,
                                                self.freeze_after)
    pred = trained_agent.qnn.predict(self.x)
    true = self.y
    mse = np.mean((pred - true) ** 2)

    return {"loss": mse, "status": STATUS_OK}
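# The dict returned above follows hyperopt's convention ({"loss": ..., "status":
# STATUS_OK}), so objective() can be handed directly to hyperopt.fmin. A minimal
# sketch, assuming `tuner` is an instance of the class that defines objective();
# the search-space bounds and max_evals are illustrative assumptions, not values
# from the original code.
import numpy as np
from hyperopt import fmin, tpe, hp, Trials

space = {
    "lr": hp.loguniform("lr", np.log(1e-5), np.log(1e-2)),
    "lr decay": hp.loguniform("lr decay", np.log(1e-7), np.log(1e-3)),
    "batch size": hp.choice("batch size", [64, 128, 256, 512]),
    "target update": hp.choice("target update", [50, 100, 200, 400]),
}

trials = Trials()
best = fmin(fn=tuner.objective,  # `tuner`: hypothetical instance holding the settings used above
            space=space,
            algo=tpe.suggest,
            max_evals=50,
            trials=trials)
print(best)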
                    type=int,
                    help='evaluate frequency in run_episode',
                    default=500)
parser.add_argument('--update_target_steps',
                    type=int,
                    help='update frequency for target Q model',
                    default=16)
parser.add_argument('--ckpt_path',
                    type=str,
                    help='weight file name for finetuning (optional)',
                    default='ckpt/episode_5000.ckpt')
parser.add_argument('--save_checkpoint_freq',
                    type=int,
                    help='episode interval to save checkpoint',
                    default=2000)

if __name__ == '__main__':
    args = parser.parse_args()

    if args.cuda and not torch.cuda.is_available():
        print('CUDA is not available, maybe you should not set --cuda')
        sys.exit(1)
    if args.play and args.ckpt_path == '':
        print('When testing, a pretrained weight model file should be given')
        sys.exit(1)
    if args.cuda:
        print('With GPU support!')

    if args.play:
        play_game(args)
    else:
        train_dqn(args)
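# For quick local testing, the same parser can also be driven programmatically:
# argparse accepts an explicit argument list instead of reading sys.argv.
# A minimal sketch; flags not shown above (e.g. --cuda, --play) are assumed to
# be defined with defaults earlier in the full script.
args = parser.parse_args(['--update_target_steps', '32',
                          '--save_checkpoint_freq', '1000'])
train_dqn(args)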
def main(parameters):
    print(parameters)
    is_double = False
    is_dueling = False
    suffix = 'dqn'
    if parameters.model == 'dqn':
        if parameters.dueling == 'True':
            suffix = 'dueling_' + suffix
            is_dueling = True
        if parameters.double == 'True':
            suffix = 'double_' + suffix
            is_double = True
    else:
        suffix = 'ppo'

    # hyperparameters
    batch_size = 32
    lr = 0.002                # learning rate
    betas = (0.9, 0.999)
    gamma = 0.9               # reward discount
    target_iter = 1000        # target update frequency
    memory_capacity = 1000
    train_loss = 0
    epochs = 20
    # model_path = '/nfs/private/distribute-strategy/mdp/double_dueling/'
    model_path = 'mdp/' + suffix + '/'
    print(model_path)

    # load data
    raw_data = pd.read_csv('mdp/mdp_processed_data.csv')
    raw_data[cat_fea_name] = raw_data[cat_fea_name].fillna(
        raw_data[cat_fea_name].max() + 1)
    df = raw_data[raw_data.create_time < '2019-11-28:00:00:00']
    eval_data = raw_data[raw_data.create_time >= '2019-11-28:00:00:00']
    onehot = joblib.load('mdp/one_hot_online.model')
    onehot.handle_unknown = 'ignore'
    # onehot = OneHotEncoder().fit(df[cat_fea_name])
    # joblib.dump(onehot, '/nfs/private/distribute-strategy/mdp/one_hot_online.model')
    state_dim = len(high_fea_name) + len(continuous_fea_name) + len(
        cnt_fea_name) + len(binary_fea_name) + len(
            onehot.get_feature_names()) + 4
    print('state dim', state_dim)
    action_dim = 5

    # build action, reward and done columns for the offline transitions
    df['action'] = df.funds_channel_id.apply(one_hot_action)
    df['reward'] = df.reward.apply(lambda x: -1 if x == -1 else x / 1000000)
    df['done'] = df.next_funds_channel_id.apply(
        lambda x: True if x != '-1' else False)
    df[high_fea_name + continuous_fea_name + cnt_fea_name +
       binary_fea_name] = df[high_fea_name + continuous_fea_name +
                             cnt_fea_name + binary_fea_name].astype('float')
    # df[tongdun_fea_name] = 0
    df['mobile_city_id'] = 0
    df = df[~df.duplicated(['uid', 'action'], keep='first')].sort_values(
        by=['uid', 'create_time'])
    df_dense = np.round(
        df[high_fea_name + continuous_fea_name + cnt_fea_name +
           binary_fea_name].values, 6)
    df_wide = onehot.transform(df[cat_fea_name]).A
    df_fail = df[['fail_a', 'fail_b', 'fail_c', 'fail_d']].values

    # train model
    if parameters.model == 'ppo':
        model = train_ppo(df, df_dense, df_wide, df_fail, state_dim,
                          action_dim, lr, betas, gamma, epochs, model_path)
    else:
        model = train_dqn(df, df_dense, df_wide, df_fail, state_dim,
                          action_dim, memory_capacity, lr, betas, gamma,
                          target_iter, epochs, model_path,
                          is_double=is_double, is_dueling=is_dueling)

    # eval model
    continuous = eval_data[high_fea_name + continuous_fea_name +
                           cnt_fea_name + binary_fea_name].values
    category = onehot.transform(eval_data[cat_fea_name].values).A
    state_data = eval_data[['fail_a', 'fail_b', 'fail_c', 'fail_d']].values
    data = torch.FloatTensor(
        np.concatenate((continuous, category, state_data), axis=1))
    model_result = model(data).data.numpy()
    print(np.round(model_result[:100], 2))
    eval_data['model_result'] = model_result.argmax(axis=1)
    eval_data['model_result'] = eval_data.model_result.apply(
        lambda x: inverse_action(x))
    print('========== total result ===========')
    print(eval_data.model_result.value_counts())
    c = eval_data.uid.value_counts()
    multi = c[c >= 3].index.values
    print('========== users who applied >= 3 times and did not pass ===========')
    print(eval_data[eval_data.uid.isin(multi)
                    & (eval_data.funds_channel_id == 'e')
                    & (eval_data.reward <= 0)].model_result.value_counts())
# initialize the DQN agent
agent = AgentDQN(dim_state=dim_state,
                 dim_actions=dim_actions,
                 hidden_dims=hidden_dims,
                 optimizer=Adam(),
                 gamma=gamma,
                 eps=epsilon,
                 eps_decay=epsilon_decay,
                 frozen=frozen,
                 pretrained=pretrained)

# train the agent and collect the training logs
trained_agent, train_loss_1, train_states_1, \
    train_actions_1, train_rewards_1, train_new_states_1, \
    train_pred_1 = train_dqn(agent=agent,
                             environment=env,
                             episodes=3000,
                             batch_size=512,
                             init_d_size=500000,
                             max_d_size=500000,
                             target_update=200,
                             freeze_after=freeze_after)

# convert the logged transitions to DataFrames and free the raw lists
states_1 = pd.DataFrame(train_states_1)
actions_1 = pd.DataFrame(train_actions_1)
rewards_1 = pd.DataFrame(train_rewards_1)
new_states_1 = pd.DataFrame(train_new_states_1)
del train_states_1, train_actions_1, train_rewards_1, \
    train_new_states_1

log_1 = pd.concat([states_1, actions_1, rewards_1, new_states_1], axis=1)

loss_1 = pd.DataFrame(train_loss_1)
pred_1 = pd.DataFrame(train_pred_1)
del train_loss_1, train_pred_1
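# `log_1` above concatenates four DataFrames that all carry default integer
# column labels, so the combined frame ends up with duplicate column names.
# A minimal sketch of labelling the blocks before writing the log to disk;
# the prefixes and the output path are illustrative assumptions.
states_1 = states_1.add_prefix('state_')
actions_1 = actions_1.add_prefix('action_')
rewards_1 = rewards_1.add_prefix('reward_')
new_states_1 = new_states_1.add_prefix('new_state_')
log_1 = pd.concat([states_1, actions_1, rewards_1, new_states_1], axis=1)
log_1.to_csv('train_log_1.csv', index=False)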
        angle, angular_v, left_leg_on_ground, right_leg_on_ground]
ACTION: Discrete(4) - [do nothing, fire left engine, fire main engine,
        fire right engine]
REWARD:
    - moving from the top of the screen to the landing pad at zero speed: +100..140
    - if the lander moves away from the landing pad, it loses that reward back
    - episode finishes with the lander crashing: -100
    - episode finishes with the lander coming to rest: +100
    - each leg ground contact: +10
    - firing the main engine: -0.3 per frame
    - solved: +200
'''

# 1. Initialize the environment
env = gym.make('LunarLander-v2')
env.seed(0)
state = env.reset()
print('State shape: ', env.observation_space.shape, type(state))
print(state)

# 2. Initialize the agent
agent = Agent(state_size=8, action_size=4, seed=0)

# 3. Train the agent
scores = train_dqn(env, agent, n_episodes=4000)

# 4. Simulate the agent in the environment
simulate_env(env, agent, model_path='models/checkpoint_1900.pth')
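# To check that training actually approaches the +200 "solved" threshold, the
# per-episode scores returned by train_dqn can be plotted. A minimal sketch,
# assuming `scores` is a flat list of episode returns; the 100-episode window
# is an arbitrary choice.
import numpy as np
import matplotlib.pyplot as plt

scores_arr = np.asarray(scores, dtype=float)
window = 100
moving_avg = np.convolve(scores_arr, np.ones(window) / window, mode='valid')

plt.plot(scores_arr, alpha=0.3, label='episode score')
plt.plot(np.arange(window - 1, len(scores_arr)), moving_avg,
         label=f'{window}-episode average')
plt.axhline(200, linestyle='--', label='solved threshold (+200)')
plt.xlabel('episode')
plt.ylabel('score')
plt.legend()
plt.show()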
from train import train_dqn
import pickle

env_name = 'BreakoutNoFrameskip-v4'
exp_name = 'dqn'

train_dqn(env_name, exp_name, notebook=False)