Example #1
    def objective(self, params):
        """
        Computes the mean squared error between the network's predictions and
        the true Q-values. The network is trained with the hyperparameters
        given in `params`.

        Arguments
        ---------
        :param params : dict
               Dictionary containing values for hyperparameters to be
               optimized.

        Returns
        -------
        :returns : dict
                 Dictionary containing the mean squared error between the true
                 and estimated Q-values (computed with the parameter
                 configuration from params) and the status of the optimization.
        """

        a = params["lr"]             # learning rate
        b = params["lr decay"]       # learning-rate decay
        c = params["batch size"]     # mini-batch size
        d = params["target update"]  # target-network update interval

        # initialize the RL agent:
        agent = AgentDQN(dim_state=self.dim_state,
                         dim_actions=self.dim_actions,
                         hidden_dims=self.hidden_dims,
                         optimizer=Adam(lr=a, decay=b),
                         gamma=self.gamma,
                         eps=self.eps,
                         eps_decay=self.eps_decay,
                         frozen=self.frozen,
                         pretrained=self.pretrained)

        # initialize the environment:
        env = Env(start=self.start,
                  tcost=self.tcost,
                  horizon=self.horizon,
                  w=self.w,
                  theta=self.theta,
                  regimes=self.regimes)

        trained_agent, _, _, _, _, _, _ = train_dqn(agent, env,
                                                    self.train_episodes, c,
                                                    self.init_d_size,
                                                    self.max_d_size, d,
                                                    self.freeze_after)

        pred = trained_agent.qnn.predict(self.x)
        true = self.y

        mse = np.mean((pred - true)**2)

        return {"loss": mse, "status": STATUS_OK}
Example #2
                    type=int,
                    help='evaluate frequency in run_episode',
                    default=500)
parser.add_argument('--update_target_steps',
                    type=int,
                    help='update frequency for target Q model',
                    default=16)
parser.add_argument('--ckpt_path',
                    type=str,
                    help='weight file name for fine-tuning (optional)',
                    default='ckpt/episode_5000.ckpt')
parser.add_argument('--save_checkpoint_freq',
                    type=int,
                    help='episode interval to save checkpoint',
                    default=2000)

if __name__ == '__main__':
    args = parser.parse_args()
    if args.cuda and not torch.cuda.is_available():
        print('CUDA is not available, maybe you should not set --cuda')
        sys.exit(1)
    if args.play and args.ckpt_path == '':
        print('When testing, a pretrained weight file must be given via --ckpt_path')
        sys.exit(1)
    if args.cuda:
        print('With GPU support!')
    if args.play:
        play_game(args)
    else:
        train_dqn(args)
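
The excerpt begins partway through the parser definition, so the --cuda and --play flags referenced under __main__ are not shown. A hypothetical sketch of what that missing head might look like; the flag types, help strings, and defaults are assumptions only:

import argparse
import sys

import torch

parser = argparse.ArgumentParser(description='DQN training / evaluation')
# Flag names chosen to match the args.* attributes used in __main__ above.
parser.add_argument('--cuda', action='store_true',
                    help='run the networks on GPU')
parser.add_argument('--play', action='store_true',
                    help='evaluate a pretrained checkpoint instead of training')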
Example #3
def main(parameters):
    print(parameters)
    is_double = False
    is_dueling = False
    suffix = 'dqn'
    if parameters.model == 'dqn':
        if parameters.dueling == 'True':
            suffix = 'dueling_' + suffix
            is_dueling = True
        if parameters.double == 'True':
            suffix = 'double_' + suffix
            is_double = True
    else:
        suffix = 'ppo'

    # Hyperparameters
    batch_size = 32
    lr = 0.002  # learning rate
    betas = (0.9, 0.999)
    gamma = 0.9  # reward discount
    target_iter = 1000  # target update frequency
    memory_capacity = 1000
    train_loss = 0
    epochs = 20
    # model_path = '/nfs/private/distribute-strategy/mdp/double_dueling/'
    model_path = 'mdp/' + suffix + '/'
    print(model_path)

    # load data
    raw_data = pd.read_csv('mdp/mdp_processed_data.csv')
    raw_data[cat_fea_name] = raw_data[cat_fea_name].fillna(
        raw_data[cat_fea_name].max() + 1)
    df = raw_data[raw_data.create_time < '2019-11-28:00:00:00']
    eval_data = raw_data[raw_data.create_time >= '2019-11-28:00:00:00']

    onehot = joblib.load('mdp/one_hot_online.model')
    onehot.handle_unknown = 'ignore'
    # onehot = OneHotEncoder().fit(df[cat_fea_name])
    # joblib.dump(onehot, '/nfs/private/distribute-strategy/mdp/one_hot_online.model')
    state_dim = len(high_fea_name) + len(continuous_fea_name) + len(
        cnt_fea_name) + len(binary_fea_name) + len(
            onehot.get_feature_names()) + 4
    print('state dim', state_dim)
    action_dim = 5

    df['action'] = df.funds_channel_id.apply(one_hot_action)

    df['reward'] = df.reward.apply(lambda x: -1 if x == -1 else x / 1000000)

    df['done'] = df.next_funds_channel_id.apply(lambda x: x != '-1')

    df[high_fea_name + continuous_fea_name + cnt_fea_name +
       binary_fea_name] = df[high_fea_name + continuous_fea_name +
                             cnt_fea_name + binary_fea_name].astype('float')

    # df[tongdun_fea_name] = 0
    df['mobile_city_id'] = 0

    df = df[~df.duplicated(['uid', 'action'], keep='first')].sort_values(
        by=['uid', 'create_time'])

    df_dense = np.round(
        df[high_fea_name + continuous_fea_name + cnt_fea_name +
           binary_fea_name].values, 6)

    df_wide = onehot.transform(df[cat_fea_name]).A

    df_fail = df[['fail_a', 'fail_b', 'fail_c', 'fail_d']].values

    # train model
    if parameters.model == 'ppo':
        model = train_ppo(df, df_dense, df_wide, df_fail, state_dim,
                          action_dim, lr, betas, gamma, epochs, model_path)
    else:
        model = train_dqn(df,
                          df_dense,
                          df_wide,
                          df_fail,
                          state_dim,
                          action_dim,
                          memory_capacity,
                          lr,
                          betas,
                          gamma,
                          target_iter,
                          epochs,
                          model_path,
                          is_double=is_double,
                          is_dueling=is_dueling)

    # eval model
    continuous = eval_data[high_fea_name + continuous_fea_name + cnt_fea_name +
                           binary_fea_name].values
    category = onehot.transform(eval_data[cat_fea_name].values).A
    state_data = eval_data[['fail_a', 'fail_b', 'fail_c', 'fail_d']].values
    data = torch.FloatTensor(
        np.concatenate((continuous, category, state_data), axis=1))

    model_result = model(data).data.numpy()

    print(np.round(model_result[:100], 2))

    eval_data['model_result'] = model_result.argmax(axis=1)
    eval_data['model_result'] = eval_data.model_result.apply(inverse_action)

    print('========== total result ===========')
    print(eval_data.model_result.value_counts())

    c = eval_data.uid.value_counts()
    multi = c[c >= 3].index.values
    print('========== users who applied 3+ times and did not pass ===========')
    print(eval_data[eval_data.uid.isin(multi)
                    & (eval_data.funds_channel_id == 'e') &
                    (eval_data.reward <= 0)].model_result.value_counts())
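
The is_double and is_dueling flags are handed straight to train_dqn, whose body is not part of the excerpt. For reference, here is a minimal sketch of the double-DQN bootstrap target that an is_double flag typically switches on; the function and argument names below are illustrative, not this repository's API.

import torch


def td_target(reward, next_state, done, q_online, q_target, gamma, is_double):
    """One-step TD target for a batch of transitions (illustrative only)."""
    with torch.no_grad():
        q_next = q_target(next_state)                    # (batch, n_actions)
        if is_double:
            # Double DQN: pick the action with the online network,
            # evaluate it with the target network.
            best_action = q_online(next_state).argmax(dim=1, keepdim=True)
            next_value = q_next.gather(1, best_action).squeeze(1)
        else:
            # Vanilla DQN: bootstrap from the target network's own maximum.
            next_value = q_next.max(dim=1).values
        return reward + gamma * next_value * (1.0 - done.float())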
Example #4
        agent = AgentDQN(dim_state=dim_state,
                         dim_actions=dim_actions,
                         hidden_dims=hidden_dims,
                         optimizer=Adam(),
                         gamma=gamma,
                         eps=epsilon,
                         eps_decay=epsilon_decay,
                         frozen=frozen,
                         pretrained=pretrained)

        trained_agent, train_loss_1, train_states_1, \
            train_actions_1, train_rewards_1, train_new_states_1, \
            train_pred_1 = train_dqn(agent=agent,
                                     environment=env,
                                     episodes=3000,
                                     batch_size=512,
                                     init_d_size=500000,
                                     max_d_size=500000,
                                     target_update=200,
                                     freeze_after=freeze_after)

        # collect the raw training traces as DataFrames for logging
        states_1 = pd.DataFrame(train_states_1)
        actions_1 = pd.DataFrame(train_actions_1)
        rewards_1 = pd.DataFrame(train_rewards_1)
        new_states_1 = pd.DataFrame(train_new_states_1)
        del train_states_1, train_actions_1, train_rewards_1, \
            train_new_states_1
        log_1 = pd.concat([states_1, actions_1, rewards_1, new_states_1],
                          axis=1)
        loss_1 = pd.DataFrame(train_loss_1)
        pred_1 = pd.DataFrame(train_pred_1)
        del train_loss_1, train_pred_1
Example #5
              angle, angular_v, left_leg_on_ground, right_leg_on_ground]

ACTION: Discrete(4) -
        [do nothing, fire left engine, fire main engine, fire right engine]

REWARD:
    - Moving from the top of the screen to the landing pad at zero speed : +100..140
    - If the lander moves away from the landing pad, it loses that reward
    - Episode finishes with the lander crashing       : -100
    - Episode finishes with the lander coming to rest : +100
    - Each leg in ground contact                      : +10
    - Firing the main engine                          : -0.3 per frame
    - Solved                                          : +200
'''

# 1. Initialize environment
env = gym.make('LunarLander-v2')
env.seed(0)
state = env.reset()

print('State shape: ', env.observation_space.shape, type(state))
print(state)

# 2. Initialize Agent
agent = Agent(state_size=8, action_size=4, seed=0)

# 3. Train Agent
scores = train_dqn(env, agent, n_episodes=4000)

# 4. Simulate Agent in the Environment
simulate_env(env, agent, model_path='models/checkpoint_1900.pth')
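
simulate_env is called with a saved checkpoint but is not defined in the excerpt. A hypothetical sketch of what such a function might do, assuming the Agent exposes a qnetwork_local module and an act() method (both are assumptions; the Agent class is not shown):

import torch


def simulate_env(env, agent, model_path, n_steps=1000):
    # Assumed API: load the saved weights into the agent's local Q-network.
    agent.qnetwork_local.load_state_dict(torch.load(model_path))
    state = env.reset()
    for _ in range(n_steps):
        env.render()
        action = agent.act(state)                  # greedy action from the trained network
        state, reward, done, _ = env.step(action)
        if done:
            state = env.reset()
    env.close()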
Example #6
from train import train_dqn
import pickle

env_name = 'BreakoutNoFrameskip-v4'
exp_name = 'dqn'
train_dqn(env_name, exp_name, notebook=False)