import collections
import datetime

import numpy as np
import torch

# step_num, SYC_NUM, BUFFER_START_NUM and BUFFER_MAX_NUM, together with the
# hvenv environment class, the DQN model and the helper functions called
# below, are defined elsewhere in the original module.


def main():
    cuda0 = torch.device('cuda:0')
    v_num = 32
    # discount_gamma = 0.9
    epsilon = 0
    learning_rate = 0.0001
    epsilon_cut = 0.99
    epsilon_min = 0
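    # note: with epsilon = 0 and epsilon_min = 0 the policy acts greedily
    # from the very first step; raise epsilon for epsilon-greedy exploration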

    # number of update rounds
    iteration_num = 2

    v_state_num = 2
    batch_size = 100
    start_time = datetime.datetime.now()

    hv_env_num = 100
    envs = hvenv('../data', v_state_num, v_num, hv_env_num, step_num,
                 iteration_num)

    hv_env_num_val = 1000
    envs_val = hvenv('../data', v_state_num, v_num, hv_env_num_val, step_num,
                     iteration_num)


    rlnnet = DQN(2 * v_num**2, 2**step_num).to(cuda0)
    rln_tgt_net = DQN(2 * v_num**2, 2**step_num).to(cuda0)
    optimizer = torch.optim.Adam(params=rlnnet.parameters(), lr=learning_rate)
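    # experience replay buffers; fresh_exp_buffer presumably fills them up to
    # at least BUFFER_START_NUM and trims them once they exceed BUFFER_MAX_NUM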
    exp_buffer = collections.deque()
    exp_buffer_val = collections.deque()

    obs = envs.reset()
    obs_val = envs_val.reset()

    reward = envs.reward()
    reward_val = envs_val.reward()

    i = 0
    # rlnnet = torch.load('../weight/16step_multi_nobn_big_8192_large_ddqn_classical_init_-3learning_rate'+str(step_num)+'step_'+str(iteration_num)+'iteration_best_model.pth')

    # rlnnet = torch.load('../weight/16step_multi_big_8192_large_ddqn_classical_init_-3learning_rate'+str(step_num)+'step_'+str(iteration_num)+'iteration_best_model.pth')
    # rlnnet = torch.load('../weight/16step_multi_big_large_ddqn_classical_init_-3learning_rate'+str(step_num)+'step_'+str(iteration_num)+'iteration_best_model.pth')


    while True:

        if i % SYC_NUM == 0:
            print('----------------------------')
            print('start_time: ' + str(start_time))
            print('i/SYC_NUM: ' + str(i / SYC_NUM))
            print('syc epsilon: ' + str(epsilon))
            print('learning_rate: '+ str(learning_rate))
            print('v_state_num: '+str(v_state_num))
            print('v_num: '+str(v_num))
            print('hv_env_num: '+str(hv_env_num))
            print('batch_size: '+str(batch_size))
            print('step_num: '+str(step_num))
            print('iteration_num: '+str(iteration_num))
            print('BUFFER_START_NUM: '+str(BUFFER_START_NUM))
            print('BUFFER_length_NUM: '+str(len(exp_buffer)))
            print('----------------------------')


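            # every SYC_NUM steps: decay the exploration rate and sync the
            # target network with the online network (standard DQN practice)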
            epsilon *= epsilon_cut

            # torch.save(rlnnet, '../weight/16step_multi_big_large_ddqn_classical_init_-3learning_rate'+str(step_num)+'step_'+str(iteration_num)+'iteration_best_model.pth')
            # torch.save(rlnnet, '../weight/16step_multi_big_8192_large_ddqn_classical_init_-3learning_rate'+str(step_num)+'step_'+str(iteration_num)+'iteration_best_model.pth')
            # torch.save(rlnnet,'../weight/16step_multi_nobn_big_8192_large_ddqn_classical_init_-3learning_rate'+str(step_num)+'step_'+str(iteration_num)+'iteration_best_model.pth')
            rln_tgt_net.load_state_dict(rlnnet.state_dict())

        if epsilon < epsilon_min:
            epsilon = epsilon_min

        i += 1

        rlnnet.train()
        # collect experience
        exp_buffer, obs, reward = fresh_exp_buffer(exp_buffer, rlnnet, envs,
                                                   obs, epsilon, cuda0, reward,
                                                   BUFFER_START_NUM,
                                                   BUFFER_MAX_NUM)
        optimizer.zero_grad()
        exp_batch_index = np.random.choice(np.arange(len(exp_buffer)),
                                           size=batch_size,
                                           replace=False)
        batch = batch_sample(exp_batch_index, exp_buffer)

        loss_t = calc_loss(batch, rlnnet, rln_tgt_net, device=cuda0)
        loss_t.backward()
        optimizer.step()

        if i % 100 == 0:
            rlnnet.eval()
            print('-------eval testing: epsilon = 0 -------')
            for _ in range(int(v_num * iteration_num / step_num)):
                exp_buffer_val, obs_val, reward_val = fresh_exp_buffer(
                    exp_buffer_val, rlnnet, envs_val, obs_val, 0, cuda0,
                    reward_val, 10000, 11000, True)
            print('length buffer: ' + str(len(exp_buffer_val)))
            print('-------eval end-------')
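

# A minimal sketch of a double-DQN loss such as calc_loss above might compute
# (the commented-out weight filenames mention "ddqn"). The batch layout
# (states, actions, rewards, next_states) and the discount factor gamma are
# assumptions, not taken from this code.
def calc_loss_ddqn_sketch(batch, net, tgt_net, gamma=0.9, device='cpu'):
    states, actions, rewards, next_states = batch
    states = torch.as_tensor(np.asarray(states), dtype=torch.float32, device=device)
    next_states = torch.as_tensor(np.asarray(next_states), dtype=torch.float32, device=device)
    actions = torch.as_tensor(np.asarray(actions), dtype=torch.int64, device=device)
    rewards = torch.as_tensor(np.asarray(rewards), dtype=torch.float32, device=device)

    # Q(s, a) from the online network for the actions that were taken
    q_sa = net(states).gather(1, actions.unsqueeze(-1)).squeeze(-1)

    with torch.no_grad():
        # double DQN: the online net selects the next action,
        # the target net evaluates it
        next_actions = net(next_states).argmax(dim=1, keepdim=True)
        next_q = tgt_net(next_states).gather(1, next_actions).squeeze(-1)
        # no terminal-state mask here; assumes fixed-length episodes
        targets = rewards + gamma * next_q

    return torch.nn.functional.mse_loss(q_sa, targets)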


# Example 2

def main():
    cuda0 = torch.device('cuda:0')
    v_num = 16
    # discount_gamma = 0.9
    epsilon = 0
    learning_rate = 0.0001
    epsilon_cut = 0.99
    epsilon_min = 0

    # number of update rounds
    iteration_num = 1

    v_state_num = 2
    batch_size = 100
    start_time = datetime.datetime.now()

    hv_env_num = 100
    envs = hvenv('../data', v_state_num, v_num, hv_env_num, step_num,
                 iteration_num)

    hv_env_num_val = 1000
    envs_val = hvenv('../data', v_state_num, v_num, hv_env_num_val, step_num,
                     iteration_num)

    rlnnet = DQN(2 * v_num**2, 2**step_num).to(cuda0)
    rln_tgt_net = DQN(2 * v_num**2, 2**step_num).to(cuda0)
    optimizer = torch.optim.Adam(params=rlnnet.parameters(), lr=learning_rate)
    exp_buffer = collections.deque()
    exp_buffer_val = collections.deque()

    obs = envs.reset()
    obs_val = envs_val.reset()

    reward = envs.reward()
    reward_val = envs_val.reward()

    i = 0
    reward_record = 0
    # rlnnet = torch.load(SAVE_PATH)

    while True:

        if i % SYC_NUM == 0:
            print('----------------------------')
            print('start_time: ' + str(start_time))
            print('i/SYC_NUM: ' + str(i / SYC_NUM))
            print('syc epsilon: ' + str(epsilon))
            print('learning_rate: ' + str(learning_rate))
            print('v_state_num: ' + str(v_state_num))
            print('v_num: ' + str(v_num))
            print('hv_env_num: ' + str(hv_env_num))
            print('batch_size: ' + str(batch_size))
            print('step_num: ' + str(step_num))
            print('iteration_num: ' + str(iteration_num))
            print('BUFFER_START_NUM: ' + str(BUFFER_START_NUM))
            print('BUFFER_length_NUM: ' + str(len(exp_buffer)))
            print('----------------------------')

            epsilon *= epsilon_cut
            rln_tgt_net.load_state_dict(rlnnet.state_dict())

        if epsilon < epsilon_min:
            epsilon = epsilon_min

        i += 1

        rlnnet.train()
        # collect experience
        exp_buffer, obs, reward = fresh_exp_buffer(exp_buffer, rlnnet, envs,
                                                   obs, epsilon, cuda0, reward,
                                                   BUFFER_START_NUM,
                                                   BUFFER_MAX_NUM)
        optimizer.zero_grad()
        exp_batch_index = np.random.choice(np.arange(len(exp_buffer)),
                                           size=batch_size,
                                           replace=False)
        batch = batch_sample(exp_batch_index, exp_buffer)

        loss_t = calc_loss(batch, rlnnet, rln_tgt_net, device=cuda0)
        loss_t.backward()
        optimizer.step()

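        # eval here is a module-level helper (it shadows the builtin eval);
        # it appears to run validation rollouts and track the best reward seen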
        if i % 100 == 0:
            obs_val, reward_val, reward_record = eval(rlnnet, envs_val,
                                                      obs_val, epsilon, cuda0,
                                                      reward_val,
                                                      reward_record)
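

# A minimal entry point, assuming step_num, SYC_NUM, BUFFER_START_NUM and
# BUFFER_MAX_NUM have been defined at module level before main() runs.
if __name__ == '__main__':
    main()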