Example #1
    def train(self):
        args = self.args
        torch.manual_seed(args.seed)
        env = grid2op.make(args.env_name,
                           test=args.for_test,
                           reward_class=L2RPNReward)
        shared_model = ActorCritic(env.observation_space.size(),
                                   self.action_space, args.hidden_size)
        shared_model.share_memory()

        if args.no_shared:
            optimizer = None
        else:
            optimizer = my_optim.SharedAdam(shared_model.parameters(),
                                            lr=args.lr)
            optimizer.share_memory()

        processes = []

        counter = mp.Value('i', 0)
        lock = mp.Lock()

        p = mp.Process(target=self.do_test,
                       args=(args.num_processes, args, shared_model, counter))
        p.start()
        processes.append(p)

        for rank in range(0, args.num_processes):
            p = mp.Process(target=self.do_train,
                           args=(rank, args, shared_model, counter, lock,
                                 optimizer))
            p.start()
            processes.append(p)
        for p in processes:
            p.join()
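Every example on this page imports my_optim.SharedAdam without defining it. For reference, here is a minimal sketch of the widely copied pytorch-a3c-style implementation that these repos appear to assume (the full version also overrides step() to consume this shared state, omitted here for brevity):

import torch
import torch.optim as optim

class SharedAdam(optim.Adam):
    """Adam whose per-parameter state lives in shared memory, so workers
    in separate processes all update the same moment estimates."""

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8,
                 weight_decay=0):
        super(SharedAdam, self).__init__(params, lr=lr, betas=betas,
                                         eps=eps, weight_decay=weight_decay)
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = torch.zeros(1)
                state['exp_avg'] = p.data.new().resize_as_(p.data).zero_()
                state['exp_avg_sq'] = p.data.new().resize_as_(p.data).zero_()

    def share_memory(self):
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'].share_memory_()
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()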
Example #2
    def __init__(self, args_, logger_):
        self.args = args_
        self.logger = logger_
        self.env = AtariEnv(gym.make(self.args.game),
                            args_.frame_seq,
                            args_.frame_skip,
                            render=True)
        self.shared_model = A3CLSTMNet(self.env.state_shape,
                                       self.env.action_dim)
        self.shared_model.share_memory()
        self.optim = my_optim.SharedAdam(self.shared_model.parameters(),
                                         lr=self.args.lr)
        self.optim.share_memory()
        # visdom
        self.vis = visdom.Visdom()
        self.main_update_step = Value('d', 0)
        # load model
        if self.args.load_weight != 0:
            self.load_model(self.args.load_weight)

        self.jobs = []
        if self.args.t_flag:
            for process_id in range(self.args.jobs):
                job = A3CSingleProcess(process_id, self, logger_)
                self.jobs.append(job)
        self.test_win = None
Example #3
def main():
    # environment setup
    args = config()
    mp.set_start_method("spawn")
    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['CUDA_VISIBLE_DEVICES'] = ""

    env = create_atari_env(args.env_name)
    shared_model = AcotrCritic(env.observation_space.shape[0],
                               env.action_space)
    shared_model.share_memory()

    optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=args.lr)
    optimizer.share_memory()

    processes = []

    counter = mp.Value('i', 0)
    lock = mp.Lock()

    p = mp.Process(target=test,
                   args=(args.num_processes, args, shared_model, counter,
                         "./log/"))
    p.start()
    processes.append(p)

    for rank in range(0, args.num_processes):
        p = mp.Process(target=train,
                       args=(rank, args, shared_model, counter, lock,
                             optimizer))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
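mp.set_start_method("spawn") makes every child process re-import the main module, so an entry point like main() must only be invoked from under an if __name__ == '__main__' guard. A minimal self-contained sketch of the pattern (worker is a placeholder function):

import torch.multiprocessing as mp

def worker(rank):
    print('worker', rank, 'started')

if __name__ == '__main__':
    mp.set_start_method('spawn')  # required when children use CUDA tensors
    procs = [mp.Process(target=worker, args=(rank,)) for rank in range(4)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()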
Example #4
class Params():
    def __init__(self):
        self.lr = 0.0001
        self.gamma = 0.99
        self.tau = 1.
        self.seed = 1
        self.num_processes = 16
        self.num_steps = 20
        self.max_episode_length = 10000
        self.env_name = 'Breakout-v0'


# Main run
os.environ['OMP_NUM_THREADS'] = '1'
params = Params()
torch.manual_seed(params.seed)
env = create_atari_env(params.env_name)
shared_model = ActorCritic(env.observation_space.shape[0], env.action_space)
shared_model.share_memory()
optimizer = my_optim.SharedAdam(shared_model.parameters(), lr=params.lr)
optimizer.share_memory()
processes = []
p = mp.Process(target=test, args=(params.num_processes, params, shared_model))
p.start()
processes.append(p)
for rank in range(0, params.num_processes):
    p = mp.Process(target=train, args=(rank, params, shared_model, optimizer))
    p.start()
    processes.append(p)
for p in processes:
    p.join()
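The train function handed to each process is not shown in these snippets. In pytorch-a3c-style code it usually keeps a local copy of the model, backpropagates on it, and aliases the local gradients onto the shared parameters before stepping the shared optimizer; a sketch of that glue (the exact helper may differ per repo):

def ensure_shared_grads(model, shared_model):
    # Alias each shared parameter's .grad to the local worker's gradient.
    # After the first call the aliases persist, so we can return early.
    for param, shared_param in zip(model.parameters(),
                                   shared_model.parameters()):
        if shared_param.grad is not None:
            return
        shared_param._grad = param.grad

Inside the worker loop this runs between loss.backward() and optimizer.step(), so the shared Adam statistics are applied to gradients computed on the worker's local copy.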
Example #5
    loader = DataLoader(opt)  # not used in training procedure, just used to set vocab_size and seq_length
    opt.vocab_size = loader.vocab_size
    opt.seq_length = loader.seq_length

    model = models.setup(opt)
    model.train()
    num_parameter = get_num_params(model)
    print('number of parameters: ' + str(num_parameter))

    if opt.async_opt:
        if opt.use_cuda:
            model.cuda()
        model.share_memory()
        optimizer = my_optim.SharedAdam(model.parameters(),
                                        lr=opt.optim_lr,
                                        betas=(opt.optim_adam_beta1, opt.optim_adam_beta2),
                                        weight_decay=opt.optim_weight_decay)
        optimizer.share_memory()
        processes = []
        for rank in range(opt.num_processes):
            p = mp.Process(target=train, args=(rank, model, opt, optimizer))
            p.start()
            processes.append(p)

        for p in processes:
            p.join()
    else:
        if opt.use_cuda:
            model.cuda()
        rank = 0
        optimizer = None
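get_num_params is not defined in this snippet; a plausible implementation consistent with the printout above (an assumption, not necessarily the repo's helper):

def get_num_params(model):
    # Total number of trainable parameters.
    return sum(p.numel() for p in model.parameters() if p.requires_grad)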
Example #6
    shared_model.share_memory()

    if not args.no_curiosity:
        # <---ICM---
        shared_curiosity = IntrinsicCuriosityModule(
            # env.observation_space.shape[0], env.action_space)
            args.num_stack,
            env.action_space)
        shared_curiosity.share_memory()
        # ---ICM--->

    if args.no_shared:
        optimizer = None
    else:
        if args.no_curiosity:
            optimizer = my_optim.SharedAdam(shared_model.parameters(),
                                            lr=args.lr)
        else:
            if not args.curiosity_only:
                optimizer = my_optim.SharedAdam(  # ICM
                    chain(shared_model.parameters(),
                          shared_curiosity.parameters()),
                    lr=args.lr)
            else:
                optimizer = my_optim.SharedAdam(shared_curiosity.parameters(),
                                                lr=args.lr)
        optimizer.share_memory()

    if (args.model_file is not None) and (args.optimizer_file is not None):
        logging.info("Start with a pretrained model")
        shared_model.load_state_dict(torch.load(args.model_file))
        optimizer.load_state_dict(torch.load(args.optimizer_file))
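The chain(...) call above is itertools.chain: a single optimizer steps over the concatenation of both modules' parameter iterators. A self-contained illustration with hypothetical stand-in modules:

from itertools import chain

import torch
import torch.nn as nn

policy = nn.Linear(4, 2)  # hypothetical stand-in for shared_model
icm = nn.Linear(4, 4)     # hypothetical stand-in for shared_curiosity

# One optimizer over both parameter sets.
opt = torch.optim.Adam(chain(policy.parameters(), icm.parameters()), lr=1e-4)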
Example #7
def main(method):
    args = built_parser(method=method)
    env = gym.make(args.env_name)
    state_dim = env.observation_space.shape
    action_dim = env.action_space.shape[0]

    args.state_dim = state_dim
    args.action_dim = action_dim
    action_high = env.action_space.high
    action_low = env.action_space.low
    args.action_high = action_high.tolist()
    args.action_low = action_low.tolist()
    args.seed = np.random.randint(0, 30)
    args.init_time = time.time()

    if args.alpha == 'auto' and args.target_entropy == 'auto':
        delta_a = np.array(args.action_high, dtype=np.float32) - np.array(
            args.action_low, dtype=np.float32)
        args.target_entropy = -1 * args.action_dim  #+ sum(np.log(delta_a/2))

    Q_net1 = QNet(args)
    Q_net1.train()
    Q_net1.share_memory()
    Q_net1_target = QNet(args)
    Q_net1_target.train()
    Q_net1_target.share_memory()
    Q_net2 = QNet(args)
    Q_net2.train()
    Q_net2.share_memory()
    Q_net2_target = QNet(args)
    Q_net2_target.train()
    Q_net2_target.share_memory()
    actor1 = PolicyNet(args)

    actor1.train()
    actor1.share_memory()
    actor1_target = PolicyNet(args)
    actor1_target.train()
    actor1_target.share_memory()
    actor2 = PolicyNet(args)
    actor2.train()
    actor2.share_memory()
    actor2_target = PolicyNet(args)
    actor2_target.train()
    actor2_target.share_memory()

    Q_net1_target.load_state_dict(Q_net1.state_dict())
    Q_net2_target.load_state_dict(Q_net2.state_dict())
    actor1_target.load_state_dict(actor1.state_dict())
    actor2_target.load_state_dict(actor2.state_dict())

    Q_net1_optimizer = my_optim.SharedAdam(Q_net1.parameters(),
                                           lr=args.critic_lr)
    Q_net1_optimizer.share_memory()
    Q_net2_optimizer = my_optim.SharedAdam(Q_net2.parameters(),
                                           lr=args.critic_lr)
    Q_net2_optimizer.share_memory()
    actor1_optimizer = my_optim.SharedAdam(actor1.parameters(),
                                           lr=args.actor_lr)
    actor1_optimizer.share_memory()
    actor2_optimizer = my_optim.SharedAdam(actor2.parameters(),
                                           lr=args.actor_lr)
    actor2_optimizer.share_memory()
    log_alpha = torch.zeros(1, dtype=torch.float32, requires_grad=True)
    log_alpha.share_memory_()
    alpha_optimizer = my_optim.SharedAdam([log_alpha], lr=args.alpha_lr)
    alpha_optimizer.share_memory()
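    # Note: log_alpha drives SAC's automatic temperature tuning. With
    # target_entropy = -action_dim (set above), each learner typically
    # minimizes  -(log_alpha * (log_prob + target_entropy).detach()).mean()
    # and uses alpha = log_alpha.exp() in the policy loss. This is the
    # standard SAC update, sketched here because the learner code is not shown.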

    share_net = [
        Q_net1, Q_net1_target, Q_net2, Q_net2_target, actor1, actor1_target,
        actor2, actor2_target, log_alpha
    ]
    share_optimizer = [
        Q_net1_optimizer, Q_net2_optimizer, actor1_optimizer, actor2_optimizer,
        alpha_optimizer
    ]

    experience_in_queue = []
    experience_out_queue = []
    for i in range(args.num_buffers):
        experience_in_queue.append(Queue(maxsize=10))
        experience_out_queue.append(Queue(maxsize=10))
    shared_queue = [experience_in_queue, experience_out_queue]
    step_counter = mp.Value('i', 0)
    stop_sign = mp.Value('i', 0)
    iteration_counter = mp.Value('i', 0)
    shared_value = [step_counter, stop_sign, iteration_counter]
    lock = mp.Lock()
    procs = []
    if args.code_model == "train":
        for i in range(args.num_actors):
            procs.append(
                Process(target=actor_agent,
                        args=(args, shared_queue, shared_value,
                              [actor1, Q_net1], lock, i)))
        for i in range(args.num_buffers):
            procs.append(
                Process(target=buffer,
                        args=(args, shared_queue, shared_value, i)))
        procs.append(
            Process(target=evaluate_agent,
                    args=(args, shared_value, share_net)))
        for i in range(args.num_learners):
            #device = torch.device("cuda")
            device = torch.device("cpu")
            procs.append(
                Process(target=leaner_agent,
                        args=(args, shared_queue, shared_value, share_net,
                              share_optimizer, device, lock, i)))
    elif args.code_model == "simu":
        procs.append(Process(target=simu_agent, args=(args, shared_value)))

    for p in procs:
        p.start()
    for p in procs:
        p.join()
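The actor, buffer, and learner processes communicate through the bounded Queues built above; put() blocks once a queue holds maxsize items, which throttles fast producers. A minimal producer/consumer sketch of the mechanism (illustrative only, not the repo's actor or buffer code):

import torch.multiprocessing as mp

def producer(q):
    for i in range(5):
        q.put(i)  # blocks while the queue is full (maxsize bound)
    q.put(None)   # sentinel telling the consumer to stop

def consumer(q):
    while True:
        item = q.get()  # blocks until an item arrives
        if item is None:
            break
        print('got', item)

if __name__ == '__main__':
    q = mp.Queue(maxsize=10)
    procs = [mp.Process(target=producer, args=(q,)),
             mp.Process(target=consumer, args=(q,))]
    for p in procs:
        p.start()
    for p in procs:
        p.join()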
Example #8
        if args.planning:
            d_module = load_d_module(env.action_space.shape[0], args)

        shared_model = R_Module(env.action_space.shape[0],
                                args.dim,
                                discrete=args.discrete,
                                baseline=args.baseline,
                                state_space=env.observation_space.shape[0])

        # shared reward module for everyone
        shared_model.share_memory()

        if args.no_shared:
            optimizer = None
        else:
            optimizer = my_optim.SharedAdam(shared_model.parameters(),
                                            lr=args.lr)
            optimizer.share_memory()

        processes = []

        total_args = args
        train_agent_method = train_rewards

        for rank in range(0, args.num_processes):
            if rank == 0:
                p = mp.Process(target=train_agent_method,
                               args=(rank, total_args, shared_model, enc,
                                     optimizer, tb_log_dir, d_module))
            else:
Example #9
def main(method):

    params = {
        'obs_size': (160, 100),  # screen size of cv2 window
        'dt': 0.025,  # time interval between two frames
        'ego_vehicle_filter': 'vehicle.lincoln*',  # filter for defining the ego vehicle
        'port': 2000,  # connection port
        'task_mode': 'Straight',  # task mode, one of [random, roundabout (Town03 only)]
        'code_mode': 'train',
        'max_time_episode': 100,  # maximum timesteps per episode
        'desired_speed': 15,  # desired speed (m/s)
        'max_ego_spawn_times': 100,  # maximum attempts to spawn the ego vehicle
    }

    args = built_parser(method=method)
    env = gym.make(args.env_name, params=params)
    state_dim = env.state_space.shape
    action_dim = env.action_space.shape[0]

    args.state_dim = state_dim
    args.action_dim = action_dim
    action_high = env.action_space.high
    action_low = env.action_space.low
    args.action_high = action_high.tolist()
    args.action_low = action_low.tolist()
    args.seed = np.random.randint(0, 30)
    args.init_time = time.time()
    num_cpu = mp.cpu_count()
    print(state_dim, action_dim, action_high, num_cpu)

    if args.alpha == 'auto' and args.target_entropy == 'auto':
        delta_a = np.array(args.action_high, dtype=np.float32) - np.array(
            args.action_low, dtype=np.float32)
        args.target_entropy = -1 * args.action_dim  # + sum(np.log(delta_a/2))

    Q_net1 = QNet(args)
    Q_net1.train()
    Q_net1.share_memory()
    Q_net1_target = QNet(args)
    Q_net1_target.train()
    Q_net1_target.share_memory()
    Q_net2 = QNet(args)
    Q_net2.train()
    Q_net2.share_memory()
    Q_net2_target = QNet(args)
    Q_net2_target.train()
    Q_net2_target.share_memory()
    actor1 = PolicyNet(args)

    print("Network inited")

    if args.code_model == "eval":
        actor1.load_state_dict(
            torch.load('./' + args.env_name + '/method_' + str(args.method) +
                       '/model/policy_' + str(args.max_train) + '.pkl'))
    actor1.train()
    actor1.share_memory()
    actor1_target = PolicyNet(args)
    actor1_target.train()
    actor1_target.share_memory()
    actor2 = PolicyNet(args)
    actor2.train()
    actor2.share_memory()
    actor2_target = PolicyNet(args)
    actor2_target.train()
    actor2_target.share_memory()

    print("Network set")

    Q_net1_target.load_state_dict(Q_net1.state_dict())
    Q_net2_target.load_state_dict(Q_net2.state_dict())
    actor1_target.load_state_dict(actor1.state_dict())
    actor2_target.load_state_dict(actor2.state_dict())

    print("Network loaded!")

    Q_net1_optimizer = my_optim.SharedAdam(Q_net1.parameters(),
                                           lr=args.critic_lr)
    Q_net1_optimizer.share_memory()
    Q_net2_optimizer = my_optim.SharedAdam(Q_net2.parameters(),
                                           lr=args.critic_lr)
    Q_net2_optimizer.share_memory()
    actor1_optimizer = my_optim.SharedAdam(actor1.parameters(),
                                           lr=args.actor_lr)
    actor1_optimizer.share_memory()
    actor2_optimizer = my_optim.SharedAdam(actor2.parameters(),
                                           lr=args.actor_lr)
    actor2_optimizer.share_memory()
    log_alpha = torch.zeros(1, dtype=torch.float32, requires_grad=True)
    log_alpha.share_memory_()
    alpha_optimizer = my_optim.SharedAdam([log_alpha], lr=args.alpha_lr)
    alpha_optimizer.share_memory()

    print("Optimizer done")

    share_net = [
        Q_net1, Q_net1_target, Q_net2, Q_net2_target, actor1, actor1_target,
        actor2, actor2_target, log_alpha
    ]
    share_optimizer = [
        Q_net1_optimizer, Q_net2_optimizer, actor1_optimizer, actor2_optimizer,
        alpha_optimizer
    ]

    experience_in_queue = []
    experience_out_queue = []
    for i in range(args.num_buffers):
        experience_in_queue.append(Queue(maxsize=10))
        experience_out_queue.append(Queue(maxsize=10))
    shared_queue = [experience_in_queue, experience_out_queue]
    step_counter = mp.Value('i', 0)
    stop_sign = mp.Value('i', 0)
    iteration_counter = mp.Value('i', 0)
    shared_value = [step_counter, stop_sign, iteration_counter]
    lock = mp.Lock()
    procs = []
    if args.code_model == "train":
        for i in range(args.num_learners):
            if i % 2 == 0:
                device = torch.device("cuda:1")
            else:
                device = torch.device("cuda:0")
            # device = torch.device("cpu")
            procs.append(
                Process(target=leaner_agent,
                        args=(args, shared_queue, shared_value, share_net,
                              share_optimizer, device, lock, i)))
        for i in range(args.num_actors):
            procs.append(
                Process(target=actor_agent,
                        args=(args, shared_queue, shared_value,
                              [actor1, Q_net1], lock, i)))
        for i in range(args.num_buffers):
            procs.append(
                Process(target=buffer,
                        args=(args, shared_queue, shared_value, i)))
        procs.append(
            Process(target=evaluate_agent,
                    args=(args, shared_value, share_net)))
    elif args.code_model == "simu":
        procs.append(Process(target=simu_agent, args=(args, shared_value)))

    for p in procs:
        p.start()
    for p in procs:
        p.join()
Example #10
        self.seed = 1
        self.num_processes = 16
        self.num_steps = 20
        self.max_episode_length = 10000
        self.env_name = 'Breakout-v0'


# Main run
os.environ['OMP_NUM_THREADS'] = '1'  # limit each worker process to a single OpenMP thread
params = Params()  # creating the params object from the Params class, which sets all the model parameters
torch.manual_seed(params.seed)  # setting the seed (not essential)
env = create_atari_env(params.env_name)  # creating an optimized environment thanks to universe
shared_model = ActorCritic(env.observation_space.shape[0],
                           env.action_space)  # shared_model is the model shared by the different agents (separate processes, possibly on different cores)
shared_model.share_memory()  # storing the model in shared memory so that every worker process can read and update the same parameters
optimizer = my_optim.SharedAdam(shared_model.parameters(),
                                lr=params.lr)  # the optimizer is also shared because it acts on the shared model
optimizer.share_memory()  # likewise, the optimizer state is kept in shared memory so that every agent updates the shared model with the same statistics
processes = []  # initializing the processes with an empty list
p = mp.Process(target=test, args=(params.num_processes, params,
                                  shared_model))  # creating the 'test' process with arguments passed to the 'test' target function - it evaluates the shared model without updating it - torch.multiprocessing.Process runs a function in a separate process
p.start()  # starting the created process p
processes.append(p)  # adding the created process p to the list of processes
for rank in range(params.num_processes):  # launching the training processes, each of which updates the shared model
    p = mp.Process(target=train, args=(rank, params, shared_model, optimizer))
    p.start()
    processes.append(p)
for p in processes:  # joining each process, so the main script exits only after every worker has finished
    print('working')
    p.join()
Example #11
parser.add_argument('--test', action='store_true',
                    help='run in test mode')
parser.add_argument('--feature', type=int, default=96,
                    help='number of features')


if __name__ == '__main__':
    args = parser.parse_args()
    os.environ['OMP_NUM_THREADS'] = '1'
    torch.manual_seed(args.seed)

    num_inputs = args.feature
    num_actions = 9

    ac_net = ActorCritic(num_inputs, num_actions)
    opt_ac = my_optim.SharedAdam(ac_net.parameters(), lr=args.lr)

    if args.resume:
        print("=> loading checkpoint")
        checkpoint = torch.load('../models/kankan/best.t7')
        #args.start_epoch = checkpoint['epoch']
        #best_prec1 = checkpoint['best_prec1']
        ac_net.load_state_dict(checkpoint['state_dict'])
        #opt_ac.load_state_dict(checkpoint['optimizer'])
        print(ac_net)
        print("=> loaded checkpoint (epoch {})".format(checkpoint['epoch']))

    ac_net.share_memory()
    #opt_ac = my_optim.SharedAdam(ac_net.parameters(), lr=args.lr)
    opt_ac.share_memory()
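The resume branch expects a checkpoint dict with 'epoch', 'state_dict', and (commented out) 'optimizer' keys. The save side is not shown; a sketch consistent with those keys (the function name and path are assumptions):

import torch

def save_checkpoint(ac_net, opt_ac, epoch, path='../models/kankan/best.t7'):
    torch.save({
        'epoch': epoch,
        'state_dict': ac_net.state_dict(),
        'optimizer': opt_ac.state_dict(),
    }, path)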
Example #12
operative_temp = [all_parameter[8]] + all_parameter[10:21]
cost_flex = all_parameter[2:8] + [all_parameter[9]]
state_num = all_parameter[10:109] + all_parameter[0:6] + [
    all_parameter[7]
] + predictionFlat(params.file_path_prediction, (time_step_update) % 8760)

state = np.array(state_normalization(params.file_path_norm, state_num))
state = torch.from_numpy(state).float()

cx = torch.zeros(1, params.hidden_layer)  # the cell state of the LSTM is reinitialized to zero
hx = torch.zeros(1, params.hidden_layer)  # the hidden state of the LSTM is reinitialized to zero

model = ActorCritic(178, params.output_space)
optimizer = my_optim.SharedAdam(model.parameters(), lr=params.lr)

value, action_values, (hx, cx) = model(
    (state.unsqueeze(0), (hx, cx))
)  # getting from the model the output V(S) of the critic, the output Q(S,A) of the actor, and the new hidden & cell states
prob = F.softmax(
    action_values, dim=1
)  # softmax distribution over the Q-values: prob(a) = exp(q(a)) / sum_b exp(q(b))
log_prob = F.log_softmax(
    action_values, dim=1
)  # log of that same distribution, computed in a numerically stable way: log_prob(a) = log(prob(a))
entropy = -(log_prob * prob).sum(1)  # H(p) = - sum_x p(x).log(p(x))
action = prob.multinomial(
    1
).data  # selecting an action by taking a random draw from the prob distribution
log_prob = log_prob.gather(1, action)  # keeping the log probability of the selected action
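A tiny runnable check of the softmax/entropy/sampling chain above, using made-up logits:

import torch
import torch.nn.functional as F

logits = torch.tensor([[2.0, 1.0, 0.1]])
prob = F.softmax(logits, dim=1)          # tensor([[0.6590, 0.2424, 0.0986]])
log_prob = F.log_softmax(logits, dim=1)  # numerically safer than prob.log()
entropy = -(log_prob * prob).sum(1)      # about 0.85 nats for this distribution
action = prob.multinomial(1)             # one sampled action index per row
log_prob_a = log_prob.gather(1, action)  # log probability of that action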
Example #13
        self.env_name = 'Pendulum-v0'


if __name__ == '__main__':
    os.environ['OMP_NUM_THREADS'] = '1'
    params = Params()
    torch.manual_seed(params.seed)
    env = gym.make(params.env_name)
    num_inputs = env.observation_space.shape[0]
    num_outputs = env.action_space.shape[0]

    shared_p = Policy(num_inputs, num_outputs)
    shared_v = Value(num_inputs)
    shared_p.share_memory()
    shared_v.share_memory()
    optimizer_p = my_optim.SharedAdam(shared_p.parameters(), lr=params.lr)
    optimizer_v = my_optim.SharedAdam(shared_v.parameters(), lr=params.lr)

    processes = []
    p = mp.Process(target=test, args=(params.num_processes, params, shared_p))
    p.start()
    processes.append(p)
    for rank in range(0, params.num_processes):
        p = mp.Process(target=train,
                       args=(rank, params, shared_p, shared_v, optimizer_p,
                             optimizer_v))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
Example #14
    cx = torch.zeros(1, 256)
    hx = torch.zeros(1, 256)
    state = env.reset()
    state = torch.from_numpy(state)

    # <---ICM---
    shared_curiosity = IntrinsicCuriosityModule2(args.num_stack,
                                                 env.action_space,
                                                 args.epsilon)
    shared_curiosity.share_memory()
    # ---ICM--->

    if args.no_shared:
        optimizer = None
    else:
        optimizer = my_optim.SharedAdam(shared_curiosity.parameters(),
                                        lr=args.lr)
        optimizer.share_memory()

    if args.curiosity_file is not None:
        logging.info("Load curiosity")
        shared_curiosity.load_state_dict(torch.load(args.curiosity_file),
                                         strict=False)

    if args.optimizer_file is not None:
        logging.info("Load optimizer")
        optimizer.load_state_dict(torch.load(args.optimizer_file))

    if args.new_curiosity:
        logging.info("Bayesian curiosity")

    processes = []
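strict=False in the curiosity load above makes load_state_dict tolerate missing or unexpected keys instead of raising, and it returns a report of the mismatches. A self-contained illustration with a hypothetical partial checkpoint:

import torch
import torch.nn as nn

net = nn.Linear(4, 2)
partial_ckpt = {'weight': torch.zeros(2, 4)}  # hypothetical checkpoint with no 'bias'
result = net.load_state_dict(partial_ckpt, strict=False)
print(result.missing_keys)  # ['bias']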