Example #1
    print("Number of episodes : " + str(args.num_episodes))
    for i_episode in range(args.num_episodes):
        '''
        Here, num_episodes corresponds to the generations in Algo 1.
        In every generation, the population is evaluated, ranked, mutated, and re-inserted into the population.
        '''
        evo.evaluate_pop()
        evo.rank_pop_selection_mutation()

        print("Evolutionary Fitness = " + str(evo.best_policy.fitness))
        '''
        #############
        The DDPG part
        #############
        '''
        state = torch.Tensor([env.reset()])  # algo line 6
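        # Anneal the OU exploration noise linearly from noise_scale down to final_noise_scale over the first exploration_end episodes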
        ounoise.scale = ((args.noise_scale - args.final_noise_scale)
                         * max(0, args.exploration_end - i_episode) / args.exploration_end
                         + args.final_noise_scale)
        ounoise.reset()
        episode_reward = 0

        for t in range(args.num_steps):  # line 7
            # forward pass through the actor network
            action = agent.select_action(state, ounoise)  # line 8
            next_state, reward, done, _ = env.step(action.numpy()[0])  # line 9
            episode_reward += reward

            action = torch.Tensor(action)
            mask = torch.Tensor([not done])
            next_state = torch.Tensor([next_state])
Example #2
        agent.actor_target = nn.DataParallel(agent.actor_target)
        agent.actor_perturbed = nn.DataParallel(agent.actor_perturbed)
        agent.critic = nn.DataParallel(agent.critic)
        agent.critic_target = nn.DataParallel(agent.critic_target)
    agent.actor.to(device)
    agent.actor_target.to(device)
    agent.actor_perturbed.to(device)
    agent.critic.to(device)
    agent.critic_target.to(device)

end_str = "_{}_{}".format(args.env_name, args.model_suffix)
agent.load_model("models/ddpg_actor" + end_str, "models/ddpg_critic" + end_str)

while True:
    episode_reward = 0
    state = torch.Tensor([env.reset()]).to(device)
    env.render()
    while True:
        action = agent.select_action(state, None, None)
        next_state, reward, done, _ = env.step(action.cpu().numpy()[0])
        env.render()
        episode_reward += reward

        #action = torch.Tensor(action).to(device)
        mask = torch.Tensor([not done]).to(device)
        next_state = torch.Tensor([next_state]).to(device)
        reward = torch.Tensor([reward]).to(device)

        state = next_state
        print("Reward: {}; Episode reward: {}".format(reward, episode_reward))
Example #3
offset_time = args.offset_time # 0
start_time = time.time()
Qevaluations = []
goal_path = cur_path + '/' + str(args.flow_family) + '/' + str(args.env_name) + '/' + 'seed_' + str(args.seed)
os.makedirs(goal_path)
print(start_time, args.env_name)
if 'dataframe' in args:
    df = args.dataframe
else:
    df = pd.DataFrame(columns=["total_steps", "score_eval", "time_so_far"])

for i_episode in itertools.count(start_episode):
    episode_reward = 0
    episode_steps = 0
    done = False
    state = env.reset()

    while not done:
        if args.start_steps > total_numsteps:
            action = np.random.uniform(env.action_space.low, env.action_space.high,
                                       env.action_space.shape[0])  # Sample random action
        else:
            action = agent.select_action(state)  # Sample action from policy
        if len(memory) > args.start_steps:
            # Number of updates per step in environment
            for i in range(args.updates_per_step):
                # Update parameters of all the networks
                (critic_1_loss, critic_2_loss, policy_loss,
                 _, _, policy_info) = agent.update_parameters(memory, args.batch_size, updates)
Example #4
                 env.observation_space.shape[0], env.action_space)

memory = ReplayMemory(args.replay_size)

ounoise = OUNoise(env.action_space.shape[0]) if args.ou_noise else None
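# Optional adaptive parameter-space noise: perturbs the actor's weights instead of its actions for exploration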
param_noise = AdaptiveParamNoiseSpec(
    initial_stddev=0.05,
    desired_action_stddev=args.noise_scale,
    adaptation_coefficient=1.05) if args.param_noise else None

rewards = []
total_numsteps = 0
updates = 0

for i_episode in range(args.num_episodes):
    state = torch.Tensor([env.reset()])

    if args.ou_noise:
        ounoise.scale = ((args.noise_scale - args.final_noise_scale)
                         * max(0, args.exploration_end - i_episode) / args.exploration_end
                         + args.final_noise_scale)
        ounoise.reset()

    if args.param_noise and args.algo == "DDPG":
        agent.perturb_actor_parameters(param_noise)

    episode_reward = 0
    while True:
        action = agent.select_action(state, ounoise, param_noise)
        next_state, reward, done, _ = env.step(action.numpy()[0])
        total_numsteps += 1
Example #5
# Agent
agent = SAC(env.observation_space.shape[0], env.action_space, args)

writer = SummaryWriter()

# Memory
memory = ReplayMemory(args.replay_size)

# Training Loop
rewards = []
total_numsteps = 0
updates = 0

for i_episode in itertools.count():
    state = env.reset()

    episode_reward = 0
    while True:
        action = agent.select_action(state)  # Sample action from policy
        next_state, reward, done, _ = env.step(action)  # Step
        mask = float(not done)  # 1.0 while the episode continues, 0.0 at termination
        memory.push(state, action, reward, next_state, mask)  # Append transition to memory
        if len(memory) > args.batch_size:
            for i in range(args.updates_per_step): # Number of updates per step in environment
                # Sample a batch from memory
                state_batch, action_batch, reward_batch, next_state_batch, mask_batch = memory.sample(args.batch_size)
                # Update parameters of all the networks
                value_loss, critic_1_loss, critic_2_loss, policy_loss, ent_loss, alpha = agent.update_parameters(
                    state_batch, action_batch, reward_batch, next_state_batch, mask_batch, updates)
Example #6
        a_noise.reset()
    if p_noise is not None:
        a.perturb_actor_parameters(p_noise)


total_steps = 0
print(base_dir)

if args.num_steps is not None:
    assert args.num_epochs is None
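    # Derive the epoch count from the total step budget (steps per epoch = num_epochs_cycles * num_rollout_steps)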
    nb_epochs = int(args.num_steps) // (args.num_epochs_cycles * args.num_rollout_steps)
else:
    nb_epochs = 500

state = agent.Tensor([env.reset()])

episode_reward = 0
agent.train()

reset_noise(agent, noise, param_noise)

if args.visualize:
    vis = visdom.Visdom(env=base_dir)
else:
    vis = None

train_steps = 0
episode_timesteps = 0
for epoch in trange(nb_epochs):
    for cycle in range(args.num_epochs_cycles):
Example #7
    env_name = args.env
    try:
        env = NormalizedActions(envs.env_list[env_name](render=args.render))
    except TypeError as err:
        print('no render argument, assuming env.render will just work')
        env = NormalizedActions(envs.env_list[env_name]())

    assert np.all(np.abs(env.action_space.low) <= 1.) and np.all(
        np.abs(env.action_space.high) <= 1.), 'Action space not normalized'

    if args.record:
        env = gym.wrappers.Monitor(env,
                                   './data/vid/mpc/{}-{}'.format(
                                       env_name, args.frame),
                                   force=True)
    env.reset()

    env.seed(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    action_dim = env.action_space.shape[0]
    state_dim = env.observation_space.shape[0]

    device = 'cpu'
    if torch.cuda.is_available():
        device = 'cuda:0'
Example #8
        writer_test = SummaryWriter(log_dir='runs/' + folder + 'run_' +
                                    str(i_run) + '/test')

        # Setup Replay Memory
        memory = ReplayMemory(args.replay_size)

        # TRAINING LOOP
        total_numsteps = updates = running_episode_reward = running_episode_reward_100 = 0
        rewards = []

        for i_episode in itertools.count(1):
            print(updates)
            ts = time.time()
            episode_reward = episode_steps = 0
            done = False
            state = env.reset()
            if cnn:
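                # StateBuffer presumably stacks the most recent state_buffer_size observations into one state for image inputs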
                state_buffer = StateBuffer(args.state_buffer_size, state)
                state = state_buffer.get_state()

            critic_1_loss_acc = critic_2_loss_acc = policy_loss_acc = ent_loss_acc = alpha_acc = 0

            while not done:
                # if cnn:
                #     writer_train.add_images('episode_{}'.format(str(i_episode)), state_buffer.get_tensor(), episode_steps)
                if i_episode < args.warm_up_episode:
                    action = env.action_space.sample()  # Sample random action
                else:
                    action = agent.select_action(state)  # Sample action from policy
Example #9
if args.layers == 1:
    policy = SingleLayerPolicy(args.hidden_size, env.observation_space.shape[0], env.action_space)
elif args.layers == 2:
    policy = TwoLayerPolicy(args.hidden_size, env.observation_space.shape[0], env.action_space)
agent = LPO(args.hidden_size, env.observation_space.shape[0], env.action_space, args.constraint_size, policy)

dir = 'ckpt_' + env_name
if not os.path.exists(dir):    
    os.mkdir(dir)

# TODO: change this.
# Sample 1 trajectory of n steps (n = args.num_steps).
# For each of the n steps, sample k trajectories (k: arg).
# Create constraints from these n*k sampled trajectories.
for i_episode in range(args.num_rollouts):
    start_state = env.reset()
    state = torch.Tensor([start_state])
    entropies = []
    log_probs = []
    
    states = [start_state]
    states_bytes = [start_state.tobytes()]
    actions = []
    rewards = []

    for t in range(args.num_steps):
        action, log_prob, entropy = agent.select_action(state)
        actions.append(action)
        action = action.cpu()

        next_state, reward, done, _ = env.step(action.numpy()[0])
Example #10
    ounoise = OUNoise(env.action_space.shape[0]) if args.ou_noise else None
    param_noise = AdaptiveParamNoiseSpec(
        initial_stddev=0.05,
        desired_action_stddev=args.noise_scale,
        adaptation_coefficient=1.05) if args.param_noise else None
else:
    ounoise = None
    param_noise = None

rewards = []
total_numsteps = 0
updates = 0
device = torch.device('cuda')
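# Split the total frame budget evenly across the parallel worker processes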
num_steps = args.num_frames // args.num_processes

state = torch.Tensor(env.reset())

episode_rewards = torch.zeros(args.num_processes, 1).to(device)
final_rewards = torch.zeros(args.num_processes, 1).to(device)
start = time.time()
for step in range(int(num_steps)):
    '''
    if args.ou_noise and ounoise is not None: 
        ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(0, args.exploration_end -
                                                                      i_episode) / args.exploration_end + args.final_noise_scale
        ounoise.reset()

    if args.param_noise and args.algo == "DDPG" and param_noise is not None:
        agent.perturb_actor_parameters(param_noise)
    '''
    episode_reward = 0
Example #11
                               '/tmp/{}-experiment'.format(env_name),
                               force=True)

    env.seed(args.seed)
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)

    agent = REINFORCE(args.hidden_size, env.observation_space.shape[0],
                      env.action_space)

    dir = 'ckpt_' + env_name
    if not os.path.exists(dir):
        os.mkdir(dir)

    for i_episode in range(args.num_episodes):
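        # Add a batch dimension so the initial state matches the policy network's expected input shape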
        env_reset = np.expand_dims(env.reset(), 0)
        state = torch.Tensor(env_reset)
        # print(state)
        entropies = []
        log_probs = []
        rewards = []
        for t in range(args.num_steps):

            action, log_prob, entropy = agent.select_action(state)
            action = action.cpu()

            next_state, reward, done, _ = env.step(action.numpy()[0])

            if args.render:
                env.render()