Example no. 1
def main():
    env = gym.make('LunarLanderContinuous-v2')
    log_dir = 'log/lander'

    # env = gym.make('Pendulum-v0')
    # log_dir = 'log/pendulum'

    # paper settings
    # agent = DDPG(env, sigma=0.2, num_episodes=1000, buffer_size=1000000, batch_size=64,
    #              tau=1e-3, batch_norm=True, merge_layer=2)

    # did not work unless I merged the action into the critic at the first layer
    # worked better without batch norm

    k = 4000
    """
    agent = DDPG(env, log_dir, sigma=0.2, num_episodes=k, buffer_size=1000000, batch_size=64,
                 tau=1e-3, batch_norm=False, merge_layer=0)
    print('training start')
    agent.train()
    """
    agent = DDPG(env,
                 log_dir,
                 sigma=0.2,
                 num_episodes=k,
                 buffer_size=1000000,
                 batch_size=64,
                 tau=1e-2,
                 batch_norm=False,
                 merge_layer=0)
    print('training1 start')
    agent.train1()
    """
Example no. 2
def main(args):
    env = gym.make(args['env_name'])

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    action_dim = env.action_space.shape[0]
    max_action = env.action_space.high[0]
    state_dim = env.observation_space.shape[0]

    ddpg = DDPG(args, action_dim, max_action, state_dim, device)
    trained_actor = torch.load(args['model_directory'])
    ddpg.actor.load_state_dict(trained_actor)

    timestep = 0
    for episode in range(args['max_episode']):
        episode_reward = 0
        state = env.reset()

        while True:
            action = ddpg.get_action(state)
            next_state, reward, done, info = env.step(action)
            env.render()
            episode_reward += reward
            state = next_state
            timestep += 1

            if done:
                print('episode: ', episode,
                      '   reward : %.3f' % (episode_reward), '    timestep :',
                      timestep)
                break
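The script loads a checkpoint that contains only the actor's state_dict. A minimal sketch of how such a checkpoint could be produced on the training side; the path and the ddpg.actor attribute are assumptions:

import torch

# Hypothetical save step at the end of training (path is an assumption).
torch.save(ddpg.actor.state_dict(), 'checkpoints/ddpg_actor.pth')

# Evaluation side, as in the example above:
# trained_actor = torch.load('checkpoints/ddpg_actor.pth')
# ddpg.actor.load_state_dict(trained_actor)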
Example no. 3
def Test():
    print('test')
    cmd = 'sudo python ./utils/delete_episode_data.py'
    os.system(cmd)

    env = NetworkEnv()
    agent = DDPG(env.state_dim, env.rate_dim, env.path_dim, env.action_dim,
                 env.kPath)
    rewards = []

    workload = datamining
    state = env.reset(workload)
    agent.load()
    t1 = time.time()
    for i in range(test_max_episode):

        Perform = []
        ep_r = 0.0
        if i % 5 == 0:
            if env.CDF_file == datamining:
                env.set_CDF_file(websearch)
            else:
                env.set_CDF_file(datamining)

        for t in range(test_max_step):
            print('\n')
            print('Episode %d, step %d:' % (i, t))
            print('state', state)

            action = agent.select_action(state)
            print('action:', action)

            # execute action
            next_state, reward, perform = env.step(action)
            Perform.append(perform)

            ep_r += reward
            state = next_state
            print('reward', reward)
            print(
                "Ep_i {}, the ep_r is {:0.2f}, the step is {}, the reward is {}"
                .format(i, ep_r, t, reward))
            if t == test_max_step - 1:
                print("Ep_i {}, the ep_r is {:0.2f}".format(i, ep_r))
                df_perform = pd.DataFrame(Perform)
                df_perform.to_csv("./data/test_perform.csv",
                                  mode='a',
                                  header=False,
                                  index=False)
                break

        rewards.append(ep_r)

    file_time = open('./data/test_time.txt', mode='w')
    df = pd.DataFrame([rewards])
    df.to_csv("./data/test_rewards.csv", mode='w', header=False, index=False)
    print('total running time of', test_max_episode, 'episodes:', time.time() - t1)
    file_time.write(str(time.time() - t1))
    file_time.close()
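Since the test run appends per-step performance rows to ./data/test_perform.csv and writes the episode rewards to ./data/test_rewards.csv, a quick sketch for summarizing those files afterwards (the headerless column layout is taken from the writes above; everything else is an assumption):

import pandas as pd

# Rewards were written as a single headerless row, so transpose after reading.
rewards = pd.read_csv('./data/test_rewards.csv', header=None).T
print('episodes:', len(rewards))
print('mean reward: %.3f' % rewards[0].mean())

# Per-step performance rows were appended without headers.
perform = pd.read_csv('./data/test_perform.csv', header=None)
print(perform.describe())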
Example no. 4
def main():
    ddpg = DDPG(0, 0, torch.cuda.is_available())
    env.init_state()

    if os.path.exists('models/ddpg_actor_'):
        ddpg.load_model()
    else:
        print("Please make sure the model files exist!")

    while True:
        action = ddpg.select_action(env.state)
        env.step(action)
        print(env.last_score)
Example no. 5
def main():

    # Initialize the ANNs
    agent = DDPG()

    rospy.init_node("neuro_deep_planner", anonymous=False)

    ros_handler = ROSHandler()
    ros_handler.on_policy = False

    while not rospy.is_shutdown():

        # If we have a new msg, we may need to execute an action and store the new experience in the buffer
        if ros_handler.new_msg():
            if not ros_handler.is_episode_finished:
                # Send back the action to execute
                ros_handler.publish_action(agent.get_action(ros_handler.state))

            # Save the past state and action plus the reward and new state into the replay buffer
            agent.set_experience(ros_handler.state, ros_handler.reward, ros_handler.is_episode_finished)

        elif ros_handler.new_setting():

            agent.noise_flag = ros_handler.noise_flag

        else:
            # Train the network!
            agent.train()
Example no. 6
def main():
    experiment = 'model-builder-v0'  #specify environments here
    env = gym.make(experiment)
    #steps= env.spec.timestep_limit #steps per episode
    steps = 20
    assert isinstance(env.observation_space,
                      Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"

    # Randomly initialize critic, actor, target critic, target actor networks and replay buffer
    agent = DDPG(env, is_batch_norm)
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter = 0
    reward_per_episode = 0
    total_reward = 0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    print("Number of States:", num_states)
    print("Number of Actions:", num_actions)
    print("Number of Steps per episode:", steps)
    #saving reward:
    reward_st = np.array([0])

    for i in range(episodes):
        print("==== Starting episode no:", i, "====", "\n")
        observation = env.reset()
        reward_per_episode = 0
        for t in range(steps):
            # rendering environment (optional)
            env.render()
            x = observation
            action = agent.evaluate_actor(np.reshape(x, [1, 300, 300, 2]))
            noise = exploration_noise.noise()
            action = action[
                0] + noise  #Select action according to current policy and exploration noise
            print("Action at step", t, " :", action, "\n")

            observation, reward, done, info = env.step(action)

            #add s_t,s_t+1,action,reward to experience memory
            agent.add_experience(x, observation, action, reward, done)
            #train critic and actor network
            if counter > 64:
                agent.train()
            reward_per_episode += reward
            counter += 1
            #check if episode ends:
            if (done or (t == steps - 1)):
                print('EPISODE: ', i, ' Steps: ', t, ' Total Reward: ',
                      reward_per_episode)
                print("Printing reward to file")
                exploration_noise.reset(
                )  #reinitializing random noise for action exploration
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                print('\n\n')
                break
        total_reward += reward_per_episode
    print("Average reward per episode {}".format(total_reward / episodes))
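The OUNoise helper used here (and again in the next example) only needs noise() and reset(). A minimal Ornstein-Uhlenbeck sketch with that interface, assuming the common DDPG defaults mu=0, theta=0.15, sigma=0.2; this is not necessarily the implementation used above:

import numpy as np

class OUNoise:
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(action_dim)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # restart the process from its mean at the beginning of each episode
        self.state = self.mu.copy()

    def noise(self):
        # x_{t+1} = x_t + theta * (mu - x_t) + sigma * N(0, 1)
        dx = self.theta * (self.mu - self.state) + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state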
Example no. 7
def main():
    experiment= 'InvertedPendulum-v1' #specify environments here
    env= gym.make(experiment)
    steps= env.spec.timestep_limit #steps per episode    
    assert isinstance(env.observation_space, Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"
    
    # Randomly initialize critic, actor, target critic, target actor networks and replay buffer
    agent = DDPG(env, is_batch_norm)
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter=0
    reward_per_episode = 0    
    total_reward=0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]    
    print("Number of States:", num_states)
    print("Number of Actions:", num_actions)
    print("Number of Steps per episode:", steps)
    #saving reward:
    reward_st = np.array([0])
      
    
    for i in range(episodes):
        print("==== Starting episode no:", i, "====", "\n")
        observation = env.reset()
        reward_per_episode = 0
        for t in range(steps):
            # rendering environment (optional)
            env.render()
            x = observation
            action = agent.evaluate_actor(np.reshape(x,[1,num_states]))
            noise = exploration_noise.noise()
            action = action[0] + noise #Select action according to current policy and exploration noise
            print("Action at step", t, " :", action, "\n")
            
            observation,reward,done,info=env.step(action)
            
            #add s_t,s_t+1,action,reward to experience memory
            agent.add_experience(x,observation,action,reward,done)
            #train critic and actor network
            if counter > 64: 
                agent.train()
            reward_per_episode+=reward
            counter+=1
            #check if episode ends:
            if (done or (t == steps-1)):
                print('EPISODE: ', i, ' Steps: ', t, ' Total Reward: ', reward_per_episode)
                print("Printing reward to file")
                exploration_noise.reset()  # reinitializing random noise for action exploration
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                print('\n\n')
                break
        total_reward += reward_per_episode
    print("Average reward per episode {}".format(total_reward / episodes))
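Both of the examples above push transitions with agent.add_experience(x, observation, action, reward, done) and only start training once counter > 64, i.e. after at least one batch has been stored. A minimal sketch of a replay buffer behind that call; the capacity and batch size are assumptions:

import random
from collections import deque

class ReplayBuffer:
    def __init__(self, capacity=1000000):
        self.buffer = deque(maxlen=capacity)

    def add_experience(self, state, next_state, action, reward, done):
        # same argument order as the calls above: s_t, s_t+1, a_t, r_t, done
        self.buffer.append((state, next_state, action, reward, done))

    def sample(self, batch_size=64):
        # uniform random minibatch for the critic/actor updates
        return random.sample(self.buffer, batch_size)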
Example no. 8
from strategies import OUStrategy
from utils import SEED
import mxnet as mx

# set environment, policy, qfunc, strategy

env = normalize(CartpoleEnv())

policy = DeterministicMLPPolicy(env.spec)
qfunc = ContinuousMLPQ(env.spec)
strategy = OUStrategy(env.spec)

# set the training algorithm and train

algo = DDPG(
    env=env,
    policy=policy,
    qfunc=qfunc,
    strategy=strategy,
    ctx=mx.gpu(0),
    max_path_length=100,
    epoch_length=1000,
    memory_start_size=10000,
    n_epochs=1000,
    discount=0.99,
    qfunc_lr=1e-3,
    policy_lr=1e-4,
    seed=SEED)

algo.train()
Example no. 9
def train(env, nb_epochs, nb_epoch_cycles, normalize_observations, actor_lr, critic_lr, action_noise,
          gamma, nb_train_steps, nb_rollout_steps, batch_size, memory, tau=0.01):

    max_action = env.action_space.high
    agent = DDPG(memory, env.observation_space.shape[0], env.action_space.shape[0],
                 gamma=gamma, tau=tau,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size, action_noise=action_noise,
                 actor_lr=actor_lr, critic_lr=critic_lr,
                 )
    if USE_CUDA:
        agent.cuda()
    # Set up logging stuff only for a single worker.
    step = 0
    episode = 0
    episode_rewards_history = deque(maxlen=100)
    # Prepare everything.

    agent.reset()
    obs = env.reset()
    done = False
    episode_reward = 0.
    episode_step = 0
    episodes = 0
    t = 0

    epoch = 0
    start_time = time.time()

    epoch_episode_rewards = []
    epoch_episode_steps = []
    epoch_start_time = time.time()
    epoch_actions = []
    epoch_qs = []
    epoch_episodes = 0
    for epoch in range(nb_epochs):
        for cycle in range(nb_epoch_cycles):
            # Perform rollouts.
            for t_rollout in range(nb_rollout_steps):
                # Predict next action.
                action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                assert action.shape == env.action_space.shape

                # Execute next action.
                assert max_action.shape == action.shape
                new_obs, r, done, info = env.step(max_action * action)
                t += 1
                episode_reward += r
                episode_step += 1

                # Book-keeping.
                epoch_actions.append(action)
                epoch_qs.append(q)
                agent.store_transition(obs, action, r, new_obs, done)
                obs = new_obs

                if done:
                    # Episode done.
                    epoch_episode_rewards.append(episode_reward)
                    episode_rewards_history.append(episode_reward)
                    epoch_episode_steps.append(episode_step)
                    episode_reward = 0.
                    episode_step = 0
                    epoch_episodes += 1
                    episodes += 1

                    agent.reset()
                    obs = env.reset()

            # Train.
            epoch_actor_losses = []
            epoch_critic_losses = []
            for t_train in range(nb_train_steps):
                cl, al = agent.train()
                epoch_critic_losses.append(cl)
                epoch_actor_losses.append(al)
                agent.update_target_net()

        # Log stats.
        # XXX shouldn't call np.mean on variable length lists
        duration = time.time() - start_time
        combined_stats = dict()
        combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
        combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
        combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
        combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
        combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
        combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
        combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
        combined_stats['total/duration'] = duration
        combined_stats['total/steps_per_second'] = float(t) / float(duration)
        combined_stats['total/episodes'] = episodes
        combined_stats['rollout/episodes'] = epoch_episodes
        combined_stats['rollout/actions_std'] = np.std(epoch_actions)

        # Total statistics.
        combined_stats['total/epochs'] = epoch + 1
        combined_stats['total/steps'] = t

        for key in sorted(combined_stats.keys()):
            logger.record_tabular(key, combined_stats[key])
        logger.dump_tabular()
        logger.info('')
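agent.update_target_net() together with tau=0.01 is DDPG's soft (Polyak) target update. A minimal PyTorch sketch of that step; the function name and the direct use of parameters() are assumptions, not the agent's actual API:

import torch

def soft_update(target_net, source_net, tau):
    # theta_target <- tau * theta_source + (1 - tau) * theta_target
    with torch.no_grad():
        for t_param, s_param in zip(target_net.parameters(), source_net.parameters()):
            t_param.mul_(1.0 - tau).add_(tau * s_param)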
Example no. 10
def main():

    # Initialize the ANNs
    agent = DDPG()

    rospy.init_node("neuro_deep_planner", anonymous=False)

    ros_handler = ROSHandler()
    ros_handler.on_policy = False

    # For plotting
    currently_plotting = False
    goal_count = 0
    crash_count = 0
    start_time = 0

    # Make sure the directory for the plotting exists
    if not tf.gfile.Exists(PLOT_PATH):
        tf.gfile.MakeDirs(PLOT_PATH)
    f = open(PLOT_PATH + '/results', 'w')

    while not rospy.is_shutdown():

        # If we are plotting results we don't want to train and need to turn off noise!
        if PLOTTING and not currently_plotting and agent.training_step > 0 and \
                                agent.training_step % PLOT_INTERVALL == 0:
            currently_plotting = True
            agent.noise_flag = False
            start_time = rospy.get_time()

        if currently_plotting and rospy.get_time() - start_time > PLOT_TIME:
            # Plot the results
            string = str(agent.training_step) + ', ' + str(goal_count) + ', ' + str(crash_count) + '\n'
            f.write(string)

            # Reset all parameters
            currently_plotting = False
            agent.noise_flag = True
            goal_count = 0
            crash_count = 0

        # If we are plotting results we need to count reached goals and crashes
        if currently_plotting:
            # Count the positive and negative rewards
            if ros_handler.new_msg():
                if not ros_handler.is_episode_finished:
                    # Send back the action to execute
                    ros_handler.publish_action(agent.get_action(ros_handler.state))
                elif ros_handler.reward == 1:
                    goal_count += 1
                elif ros_handler.reward == -1:
                    crash_count += 1

        # If we're not plotting results
        else:
            # If we have a new msg, we may need to execute an action and store the new experience in the buffer
            if ros_handler.new_msg():
                if not ros_handler.is_episode_finished:
                    # Send back the action to execute
                    ros_handler.publish_action(agent.get_action(ros_handler.state))

                # Save the past state and action plus the reward and new state into the replay buffer
                agent.set_experience(ros_handler.state, ros_handler.reward, ros_handler.is_episode_finished)

            elif ros_handler.new_setting():

                agent.noise_flag = ros_handler.noise_flag

            else:
                # Train the network!
                agent.train()
Example no. 11
def main():
    experiment= 'InvertedPendulum-v1'
    env= gym.make(experiment)
    assert isinstance(env.observation_space, Box), "observation space must be continuous"
    assert isinstance(env.action_space, Box), "action space must be continuous"
    # Randomly initialize critic, actor, target critic, target actor networks and replay buffer
    agent = DDPG(env)
    exploration_noise = OUNoise(env.action_space.shape[0])
    counter=0
    total_reward=0
    num_states = env.observation_space.shape[0]
    num_actions = env.action_space.shape[0]
    
    #saving reward:
    reward_st = np.array([0])
    
    
    
    for i in range(episodes):
        observation = env.reset()
    
        reward_per_episode = 0
        for t in range(steps):
            # rendering environment (optional)
            #env.render()
            
            x = observation
            #select action using actor network model
            action = agent.evaluate_actor(np.reshape(x,[num_actions,num_states]))
            
            noise = exploration_noise.noise()

            action = action[0] + noise

            print('Agent.Action :', action)
            print('\n')
            print('\n')

            observation, reward, done, info = env.step(action)
            # add s_t, s_t+1, action, reward to experience memory
            agent.add_experience(x, observation, action, reward, done)
            #train critic and actor network
            if counter > 64: 
                agent.train()            
            
            reward_per_episode+=reward
            
            counter+=1
            #check if episode ends:
            if done:
                print('EPISODE: ', i, ' Steps: ', t, ' Total Reward: ', reward_per_episode)
                exploration_noise.reset()
                reward_st = np.append(reward_st, reward_per_episode)
                np.savetxt('episode_reward.txt', reward_st, newline="\n")
                print('\n')
                print('\n')
                break
        total_reward += reward_per_episode
    print("Average reward per episode {}".format(total_reward / episodes))
Example no. 12
parser.add_argument('--replay_size', type=int, default=1000000, metavar='N',
        help='size of replay buffer (default: 1000000)')
parser.add_argument('--render', action='store_true',
        help='render the environment')
parser.add_argument('--threshold', type=float, default=0.5, metavar='G',
        help='threshold for PMV')
args = parser.parse_args()

num_inputs  = 4
num_outputs = 2

torch.manual_seed(args.seed)
np.random.seed(args.seed)

action_space = np.array([0.0, 0.0])
agent = DDPG(args.gamma, args.tau, args.hidden_size, num_inputs, action_space)

memory = ReplayMemory(args.replay_size)
ounoise = OUNoise(num_outputs)

db = db_opt.DB()
if db.is_open():
    print("the connection is open")
else:
    print("the connection is closed")
    sys.exit()

step = 1
rewards = []
for i_episode in range(args.num_episodes):
    ounoise.scale = (args.noise_scale - args.final_noise_scale) * max(0, args.exploration_end -
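The last line above is cut off in the source. In scripts of this style the OU noise scale is usually annealed linearly from noise_scale down to final_noise_scale over the first exploration_end episodes; a hedged sketch of that schedule (an illustrative reconstruction, not the original expression):

def annealed_noise_scale(i_episode, noise_scale, final_noise_scale, exploration_end):
    # linear decay until exploration_end, then hold at final_noise_scale
    frac = max(0, exploration_end - i_episode) / exploration_end
    return (noise_scale - final_noise_scale) * frac + final_noise_scale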