Example #1
    def __init__(self, env_name, state_dim, action_dim):
        self.name = 'DriverAgent'  # name for uploading results
        self.env_name = env_name
        # Randomly initialize actor network and critic network
        # with both their target networks
        self.state_dim = state_dim
        self.action_dim = action_dim

        # Tensorflow Session
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)

        # Actor & Critic Network
        self.actor = ActorNetwork(self.sess, state_dim, action_dim, BATCH_SIZE,
                                  TAU, LRA)
        self.critic = CriticNetwork(self.sess, state_dim, action_dim,
                                    BATCH_SIZE, TAU, LRA)

        # Replay Memory
        self.memory = ReplayMemory(MEMORY_SIZE)

        # Loss value
        self.loss = 0

        # Load saved network weights; modify as needed
        self.saver = tf.train.Saver()
        if not os.path.exists(ckp_dir):
            print("Could not find old network weights")
        else:
            self.saver.restore(self.sess, os.path.join(ckp_dir, ckp_name))
            print("Successfully loaded:", ckp_name)
Example #2
File: Agent.py Project: ymkim1019/aibirds
    def __init__(self, trainable=1, load_model=1):
        super(Agent, self).__init__('Agent')

        np.random.seed(1337)

        self.step = 0
        self.state_cache = dict()
        self.action_cache = dict()

        # Tensorflow GPU optimization
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        self.trainable = trainable
        K.set_session(self.sess)

        self.actor = ActorNetwork(self.sess, globalConfig.TAU, globalConfig.LRA)
        self.critic = CriticNetwork(self.sess, globalConfig.TAU, globalConfig.LRC)

        self.buff = ReplayBuffer(globalConfig.BUFFER_SIZE)  # Create replay buffer
        self.cnt = 0

        if load_model == 1:
            # Now load the weight
            print("Now we load the weight")
            try:
                self.actor.model.load_weights("actormodel.h5")
                self.critic.model.load_weights("criticmodel.h5")
                self.actor.target_model.load_weights("actormodel.h5")
                self.critic.target_model.load_weights("criticmodel.h5")
                print("Weight load successfully")
            except:
                print("Cannot find the weight")

        self.graph = tf.get_default_graph()
Example #3
File: DDPG.py Project: Cobaramin/Dota2Bot
    def __init__(self):

        # Variable Definition
        self.ep = 0
        self.replace_freq = cf.REPLACE_FREQ
        self.save_freq = cf.SAVE_FREQ
        self.WEIGHT_PATH = cf.WEIGHT_PATH

        # Tensorflow GPU optimization
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True

        # Session Setup & graph
        self.sess = tf.Session(config=config)
        from keras import backend as K
        K.set_session(self.sess)
        self.tf_graph = tf.get_default_graph()

        # Network
        self.actor = ActorNetwork(self.sess, self.tf_graph, cf.STATE_DIM, cf.ACTION_DIM, cf.TAU, cf.LRA)
        self.critic = CriticNetwork(self.sess, self.tf_graph, cf.STATE_DIM, cf.ACTION_DIM, cf.TAU, cf.LRC)
        self.memory = ReplayBuffer(cf.BUFFER_SIZE)

        # write graph
        self.timestamp = int(time.time())
        self.sum_writer = tf.summary.FileWriter(cf.TMP_PATH + '/ddpg' + str(self.timestamp), self.tf_graph)
Example #4
    def __init__(self,
                 input_shape,
                 actions,
                 discount_factor,
                 replay_buffer,
                 minibatch_size,
                 logger,
                 name="ppo"):
        self.input_shape = input_shape
        self.action_space = actions
        self.discount_factor = discount_factor
        self.minibatch_size = minibatch_size

        self.actor = ActorNetwork(self.input_shape, self.action_space,
                                  self.discount_factor, self.minibatch_size)
        self.critic = CriticNetwork(self.input_shape, self.action_space,
                                    self.discount_factor, self.minibatch_size)

        self.states = []
        self.actions = []
        self.values = []
        self.masks = []
        self.rewards = []
        self.actions_probs = []
        self.actions_onehot = []

        super(PPOAgent, self).__init__(logger, replay_buffer, name=name)
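The PPO agent above only declares its rollout buffers (states, actions, values, masks, rewards, actions_probs, actions_onehot); how they are filled is not shown. A hedged sketch of per-step bookkeeping consistent with those names (store_step and its n_actions argument are hypothetical helpers, not part of the original project):

import numpy as np

def store_step(agent, state, action, value, reward, done, action_probs, n_actions):
    # Append one transition to the agent's rollout lists.
    agent.states.append(state)
    agent.actions.append(action)
    agent.values.append(value)
    agent.rewards.append(reward)
    agent.masks.append(0.0 if done else 1.0)  # mask is 0 at episode boundaries
    agent.actions_probs.append(action_probs)
    onehot = np.zeros(n_actions)              # one-hot encoding of the chosen action
    onehot[action] = 1.0
    agent.actions_onehot.append(onehot)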
Example #5
File: adp.py Project: IrohXu/Gomoku-XYH19
class Main():
    def __init__(self, board):
        self.TRAIN = False
        self.board = board
        self.ME = 1
        self.OPPONENT = 2
        self.action_network = ActionNetwork(objective=self.ME)
        self.critic_network = CriticNetwork(
            params=[len(board.features) * 5 + 2, 60, 1],
            pattern_finder=board.pattern_finder)  # neural network structure
        if os.path.exists(CRITIC_NETWORK_SAVEPATH):
            self.critic_network.layers = pickle.load(
                open(CRITIC_NETWORK_SAVEPATH, 'rb'))
            logDebug('Using existing model at ' + CRITIC_NETWORK_SAVEPATH)

        self.system_model = SystemModel(who=self.ME)

    def run_me(self):
        try:
            if board.whose_turn is None:
                board.whose_turn = self.ME

            actions, values = self.get_candidate_actions()
            action, value = self.action_network.forward(
                self.board, actions, values)
            board_now = deepcopy(self.board)
            board_next = self.system_model.forward(action)  # pp.do_mymove here
            if self.TRAIN:
                reward = 1.0 if check_win(
                    board_now, action[0], action[1], who=self.ME) else 0.0
                self.critic_network.back_propagation(board_now, board_next,
                                                     reward)
        except:
            logTraceBack()
            raise Exception('f**k')

    def run_opponent(self, x, y):
        try:
            if board.whose_turn is None:
                board.whose_turn = self.OPPONENT

            board_now = deepcopy(self.board)
            self.board[x][y] = 2
            board_next = self.board
            reward = 0.0
            self.critic_network.back_propagation(board_now, board_next, reward)
        except:
            logTraceBack()
            raise Exception('f**k')

    def get_candidate_actions(self):
        actions = Adjacent(self.board)  # returns the adjacent points
        values = []
        for action in actions:
            board_next = self.system_model.forward_if(self.board, action)
            values.append(self.critic_network.forward(board_next))
        return actions, values
Example #6
File: adp.py Project: IrohXu/Gomoku-XYH19
    def __init__(self, board):
        self.TRAIN = False
        self.board = board
        self.ME = 1
        self.OPPONENT = 2
        self.action_network = ActionNetwork(objective=self.ME)
        self.critic_network = CriticNetwork(
            params=[len(board.features) * 5 + 2, 60, 1],
            pattern_finder=board.pattern_finder)  # neural network structure
        if os.path.exists(CRITIC_NETWORK_SAVEPATH):
            self.critic_network.layers = pickle.load(
                open(CRITIC_NETWORK_SAVEPATH, 'rb'))
            logDebug('Using existing model at ' + CRITIC_NETWORK_SAVEPATH)

        self.system_model = SystemModel(who=self.ME)
Example #7
    def __init__(self, BUFFER_SIZE, BATCH_SIZE, GAMMA, TAU, LRA, LRC,
                 action_dim, state_dim, EXPLORE, epsilon, total_loss,
                 total_reward, train_indicator, s_t, a_t, r_t, s_t1, done,
                 speed_limit, sensor_dis):
        self.BUFFER_SIZE = BUFFER_SIZE
        self.BATCH_SIZE = BATCH_SIZE
        self.GAMMA = GAMMA
        self.TAU = TAU
        self.LRA = LRA
        self.LRC = LRC
        self.action_dim = action_dim
        self.state_dim = state_dim
        self.EXPLORE = EXPLORE
        self.epsilon = epsilon
        self.total_loss = total_loss
        self.total_reward = total_reward
        self.train_indicator = train_indicator
        self.s_t = s_t
        self.a_t = a_t
        self.r_t = r_t
        self.s_t1 = s_t1
        self.done = done
        self.speed_limit = speed_limit
        self.sensor_dis = sensor_dis
        # Tensorflow GPU optimization
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        sess = tf.Session(config=config)
        from keras import backend as K
        K.set_session(sess)
        self.actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU,
                                  LRA)
        self.critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE,
                                    TAU, LRC)
Example #8
    def __init__(self):
        # Note: the actor/critic containers are assumed to exist before the
        # appends below; initialize them here so the snippet is self-consistent.
        self.actor = []
        self.critic = []
        for i in range(args.num_actors):
            self.actor.append(
                ActorNetwork(self.sess, self.state_dim, self.action_dim, 0, 0,
                             0))
        for i in range(args.num_critics):
            self.critic.append(
                CriticNetwork(self.sess, self.state_dim, self.action_dim, -1,
                              0, 0))
Example #9
File: ddpg.py Project: harshsha5/DDPG-TD3
    def __init__(self, env, outfile_name, hindsight):
        """Initialize the DDPG object.

        Args:
            env: an instance of gym.Env on which we aim to learn a policy.
            outfile_name: (str) name of the output filename.
            hindsight: (bool) whether to use hindsight experience replay (HER).
        """
        action_dim = len(env.action_space.low)
        state_dim = len(env.observation_space.low)
        np.random.seed(1337)
        self.env = env

        self.sess = tf.compat.v1.Session()
        tf.keras.backend.set_session(self.sess)
        self.batch_size = BATCH_SIZE
        self.buffer = ReplayBuffer(BUFFER_SIZE)
        self.burn_in_memory_size = BURN_IN_MEMORY
        self.Critic = CriticNetwork(self.sess,
                                    state_dim,
                                    action_dim,
                                    self.batch_size,
                                    tau=TAU,
                                    learning_rate=LEARNING_RATE_CRITIC)
        self.noise_mu = NOISE_MU
        self.Noise_sigma = NOISE_SIGMA * (env.action_space.high[0] -
                                          env.action_space.low[0])
        self.Actor = ActorNetwork(sess=self.sess,
                                  state_size=state_dim,
                                  action_size=action_dim,
                                  batch_size=self.batch_size,
                                  tau=TAU,
                                  learning_rate=LEARNING_RATE_ACTOR)

        # Defining a custom name for the Tensorboard summary.
        timestr = time.strftime("%Y%m%d-%H%M%S")

        if hindsight:
            save_path = "runs/HER_DDPG_" + timestr + '/'
        else:
            save_path = "runs/DDPG_" + timestr + '/'

        self.writer = SummaryWriter(save_path)
        self.outfile = outfile_name
        self.action_range = 1
Example #10
    def __init__(self, state_processor):
        self.BUFFER_SIZE = 1000
        self.BATCH_SIZE = 32
        self.GAMMA = 0.99
        self.TAU = 0.001  # Target Network HyperParameters
        self.LRA = 0.0001  # Learning rate for Actor
        self.LRC = 0.001  # Learning rate for Critic

        self.action_dim = 21  # Target/Action
        self.state_dim = 131055  # columns in input state

        np.random.seed(1337)
        self.total_reward = 0.
        self.loss = 0

        self.EXPLORE = 100000.
        self.reward = 0
        self.done = False
        self.epsilon = 1
        self.indicator = 0

        self.train_indicator = 1  # 1 means Train, 0 means simply Run

        # Tensorflow GPU optimization
        self.config = tf.ConfigProto()
        self.config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=self.config)
        K.set_session(self.sess)

        self.actor = ActorNetwork(self.sess, self.state_dim, self.action_dim,
                                  self.BATCH_SIZE, self.TAU, self.LRA)
        self.critic = CriticNetwork(self.sess, self.state_dim, self.action_dim,
                                    self.BATCH_SIZE, self.TAU, self.LRC)
        self.buff = ReplayBuffer(self.BUFFER_SIZE)  # Create replay buffer
        self.combat_buff = ReplayBuffer(
            self.BUFFER_SIZE)  # Create combat replay buffer
        self.turn_buff = ReplayBuffer(
            self.BUFFER_SIZE)  # Create turn replay buffer
        global graph
        graph = tf.get_default_graph()

        self.state_processor = state_processor
Example #11
File: ddpg.py Project: RCX112/Pong-DeepRL
    def __init__(self, outputs, memorySize, discountFactor,
                 learningRate_Critic, learningRate_Actor, target_update_rate,
                 img_rows, img_cols, img_channels):
        """
        Parameters:
            - outputs: output size
            - memorySize: size of the memory that will store each state
            - discountFactor: the discount factor (gamma)
            - learningRate: learning rate
            - learnStart: steps to happen before for learning. Set to 128
        """
        self.action_size = outputs
        self.memory = memory.Memory(memorySize)
        self.discountFactor = discountFactor
        self.learningRateCritic = learningRate_Critic
        self.learningRateActor = learningRate_Actor
        self.img_rows = img_rows
        self.img_cols = img_cols
        self.img_channels = img_channels
        self.target_update_rate = target_update_rate

        self.img_shape = (self.img_channels, self.img_rows, self.img_cols)
        if K.image_dim_ordering() == 'tf':
            self.img_shape = (self.img_rows, self.img_cols, self.img_channels)

        with tf.device(TF_DEVICE):
            config = tf.ConfigProto()
            config.gpu_options.allow_growth = True
            self.sess = tf.Session(config=config)
            K.set_session(self.sess)

        print('tf config!')

        self.actor = ActorNetwork(self.sess, self.img_shape, self.action_size,
                                  self.target_update_rate,
                                  self.learningRateActor)
        print('actor')
        self.critic = CriticNetwork(self.sess, self.img_shape,
                                    self.action_size, self.target_update_rate,
                                    self.learningRateCritic)
        print('critic')
Example #12
    def __init__(self, env, gamma=0.98, start_epsilon=1, end_epsilon=0.01, decay=500, lr=1e-4, n_batch=32,
                 n_memory=50000, n_update_target=500, start_learning=1000, log_dir=None):
        self.env = env
        n_act = self.env.action_space.n
        n_obs = np.prod(self.env.observation_space.shape)

        self.critic = CriticNetwork(n_act=n_act, n_obs=n_obs)
        self.target_critic = CriticNetwork(n_act=n_act, n_obs=n_obs)
        self._update_target()

        self.memory = ReplayMemory(n_obs, n_memory)

        self.gamma = gamma
        self.s_epsilon = start_epsilon
        self.e_epsilon = end_epsilon
        self.decay = decay
        self.n_batch = n_batch
        self.n_update_target = n_update_target
        self.start_learning = start_learning

        self.criterion = nn.MSELoss()
        self.optimizer = optim.Adam(self.critic.parameters(), lr=lr)
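The DQN-style critic above calls self._update_target() to synchronize the target network, but the method itself is not shown. Assuming CriticNetwork is a torch.nn.Module, a minimal sketch of a hard target update:

def _update_target(self):
    # Hard update: copy the online critic's parameters into the target critic.
    self.target_critic.load_state_dict(self.critic.state_dict())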
Example #13
    def launch_train(self, train_indicator=1):  # 1 means Train, 0 means simply Run
        print('Launch Training Process')
        np.random.seed(1337)

        self.state_t = self.sim_inter.get_state()
        self.state_dim = self.sim_inter.state_dim
        self.actor = ActorNetwork(self.sess, self.state_dim, self.action_size, self.batch_size, self.tau, self.LRA)
        self.critic = CriticNetwork(self.sess, self.state_dim, self.action_size, self.batch_size, self.tau, self.LRC)
        self.buff = ReplayBuffer(self.buffer_size)
        self.load_weights()

        for e in range(self.episode_count):
            print("Episode : " + str(e) + " Replay Buffer " + str(self.buff.count()))

            for j in range(self.max_steps):
                self.loss = 0
                self.total_reward = 0
                self.action_t = self.action_noise(train_indicator)
                choose_action = np.argmax(self.action_t[0][0:4])
                collision, if_pass = self.update_action(choose_action, train_indicator, e)

                if self.if_done:
                    self.sim_inter = UpdateInter()
                    self.state_t = self.sim_inter.get_state()
                    self.if_done = False
                    break

            if train_indicator:
                self.update_weights()

            self.total_correct += int(collision <= 0 and if_pass)
            self.total_wrong += int(collision > 0)
            accuracy = 0
            if self.total_correct + self.total_wrong:
                accuracy = self.total_correct / (self.total_correct + self.total_wrong)

            if np.mod(e, 100) == 0:
                self.accuracy_all.append(accuracy)
                self.total_correct = 0
                self.total_wrong = 0

            print("TOTAL REWARD @ " + str(e) + "-th Episode  : Reward " + str(self.total_reward) +
                  " Collision " + str(collision > 0) + " Accuracy " + str(accuracy) +
                  " All Accuracy " + str(self.accuracy_all))
            print("")
        print("Finish.")
Example #14
def playGame(DDPG_config,
             train_indicator=1):  #1 means Train, 0 means simply Run
    # SETUP STARTS HERE
    if train_indicator > 0:
        folder = setup_run(DDPG_config)
    elif train_indicator == 0:
        folder = DDPG_config['EXPERIMENT']

    if DDPG_config['RSEED'] == 0:
        DDPG_config['RSEED'] = None
    np.random.seed(DDPG_config['RSEED'])

    ACTIVE_NODES = DDPG_config['ACTIVE_NODES']

    # Generate an environment
    if DDPG_config['ENV'] == 'balancing':
        env = OmnetBalancerEnv(DDPG_config, folder)
    elif DDPG_config['ENV'] == 'label':
        env = OmnetLinkweightEnv(DDPG_config, folder)

    action_dim, state_dim = env.a_dim, env.s_dim

    MU = DDPG_config['MU']
    THETA = DDPG_config['THETA']
    SIGMA = DDPG_config['SIGMA']

    ou = OU(action_dim, MU, THETA, SIGMA)  #Ornstein-Uhlenbeck Process

    BUFFER_SIZE = DDPG_config['BUFFER_SIZE']
    BATCH_SIZE = DDPG_config['BATCH_SIZE']
    GAMMA = DDPG_config['GAMMA']
    EXPLORE = DDPG_config['EXPLORE']
    EPISODE_COUNT = DDPG_config['EPISODE_COUNT']
    MAX_STEPS = DDPG_config['MAX_STEPS']
    if EXPLORE <= 1:
        EXPLORE = EPISODE_COUNT * MAX_STEPS * EXPLORE
    # SETUP ENDS HERE

    reward = 0
    done = False
    wise = False
    step = 0
    epsilon = 1
    indicator = 0

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, DDPG_config)
    critic = CriticNetwork(sess, state_dim, action_dim, DDPG_config)
    buff = ReplayBuffer(BUFFER_SIZE)  #Create replay buffer

    ltm = ['a_h0', 'a_h1', 'a_V', 'c_w1', 'c_a1', 'c_h1', 'c_h3', 'c_V']
    layers_to_mind = {}
    L2 = {}

    for k in ltm:
        layers_to_mind[k] = 0
        L2[k] = 0

    vector_to_file(ltm, folder + 'weightsL2' + 'Log.csv', 'w')

    #Now load the weight
    try:
        actor.model.load_weights(folder + "actormodel.h5")
        critic.model.load_weights(folder + "criticmodel.h5")
        actor.target_model.load_weights(folder + "actormodel.h5")
        critic.target_model.load_weights(folder + "criticmodel.h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    print("OMNeT++ Experiment Start.")
    # initial state of simulator
    s_t = env.reset()
    loss = 0
    for i in range(EPISODE_COUNT):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        total_reward = 0
        for j in range(MAX_STEPS):
            print('step ', j)
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))

            if train_indicator and epsilon > 0 and (step % 1000) // 100 != 9:
                noise_t[0] = epsilon * ou.evolve()

            a = a_t_original[0]
            n = noise_t[0]
            a_t[0] = np.where((a + n > 0) & (a + n < 1), a + n,
                              a - n).clip(min=0, max=1)

            # execute action
            s_t1, r_t, done = env.step(a_t[0], j)
            # print(s_t1)
            print('reward ', r_t)

            buff.add(s_t, a_t[0], r_t, s_t1, done)  #Add replay buffer

            scale = lambda x: x
            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = scale(np.asarray([e[0] for e in batch]))
            actions = scale(np.asarray([e[1] for e in batch]))
            rewards = scale(np.asarray([e[2] for e in batch]))
            new_states = scale(np.asarray([e[3] for e in batch]))
            dones = np.asarray([e[4] for e in batch])

            y_t = np.zeros([len(batch), action_dim])
            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if train_indicator and len(batch) >= BATCH_SIZE:
                loss = critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                # does this give an output like train_on_batch above? NO
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()
                with open(folder + 'lossLog.csv', 'a') as file:
                    file.write(pretty(loss) + '\n')

            total_reward += r_t
            s_t = s_t1

            for layer in actor.model.layers + critic.model.layers:
                if layer.name in layers_to_mind.keys():
                    L2[layer.name] = np.linalg.norm(
                        np.ravel(layer.get_weights()[0]) -
                        layers_to_mind[layer.name])
                    #                     vector_to_file(np.ravel(layer.get_weights()[0]), folder + 'weights_' + layer.name + 'Log.csv', 'a')
                    layers_to_mind[layer.name] = np.ravel(
                        layer.get_weights()[0])


            # if max(L2.values()) <= 0.02:
            #     wise = True

            if train_indicator and len(batch) >= BATCH_SIZE:
                vector_to_file([L2[x] for x in ltm],
                               folder + 'weightsL2' + 'Log.csv', 'a')

            vector_to_file(a_t_original[0], folder + 'actionLog.csv', 'a')
            vector_to_file(noise_t[0], folder + 'noiseLog.csv', 'a')

            if 'PRINT' in DDPG_config.keys() and DDPG_config['PRINT']:
                print("Episode", "%5d" % i, "Step", "%5d" % step, "Reward",
                      "%.6f" % r_t)
                print("Epsilon", "%.6f" % max(epsilon, 0))

                att_ = np.split(a_t[0], ACTIVE_NODES)
                for _ in range(ACTIVE_NODES):
                    att_[_] = np.insert(att_[_], _, -1)
                att_ = np.concatenate(att_)
                print("Action\n", att_.reshape(ACTIVE_NODES, ACTIVE_NODES))
                print(max(L2, key=L2.get), pretty(max(L2.values())))

            step += 1
            if done or wise:
                break

        if step % 1000 == 0:  # write weights every 1000 steps
            if (train_indicator):
                actor.model.save_weights(folder + "actormodel.h5",
                                         overwrite=True)
                actor.model.save_weights(folder + "actormodel" + str(step) +
                                         ".h5")
                with open(folder + "actormodel.json", "w") as outfile:
                    outfile.write(actor.model.to_json(indent=4) + '\n')

                critic.model.save_weights(folder + "criticmodel.h5",
                                          overwrite=True)
                critic.model.save_weights(folder + "criticmodel" + str(step) +
                                          ".h5")
                with open(folder + "criticmodel.json", "w") as outfile:
                    outfile.write(critic.model.to_json(indent=4) + '\n')

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down
    print("Finish.")
Example #15
def playGame(train_indicator=0):  #1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  #Target Network HyperParameters
    LRA = 0.0001  #Learning rate for Actor
    LRC = 0.001  #Learning rate for Critic

    action_dim = 3  #Steering/Acceleration/Brake
    state_dim = 24  #number of sensor inputs

    np.random.seed(1337)

    vision = False

    EXPLORE = 300000.
    episode_count = 20000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1.0
    # epsilon = 1
    indicator = 0

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  #Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    #Now load the weight
    load_name = "sample_v0_40"
    print("Now we load the weight")
    try:
        actor.model.load_weights("saved/actormodel_{}.h5".format(load_name))
        critic.model.load_weights("saved/criticmodel_{}.h5".format(load_name))
        actor.target_model.load_weights(
            "saved/actormodel_{}.h5".format(load_name))
        critic.target_model.load_weights(
            "saved/criticmodel_{}.h5".format(load_name))
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    plt.figure()
    overall_scores = []
    model_name = "sample_v0"

    print("TORCS Experiment Start.")

    attacks = []
    for i in range(-10, 0):
        val = i / 10.0
        attacks.append([77, val])
    # for i in range(45, 55):
    #     attacks.append([i, -1.5])
    #     attacks.append([i, 1.5])
    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        # if np.mod(i, 3) == 0:
        #     ob = env.reset(relaunch=True)   #relaunch TORCS every 3 episode because of the memory leak error
        # else:
        #     ob = env.reset()
        ob = env.reset()

        s_t = np.hstack(
            (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ))

        total_reward = 0.
        cur_sample = []
        for j in range(max_steps):
            # if j == 50:
            # time.sleep(0.099)
            # continue
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            # if j > 120:
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            #The following code does the stochastic brake
            #if random.random() <= 0.1:
            #    print("********Now we apply the brake***********")
            #    noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2],  0.2 , 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]
            if j < 20 and train_indicator:
                a_t[0][1] += 0.5
            # if j == 71:
            #     print("cp attack!")
            #     if a_t[0][0] > 0:
            #         a_t[0][0] = -0.3
            #     else:
            #         a_t[0][0] = 0.3
            # print("%.2f"%a_t[0][0])
            # a_t[0][2] += 0.7
            # if ob.speedX > 0.6:
            # a_t[0][1] = 0
            if (j == attacks[i][0]):
                print('cp attack on {} with {}'.format(attacks[i][0],
                                                       attacks[i][1]))
                a_t[0][0] = attacks[i][1]
            ob, r_t, done, info = env.step(a_t[0])
            print "step: {} reward: {:.5f} action: {:.5f} {:.5f} {:.5f} ".format(
                j, r_t, a_t[0][0], a_t[0][1], a_t[0][2])

            # print "{:.5f} {:.5f} {:.5f} {:.5f} {:.5f}".format(r_t, ob.speedX, ob.speedY, ob.speedZ, ob.rpm)
            # if(r_t < -50):
            #     r_t -= 10000
            #     done = True
            if j > 20 and ob.rpm <= 0.09426:
                r_t -= 1000
                done = True

            theta = 0.1
            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                              ob.speedY, ob.speedZ))
            # s_t1_new = np.array([val + np.abs(val)*random.uniform(-1,1)*theta for val in s_t1])
            # print(np.linalg.norm(s_t1_new - s_t1))
            # s_t1 = s_t1_new

            buff.add(s_t, a_t[0], r_t, s_t1, done)  #Add replay buffer
            cur_step_sample = [
                s_t.tolist(), a_t[0].tolist(), r_t,
                s_t1.tolist(), done
            ]
            cur_sample.append(cur_step_sample)

            # #Do the batch update
            # batch = buff.getBatch(BATCH_SIZE)
            # states = np.asarray([e[0] for e in batch])
            # actions = np.asarray([e[1] for e in batch])
            # rewards = np.asarray([e[2] for e in batch])
            # new_states = np.asarray([e[3] for e in batch])
            # dones = np.asarray([e[4] for e in batch])
            # y_t = np.asarray([e[1] for e in batch])

            # target_q_values = critic.target_model.predict([new_states, actor.target_model.predict(new_states)])

            # for k in range(len(batch)):
            #     if dones[k]:
            #         y_t[k] = rewards[k]
            #     else:
            #         y_t[k] = rewards[k] + GAMMA*target_q_values[k]

            # if (train_indicator):
            #     loss += critic.model.train_on_batch([states,actions], y_t)
            #     a_for_grad = actor.model.predict(states)
            #     grads = critic.gradients(states, a_for_grad)
            #     actor.train(states, grads)
            #     actor.target_train()
            #     critic.target_train()

            total_reward += r_t
            s_t = s_t1

            # print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)

            step += 1
            if done:
                break

            if j > 200:
                break

        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("saved/actormodel_{}_{}.h5".format(
                    model_name, int(step / 10000)),
                                         overwrite=True)
                # with open("actormodel.json", "w") as outfile:
                #     json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("saved/criticmodel_{}_{}.h5".format(
                    model_name, int(step / 10000)),
                                          overwrite=True)
                # with open("criticmodel.json", "w") as outfile:
                #     json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")
        s = "{},{},{:.3f},{},{}\n".format(i, j, total_reward, attacks[i][0],
                                          attacks[i][1])
        with open('logs/attack_{}.csv'.format(model_name), 'a') as the_file:
            the_file.write(s)
        # overall_scores.append(total_reward)
        # plt.clf()
        # plt.plot(overall_scores)
        # plt.savefig("train_plots/{}_{}.jpg".format(model_name, int(step/10000)))
        # with open('samples/{}_{:05d}.pk'.format(model_name, i), 'w') as outfile:
        #     pickle.dump(cur_sample, outfile)

    env.end()  # This is for shutting down TORCS
    print("Finish.")
Example #16
def start_training(goal_position):
    debug = True
    env = Environment(
        debug, goal_position
    )  #Put here all the functions needed for interaction with the env

    observ_dim = env.num_states
    actions_dim = env.num_actions

    #Define buffer size and dimension
    buffer_size = 5000
    miniBatch_size = 32

    #Define hyperparameter values
    gamma = 0.98  #discount factor: models the fact that future rewards are worth less than immediate rewards;
    #a value close to 1 makes the agent weight long-term reward more heavily
    tau = 0.001  #soft update rate for the target networks

    #training parameters

    explore = 10000
    max_episode = 5000
    max_steps_in_ep = 10000
    reward = 0

    done = False
    epsilon = 0.9  #exploration exploitation value
    indicator = 0

    plot_reward = False
    save_stats = True
    #Create empty arrays for plotting variables
    ep_reward = []
    episode = []
    distance = []

    distance_step = []
    step_reward = []

    #Define goal pos only for print purpose
    distance_error = []
    goal_position = [2.0, 3.0]
    episode_check = 0
    desired_checking_episode = 10
    #If running on RDS uncomment this part
    #Tensorflow GPU optimization
    #    config = tf.ConfigProto()
    #    config.gpu_options.allow_growth = True
    #    sess = tf.Session(config=config)
    #    from keras import backend as K
    #    K.set_session(sess)
    #
    #Tell TensorFlow to run on the CPU
    config = tf.ConfigProto(device_count={'GPU': 0})
    sess = tf.Session(config=config)
    K.set_session(sess)

    #Define the actor, critic Network and Buffer

    actor = ActorNetwork(env, sess)
    critic = CriticNetwork(env, sess)
    replay_buffer = ReplayBuffer()
    saved_path = '/home/parallels/catkin_ws/src/deep_drone/src/Data_Saved'  #/Model_Weights_saved'
    save_directory = os.path.join(os.getcwd(), saved_path)

    try:
        actor.model.load_weights(
            "/home/parallels/catkin_ws/src/deep_drone/src/Data_Saved/Actor_weights/499_actor_weights.h5"
        )
        actor.model_target.load_weights(
            "/home/parallels/catkin_ws/src/deep_drone/src/Data_Saved/Actor_weights/499_actor_weights.h5"
        )
        critic.model.load_weights(
            '/home/parallels/catkin_ws/src/deep_drone/src/Data_Saved/Critic_weights/499_critic_model.h5'
        )
        critic.model_target.load_weights(
            "/home/parallels/catkin_ws/src/deep_drone/src/Data_Saved/Critic_weights/499_critic_model.h5"
        )

        #critic.model_target.load_weights("/home/parallels/catkin_ws/src/deep_drone/src/Data_Saved/Actor_weights/219_critic_weights.h5")
        print("WEIGHTS LOAD CORRECTLY")
    except:
        print("ERR: WEIGHTS LOAD UNCORRECTLY")

    if not os.path.isdir(
            save_directory):  #return true if path is in an existing directory
        os.makedirs(save_directory)
    os.chdir(save_directory)

    #plot graphs settings
    if (plot_reward):
        plt.ion()  #turn the interactive mode on
        plt.title('Training Curve')
        plt.xlabel('Episodes')
        plt.ylabel('Total Reward')
        plt.grid()

        plt.ion()
        plt.title('Distance Error')
        plt.xlabel('Episodes')
        plt.ylabel('Cartesian Error')
        plt.grid()
    #Main training loop
    for ep in range(500, max_episode):
        #receive initial observation state
        state_t = env._reset(
        )  #reset environment ---> waits for take-off and also returns the state for the drone's current position, etc.
        state_t = np.asarray(
            state_t
        )  #create an array that is the state at time t: errorX, errorY, Terminal
        total_reward = [0]  #initialize reward
        terminal = [False]  #flag relative to the training phase
        step = 0  #number of iterations inside each episode
        episode_check = episode_check + 1
        while (terminal[0] == False):
            if step > 200:  #200:
                break  # exit from the main loop

            step = step + 1

            #            if debug:
            #                print('###############################')
            #print('step: {}'.format(step))
            print(
                '############################################################')
            loss = 0
            epsilon -= 1.0 / explore  #decay the explore/exploit probability

            action_t = np.zeros(
                [1, actions_dim]
            )  #create a zero array with the same dimension as the number of actions
            noise_t = np.zeros([1, actions_dim])  #noise array

            #the current action is selected according to current policy and exploration noise
            #The action is predicted from the actor network without noise

            action_t_initial = actor.model.predict(
                state_t.reshape(1, state_t.shape[0])
            )  #state_t.reshape(1, state_t.shape[0])) #make prediction given the state input,shape gives the dimension of the vector.
            #print('action_t_initial', action_t_initial)

            #adding noise to the action predicted
            noise_t[0][0] = OUhlenbeck_noise(epsilon, action_t_initial[0][0])
            noise_t[0][1] = OUhlenbeck_noise(epsilon, action_t_initial[0][1])
            #noise_t[0][2] = OUhlenbeck_noise(epsilon,action_t_initial[0][2])

            action_t[0][0] = action_t_initial[0][0] + noise_t[0][0]
            action_t[0][1] = action_t_initial[0][1] + noise_t[0][1]

            #Step, Apply action in the environment and reach a new state
            state_t1, reward_t, terminal = env._step(action_t[0], step)
            #print('state_t1 : {}'.format(state_t1))

            state_t1 = np.asarray(state_t1)  #create array of the new state
            #Now the sequence state_t, action, reward, state_t1 must be added to the replay buffer
            replay_buffer.add_experience(state_t, action_t[0], reward_t,
                                         state_t1, terminal)

            #Sample a new experience (set of state, action, state1, reward, terminal) from the batch
            mini_batch = replay_buffer.take_experience()

            states_buff = np.asarray([i[0] for i in mini_batch])
            actions_buff = np.asarray([i[1] for i in mini_batch])
            reward_buff = np.asarray([i[2] for i in mini_batch])
            state_new_buff = np.asarray([i[3] for i in mini_batch])
            terminal_buff = np.asarray([i[4] for i in mini_batch])
            #instantiate a y_target vector with the same length as the mini batch
            #y_target = np.asarray([i[1] for i in mini_batch]) #it is only to have the array of the desired dimension

            #Predict an action from the Actor target network given state_new_buff from the mini_batch
            action_new_buff = actor.model_target.predict(state_new_buff)

            #Take the Critic target network's prediction of the Q target for the new states and actions from the mini batch
            Q_target_predicted = critic.model_target.predict(
                [state_new_buff, action_new_buff])
            #            print('Q_target_predicted', Q_target_predicted)
            #            print('reward_buff', reward_buff)
            #Update the Q-value target by evaluating the Bellman equation
            y_target = []
            for j in range(len(mini_batch)):

                if terminal_buff[j]:
                    #y_target[j] =  reward_buff[j]
                    y_target.append(reward_buff[j])
                else:

                    y_target.append(
                        reward_buff[j] + gamma * Q_target_predicted[j]
                    )  #appends an array each time, building a list

            #resize to obtain an array with 1 column and as many rows as the batch size
            y_target = np.resize(y_target, [len(mini_batch), 1])

            #Evaluate the loss error utilizing the model.train_on_batch and update the weights of the critic
            #having as target the y_target evaluated from the Bellman equation
            loss = loss + critic.model.train_on_batch(
                [states_buff, actions_buff],
                y_target)  # L = 1/N * sum(y_target - Q(si,ai|theta^Q)^2)

            #The actor policy is updated using the sampled policy gradient
            ############ see https://yanpanlau.github.io/2016/10/11/Torcs-Keras.html for the full explanation
            #Actions are predicted for the buffered states; they are used to evaluate the critic gradient
            action_for_grad = actor.model.predict(states_buff)
            #The actor network is trained using the gradient of the critic network with respect to the actions.
            #The actor must follow the direction in which the critic (which plays the role of the Q network) increases fastest:
            #just as in tabular Q-learning you pick the action that increases the Q value, except that here,
            #instead of a stored value, we follow the gradient of the critic network.
            critic_gradient = critic.gradients(states_buff, action_for_grad)
            #The actor network is trained on the states from which the critic gradient was computed, with the critic_gradient as the update direction,
            #so that it outputs actions that move along that gradient and maximize the critic's value.
            actor.actor_train(states_buff, critic_gradient)
            #The last two rows are done in order to updates the target network
            #theta^Q = tau*theta^Q +(1- tau)*theta^Q'
            actor.target_net_train()
            critic.target_net_train()

            #Evaluate distance error for print purposes
            error_x = (goal_position[0] - state_t[0])
            error_y = (goal_position[1] - state_t[1])
            distance_error = math.sqrt(error_x * error_x + error_y * error_y)

            #Update Total Reward
            #print('reward_t', reward_t)

            if not reward_t[0]:
                reward_t[0] = -100 * distance_error

            total_reward[0] = total_reward[0] + reward_t[0]

            #The new state becomes the actual state
            state_t = state_t1

            #### Save distance and reward for each step only for pllotting purpose
            distance_step.append(distance_error)
            step_reward.append(reward_t[0])
            if (terminal[0] == True or step == 200):
                distance_step_mat = np.asarray(distance_step)

                step_reward_mat = np.asarray(step_reward)

                distance_step_name = 'Statistics/Step_Statistics/%d_distance_step.csv' % (
                    ep)
                step_reward_name = 'Statistics/Step_Statistics/%d_step_reward.csv' % (
                    ep)

                np.savetxt(
                    distance_step_name, distance_step_mat, delimiter=","
                )  #In MATLAB post-processing, import the episode vector on the x axis and plot reward and distance on the y axis
                np.savetxt(step_reward_name, step_reward_mat, delimiter=",")
                distance_step_mat = []
                step_reward_mat = []
                distance_step = []
                step_reward = []

            #(a model/weights checkpoint is saved every desired_checking_episode episodes below)
            print(
                'episode: {}, steps: {}, tot_rewards: {}, terminal: {}'.format(
                    ep, step, total_reward, terminal))

            print('distance_error:{}, pos_x: {}, pos_y: {}'.format(
                distance_error, state_t[0], state_t[1]))

            #if ((step+1)%10 == 0):
        if (episode_check == desired_checking_episode):
            #save Model
            action_model_name = 'Actor_weights/%d_actor_model.h5' % (ep)
            critic_model_name = 'Critic_weights/%d_critic_model.h5' % (ep)
            save_path = os.path.join(save_directory, action_model_name)
            actor.model.save(action_model_name)  #True if you want to overwrite
            critic.model.save(critic_model_name)
            print('Model Saved in path: %s' % save_directory)

            #Save Weights
            model_ext = ".h5"
            model_ext2 = ".json"
            action_save_weights_name = 'Actor_weights/%d_actor_weights' % (ep)
            actor.model.save_weights(action_save_weights_name + model_ext,
                                     overwrite=True)  #Save Weights
            with open(action_save_weights_name + model_ext2, "w") as outfile:
                json.dump(actor.model.to_json(),
                          outfile)  #save model architecture, not weights

            critic_save_weights_name = 'Critic_weights/%d_critic_weights' % (
                ep)
            critic.model.save_weights(critic_save_weights_name + model_ext,
                                      overwrite=True)
            with open(critic_save_weights_name + model_ext2, "w") as outfile:
                json.dump(critic.model.to_json(), outfile)

            print('Weights Saved in path: %s' % save_directory)

        #######################
        #Save Statistics
        if (save_stats):

            episode.append(ep)
            ep_reward.append(total_reward[0])
            distance.append(distance_error)

            if (episode_check == desired_checking_episode):

                ep_reward_mat = np.asarray(ep_reward)
                episode_mat = np.asarray([episode])
                distance_mat = np.asarray(distance)

                episode_mat = np.resize(episode_mat, [ep, 1])

                episode_name = 'Statistics/%d_episode.csv' % (ep)
                episode_reward_name = 'Statistics/%d_reward.csv' % (ep)
                distance_name = 'Statistics/%d_distance.csv' % (ep)
                np.savetxt(
                    episode_name, episode_mat, delimiter=","
                )  #In MATLAB post-processing, import the episode vector on the x axis and plot reward and distance on the y axis
                np.savetxt(episode_reward_name, ep_reward_mat, delimiter=",")
                np.savetxt(distance_name, distance_mat, delimiter=",")
                episode_check = 0
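ReplayBuffer() above is constructed without arguments and accessed through add_experience() and take_experience(); its implementation is not shown. A hedged sketch consistent with those call sites and with the buffer_size/miniBatch_size constants defined earlier in the function:

import random
from collections import deque

class ReplayBuffer:
    # Hypothetical buffer matching the calls above; the project's class may differ.
    def __init__(self, capacity=5000, batch_size=32):
        self.buffer = deque(maxlen=capacity)
        self.batch_size = batch_size

    def add_experience(self, state, action, reward, next_state, terminal):
        self.buffer.append((state, action, reward, next_state, terminal))

    def take_experience(self):
        # Sample a minibatch, or the whole buffer while it is still small.
        n = min(self.batch_size, len(self.buffer))
        return random.sample(list(self.buffer), n)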
Example #17
def playGame(checkpoints=None,
             train_indicator=1,
             eps=1.0):  #1 means Train, 0 means simply Run
    BUFFER_SIZE = 40000
    BATCH_SIZE = 16
    GAMMA = 0.99
    TAU = 0.001  #Target Network HyperParameters
    LRA = 0.01  #Learning rate for Actor
    LRC = 0.05  #Learning rate for Critic

    vision = True
    action_dim = 3  #Steering/Acceleration/Brake

    if vision:
        state_dim = (64, 64, 3)  #image observation shape
    else:
        state_dim = 29  #number of sensor inputs
    np.random.seed(1337)

    EXPLORE = 1000000.
    episode_count = 2000
    max_steps = 8000000
    reward = 0
    done = False
    step = 0
    epsilon = eps
    indicator = 0

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)
    summary_writer = tf.train.SummaryWriter('logs', graph_def=sess.graph_def)
    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA,
                         vision, summary_writer)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC,
                           vision)
    buff = ReplayBuffer(BUFFER_SIZE)  #Create replay buffer
    history = History()

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)
    log_file = open('train_log.log', 'w')
    #Now load the weight
    print("Now we load the weight")
    try:
        actor.model.load_weights("actormodel_{}.h5".format(checkpoints))
        critic.model.load_weights("criticmodel_{}.h5".foramt(checkpoints))
        actor.target_model.load_weights("actormodel_{}.h5".format(checkpoints))
        critic.target_model.load_weights(
            "criticmodel_{}.h5".format(checkpoints))
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    print("TORCS Experiment Start.")
    max_reward = 0
    min_reward = 0

    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            ob = env.reset(
                relaunch=True
            )  #relaunch TORCS every 3 episode because of the memory leak error
        else:
            ob = env.reset()

        if vision:
            history.fill((ob.img))
            s_t = history.get()
        else:
            s_t = np.hstack(
                (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                 ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))

        total_reward = 0.
        total_damage = 0.
        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            if vision:
                a_t_original = actor.model.predict(
                    s_t.reshape((-1, ) + state_dim))
            else:
                a_t_original = actor.model.predict(s_t.reshape(
                    1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.30, 0.30)
            noise_t[0][1] = 0.1 + train_indicator * max(
                epsilon, 0) * OU.function(a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])
            damage = ob.damage

            if vision:
                last_s_t = history.get().copy()
                history.add((ob.img))
                next_s_t = history.get().copy()
                if np.mod(step, 4) == 0:
                    buff.add(last_s_t, a_t[0], r_t, next_s_t,
                             done)  #Add replay buffer
                s_t1 = history.get()
            else:
                s_t1 = np.hstack(
                    (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                     ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
                buff.add(s_t, a_t[0], r_t, s_t1, done)

            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            if vision:
                target_q_values = critic.target_model.predict([
                    new_states.reshape((-1, ) + state_dim),
                    actor.target_model.predict(new_states).reshape(
                        (-1, ) + (action_dim, ))
                ])
            else:
                target_q_values = critic.target_model.predict(
                    [new_states,
                     actor.target_model.predict(new_states)])
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if train_indicator and buff.count() >= 1000:
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)

                actor.target_train()
                critic.target_train()

            total_reward += r_t
            total_damage += damage
            s_t = s_t1

            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t,
                  "Loss", loss)

            step += 1
            if done:
                break

        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("actormodel_{}.h5".format(i),
                                         overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("criticmodel_{}.h5".format(i),
                                          overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)
        max_reward = max(max_reward, total_reward)
        min_reward = min(min_reward, total_reward)
        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward) + "  EPS " + str(epsilon))
        print("Total Step: " + str(step) + ' Max: ' + str(max_reward) +
              ' Min: ' + str(min_reward))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
Example #18
# L2 REGULARISATION
L2C = 0.00
L2A = 0.0

env = gym.make(ENVIRONMENT_NAME)
action_dim = env.action_space.shape[0]
action_high = +1.
action_low = -1.

input_dim = env.observation_space.shape[0]

sess = tf.InteractiveSession(config=tf.ConfigProto(
                             intra_op_parallelism_threads=2))
actor = ActorNetwork(sess, input_dim, action_dim, BATCH_SIZE, TAU, LRA, L2A)
critic = CriticNetwork(sess, input_dim, action_dim, BATCH_SIZE, TAU, LRC, L2C)
buff = ReplayBuffer(BUFFER_SIZE)
# exploration = OUNoise(action_dim)

#env.monitor.start('experiments/' + 'cartPoli-v0',force=True)

reward_vector = np.zeros(10000)

for ep in range(10000):
    # open up a game state
    s_t, r_0, done = env.reset(), 0, False
        
    #s_t = s_t.reshape()
    REWARD = 0
    # exploration.reset()
    for t in range(1000):
Example #19
def playGame(train_indicator=1):  #1 means Train, 0 means simply Run

    BUFFER_SIZE = 100000
    BATCH_SIZE = 30
    GAMMA = 0.99
    TAU = 0.0001  #Target Network HyperParameters
    LRA = 0.00001  #Learning rate for Actor
    LRC = 0.0001  #Learning rate for Critic

    action_dim = 1  #steering command (single action)
    state_dim = 15  #number of sensor inputs

    np.random.seed(1337)
    vision = False

    EXPLORE = 1000000.
    episode_count = 3000
    max_steps = 1000000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0
    t_dt = 0.0005

    #TCP/IP communication for MATLAB - Python
    HOST = '0.0.0.0'
    PORT = 40000
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.setsockopt(socket.SOL_SOCKET, socket.SO_SNDBUF, 4096)
    s.bind((HOST, PORT))
    #Matlab client waiting
    s.listen(1)
    print("waiting for response from client at port ", PORT)
    conn, addr = s.accept()

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)

    buff = ReplayBuffer(BUFFER_SIZE)  #Create replay buffer

    #Now load the weight
    print("Now we load the weight")

    try:
        actor.model.load_weights("actormodel.h5")
        critic.model.load_weights("criticmodel.h5")
        actor.target_model.load_weights("actormodel.h5")
        critic.target_model.load_weights("criticmodel.h5")

        print("Weight load successfully")

    except:
        print("Cannot find the weight")

    print("TORCS Experiment Start.")

    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        total_reward = 0.

        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])
            Lateral = 0
            #Carsim export(input factor) variable catch s_t
            try:
                ob_exports = conn.recv(4096)
            except KeyboardInterrupt:
                #conn.shutdown()
                conn.close()
                break
            if not ob_exports:  # empty recv: client closed the connection
                #conn.shutdown()
                conn.close()
                break
            ob_exports1 = json.loads(ob_exports.decode('utf-8'))
            print('export=', ob_exports1)
            t_current = ob_exports1[0]
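            # the constant divisors below appear to scale each CarSim channel
            # to roughly [-1, 1] before it enters the state vector (assumption)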
            T_bar_Tq = ob_exports1[1] / 10
            LatG = ob_exports1[2]
            YawRate = ob_exports1[3] / 50
            Yaw = ob_exports1[4] / 3.14
            Lateral = ob_exports1[5] / 20
            Steer_SW = ob_exports1[6] / 6000
            StrAV_SW = ob_exports1[7] / 5000
            Steer_L1 = ob_exports1[8] / 180
            Steer_R1 = ob_exports1[9] / 180
            Steer_L2 = ob_exports1[10] / 4
            Steer_R2 = ob_exports1[11] / 4
            Xcg_TM = ob_exports1[12] / 1000
            Ycg_TM = ob_exports1[13] / 300
            Zcg_TM = ob_exports1[14] / 45
            curv = ob_exports1[15]
            #            print('T_bar_Tq=',T_bar_Tq)
            #            print('LatG=',LatG)

            s_t = np.hstack((T_bar_Tq, LatG, YawRate, Yaw, Lateral, Steer_SW,
                             StrAV_SW, Steer_L1, Steer_R1, Steer_L2, Steer_R2,
                             Xcg_TM, Ycg_TM, Zcg_TM, curv))
            print('s_t=', s_t)
            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            print('a_t_original=', a_t_original)
            a_t_inv = a_t_original[0][0]
            print(a_t_inv.shape)
            critic_gradient = critic.gradients(s_t.reshape(1, s_t.shape[0]),
                                               a_t_original)
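            # cache dQ/da at the current state; it is fed to the gradient
            # inverter during the batch update further below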
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.00, 0.00)
            #            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][1],  0.5 , 1.00, 0.10)
            #            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], -0.1 , 1.00, 0.05)

            #The following code does the stochastic brake

            #if random.random() <= 0.1:
            #    print("********Now we apply the brake***********")
            #    noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2],  0.2 , 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            #            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            #            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]
            a_t[0][0] = a_t[0][0] * 3500
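            # rescale the normalized actor output to actuator units; the 3500
            # factor looks like a steering-torque limit (assumption)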

            t_current = t_current + t_dt
            print('t_next=', t_current)
            print(a_t[0])
            at = np.array(a_t[0])
            #            print("at=",at)
            at1 = np.insert(at, 0, t_current)
            #            print('at1=,',at1)
            at2 = list(at1)
            print('at2=,', at2)

            #provide action value to matlab
            try:
                at_json = json.dumps(at2)
                a = '\r\n'
                at_json1 = at_json + a
                #               print('at_json1',at_json1)
                at_json2 = at_json1.encode('utf-8')
                #               print('at_json2',at_json2)
                conn.sendall(at_json2)
            except KeyboardInterrupt:
                #conn.shutdown()
                conn.close()
                break

            #Carsim export(input factor) variable catch s_t1
            try:
                ob_exports = conn.recv(4096)
            except KeyboardInterrupt:
                #conn.shutdown()
                conn.close()
                break
            if not ob_exports:  # empty recv: client closed the connection
                #conn.shutdown()
                conn.close()
                break
            ob_exports1 = json.loads(ob_exports.decode('utf-8'))
            print('s_t1=', ob_exports1)
            T_bar_Tq1 = ob_exports1[0] / 10
            LatG1 = ob_exports1[1]
            YawRate1 = ob_exports1[2] / 50
            Yaw1 = ob_exports1[3] / 3.14
            Lateral1 = ob_exports1[4] / 20
            Steer_SW1 = ob_exports1[5] / 6000
            StrAV_SW1 = ob_exports1[6] / 5000
            Steer_L11 = ob_exports1[7] / 180
            Steer_R11 = ob_exports1[8] / 180
            Steer_L21 = ob_exports1[9] / 4
            Steer_R21 = ob_exports1[10] / 4
            Xcg_TM1 = ob_exports1[11] / 1000
            Ycg_TM1 = ob_exports1[12] / 300
            Zcg_TM1 = ob_exports1[13] / 45
            curv = ob_exports1[14]
            r_t = ob_exports1[15]
            done = ob_exports1[16]
            #            print('T_bar_Tq1=',T_bar_Tq1)
            print('r_t=', r_t)

            #            if abs(Lateral1) > 1 or abs(Yaw1) > 1 :
            if t_current > 20 or abs(Yaw1) > 1:

                break

            s_t1 = np.hstack(
                (T_bar_Tq1, LatG1, YawRate1, Yaw1, Lateral1, Steer_SW1,
                 StrAV_SW1, Steer_L11, Steer_R11, Steer_L21, Steer_R21,
                 Xcg_TM1, Ycg_TM1, Zcg_TM1, curv))
            buff.add(s_t, a_t[0], r_t, s_t1, done)  #Add replay buffer

            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])
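            # y_t starts as a copy of the sampled actions only to get an array
            # of the right shape; it is overwritten with Bellman targets below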

            #            print ("Rewards=",rewards)
            #            print ("Actions=",actions)
            #            print ("states=",states)
            #            print (states.shape)

            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])
            #            print("rt1=",target_q_values)
            #            print(target_q_values.shape)

            for k in range(len(batch)):

                if dones[k]:

                    y_t[k] = rewards[k]

                else:

                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train_indicator):

                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                #                print("a_for_grad=",a_for_grad)
                #                print(a_for_grad.shape)
                grads = critic.gradients(states, a_for_grad)
                #                print("grads=",grads)
                #                print(grads.shape)
                if step > 30:
                    grads_factor = gradient_inverter(critic_gradient,
                                                     a_t_inv,
                                                     p_min=-1,
                                                     p_max=1,
                                                     BATCH_SIZE=30)
                else:
                    grads_factor = 1
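                # grads_factor from gradient_inverter presumably implements the
                # "inverting gradients" rule: it scales each component of dQ/da
                # so updates that would push the action outside [p_min, p_max]
                # are damped or reversed (assumption)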
#                print("grads_factor=",grads_factor)
                grads_factor1 = np.asarray(grads_factor)
                grads3 = grads * grads_factor1
                #                print("grads3=",grads3)
                actor.train(states, grads3)
                actor.target_train()
                critic.target_train()

            total_reward += r_t

            s_t = s_t1

            print("Episode", i, "t_current", t_current, "Action", a_t,
                  "Reward", r_t, "Loss", loss, "step", step)

            step += 1

            if done:

                break
        #s.shutdown()

        if (train_indicator):

            print("Now we save model")
            actor.model.save_weights("actormodel.h5", overwrite=True)
            with open("actormodel.json", "w") as outfile:

                json.dump(actor.model.to_json(), outfile)

            critic.model.save_weights("criticmodel.h5", overwrite=True)
            with open("criticmodel.json", "w") as outfile:

                json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))

        print("")


#        s.close() # TCP/IP socket close

    s.close()  # TCP/IP socket close
    print("Finish.")
예제 #20
0
def RunSim(train_toggle):  #1 means Train, 0 means simply Run
    BUFFER_SIZE = 1000000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001
    LRA = 0.0002  #Actor Learning
    LRC = 0.002  #Critic Learning

    #Sim options
    lqr_toggle = 0
    action_dim = 1
    state_dim = 2
    hist_rt = []
    hist_reward = []

    d_exploration = 200000.
    num_max_episodes = 250
    max_seconds = 15
    reward = -100
    done = False
    step = 0
    epsilon = 1
    Noise_magnitude = 15

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    # Initialize Actor and Critic Networks
    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)

    #Create replay buffer
    buff = ReplayBuffer(BUFFER_SIZE)

    #Load the ODE environment
    env = ODE(np.zeros(state_dim))

    # Load network parameters from previous training
    try:
        actor.model.load_weights("actormodel.h5")
        critic.model.load_weights("criticmodel.h5")
        actor.target_model.load_weights("actormodel.h5")
        critic.target_model.load_weights("criticmodel.h5")
    except:
        print("Weight Error")

    print("Training state is ", train_toggle)

    for i in range(num_max_episodes):
        print("Current episode : " + str(i))
        # Before every episode, completely reset the environment
        ob = env.reset()
        s_t = np.asarray(ob)[:, None].T
        total_reward = 0.
        max_steps = int(max_seconds / env.dt)

        # Run the episode
        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / d_exploration
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])
            a_t_original = actor.model.predict(s_t)
            noise_t[0] = noise_toggle * train_toggle * max(
                epsilon, 0.05) * Noise_magnitude * np.random.randn(action_dim)
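            # Gaussian exploration noise gated by noise_toggle/train_toggle,
            # annealed with epsilon (floored at 0.05) and scaled by Noise_magnitude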
            a_t[0] = a_t_original[0] + noise_t[0]
            #a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            if lqr_toggle == 1:
                a_t[0] = -klqr.dot(np.asarray(s_t[0]))
            ob, r_t, done, info = env.step(a_t[0])
            s_t1 = np.asarray(ob)[:, None].T
            buff.add(s_t, a_t[0], r_t, s_t1, done)  #Add replay buffer
            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0][0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3][0] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            #Output from target Networks
            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])
            #TD
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train_toggle):
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            hist_rt.append(r_t)
            s_t = s_t1

            step += 1
            if done:
                break
        hist_reward.append(np.mean(hist_rt))
        hist_rt = []
        if np.mod(i, 1) == 0:
            plt.close()
            hist = np.asarray(env.hist)
            u = np.asarray(env.u_hist)
            fig = plt.figure(figsize=(15, 5))
            plt.suptitle('Episode ' + str(i))
            ax1 = fig.add_subplot(121)
            rhist = np.asarray(env.ref_hist)
            time = np.linspace(0, j * env.dt, num=j + 1)
            ax1.plot(time, hist[:, 0], 'b', label='Output')
            ax1.plot(time, rhist[:, 0], 'b-.', label='Reference')
            ax1.set_ylabel('y(t)')
            ax1.set_xlabel('t')
            ax2 = fig.add_subplot(122)
            ax2.plot(time, u[:], 'g')
            ax2.set_ylabel('Control Signal u(t)')
            ax2.set_xlabel('t')

            #plt.show(block=False)
            fig.savefig('figures/results' + str(i) + '.pdf')

            fig2 = plt.figure(figsize=(5, 5))
            plt.title('Mean Reward Per Episode')
            plt.plot(hist_reward, 'go')
            ymax = 100
            #plt.ylim(-100, 0)
            #plt.show(block=False)
            fig2.savefig('figures/reward.pdf')
            print('reward', total_reward)

        if np.mod(i, 10) == 0:
            if (train_toggle):
                print("Now we save model")
                actor.model.save_weights("actormodel.h5", overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("criticmodel.h5", overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)
class Game(object):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    mx = 0
    LOG = 0
    trainnum = 0
    modelcnt = 0
    noiselevel = 0.5
    rpm = rpm(2000000)
    TAU = 0.001
    lr_actor = 3e-4
    lr_critic = 3e-4
    train_interval = 1
    train_times = 100
    action_dim = 18
    state_dim = 76
    max_steps = 1000 // 4
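    # episode cap; the environment appears to be wrapped with a frame-skip of 4
    # (see fastenv(env, 4) in playonce), hence 1000 // 4 steps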
    cnt = 0
    GAMMA = 0.96
    BATCH_SIZE = 128
    log_path = './logs'

    import threading as th
    lock = th.Lock()

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU,
                         lr_actor)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU,
                           lr_critic)

    callback = TensorBoard(log_path)
    callback.set_model(critic.model)

    def write_log(self, callback, names, logs, batch_no):
        output = open('logs/data.txt', 'w')
        output.write(str(self.LOG) + ' ' + str(self.trainnum))
        output.close()
        for name, value in zip(names, itertools.repeat(logs)):
            summary = tf.Summary()
            summary_value = summary.value.add()
            summary_value.simple_value = value
            summary_value.tag = name
            callback.writer.add_summary(summary, batch_no)
            callback.writer.flush()
        callback = TensorBoard(self.log_path)

    def play(self, env, cnt):
        episode_memory = []
        step = 0
        s_t = env.reset()
        total_reward = 0.
        sp = 0.
        noise_t = np.zeros([1, self.action_dim])
        a_t = np.zeros([1, self.action_dim])
        noise = self.noiselevel
        self.noiselevel = noise * 0.999
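        # exploration noise decays slowly across episodes (x0.999) and further
        # within each episode (x0.98 per step below)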
        for j in range(self.max_steps):
            self.lock.acquire()
            global graph
            with graph.as_default():
                a_t_original = self.actor.model.predict(np.array([s_t]))
            self.lock.release()
            noise = noise * 0.98
            if cnt % 3 == 0:
                if j % 5 == 0:
                    noise_t[0] = np.random.randn(self.action_dim) * noise
            elif cnt % 3 == 1:
                if j % 5 == 0:
                    noise_t[0] = np.random.randn(self.action_dim) * noise * 2
            else:
                noise_t = np.zeros([1, self.action_dim])
            a_t = a_t_original + noise_t
            for i in range(self.action_dim):
                if (a_t[0][i] > 1):
                    a_t[0][i] = 1
                elif (a_t[0][i] < 0):
                    a_t[0][i] = 0
            ob, r_t, done, _, pen = env.step(a_t[0])
            s_t1 = ob
            episode_memory.append([s_t, a_t[0], r_t - pen, done, s_t1])
            total_reward += r_t
            sp += pen
            s_t = s_t1
            step += 1
            if done or step == 1000 / 4 - 1:
                if total_reward > self.mx:
                    self.mx = total_reward
                print("Episode", cnt, "Step", step, "Reward", total_reward,
                      "max", self.mx, "penalty", sp)
                train_names = ['reward']
                self.lock.acquire()
                self.LOG = self.LOG + 1
                self.write_log(self.callback, train_names, total_reward,
                               self.LOG)
                self.lock.release()
                break
        self.lock.acquire()
        for i in range(step):
            self.rpm.add(episode_memory[i])
        self.lock.release()

    def playonce(self, env, T):
        from multi import fastenv
        fenv = fastenv(env, 4)
        self.play(fenv, T)
        env.rel()
        del fenv

    def play_ignore(self, env, T):
        import threading as th
        try:
            t = th.Thread(target=self.playonce, args=(
                env,
                T,
            ))
            t.setDaemon(True)
            t.start()
        except:
            print("startfail")

    def playifavailable(self, T):
        while True:
            remote_env = farmer.acq_env()
            if remote_env == False:
                pass
            else:
                self.play_ignore(remote_env, T)
                break

    def train(self):
        memory = self.rpm
        if memory.size() < self.BATCH_SIZE:
            return
        global graph
        loss = 0
        for T in range(self.train_times):
            [states, actions, rewards, dones,
             new_states] = memory.sample_batch(self.BATCH_SIZE)
            y_t = np.asarray([0.0] * self.BATCH_SIZE)
            rewards = np.concatenate(rewards)
            self.lock.acquire()
            with graph.as_default():
                target_q_values = self.critic.target_model.predict(
                    [new_states,
                     self.actor.target_model.predict(new_states)])
            target_q_values = target_q_values.reshape(
                [1, target_q_values.shape[0]])[0]
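            # flatten the (BATCH_SIZE, 1) target Q-values to a 1-D vector so
            # they can be indexed per sample in the loop below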
            for k in range(self.BATCH_SIZE):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + self.GAMMA * target_q_values[k]
            with graph.as_default():
                self.critic.model.optimizer.learning_rate = self.lr_critic
                logs = self.critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = self.actor.model.predict(states)
                grads = self.critic.gradients(states, a_for_grad)
                self.actor.train(states, grads, learning_rate=self.lr_actor)
                self.actor.target_train()
                self.critic.target_train()
            train_names = ['train_loss']
            self.write_log(self.callback, train_names, logs, self.trainnum)
            self.trainnum = self.trainnum + 1
            loss = loss + logs
            self.lock.release()
        print("train", memory.size(), loss)

    def save(self):
        self.modelcnt = self.modelcnt + 1
        self.actor.target_model.save_weights("logs/actormodel.h5",
                                             overwrite=True)
        self.critic.target_model.save_weights("logs/criticmodel.h5",
                                              overwrite=True)
        self.actor.target_model.save_weights("logs/actormodel{}.h5".format(
            self.modelcnt))
        self.critic.target_model.save_weights("logs/criticmodel{}.h5".format(
            self.modelcnt))
        print("save")

    def pre(self):
        print("Now we load the weight")
        try:
            input = open('logs/data.txt', 'r')
            self.LOG, self.trainnum = map(int, input.read().split(' '))
            print("LOG", self.LOG, "trainnum", self.trainnum)
            input.close()
            print("log found")
            self.critic.model.load_weights("logs/criticmodel.h5")
            self.critic.target_model.load_weights("logs/criticmodel.h5")
            self.actor.model.load_weights("logs/actormodel.h5")
            self.actor.target_model.load_weights("logs/actormodel.h5")
            print("Weight load successfully")
            self.rpm.load('logs/rpm.pickle')
            print("rpm success")
        except:
            if self.LOG > 0:
                print("Load fault")
                return False
            else:
                print("A new experiment")
        return True

    def run(self):
        np.random.seed(23333)
        episode_count = 10000
        reward = 0
        done = False
        LOSS = 0

        for T in range(50):
            self.playifavailable(T)
        for T in range(episode_count):
            self.train()
            self.playifavailable(T)
            if np.mod(T, 100) == 0 and T >= 100:
                self.save()
        print("Finish.")
예제 #22
0
import gym
import tensorflow as tf
import numpy as np
from ActorNetwork import ActorNetwork
from CriticNetwork import CriticNetwork
from ddpg import DDPG

env_name ='CartPole-v1'
env = gym.make(env_name)
env._max_episode_steps = 200

stop_train_score=200 #stop training after reaching score for 3 consecutive episodes
sess = tf.Session()
critic = CriticNetwork(sess, 4, 2, 0.01, 0.001)
actor = ActorNetwork(sess, 4, 2, 0.01, 0.001, activation='softmax')
ddpg = DDPG(sess, actor, critic, batch_size=32)

def train_game( max_steps=10000):
    state = env.reset()
    done = False
    r = 0
    step_count = 0
    while not done and step_count <= max_steps:
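        # exploration is enabled (second argument True); the three trailing lists
        # look like per-action noise parameters for this DDPG wrapper (assumption)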
        a = ddpg.get_action_for_state(state, True, [0.6, 0.6], [0.5,0.5], [0.2,0.2])
        new_state, reward, done, _ = env.step(np.argmax(a))
        ddpg.step(state, a, reward, new_state, done)
        r += reward
        step_count += 1
        state = new_state
    return r, ddpg.mean_loss
예제 #23
0
def train(train_indicator=1):
    env = Env()

    BUFFER_SIZE = 200000
    BATCH_SIZE = 128
    GAMMA = 0.99
    TAU = 0.001  # Target Network HyperParameters
    LRA = 0.0001  # Learning rate for Actor
    LRC = 0.001  # Learning rate for Critic

    action_dim = env.action_dim
    state_dim = env.observation_space()

    np.random.seed(1337)

    EXPLORE = 100000.
    episode_count = 100
    max_steps = 10000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)

    print("load model weight")
    try:
        actor.model.load_weights("model/actormodel.h5")
        critic.model.load_weights("model/criticmodel.h5")
        actor.target_model.load_weights("model/actormodel.h5")
        critic.target_model.load_weights("model/criticmodel.h5")
        print("load successfully")
    except:
        print("Cannot find the model weight")

    s_t = env.reset()

    for i in range(episode_count):
        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        total_reward = 0.
        for j in range(max_steps):
            loss = 0

            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 10.0, 1, 7)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0, 1, 3)
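            # Ornstein-Uhlenbeck exploration noise on both action components,
            # annealed by epsilon and disabled when train_indicator is 0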

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]

            s_t1, r_t, _ = env.step(a_t[0])

            buff.add(s_t, a_t[0], r_t, s_t1, done)

            # env.get_memory(buff)

            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if train_indicator:
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t,
                  "Loss", loss)

            step += 1
            if done:
                break

        if np.mod(i, 3) == 0:
            if train_indicator:
                print("save model")
                actor.model.save_weights("model/actormodel.h5", overwrite=True)
                # actor.model.save_weights("model/actormodel.h5", overwrite=True)
                # with open("model/actormodel.json", "wb") as outfile:
                #     json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("model/criticmodel.h5",
                                          overwrite=True)
                # critic.model.save_weights("model/criticmodel.h5", overwrite=True)
                # with open("model/criticmodel.json", "wb") as outfile:
                #     json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD " + str(i) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")

    print("Finish.")
    return actor
예제 #24
0
파일: ddpg_hockey.py 프로젝트: ataitler/DQN
env = gym.make(ENVIRONMENT_NAME)
if ENVIRONMENT_NAME == 'Hockey-v2':
	env_left = gym.make(TEST_ENV_LEFT)
	env_middle = gym.make(TEST_ENV_MIDDLE)
	env_right = gym.make(TEST_ENV_RIGHT)
action_dim = env.action_space.shape[0]
action_high = env.action_space.high
action_low = env.action_space.low

input_dim = env.observation_space.shape[0]

sess = tf.InteractiveSession()
logger = tf.train.SummaryWriter(OUT_DIR, sess.graph)
actor = ActorNetwork(sess, input_dim, action_dim, BATCH_SIZE, TAU, LRA, L2A)
critic = CriticNetwork(sess, input_dim, action_dim, BATCH_SIZE, TAU, LRC, L2C)
buff = ReplayBuffer(BUFFER_SIZE)
summary = tf.merge_all_summaries()

n = OUnoise(action_dim, 0.15, NOISE)
#n = OUnoise(action_dim)

saver = tf.train.Saver()
ckpt = tf.train.get_checkpoint_state(OUT_DIR)
if ckpt and ckpt.model_checkpoint_path:
	saver.restore(sess,ckpt.model_checkpoint_path)
	print("Model loaded from disk")

# initialize logger
L = Logger()
log_not_empty = L.Load(OUT_DIR+LOG_FILE)
예제 #25
0
def playGame(train_indicator=1):    #1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001     #Target Network HyperParameters
    LRA = 0.0001    #Learning rate for Actor
    LRC = 0.001     #Learning rate for Critic

    action_dim = 3  #Steering/Acceleration/Brake
    state_dim = 29  #of sensors input

    np.random.seed(1337)

    vision = False

    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)    #Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True,gear_change=False)

    #Now load the weight
    print("Now we load the weight")
    try:
        actor.model.load_weights("actormodel.h5")
        critic.model.load_weights("criticmodel.h5")
        actor.target_model.load_weights("actormodel.h5")
        critic.target_model.load_weights("criticmodel.h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    print("TORCS Experiment Start.")
    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            ob = env.reset(relaunch=True)   #relaunch TORCS every 3 episodes because of the memory leak error
        else:
            ob = env.reset()

        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,  ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
     
        total_reward = 0.
        for j in range(max_steps):
            loss = 0 
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1,action_dim])
            noise_t = np.zeros([1,action_dim])
            
            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][0],  0.0 , 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][1],  0.5 , 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], -0.1 , 1.00, 0.05)

            #The following code does the stochastic brake
            #if random.random() <= 0.1:
            #    print("********Now we apply the brake***********")
            #    noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2],  0.2 , 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
        
            buff.add(s_t, a_t[0], r_t, s_t1, done)      #Add replay buffer
            
            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict([new_states, actor.target_model.predict(new_states)])  
           
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA*target_q_values[k]
       
            if (train_indicator):
                loss += critic.model.train_on_batch([states,actions], y_t) 
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            if np.mod(step, 30) == 0:
                print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)
        
            step += 1
            if done:
                break

        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("actormodel.h5", overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("criticmodel.h5", overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) +"-th Episode  : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
예제 #26
0
def playGame(train_indicator = 1):
    BUFFER_SIZE = 10000
    BATCH_SIZE = 128
    GAMMA = 0.9
    TAU = 0.01
    lr_actor = 1e-3
    lr_critic = 1e-3
    train_interval = 1
    train_times = 20
    action_dim = 3
    state_dim = 2
    
    np.random.seed(2333)
    
    EXPLORE = 5000.
    episode_count = 2000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0
    LOSS = 0
    
    
    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)
    
    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, lr_actor)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, lr_critic)
    buff = ReplayBuffer(BUFFER_SIZE)
    
    env = gym.make('MountainCarContinuous-v0')
    # env = wrappers.Monitor(env, '/tmp/cartpole-experiment-1')
    
    # Now load the weight
    print("Now we load the weight")
    try:
        # actor.model.load_weights("actormodel.h5")
        # critic.model.load_weights("criticmodel.h5")
        # actor.target_model.load_weights("actormodel.h5")
        # critic.target_model.load_weights("criticmodel.h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")
    loss = 0
    for i in range(episode_count):
        # print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))
        ob = env.reset()
        s_t = ob
        total_reward = 0.
        for j in range(max_steps):
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])
            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            for k in range(action_dim):
                noise_t[0][k] = train_indicator * max(epsilon, 0) * OU().function(a_t_original[0][k], 0, 1.0, 0.3)
            action = a_t_original[0]
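            # note: noise_t and a_t are computed above but never applied; the raw
            # actor output is executed and stored in the replay buffer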
            env.render()
            ob, r_t, done, _ = env.step(action)
            s_t1 = ob
            # print(ob)
            buff.add(s_t, a_t_original[0], r_t, s_t1, done)
            
            total_reward += r_t
            s_t = s_t1
            step += 1
            if done:
                print("Episode", i, "Step", step, "Reward", total_reward)
                break
        if (train_indicator) and i % train_interval == 0:
            loss = 0
            for T in range(train_times):
                batch = buff.getBatch(BATCH_SIZE)
                states = np.asarray([e[0] for e in batch])
                actions = np.asarray([e[1] for e in batch])
                rewards = np.asarray([e[2] for e in batch])
                new_states = np.asarray([e[3] for e in batch])
                dones = np.asarray([e[4] for e in batch])
                y_t = np.asarray([0.0 for e in batch])
                    
                target_q_values = critic.target_model.predict([new_states, actor.target_model.predict(new_states)])
                    
                for k in range(len(batch)):
                    if dones[k]:
                        y_t[k] = rewards[k]
                    else:
                        y_t[k] = rewards[k] + GAMMA * target_q_values[k]
                loss = critic.model.train_on_batch([states,actions], y_t)
                
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()                
            print("Episode", i, "Step", step, "Loss", loss)
        if np.mod(i, 3) == 0:
            if (train_indicator):
            # print("Now we save model")
                actor.model.save_weights("actormodel.h5", overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)
        
                critic.model.save_weights("criticmodel.h5", overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)

#print("TOTAL REWARD @ " + str(i) +"-th Episode  : Reward " + str(total_reward), "loss: " + str(LOSS), "epsilon" + str(epsilon))
    #       print("Total Step: " + str(step))
    #       print("")
    
    print("Finish.")
    env.close()
예제 #27
0
def playGame(train_indicator=0):  #1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  #Target Network HyperParameters
    LRA = 0.0001  #Learning rate for Actor
    LRC = 0.001  #Learning rate for Critic

    action_dim = 3  #Steering/Acceleration/Brake
    state_dim = 29  #of sensors input

    np.random.seed(1337)

    vision = False

    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  #Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    # Now load the weight
    # print("Now we load the weight")
    # try:
    #     actor.model.load_weights("actormodel.h5")
    #     critic.model.load_weights("criticmodel.h5")
    #     actor.target_model.load_weights("actormodel.h5")
    #     critic.target_model.load_weights("criticmodel.h5")
    #     print("Weight load successfully")
    # except:
    #     print("Cannot find the weight")

    print("TORCS Experiment Start.")
    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            ob = env.reset(
                relaunch=True
            )  #relaunch TORCS every 3 episodes because of the memory leak error
        else:
            ob = env.reset()

        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                         ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))
        print(ob.track)

        total_reward = 0.
        stucked = 0
        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            #The following code does the stochastic brake
            if random.random() <= 0.1:
                print("********Now we apply the brake***********")
                noise_t[0][2] = train_indicator * max(
                    epsilon, 0) * OU.function(a_t_original[0][2], 0.2, 1.00,
                                              0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            s_t1 = np.hstack(
                (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,
                 ob.speedZ, ob.wheelSpinVel / 100.0, ob.rpm))

            buff.add(s_t, a_t[0], r_t, s_t1, done)  #Add replay buffer

            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train_indicator):
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t,
                  "Loss", loss)

            step += 1
            if done:
                break

        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("actormodel.h5", overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("criticmodel.h5", overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
예제 #28
0
파일: Agent.py 프로젝트: ymkim1019/aibirds
class Agent(EventTask):
    # ENV -> AGENT
    OBSERVE = 0

    # AGENT -> ENV
    ACT = 0

    def __init__(self, trainable=1, load_model=1):
        super(Agent, self).__init__('Agent')

        np.random.seed(1337)

        self.step = 0
        self.state_cache = dict()
        self.action_cache = dict()

        # Tensorflow GPU optimization
        config = tf.ConfigProto()
        config.gpu_options.allow_growth = True
        self.sess = tf.Session(config=config)
        self.trainable = trainable
        K.set_session(self.sess)

        self.actor = ActorNetwork(self.sess, globalConfig.TAU, globalConfig.LRA)
        self.critic = CriticNetwork(self.sess, globalConfig.TAU, globalConfig.LRC)

        self.buff = ReplayBuffer(globalConfig.BUFFER_SIZE)  # Create replay buffer
        self.cnt = 0

        if load_model == 1:
            # Now load the weight
            print("Now we load the weight")
            try:
                self.actor.model.load_weights("actormodel.h5")
                self.critic.model.load_weights("criticmodel.h5")
                self.actor.target_model.load_weights("actormodel.h5")
                self.critic.target_model.load_weights("criticmodel.h5")
                print("Weight load successfully")
            except:
                print("Cannot find the weight")

        self.graph = tf.get_default_graph()


    def do_job(self, job_obj):
        (job_id, data, env_proxy) = job_obj
        if self.verbose:
            print(str.format("Processing Job id:{}..", job_id))

        with self.graph.as_default():
            if job_id == self.OBSERVE:
                # observation
                is_first_shot, done, n_pigs, n_stones, n_woods, n_ices, n_tnts, bird_type, im, r_t, current_level = data
                print(str.format("----> observation from {}, level = {}", env_proxy.get_client_ip(), current_level))
                print(str.format("first shot:{}, reward:{}, episode done:{}", is_first_shot, r_t, done))
                print(str.format("# pigs={}, # stones={}, # woods={}, # ices={}, n_tnts={}, bird={}"
                                 , n_pigs, n_stones, n_woods, n_ices, n_tnts, bird_type))
                # print('im shape=', im.shape)
                s_t1 = [np.array(im), np.array([n_pigs, n_stones, n_woods, n_ices, n_tnts]), np.array([bird_type])]

                # store transition into replay buffer
                try:
                    self.buff.add(self.state_cache[env_proxy], self.action_cache[env_proxy], r_t, s_t1, done)
                    print("store transition into replay buffer")
                except Exception as e:
                    print("first shot of the game")
                    pass

                if self.buff.count() > 0:
                    print("Do the batch update...")

                    # Do the batch update
                    batch = self.buff.getBatch(globalConfig.BATCH_SIZE)
                    # states = np.asarray([e[0] for e in batch])
                    images = [e[0][0] for e in batch]
                    num_objects = [e[0][1] for e in batch]
                    birds = [e[0][2] for e in batch]
                    states = [np.array(images), np.array(num_objects), np.array(birds)]
                    actions = np.asarray([e[1] for e in batch])
                    rewards = np.asarray([e[2] for e in batch])
                    new_images = [e[3][0] for e in batch]
                    new_num_objects = [e[3][1] for e in batch]
                    new_birds = [e[3][2] for e in batch]
                    new_states = [np.array(new_images), np.array(new_num_objects), np.array(new_birds)]
                    dones = np.asarray([e[4] for e in batch])
                    y_t = np.asarray([e[1] for e in batch])

                    # print('batch update shape, size =', len(batch))
                    # print(np.array(images).shape)
                    # print(np.array(num_objects).shape)
                    # print(np.array(birds).shape)

                    new_a = self.actor.target_model.predict(states)
                    # print('new_a=\n', new_a)
                    target_q_values = self.critic.target_model.predict(new_states + [new_a])
                    # print('q values =\n', target_q_values)

                    for k in range(len(batch)):
                        if dones[k]:
                            y_t[k] = rewards[k]
                        else:
                            y_t[k] = rewards[k] + globalConfig.GAMMA * target_q_values[k]

                    if self.trainable:
                        print('loss =', self.critic.model.train_on_batch(states + [actions], y_t))
                        a_for_grad = self.actor.model.predict(states)
                        grads = self.critic.gradients(states, a_for_grad)
                        self.actor.train(states, grads)
                        self.actor.target_train()
                        self.critic.target_train()

                # select action a_t
                s_t = s_t1
                pixels = np.reshape(s_t[0], tuple([1]) + s_t[0].shape)
                num_objects = np.reshape(s_t[1], tuple([1]) + s_t[1].shape)
                input_bird = np.reshape(s_t[2], tuple([1]) + s_t[2].shape)

                # print(pixels.shape, num_objects.shape, input_bird.shape)

                a_t = self.actor.model.predict([pixels, num_objects, input_bird])
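                # the actor's three outputs (assumed to lie in [0, 1]) are decoded
                # below into a target object index, a high/low shot flag, and a
                # tap time in the range [65, 90]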

                target = math.floor(a_t[0][0] * np.sum(s_t[1] - 0.00001))  # avoid index out of range error
                high_shot = 1 if a_t[0][1] > 0.5 else 0
                tap_time = math.floor(65 + a_t[0][2] * 25)
                print('raw a_t w/o noise=', a_t)
                print(str.format("next action w/o noise: target({}), high_shot({}), tap time({})", target, high_shot, tap_time))

                if self.trainable == 1:
                    # random noise
                    noise1 = np.random.randn(1) * 0.2
                    noise2 = np.random.randn(1) * 0.3
                    noise3 = np.random.randn(1) * 0.2
                    print('random noise=', noise1, noise2, noise3)
                    a_t[0][0] += noise1
                    a_t[0][1] += noise2
                    a_t[0][2] += noise3
                    a_t[0][0] = min(1, a_t[0][0])
                    a_t[0][0] = max(0, a_t[0][0])
                    a_t[0][1] = min(1, a_t[0][1])
                    a_t[0][1] = max(0, a_t[0][1])
                    a_t[0][2] = min(1, a_t[0][2])
                    a_t[0][2] = max(0, a_t[0][2])

                    target = math.floor(a_t[0][0] * np.sum(s_t[1] - 0.00001)) # avoid index out of range error
                    high_shot = 1 if a_t[0][1] > 0.5 else 0
                    tap_time = math.floor(65 + a_t[0][2] * 25)
                    print('raw a_t w/ noise =', a_t)
                    print(str.format("next action w/ noise: target({}), high_shot({}), tap time({})", target, high_shot, tap_time))

                # cache
                self.state_cache[env_proxy] = s_t
                self.action_cache[env_proxy] = a_t[0]

                # execute an action
                env_proxy.execute(target, high_shot, tap_time)

                self.cnt += 1
                if self.cnt % globalConfig.model_save_interval == 0:
                    if self.trainable == 1:
                        print("Saving model....")
                        self.actor.model.save_weights("actormodel.h5", overwrite=True)
                        with open("actormodel.json", "w") as outfile:
                            json.dump(self.actor.model.to_json(), outfile)

                        self.critic.model.save_weights("criticmodel.h5", overwrite=True)
                        with open("criticmodel.json", "w") as outfile:
                            json.dump(self.critic.model.to_json(), outfile)
예제 #29
0
파일: DDPG.py 프로젝트: caoshiyi/AdaM
def playGame(train_indicator=0):  # 1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  # Target Network HyperParameters
    LRA = 0.00001  # Learning rate for Actor
    LRC = 0.0001  # Learning rate for Critic

    server_number = 5
    # node_number = 18
    hot_node_number = 150
    action_dim = hot_node_number  # Number of hot nodes
    state_dim = hot_node_number * (server_number + 1 + 10)  # 16 features per hot node
    # baseline = 4e-05 #load&locality of baselines

    np.random.seed(500)

    # vision = False

    EXPLORE = 100000.
    episode_count = 100
    max_steps = 100000
    line_number = 1000
    step_number = 35
    # reward = 0
    done = False
    step = 0
    epsilon = 1
    # indicator = 0

    # Tensorflow GPU optimization
    config = tf.ConfigProto()
    # config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  # Create replay buffer

    # Generate a MDS environment
    env = MetaEnvironment(server_number)

    # Now load the weight
    print("Now we load the weight")
    try:
        actor.model.load_weights("model/actormodel-" + str(server_number) +
                                 ".h5")
        critic.model.load_weights("model/criticmodel-" + str(server_number) +
                                  ".h5")
        actor.target_model.load_weights("model/actormodel-" +
                                        str(server_number) + ".h5")
        critic.target_model.load_weights("model/criticmodel-" +
                                         str(server_number) + ".h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    print("Experiment Start.")

    f = open("query.txt")
    queryList = []
    for line in f.readlines():
        line = line.strip()
        queryList.append(line)
    f.close()

    sumLoc = 0
    sumLod = 0
    lossList = []
    mdsLoadList = [[] for x in range(server_number)]

    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        # if np.mod(i, 3) == 0:
        # ob = env.reset(relaunch=True)   #relaunch every 3 episode because of the memory leak error
        # else:
        # ob = env.reset()

        traceList = queryList[0:line_number]  # Reset
        s_t = env.state(traceList)  # Get State from env

        localityList = []
        loadList = []

        total_reward = 0.
        for j in range(max_steps):
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            # add noise
            a_t_original = actor.model.predict(s_t)
            for k in range(action_dim):
                noise_t[0][k] = train_indicator * max(
                    epsilon, 0) * OU.function(a_t_original[0][k], 0.0, 0.60,
                                              0.30)

            for m in range(action_dim):
                a_t[0][m] = a_t_original[0][m]  # + noise_t[0][m]

            migration = env.take_actions(a_t[0])
            print("migration", migration)

            tracelist = queryList[(j + 1) * line_number:(j + 2) * line_number]
            s_t1 = env.state(tracelist)  # Update state from env
            # r_t = 0.5*env.locality() + 50*env.load() - baseline
            # print("gagaga", 1e5*env.locality() + 1e7*env.load())
            # 1.5, 3, 2
            x = 1e5 * env.locality() + 1e7 * env.load() - 1.5 * migration
            # x = 1e5*env.locality() + 1.5 * 1e7*env.load()
            # r_t = 1.0 / (1.0 + np.exp(-(x/50)))
            r_t = x
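            # reward is a weighted sum of locality and load balance minus a
            # migration penalty; the 1e5 / 1e7 / 1.5 weights look hand-tuned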

            if j == step_number:
                done = True
            else:
                done = False

            buff.add(s_t, a_t[0], r_t, s_t1, done)  # Add replay buffer

            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])
            states = states.reshape(len(batch), -1)
            new_states = new_states.reshape(len(batch), -1)
            actions = actions.reshape(len(batch), -1)

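            # y_t above is only pre-allocated (it copies the shape of the sampled
            # actions); every entry is overwritten with the Bellman target
            # y = r + GAMMA * Q'(s', mu'(s')) below, or y = r on terminal steps.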
            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

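            # Standard DDPG updates: fit the critic towards y_t, update the actor
            # along the critic's action-gradient (deterministic policy gradient),
            # then soft-update both target networks with rate TAU.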
            if (train_indicator):
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            # print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss, "Locality", env.locality(), "Load", env.load())
            print("Episode", i, "Step", step, "Reward", r_t, "Loss", loss,
                  "Locality", env.locality(), "Load", env.load())

            lossList.append(loss)
            localityList.append(env.locality())
            loadList.append(env.load())
            for index in range(server_number):
                mdsLoadList[index].append(env.loadList[index])

            step += 1
            if done:
                break

        curLocalitySum = sum(localityList)
        curLoadSum = sum(loadList)

        # f = open('' + str(server_number) + '.txt', 'w')
        # f.write(','.join(map(str, lossList)))
        # f.close()

        # f = open('anglecut-mdsload-' + str(server_number) + '.txt', 'w')
        # for i in range(server_number):
        #     f.write(','.join(map(str, mdsLoadList[i])))
        #     f.write('\n')
        # f.close()
        # print("写入成功")

        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("model/actormodel-" +
                                         str(server_number) + ".h5",
                                         overwrite=True)
                with open("model/actormodel-" + str(server_number) + ".json",
                          "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("model/criticmodel-" +
                                          str(server_number) + ".h5",
                                          overwrite=True)
                with open("model/criticmodel-" + str(server_number) + ".json",
                          "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        # print("Final Locality:", env.final_locality(), "Final Load Balancing:", env.final_load())
        # env.clear()
        print("")

    # env.end()
    print("Finish.")
예제 #30
0
파일: ddpg.py 프로젝트: harshsha5/DDPG-TD3
class DDPG(object):
    """A class for running the DDPG algorithm."""
    def __init__(self, env, outfile_name, hindsight):
        """Initialize the DDPG object.

        Args:
            env: an instance of gym.Env on which we aim to learn a policy.
            outfile_name: (str) name of the output file.
            hindsight: (bool) whether HER is used (here it only affects the log directory name).
        """
        action_dim = len(env.action_space.low)
        state_dim = len(env.observation_space.low)
        np.random.seed(1337)
        self.env = env

        self.sess = tf.compat.v1.Session()
        tf.keras.backend.set_session(self.sess)
        self.batch_size = BATCH_SIZE
        self.buffer = ReplayBuffer(BUFFER_SIZE)
        self.burn_in_memory_size = BURN_IN_MEMORY
        self.Critic = CriticNetwork(self.sess,
                                    state_dim,
                                    action_dim,
                                    self.batch_size,
                                    tau=TAU,
                                    learning_rate=LEARNING_RATE_CRITIC)
        self.noise_mu = NOISE_MU
        self.Noise_sigma = NOISE_SIGMA * (env.action_space.high[0] -
                                          env.action_space.low[0])
        self.Actor = ActorNetwork(sess=self.sess,
                                  state_size=state_dim,
                                  action_size=action_dim,
                                  batch_size=self.batch_size,
                                  tau=TAU,
                                  learning_rate=LEARNING_RATE_ACTOR)

        # Defining a custom name for the Tensorboard summary.
        timestr = time.strftime("%Y%m%d-%H%M%S")

        if hindsight:
            save_path = "runs/HER_DDPG_" + timestr + '/'
        else:
            save_path = "runs/DDPG_" + timestr + '/'

        self.writer = SummaryWriter(save_path)
        self.outfile = outfile_name
        self.action_range = 1

    def generate_burn_in(self):
        num_actions = self.env.action_space.shape[0]
        state = self.env.reset()
        state = np.array(state)
        done = False
        for i in range(self.burn_in_memory_size):
            action = np.random.uniform(
                -1.0, 1.0, size=num_actions
            )  #Randomly generating actions for the buffer burn_in
            new_state, reward, done, info = self.env.step(action)
            new_state = np.array(new_state)
            self.buffer.add(state, action, reward, new_state, done)
            state = new_state
            if (done):
                state = self.env.reset()
                state = np.array(state)
                done = False

    def evaluate(self, num_episodes, num_iteration):
        """Evaluate the policy. Noise is not added during evaluation.

        Args:
            num_episodes: (int) number of evaluation episodes.
            num_iteration: (int) index of the current training iteration (not used here).
        Returns:
            success_rate: (float) fraction of episodes that were successful.
            average_return: (float) Average cumulative return.
        """
        test_rewards = []
        success_vec = []
        plt.figure(figsize=(12, 12))
        for i in range(num_episodes):
            s_vec = []
            state = self.env.reset()
            s_t = np.array(state)
            total_reward = 0.0
            done = False
            step = 0
            success = False
            while not done:
                s_vec.append(s_t)
                a_t = self.Actor.actor_network.predict(s_t[None])[0]
                # import pdb; pdb.set_trace()
                new_s, r_t, done, info = self.env.step(a_t)
                if done and "goal" in info["done"]:
                    success = True
                new_s = np.array(new_s)
                total_reward += r_t
                s_t = new_s
                step += 1
            success_vec.append(success)
            test_rewards.append(total_reward)
            if i < 9:
                plt.subplot(3, 3, i + 1)
                s_vec = np.array(s_vec)
                pusher_vec = s_vec[:, :2]
                puck_vec = s_vec[:, 2:4]
                goal_vec = s_vec[:, 4:]
                plt.plot(pusher_vec[:, 0],
                         pusher_vec[:, 1],
                         '-o',
                         label='pusher')
                plt.plot(puck_vec[:, 0], puck_vec[:, 1], '-o', label='puck')
                plt.plot(goal_vec[:, 0],
                         goal_vec[:, 1],
                         '*',
                         label='goal',
                         markersize=10)
                plt.plot([0, 5, 5, 0, 0], [0, 0, 5, 5, 0], 'k-', linewidth=3)
                plt.fill_between([-1, 6], [-1, -1], [6, 6],
                                 alpha=0.1,
                                 color='g' if success else 'r')
                plt.xlim([-1, 6])
                plt.ylim([-1, 6])
                if i == 0:
                    plt.legend(loc='lower left',
                               fontsize=28,
                               ncol=3,
                               bbox_to_anchor=(0.1, 1.0))
                if i == 8:
                    # Uncomment the line below to display the evaluation plots interactively.
                    # plt.show()
                    buf = io.BytesIO()
                    plt.savefig(buf, format='png')
                    buf.seek(0)
        return np.mean(success_vec), np.mean(test_rewards), np.std(
            test_rewards), buf

    def train(self, num_episodes, hindsight=False):
        """Runs the DDPG algorithm.

        Args:
            num_episodes: (int) Number of training episodes.
            hindsight: (bool) Whether to use HER.
        """
        self.generate_burn_in()
        for i in range(num_episodes):
            state = self.env.reset()
            s_t = np.array(state)
            total_reward = 0.0
            done = False
            step = 0
            loss = 0
            store_current_states = []
            store_actions = []
            store_dones = []

            self.ActionNoise = EpsilonNormalActionNoise(
                self.noise_mu, self.Noise_sigma, EPSILON)
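            # EpsilonNormalActionNoise (not defined in this snippet) presumably
            # perturbs the actor's output with Gaussian noise parameterised by
            # (noise_mu, Noise_sigma), with EPSILON controlling how much random
            # exploration is injected; the noisy action is clipped to
            # [-action_range, action_range] below.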

            while not done:
                # Collect one episode of experience, saving the states and actions
                # to store_states and store_actions, respectively.
                action = np.clip(
                    self.ActionNoise(
                        self.Actor.actor_network.predict(s_t[None])[0]),
                    -self.action_range, self.action_range)
                # import pdb; pdb.set_trace()

                new_state, reward, done, info = self.env.step(action)
                new_state = np.array(new_state)

                store_current_states.append(s_t)
                store_actions.append(action)
                store_dones.append(done)

                self.buffer.add(s_t, action, reward, new_state, done)
                transition_minibatch = np.asarray(
                    self.buffer.get_batch(self.batch_size))

                target_actions = self.Actor.target_actor_network.predict(
                    np.stack(transition_minibatch[:, 3]))
                target_Qs = self.Critic.target_critic_network.predict(
                    [np.stack(transition_minibatch[:, 3]), target_actions])

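                # Bellman targets: y = r + GAMMA * Q_target(s', mu_target(s')).
                # For terminal transitions the bootstrap term is subtracted back
                # out below, which reduces the target to y = r.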
                target_values = np.stack(
                    transition_minibatch[:, 2]) + GAMMA * target_Qs.reshape(-1)
                reward_indices = np.where(transition_minibatch[:,
                                                               4] == True)[0]
                target_values[
                    reward_indices] = target_values[reward_indices] - GAMMA * (
                        target_Qs.reshape(-1)[reward_indices])

                # present_values = self.Critic.critic_network.predict([transition_minibatch[:,0][0][None],transition_minibatch[:,1][0][None]])
                history = self.Critic.critic_network.fit(
                    [
                        np.stack(transition_minibatch[:, 0]),
                        np.stack(transition_minibatch[:, 1])
                    ],
                    target_values,
                    batch_size=self.batch_size,
                    epochs=1,
                    verbose=0)

                #Update Actor Policy
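                # Deterministic policy gradient: use the critic's dQ/da at
                # a = mu(s) to move the actor's parameters uphill on Q.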
                actor_actions = self.Actor.actor_network.predict(
                    np.stack(transition_minibatch[:, 0]))
                action_grads = self.Critic.gradients(
                    np.stack(transition_minibatch[:, 0]), actor_actions)[0]
                gradient_update = self.Actor.train(
                    np.stack(transition_minibatch[:, 0]), action_grads)
                # import pdb; pdb.set_trace()

                self.Critic.update_target()
                self.Actor.update_target()

                loss += history.history['loss'][-1]
                s_t = new_state
                step += 1
                total_reward += reward

            if hindsight:
                # For HER, we also want to save the final next_state.
                store_current_states.append(new_state)
                store_current_states_copy = copy.deepcopy(store_current_states)
                her_states, her_rewards = self.env.apply_hindsight(
                    store_current_states_copy)
                # her_states, her_rewards, her_actions = self.add_hindsight_replay_experience(store_current_states, store_actions)
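                # Hindsight relabeling: apply_hindsight substitutes an achieved
                # state for the goal and recomputes rewards. Transitions are
                # re-added up to the first one that attains the new goal
                # (reward 0), which is stored as terminal.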
                for k in range(len(her_states) - 1):
                    if her_rewards[k] == 0:
                        self.buffer.add(her_states[k], store_actions[k],
                                        her_rewards[k], her_states[k + 1],
                                        True)
                        break
                    else:
                        self.buffer.add(her_states[k], store_actions[k],
                                        her_rewards[k], her_states[k + 1],
                                        store_dones[k])

            del store_current_states, store_actions, store_dones
            store_current_states, store_actions, store_dones = [], [], []

            loss = loss / step

            self.writer.add_scalar('train/loss', loss, i)
            self.writer.add_scalar("Training Reward VS Episode", total_reward,
                                   i)

            # Logging
            print("Episode %d: Total reward = %d" % (i, total_reward))
            print("\tTD loss = %.2f" % (loss, ))
            print("\tSteps = %d; Info = %s" % (step, info['done']))
            if i % 100 == 0:
                successes, mean_rewards, std_rewards, buf = self.evaluate(
                    10, i)
                image = tf.image.decode_png(buf.getvalue(), channels=3)
                image = image.eval(session=self.sess)
                self.writer.add_image('Performance',
                                      image,
                                      i,
                                      dataformats='HWC')
                self.writer.add_scalar('mean_reward', mean_rewards, i)
                self.writer.add_scalar('std_reward', std_rewards, i)
                print('Evaluation: success = %.2f; return = %.2f' %
                      (successes, mean_rewards))
                with open(self.outfile, "a") as f:
                    f.write("%.2f, %.2f,\n" % (successes, mean_rewards))

    def add_hindsight_replay_experience(self, states, actions):
        """Relabels a trajectory using HER.

        Args:
            states: a list of states.
            actions: a list of actions.
        """

        her_states, her_rewards = self.env.apply_hindsight(states)

        her_actions = actions

        # print('her_states size: ', len(her_states))
        # print('her_rewardssize: ', len(her_rewards))
        # print('her_states: ', her_states)
        # print('her_rewards: ', her_rewards)

        return her_states, her_rewards, her_actions
예제 #31
0
def playGame(train_indicator=1):    # 1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000  # replay buffer capacity
    BATCH_SIZE = 32  # number of transitions per training batch
    GAMMA = 0.99  # discount factor
    TAU = 0.001     # Target Network HyperParameters (soft-update rate)
    LRA = 0.0001    # Learning rate for Actor
    LRC = 0.001     # Learning rate for Critic

    action_dim = 3  # Steering/Acceleration/Brake
    state_dim = 29  # number of sensor inputs

    np.random.seed(1337)  # fixed random seed so runs are reproducible

    vision = False

    EXPLORE = 100000.
    episode_count = 2000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1
    indicator = 0

    # TensorFlow GPU memory policy: grow GPU memory allocation on demand
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    # Alternatively, hard-cap GPU memory usage at 40%:
    # config.gpu_options.per_process_gpu_memory_fraction = 0.4
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)    # Create replay buffer

    #  Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True,gear_change=False)

    # Now load the weight
    print("Now we load the weight")
    try:
        actor.model.load_weights("actormodel.h5")
        critic.model.load_weights("criticmodel.h5")
        actor.target_model.load_weights("actormodel.h5")
        critic.target_model.load_weights("criticmodel.h5")
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    theTime = datetime.datetime.now()  # current system time
    theTime = theTime.strftime('%y-%m-%d_%H:%M:%S')  # string form used in the CSV file path
    folder_path = "practise_progress/" + theTime + "/"  # path layout assumes a Linux filesystem
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
        print("folder created")
    else:
        print("folder existed")

    print("TORCS Experiment Start.")
    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            ob = env.reset(relaunch=True)   # relaunch TORCS every 3 episode because of the memory leak error
        else:
            ob = env.reset()

        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,  ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
     
        total_reward = 0.

        csvfileHeader = "practise_progress/" + theTime + "/" + " Episode " + str(i) + ".csv"
        fileHeader = ["Step", "TrackPos", "SpeedX", "SpeedY", "SpeedZ",
                      "Action_Steering", "Action_Acceleration", "Action_Brake", "Reward", "Loss"]
        csvFile = open(csvfileHeader, "w")
        writer = csv.writer(csvFile)
        writer.writerow(fileHeader)

        for j in range(max_steps):
            loss = 0 
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])
            
            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
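            # Per-dimension OU exploration noise. OU is not defined in this
            # snippet; it is assumed to implement an Ornstein-Uhlenbeck step,
            # roughly:
            #     def function(self, x, mu, theta, sigma):
            #         return theta * (mu - x) + sigma * np.random.randn(1)
            # Steering is pulled toward 0.0, acceleration toward 0.5 and the
            # brake toward -0.1 so the car keeps moving while it explores.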
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][0],  0.0 , 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][1],  0.5 , 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], -0.1 , 1.00, 0.05)

            # The following code does the stochastic brake
            # if random.random() <= 0.1:
            #     print("********Now we apply the brake***********")
            #     noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2],  0.2 , 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0])

            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
        
            buff.add(s_t, a_t[0], r_t, s_t1, done)      # Add replay buffer
            
            # Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict([new_states, actor.target_model.predict(new_states)])  
           
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA*target_q_values[k]
       
            if (train_indicator):
                loss += critic.model.train_on_batch([states,actions], y_t) 
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            csvData = [step, ob.trackPos, ob.speedX * 300, ob.speedY * 300, ob.speedZ * 300,
                       a_t[0, 0], a_t[0, 1], a_t[0, 2], r_t, loss]
            """        参数记录
                       轮次  步骤计数  车辆位置  X轴速度  Y轴速度  Z轴速度
                       加速输出  转向输出  刹车输出  回报  损失函"""
            writer.writerow(csvData)
            print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)
            step += 1
            if done:
                csvFile.close()
                break

        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("actormodel.h5", overwrite=True)
                with open("actormodel.json", "w") as outfile:
                    json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("criticmodel.h5", overwrite=True)
                with open("criticmodel.json", "w") as outfile:
                    json.dump(critic.model.to_json(), outfile)



        print("TOTAL REWARD @ " + str(i) +"-th Episode  : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  #  This is for shutting down TORCS
    print("Finish.")
예제 #32
0
def playGame(train_indicator=1):    #1 means Train, 0 means simply Run
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001     #Target Network HyperParameters
    LRA = 0.00005    #Learning rate for Actor
    LRC = 0.0005     #Learning rate for Critic

    action_dim = 3  #Steering/Acceleration/Brake
    state_dim = 29  #of sensors input

    np.random.seed(1337)

    vision = False

    EXPLORE = 200000.
    if train_indicator:
        episode_count = 1000
    else:
        episode_count = 20
    max_steps = 4000
    step = 0
    if train_indicator:
        epsilon = 1
    else:
        epsilon = 0
    min_laptime = 10000000

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)

    buff = ReplayBuffer(BUFFER_SIZE)    #Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)

    #Now load the weight
    # loading networks
    print("Now we load the weight")
    saver = tf.train.Saver()
    checkpoint = tf.train.get_checkpoint_state("saved_networks/")
    if checkpoint and checkpoint.model_checkpoint_path:
        saver.restore(sess, checkpoint.model_checkpoint_path)
        print("Successfully loaded:", checkpoint.model_checkpoint_path)
    else:
        print("Could not find old network weights")
    print("TORCS Experiment Start.")
    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            ob = env.reset(relaunch=True)   #relaunch TORCS every 3 episode because of the memory leak error
        else:
            ob = env.reset()

        s_t = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY,  ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
     
        total_reward = 0.
        # totalLaptime = 0.
        for j in range(max_steps):
            loss = 0
            if train_indicator:
                epsilon -= 1.0 / EXPLORE
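                # Keep epsilon floored at 0.10 so some exploration noise remains.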
                epsilon = max(epsilon, 0.10)
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])
            
            a_t_original = actor.predict(s_t.reshape(1, s_t.shape[0]))
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][0],  0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][1],  0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2], -0.1, 1.00, 0.05)

            # The following code does the stochastic brake
            #if random.random() <= 0.1:
            #    print("********Now we apply the brake***********")
            #    noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2],  0.2 , 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]

            ob, r_t, done, info = env.step(a_t[0], train_indicator)

            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ, ob.wheelSpinVel/100.0, ob.rpm))
        
            buff.add(s_t, a_t[0], r_t, s_t1, done)      #Add replay buffer
            
            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_predict(new_states, actor.target_predict(new_states))
           
            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA*target_q_values[k]
       
            if (train_indicator):
                loss += critic.train_on_batch(states, actions, y_t)
                a_for_grad = actor.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            if np.mod(step, 100) == 0:
                print("Episode", i, "Step", step, "Epsilon", epsilon, "Action", a_t, "Reward", r_t, "Loss", loss) #, "curLapTime", ob.curLapTime)
        
            step += 1
            if i == 0:
                break
            if done:
                break

        # if np.mod(i, 3) == 0:
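        # Save a checkpoint only when a complete 10-lap run beats the best lap time so far.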
        if (train_indicator) and i > 0:
            if env.lapTime < min_laptime and env.num_lap == 10:
                min_laptime = env.lapTime
                print("Now we save model")
                saver.save(sess, 'saved_networks/' + 'network' + '-ddpg-{}'.format(i))

        print("TOTAL REWARD @ " + str(i) +"-th Episode  : Reward " + str(total_reward))
        print("Total Step: " + str(step))
        print("")

    env.end()  # This is for shutting down TORCS
    print("Finish.")
예제 #33
0
def playGame(train_indicator=0):  #1 means Train, 0 means simply Run
    time.sleep(1)
    BUFFER_SIZE = 100000
    BATCH_SIZE = 32
    GAMMA = 0.99
    TAU = 0.001  #Target Network HyperParameters
    LRA = 0.0001  #Learning rate for Actor
    LRC = 0.001  #Learning rate for Critic

    action_dim = 3  #Steering/Acceleration/Brake
    state_dim = 24  #of sensors input

    np.random.seed(1337)

    vision = False

    EXPLORE = 300000.
    episode_count = 20000
    max_steps = 100000
    reward = 0
    done = False
    step = 0
    epsilon = 1.0
    # epsilon = 1
    indicator = 0

    #Tensorflow GPU optimization
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRA)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU, LRC)
    buff = ReplayBuffer(BUFFER_SIZE)  #Create replay buffer

    # Generate a Torcs environment
    env = TorcsEnv(vision=vision, throttle=True, gear_change=False)
    pre_model = load_model("weights_rescale_all-0000.hdf5")
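    # pre_model appears to be a learned state-transition model; it is only used
    # by the commented-out lookahead / attack-search code further below.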
    # x = np.array([ 4.82767379e-01,  5.92105016e-02,  3.61700505e-01,  2.74807483e-01,
    #     2.31401995e-01,  2.07236990e-01,  1.95800006e-01,  1.89892501e-01,
    #     1.84837490e-01,  1.81293502e-01,  1.77807003e-01,  1.74377009e-01,
    #     1.71005994e-01,  1.66384503e-01,  1.61247000e-01,  1.52030498e-01,
    #     1.35238498e-01,  1.11962005e-01,  8.79574940e-02,  4.76383008e-02,
    #     4.78339800e-01,  6.97819047e-01,  4.60800716e-01,  5.00754069e-01,
    #     -1.00000000e+00,  9.99979496e-01,  8.71338917e-13])
    # x_s = np.array([x, x])
    # pre_y = pre_model.predict(x_s)
    # print(x_s[0])
    # print(pre_y[0])

    #Now load the weight
    load_name = "sample_v0_40"
    print("Now we load the weight")
    try:
        actor.model.load_weights("saved/actormodel_{}.h5".format(load_name))
        critic.model.load_weights("saved/criticmodel_{}.h5".format(load_name))
        actor.target_model.load_weights(
            "saved/actormodel_{}.h5".format(load_name))
        critic.target_model.load_weights(
            "saved/criticmodel_{}.h5".format(load_name))
        print("Weight load successfully")
    except:
        print("Cannot find the weight")

    plt.figure()
    overall_scores = []
    model_name = "sample_v0"

    print("TORCS Experiment Start.")
    for i in range(episode_count):

        print("Episode : " + str(i) + " Replay Buffer " + str(buff.count()))

        if np.mod(i, 3) == 0:
            ob = env.reset(
                relaunch=True
            )  #relaunch TORCS every 3 episode because of the memory leak error
        else:
            ob = env.reset()

        s_t = np.hstack(
            (ob.angle, ob.track, ob.trackPos, ob.speedX, ob.speedY, ob.speedZ))

        total_reward = 0.
        cur_sample = []
        attack_valid = 1
        gap = (i / 10) / 100.0
        attack_step = -1
        attack_target = 0
        for j in range(max_steps):
            # if j == 50:
            # time.sleep(0.099)
            # continue
            loss = 0
            epsilon -= 1.0 / EXPLORE
            a_t = np.zeros([1, action_dim])
            noise_t = np.zeros([1, action_dim])

            a_t_original = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
            # if j > 120:
            noise_t[0][0] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][0], 0.0, 0.60, 0.30)
            noise_t[0][1] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][1], 0.5, 1.00, 0.10)
            noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(
                a_t_original[0][2], -0.1, 1.00, 0.05)

            # The following code does the stochastic brake
            #if random.random() <= 0.1:
            #    print("********Now we apply the brake***********")
            #    noise_t[0][2] = train_indicator * max(epsilon, 0) * OU.function(a_t_original[0][2],  0.2 , 1.00, 0.10)

            a_t[0][0] = a_t_original[0][0] + noise_t[0][0]
            a_t[0][1] = a_t_original[0][1] + noise_t[0][1]
            a_t[0][2] = a_t_original[0][2] + noise_t[0][2]
            if j < 20 and train_indicator:
                a_t[0][1] += 0.5
            # os.system("scrot saved_pic/{}.png".format(j))
            if j == 80:
                print("cp attack!")
                a_t[0][0] = -1.0
            if j == 83:
                os.system("scrot saved_pic/{}.png".format(j))
            #    if a_t[0][0] > 0:
            #         a_t[0][0] = -0.3
            #     else:
            #         a_t[0][0] = 0.3
            # print("%.2f"%a_t[0][0])
            # a_t[0][2] += 0.7
            # if ob.speedX > 0.6:
            # a_t[0][1] = 0
            # if(step == 60):
            # a_t[0][0] = 1.0
            # s_t_scaled = rescale_state(s_t)
            # # print(s_t[0])
            # s_t_0 = restore_state(s_t_scaled)
            # # print(s_t_0[0])
            # new_a_t = actor.model.predict(s_t_0.reshape(1, s_t_0.shape[0]))
            # s_t_scaled_list = np.array([np.copy(s_t_scaled) for val in range(21)])
            # actions = np.array([np.copy(a_t[0]) for val in range(21)])
            # for val in range(21):
            #     actions[val][0] = -1.0 + val/10.0
            # # print(actions)
            # x_0 = np.hstack((s_t_scaled_list, actions))
            # # print(x_0.shape, s_t_scaled_list.shape, actions.shape)
            # pre_y = pre_model.predict(x_0)
            # # print(x_0[0])
            # # print(pre_y[0])

            # steer_index = int(a_t[0][0]*10.0 + 10.0)
            # for pre_step in range(2):
            #     restore_new_Y = restore_states(pre_y)
            #     actions = actor.model.predict(restore_new_Y)
            #     x_step1 = np.hstack((pre_y, actions))
            #     pre_y = pre_model.predict(x_step1)

            # for index in range(21):
            #     diff = calsulate_d(pre_y[index]) - calsulate_d(pre_y[steer_index])
            #     pro = np.random.random()
            #     if diff > gap and attack_valid == 1 and pro > 0.8 and j > 50:
            #         a_t[0][0] = -1.0 + index/10.0
            #         print("adv!", diff, "pro:", pro)
            #         attack_step = j
            #         attack_target = a_t[0][0]
            #         attack_valid -= 1

            # dis_list = np.array([(calsulate_d(st) - calsulate_d(pre_y[steer_index])) for st in pre_y])
            # print("{:.2f}".format(max(dis_list)*100000))
            # print("{}".format(max(dis_list)*100000))

            # s_t_scaled = np.copy(s_t1)
            # s_t_scaled[0] = rescale_data(s_t_scaled[0], 0.5)
            # s_t_scaled[20] = rescale_data(s_t_scaled[20], 2.5)
            # s_t_scaled[21] = rescale_data(s_t_scaled[21], 0.7)
            # s_t_scaled[22] = rescale_data(s_t_scaled[22], 0.7)
            # s_t_scaled[23] = rescale_data(s_t_scaled[23], 0.7)
            # actions = actor.model.predict(s_t_scaled.reshape(1, s_t_scaled.shape[0]))
            # print(actions[0][0])

            # ob, r_t, done, info = env.step(new_a_t[0])
            ob, r_t, done, info = env.step(a_t[0])
            print "step: {} reward: {:.5f} action: {:.5f} {:.5f} {:.5f} ".format(
                j, r_t, a_t[0][0], a_t[0][1], a_t[0][2])
            # print(a_t[0][0])

            # print "{:.5f} {:.5f} {:.5f} {:.5f} {:.5f}".format(r_t, ob.speedX, ob.speedY, ob.speedZ, ob.rpm)
            # if(r_t < -50):
            #     r_t -= 10000
            #     done = True
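            # Presumably a stall check: if engine rpm stays very low after the
            # first 20 steps, apply a large penalty and terminate the episode.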
            if j > 20 and ob.rpm <= 0.09426:
                r_t -= 1000
                done = True

            theta = 0.1
            s_t1 = np.hstack((ob.angle, ob.track, ob.trackPos, ob.speedX,
                              ob.speedY, ob.speedZ))

            # action_states = []
            # for i in range(-5, 6):

            # s_t1_new = np.array([val + np.abs(val)*random.uniform(-1,1)*theta for val in s_t1])
            # print(np.linalg.norm(s_t1_new - s_t1))
            # s_t1 = s_t1_new

            buff.add(s_t, a_t[0], r_t, s_t1, done)  #Add replay buffer
            # cur_step_sample = [s_t.tolist(), a_t[0].tolist(), r_t, s_t1.tolist(), done]
            # cur_sample.append(cur_step_sample)

            #Do the batch update
            batch = buff.getBatch(BATCH_SIZE)
            states = np.asarray([e[0] for e in batch])
            actions = np.asarray([e[1] for e in batch])
            rewards = np.asarray([e[2] for e in batch])
            new_states = np.asarray([e[3] for e in batch])
            dones = np.asarray([e[4] for e in batch])
            y_t = np.asarray([e[1] for e in batch])

            target_q_values = critic.target_model.predict(
                [new_states,
                 actor.target_model.predict(new_states)])

            for k in range(len(batch)):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + GAMMA * target_q_values[k]

            if (train_indicator):
                loss += critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = actor.model.predict(states)
                grads = critic.gradients(states, a_for_grad)
                actor.train(states, grads)
                actor.target_train()
                critic.target_train()

            total_reward += r_t
            s_t = s_t1

            # print("Episode", i, "Step", step, "Action", a_t, "Reward", r_t, "Loss", loss)

            step += 1
            if done:
                break

            if j > 500:
                break

        if np.mod(i, 3) == 0:
            if (train_indicator):
                print("Now we save model")
                actor.model.save_weights("saved/actormodel_{}_{}.h5".format(
                    model_name, int(step / 10000)),
                                         overwrite=True)
                # with open("actormodel.json", "w") as outfile:
                #     json.dump(actor.model.to_json(), outfile)

                critic.model.save_weights("saved/criticmodel_{}_{}.h5".format(
                    model_name, int(step / 10000)),
                                          overwrite=True)
                # with open("criticmodel.json", "w") as outfile:
                #     json.dump(critic.model.to_json(), outfile)

        print("TOTAL REWARD @ " + str(i) + "-th Episode  : Reward " +
              str(total_reward))
        print("Total Step: " + str(step))
        print("")
        s = "{},{},{},{},{},{:.3f}\n".format(gap, attack_step, attack_target,
                                             i, j, total_reward)
        attack_valid = 1
        attack_step = -1
        attack_target = 0
        with open('logs/pm_adv_test.csv'.format(model_name), 'a') as the_file:
            the_file.write(s)
        overall_scores.append(total_reward)
        plt.clf()
        plt.plot(overall_scores)
        plt.savefig("train_plots/{}_{}.jpg".format(model_name,
                                                   int(step / 10000)))
        # with open('samples/{}_{:05d}.pk'.format(model_name, i), 'w') as outfile:
        # pickle.dump(cur_sample, outfile)

    env.end()  # This is for shutting down TORCS
    print("Finish.")