Example #1
def load_leader_weights(leader_path, epoch, _outdir):
    # Build the checkpoint file names for the requested epoch.
    resume_path = leader_path + epoch
    weights_path = resume_path + '.h5'
    monitor_path = resume_path
    params_json = resume_path + '.json'

    with open(params_json) as params_file:
        d = json.load(params_file)
        learnStart = d.get('learnStart')
        learningRate = d.get('learningRate')
        discountFactor = d.get('discountFactor')
        memorySize = d.get('memorySize')
        network_inputs = d.get('network_inputs')
        network_outputs = d.get('network_outputs')
        network_structure = d.get('network_structure')

    leader_deepQ = deepq.DeepQ(network_inputs, network_outputs, memorySize,
                               discountFactor, learningRate, learnStart)
    leader_deepQ.initNetworks(network_structure)

    leader_deepQ.loadWeights(weights_path)

    # Restore the gym monitor logs next to the reloaded weights.
    clear_monitor_files(_outdir)
    copy_tree(monitor_path, _outdir)

    return leader_deepQ
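A minimal usage sketch for the helper above, assuming the checkpoint layout it expects (a weights file, a matching .json parameter file, and a monitor directory sharing the same prefix); the paths and epoch string below are placeholders, not values from the original project:

# Hypothetical usage sketch: paths and the epoch string are placeholders.
import os

model_dir = os.path.expanduser('~/models')            # assumed checkpoint directory
monitor_dir = os.path.join(model_dir, 'monitor')      # assumed monitor output directory
leader_agent = load_leader_weights(os.path.join(model_dir, 'leader_dqn_ep'),
                                   '300', monitor_dir)
# With the return added above, leader_agent exposes getQValues()/selectAction() for inference.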
Example #2
        epochs = 10000
        steps = 100
        updateTargetNetwork = 10000
        explorationRate = 1
        minibatch_size = 64
        learnStart = 64
        learningRate = 0.00025
        discountFactor = 0.99
        memorySize = 1000000
        # network_inputs = 100
        network_inputs = 3
        network_outputs = 21
        network_structure = [300,300]
        current_epoch = 0

        deepQ = deepq.DeepQ(network_inputs, network_outputs, memorySize, discountFactor, learningRate, learnStart)
        deepQ.initNetworks(network_structure)
    else:
        # Load weights, monitor info, and parameter info.
        # TODO: add try/except error handling around this else branch.
        with open(params_json) as outfile:
            d = json.load(outfile)
            epochs = d.get('epochs')
            steps = d.get('steps')
            updateTargetNetwork = d.get('updateTargetNetwork')
            explorationRate = d.get('explorationRate')
            minibatch_size = d.get('minibatch_size')
            learnStart = d.get('learnStart')
            learningRate = d.get('learningRate')
            discountFactor = d.get('discountFactor')
            memorySize = d.get('memorySize')
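A minimal sketch of the defensive loading the TODO comment above asks for; it assumes the same params_json path and simply aborts the resume when the file is missing or corrupt (the error message is illustrative):

# Sketch only: the resume-time JSON load wrapped in try/except, as the TODO suggests.
try:
    with open(params_json) as outfile:
        d = json.load(outfile)
except (IOError, ValueError) as err:
    # Missing or unreadable checkpoint metadata: stop instead of resuming with bad values.
    raise RuntimeError("could not resume from %s: %s" % (params_json, err))
epochs = d.get('epochs')  # ...the remaining parameter reads continue as in the snippet above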
Example #3
    def start(self):
        ns = '/burger/'

        task_and_robot_environment_name = rospy.get_param(
            ns + "task_and_robot_environment_name")
        env = gym.make(task_and_robot_environment_name)

        filepath = os.path.dirname(os.path.abspath(__file__))
        outdir = filepath + '/model/gazebo_gym_experiments/'
        path = filepath + '/model/burger_war_dqn_ep'

        dt_now = datetime.datetime.now()
        resultpath = filepath + '/model/' + dt_now.strftime(
            '%Y-%m-%d-%H:%M:%S') + '.csv'

        resume_epoch = rospy.get_param(
            ns + "resume_epoch")  # epoch to resume from; "0" starts training from scratch
        resume_path = path + resume_epoch
        weights_path = resume_path + '.h5'
        monitor_path = resume_path
        params_json = resume_path + '.json'

        if resume_epoch == "0":
            #Each time we take a sample and update our weights it is called a mini-batch.
            #Each time we run through the entire dataset, it's called an epoch.
            #PARAMETER LIST
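            # learnStart: number of steps to collect before mini-batch learning begins
            # (see the stepCounter >= learnStart check below).
            # updateTargetNetwork: step interval at which the target network is synced
            # (see stepCounter % updateTargetNetwork below).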
            save_interval = rospy.get_param(ns + "save_interval")
            epochs = rospy.get_param(ns + "epochs")
            steps = rospy.get_param(ns + "steps")
            explorationRate = rospy.get_param(ns + "explorationRate")
            minibatch_size = rospy.get_param(ns + "minibatch_size")
            learningRate = rospy.get_param(ns + "learningRate")
            discountFactor = rospy.get_param(ns + "discountFactor")
            memorySize = rospy.get_param(ns + "memorySize")
            updateTargetNetwork = rospy.get_param(ns + "updateTargetNetwork")
            learnStart = rospy.get_param(ns + "learnStart")
            network_inputs = rospy.get_param(ns + "network_inputs")
            network_outputs = rospy.get_param(ns + "network_outputs")
            network_structure = rospy.get_param(ns + "network_structure")
            current_epoch = 0

            epsilon_decay = rospy.get_param(ns + "epsilon_decay")

            vel_max_x = rospy.get_param(ns + "vel_max_x")
            vel_min_x = rospy.get_param(ns + "vel_min_x")
            vel_max_z = rospy.get_param(ns + "vel_max_z")

            deepQ = deepq.DeepQ(network_inputs, network_outputs, memorySize,
                                discountFactor, learningRate, learnStart)
            deepQ.initNetworks(network_structure)
        else:
            # Load weights, monitor info, and parameter info.
            # TODO: add try/except error handling around this else branch.
            with open(params_json) as outfile:
                d = json.load(outfile)
                save_interval = d.get('save_interval')
                epochs = d.get('epochs')
                steps = d.get('steps')
                if self.runMode == 'test':
                    explorationRate = 0
                else:
                    explorationRate = d.get('explorationRate')
                minibatch_size = d.get('minibatch_size')
                learnStart = d.get('learnStart')
                discountFactor = d.get('discountFactor')
                memorySize = d.get('memorySize')
                updateTargetNetwork = d.get('updateTargetNetwork')
                learningRate = d.get('learningRate')
                network_inputs = d.get('network_inputs')
                network_outputs = d.get('network_outputs')
                network_structure = d.get('network_structure')
                current_epoch = d.get('current_epoch')

                epsilon_decay = d.get('epsilon_decay')

                vel_max_x = d.get('vel_max_x')
                vel_min_x = d.get('vel_min_x')
                vel_max_z = d.get('vel_max_z')

            deepQ = deepq.DeepQ(network_inputs, network_outputs, memorySize,
                                discountFactor, learningRate, learnStart)
            deepQ.initNetworks(network_structure)

            deepQ.loadWeights(weights_path)

            if not os.path.exists(outdir):
                os.makedirs(outdir)
            self.clear_monitor_files(outdir)
            if not os.path.exists(monitor_path):
                os.makedirs(monitor_path)
            copy_tree(monitor_path, outdir)

        env._max_episode_steps = steps  # env returns done after _max_episode_steps
        prod = False
        if self.runMode == 'test':
            prod = True
        env = gym.wrappers.Monitor(env, outdir, force=not prod, resume=prod)

        lastScores = [0] * save_interval
        lastScoresIndex = 0
        lastFilled = False
        stepCounter = 0

        start_time = time.time()

        #start iterating from 'current epoch'.
        env.set_mode(self.side, self.runMode, self.collisionMode,
                     network_outputs, vel_max_x, vel_min_x, vel_max_z)
        for epoch in xrange(current_epoch + 1, epochs + 1, 1):
            observation = env.reset()

            cumulated_reward = 0
            done = False
            episode_step = 0

            # run until env returns done
            while not done:
                # env.render()
                qValues = deepQ.getQValues(observation)

                action = deepQ.selectAction(qValues, explorationRate)

                newObservation, reward, done, info = env.step(action)
                cumulated_reward += reward

                deepQ.addMemory(observation, action, reward, newObservation,
                                done)

                if ((self.runMode != 'test') and (stepCounter >= learnStart)):
                    if stepCounter <= updateTargetNetwork:
                        deepQ.learnOnMiniBatch(minibatch_size, False)
                    else:
                        deepQ.learnOnMiniBatch(minibatch_size, True)

                observation = newObservation

                if done:
                    lastScores[lastScoresIndex] = episode_step
                    lastScoresIndex += 1
                    if lastScoresIndex >= save_interval:
                        lastFilled = True
                        lastScoresIndex = 0
                    m, s = divmod(int(time.time() - start_time), 60)
                    h, m = divmod(m, 60)
                    if not lastFilled:
                        print("EP " + str(epoch) + " - " +
                              format(episode_step + 1) +
                              " Episode steps   Exploration=" +
                              str(round(explorationRate, 2)))
                    else:
                        print("EP " + str(epoch) + " - " +
                              format(episode_step + 1) +
                              " Episode steps - last Steps : " +
                              str((sum(lastScores) / len(lastScores))) +
                              " - Cumulated R: " + str(cumulated_reward) +
                              "   Eps=" + str(round(explorationRate, 2)) +
                              "     Time: %d:%02d:%02d" % (h, m, s))

                        if (epoch % save_interval) == 0:
                            #save model weights and monitoring data every save_interval epochs.
                            deepQ.saveModel(path + str(epoch) + '.h5')
                            env._flush()
                            copy_tree(outdir, path + str(epoch))
                            #save simulation parameters.
                            parameter_keys = [
                                'save_interval', 'epochs', 'steps',
                                'updateTargetNetwork', 'explorationRate',
                                'minibatch_size', 'learnStart', 'learningRate',
                                'discountFactor', 'memorySize',
                                'network_inputs', 'network_outputs',
                                'network_structure', 'current_epoch',
                                'epsilon_decay', 'vel_max_x', 'vel_min_x',
                                'vel_max_z'
                            ]
                            parameter_values = [
                                save_interval, epochs, steps,
                                updateTargetNetwork, explorationRate,
                                minibatch_size, learnStart, learningRate,
                                discountFactor, memorySize, network_inputs,
                                network_outputs, network_structure, epoch,
                                epsilon_decay, vel_max_x, vel_min_x, vel_max_z
                            ]
                            parameter_dictionary = dict(
                                zip(parameter_keys, parameter_values))
                            with open(path + str(epoch) + '.json',
                                      'w') as outfile:
                                json.dump(parameter_dictionary, outfile)

                    with open(resultpath, mode='a') as f:
                        f.write(
                            str(epoch) + "," + format(episode_step + 1) + "," +
                            str(cumulated_reward) + "," +
                            str(round(explorationRate, 2)) + "," +
                            "%d:%02d:%02d" % (h, m, s) + "\n")

                stepCounter += 1
                if stepCounter % updateTargetNetwork == 0:
                    deepQ.updateTargetNetwork()
                    print("updating target network")

                episode_step += 1

            if self.runMode == 'train':
                explorationRate *= epsilon_decay  #epsilon decay
                # explorationRate -= (2.0/epochs)
                explorationRate = max(0.05, explorationRate)

            if self.runMode == 'test':
                break

        env.close()
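For orientation, a sketch of the parameter dictionary that the save block above writes to <path><epoch>.json and that the resume branch reads back. The key names come from parameter_keys in the snippet; the concrete values are placeholders (the network and learning values echo defaults seen in the other examples, the velocity limits are made up):

# Illustrative only: keys mirror parameter_keys above; values are placeholders.
example_checkpoint = {
    'save_interval': 100, 'epochs': 10000, 'steps': 100,
    'updateTargetNetwork': 10000, 'explorationRate': 0.42,
    'minibatch_size': 64, 'learnStart': 64, 'learningRate': 0.00025,
    'discountFactor': 0.99, 'memorySize': 1000000,
    'network_inputs': 3, 'network_outputs': 21, 'network_structure': [300, 300],
    'current_epoch': 1200, 'epsilon_decay': 0.995,
    'vel_max_x': 0.5, 'vel_min_x': -0.2, 'vel_max_z': 1.0,
}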
Example #4
def train(env,
          algorithm,
          params=None,
          load=False,
          loadpath=None,
          loaditer=None):

    if algorithm == ppo:
        assert args.gym_env == "DubinsCarEnv-v0" or args.gym_env == "PlanarQuadEnv-v0"

        # Initialize policy
        ppo.create_session()
        init_policy = ppo.create_policy('pi', env)
        ppo.initialize()

        if load and loadpath is not None and loaditer is not None:
            # load trained policy
            pi = init_policy
            pi.load_model(loadpath, iteration=loaditer)
            pi.save_model(MODEL_DIR, iteration=0)
        else:
            # init policy
            pi = init_policy
            pi.save_model(MODEL_DIR, iteration=0)

        # init params
        with open(params) as params_file:
            d = json.load(params_file)
            num_iters = d.get('num_iters')
            num_ppo_iters = d.get('num_ppo_iters')
            timesteps_per_actorbatch = d.get('timesteps_per_actorbatch')
            clip_param = d.get('clip_param')
            entcoeff = d.get('entcoeff')
            optim_epochs = d.get('optim_epochs')
            optim_stepsize = d.get('optim_stepsize')
            optim_batchsize = d.get('optim_batchsize')
            gamma = d.get('gamma')
            lam = d.get('lam')
            max_iters = num_ppo_iters

        # record performance data
        overall_perf = list()
        ppo_reward = list()
        ppo_length = list()
        suc_percents = list()
        wall_clock_time = list()

        best_suc_percent = 0
        best_pi = None
        perf_flag = False

        eval_ppo_reward = list()
        eval_suc_percents = list()
        # index for num_iters loop
        i = 1
        while i <= num_iters:
            wall_clock_time.append(time())
            logger.info('overall training iteration %d' % i)
            # each learning step contains "num_ppo_iters" ppo-learning steps.
            # each ppo-learning steps == ppo-learning on single episode
            # each single episode is a single markov chain which contains many states, actions, rewards.
            pi, ep_mean_length, ep_mean_reward, suc_percent = algorithm.ppo_learn(
                env=env,
                policy=pi,
                timesteps_per_actorbatch=timesteps_per_actorbatch,
                clip_param=clip_param,
                entcoeff=entcoeff,
                optim_epochs=optim_epochs,
                optim_stepsize=optim_stepsize,
                optim_batchsize=optim_batchsize,
                gamma=gamma,
                lam=lam,
                max_iters=max_iters,
                schedule='constant')

            ppo_length.extend(ep_mean_length)
            ppo_reward.extend(ep_mean_reward)
            suc_percents.append(suc_percent)

            # perf_metric = evaluate()
            # overall_perf.append(perf_metric)
            # print('[Overall Iter %d]: perf_metric = %.2f' % (i, perf_metric))

            pi.save_model(MODEL_DIR, iteration=i)
            plot_performance(range(len(ppo_reward)),
                             ppo_reward,
                             ylabel=r'avg reward per ppo-learning step',
                             xlabel='ppo iteration',
                             figfile=os.path.join(FIGURE_DIR, 'ppo_reward'),
                             title='TRAIN')
            plot_performance(
                range(len(suc_percents)),
                suc_percents,
                ylabel=r'overall success percentage per algorithm step',
                xlabel='algorithm iteration',
                figfile=os.path.join(FIGURE_DIR, 'success_percent'),
                title="TRAIN")

            # --- for plotting evaluation perf on success rate using early stopping trick ---
            logger.record_tabular('suc_percent', suc_percent)
            logger.record_tabular('best_suc_percent', best_suc_percent)
            logger.record_tabular('perf_flag', perf_flag)
            logger.dump_tabular()
            if suc_percent > best_suc_percent:
                best_suc_percent = suc_percent
                best_pi = pi
            if suc_percent > 0.6:
                perf_flag = True
            if not perf_flag:
                # less timesteps_per_actorbatch to make eval faster.
                _, _, eval_ep_mean_reward, eval_suc_percent = algorithm.ppo_eval(
                    env,
                    pi,
                    timesteps_per_actorbatch // 2,
                    max_iters=5,
                    stochastic=False)
            else:
                _, _, eval_ep_mean_reward, eval_suc_percent = algorithm.ppo_eval(
                    env,
                    best_pi,
                    timesteps_per_actorbatch // 2,
                    max_iters=5,
                    stochastic=False)
            eval_ppo_reward.extend(eval_ep_mean_reward)
            eval_suc_percents.append(eval_suc_percent)

            plot_performance(range(len(eval_ppo_reward)),
                             eval_ppo_reward,
                             ylabel=r'avg reward per ppo-eval step',
                             xlabel='ppo iteration',
                             figfile=os.path.join(FIGURE_DIR,
                                                  'eval_ppo_reward'),
                             title='EVAL')
            plot_performance(
                range(len(eval_suc_percents)),
                eval_suc_percents,
                ylabel=r'overall eval success percentage per algorithm step',
                xlabel='algorithm iteration',
                figfile=os.path.join(FIGURE_DIR, 'eval_success_percent'),
                title="EVAL")
            # -------------------------------------------------------------------------------

            # save data which is accumulated UNTIL iter i
            with open(
                    RESULT_DIR + '/ppo_length_' + 'iter_' + str(i) + '.pickle',
                    'wb') as f1:
                pickle.dump(ppo_length, f1)
            with open(
                    RESULT_DIR + '/ppo_reward_' + 'iter_' + str(i) + '.pickle',
                    'wb') as f2:
                pickle.dump(ppo_reward, f2)
            with open(
                    RESULT_DIR + '/success_percent_' + 'iter_' + str(i) +
                    '.pickle', 'wb') as fs:
                pickle.dump(suc_percents, fs)
            with open(
                    RESULT_DIR + '/wall_clock_time_' + 'iter_' + str(i) +
                    '.pickle', 'wb') as ft:
                pickle.dump(wall_clock_time, ft)

            # save evaluation data accumulated until iter i
            with open(
                    RESULT_DIR + '/eval_ppo_reward_' + 'iter_' + str(i) +
                    '.pickle', 'wb') as f_er:
                pickle.dump(eval_ppo_reward, f_er)
            with open(
                    RESULT_DIR + '/eval_success_percent_' + 'iter_' + str(i) +
                    '.pickle', 'wb') as f_es:
                pickle.dump(eval_suc_percents, f_es)

            # Incrementing our algorithm's loop counter
            i += 1

        # plot_performance(range(len(overall_perf)), overall_perf, ylabel=r'overall performance per algorithm step',
        #                  xlabel='algorithm iteration',
        #                  figfile=os.path.join(FIGURE_DIR, 'overall_perf'))

        # overall, we need plot the time-to-reach for the best policy so far.

        env.close()

        return pi

    elif algorithm == deepq:
        assert args.gym_env == "DubinsCarEnv_dqn-v0" or args.gym_env == "PlanarQuadEnv_dqn-v0"
        # DQN training branch: set up checkpoint paths and hyperparameters.
        tmp_path = MODEL_DIR + '/ep'

        continue_execution = False
        resume_epoch = '200'
        resume_path = tmp_path + resume_epoch
        weights_path = resume_path + '.h5'
        params_json = resume_path + '.json'

        # Placeholders; filled in either from the defaults below or from the saved JSON.
        epochs = steps = updateTargetNetwork = explorationRate = minibatch_size = None
        learnStart = learningRate = discountFactor = memorySize = None
        network_inputs = network_outputs = network_structure = current_epoch = None

        if not continue_execution:
            # Each time we take a sample and update our weights it is called a mini-batch.
            # Each time we run through the entire dataset, it's called an epoch.
            epochs = 1000
            steps = 1000
            updateTargetNetwork = 10000
            explorationRate = 1
            minibatch_size = 128
            learnStart = 64
            learningRate = 0.00025
            discountFactor = 0.99
            memorySize = 1000000
            network_inputs = env.state_dim + env.num_lasers
            # network_outputs = 21
            network_outputs = 25
            network_structure = [300, 300]
            current_epoch = 0

            deepQ = deepq.DeepQ(network_inputs, network_outputs, memorySize,
                                discountFactor, learningRate, learnStart)
            deepQ.initNetworks(network_structure)
        else:
            # Load weights and parameter info.
            with open(params_json) as outfile:
                d = json.load(outfile)
                epochs = d.get('epochs')
                steps = d.get('steps')
                updateTargetNetwork = d.get('updateTargetNetwork')
                explorationRate = d.get('explorationRate')
                minibatch_size = d.get('minibatch_size')
                learnStart = d.get('learnStart')
                learningRate = d.get('learningRate')
                discountFactor = d.get('discountFactor')
                memorySize = d.get('memorySize')
                network_inputs = d.get('network_inputs')
                network_outputs = d.get('network_outputs')
                network_structure = d.get('network_structure')
                current_epoch = d.get('current_epoch')

            deepQ = deepq.DeepQ(network_inputs, network_outputs, memorySize,
                                discountFactor, learningRate, learnStart)
            deepQ.initNetworks(network_structure)

            deepQ.loadWeights(weights_path)

        env._max_episode_steps = steps
        last100Scores = [0] * 100
        last100ScoresIndex = 0
        last100Filled = False
        stepCounter = 0
        highest_reward = 0

        start_time = time()

        # start iterating from 'current epoch'.
        for epoch in np.arange(current_epoch + 1, epochs + 1, 1):
            observation = env.reset()
            cumulated_reward = 0
            done = False
            episode_step = 0

            # run until env returns done
            while not done:
                # env.render()
                qValues = deepQ.getQValues(observation)

                action = deepQ.selectAction(qValues, explorationRate)

                newObservation, reward, done, suc, info = env.step(action)

                cumulated_reward += reward
                if highest_reward < cumulated_reward:
                    highest_reward = cumulated_reward

                deepQ.addMemory(observation, action, reward, newObservation,
                                done)

                if stepCounter >= learnStart:
                    if stepCounter <= updateTargetNetwork:
                        deepQ.learnOnMiniBatch(minibatch_size, False)
                    else:
                        deepQ.learnOnMiniBatch(minibatch_size, True)

                observation = newObservation

                if done:
                    last100Scores[last100ScoresIndex] = episode_step
                    last100ScoresIndex += 1
                    if last100ScoresIndex >= 100:
                        last100Filled = True
                        last100ScoresIndex = 0
                    if not last100Filled:
                        print("EP " + str(epoch) + " - " +
                              format(episode_step + 1) + "/" + str(steps) +
                              " Episode steps   Exploration=" +
                              str(round(explorationRate, 2)))
                    else:
                        m, s = divmod(int(time() - start_time), 60)
                        h, m = divmod(m, 60)
                        print("EP " + str(epoch) + " - " +
                              format(episode_step + 1) + "/" + str(steps) +
                              " Episode steps - last100 Steps : " +
                              str((sum(last100Scores) / len(last100Scores))) +
                              " - Cumulated R: " + str(cumulated_reward) +
                              "   Eps=" + str(round(explorationRate, 2)) +
                              "     Time: %d:%02d:%02d" % (h, m, s))
                        if epoch % 100 == 0:
                            # save model weights and monitoring data every 100 epochs.
                            deepQ.saveModel(tmp_path + str(epoch) + '.h5')
                            # save simulation parameters.
                            # convert from numpy int64 type to python int type for json serialization
                            epoch = int(epoch)
                            parameter_keys = [
                                'epochs', 'steps', 'updateTargetNetwork',
                                'explorationRate', 'minibatch_size',
                                'learnStart', 'learningRate', 'discountFactor',
                                'memorySize', 'network_inputs',
                                'network_outputs', 'network_structure',
                                'current_epoch'
                            ]
                            parameter_values = [
                                epochs, steps, updateTargetNetwork,
                                explorationRate, minibatch_size, learnStart,
                                learningRate, discountFactor, memorySize,
                                network_inputs, network_outputs,
                                network_structure, epoch
                            ]
                            parameter_dictionary = dict(
                                zip(parameter_keys, parameter_values))
                            with open(tmp_path + str(epoch) + '.json',
                                      'w') as outfile:
                                json.dump(parameter_dictionary, outfile)

                stepCounter += 1
                if stepCounter % updateTargetNetwork == 0:
                    deepQ.updateTargetNetwork()
                    print("updating target network")

                episode_step += 1

            explorationRate *= 0.995  # epsilon decay
            # explorationRate -= (2.0/epochs)
            explorationRate = max(0.05, explorationRate)

        env.close()
        return 1

    else:
        raise ValueError("Please input a valid algorithm")
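A hedged driver sketch for the train function above. It assumes args.gym_env was parsed elsewhere (the assert in the ppo branch requires DubinsCarEnv-v0 or PlanarQuadEnv-v0) and that the params JSON contains the keys read at the top of the function; the file name below is illustrative:

# Hypothetical driver; 'ppo_params.json' is a placeholder file name.
import gym

env = gym.make(args.gym_env)   # e.g. "DubinsCarEnv-v0", matching the assert in the ppo branch
trained_pi = train(env, ppo, params='ppo_params.json', load=False)
# train() closes the environment itself, so no further cleanup is needed here.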
Example #5
        epochs = 350
        steps = 1000
        updateTargetNetwork = 5000
        explorationRate = 1
        minibatch_size = 32
        learnStart = 64
        learningRate = 0.00025
        discountFactor = 0.99
        memorySize = 100000
        network_inputs = 9 + 3 * uav_count
        network_outputs = 7
        network_structure = [50, 50]
        current_epoch = 0

        # init and create deepQ
        leader_deepQ = deepq.DeepQ(network_inputs, network_outputs, memorySize,
                                   discountFactor, learningRate, learnStart)
        leader_deepQ.initNetworks(network_structure)

        follower_deepQ = deepq.DeepQ((network_inputs - 2), network_outputs,
                                     memorySize, discountFactor, learningRate,
                                     learnStart)
        follower_deepQ.initNetworks(network_structure)

        leader_deepQ.memory.liveplot = plotter

    # Load weights, monitor info, and parameter info.
    else:
        with open(params_json) as outfile:
            d = json.load(outfile)
            epochs = d.get('epochs')
            steps = d.get('steps')