Example #1
    def __init__(self, row_sz: int, col_sz: int):
        self.grid = Grid(row_sz, col_sz)
        self.policy = Policy(env.ACTIONS, env.ETA, env.GAMMA, env.EPSILON)
        self.robby = Agent(self.grid, np.random.randint(0, row_sz),
                           np.random.randint(0, col_sz), self.policy)
        self.epoch = 1
        self.rewards_per_episode = []
Example #2
class Visualizer(object):
    def __init__(self, game, network, train_directory):
        self.game = game
        env_name = '%sNoFrameskip-v4' % game
        env = gym.make(env_name)
        # env = gym.wrappers.Monitor(env, '/tmp/temp_%s' % game, mode='evaluation', force=True)

        vb_file = os.path.join(train_directory, "vb.npy")
        vb = np.load(vb_file)
        parameters_file = 'parameters_81'

        self.policy = Policy(env, network, "relu")

        parameters_path = os.path.join(train_directory, parameters_file)
        print('Using parameters file %s \n' % parameters_path)

        with open(parameters_path, 'rb') as f:
            parameters = pickle.load(f)['parameters']

        self.policy.set_parameters(parameters)
        self.policy.set_vb(vb)

    def play_game(self):
        rews = [0]*100
        for i in range(100):
            rew, step = self.policy.rollout()
            rews[i] = rew
        print(np.mean(rews))
        print(np.max(rews))
        print(rews)
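
A minimal driver for the Visualizer above could look like the sketch below; it assumes the class and its Policy dependency are importable, and the game name, network name, and training directory are placeholders rather than values taken from the original example.

if __name__ == '__main__':
    # Hypothetical invocation: 'Breakout' expands to 'BreakoutNoFrameskip-v4',
    # and the directory must contain vb.npy and the 'parameters_81' file.
    vis = Visualizer('Breakout', 'Nature', './logs/breakout_run')
    vis.play_game()  # prints the mean, the max and the full list of 100 rollout rewards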
Example #3
class Visualizer(object):
    def __init__(self, game, network, train_directory):
        self.game = game
        env_name = '%sNoFrameskip-v4' % game
        env = gym.make(env_name)
        env = gym.wrappers.Monitor(env,
                                   '/tmp/temp_%s' % game,
                                   mode='evaluation',
                                   force=True)

        vb_file = os.path.join(train_directory, "vb.npy")
        vb = np.load(vb_file)
        parameters_file = sorted(os.listdir(train_directory))[-3]

        self.policy = Policy(env, network, "elu")

        parameters_path = os.path.join(train_directory, parameters_file)
        print('Using parameters file %s \n' % parameters_path)

        with open(parameters_path, 'rb') as f:
            parameters = pickle.load(f)['params']

        self.policy.set_parameters(parameters)
        self.policy.set_vb(vb)

    def play_game(self):
        print(self.policy.rollout(render=True))
Example #4
def main():
    args = parse_args()
    args.rom_path = args.rom_path + utils.game_file(args.game_name)

    print(args)

    if args.mode == 0:
        drrn = DRRN(args)
        drrn.train(batch_size=args.batch_size, epochs=args.qnet_iter)

    elif args.mode == 1:
        policy = Policy(args)
        policy.train(epochs=args.policy_epoch)

    else:
        print("Argument Error!!")
Example #5
    def __init__(self, game, network, train_directory):
        self.game = game
        env_name = '%sNoFrameskip-v4' % game
        env = gym.make(env_name)
        # env = gym.wrappers.Monitor(env, '/tmp/temp_%s' % game, mode='evaluation', force=True)

        vb_file = os.path.join(train_directory, "vb.npy")
        vb = np.load(vb_file)
        parameters_file = 'parameters_81'

        self.policy = Policy(env, network, "relu")

        parameters_path = os.path.join(train_directory, parameters_file)
        print('Using parameters file %s \n' % parameters_path)

        with open(parameters_path, 'rb') as f:
            parameters = pickle.load(f)['parameters']

        self.policy.set_parameters(parameters)
        self.policy.set_vb(vb)
Example #6
def main():
    args = parse_args()
    print(args)

    args.rom_path = args.rom_path + utils.game_file(args.game_name)
    data_path = args.data_path.replace('GAME', args.game_name)

    if args.seed is None:
        import random
        args.seed = random.randint(0, 1000)

    np.random.seed(args.seed)

    import sentencepiece as spm
    sp = spm.SentencePieceProcessor()
    sp.Load('spm_models/unigram_8k.model')

    log_dir = data_path + '/%s_trial_%d/round_%d/' % (args.uct_type,
                                                      args.trial, args.round)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    env = JerichoEnv(args.rom_path, args.seed, args.env_step_limit)
    env.create()
    visited_transitions = []

    ob, info = env.reset()

    done = False
    cum_reward = info['score']
    step = 0

    if args.load_cache:
        try:
            valid_action_dict = np.load('cache/%s_valid_action_dict.npy' %
                                        args.game_name,
                                        allow_pickle=True)[()]
        except EOFError:
            print("EOFError: skip loading cache..")
            valid_action_dict = None
        except OSError:
            print("OSError: skip loading cache..")
            valid_action_dict = None

    else:
        valid_action_dict = None

    actions_info = None

    prev_action = '<START>'

    if args.round == 0:
        policy = None
    elif args.round > 0:
        policy = Policy(args)
        policy.load_weights(
            'weights/%s/round_%s/%s_weight_policy_best_seed%d.pickle' %
            (args.game_name, args.round - 1, args.uct_type, args.trial))
        args.load_path = 'weights/%s/round_%s/%s_weight_q_best_seed%d.pickle' % (
            args.game_name, args.round - 1, args.uct_type, args.trial)
    else:
        raise NotImplementedError

    import time
    start = time.time()

    log_file = log_dir + 'mcts_log_d%02d_s%d_e%d_%02d.txt'\
               % (args.max_depth, args.simulation_per_act, args.exploration_constant, args.seed)
    data = open(log_file, 'w')
    replay_buffer_filename = log_dir + 'mcts_replay_d%02d_%02d.txt' % (
        args.max_depth, args.seed)
    replay_buffer_file = open(replay_buffer_filename, 'w')

    for cur_depth in range(args.max_episode_len):
        agent = MCTSAgent(args,
                          env.copy(),
                          policy,
                          uct_type=args.uct_type,
                          valid_action_dict=valid_action_dict,
                          actions_info=actions_info,
                          log_dir=log_dir,
                          visited_transitions=visited_transitions,
                          replay_file=replay_buffer_file)
        prev_action_str = '[PREV_ACTION] ' + prev_action + '\n'
        root_node, action, visited_transitions = agent.search(
            ob, info, cur_depth)

        data.write('#######################################################\n')
        state_str = '[OBS] ' + ob + '\n' + '[LOOK] ' + info[
            'look'] + '\n' + '[INV] ' + info['inv'] + '\n'
        valid_action_strs = [
            '[VALID_ACTION] ' + valid + '\n' for valid in info['valid']
        ]
        action_str = '[ACTION] ' + action + '\n'

        data.write(state_str)
        for valid_action_str in valid_action_strs:
            data.write(valid_action_str)
        data.write(action_str)
        data.write(prev_action_str)

        ob, reward, done, info = env.step(action)

        cum_reward += reward
        score = info['score']
        step += 1

        next_ob_text = ob + info['look'] + info['inv']

        if '*** You have won ***' in next_ob_text or '*** You have died ***' in next_ob_text:
            score = int(
                next_ob_text.split('you scored ')[1].split(' out of')[0])
            reward = score - cum_reward

        data.write('Reward: %d, Cum_reward: %d \n' % (reward, score))

        for action_node in root_node.children:
            data.write('%s Q_val: %f Q_hat: %f count: %d \n' %
                       (action_node.action, action_node.Q, action_node.Q_hat,
                        action_node.N))

        prev_action = action

        print('##########################')
        print('STEP: %s' % step)
        print(root_node.state)
        print()
        print('BEST_ACTION: ', action)
        print()
        print('Valid actions:',
              [action.action for action in root_node.children])
        print('Q-values', [action.Q for action in root_node.children])
        print('Q-hat', [action.Q_hat for action in root_node.children])
        print('Final Q',
              [action.Q + action.Q_hat for action in root_node.children])
        print('Maximum Q', [
            0 if len(action.Rs) == 0 else max(action.Rs)
            for action in root_node.children
        ])
        print('Count of actions', [action.N for action in root_node.children])
        print('Action Probs:', [prob for prob in root_node.children_probs])
        print()
        print('Reward: %s, CUM_Reward: %s' % (reward, score))
        print()
        print(ob + info['look'] + info['inv'])
        print(flush=True)

        valid_action_dict = agent.valid_action_dict
        actions_info = [agent.actions, agent.actions_e]

        if args.save_cache:
            np.save('cache/%s_valid_action_dict.npy' % args.game_name,
                    valid_action_dict)

        if '*** You have won ***' in next_ob_text or '*** You have died ***' in next_ob_text:
            break

    print('TOTAL TIME: ', time.time() - start)
    data.close()
    replay_buffer_file.close()
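
The cache handling above relies on a small NumPy idiom: np.save wraps a plain dict in a 0-d object array, and indexing the loaded array with [()] unwraps it again. A standalone illustration (the file name is arbitrary):

import os
import numpy as np

os.makedirs('cache', exist_ok=True)
valid_action_dict = {'west of house': ['open mailbox', 'go north']}
np.save('cache/demo_valid_action_dict.npy', valid_action_dict)
restored = np.load('cache/demo_valid_action_dict.npy', allow_pickle=True)[()]
assert restored == valid_action_dict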
Example #7
def main(ep_per_cpu, game, configuration_file, run_name):
    start_time = time.time()

    with open(configuration_file, 'r') as f:
        configuration = json.loads(f.read())

    env_name = '%sNoFrameskip-v4' % game

    # MPI stuff
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    cpus = comm.Get_size()

    # One cpu (rank 0) will evaluate results
    train_cpus = cpus - 1
    k = 10
    epoch = 5
    m = 1

    # Deduce population size
    lam = train_cpus * ep_per_cpu

    # Create environment
    env = gym.make(env_name)

    # Create policy (Deep Neural Network)
    # Internally it applies preprocessing to the environment state
    policy = Policy(env,
                    network=configuration['network'],
                    nonlin_name=configuration['nonlin_name'])

    # Create reference batch used for normalization
    # It will be overwritten with vb from worker with rank 0
    vb = policy.get_vb()

    # Extract vector with current parameters.
    # the parameter vector has length 1693380
    parameters = policy.get_parameters()
    shape = policy.parameter_shapes

    # Send parameters from worker 0 to all workers (MPI stuff)
    # to ensure that every worker starts in the same position

    #comm.Bcast([parameters, MPI.FLOAT], root=0)
    comm.Bcast([vb, MPI.FLOAT], root=0)

    # Create optimizer with user defined settings (hyperparameters)
    OptimizerClass = optimizer_dict[configuration['optimizer']]
    optimizer = OptimizerClass(train_cpus, parameters, shape, lam, rank,
                               configuration["settings"], epoch, m)

    # Set the same virtual batch for each worker
    if rank != 0:
        policy.set_vb(vb)
        rews = [0]
        e_r = 0
        p = optimizer.get_parameters()
        policy.set_parameters(p)
        for j in range(k):
            e_rew, e_len = policy.rollout()
            e_r += e_rew
        rews[0] = e_r / k
        optimizer.rew = e_r / k
        msg = np.array(rews)
        pp = p
    # Only rank 0 worker will log information from the training
    logger = Logger(
        optimizer.log_path(game, configuration['network'], run_name))
    if rank == 0:
        # Initialize logger, save virtual batch and save some basic stuff at the beginning
        logger.save_vb(vb)
        logger.log('Game'.ljust(25) + '%s' % game)
        logger.log('Network'.ljust(25) + '%s' % configuration['network'])
        logger.log('Optimizer'.ljust(25) + '%s' % configuration['optimizer'])
        logger.log('Number of CPUs'.ljust(25) + '%d' % cpus)
        logger.log('Population'.ljust(25) + '%d' % lam)
        logger.log('Dimensionality'.ljust(25) + '%d' % len(parameters))

        # Log basic info from the optimizer
        # optimizer.log_basic(logger)
        msg = np.zeros(1)
        pp = np.zeros(optimizer.n)
    results = np.empty((cpus, 1))
    ppp = np.empty((cpus, optimizer.n))
    comm.Allgather([msg, MPI.FLOAT], [results, MPI.FLOAT])
    comm.Allgather([pp, MPI.FLOAT], [ppp, MPI.FLOAT])
    results = results[1:, :]
    ppp = ppp[1:, :].flatten()
    rews = results[:, :1].flatten()
    BestScore = max(rews)
    Bestid = np.argmax(rews)
    BestFound = ppp[Bestid * optimizer.n:(Bestid + 1) * optimizer.n]
    if rank == 0:
        logger.log('Best'.ljust(25) + '%f' % BestScore)

    # We will count number of steps
    # frames = 4 * steps
    steps_passed = 0
    iteration = 1
    while steps_passed <= 25000000:
        # Iteration start time
        iter_start_time = time.time()
        if iteration % epoch == 1:
            optimizer.sigupdatelist = np.zeros(optimizer.n)
        llambda = np.random.normal(1, 0.1 - 0.1 * steps_passed / 25000000)
        # Workers that run train episodes
        optimizer.RandomGrouping()
        for ii in range(m):
            optimizer.groupnum = ii
            if rank != 0:
                # Empty arrays for each episode. We save: length, reward, noise index
                lens1 = [0]
                rews1 = [0]
                orew = [0]
                # sig1 = [0]
                # For each episode in this CPU we get new parameters,
                # update policy network and perform policy rollout
                e_r = 0
                e_l = 0
                p = optimizer.get_parameters1()
                policy.set_parameters(p)
                for j in range(k):
                    e_rew, e_len = policy.rollout()
                    e_r += e_rew
                    e_l += e_len
                lens1[0] = e_l
                rews1[0] = e_r / k
                optimizer.rew1 = e_r / k
                orew[0] = optimizer.rew
                if iteration % epoch == 1:
                    sig1 = optimizer.sigmalist
                # Aggregate information, will later send it to each worker using MPI
                msg1 = np.array(rews1 + lens1 + orew, dtype=np.float64)
                pp1 = optimizer.parameters1
                if iteration % epoch == 1:
                    sigmsg1 = sig1
            # Worker rank 0 that runs evaluation episodes
            else:
                # Empty array, evaluation results are not used for the update
                msg1 = np.zeros(3, dtype=np.float64)
                pp1 = optimizer.parameters
                if iteration % epoch == 1:
                    sigmsg1 = np.zeros(optimizer.n)
            # MPI stuff
            # Initialize array which will be updated with information from all workers using MPI
            results1 = np.empty((cpus, 3), dtype=np.float64)
            ppp1 = np.empty((cpus, optimizer.n))
            if iteration % epoch == 1:
                sigmsgs1 = np.empty((cpus, optimizer.n))
            comm.Allgather([msg1, MPI.FLOAT], [results1, MPI.FLOAT])
            comm.Allgather([pp1, MPI.FLOAT], [ppp1, MPI.FLOAT])
            if iteration % epoch == 1:
                comm.Allgather([sigmsg1, MPI.FLOAT], [sigmsgs1, MPI.FLOAT])
            ppp1 = ppp1[1:, :].flatten()
            if iteration % epoch == 1:
                sigmsgs1 = sigmsgs1[1:, :].flatten()
            # Skip empty evaluation results from worker with id 0
            results1 = results1[1:, :]
            # Extract IDs and rewards
            rews1 = results1[:, :1].flatten()
            lens1 = results1[:, 1:2].flatten()
            oreward = results1[:, 2:].flatten()
            newBestidx = np.argmax(rews1)

            if np.max(rews1) > BestScore:
                BestScore = rews1[newBestidx]
                BestFound = ppp1[newBestidx * optimizer.n:(newBestidx + 1) *
                                 optimizer.n]
            # update parameters, sigmas, rews
            if rank != 0:
                optimizer.update(ppp, BestScore, sigmsgs1, llambda)
            # Steps passed = Sum of episode steps from all offsprings
            steps = np.sum(lens1)
            steps_passed += steps
        # Write some logs for this iteration
        # Using logs we are able to recover solution saved
        # after 1 hour of training or after 1 billion frames
        if rank == 0:
            eval_mean_rew = np.mean(oreward)
            eval_mean_rew1 = np.mean(rews1)
            iteration_time = (time.time() - iter_start_time)
            time_elapsed = (time.time() - start_time) / 60
            logger.log('------------------------------------')
            logger.log('Iteration'.ljust(25) + '%f' % iteration)
            logger.log('EvalMeanReward'.ljust(25) + '%f' % eval_mean_rew)
            logger.log('EvalMeanReward1'.ljust(25) + '%f' % eval_mean_rew1)
            logger.log('StepsThisIter'.ljust(25) + '%f' % steps)
            logger.log('StepsSinceStart'.ljust(25) + '%f' % steps_passed)
            logger.log('IterationTime'.ljust(25) + '%f' % iteration_time)
            logger.log('TimeSinceStart'.ljust(25) + '%f' % time_elapsed)
            logger.log('Best'.ljust(25) + '%f' % BestScore)
            # Give optimizer a chance to log its own stuff
            # optimizer.log(logger)
            logger.log('------------------------------------')
            if iteration % 20 == 1:
                fin_rews = 0
                p = BestFound
                policy.set_parameters(p)
                for i in range(30):
                    e_rew, e_len = policy.rollout()
                    fin_rews += e_rew
                fin_eval = fin_rews / 30
            else:
                fin_eval = 0
            # Write stuff for training curve plot
            stat_string = "{},\t{},\t{},\t{}\n".\
                format(steps_passed, (time.time()-start_time),
                        eval_mean_rew1,  fin_eval)
            logger.write_general_stat(stat_string)
            logger.write_optimizer_stat(optimizer.stat_string())
            # Save currently proposed solution every 20 iterations
            if iteration % 20 == 1:
                logger.save_parameters(BestFound, iteration)
        else:
            if iteration % epoch == 0:
                optimizer.updatesigma()
        iteration += 1
    #test best
    if rank == 0:
        final_rews = []
        p = BestFound
        policy.set_parameters(p)
        for i in range(200):
            e_rew, e_len = policy.rollout()
            final_rews.append(e_rew)
        final_eval = np.mean(final_rews)
        logger.log('Final'.ljust(25) + '%f' % final_eval)
        logger.save_parameters(BestFound, iteration)
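
After the Allgather calls above, ppp1 holds one flattened parameter vector per training worker, laid end to end, so the best candidate is recovered by slicing at multiples of optimizer.n. A toy NumPy-only illustration of that slicing (dimension and values are invented):

import numpy as np

n = 4                                  # toy parameter dimension
ppp = np.arange(12, dtype=np.float64)  # three workers' parameters, concatenated
rews = np.array([1.0, 5.0, 3.0])       # their average rewards
best_id = int(np.argmax(rews))         # -> 1
best_found = ppp[best_id * n:(best_id + 1) * n]
print(best_found)                      # [4. 5. 6. 7.]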
Example #8
class Problem:
    def __init__(self, row_sz: int, col_sz: int):
        self.grid = Grid(row_sz, col_sz)
        self.policy = Policy(env.ACTIONS, env.ETA, env.GAMMA, env.EPSILON)
        self.robby = Agent(self.grid, np.random.randint(0, row_sz),
                           np.random.randint(0, col_sz), self.policy)
        self.epoch = 1
        self.rewards_per_episode = []

    def run(self, n: int, m: int) -> (float, float):
        """
        Run a training session (runs the robot for m steps, n times) and create a plot, saved in src/training_plot.png.
        Then run a test session, returning the mean and standard deviation.
        :param n: The number of episodes
        :param m: The number of steps
        :return: mean, standard deviation
        """
        self._train(n, m)
        return self._test(n, m)

    def _train(self, episodes: int, steps: int):
        """
        Runs the robot for a certain number of steps (steps), and runs this a certain number of times (episodes).
        Each episode, the epsilon value is updated, a new grid is generated along with a starting position for the
        robot. The total reward accumulated per episode is tracked in the rewards_per_episode list. A training plot
        is then created and saved.
        """
        print("Beginning Training...\n")
        for i in range(episodes):
            reward_accumulated = self.run_n_steps(steps)
            self.policy.update_e()
            self.epoch += 1
            self.grid = Grid(env.GRID_BOUND, env.GRID_BOUND)
            self.robby = Agent(self.grid, np.random.randint(0, env.GRID_BOUND),
                               np.random.randint(0, env.GRID_BOUND),
                               self.policy)
            if self.epoch % 100 == 0:
                self.rewards_per_episode.append(reward_accumulated)
            print("Episode: {}, total reward for episode: {}".format(
                i + 1, reward_accumulated))
        self._create_training_plot()

    def _test(self, episodes: int, steps: int) -> (float, float):
        """
        Runs a test session where the epsilon value is fixed at a value, and the robot takes a certain number
        of steps (steps), for a certain number of episodes (episodes).  Total rewards per episode are tracked and
        the mean and standard deviation are returned.
        :param episodes:
        :param steps:
        """
        print("Beginning Test...\n")
        self.policy.reset_e()
        self.rewards_per_episode = []
        for i in range(episodes):
            self.grid = Grid(env.GRID_BOUND, env.GRID_BOUND)
            self.robby = Agent(self.grid, np.random.randint(0, env.GRID_BOUND),
                               np.random.randint(0, env.GRID_BOUND),
                               self.policy)
            reward_accumulated = self.run_n_steps(steps)
            self.rewards_per_episode.append(reward_accumulated)
            print("Episode: {}, total reward for episode: {}".format(
                i + 1, reward_accumulated))
        return self._calculate_test_mean_std()

    def run_n_steps(self, steps: int) -> int:
        """
        Run the robot for a certain number of steps.
        :param steps: number of steps to run the simulation
        :return: the reward accumulated
        """
        for i in range(steps):
            self.robby.take_action()
        return self.robby.total_reward

    def _create_training_plot(self):
        episodes = [x * 100 for x in range(1, 51)]
        plt.plot(episodes, self.rewards_per_episode, 'ro')
        plt.savefig('training_plot.png', bbox_inches='tight')

    def _calculate_test_mean_std(self) -> (float, float):
        mean = np.mean(self.rewards_per_episode)
        std = np.std(self.rewards_per_episode)
        return mean, std
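
A short usage sketch for the Problem class, assuming the env module provides GRID_BOUND and the hyperparameter constants read in __init__; the 5000 episodes of 200 steps below are placeholders chosen to match the 50 points (one every 100 episodes) that _create_training_plot expects:

# Hypothetical driver; grid size and episode/step counts are assumptions.
problem = Problem(10, 10)
mean, std = problem.run(5000, 200)
print("Test mean reward: {:.2f}, std: {:.2f}".format(mean, std))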
Example #9
def main(ep_per_cpu, game, configuration_file, run_name):
    start_time = time.time()

    with open(configuration_file, 'r') as f:
        configuration = json.loads(f.read())

    env_name = '%sNoFrameskip-v4' % game

    # MPI stuff
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    cpus = comm.Get_size()

    # One cpu (rank 0) will evaluate results
    train_cpus = cpus - 1

    # Deduce population size
    lam = train_cpus * ep_per_cpu

    # Create environment
    env = gym.make(env_name)

    # Create policy (Deep Neural Network)
    # Internally it applies preprocessing to the environment state
    policy = Policy(env,
                    network=configuration['network'],
                    nonlin_name=configuration['nonlin_name'])

    # Create reference batch used for normalization
    # It will be overwritten with vb from worker with rank 0
    vb = policy.get_vb()

    # Extract vector with current parameters.
    parameters = policy.get_parameters()

    # Send parameters from worker 0 to all workers (MPI stuff)
    # to ensure that every worker starts in the same position
    comm.Bcast([parameters, MPI.FLOAT], root=0)
    comm.Bcast([vb, MPI.FLOAT], root=0)

    # Set the same virtual batch for each worker
    if rank != 0:
        policy.set_vb(vb)

    # Create optimizer with user defined settings (hyperparameters)
    OptimizerClass = optimizer_dict[configuration['optimizer']]
    optimizer = OptimizerClass(parameters, lam, rank,
                               configuration["settings"])

    # Only rank 0 worker will log information from the training
    logger = None
    if rank == 0:
        # Initialize logger, save virtual batch and save some basic stuff at the beginning
        logger = Logger(
            optimizer.log_path(game, configuration['network'], run_name))
        logger.save_vb(vb)

        # Log basic stuff
        logger.log('Game'.ljust(25) + '%s' % game)
        logger.log('Network'.ljust(25) + '%s' % configuration['network'])
        logger.log('Optimizer'.ljust(25) + '%s' % configuration['optimizer'])
        logger.log('Number of CPUs'.ljust(25) + '%d' % cpus)
        logger.log('Population'.ljust(25) + '%d' % lam)
        logger.log('Dimensionality'.ljust(25) + '%d' % len(parameters))

        # Log basic info from the optimizer
        optimizer.log_basic(logger)

    # We will count number of steps
    # frames = 4 * steps (3 * steps for SpaceInvaders)
    steps_passed = 0
    while steps_passed <= 25000000:
        # Iteration start time
        iter_start_time = time.time()
        # Workers that run train episodes
        if rank != 0:
            # Empty arrays for each episode. We save: length, reward, noise index
            lens = [0] * ep_per_cpu
            rews = [0] * ep_per_cpu
            inds = [0] * ep_per_cpu

            # For each episode in this CPU we get new parameters,
            # update policy network and perform policy rollout
            for i in range(ep_per_cpu):
                ind, p = optimizer.get_parameters()
                policy.set_parameters(p)
                e_rew, e_len = policy.rollout()
                lens[i] = e_len
                rews[i] = e_rew
                inds[i] = ind

            # Aggregate information, will later send it to each worker using MPI
            msg = np.array(rews + lens + inds, dtype=np.int32)

        # Worker rank 0 that runs evaluation episodes
        else:
            rews = [0] * ep_per_cpu
            lens = [0] * ep_per_cpu
            for i in range(ep_per_cpu):
                ind, p = optimizer.get_parameters()
                policy.set_parameters(p)
                e_rew, e_len = policy.rollout()
                rews[i] = e_rew
                lens[i] = e_len

            eval_mean_rew = np.mean(rews)
            eval_max_rew = np.max(rews)

            # Empty array, evaluation results are not used for the update
            msg = np.zeros(3 * ep_per_cpu, dtype=np.int32)

        # MPI stuff
        # Initialize array which will be updated with information from all workers using MPI
        results = np.empty((cpus, 3 * ep_per_cpu), dtype=np.int32)
        comm.Allgather([msg, MPI.INT], [results, MPI.INT])

        # Skip empty evaluation results from worker with id 0
        results = results[1:, :]

        # Extract IDs and rewards
        rews = results[:, :ep_per_cpu].flatten()
        lens = results[:, ep_per_cpu:(2 * ep_per_cpu)].flatten()
        ids = results[:, (2 * ep_per_cpu):].flatten()

        # Update parameters
        optimizer.update(ids=ids, rewards=rews)

        # Steps passed = Sum of episode steps from all offsprings
        steps = np.sum(lens)
        steps_passed += steps

        # Write some logs for this iteration
        # Using logs we are able to recover solution saved
        # after 1 hour of training or after 1 billion frames
        if rank == 0:
            iteration_time = (time.time() - iter_start_time)
            time_elapsed = (time.time() - start_time) / 60
            train_mean_rew = np.mean(rews)
            train_max_rew = np.max(rews)
            logger.log('------------------------------------')
            logger.log('Iteration'.ljust(25) + '%f' % optimizer.iteration)
            logger.log('EvalMeanReward'.ljust(25) + '%f' % eval_mean_rew)
            logger.log('EvalMaxReward'.ljust(25) + '%f' % eval_max_rew)
            logger.log('TrainMeanReward'.ljust(25) + '%f' % train_mean_rew)
            logger.log('TrainMaxReward'.ljust(25) + '%f' % train_max_rew)
            logger.log('StepsSinceStart'.ljust(25) + '%f' % steps_passed)
            logger.log('StepsThisIter'.ljust(25) + '%f' % steps)
            logger.log('IterationTime'.ljust(25) + '%f' % iteration_time)
            logger.log('TimeSinceStart'.ljust(25) + '%f' % time_elapsed)

            # Give optimizer a chance to log its own stuff
            optimizer.log(logger)
            logger.log('------------------------------------')

            # Write stuff for training curve plot
            stat_string = "{},\t{},\t{},\t{},\t{},\t{}\n".\
                format(steps_passed, (time.time()-start_time),
                       eval_mean_rew, eval_max_rew, train_mean_rew, train_max_rew)
            logger.write_general_stat(stat_string)
            logger.write_optimizer_stat(optimizer.stat_string())
            # Save currently proposed solution every 20 iterations
            if optimizer.iteration % 20 == 1:
                logger.save_parameters(optimizer.parameters,
                                       optimizer.iteration)
    #test best
    if rank == 0:
        final_rews = []
        for i in range(200):
            indd, p = optimizer.get_parameters()
            policy.set_parameters(p)
            e_rew, e_len = policy.rollout()
            final_rews.append(e_rew)
        final_eval = np.mean(final_rews)
        logger.log('Final'.ljust(25) + '%f' % final_eval)
        logger.save_parameters(optimizer.parameters, optimizer.iteration)
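
The MPI bookkeeping in the loop above packs each worker's rewards, episode lengths and noise indices into a single flat int32 vector and slices them back out by column block after the Allgather. A NumPy-only sketch of that layout (worker count and numbers are made up):

import numpy as np

ep_per_cpu = 2
# Pretend three training workers each sent np.array(rews + lens + inds):
results = np.array([[10, 12, 500, 480, 7, 3],
                    [ 8, 15, 510, 470, 1, 9],
                    [11,  9, 495, 505, 4, 6]], dtype=np.int32)
rews = results[:, :ep_per_cpu].flatten()                  # [10 12  8 15 11  9]
lens = results[:, ep_per_cpu:(2 * ep_per_cpu)].flatten()  # [500 480 510 470 495 505]
inds = results[:, (2 * ep_per_cpu):].flatten()            # [7 3 1 9 4 6]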
Example #10
def main(ep_per_cpu, game, configuration_file, run_name):
    start_time = time.time()

    with open(configuration_file, 'r') as f:
        configuration = json.loads(f.read())

    env_name = '%sNoFrameskip-v4' % game

    # MPI stuff
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    cpus = comm.Get_size()

    # Meta Population
    meta_pop_size = 5
    meta_pop_active = list(range(meta_pop_size))
    next_havling_time = 40  # minutes until the next halving step


    mu_list = [5, 10, 20, 50, 100]

    # One cpu (rank 0) will evaluate results
    train_cpus = cpus - meta_pop_size

    # Deduce population size
    lam = train_cpus * ep_per_cpu

    # Create environment
    env = gym.make(env_name)

    # Create policy (Deep Neural Network)
    # Internally it applies preprocessing to the environment state
    policy = Policy(env, network=configuration['network'], nonlin_name=configuration['nonlin_name'])

    # Create reference batch used for normalization
    # It will be overwritten with vb from worker with rank 0
    vb = policy.get_vb()

    # Extract vector with current parameters.
    parameters_list = [policy.get_parameters() for count in range(meta_pop_size)]

    # Send parameters from worker 0 to all workers (MPI stuff)
    # to ensure that every worker starts in the same position
    for i in range(meta_pop_size):
        comm.Bcast([parameters_list[i], MPI.FLOAT], root=0)
    comm.Bcast([vb, MPI.FLOAT], root=0)

    # Set the same virtual batch for each worker
    if rank != 0:
        policy.set_vb(vb)

    if rank < meta_pop_size:
        parent_id = rank
        eval_moving_avg = 0
        forget_factor = 0.9

    if rank >= meta_pop_size:
        parent_id = int((rank-meta_pop_size)//(train_cpus/meta_pop_size))

    # Create optimizer with user defined settings (hyperparameters)
    OptimizerClass = optimizer_dict[configuration['optimizer']]
    optimizer = OptimizerClass(parameters_list, lam, rank, meta_pop_size, parent_id, mu_list[parent_id], configuration["settings"])

    # Only rank 0 worker will log information from the training
    logger = None
    if rank < meta_pop_size:    # TODO: Improve logger for meta pop
        # Initialize logger, save virtual batch and save some basic stuff at the beginning
        logger = Logger(optimizer.log_path(game, configuration['network'], run_name))
        if rank == 0:
            logger.save_vb(vb)

        # Log basic stuff
        logger.log('Game'.ljust(25) + '%s' % game, rank)
        logger.log('Network'.ljust(25) + '%s' % configuration['network'], rank)
        logger.log('Optimizer'.ljust(25) + '%s' % configuration['optimizer'], rank)
        logger.log('Number of CPUs'.ljust(25) + '%d' % cpus, rank)
        logger.log('Population'.ljust(25) + '%d' % lam, rank)
        logger.log('Dimensionality'.ljust(25) + '%d' % len(parameters_list[0]), rank)

        # Log basic info from the optimizer
        #optimizer.log_basic(logger)

    # We will count number of steps
    # frames = 4 * steps (3 * steps for SpaceInvaders)
    steps_passed = 0
    while True:
        # Iteration start time
        iter_start_time = time.time()
        # Workers that run train episodes
        if rank >= meta_pop_size:
            # Empty arrays for each episode. We save: length, reward, noise index
            lens = [0] * ep_per_cpu
            rews = [0] * ep_per_cpu
            inds = [0] * ep_per_cpu
            parent_id_arr = [0] * ep_per_cpu

            # For each episode in this CPU we get new parameters,
            # update policy network and perform policy rollout
            for i in range(ep_per_cpu):
                ind, p = optimizer.get_parameters()
                policy.set_parameters(p)
                e_rew, e_len = policy.rollout()
                lens[i] = e_len
                rews[i] = e_rew
                inds[i] = ind
                parent_id_arr[i] = parent_id


            # Aggregate information, will later send it to each worker using MPI
            msg = np.array(rews + lens + inds + parent_id_arr, dtype=np.int32)

        # Worker rank 0 that runs evaluation episodes
        else:
            rews = [0] * ep_per_cpu
            lens = [0] * ep_per_cpu
            for i in range(ep_per_cpu):
                ind, p = optimizer.get_parameters()
                policy.set_parameters(p)
                e_rew, e_len = policy.rollout()
                rews[i] = e_rew
                lens[i] = e_len

            eval_mean_rew = np.mean(rews)
            eval_max_rew = np.max(rews)
            print("real mean {}".format(eval_mean_rew))
            eval_moving_avg = eval_mean_rew + forget_factor*(eval_moving_avg-eval_mean_rew)
            print("mean eval for rank {} is {}".format(rank, eval_moving_avg))

            # Evaluation results are only used for the halving step; pad the
            # moving average into a full-size message so the Allgather below
            # receives the same number of elements from every rank
            msg = np.zeros(4 * ep_per_cpu, dtype=np.int32)
            msg[0] = eval_moving_avg

        # MPI stuff
        # Initialize array which will be updated with information from all workers using MPI
        results = np.empty((cpus, 4 * ep_per_cpu), dtype=np.int32)
        comm.Allgather([msg, MPI.INT], [results, MPI.INT])

        eval_results = results[:meta_pop_size, 0]

        # Skip empty evaluation results from worker with id 0
        results = results[meta_pop_size:, :]

        # Extract IDs and rewards
        rews = results[:, :ep_per_cpu].flatten()
        lens = results[:, ep_per_cpu:(2*ep_per_cpu)].flatten()
        ids = results[:, (2 * ep_per_cpu):(3 * ep_per_cpu)].flatten()
        par_id = results[:, (3 * ep_per_cpu):].flatten()

        rews_list = [0] * meta_pop_size
        ids_list = [0] * meta_pop_size
        train_mean_reward = [0] * meta_pop_size
        train_max_reward = [0] * meta_pop_size
        for id in meta_pop_active:
            rewards_id = [i for i, x in enumerate(par_id) if x == id]
            if not rewards_id:
                print("shittttttttttt {}".format(rewards_id))
            rews_list[id] = ([rews[i] for i in rewards_id])
            train_mean_reward[id] = (np.mean(rews_list[id]))
            train_max_reward[id] = (np.max(rews_list[id]))
            ids_list[id] = ([ids[i] for i in rewards_id])


        # Update parameters
        for i in meta_pop_active:
            optimizer.update(ids=ids_list[i], rewards=rews_list[i])

        #===============Successive Halving==================
        if next_havling_time <= ((time.time()-start_time)/60):
            print("Assigning good weights to bad {}".format(((time.time() - start_time) / 60)))
            print("Eval rewards list {}".format(eval_results))
            ranking = sorted(range(len(eval_results)), key=lambda k: eval_results[k], reverse=True)
            print("ranking {}".format(ranking))
            bottom = ranking[int(0.6*meta_pop_size):]
            print("bottom {}".format(bottom))
            if parent_id in bottom:
                optimizer.assign_weights(ranking[int(len(ranking) - ranking.index(parent_id) - 1)])
                print("rank {} switch from {} to {}".format(rank,parent_id, ranking[int(len(ranking) - ranking.index(parent_id) - 1)]))
            next_havling_time += 40

        #         print("Halving now time passed {}".format(((time.time()-start_time)/60)))
        #         eval_mean = []
        #         for rank_i in range(meta_population):
        #             # print(eval_results[rank_i, :ep_per_cpu])
        #             eval_mean.append(np.mean(eval_results[rank_i, :ep_per_cpu]))
        #         print("halving rewards list {}".format(eval_mean))
        #         ranking = sorted(range(len(eval_mean)), key=lambda k: eval_mean[k], reverse=True)
        #         print("ranking {}".format(ranking))
        #         bottom = ranking[int(half_pop // 2):]
        #         print("bottom {}".format(bottom))
        #         if parent_id in bottom:
        #             old = parent_id
        #             parent_id = int(ranking.index(parent_id)-len(ranking)//2)
        #             print("switch from {} to {}".format(old, parent_id))
        #         next_havling_time *= 2
        #         half_pop /= 2
        #         ep_per_cpu //=2



        # Steps passed = Sum of episode steps from all offsprings
        steps = np.sum(lens)
        steps_passed += steps

        # Write some logs for this iteration
        # Using logs we are able to recover solution saved
        # after 1 hour of training or after 1 billion frames
        if rank < meta_pop_size:
            iteration_time = (time.time() - iter_start_time)
            time_elapsed = (time.time() - start_time) / 60
            train_mean_rew = np.mean(rews)
            train_max_rew = np.max(rews)
            logger.log('------------------------------------', rank)
            logger.log('Iteration'.ljust(25) + '%f' % (optimizer.iteration//meta_pop_size), rank)
            logger.log('EvalMeanReward'.ljust(25) + '%f' % eval_moving_avg, rank)
            logger.log('EvalMaxReward'.ljust(25) + '%f' % eval_max_rew, rank)
            logger.log('TrainMeanReward'.ljust(25) + '%f' % train_mean_rew, rank)
            logger.log('TrainMaxReward'.ljust(25) + '%f' % train_max_rew, rank)
            logger.log('StepsSinceStart'.ljust(25) + '%f' % steps_passed, rank)
            logger.log('StepsThisIter'.ljust(25) + '%f' % steps, rank)
            logger.log('IterationTime'.ljust(25) + '%f' % iteration_time, rank)
            logger.log('TimeSinceStart'.ljust(25) + '%f' % time_elapsed, rank)

            # Give optimizer a chance to log its own stuff
            # optimizer.log(logger)
            logger.log('------------------------------------', rank)

            # Write stuff for training curve plot
            stat_string = "{},\t{},\t{},\t{},\t{},\t{}\n". \
                format(steps_passed, (time.time() - start_time),
                       eval_moving_avg, eval_max_rew, train_mean_rew, train_max_rew)
            logger.write_general_stat(stat_string, rank)
            # logger.write_optimizer_stat(optimizer.stat_string())

            # Save currently proposed solution every 20 iterations
            if optimizer.iteration % 20 == 1:
                logger.save_parameters(optimizer.parameters, optimizer.iteration, rank)
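
The successive-halving step above boils down to ranking the meta-population members by their evaluation score and flagging the bottom 40% to copy weights from a better member. A standalone sketch with made-up scores:

# Made-up evaluation scores for a meta-population of 5 policies.
eval_results = [12, 30, 7, 25, 18]
ranking = sorted(range(len(eval_results)), key=lambda k: eval_results[k], reverse=True)
bottom = ranking[int(0.6 * len(eval_results)):]  # worst 40% of the members
print(ranking)  # [1, 3, 4, 0, 2]
print(bottom)   # [0, 2]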
Example #11
import numpy as np
import torch

from src.agent import Agent

torch.manual_seed(1)
np.random.seed(1)

# Setting up bounds
position_bounds = (-1.2, 0.5)
velocity_bounds = (-0.07, 0.07)
actions = [-1.0, 0.0, 1.0]

episodes = 1500
epoches = 150
greed_factor = 0.1

# Instanced Policy
policy = Policy(2, len(actions))

# Instanced Model
model = Model(
    position_bounds,  # Position bounds
    velocity_bounds  # Velocity bounds
)

# Instanced Agent
agent = Agent(
    policy,  # NeuralNetwork class
    model,
    actions,  # Actions array (after discretization)
    episodes,  # Max number of episodes
    epoches,  # Max number of epoches per episode
    greed_factor  # Greed factor
)
Example #12
def main(ep_per_cpu, game, configuration_file, run_name):
    start_time = time.time()

    with open(configuration_file, 'r') as f:
        configuration = json.loads(f.read())

    env_name = '%sNoFrameskip-v4' % game

    # MPI stuff
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    cpus = comm.Get_size()

    # One cpu (rank 0) will evaluate results
    train_cpus = cpus-1

    # Deduce population size
    lam = train_cpus * ep_per_cpu

    # Create environment
    env = gym.make(env_name)

    # Create policy (Deep Neural Network)
    # Internally it applies preprocessing to the environment state
    policy = Policy(env, network=configuration['network'], nonlin_name=configuration['nonlin_name'])

    # Create reference batch used for normalization
    # It will be overwritten with vb from worker with rank 0
    vb = policy.get_vb()

    # Extract vector with current parameters.
    parameters = policy.get_parameters()

    # Send parameters from worker 0 to all workers (MPI stuff)
    # to ensure that every worker starts in the same position
    comm.Bcast([parameters, MPI.FLOAT], root=0)
    comm.Bcast([vb, MPI.FLOAT], root=0)

    # Set the same virtual batch for each worker
    if rank != 0:
        policy.set_vb(vb)

    # Create optimizer with user defined settings (hyperparameters)
    OptimizerClass = optimizer_dict[configuration['optimizer']]
    optimizer = OptimizerClass(parameters, lam, rank, configuration["settings"])

    # Only rank 0 worker will log information from the training
    logger = None
    if rank == 0:
        # Initialize logger, save virtual batch and save some basic stuff at the beginning
        logger = Logger(optimizer.log_path(game, configuration['network'], run_name))
        logger.save_vb(vb)

        # Log basic stuff
        logger.log('Game'.ljust(25) + '%s' % game)
        logger.log('Network'.ljust(25) + '%s' % configuration['network'])
        logger.log('Optimizer'.ljust(25) + '%s' % configuration['optimizer'])
            # [...] the body of the training loop is not shown in this excerpt;
            # the remaining lines come from the end of that loop
            # Write stuff for training curve plot
            stat_string = "{},\t{},\t{},\t{},\t{},\t{}\n".\
                format(steps_passed, (time.time()-start_time),
                       eval_mean_rew, eval_mean_rew1,BestScore,f_eval)
            logger.write_general_stat(stat_string)
            logger.write_optimizer_stat(optimizer.stat_string())

            # Save currently proposed solution every 20 iterations
            if iteration % 20 == 1:
                logger.save_parameters(BestFound, iteration)
        else:
            if iteration % 5 == 0:
                optimizer.updatesigma(updateCount)
        comm.Bcast([ppp, MPI.FLOAT], root=0)
        comm.Bcast([rews, MPI.FLOAT], root=0)
        comm.Bcast([sigmas, MPI.FLOAT], root=0)


        iteration += 1