Example #1
def main(**exp):
    log_dir = tlogger.log_dir()

    tlogger.info(json.dumps(exp, indent=4, sort_keys=True))
    tlogger.info('Logging to: {}'.format(log_dir))
    Model = neuroevolution.models.__dict__[exp['model']]
    all_tstart = time.time()

    noise = SharedNoiseTable()
    rs = np.random.RandomState()
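    # SharedNoiseTable holds one large pre-sampled block of Gaussian noise;
    # offspring are identified by a noise index and a mutation power rather
    # than by full parameter vectors (see noise.sample_index / noise.get).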

    def make_env0(b):
        return gym_tensorflow.make(game=exp["games"][0], batch_size=b)

    def make_env1(b):
        return gym_tensorflow.make(game=exp["games"][1], batch_size=b)

    workers = [
        ConcurrentWorkers(make_env0, Model, batch_size=64),
        ConcurrentWorkers(make_env1, Model, batch_size=64)
    ]

    saver = tf.train.Saver()

    tlogger.info('Start timing')
    tstart = time.time()
    tf_sess = tf.Session()
    tf_sess.run(tf.global_variables_initializer())
    state = TrainingState(exp)
    state.initialize(rs, noise, workers[0].model)

    workers[0].initialize(tf_sess)
    workers[1].initialize(tf_sess)

    for iteration in range(exp['iterations']):
        tlogger.info("BEGINNING ITERATION: {}".format(iteration))

        ##############
        ### GAME 0 ###
        ##############
        worker = workers[0]
        frames_computed_so_far = tf_sess.run(worker.steps_counter)
        game0_results = []
        game0_rewards = []
        game0_episode_lengths = []

        iterator = iter(
            worker.monitor_eval(make_offspring(exp, noise, rs, worker, state),
                                max_frames=state.tslimit * 4))
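        # monitor_eval streams results back pairwise: each positive
        # perturbation is immediately followed by its mirrored negative,
        # which the assert below relies on.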

        for pos_seeds, pos_reward, pos_length in iterator:
            neg_seeds, neg_reward, neg_length = next(iterator)
            assert pos_seeds == neg_seeds
            result = Offspring(pos_seeds, [pos_reward, neg_reward],
                               [pos_length, neg_length])
            rewards = result.rewards
            game0_results.append(result)
            game0_rewards.append(rewards)
            game0_episode_lengths.append(result.ep_len)
        state.num_frames += tf_sess.run(
            worker.steps_counter) - frames_computed_so_far
        game0_returns_n2 = np.array([a.rewards for a in game0_results])
        game0_noise_inds_n = [a.seeds for a in game0_results]
        # tlogger.info("game0 rewards: {}".format(np.mean(game0_rewards)))
        # tlogger.info("game0 eplens: {}".format(game0_episode_lengths))
        save_pickle(iteration, log_dir, "game0_rewards", game0_rewards)
        save_pickle(iteration, log_dir, "game0_episode_lengths",
                    game0_episode_lengths)

        ##############
        ### GAME 1 ###
        ##############
        worker = workers[1]
        frames_computed_so_far = tf_sess.run(worker.steps_counter)
        game1_results = []
        game1_rewards = []
        game1_episode_lengths = []
        seeds_vector = np.array(game0_noise_inds_n)
        iterator = iter(
            worker.monitor_eval(make_offspring(exp, noise, rs, worker, state,
                                               seeds_vector),
                                max_frames=state.tslimit * 4))

        for pos_seeds, pos_reward, pos_length in iterator:
            neg_seeds, neg_reward, neg_length = next(iterator)
            assert pos_seeds == neg_seeds
            result = Offspring(pos_seeds, [pos_reward, neg_reward],
                               [pos_length, neg_length])
            rewards = result.rewards
            game1_results.append(result)
            game1_rewards.append(rewards)
            game1_episode_lengths.append(result.ep_len)
        state.num_frames += tf_sess.run(
            worker.steps_counter) - frames_computed_so_far
        game1_returns_n2 = np.array([a.rewards for a in game1_results])
        game1_noise_inds_n = [a.seeds for a in game1_results]
        # tlogger.info("game1 rewards: {}".format(np.mean(game1_rewards)))
        # tlogger.info("game1 eplens: {}".format(game0_episode_lengths))
        save_pickle(iteration, log_dir, "game1_rewards", game1_rewards)
        save_pickle(iteration, log_dir, "game1_episode_lengths",
                    game1_episode_lengths)

        tlogger.info("Saving offsprings seeds")
        save_pickle(iteration, log_dir, "offsprings_seeds", game1_noise_inds_n)

        ####################
        ### UPDATE THETA ###
        ####################
        game_returns = [game0_returns_n2, game1_returns_n2]
        proc_returns = obtain_proc_returns(exp['learn_option'], game_returns)

        assert game0_noise_inds_n == game1_noise_inds_n
        noise_inds_n = game0_noise_inds_n + game1_noise_inds_n  # concatenate the two lists

        # TOP 100 offspring (disabled alternative to the full-population
        # update below):
        # dx = proc_returns[:, 0]
        # dy = proc_returns[:, 1]
        # dist_squared = (np.ones(dx.shape) - np.abs(dx))**2 + (np.ones(dy.shape) - np.abs(dy))**2
        # top_n_rewards = dist_squared.argsort()[-100:][::-1]
        # batched_weighted_indices = (noise.get(idx, worker.model.num_params) for idx in noise_inds_n)
        # proc_returns = proc_returns[top_n_rewards, :]
        # batched_weighted_args = {
        #     'deltas': proc_returns[:, 0] - proc_returns[:, 1],
        #     'indices': [myval for myidx, myval in enumerate(batched_weighted_indices) if myidx in top_n_rewards]
        # }
        # noise_inds_n = batched_weighted_args['indices']
        # g, count = batched_weighted_sum(batched_weighted_args['deltas'], batched_weighted_args['indices'], batch_size=len(batched_weighted_args['deltas']))

        # ALL offspring
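        # Antithetic ES estimate: every offspring pair contributes
        # (score_pos - score_neg) * epsilon, with epsilon re-read from the
        # shared noise table by index.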
        g, count = batched_weighted_sum(
            proc_returns[:, 0] - proc_returns[:, 1],
            (noise.get(idx, worker.model.num_params) for idx in noise_inds_n),
            batch_size=500)

        # NOTE: the gradient estimate is normalized by the total number of
        # returns (two per offspring)
        returns_n2 = np.array([a.rewards for a in game0_results] +
                              [a.rewards for a in game1_results])

        # Only if using the top-100 variant above:
        # returns_n2 = returns_n2[top_n_rewards]

        g /= returns_n2.size

        assert g.shape == (
            worker.model.num_params,
        ) and g.dtype == np.float32 and count == len(noise_inds_n)
        update_ratio, state.theta = state.optimizer.update(-g +
                                                           exp['l2coeff'] *
                                                           state.theta)

        save_pickle(iteration, log_dir, "state", state)

        ######################
        ### EVALUATE ELITE ###
        ######################
        _, test_evals, test_timesteps = workers[0].monitor_eval_repeated(
            [(state.theta, 0)],
            max_frames=None,
            num_episodes=exp['num_test_episodes'] // 2)[0]
        tlogger.info("game0 elite: {}".format(np.mean(test_evals)))
        save_pickle(iteration, log_dir, 'game0_elite', test_evals)
        save_pickle(iteration, log_dir, 'game0_elite_timesteps',
                    test_timesteps)

        _, test_evals, test_timesteps = workers[1].monitor_eval_repeated(
            [(state.theta, 0)],
            max_frames=None,
            num_episodes=exp['num_test_episodes'] // 2)[0]
        tlogger.info("game1 elite: {}".format(np.mean(test_evals)))
        save_pickle(iteration, log_dir, "game1_elite", test_evals)
        save_pickle(iteration, log_dir, 'game1_elite_timesteps',
                    test_timesteps)

        state.num_frames += tf_sess.run(
            worker.steps_counter) - frames_computed_so_far

        saver.save(tf_sess, "{}/model-{}".format(log_dir, state.it))

        state.it += 1

    os.kill(os.getpid(), signal.SIGTERM)
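
# make_offspring is used above but never defined in this example. A minimal
# sketch, assuming the same model/noise interfaces as the inline generator in
# Example #2 below; the optional seeds_vector argument (used for game 1) is a
# guess that replays game 0's noise indices instead of sampling fresh ones.
# save_pickle and the rank/sum helpers are sketched after Examples #3 and #2.
def make_offspring(exp, noise, rs, worker, state, seeds_vector=None):
    for i in range(exp['population_size'] // 2):
        if seeds_vector is None:
            idx = noise.sample_index(rs, worker.model.num_params)
        else:
            idx = seeds_vector[i]  # replay the paired seed from game 0
        mutation_power = state.sample(state.mutation_power)
        pos_theta = worker.model.compute_mutation(noise, state.theta, idx,
                                                  mutation_power)
        yield (pos_theta, idx)
        neg_theta = worker.model.compute_mutation(noise, state.theta, idx,
                                                  -mutation_power)
        yield (neg_theta, idx)
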
Example #2
def main(**exp):
    log_dir = tlogger.log_dir()

    tlogger.info(json.dumps(exp, indent=4, sort_keys=True))
    tlogger.info('Logging to: {}'.format(log_dir))
    Model = neuroevolution.models.__dict__[exp['model']]
    all_tstart = time.time()

    def make_env(b):
        return gym_tensorflow.make(game=exp["game"], batch_size=b)

    worker = ConcurrentWorkers(make_env, Model, batch_size=64)
    with WorkerSession(worker) as sess:
        noise = SharedNoiseTable()
        rs = np.random.RandomState()
        tlogger.info('Start timing')
        tstart = time.time()

        try:
            load_file = os.path.join(log_dir, 'snapshot.pkl')
            with open(load_file, 'rb+') as file:
                state = pickle.load(file)
            tlogger.info("Loaded iteration {} from {}".format(
                state.it, load_file))
        except FileNotFoundError:
            tlogger.info('Failed to load snapshot')
            state = TrainingState(exp)

            if 'load_from' in exp:
                dirname = os.path.join(os.path.dirname(__file__), '..',
                                       'neuroevolution', 'ga_legacy.py')
                load_from = exp['load_from'].format(**exp)
                os.system('python {} {} seeds.pkl'.format(dirname, load_from))
                with open('seeds.pkl', 'rb+') as file:
                    seeds = pickle.load(file)
                    state.set_theta(
                        worker.model.compute_weights_from_seeds(noise, seeds))
                tlogger.info('Loaded initial theta from {}'.format(load_from))
            else:
                state.initialize(rs, noise, worker.model)

        def make_offspring(state):
            for i in range(exp['population_size'] // 2):
                idx = noise.sample_index(rs, worker.model.num_params)
                mutation_power = state.sample(state.mutation_power)
                pos_theta = worker.model.compute_mutation(
                    noise, state.theta, idx, mutation_power)

                yield (pos_theta, idx)
                neg_theta = worker.model.compute_mutation(
                    noise, state.theta, idx, -mutation_power)
                diff = (np.max(
                    np.abs((pos_theta + neg_theta) / 2 - state.theta)))
                assert diff < 1e-5, 'Diff too large: {}'.format(diff)

                yield (neg_theta, idx)
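        # NOTE: make_offspring yields antithetic pairs sharing one noise
        # index; the assert above checks each pair is mirrored about theta.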

        tlogger.info('Start training')
        _, initial_performance, _ = worker.monitor_eval_repeated(
            [(state.theta, 0)],
            max_frames=None,
            num_episodes=exp['num_test_episodes'])[0]
        while True:
            tstart_iteration = time.time()
            if state.timesteps_so_far >= exp['timesteps']:
                tlogger.info('Training terminated after {} timesteps'.format(
                    state.timesteps_so_far))
                break
            frames_computed_so_far = sess.run(worker.steps_counter)

            tlogger.info('Evaluating perturbations')
            iterator = iter(
                worker.monitor_eval(make_offspring(state),
                                    max_frames=state.tslimit * 4))
            results = []
            for pos_seeds, pos_reward, pos_length in iterator:
                neg_seeds, neg_reward, neg_length = next(iterator)
                assert pos_seeds == neg_seeds
                results.append(
                    Offspring(pos_seeds, [pos_reward, neg_reward],
                              [pos_length, neg_length]))
            state.num_frames += sess.run(
                worker.steps_counter) - frames_computed_so_far

            state.it += 1
            tlogger.record_tabular('Iteration', state.it)
            tlogger.record_tabular('MutationPower',
                                   state.sample(state.mutation_power))
            tlogger.record_tabular('TimestepLimitPerEpisode', state.tslimit)

            # Trim unwanted results
            results = results[:exp['population_size'] // 2]
            assert len(results) == exp['population_size'] // 2
            rewards = np.array([b for a in results for b in a.rewards])

            results_timesteps = np.array([a.training_steps for a in results])
            timesteps_this_iter = sum([a.training_steps for a in results])
            state.timesteps_so_far += timesteps_this_iter

            tlogger.record_tabular('PopulationEpRewMax', np.max(rewards))
            tlogger.record_tabular('PopulationEpRewMean', np.mean(rewards))
            tlogger.record_tabular('PopulationEpRewMedian', np.median(rewards))
            tlogger.record_tabular('PopulationEpCount', len(rewards))
            tlogger.record_tabular('PopulationTimesteps', timesteps_this_iter)

            # Update Theta
            returns_n2 = np.array([a.rewards for a in results])
            noise_inds_n = [a.seeds for a in results]

            if exp['return_proc_mode'] == 'centered_rank':
                proc_returns_n2 = compute_centered_ranks(returns_n2)
            else:
                raise NotImplementedError(exp['return_proc_mode'])
            # Compute and take step
            g, count = batched_weighted_sum(
                proc_returns_n2[:, 0] - proc_returns_n2[:, 1],
                (noise.get(idx, worker.model.num_params)
                 for idx in noise_inds_n),
                batch_size=500)
            # NOTE: the gradient estimate is normalized by the total number
            # of returns (two per offspring)
            g /= returns_n2.size

            assert g.shape == (
                worker.model.num_params,
            ) and g.dtype == np.float32 and count == len(noise_inds_n)
            update_ratio, state.theta = state.optimizer.update(-g +
                                                               exp['l2coeff'] *
                                                               state.theta)

            time_elapsed_this_iter = time.time() - tstart_iteration
            state.time_elapsed += time_elapsed_this_iter
            tlogger.info('Evaluate elite')
            _, test_evals, test_timesteps = worker.monitor_eval_repeated(
                [(state.theta, 0)],
                max_frames=None,
                num_episodes=exp['num_test_episodes'])[0]
            test_timesteps = sum(test_timesteps)
            # Log Results
            tlogger.record_tabular('TestRewMean', np.mean(test_evals))
            tlogger.record_tabular('TestRewMedian', np.median(test_evals))
            tlogger.record_tabular('TestEpCount', len(test_evals))
            tlogger.record_tabular('TestEpLenSum', test_timesteps)
            tlogger.record_tabular('InitialRewMax',
                                   np.max(initial_performance))
            tlogger.record_tabular('InitialRewMean',
                                   np.mean(initial_performance))
            tlogger.record_tabular('InitialRewMedian',
                                   np.median(initial_performance))

            tlogger.record_tabular('TimestepsThisIter', timesteps_this_iter)
            tlogger.record_tabular(
                'TimestepsPerSecondThisIter',
                timesteps_this_iter / (time.time() - tstart_iteration))
            tlogger.record_tabular('TimestepsComputed', state.num_frames)
            tlogger.record_tabular('TimestepsSoFar', state.timesteps_so_far)
            tlogger.record_tabular('TimeElapsedThisIter',
                                   time_elapsed_this_iter)
            tlogger.record_tabular('TimeElapsedThisIterTotal',
                                   time.time() - tstart_iteration)
            tlogger.record_tabular('TimeElapsed', state.time_elapsed)
            tlogger.record_tabular('TimeElapsedTotal',
                                   time.time() - all_tstart)

            tlogger.dump_tabular()
            fps = state.timesteps_so_far / (time.time() - tstart)
            tlogger.info(
                'Timesteps Per Second: {:.0f}. Elapsed: {:.2f}h ETA {:.2f}h'.
                format(fps, (time.time() - all_tstart) / 3600,
                       (exp['timesteps'] - state.timesteps_so_far) / fps /
                       3600))

            if state.adaptive_tslimit:
                if np.mean(
                    [a.training_steps >= state.tslimit
                     for a in results]) > state.incr_tslimit_threshold:
                    state.tslimit = min(
                        state.tslimit * state.tslimit_incr_ratio,
                        state.tslimit_max)
                    tlogger.info('Increased threshold to {}'.format(
                        state.tslimit))

            os.makedirs(log_dir, exist_ok=True)
            save_file = os.path.join(log_dir, 'snapshot.pkl')
            with open(save_file, 'wb+') as file:
                pickle.dump(state, file)
            #copyfile(save_file, os.path.join(log_dir, 'snapshot_gen{:04d}.pkl'.format(state.it)))
            tlogger.info("Saved iteration {} to {}".format(
                state.it, save_file))

            if state.timesteps_so_far >= exp['timesteps']:
                tlogger.info('Training terminated after {} timesteps'.format(
                    state.timesteps_so_far))
                break
            results.clear()
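
# compute_centered_ranks and batched_weighted_sum come from the surrounding
# ES codebase and are not shown here. Minimal sketches consistent with how
# they are used above: returns are mapped to ranks rescaled into [-0.5, 0.5],
# and the weighted sum of noise vectors is accumulated in fixed-size batches.
import numpy as np


def itergroups(items, group_size):
    """Yield successive tuples of at most group_size items."""
    assert group_size >= 1
    group = []
    for x in items:
        group.append(x)
        if len(group) == group_size:
            yield tuple(group)
            del group[:]
    if group:
        yield tuple(group)


def compute_ranks(x):
    """Rank the entries of x from 0 (smallest) to len(x) - 1 (largest)."""
    ranks = np.empty(len(x), dtype=int)
    ranks[x.argsort()] = np.arange(len(x))
    return ranks


def compute_centered_ranks(x):
    """Map an array of returns to ranks rescaled into [-0.5, 0.5]."""
    y = compute_ranks(x.ravel()).reshape(x.shape).astype(np.float32)
    y /= (x.size - 1)
    y -= 0.5
    return y


def batched_weighted_sum(weights, vecs, batch_size):
    """Return (sum_i weights[i] * vecs[i], number of items summed)."""
    total = 0.0
    num_items_summed = 0
    for batch_weights, batch_vecs in zip(itergroups(weights, batch_size),
                                         itergroups(vecs, batch_size)):
        assert len(batch_weights) == len(batch_vecs) <= batch_size
        total += np.dot(np.asarray(batch_weights, dtype=np.float32),
                        np.asarray(batch_vecs, dtype=np.float32))
        num_items_summed += len(batch_weights)
    return total, num_items_summed
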
Example #3
def main(**exp):

    log_dir = tlogger.log_dir()

    tlogger.info(json.dumps(exp, indent=4, sort_keys=True))
    tlogger.info('Logging to: {}'.format(log_dir))
    Model = neuroevolution.models.__dict__[exp['model']]
    all_tstart = time.time()

    noise = SharedNoiseTable()
    rs = np.random.RandomState()

    def make_env0(b):
        return gym_tensorflow.make(game=exp["games"][0], batch_size=b)

    def make_env1(b):
        return gym_tensorflow.make(game=exp["games"][1], batch_size=b)

    workers = [
        ConcurrentWorkers(make_env0, Model, batch_size=64),
        ConcurrentWorkers(make_env1, Model, batch_size=64)
    ]

    tlogger.info('Start timing')
    tstart = time.time()
    tf_sess = tf.Session()
    tf_sess.run(tf.global_variables_initializer())
    state = TrainingState(exp)
    state.initialize(rs, noise, workers[0].model)

    workers[0].initialize(tf_sess)
    workers[1].initialize(tf_sess)

    for iteration in range(exp['iterations']):
        tlogger.info("BEGINNING ITERATION: {}".format(iteration))

        ##############
        ### GAME 0 ###
        ##############
        worker = workers[0]
        frames_computed_so_far = tf_sess.run(worker.steps_counter)
        game0_results = []
        game0_rewards = []
        game0_episode_lengths = []

        iterator = iter(
            worker.monitor_eval(make_offspring(exp, noise, rs, worker, state),
                                max_frames=state.tslimit * 4))

        for pos_seeds, pos_reward, pos_length in iterator:
            neg_seeds, neg_reward, neg_length = next(iterator)
            assert pos_seeds == neg_seeds
            result = Offspring(pos_seeds, [pos_reward, neg_reward],
                               [pos_length, neg_length])
            rewards = result.rewards
            game0_results.append(result)
            game0_rewards.append(rewards)
            game0_episode_lengths.append(result.ep_len)
        state.num_frames += tf_sess.run(
            worker.steps_counter) - frames_computed_so_far
        game0_returns_n2 = np.array([a.rewards for a in game0_results])
        game0_noise_inds_n = [a.seeds for a in game0_results]
        save_pickle(iteration, log_dir, "game0_rewards", game0_rewards)
        save_pickle(iteration, log_dir, "game0_episode_lengths",
                    game0_episode_lengths)

        ##############
        ### GAME 1 ###
        ##############
        if f_isSingleTask(exp):
            game1_results = []
            game1_rewards = []
            game1_episode_lengths = []
            game1_returns_n2 = game0_returns_n2
            game1_noise_inds_n = game0_noise_inds_n
        else:
            worker = workers[1]
            frames_computed_so_far = tf_sess.run(worker.steps_counter)
            game1_results = []
            game1_rewards = []
            game1_episode_lengths = []
            seeds_vector = np.array(game0_noise_inds_n)
            iterator = iter(
                worker.monitor_eval(make_offspring(exp, noise, rs, worker,
                                                   state, seeds_vector),
                                    max_frames=state.tslimit * 4))

            for pos_seeds, pos_reward, pos_length in iterator:
                neg_seeds, neg_reward, neg_length = next(iterator)
                assert pos_seeds == neg_seeds
                result = Offspring(pos_seeds, [pos_reward, neg_reward],
                                   [pos_length, neg_length])
                rewards = result.rewards
                game1_results.append(result)
                game1_rewards.append(rewards)
                game1_episode_lengths.append(result.ep_len)
            state.num_frames += tf_sess.run(
                worker.steps_counter) - frames_computed_so_far
            game1_returns_n2 = np.array([a.rewards for a in game1_results])
            game1_noise_inds_n = [a.seeds for a in game1_results]
        save_pickle(iteration, log_dir, "game1_rewards", game1_rewards)
        save_pickle(iteration, log_dir, "game1_episode_lengths",
                    game1_episode_lengths)

        tlogger.info("Saving offsprings seeds")
        save_pickle(iteration, log_dir, "offsprings_seeds", game1_noise_inds_n)

        ####################
        ### UPDATE THETA ###
        ####################

        if f_isSingleTask(exp):
            proc_frames = compute_centered_ranks(
                np.asarray(game0_episode_lengths))
            proc_returns = compute_centered_ranks(game0_returns_n2)
            noise_inds_n = game0_noise_inds_n
        else:
            game_returns = [game0_returns_n2, game1_returns_n2]
            proc_returns = obtain_proc_returns(exp['learn_option'],
                                               game_returns)
            # proc_frames must also exist on the multi-task path (it is used
            # unconditionally below); rank both games' episode lengths
            # together so its rows line up with the concatenated noise
            # indices.
            proc_frames = compute_centered_ranks(
                np.asarray(game0_episode_lengths + game1_episode_lengths))

            assert game0_noise_inds_n == game1_noise_inds_n
            noise_inds_n = game0_noise_inds_n + game1_noise_inds_n  # concatenate the two lists

        g_returns, count_returns = batched_weighted_sum(
            proc_returns[:, 0] - proc_returns[:, 1],
            (noise.get(idx, worker.model.num_params) for idx in noise_inds_n),
            batch_size=500)

        g_frames, count_frames = batched_weighted_sum(
            proc_frames[:, 0] - proc_frames[:, 1],
            (noise.get(idx, worker.model.num_params) for idx in noise_inds_n),
            batch_size=500)

        assert count_frames == count_returns
        count = count_returns

        # Blend the two gradient estimates: w weighs the returns objective
        # against the episode-length (frames) objective.
        w = exp['w']
        g = w * g_returns + (1 - w) * g_frames

        returns_n2 = np.array([a.rewards for a in game0_results] +
                              [a.rewards for a in game1_results])
        g /= returns_n2.size

        assert g.shape == (
            worker.model.num_params,
        ) and g.dtype == np.float32 and count == len(noise_inds_n)
        update_ratio, state.theta = state.optimizer.update(-g +
                                                           exp['l2coeff'] *
                                                           state.theta)

        save_pickle(iteration, log_dir, "state", state)

        ######################
        ### EVALUATE ELITE ###
        ######################
        _, test_evals, test_timesteps = workers[0].monitor_eval_repeated(
            [(state.theta, 0)],
            max_frames=None,
            # Split the test-episode budget across games when two are trained.
            num_episodes=exp['num_test_episodes'] //
            (2**(1 - f_isSingleTask(exp))))[0]
        tlogger.info("game0 elite: {}".format(np.mean(test_evals)))
        tlogger.info("game0 elite frames max: {}".format(
            np.max(test_timesteps)))
        tlogger.info("game0 elite frames mean: {}".format(
            np.mean(test_timesteps)))
        tlogger.info("game0 elite frames min: {}".format(
            np.min(test_timesteps)))
        tlogger.info("game0 offspring frames max: {}".format(
            np.max(game0_episode_lengths)))
        tlogger.info("game0 offspring frames mean: {}".format(
            np.mean(game0_episode_lengths)))
        tlogger.info("game0 offspring frames min: {}".format(
            np.min(game0_episode_lengths)))
        save_pickle(iteration, log_dir, 'game0_elite', test_evals)
        save_pickle(iteration, log_dir, 'game0_elite_timesteps',
                    test_timesteps)

        if not f_isSingleTask(exp):
            _, test_evals, test_timesteps = workers[1].monitor_eval_repeated(
                [(state.theta, 0)],
                max_frames=None,
                num_episodes=exp['num_test_episodes'] // 2)[0]

        tlogger.info("game1 elite: {}".format(np.mean(test_evals)))
        save_pickle(iteration, log_dir, "game1_elite", test_evals)
        save_pickle(iteration, log_dir, 'game1_elite_timesteps',
                    test_timesteps)

        state.num_frames += tf_sess.run(
            worker.steps_counter) - frames_computed_so_far
        state.it += 1

    os.kill(os.getpid(), signal.SIGTERM)
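
# f_isSingleTask, save_pickle, and obtain_proc_returns are project helpers
# that never appear in these examples. Minimal sketches assuming the simplest
# interfaces consistent with the call sites above; the filename scheme and
# the 'centered_rank' learn_option mirror Example #2's return_proc_mode but
# are assumptions, not the project's actual implementations.
# compute_centered_ranks is sketched after Example #2.
import os
import pickle

import numpy as np


def f_isSingleTask(exp):
    # Assumed convention: a run is single-task when it only involves one
    # distinct game.
    return len(set(exp['games'])) == 1


def save_pickle(iteration, log_dir, name, obj):
    # One pickle per (name, iteration) pair under the run's log directory
    # (hypothetical filename scheme).
    path = os.path.join(log_dir, '{}_{:04d}.pkl'.format(name, iteration))
    with open(path, 'wb') as file:
        pickle.dump(obj, file)


def obtain_proc_returns(learn_option, game_returns):
    # Rank both games' returns jointly so the resulting rows line up with
    # the concatenated noise indices used in the update step.
    if learn_option == 'centered_rank':
        return compute_centered_ranks(np.concatenate(game_returns, axis=0))
    raise NotImplementedError(learn_option)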