Beispiel #1
0
def _build_parent_cache(model, noise, state, threshold, cache=None):
    """Return the ``(weights, seeds)`` parent list for the next generation.

    Parents are taken from the first ``threshold`` members of
    ``state.population``. When ``state.elite`` is not among them, the elite
    takes the first slot and only ``threshold - 1`` population members
    follow it.

    Args:
        model: object exposing ``compute_weights_from_seeds``.
        noise: noise table forwarded to ``compute_weights_from_seeds``.
        state: training state providing ``population`` and ``elite``.
        threshold: number of parents to select.
        cache: optional list of previously cached parents, forwarded as the
            ``cache`` keyword argument; when ``None`` the keyword is omitted.

    Returns:
        A new list of ``(weights, seeds)`` tuples.
    """

    def weights_for(seeds):
        if cache is None:
            return model.compute_weights_from_seeds(noise, seeds)
        return model.compute_weights_from_seeds(noise, seeds, cache=cache)

    top = state.population[:threshold]
    if state.elite in top:
        chosen = top
    else:
        chosen = [state.elite] + state.population[:threshold - 1]
    return [(weights_for(o.seeds), o.seeds) for o in chosen]


def main(exp, log_dir):
    """Run the evolutionary training loop described by ``exp``.

    Each iteration mutates cached parents into ``exp['population_size']``
    offspring, evaluates them, validates the best
    ``exp['validation_threshold']`` individuals, picks an elite, tests it,
    logs the results and writes a snapshot. The loop stops once
    ``state.timesteps_so_far`` reaches ``exp['timesteps']``.

    Args:
        exp: experiment configuration dict. Keys read here: ``model``,
            ``game``, ``selection_threshold``, ``timesteps``,
            ``population_size``, ``validation_threshold``,
            ``num_validation_episodes``, ``num_test_episodes`` and the
            optional ``load_from`` and ``load_population``.
        log_dir: directory passed to ``tlogger.log_dir``; snapshots and
            ``fitness.log`` are written below the directory it returns.

    Returns:
        A tuple ``(test_score, {'val': validation_score})`` built from
        ``state.curr_solution_test`` and ``state.curr_solution_val``.
    """
    log_dir = tlogger.log_dir(log_dir)

    snap_idx = 0
    snapshots = []

    tlogger.info(json.dumps(exp, indent=4, sort_keys=True))
    tlogger.info('Logging to: {}'.format(log_dir))
    Model = neuroevolution.models.__dict__[exp['model']]
    all_tstart = time.time()

    def make_env(b):
        return gym_tensorflow.make(game=exp["game"], batch_size=b)

    worker = ConcurrentWorkers(make_env, Model, batch_size=64)
    with WorkerSession(worker) as sess:
        rs = np.random.RandomState()
        noise = None
        state = None
        cached_parents = []
        results = []

        def make_offspring():
            # Without parents (first generation) start from a random
            # individual; otherwise mutate a uniformly chosen parent.
            if len(cached_parents) == 0:
                return worker.model.randomize(rs, noise)
            else:
                assert len(cached_parents) == exp['selection_threshold']
                parent = cached_parents[rs.randint(len(cached_parents))]
                return worker.model.mutate(parent,
                                           rs,
                                           noise,
                                           mutation_power=state.sample(
                                               state.mutation_power))

        tlogger.info('Start timing')
        tstart = time.time()

        load_file = os.path.join(log_dir, 'snapshot.pkl')

        # NOTE(review): pickle.load can execute arbitrary code; only load
        # snapshots from trusted locations.
        if 'load_from' in exp:
            filename = os.path.join(log_dir, exp['load_from'])
            # Read-only mode: the snapshot is never written through this
            # handle, so write permission must not be required.
            with open(filename, 'rb') as file:
                state = pickle.load(file)
            # An explicitly loaded policy starts a fresh run.
            state.timesteps_so_far = 0  # Reset timesteps to 0
            state.it = 0
            state.max_reward = 0
            state.max_avg = 0
            state.max_sd = 0
            tlogger.info('Loaded initial policy from {}'.format(filename))
        elif os.path.exists(load_file):
            try:
                with open(load_file, 'rb') as file:
                    state = pickle.load(file)
                tlogger.info("Loaded iteration {} from {}".format(
                    state.it, load_file))
            except FileNotFoundError:
                tlogger.info('Failed to load snapshot')

        # ``noise`` is never restored from a snapshot in this function, so
        # at this point it is still None and a new table is generated.
        if not noise:
            tlogger.info("Generating new noise table")
            noise = SharedNoiseTable()
        else:
            tlogger.info("Using noise table from snapshot")

        if not state:
            tlogger.info("Generation new TrainingState")
            state = TrainingState(exp)

        if 'load_population' in exp:
            state.copy_population(exp['load_population'])

        # Cache first population if needed (on restart)
        if state.population and exp['selection_threshold'] > 0:
            tlogger.info("Caching parents")
            cached_parents.clear()
            cached_parents.extend(
                _build_parent_cache(worker.model, noise, state,
                                    exp['selection_threshold']))
            tlogger.info("Done caching parents")

        while True:
            tstart_iteration = time.time()
            if state.timesteps_so_far >= exp['timesteps']:
                tlogger.info('Training terminated after {} timesteps'.format(
                    state.timesteps_so_far))
                break
            frames_computed_so_far = sess.run(worker.steps_counter)
            assert (len(cached_parents) == 0 and state.it
                    == 0) or len(cached_parents) == exp['selection_threshold']

            tasks = [make_offspring() for _ in range(exp['population_size'])]
            for seeds, episode_reward, episode_length in worker.monitor_eval(
                    tasks, max_frames=state.tslimit * 4):
                results.append(
                    Offspring(seeds, [episode_reward], [episode_length]))
            state.num_frames += sess.run(
                worker.steps_counter) - frames_computed_so_far

            state.it += 1
            tlogger.record_tabular('Iteration', state.it)
            tlogger.record_tabular('MutationPower',
                                   state.sample(state.mutation_power))

            # Trim unwanted results
            results = results[:exp['population_size']]
            assert len(results) == exp['population_size']
            rewards = np.array([a.fitness for a in results])
            population_timesteps = sum(a.training_steps for a in results)
            # Best individuals first.
            state.population = sorted(results,
                                      key=lambda x: x.fitness,
                                      reverse=True)
            state.max_reward = save_best_pop_member(state.max_reward,
                                                    np.max(rewards), state,
                                                    state.population[0])
            tlogger.record_tabular('PopulationEpRewMax', np.max(rewards))
            tlogger.record_tabular('PopulationEpRewMean', np.mean(rewards))
            tlogger.record_tabular('PopulationEpCount', len(rewards))
            tlogger.record_tabular('PopulationTimesteps', population_timesteps)
            tlogger.record_tabular('NumSelectedIndividuals',
                                   exp['selection_threshold'])

            tlogger.info('Evaluate population')
            validation_population = state.population[:exp[
                'validation_threshold']]
            # The previous elite competes again; it replaces the last
            # validation candidate so the list length stays the same.
            if state.elite is not None:
                validation_population = [state.elite
                                         ] + validation_population[:-1]

            validation_tasks = [(worker.model.compute_weights_from_seeds(
                noise, validation_population[x].seeds,
                cache=cached_parents), validation_population[x].seeds)
                                for x in range(exp['validation_threshold'])]
            _, population_validation, population_validation_len = zip(
                *worker.monitor_eval_repeated(
                    validation_tasks,
                    max_frames=state.tslimit * 4,
                    num_episodes=exp['num_validation_episodes']))

            it_max_avg = np.max([np.mean(x) for x in population_validation])
            it_max_sd = np.max([np.std(x) for x in population_validation])

            # Running maxima over all iterations so far.
            state.max_avg = np.max([state.max_avg, it_max_avg])
            state.max_sd = np.max([state.max_sd, it_max_sd])

            tlogger.info("Max Average: {}".format(state.max_avg))
            tlogger.info("Max Std: {}".format(state.max_sd))

            fitness_results = [(np.mean(x), np.std(x))
                               for x in population_validation]
            with open(os.path.join(log_dir, 'fitness.log'), 'a') as f:
                f.write("{},{},{}: {}\n".format(
                    state.it, state.max_avg, state.max_sd, ','.join([
                        "({},{})".format(x[0], x[1]) for x in fitness_results
                    ])))

            population_fitness = [
                fitness(x[0], x[1], state.max_avg, state.max_sd)
                for x in fitness_results
            ]
            tlogger.info("Fitness: {}".format(population_fitness))
            population_validation_len = [
                np.sum(x) for x in population_validation_len
            ]

            time_elapsed_this_iter = time.time() - tstart_iteration
            state.time_elapsed += time_elapsed_this_iter

            # The candidate with the smallest ``fitness`` value becomes the
            # elite.
            population_elite_idx = np.argmin(population_fitness)
            state.elite = validation_population[population_elite_idx]
            elite_theta = worker.model.compute_weights_from_seeds(
                noise, state.elite.seeds, cache=cached_parents)
            _, population_elite_evals, population_elite_evals_timesteps = worker.monitor_eval_repeated(
                [(elite_theta, state.elite.seeds)],
                max_frames=None,
                num_episodes=exp['num_test_episodes'])[0]

            # Log Results
            validation_timesteps = sum(population_validation_len)
            timesteps_this_iter = population_timesteps + validation_timesteps
            state.timesteps_so_far += timesteps_this_iter
            state.validation_timesteps_so_far += validation_timesteps

            # Log
            tlogger.record_tabular(
                'TruncatedPopulationRewMean',
                np.mean([a.fitness for a in validation_population]))
            tlogger.record_tabular('TruncatedPopulationValidationFitMean',
                                   np.mean(population_fitness))
            tlogger.record_tabular('TruncatedPopulationValidationFitMax',
                                   np.max(population_fitness))
            tlogger.record_tabular('TruncatedPopulationValidationFitMin',
                                   np.min(population_fitness))
            tlogger.record_tabular('TruncatedPopulationValidationMaxAvg',
                                   state.max_avg)
            tlogger.record_tabular('TruncatedPopulationValidationMaxStd',
                                   state.max_sd)
            tlogger.record_tabular('TruncatedPopulationEliteValidationFitMin',
                                   np.min(population_fitness))
            tlogger.record_tabular("TruncatedPopulationEliteIndex",
                                   population_elite_idx)
            tlogger.record_tabular('TruncatedPopulationEliteSeeds',
                                   state.elite.seeds)
            tlogger.record_tabular('TruncatedPopulationEliteTestRewMean',
                                   np.mean(population_elite_evals))
            tlogger.record_tabular('TruncatedPopulationEliteTestRewStd',
                                   np.std(population_elite_evals))
            tlogger.record_tabular('TruncatedPopulationEliteTestEpCount',
                                   len(population_elite_evals))
            tlogger.record_tabular('TruncatedPopulationEliteTestEpLenSum',
                                   np.sum(population_elite_evals_timesteps))

            # Remember the elite of the iteration with the best overall
            # validation mean; its scores are what main() returns.
            if np.mean(population_validation) > state.curr_solution_val:
                state.curr_solution = state.elite.seeds
                state.curr_solution_val = np.mean(population_validation)
                state.curr_solution_test = np.mean(population_elite_evals)

            tlogger.record_tabular('ValidationTimestepsThisIter',
                                   validation_timesteps)
            tlogger.record_tabular('ValidationTimestepsSoFar',
                                   state.validation_timesteps_so_far)
            tlogger.record_tabular('TimestepsThisIter', timesteps_this_iter)
            tlogger.record_tabular(
                'TimestepsPerSecondThisIter',
                timesteps_this_iter / (time.time() - tstart_iteration))
            tlogger.record_tabular('TimestepsComputed', state.num_frames)
            tlogger.record_tabular('TimestepsSoFar', state.timesteps_so_far)
            tlogger.record_tabular('TimeElapsedThisIter',
                                   time_elapsed_this_iter)
            tlogger.record_tabular('TimeElapsedThisIterTotal',
                                   time.time() - tstart_iteration)
            tlogger.record_tabular('TimeElapsed', state.time_elapsed)
            tlogger.record_tabular('TimeElapsedTotal',
                                   time.time() - all_tstart)

            tlogger.dump_tabular()
            fps = state.timesteps_so_far / (time.time() - tstart)
            tlogger.info(
                'Timesteps Per Second: {:.0f}. Elapsed: {:.2f}h ETA {:.2f}h'.
                format(fps, (time.time() - all_tstart) / 3600,
                       (exp['timesteps'] - state.timesteps_so_far) / fps /
                       3600))

            # Raise the per-episode step limit once enough individuals of
            # this generation ran into it.
            if state.adaptive_tslimit:
                if np.mean(
                    [a.training_steps >= state.tslimit
                     for a in results]) > state.incr_tslimit_threshold:
                    state.tslimit = min(
                        state.tslimit * state.tslimit_incr_ratio,
                        state.tslimit_max)
                    tlogger.info('Increased threshold to {}'.format(
                        state.tslimit))

            snap_idx, snapshots = save_snapshot(state, log_dir, snap_idx,
                                                snapshots)
            tlogger.info("Saved iteration {} to {}".format(
                state.it, snapshots[snap_idx - 1]))

            if state.timesteps_so_far >= exp['timesteps']:
                tlogger.info('Training terminated after {} timesteps'.format(
                    state.timesteps_so_far))
                break
            results.clear()

            if exp['selection_threshold'] > 0:
                tlogger.info("Caching parents")
                # Build the new list first: the current cache is still used
                # as a lookup while the new weights are computed.
                new_parents = _build_parent_cache(worker.model,
                                                  noise,
                                                  state,
                                                  exp['selection_threshold'],
                                                  cache=cached_parents)
                cached_parents.clear()
                cached_parents.extend(new_parents)
                tlogger.info("Done caching parents")
    return float(state.curr_solution_test), {
        'val': float(state.curr_solution_val)
    }
Beispiel #2
0
def main(**exp):
    """Run the two-game evolution-strategies training loop.

    Each iteration evaluates mirrored perturbations of ``state.theta``,
    turns the rewards into a step for ``state.optimizer``, evaluates the
    updated parameters, logs overall and per-game statistics, and writes a
    snapshot. The loop stops once ``state.timesteps_so_far`` reaches
    ``exp['timesteps']``.

    Args:
        **exp: experiment configuration. Keys read here: ``model``,
            ``games`` (the first two entries are used), ``population_size``,
            ``num_test_episodes``, ``timesteps``, ``return_proc_mode`` and
            ``l2coeff``.

    Raises:
        NotImplementedError: if ``exp['return_proc_mode']`` is anything
            other than ``'centered_rank'``.
    """
    log_dir = tlogger.log_dir()

    tlogger.info(json.dumps(exp, indent=4, sort_keys=True))
    tlogger.info('Logging to: {}'.format(log_dir))
    Model = neuroevolution.models.__dict__[exp['model']]
    all_tstart = time.time()

    def make_env_game0(b):
        return gym_tensorflow.make(game=exp['games'][0], batch_size=b)

    def make_env_game1(b):
        return gym_tensorflow.make(game=exp['games'][1], batch_size=b)

    def make_offspring(state):
        """Yield ``(theta, idx)`` pairs: a positive then a negative mutation.

        ``noise``, ``rs`` and ``worker`` are resolved from the enclosing
        scope when the generator runs.
        """
        for i in range(exp['population_size'] // 2):
            idx = noise.sample_index(rs, worker.model.num_params)
            mutation_power = state.sample(state.mutation_power)
            pos_theta = worker.model.compute_mutation(noise, state.theta, idx,
                                                      mutation_power)

            yield (pos_theta, idx)
            neg_theta = worker.model.compute_mutation(noise, state.theta, idx,
                                                      -mutation_power)
            # Sanity check: the two mutations must be mirror images around
            # the current parameters.
            diff = (np.max(np.abs((pos_theta + neg_theta) / 2 - state.theta)))
            assert diff < 1e-5, 'Diff too large: {}'.format(diff)

            yield (neg_theta, idx)

    worker = MTConcurrentWorkers([make_env_game0, make_env_game1],
                                 Model,
                                 batch_size=32)

    print("=== [mtes] worker.sess = {}".format(worker.sess))
    with WorkerSession(worker) as sess:
        print("=== [mtes] worker.sess = {}".format(worker.sess))
        noise = SharedNoiseTable()
        rs = np.random.RandomState()
        tlogger.info('Start timing')
        tstart = time.time()

        state = TrainingState(exp)
        state.initialize(rs, noise, worker.model)

        tlogger.info('Start training')
        # Baseline performance of the initial parameters, logged every
        # iteration for comparison.
        game_index, _, initial_performance, _ = worker.monitor_eval_repeated(
            [(state.theta, 0)],
            max_frames=None,
            num_episodes=exp['num_test_episodes'])[0]

        print("=== past worker.monitor_eval_repeated")
        while True:
            print("=== next cycle in while loop")
            tstart_iteration = time.time()
            if state.timesteps_so_far >= exp['timesteps']:
                tlogger.info('Training terminated after {} timesteps'.format(
                    state.timesteps_so_far))
                break
            frames_computed_so_far = sess.run(worker.steps_counter)
            print("=== frames_captured_so_far = {}".format(
                frames_computed_so_far))
            tlogger.info('Evaluating perturbations')
            iterator = iter(
                worker.monitor_eval(make_offspring(state),
                                    max_frames=state.tslimit * 4))
            results = []
            # Results arrive as consecutive positive/negative pairs, so two
            # items are consumed per loop step.
            for game_index, pos_seeds, pos_reward, pos_length in iterator:
                game_index, neg_seeds, neg_reward, neg_length = next(iterator)
                assert pos_seeds == neg_seeds
                results.append(
                    Offspring(pos_seeds, [pos_reward, neg_reward],
                              [pos_length, neg_length]))
            state.num_frames += sess.run(
                worker.steps_counter) - frames_computed_so_far

            state.it += 1
            tlogger.record_tabular('Iteration', state.it)
            tlogger.record_tabular('MutationPower',
                                   state.sample(state.mutation_power))
            tlogger.record_tabular('TimestepLimitPerEpisode', state.tslimit)

            # Trim unwanted results
            results = results[:exp['population_size'] // 2]
            assert len(results) == exp['population_size'] // 2
            rewards = np.array([b for a in results for b in a.rewards])

            timesteps_this_iter = sum(a.training_steps for a in results)
            state.timesteps_so_far += timesteps_this_iter

            tlogger.record_tabular('PopulationEpRewMax', np.max(rewards))
            tlogger.record_tabular('PopulationEpRewMean', np.mean(rewards))
            tlogger.record_tabular('PopulationEpRewMedian', np.median(rewards))
            tlogger.record_tabular('PopulationEpCount', len(rewards))
            tlogger.record_tabular('PopulationTimesteps', timesteps_this_iter)

            # Update Theta
            returns_n2 = np.array([a.rewards for a in results])
            noise_inds_n = [a.seeds for a in results]

            if exp['return_proc_mode'] == 'centered_rank':
                proc_returns_n2 = compute_centered_ranks(returns_n2)
            else:
                raise NotImplementedError(exp['return_proc_mode'])
            # Compute and take step
            g, count = batched_weighted_sum(
                proc_returns_n2[:, 0] - proc_returns_n2[:, 1],
                (noise.get(idx, worker.model.num_params)
                 for idx in noise_inds_n),
                batch_size=500)
            # NOTE: gradients are scaled by \theta
            g /= returns_n2.size

            assert g.shape == (
                worker.model.num_params,
            ) and g.dtype == np.float32 and count == len(noise_inds_n)
            # The first return value of update() is not used here.
            _, state.theta = state.optimizer.update(-g +
                                                    exp['l2coeff'] *
                                                    state.theta)

            time_elapsed_this_iter = time.time() - tstart_iteration
            state.time_elapsed += time_elapsed_this_iter
            tlogger.info('Evaluate elite')
            game_index, _, test_evals, test_timesteps = worker.monitor_eval_repeated(
                [(state.theta, 0)],
                max_frames=None,
                num_episodes=exp['num_test_episodes'])[0]
            test_timesteps = sum(test_timesteps)
            # Log Results
            tlogger.record_tabular('TestRewMin', np.min(test_evals))
            tlogger.record_tabular('TestRewMean', np.mean(test_evals))
            tlogger.record_tabular('TestRewMedian', np.median(test_evals))
            tlogger.record_tabular('TestRewMax', np.max(test_evals))
            tlogger.record_tabular('TestEpCount', len(test_evals))

            # Split the test rewards by the game each episode was played on.
            game_stats = [[], []]
            for k, v in enumerate(game_index):
                game_stats[v].append(test_evals[k])

            for game_no, evals in enumerate(game_stats):
                prefix = 'Game{}Test'.format(game_no)
                if evals:
                    rew_min = np.min(evals)
                    rew_mean = np.mean(evals)
                    rew_median = np.median(evals)
                    rew_max = np.max(evals)
                else:
                    # np.min/np.max raise ValueError on an empty sequence;
                    # log NaN instead of aborting the run when a game
                    # received no test episodes.
                    rew_min = rew_mean = rew_median = rew_max = float('nan')
                tlogger.record_tabular(prefix + 'RewMin', rew_min)
                tlogger.record_tabular(prefix + 'RewMean', rew_mean)
                tlogger.record_tabular(prefix + 'RewMedian', rew_median)
                tlogger.record_tabular(prefix + 'RewMax', rew_max)
                tlogger.record_tabular(prefix + 'EpCount', len(evals))

            tlogger.record_tabular('TestEpLenSum', test_timesteps)
            tlogger.record_tabular('InitialRewMax',
                                   np.max(initial_performance))
            tlogger.record_tabular('InitialRewMean',
                                   np.mean(initial_performance))
            tlogger.record_tabular('InitialRewMedian',
                                   np.median(initial_performance))

            tlogger.record_tabular('TimestepsThisIter', timesteps_this_iter)
            tlogger.record_tabular(
                'TimestepsPerSecondThisIter',
                timesteps_this_iter / (time.time() - tstart_iteration))
            tlogger.record_tabular('TimestepsComputed', state.num_frames)
            tlogger.record_tabular('TimestepsSoFar', state.timesteps_so_far)
            tlogger.record_tabular('TimeElapsedThisIter',
                                   time_elapsed_this_iter)
            tlogger.record_tabular('TimeElapsedThisIterTotal',
                                   time.time() - tstart_iteration)
            tlogger.record_tabular('TimeElapsed', state.time_elapsed)
            tlogger.record_tabular('TimeElapsedTotal',
                                   time.time() - all_tstart)

            tlogger.dump_tabular()

            fps = state.timesteps_so_far / (time.time() - tstart)
            tlogger.info(
                'Timesteps Per Second: {:.0f}. Elapsed: {:.2f}h ETA {:.2f}h'.
                format(fps, (time.time() - all_tstart) / 3600,
                       (exp['timesteps'] - state.timesteps_so_far) / fps /
                       3600))

            # Raise the per-episode step limit once enough individuals of
            # this generation ran into it.
            if state.adaptive_tslimit:
                if np.mean(
                    [a.training_steps >= state.tslimit
                     for a in results]) > state.incr_tslimit_threshold:
                    state.tslimit = min(
                        state.tslimit * state.tslimit_incr_ratio,
                        state.tslimit_max)
                    tlogger.info('Increased threshold to {}'.format(
                        state.tslimit))

            # Write the latest snapshot, then keep a per-generation copy.
            os.makedirs(log_dir, exist_ok=True)
            save_file = os.path.join(log_dir, 'snapshot.pkl')
            with open(save_file, 'wb') as file:
                pickle.dump(state, file, pickle.HIGHEST_PROTOCOL)
            copyfile(
                save_file,
                os.path.join(log_dir,
                             'snapshot_gen{:04d}.pkl'.format(state.it)))
            tlogger.info("Saved iteration {} to {}".format(
                state.it, save_file))

            if state.timesteps_so_far >= exp['timesteps']:
                tlogger.info('Training terminated after {} timesteps'.format(
                    state.timesteps_so_far))
                break
Beispiel #3
0
def main(game,
         filename=None,
         outfile=None,
         model_name="LargeModel",
         no_video=False,
         add_text=False,
         num_runs=RUNS,
         graph=None):
    """Play ``num_runs`` episodes of ``game`` with a stored policy.

    Frames are passed to ``handle_frame`` for on-screen display and/or
    video output; the per-episode rewards plus their mean and standard
    deviation are printed at the end.

    Args:
        game: game name passed to ``gym_tensorflow.make``.
        filename: optional path to a pickled training snapshot. A snapshot
            with an ``elite`` attribute supplies seeds; otherwise its
            ``theta`` attribute is loaded as the weights.
        outfile: optional path of a video file to write.
        model_name: name passed to ``get_model``.
        no_video: when true, no on-screen viewer is created.
        add_text: forwarded to ``handle_frame``.
        num_runs: number of episodes to play.
        graph: optional path passed to ``saver.save``.

    Raises:
        ValueError: if neither seeds nor a loaded snapshot provide weights.
    """
    seeds = default_seeds
    outvid = None
    viewer = None
    iteration = None
    state = None

    if filename:
        # Read-only mode: the snapshot is never written here.
        # NOTE(review): pickle.load can execute arbitrary code; only open
        # snapshots from trusted sources.
        with open(filename, 'rb') as file:
            state = pickle.load(file)
        if hasattr(state, 'elite'):
            seeds = state.elite.seeds
            iteration = state.it
            print("Loading GA snapshot from elite, iteration: {} / {}".format(
                len(seeds), iteration))
        else:
            seeds = None
            iteration = state.it
            print("Loading ES snapshot, iteration: {}".format(state.it))

    if outfile:
        fourcc = cv.VideoWriter_fourcc(*'MJPG')
        outvid = cv.VideoWriter(outfile, fourcc, 16, (VIDEO_SIZE, VIDEO_SIZE))

    # Everything below runs under try/finally so the video file is
    # finalized even when playback fails part-way through.
    try:
        env = gym_tensorflow.make(game, 1)

        model = get_model(model_name)
        obs_op = env.observation()
        reset_op = env.reset()

        if model.requires_ref_batch:

            def make_env(b):
                return gym_tensorflow.make(game=game, batch_size=1)

            with tf.Session() as sess:
                ref_batch = gym_tensorflow.get_ref_batch(make_env, sess, 128)
                ref_batch = ref_batch[:, ...]
        else:
            ref_batch = None

        action_op = model.make_net(tf.expand_dims(obs_op, axis=1),
                                   env.action_space,
                                   batch_size=1,
                                   ref_batch=ref_batch)
        if env.discrete_action:
            action_op = tf.argmax(action_op, axis=-1, output_type=tf.int32)
        rew_op, done_op = env.step(action_op)

        if not no_video:
            from gym.envs.classic_control import rendering
            viewer = rendering.SimpleImageViewer()

        if hasattr(env.unwrapped, 'render'):
            # Prefer the environment's rendered frame over the observation.
            obs_op = env.unwrapped.render()

            def display_obs(im):
                # When axis 1 holds more than one image, the first two are
                # combined with a bitwise OR.
                if im.shape[1] > 1:
                    im = np.bitwise_or(im[0, 0, ...], im[0, 1, ...])
                else:
                    im = im[0, 0, ...]
                handle_frame(im, outvid, viewer, game, iteration, add_text)
        else:

            def display_obs(im):
                # Take the last channel, replicate it to three channels and
                # scale to 8-bit.
                im = im[0, :, :, -1]
                im = np.stack([im] * 3, axis=-1)
                im = (im * 255).astype(np.uint8)
                handle_frame(im, outvid, viewer, game, iteration, add_text)

        rewards = []

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            model.initialize()
            tlogger.info(model.description)

            if seeds:
                noise = SharedNoiseTable()
                weights = model.compute_weights_from_seeds(noise, seeds)
                model.load(sess, 0, weights, seeds)
            else:
                if state is None:
                    raise ValueError(
                        "No seeds available and no snapshot loaded: "
                        "pass `filename` to provide weights")
                weights = state.theta
                model.load(sess, 0, weights, (weights, 0))

            if graph:
                # NOTE(review): `saver` is not defined in this function; it
                # must exist at module level for this branch to work.
                saver.save(sess, graph)

            for i in range(num_runs):
                sess.run(reset_op)
                sess.run(obs_op)
                display_obs(sess.run(obs_op))

                total_rew = 0
                num_frames = 0
                while True:
                    rew, done = sess.run([rew_op, done_op])
                    num_frames += 1
                    total_rew += rew[0]
                    display_obs(sess.run(obs_op))
                    # Pause between frames to slow playback.
                    time.sleep(4 / 60)
                    if done[0]:
                        break

                rewards.append(total_rew)
                print('Final reward: ', total_rew, 'after', num_frames,
                      'steps')

        print(rewards)
        print("Mean: ", np.mean(rewards))
        print("Std: ", np.std(rewards))
    finally:
        if outvid:
            outvid.release()
Beispiel #4
0
def main(**exp):
    """Re-evaluate stored per-iteration training states on a single game.

    For every iteration in ``range(exp['iterations'])`` the pickled
    ``state`` of a previous run is loaded from ``exp['model_logdir']``,
    offspring generated from it are evaluated on the first game, the state's
    ``theta`` is test-evaluated, and the results are pickled into the
    current log directory. The process finally sends SIGTERM to itself.

    Args:
        **exp: experiment configuration. Keys read here: ``model``,
            ``games``, ``iterations``, ``model_logdir`` and
            ``num_test_episodes``.

    Raises:
        RuntimeError: if ``f_isSingleTask(exp)`` is false.
    """
    log_dir = tlogger.log_dir()

    tlogger.info(json.dumps(exp, indent=4, sort_keys=True))
    tlogger.info('Logging to: {}'.format(log_dir))

    # Fail fast, before any worker or TensorFlow session is created.
    if not f_isSingleTask(exp):
        raise RuntimeError(
            "This script is only designed to run single task. Check your configuration and make sure both games have the same name"
        )

    Model = neuroevolution.models.__dict__[exp['model']]

    noise = SharedNoiseTable()
    rs = np.random.RandomState()

    def make_env0(b):
        return gym_tensorflow.make(game=exp["games"][0], batch_size=b)

    def make_env1(b):
        return gym_tensorflow.make(game=exp["games"][1], batch_size=b)

    # The batch is split between the workers depending on
    # f_isSingleTask(exp): the first gets 32 * (1 + flag), the second
    # 32 * (1 - flag).
    workers = [
        ConcurrentWorkers(make_env0,
                          Model,
                          batch_size=32 * (1 + f_isSingleTask(exp))),
        ConcurrentWorkers(make_env1,
                          Model,
                          batch_size=32 * (1 - f_isSingleTask(exp)))
    ]

    # Log line kept so the log output matches earlier runs.
    tlogger.info('Start timing')
    tf_sess = tf.Session()
    tf_sess.run(tf.global_variables_initializer())
    state = TrainingState(exp)
    state.initialize(rs, noise, workers[0].model)

    workers[0].initialize(tf_sess)
    workers[1].initialize(tf_sess)

    for iteration in range(exp['iterations']):
        tlogger.info("BEGINNING ITERATION: {}".format(iteration))

        # override state using one from a previous run
        # (changes made to ``state`` after the save_pickle call below are
        # therefore not carried into the next iteration)
        state = load_pickle(iteration, exp['model_logdir'], "state")

        ##############
        ### GAME 0 ###
        ##############
        worker = workers[0]
        frames_computed_so_far = tf_sess.run(worker.steps_counter)
        game0_results = []
        game0_rewards = []
        game0_episode_lengths = []

        iterator = iter(
            worker.monitor_eval(make_offspring(exp, noise, rs, worker, state),
                                max_frames=state.tslimit * 4))

        # Results arrive as consecutive positive/negative pairs, so two
        # items are consumed per loop step.
        for pos_seeds, pos_reward, pos_length in iterator:
            neg_seeds, neg_reward, neg_length = next(iterator)
            assert pos_seeds == neg_seeds
            result = Offspring(pos_seeds, [pos_reward, neg_reward],
                               [pos_length, neg_length])
            game0_results.append(result)
            game0_rewards.append(result.rewards)
            game0_episode_lengths.append(result.ep_len)
        frames_now = tf_sess.run(worker.steps_counter)
        state.num_frames += frames_now - frames_computed_so_far
        # Move the baseline forward so the elite evaluation below adds only
        # its own frames instead of counting the offspring frames twice.
        frames_computed_so_far = frames_now
        game0_noise_inds_n = [a.seeds for a in game0_results]
        tlogger.info("game0 rewards: {}".format(np.mean(game0_rewards)))
        tlogger.info("game0 eplens: {}".format(np.mean(game0_episode_lengths)))
        save_pickle(iteration, log_dir, "game0_rewards", game0_rewards)
        save_pickle(iteration, log_dir, "game0_episode_lengths",
                    game0_episode_lengths)

        tlogger.info("Saving offsprings seeds")
        save_pickle(iteration, log_dir, "offsprings_seeds", game0_noise_inds_n)
        save_pickle(iteration, log_dir, "state", state)

        ######################
        ### EVALUATE ELITE ###
        ######################
        _, test_evals, test_timesteps = workers[0].monitor_eval_repeated(
            [(state.theta, 0)],
            max_frames=None,
            num_episodes=exp['num_test_episodes'] //
            (2**(1 - f_isSingleTask(exp))))[0]
        tlogger.info("game0 elite: {}".format(np.mean(test_evals)))
        save_pickle(iteration, log_dir, 'game0_elite', test_evals)
        # The key spelling ('timestemps') is kept as is: it names stored
        # data that existing readers may look up.
        save_pickle(iteration, log_dir, 'game0_elite_timestemps',
                    test_timesteps)

        state.num_frames += tf_sess.run(
            worker.steps_counter) - frames_computed_so_far
        state.it += 1

    # Terminate the whole process by signalling ourselves.
    os.kill(os.getpid(), signal.SIGTERM)
Beispiel #5
0
def _evaluate_offspring_pairs(worker, tasks, max_frames):
    """Run ``tasks`` on ``worker`` and pair up consecutive evaluation results.

    ``worker.monitor_eval`` is consumed two items at a time: every item is a
    ``(seeds, reward, length)`` triple, and each "positive" item must be
    immediately followed by a "negative" item carrying the same seeds.

    Args:
        worker: object exposing ``monitor_eval(tasks, max_frames=...)``.
        tasks: offspring to evaluate, forwarded untouched to ``monitor_eval``.
        max_frames: frame limit forwarded to ``monitor_eval``.

    Returns:
        A list of ``Offspring`` objects, one per pair, built from the shared
        seeds, ``[pos_reward, neg_reward]`` and ``[pos_length, neg_length]``.
    """
    paired = []
    episodes = iter(worker.monitor_eval(tasks, max_frames=max_frames))
    for pos_seeds, pos_reward, pos_length in episodes:
        # The matching "negative" evaluation is the very next item.
        neg_seeds, neg_reward, neg_length = next(episodes)
        assert pos_seeds == neg_seeds
        paired.append(
            Offspring(pos_seeds, [pos_reward, neg_reward],
                      [pos_length, neg_length]))
    return paired


def main(**exp):
    """Train a single parameter vector on two games simultaneously.

    Every iteration evaluates a batch of offspring on ``exp['games'][0]``,
    re-evaluates the same seeds on ``exp['games'][1]``, turns both sets of
    returns into one gradient estimate via ``obtain_proc_returns`` and
    ``batched_weighted_sum``, applies it with ``state.optimizer`` and finally
    evaluates the updated ``state.theta`` on both games.

    Keys read from ``exp`` in this function: ``'model'``, ``'games'``,
    ``'iterations'``, ``'learn_option'``, ``'l2coeff'`` and
    ``'num_test_episodes'``. The whole dict is also handed to
    ``TrainingState`` and ``make_offspring``.

    Side effects: logs through ``tlogger``, writes per-iteration pickles with
    ``save_pickle``, writes a TensorFlow checkpoint per iteration, and ends by
    sending ``SIGTERM`` to the current process, so it never returns normally.
    """
    log_dir = tlogger.log_dir()

    tlogger.info(json.dumps(exp, indent=4, sort_keys=True))
    tlogger.info('Logging to: {}'.format(log_dir))
    Model = neuroevolution.models.__dict__[exp['model']]

    noise = SharedNoiseTable()
    rs = np.random.RandomState()

    def make_env0(b):
        return gym_tensorflow.make(game=exp["games"][0], batch_size=b)

    def make_env1(b):
        return gym_tensorflow.make(game=exp["games"][1], batch_size=b)

    workers = [
        ConcurrentWorkers(make_env0, Model, batch_size=64),
        ConcurrentWorkers(make_env1, Model, batch_size=64)
    ]

    # Created after the workers, as in the original statement order.
    saver = tf.train.Saver()

    tlogger.info('Start timing')
    tf_sess = tf.Session()
    tf_sess.run(tf.global_variables_initializer())
    state = TrainingState(exp)
    state.initialize(rs, noise, workers[0].model)

    workers[0].initialize(tf_sess)
    workers[1].initialize(tf_sess)

    for iteration in range(exp['iterations']):
        tlogger.info("BEGINNING ITERATION: {}".format(iteration))

        ##############
        ### GAME 0 ###
        ##############
        worker = workers[0]
        frames_before = tf_sess.run(worker.steps_counter)
        game0_results = _evaluate_offspring_pairs(
            worker,
            make_offspring(exp, noise, rs, worker, state),
            max_frames=state.tslimit * 4)
        state.num_frames += tf_sess.run(worker.steps_counter) - frames_before

        game0_rewards = [res.rewards for res in game0_results]
        game0_episode_lengths = [res.ep_len for res in game0_results]
        game0_returns_n2 = np.array(game0_rewards)
        game0_noise_inds_n = [res.seeds for res in game0_results]
        save_pickle(iteration, log_dir, "game0_rewards", game0_rewards)
        save_pickle(iteration, log_dir, "game0_episode_lengths",
                    game0_episode_lengths)

        ##############
        ### GAME 1 ###
        ##############
        worker = workers[1]
        frames_before = tf_sess.run(worker.steps_counter)
        # Hand game 0's seeds to make_offspring for the game 1 evaluation.
        seeds_vector = np.array(game0_noise_inds_n)
        game1_results = _evaluate_offspring_pairs(
            worker,
            make_offspring(exp, noise, rs, worker, state, seeds_vector),
            max_frames=state.tslimit * 4)
        state.num_frames += tf_sess.run(worker.steps_counter) - frames_before

        game1_rewards = [res.rewards for res in game1_results]
        game1_episode_lengths = [res.ep_len for res in game1_results]
        game1_returns_n2 = np.array(game1_rewards)
        game1_noise_inds_n = [res.seeds for res in game1_results]
        save_pickle(iteration, log_dir, "game1_rewards", game1_rewards)
        save_pickle(iteration, log_dir, "game1_episode_lengths",
                    game1_episode_lengths)

        tlogger.info("Saving offsprings seeds")
        save_pickle(iteration, log_dir, "offsprings_seeds", game1_noise_inds_n)

        ####################
        ### UPDATE THETA ###
        ####################
        game_returns = [game0_returns_n2, game1_returns_n2]
        proc_returns = obtain_proc_returns(exp['learn_option'], game_returns)

        assert game0_noise_inds_n == game1_noise_inds_n
        # Concatenate the two seed lists: game 0 entries first, then game 1.
        noise_inds_n = game0_noise_inds_n + game1_noise_inds_n

        num_params = worker.model.num_params
        g, count = batched_weighted_sum(
            proc_returns[:, 0] - proc_returns[:, 1],
            (noise.get(idx, num_params) for idx in noise_inds_n),
            batch_size=500)

        # Normalise by the total number of returns collected on both games.
        returns_n2 = np.array(game0_rewards + game1_rewards)
        g /= returns_n2.size

        assert (g.shape == (num_params,) and g.dtype == np.float32
                and count == len(noise_inds_n))
        _, state.theta = state.optimizer.update(
            -g + exp['l2coeff'] * state.theta)

        save_pickle(iteration, log_dir, "state", state)

        ######################
        ### EVALUATE ELITE ###
        ######################
        for game_idx, elite_worker in enumerate(workers):
            # Measure each elite evaluation against its own counter snapshot
            # so frames already added after the offspring evaluations above
            # are not counted a second time, and both games are accounted for.
            frames_before = tf_sess.run(elite_worker.steps_counter)
            _, test_evals, test_timesteps = elite_worker.monitor_eval_repeated(
                [(state.theta, 0)],
                max_frames=None,
                num_episodes=exp['num_test_episodes'] // 2)[0]
            state.num_frames += (tf_sess.run(elite_worker.steps_counter) -
                                 frames_before)

            tlogger.info("game{} elite: {}".format(game_idx,
                                                   np.mean(test_evals)))
            save_pickle(iteration, log_dir, 'game{}_elite'.format(game_idx),
                        test_evals)
            # The "timestemps" spelling is kept on purpose: it is the name
            # under which existing artifacts are stored.
            save_pickle(iteration, log_dir,
                        'game{}_elite_timestemps'.format(game_idx),
                        test_timesteps)

        saver.save(tf_sess, "{}/model-{}".format(log_dir, state.it))

        state.it += 1

    # Terminate the whole process by signalling ourselves.
    os.kill(os.getpid(), signal.SIGTERM)
Beispiel #6
0
def main(game, filename=None, out_dir=None, model_name='LargeModel',
         add_text=False, num_runs=RUNS, layer=None):
    """Roll out a policy on ``game`` and record the frames as MP4 videos.

    The policy weights come either from ``default_seeds`` (no ``filename``),
    from ``state.elite.seeds`` of a pickled snapshot that has an ``elite``
    attribute, or from ``state.theta`` of a snapshot that does not. One video
    named ``all.mp4`` plus one video per key returned by ``get_nn_images`` is
    written to ``out_dir``. Per-episode rewards and their mean / standard
    deviation are printed at the end.

    Args:
        game: game identifier passed to ``gym_tensorflow.make``.
        filename: optional path of a pickled training snapshot.
        out_dir: directory receiving the ``.mp4`` files. It is passed straight
            to ``os.path.join``, so a real path is required even though the
            default is ``None``.
        model_name: name handed to ``get_model``.
        add_text: forwarded to ``handle_frame``.
        num_runs: number of episodes to roll out.
        layer: not used by this function.
    """
    seeds = default_seeds
    viewer = None
    iteration = None
    state = None

    if filename:
        # NOTE: pickle.load can execute arbitrary code; only load snapshots
        # from a trusted source. The file is only read, so 'rb' is enough and
        # also works for read-only snapshot files.
        with open(filename, 'rb') as file:
            state = pickle.load(file)
        iteration = state.it
        if hasattr(state, 'elite'):
            seeds = state.elite.seeds
            print("Loading GA snapshot from elite, iteration: {} / {}".format(len(seeds), iteration))
        else:
            seeds = None
            print("Loading ES snapshot, iteration: {}".format(state.it))

    fourcc = cv.VideoWriter_fourcc(*'H264')

    env = gym_tensorflow.make(game, 1)

    model = get_model(model_name)
    obs_op = env.observation()
    reset_op = env.reset()

    if model.requires_ref_batch:
        def make_env(b):
            # The requested batch size is ignored; a single env is always built.
            return gym_tensorflow.make(game=game, batch_size=1)
        with tf.Session() as sess:
            ref_batch = gym_tensorflow.get_ref_batch(make_env, sess, 128)
            ref_batch = ref_batch[:, ...]
    else:
        ref_batch = None

    # The network is fed from the observation op, not the render op below.
    input_op = tf.expand_dims(obs_op, axis=1)
    action_op = model.make_net(input_op, env.action_space, batch_size=1, ref_batch=ref_batch)
    if env.discrete_action:
        action_op = tf.argmax(action_op, axis=-1, output_type=tf.int32)
    rew_op, done_op = env.step(action_op)

    out_vids = {'all': cv.VideoWriter(os.path.join(out_dir, 'all.mp4'),
                                      fourcc, 16, (VIDEO_SIZE, VIDEO_SIZE))}

    # From here on every writer in out_vids is released even if the rollout
    # fails part-way through.
    try:
        if hasattr(env.unwrapped, 'render'):
            # Record rendered frames instead of raw observations.
            obs_op = env.unwrapped.render()

            def prepare_frame(im):
                # With more than one entry along axis 1, OR the first two.
                if im.shape[1] > 1:
                    return np.bitwise_or(im[0, 0, ...], im[0, 1, ...])
                return im[0, 0, ...]
        else:
            def prepare_frame(im):
                # Take the last channel, replicate it to three channels and
                # scale to uint8.
                im = im[0, :, :, -1]
                im = np.stack([im] * 3, axis=-1)
                return (im * 255).astype(np.uint8)

        def display_obs(im, viz):
            im = prepare_frame(im)
            for key in out_vids.keys():
                # NOTE(review): `im` is reassigned, so each key receives the
                # image already combined for the previous key - confirm that
                # this accumulation is what combine_viz expects.
                im = combine_viz(im, viz, key)
                handle_frame(im, out_vids[key], viewer, game, iteration, add_text)

        rewards = []

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            model.initialize()
            tlogger.info(model.description)

            if seeds:
                noise = SharedNoiseTable()
                weights = model.compute_weights_from_seeds(noise, seeds)
                model.load(sess, 0, weights, seeds)
            else:
                weights = state.theta
                model.load(sess, 0, weights, (weights, 0))

            # One probe call to learn which visualisations exist, so that a
            # writer can be opened for each of them before the rollout.
            _, images = get_nn_images(sess, input_op, model)

            for key in images.keys():
                out_vids[key] = cv.VideoWriter(
                    os.path.join(out_dir, '{}.mp4'.format(key.replace('/', '-'))),
                    fourcc, 16, (VIDEO_SIZE, VIDEO_SIZE))

            for _ in range(num_runs):
                sess.run(reset_op)

                total_rew = 0
                num_frames = 0
                while True:
                    img = sess.run(obs_op)
                    _, images = get_nn_images(sess, input_op, model)

                    rew, done = sess.run([rew_op, done_op])
                    num_frames += 1
                    total_rew += rew[0]
                    display_obs(img, images)
                    # An episode ends when the env reports done or after a
                    # hard cap of 50 steps.
                    if done[0] or num_frames == 50:
                        rewards.append(total_rew)
                        print('Final reward: ', total_rew, 'after', num_frames, 'steps')
                        break

        print(rewards)
        print("Mean: ", np.mean(rewards))
        print("Std: ", np.std(rewards))
    finally:
        for writer in out_vids.values():
            writer.release()