Example 1
def train_function(config):

    # ----------- Alg parameters ----------------- #
    experiment = config['experiment']
    if experiment == "particle":
        scenario_name = config['scenario']
    seed = config['seed']
    np.random.seed(seed)
    random.seed(seed)

    # Curriculum stage
    stage = config['stage']
    port = config['port']
    dir_name = config['dir_name']
    dir_restore = config['dir_restore']
    use_alg_credit = config['use_alg_credit']
    use_qmix = config['use_qmix']
    use_Q_credit = config['use_Q_credit']
    # If 1, then uses Q-net and global reward
    use_Q = config['use_Q']
    use_V = config['use_V']
    if experiment == "sumo":
        dimensions = config['dimensions_sumo']
    elif experiment == "particle":
        dimensions = config['dimensions_particle']
    # If 1, then restores variables from same stage
    restore_same_stage = config['restore_same_stage']
    # If 1, then does not restore variables, even if stage > 1
    train_from_nothing = config['train_from_nothing']
    # Name of model to restore
    model_name = config['model_name']
    # Total number of training episodes
    N_train = config['N_train']
    period = config['period']
    # Number of evaluation episodes to run every <period> training episodes
    N_eval = config['N_eval']
    summarize = config['summarize']
    alpha = config['alpha']
    lr_Q = config['lr_Q']
    lr_V = config['lr_V']
    lr_actor = config['lr_actor']
    dual_buffer = config['dual_buffer']
    buffer_size = config['buffer_size']
    threshold = config['threshold']
    batch_size = config['batch_size']
    pretrain_episodes = config['pretrain_episodes']
    steps_per_train = config['steps_per_train']
    max_steps = config['max_steps']
    # Probability of using random configuration
    prob_random = config['prob_random']

    epsilon_start = config['epsilon_start']
    epsilon_end = config['epsilon_end']
    epsilon_div = config['epsilon_div']
    epsilon_step = (epsilon_start - epsilon_end) / float(epsilon_div)
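    # Linear exploration schedule: after the pretraining episodes, epsilon is
    # decreased by epsilon_step at the end of each training episode until it
    # reaches epsilon_end (see the update after the episode loop below).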

    if experiment == "sumo":
        # ----------- SUMO parameters ---------------- #
        with open('config_sumo_stage%d.json' % stage) as f:
            config_sumo = json.load(f)
        n_agents = config_sumo["n_agents"]
        list_goals_fixed = config_sumo['goal_lane']
        list_routes_fixed = config_sumo['route']
        list_lanes_fixed = config_sumo['lane']
        list_goal_pos = config_sumo['goal_pos']
        list_speeds = config_sumo['speed']
        init_positions = config_sumo['init_position']
        list_id = config_sumo['id']
        list_vtypes = config_sumo['vtypes']
        depart_mean = config_sumo['depart_mean']
        depart_stdev = config_sumo['depart_stdev']
        total_length = config_sumo['total_length']
        total_width = config_sumo['total_width']
        save_threshold = config_sumo['save_threshold']
        map_route_idx = {'route_ramp': 0, 'route_straight': 1}

        sim = sumo_simulator.Simulator(port,
                                       list_id=list_id,
                                       other_lc_mode=0b1000000001,
                                       sublane_res=0.8,
                                       seed=seed)
        for i in range(int(2 / sim.dt)):
            sim.step()
    elif experiment == 'particle':
        with open(config["particle_config"]) as f:
            config_particle = json.load(f)
        n_agents = config_particle['n_agents']
        scenario = scenarios.load(scenario_name + ".py").Scenario()
        world = scenario.make_world(n_agents, config_particle, prob_random)
        env = MultiAgentEnv(world,
                            scenario.reset_world,
                            scenario.reward,
                            scenario.observation,
                            None,
                            scenario.done,
                            max_steps=max_steps)
    elif experiment == 'checkers':
        with open("config_checkers_stage%d.json" % stage) as f:
            config_checkers = json.load(f)
        n_agents = config_checkers['n_agents']
        dimensions = config_checkers['dimensions']
        init = config_checkers['init']
        env = checkers.Checkers(init['n_rows'], init['n_columns'],
                                init['n_obs'], init['agents_r'],
                                init['agents_c'], n_agents, max_steps)

    l_action = dimensions['l_action']
    l_goal = dimensions['l_goal']

    # Create entire computational graph
    # Creation of new trainable variables for new curriculum
    # stage is handled by networks.py, given the stage number
    if use_alg_credit:
        if experiment == 'checkers':
            alg = alg_credit_checkers.Alg(experiment,
                                          dimensions,
                                          stage,
                                          n_agents,
                                          lr_V=lr_V,
                                          lr_Q=lr_Q,
                                          lr_actor=lr_actor,
                                          use_Q_credit=use_Q_credit,
                                          use_V=use_V,
                                          nn=config_checkers['nn'])
        else:
            alg = alg_credit.Alg(experiment,
                                 dimensions,
                                 stage,
                                 n_agents,
                                 lr_V=lr_V,
                                 lr_Q=lr_Q,
                                 lr_actor=lr_actor,
                                 use_Q_credit=use_Q_credit,
                                 use_V=use_V,
                                 nn=config['nn'])
    elif not use_qmix:
        if experiment == 'checkers':
            alg = alg_baseline_checkers.Alg(experiment,
                                            dimensions,
                                            stage,
                                            n_agents,
                                            lr_V=lr_V,
                                            lr_Q=lr_Q,
                                            lr_actor=lr_actor,
                                            use_Q=use_Q,
                                            use_V=use_V,
                                            alpha=alpha,
                                            nn=config_checkers['nn'],
                                            IAC=config['IAC'])
        else:
            alg = alg_baseline.Alg(experiment,
                                   dimensions,
                                   stage,
                                   n_agents,
                                   lr_V=lr_V,
                                   lr_Q=lr_Q,
                                   lr_actor=lr_actor,
                                   use_Q=use_Q,
                                   use_V=use_V,
                                   alpha=alpha,
                                   nn=config['nn'],
                                   IAC=config['IAC'])
    else:
        print("Using QMIX")
        if experiment == 'checkers':
            alg = alg_qmix_checkers.Alg(experiment,
                                        dimensions,
                                        stage,
                                        n_agents,
                                        lr_Q=lr_Q,
                                        nn=config_checkers['nn'])
        else:
            alg = alg_qmix.Alg(experiment,
                               dimensions,
                               stage,
                               n_agents,
                               lr_Q=lr_Q)

    print("Initialized computational graph")

    list_variables = tf.trainable_variables()
    if stage == 1 or restore_same_stage or train_from_nothing:
        saver = tf.train.Saver()
    elif stage == 2:
        # to_restore = [v for v in list_variables if ('stage-%d'%stage not in v.name.split('/') and 'Policy_target' not in v.name.split('/'))]
        to_restore = []
        for v in list_variables:
            list_split = v.name.split('/')
            if ('stage-%d' % stage not in list_split
                ) and ('Policy_target' not in list_split) and (
                    'Q_credit_main' not in list_split) and ('Q_credit_target'
                                                            not in list_split):
                to_restore.append(v)
        saver = tf.train.Saver(to_restore)
    else:
        # restore only those variables that were not
        # just created at this curriculum stage
        to_restore = [
            v for v in list_variables
            if 'stage-%d' % stage not in v.name.split('/')
        ]
        saver = tf.train.Saver(to_restore)

    # Separate name avoids shadowing the `config` dict argument
    config_proto = tf.ConfigProto()
    config_proto.gpu_options.allow_growth = True
    tf.set_random_seed(seed)
    sess = tf.Session(config=config_proto)

    writer = tf.summary.FileWriter('../saved/%s' % dir_name, sess.graph)

    sess.run(tf.global_variables_initializer())
    print("Initialized variables")

    if train_from_nothing == 0:
        print("Restoring variables from %s" % dir_restore)
        saver.restore(sess, '../saved/%s/%s' % (dir_restore, model_name))
        if stage == 2 and use_alg_credit and use_Q_credit:
            # Copy weights of Q_global to Q_credit at the start of Stage 2
            sess.run(alg.list_initialize_credit_ops)
            for var in list_variables:
                if var.name == 'Q_global_main/Q_branch1/kernel:0':
                    print("Q_global")
                    print(sess.run(var))
                    print("")
                if var.name == 'Q_credit_main/Q_branch1/kernel:0':
                    print("Q_credit")
                    print(sess.run(var))
                    print("")

    # initialize target networks to equal main networks
    sess.run(alg.list_initialize_target_ops)

    # save everything without exclusion
    saver = tf.train.Saver(max_to_keep=None)

    epsilon = epsilon_start
    # Running sums for the per-<period> averages written to the 'century' log
    reward_local_century = np.zeros(n_agents)
    reward_global_century = 0

    # Write log headers
    header = "Episode,r_global"
    header_c = "Century,r_global_avg"
    for idx in range(n_agents):
        header += ',r_%d' % idx
        header_c += ',r_avg_%d' % idx
    header_c += ",r_global_eval"
    for idx in range(n_agents):
        header_c += ',r_eval_%d' % idx

    if experiment == 'sumo':
        for idx in range(n_agents):
            header += ',route_%d,lane_%d,goal_%d' % (idx, idx, idx)
    header_c += ',r_eval_local,duration (s)'
    header += '\n'
    header_c += '\n'
    if not os.path.exists('../log/%s' % dir_name):
        os.makedirs('../log/%s' % dir_name)
    with open('../log/%s/log.csv' % dir_name, 'w') as f:
        f.write(header)
    with open('../log/%s/log_century.csv' % dir_name, 'w') as f:
        f.write(header_c)
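    # With dual_buffer enabled, buf.add() is called at the end of each episode with a
    # flag marking unsuccessful episodes (total local reward below <threshold> for SUMO,
    # or any collision for particle); the buffer presumably keeps successful and
    # unsuccessful episodes in separate memories (the "buffer good" / "buffer others"
    # sizes printed during logging).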

    if dual_buffer:
        buf = replay_buffer_dual.Replay_Buffer(size=buffer_size)
    else:
        buf = replay_buffer.Replay_Buffer(size=buffer_size)

    t_start = time.time()

    dist_action = np.zeros(l_action)
    step = 0
    # Each iteration is a training episode
    for idx_episode in range(1, N_train + 1):
        # print("Episode", idx_episode)
        if experiment == "sumo":
            t_ms = sim.traci.simulation.getCurrentTime()
            # SUMO time functions return negative values after 24 days (in milliseconds) of simulation time.
            # Outside that valid range, fall back to a departure time of 0, which triggers an immediate departure.
            if 0 < t_ms < 2073600e3:
                depart_times = [
                    np.random.normal(t_ms / 1000.0 + depart_mean[idx],
                                     depart_stdev) for idx in range(n_agents)
                ]
            else:
                depart_times = [0 for idx in range(n_agents)]

            # Goals for input to policy and value function
            goals = np.zeros([n_agents, l_goal])
            list_routes = ['route_straight'] * n_agents
            list_lanes = [0] * n_agents
            list_goal_lane = [0] * n_agents
            rand_num = random.random()
            if rand_num < prob_random:
                # Random settings for route, lane and goal
                init = 'Random'
                for idx in range(n_agents):
                    route = 'route_straight'
                    lane = np.random.choice([0, 1, 2, 3], p=np.ones(4) * 0.25)
                    goal_lane = np.random.choice(np.arange(l_goal),
                                                 p=np.ones(l_goal) /
                                                 float(l_goal))
                    list_routes[idx] = route
                    list_lanes[idx] = lane
                    list_goal_lane[idx] = goal_lane
                    goals[idx, goal_lane] = 1
            else:
                init = 'Preset'
                # Use predetermined values for route, lane, goal
                for idx in range(n_agents):
                    list_routes[idx] = list_routes_fixed[idx]
                    goal_lane = list_goals_fixed[idx]
                    list_goal_lane[idx] = goal_lane
                    list_lanes[idx] = list_lanes_fixed[idx]
                    goals[idx, goal_lane] = 1

            env = multicar_simple.Multicar(sim,
                                           n_agents,
                                           list_goal_lane,
                                           list_goal_pos,
                                           list_routes,
                                           list_speeds,
                                           list_lanes,
                                           init_positions,
                                           list_id,
                                           list_vtypes,
                                           depart_times,
                                           total_length=total_length,
                                           total_width=total_width,
                                           safety=True)
            global_state, local_others, local_self, done = env.reset()
        elif experiment == "particle":
            global_state, local_others, local_self, done = env.reset()
            goals = np.zeros([n_agents, l_goal])
            for idx in range(n_agents):
                goals[idx] = env.world.landmarks[idx].state.p_pos
        elif experiment == "checkers":
            if n_agents == 1:
                if np.random.randint(2) == 0:
                    goals = np.array([[1, 0]])
                else:
                    goals = np.array([[0, 1]])
            else:
                goals = np.eye(n_agents)
            global_state, local_others, local_self_t, local_self_v, done = env.reset(
                goals)
            actions_prev = np.zeros(n_agents, dtype=int)

        reward_global = 0
        reward_local = np.zeros(n_agents)

        # step = 0
        summarized = False
        if dual_buffer:
            buf_episode = []
        while not done:

            if idx_episode < pretrain_episodes and (stage == 1 or
                                                    train_from_nothing == 1):
                # Random actions when filling replay buffer
                actions = np.random.randint(0, l_action, n_agents)
            else:
                # Run actor network for all agents as batch
                if experiment == 'checkers':
                    actions = alg.run_actor(actions_prev, local_others,
                                            local_self_t, local_self_v, goals,
                                            epsilon, sess)
                else:
                    actions = alg.run_actor(local_others, local_self, goals,
                                            epsilon, sess)

            dist_action[actions[0]] += 1
            if experiment == 'sumo':
                # check feasible actions
                actions = env.check_actions(actions)

            # step environment
            if experiment == 'checkers':
                next_global_state, next_local_others, next_local_self_t, next_local_self_v, reward, local_rewards, done = env.step(
                    actions)
            else:
                next_global_state, next_local_others, next_local_self, reward, local_rewards, done = env.step(
                    actions)

            step += 1

            # store transition into memory
            if dual_buffer:
                if experiment == 'checkers':
                    buf_episode.append(
                        np.array([
                            global_state[0], global_state[1],
                            np.array(local_others),
                            np.array(local_self_t),
                            np.array(local_self_v), actions_prev, actions,
                            reward, local_rewards, next_global_state[0],
                            next_global_state[1],
                            np.array(next_local_others),
                            np.array(next_local_self_t),
                            np.array(next_local_self_v), done, goals
                        ]))
                else:
                    buf_episode.append(
                        np.array([
                            global_state,
                            np.array(local_others),
                            np.array(local_self), actions, reward,
                            local_rewards, next_global_state,
                            np.array(next_local_others),
                            np.array(next_local_self), done, goals
                        ]))
            else:
                if experiment == 'checkers':
                    buf.add(
                        np.array([
                            global_state[0], global_state[1],
                            np.array(local_others),
                            np.array(local_self_t),
                            np.array(local_self_v), actions_prev, actions,
                            reward, local_rewards, next_global_state[0],
                            next_global_state[1],
                            np.array(next_local_others),
                            np.array(next_local_self_t),
                            np.array(next_local_self_v), done, goals
                        ]))
                else:
                    buf.add(
                        np.array([
                            global_state,
                            np.array(local_others),
                            np.array(local_self), actions, reward,
                            local_rewards, next_global_state,
                            np.array(next_local_others),
                            np.array(next_local_self), done, goals
                        ]))

            if (idx_episode >= pretrain_episodes) and (step % steps_per_train
                                                       == 0):
                # Sample batch of transitions from replay buffer
                batch = buf.sample_batch(batch_size)

                if summarize and idx_episode % period == 0 and not summarized:
                    # Write TF summary every <period> episodes,
                    # at the first <steps_per_train> step
                    alg.train_step(sess,
                                   batch,
                                   epsilon,
                                   idx_episode,
                                   summarize=True,
                                   writer=writer)
                    summarized = True
                else:
                    alg.train_step(sess,
                                   batch,
                                   epsilon,
                                   idx_episode,
                                   summarize=False,
                                   writer=None)

            global_state = next_global_state
            local_others = next_local_others
            if experiment == 'checkers':
                local_self_t = next_local_self_t
                local_self_v = next_local_self_v
                actions_prev = actions
            else:
                local_self = next_local_self

            reward_local += local_rewards
            reward_global += reward

        if dual_buffer:
            if experiment == 'sumo':
                buf.add(buf_episode, np.sum(reward_local) < threshold)
            elif experiment == 'particle':
                buf.add(buf_episode, scenario.collisions != 0)

        if idx_episode >= pretrain_episodes and epsilon > epsilon_end:
            epsilon -= epsilon_step

        reward_local_century += reward_local
        reward_global_century += reward_global

        # ----------- Log performance --------------- #

        if idx_episode % period == 0:
            dist_action = dist_action / np.sum(dist_action)
            t_end = time.time()
            print("\n Evaluating")
            if experiment == 'sumo':
                r_local_eval, r_global_eval = evaluate.test(
                    N_eval, sim, sess, depart_mean, depart_stdev, n_agents,
                    l_goal, list_routes_fixed, list_lanes_fixed,
                    list_goals_fixed, prob_random, list_goal_pos, list_speeds,
                    init_positions, list_id, list_vtypes, alg)
                if np.all(r_local_eval > save_threshold):
                    saver.save(
                        sess, '../saved/%s/model_good_%d.ckpt' %
                        (dir_name, idx_episode))
            elif experiment == 'particle':
                r_local_eval, r_global_eval = evaluate.test_particle(
                    N_eval, env, sess, n_agents, l_goal, alg, render=False)
            elif experiment == 'checkers':
                r_local_eval, r_global_eval = evaluate.test_checkers(
                    N_eval, env, sess, n_agents, alg)
                if stage == 1 and np.sum(r_local_eval) > 9.0:
                    saver.save(
                        sess, '../saved/%s/model_good_%d.ckpt' %
                        (dir_name, idx_episode))
            s = '%d,%.2f,' % (idx_episode,
                              reward_global_century / float(period))
            s += ','.join([
                '{:.2f}'.format(val / float(period))
                for val in reward_local_century
            ])
            s += ',%.2f,' % (r_global_eval)
            s += ','.join(['{:.2f}'.format(val) for val in r_local_eval])
            s += ',%.2f,%d' % (np.sum(r_local_eval), int(t_end - t_start))
            s += '\n'
            print(s)
            with open('../log/%s/log_century.csv' % dir_name, 'a') as f:
                f.write(s)
            reward_local_century = np.zeros(n_agents)
            reward_global_century = 0
            print("Action distribution ", dist_action)
            if dual_buffer:
                print(
                    "length buffer good %d, length buffer others %d, epsilon %.3f"
                    % (len(buf.memory_2), len(buf.memory_1), epsilon))
            else:
                print("epsilon %.3f" % epsilon)
            dist_action = np.zeros(l_action)

            t_start = time.time()

        s = '%d,%.2f,' % (idx_episode, reward_global)
        s += ','.join(['{:.2f}'.format(val) for val in reward_local])
        if experiment == 'sumo':
            for idx in range(n_agents):
                s += ',%d,%d,%d' % (map_route_idx[list_routes[idx]],
                                    list_lanes[idx], list_goal_lane[idx])
        s += '\n'
        with open('../log/%s/log.csv' % dir_name, 'a') as f:
            f.write(s)

    print("Saving stage %d variables" % stage)
    if not os.path.exists('../saved/%s' % dir_name):
        os.makedirs('../saved/%s' % dir_name)
    saver.save(sess, '../saved/%s/model_final.ckpt' % dir_name)

Example 2
def train_function(config):

    config_env = config['env']
    config_main = config['main']
    config_alg = config['alg']

    seed = config_main['seed']
    np.random.seed(seed)
    random.seed(seed)
    tf.set_random_seed(seed)

    dir_name = config_main['dir_name']
    model_name = config_main['model_name']
    summarize = config_main['summarize']
    save_period = config_main['save_period']

    os.makedirs('../results/%s' % dir_name, exist_ok=True)
    with open('../results/%s/%s' % (dir_name, 'config.json'), 'w') as f:
        json.dump(config, f, indent=4)

    N_train = config_alg['N_train']
    N_eval = config_alg['N_eval']
    period = config_alg['period']
    buffer_size = config_alg['buffer_size']
    batch_size = config_alg['batch_size']
    pretrain_episodes = config_alg['pretrain_episodes']
    steps_per_train = config_alg['steps_per_train']

    epsilon_start = config_alg['epsilon_start']
    epsilon_end = config_alg['epsilon_end']
    epsilon_div = config_alg['epsilon_div']
    epsilon_step = (epsilon_start - epsilon_end) / float(epsilon_div)
    epsilon = epsilon_start

    env = env_wrapper.Env(config_env, config_main)
    config_env_mod = config_env.copy()
    # Evaluation episodes are played against the stock AI instead of self-play
    config_env_mod['self_play'] = False
    # All away players are controlled by the stock AI
    config_env_mod['num_away_ai_players'] = config_env_mod['num_away_players']
    env_eval = env_wrapper.Env(config_env_mod, config_main)
    self_play = config_env['self_play']
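    # In self-play, the same learned policy controls both teams: actions are computed
    # separately from the home and away observations, and both teams' transitions are
    # added to the shared replay buffer in the episode loop below.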
    if self_play:
        assert (config_env['num_away_ai_players'] == 0)

    l_state = env.state_dim
    l_action = env.action_dim
    l_obs = env.obs_dim
    N_home = config_env['num_home_players']

    if config_main['alg_name'] == 'qmix':
        alg = alg_qmix.Alg(config_alg, N_home, l_state, l_obs, l_action,
                           config['nn_qmix'])
    elif config_main['alg_name'] == 'iql':
        alg = alg_iql.Alg(config_alg, N_home, l_state, l_obs, l_action,
                          config['nn_iql'])

    config_proto = tf.ConfigProto()
    config_proto.gpu_options.allow_growth = True
    sess = tf.Session(config=config_proto)
    sess.run(tf.global_variables_initializer())

    sess.run(alg.list_initialize_target_ops)

    if summarize:
        writer = tf.summary.FileWriter('../results/%s' % dir_name, sess.graph)
    saver = tf.train.Saver(max_to_keep=config_main['max_to_keep'])

    buf = replay_buffer.Replay_Buffer(size=buffer_size)

    # Logging
    header = "Episode,Step,Step_train,R_avg,R_eval,Steps_per_eps,Opp_win_rate,Win_rate,T_env,T_alg\n"
    with open("../results/%s/log.csv" % dir_name, 'w') as f:
        f.write(header)

    t_start = time.time()
    t_env = 0
    t_alg = 0

    reward_period = 0

    step = 0
    step_train = 0
    for idx_episode in range(1, N_train + 1):

        state_home, state_away, list_obs_home, list_obs_away, done = env.reset(
        )

        reward_episode = 0
        summarized = 0
        while not done:

            if idx_episode < pretrain_episodes:
                if self_play:
                    actions_int_h, actions_int_a = env.random_actions()
                    actions_int = (actions_int_h, actions_int_a)
                else:
                    actions_int = env.random_actions()
            else:
                t_alg_start = time.time()
                if self_play:
                    actions_int_h = alg.run_actor(list_obs_home, epsilon, sess)
                    actions_int_a = alg.run_actor(list_obs_away, epsilon, sess)
                    actions_int = (actions_int_h, actions_int_a)
                else:
                    actions_int = alg.run_actor(list_obs_home, epsilon, sess)
                t_alg += time.time() - t_alg_start

            t_env_start = time.time()
            state_home_next, state_away_next, list_obs_home_next, list_obs_away_next, reward, local_rewards, done, info = env.step(
                actions_int)
            t_env += time.time() - t_env_start

            step += 1

            if self_play:
                buf.add(
                    np.array([
                        state_home,
                        np.array(list_obs_home), actions_int_h, reward[0],
                        state_home_next,
                        np.array(list_obs_home_next), done
                    ]))
                buf.add(
                    np.array([
                        state_away,
                        np.array(list_obs_away), actions_int_a, reward[1],
                        state_away_next,
                        np.array(list_obs_away_next), done
                    ]))
            else:
                buf.add(
                    np.array([
                        state_home,
                        np.array(list_obs_home), actions_int, reward,
                        state_home_next,
                        np.array(list_obs_home_next), done
                    ]))

            if (idx_episode >= pretrain_episodes) and (step % steps_per_train
                                                       == 0):
                batch = buf.sample_batch(batch_size)
                t_alg_start = time.time()
                if summarize and idx_episode % period == 0 and not summarized:
                    alg.train_step(sess,
                                   batch,
                                   step_train,
                                   summarize=True,
                                   writer=writer)
                    summarized = True
                else:
                    alg.train_step(sess,
                                   batch,
                                   step_train,
                                   summarize=False,
                                   writer=None)
                step_train += 1
                t_alg += time.time() - t_alg_start

            state_home = state_home_next
            list_obs_home = list_obs_home_next
            state_away = state_away_next
            list_obs_away = list_obs_away_next
            if self_play:
                reward_episode += reward[0]
            else:
                reward_episode += reward

        if idx_episode >= pretrain_episodes and epsilon > epsilon_end:
            epsilon -= epsilon_step

        reward_period += reward_episode

        if idx_episode == 1 or idx_episode % (5 * period) == 0:
            print(
                '{:>10s}{:>10s}{:>12s}{:>8s}{:>8s}{:>15s}{:>15s}{:>10s}{:>12s}{:>12s}'
                .format(*(header.strip().split(','))))

        if idx_episode % period == 0:
            # Evaluation episodes
            r_avg_eval, steps_per_episode, win_rate, win_rate_opponent = evaluate.test(
                N_eval, env_eval, sess, alg)
            if win_rate >= config_main['save_threshold']:
                saver.save(
                    sess, '../results/%s/%s-%d' %
                    (dir_name, "model_good.ckpt", idx_episode))

            s = '%d,%d,%d,%.2f,%.2f,%d,%.2f,%.2f,%.5e,%.5e\n' % (
                idx_episode, step, step_train, reward_period / float(period),
                r_avg_eval, steps_per_episode, win_rate_opponent, win_rate,
                t_env, t_alg)
            with open('../results/%s/log.csv' % dir_name, 'a') as f:
                f.write(s)
            print(
                '{:10d}{:10d}{:12d}{:8.2f}{:8.2f}{:15d}{:15.2f}{:10.2f}{:12.5e}{:12.5e}\n'
                .format(idx_episode, step, step_train,
                        reward_period / float(period), r_avg_eval,
                        int(steps_per_episode), win_rate_opponent, win_rate,
                        t_env, t_alg))
            reward_period = 0

        if idx_episode % save_period == 0:
            saver.save(
                sess,
                '../results/%s/%s-%d' % (dir_name, "model.ckpt", idx_episode))

    saver.save(sess, '../results/%s/%s' % (dir_name, model_name))

    with open('../results/%s/time.txt' % dir_name, 'a') as f:
        f.write('t_env_total,t_env_per_step,t_alg_total,t_alg_per_step\n')
        f.write('%.5e,%.5e,%.5e,%.5e' %
                (t_env, t_env / step, t_alg, t_alg / step))
Example 3
def train_function(config):

    config_env = config['env']
    config_main = config['main']
    config_alg = config['alg']
    config_h = config['h_params']
    
    seed = config_main['seed']
    np.random.seed(seed)
    random.seed(seed)
    tf.set_random_seed(seed)

    alg_name = config_main['alg_name']
    dir_name = config_main['dir_name']
    model_name = config_main['model_name']
    summarize = config_main['summarize']
    save_period = config_main['save_period']
    
    os.makedirs('../results/%s' % dir_name, exist_ok=True)
    with open('../results/%s/%s'
              % (dir_name, 'config.json'), 'w') as f:
        json.dump(config, f, indent=4)

    N_train = config_alg['N_train']
    N_eval = config_alg['N_eval']
    period = config_alg['period']
    buffer_size = config_alg['buffer_size']
    batch_size = config_alg['batch_size']
    pretrain_episodes = config_alg['pretrain_episodes']
    steps_per_train = config_alg['steps_per_train']
    
    epsilon_start = config_alg['epsilon_start']
    epsilon_end = config_alg['epsilon_end']
    epsilon_div = config_alg['epsilon_div']
    epsilon_step = (epsilon_start - epsilon_end)/float(epsilon_div)
    epsilon = epsilon_start
    
    # Final number of roles
    N_roles = config_h['N_roles']
    steps_per_assign = config_h['steps_per_assign']

    # Number of roles increases according to a curriculum
    N_roles_current = config_h['N_roles_start']
    assert(N_roles_current <= N_roles)
    curriculum_threshold = config_h['curriculum_threshold']

    # Reward coefficient
    alpha = config_h['alpha_start']
    alpha_end = config_h['alpha_end']
    alpha_step = config_h['alpha_step']
    alpha_threshold = config_h['alpha_threshold']
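    # alpha mixes the global environment reward with the intrinsic reward for the
    # low-level policies (local_rewards = alpha * reward + (1 - alpha) * reward_intrinsic
    # in the episode loop below); it is annealed from alpha_start toward alpha_end by
    # alpha_step whenever the evaluation win rate reaches alpha_threshold.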

    # Number of single-agent trajectory segments used for each decoder training step 
    N_batch_hsd = config_h['N_batch_hsd']

    env = env_wrapper.Env(config_env, config_main)
    
    l_state = env.state_dim
    l_action = env.action_dim
    l_obs = env.obs_dim
    N_home = config_env['num_home_players']

    alg = alg_hsd.Alg(config_alg, config_h, N_home, l_state, l_obs, l_action, N_roles, config['nn_hsd'])
    
    config_proto = tf.ConfigProto()
    config_proto.gpu_options.allow_growth = True
    sess = tf.Session(config=config_proto)
    sess.run(tf.global_variables_initializer())
    
    sess.run(alg.list_initialize_target_ops)

    if summarize:
        writer = tf.summary.FileWriter('../results/%s' % dir_name, sess.graph)
    saver = tf.train.Saver(max_to_keep=config_main['max_to_keep'])

    # Replay buffer for high level policy
    buf_high = replay_buffer.Replay_Buffer(size=buffer_size)
    # Buffer for low level agent policy
    buf_low = replay_buffer.Replay_Buffer(size=buffer_size)
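    # buf_high stores one transition per role-assignment window (every
    # <steps_per_assign> environment steps), while buf_low stores one
    # transition per environment step.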

    # Dataset of [obs traj, z] for training decoder
    dataset = []

    header = "Episode,Step,Step_train,N_z,alpha,Exp_prob,R_avg,R_eval,Steps_per_eps,Opp_win_rate,Win_rate,T_env,T_alg\n"
    with open("../results/%s/log.csv" % dir_name, 'w') as f:
        f.write(header)
    
    t_env = 0
    t_alg = 0
    
    reward_period = 0
    expected_prob = 0

    step = 0
    step_train = 0
    step_h = 0

    for idx_episode in range(1, N_train+1):
        
        state_home, state_away, list_obs_home, list_obs_away, done = env.reset()

        # Variables with suffix '_h' are used for training the high level policy
        state_home_h, state_away_h, list_obs_home_h, list_obs_away_h = state_home, state_away, list_obs_home, list_obs_away
        # Cumulative discounted reward for high-level policy
        reward_h = 0
        # Action taken by high-level policy, limited to current number of roles in curriculum
        roles_int = np.random.randint(0, N_roles_current, N_home)

        # List of lists, where each sublist is one agent's observation trajectory.
        # Each trajectory keeps at most the <steps_per_assign> most recent steps
        # and is used for computing the intrinsic reward.
        list_obs_traj = [ [] for idx_agent in range(N_home) ]

        reward_episode = 0
        summarized = 0
        summarized_h = 0
        step_episode = 0 # steps within an episode
        while not done:
            
            reward_intrinsic = np.zeros(N_home)
            # Run high-level policy as usual to select the subgoal
            if step_episode % steps_per_assign == 0:
                if step_episode != 0:
                    # The environment state at this point, e.g. <state_home>,
                    # acts like the "next state" for the high-level policy
                    # All of the intervening environment steps act as a single step for the high-level policy
                    r_discounted = reward_h * (config_alg['gamma']**steps_per_assign)
                    buf_high.add( np.array([ state_home_h, np.array(list_obs_home_h), roles_int, r_discounted, state_home, np.array(list_obs_home), done ]) )

                    # Store all agents' observation trajectories into dataset for training decoder
                    for idx in range(N_home):
                        dataset.append( np.array([ np.array(list_obs_traj[idx][-steps_per_assign:]), roles[idx]]) )

                    # Compute intrinsic reward for all agents, returns an np.array of N_agents scalar values
                    reward_intrinsic = alg.compute_reward( sess, np.array( list_obs_traj ), roles )
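                    # The trajectory-role pairs are scored by the algorithm's compute_reward;
                    # presumably this is the decoder-based intrinsic reward, reflecting how
                    # identifiable each agent's assigned role is from its recent trajectory.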
                    
                step_h += 1

                # Take high-level action for all agents
                if idx_episode < pretrain_episodes:
                    roles_int = np.random.randint(0, N_roles_current, N_home)
                else:
                    t_alg_start = time.time()
                    roles_int = alg.assign_roles(list_obs_home, epsilon, sess, N_roles_current)
                    t_alg += time.time() - t_alg_start
                roles = np.zeros([N_home, N_roles])
                roles[np.arange(N_home), roles_int] = 1
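                # One-hot role vectors condition the low-level actor below and label the
                # observation trajectories stored in the decoder dataset.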

                if (idx_episode >= pretrain_episodes) and (step_h % steps_per_train == 0):
                    # Conduct training of high-level policy
                    batch = buf_high.sample_batch(batch_size)
                    t_alg_start = time.time()
                    if summarize and idx_episode % period == 0 and not summarized_h:
                        alg.train_policy_high(sess, batch, step_train, summarize=True, writer=writer)
                        summarized_h = True
                    else:
                        alg.train_policy_high(sess, batch, step_train, summarize=False, writer=None)
                    step_train += 1
                    t_alg += time.time() - t_alg_start
    
                # Update high-level state
                state_home_h, state_away_h, list_obs_home_h, list_obs_away_h = state_home, state_away, list_obs_home, list_obs_away

                reward_h = 0

            # Take low-level actions, conditioned on high level role assignment
            if idx_episode < pretrain_episodes:
                actions_int = env.random_actions()
            else:
                t_alg_start = time.time()
                actions_int = alg.run_actor(list_obs_home, roles, epsilon, sess)
                t_alg += time.time() - t_alg_start

            t_env_start = time.time()
            state_home_next, state_away_next, list_obs_home_next, list_obs_away_next, reward, local_rewards, done, info = env.step(actions_int)
            t_env += time.time() - t_env_start

            # Ignore local_rewards from environment. Compute local rewards using intrinsic and global environment reward
            local_rewards = np.array( [reward] * N_home )
            local_rewards = alpha * local_rewards + (1 - alpha) * reward_intrinsic
            
            # Collect low-level observation into trajectories
            for idx_agent in range(N_home):
                list_obs_traj[idx_agent].append( list_obs_home[idx_agent] )
                # Limit to be max length <steps_per_assign>
                list_obs_traj[idx_agent] = list_obs_traj[idx_agent][-steps_per_assign:]

            step += 1
            step_episode += 1

            l_temp = [np.array(list_obs_home), actions_int, local_rewards, np.array(list_obs_home_next), roles, done]
            a_temp = np.empty(len(l_temp), dtype=object)
            a_temp[:] = l_temp
            buf_low.add( a_temp )

            if (idx_episode >= pretrain_episodes) and (step % steps_per_train == 0):
                # Train low-level policies
                batch = buf_low.sample_batch(batch_size)
                t_alg_start = time.time()
                if summarize and idx_episode % period == 0 and not summarized:
                    alg.train_policy_low(sess, batch, step_train, summarize=True, writer=writer)
                    summarized = True
                else:
                    alg.train_policy_low(sess, batch, step_train, summarize=False, writer=None)
                step_train += 1
                t_alg += time.time() - t_alg_start

            state_home = state_home_next
            list_obs_home = list_obs_home_next
            reward_episode += reward
            reward_h += reward

            if done:
                # Since the episode is done, we also terminate the current role assignment period,
                # even if not all <steps_per_assign> have been completed
                r_discounted = reward_h * config_alg['gamma']**(step_episode % steps_per_assign)
                buf_high.add( np.array([ state_home_h, np.array(list_obs_home_h), roles_int, r_discounted, state_home, np.array(list_obs_home), done]) )
                
                # Append trajectories into dataset, so that decoder sees states that get termination reward
                if step_episode >= steps_per_assign:
                    for idx in range(N_home):
                        dataset.append( np.array([ np.array(list_obs_traj[idx][-steps_per_assign:]), roles[idx]]) )

        # If dataset size is large enough, then train decoder
        if len(dataset) >= N_batch_hsd:
            t_alg_start = time.time()
            if summarize:
                expected_prob = alg.train_decoder(sess, dataset[ : N_batch_hsd], step_train, summarize=True, writer=writer)
            else:
                expected_prob = alg.train_decoder(sess, dataset[ : N_batch_hsd], step_train, summarize=False, writer=None)
            step_train += 1
            t_alg += time.time() - t_alg_start
            # Decide whether to increase the number of subgoals
            if expected_prob >= curriculum_threshold:
                N_roles_current = min(int(1.5 * N_roles_current + 1), N_roles)
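            # For illustration (assuming N_roles_start = 2): the available-role count
            # would grow 2 -> 4 -> 7 -> 11 -> ... each time the threshold is met,
            # capped at N_roles.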
            # Empty the dataset
            dataset = []

        if idx_episode >= pretrain_episodes and epsilon > epsilon_end:
            epsilon -= epsilon_step
    
        reward_period += reward_episode
            
        if idx_episode == 1 or idx_episode % (5*period) == 0:
            print('{:>10s}{:>10s}{:>12s}{:>5s}{:>8s}{:>10s}{:>8s}{:>8s}{:>15s}{:>15s}{:>10s}{:>12s}{:>12s}'.format(*(header.strip().split(','))))
    
        if idx_episode % period == 0:
            # Evaluation episodes
            r_avg_eval, steps_per_episode, win_rate, win_rate_opponent = evaluate.test_hierarchy(alg_name, N_eval, env, sess, alg, steps_per_assign)
            if win_rate >= config_main['save_threshold']:
                saver.save(sess, '../results/%s/%s-%d' % (dir_name, "model_good.ckpt", idx_episode))
    
            # Adjust alpha coefficient for environment reward versus intrinsic reward
            if win_rate >= alpha_threshold:
                alpha = max(alpha_end, alpha - alpha_step)

            s = '%d,%d,%d,%d,%.2f,%.3e,%.2f,%.2f,%d,%.2f,%.2f,%.5e,%.5e\n' % (idx_episode, step, step_train, N_roles_current, alpha, expected_prob, reward_period/float(period), r_avg_eval, steps_per_episode, win_rate_opponent, win_rate, t_env, t_alg)
            with open('../results/%s/log.csv' % dir_name, 'a') as f:
                f.write(s)
            print('{:10d}{:10d}{:12d}{:5d}{:8.2f}{:10.3e}{:8.2f}{:8.2f}{:15d}{:15.2f}{:10.2f}{:12.5e}{:12.5e}\n'.format(idx_episode, step, step_train, N_roles_current, alpha, expected_prob, reward_period/float(period), r_avg_eval, int(steps_per_episode), win_rate_opponent, win_rate, t_env, t_alg))
            reward_period = 0
    
        if idx_episode % save_period == 0:
            saver.save(sess, '../results/%s/%s-%d' % (dir_name, "model.ckpt", idx_episode))
            
    saver.save(sess, '../results/%s/%s' % (dir_name, model_name))
    
    with open('../results/%s/time.txt' % dir_name, 'a') as f:
        f.write('t_env_total,t_env_per_step,t_alg_total,t_alg_per_step\n')
        f.write('%.5e,%.5e,%.5e,%.5e' % (t_env, t_env/step, t_alg, t_alg/step))

Example 4
def train_function(config):

    config_env = config['env']
    config_main = config['main']
    config_alg = config['alg']
    config_h = config['h_params']
    
    seed = config_main['seed']
    np.random.seed(seed)
    random.seed(seed)
    tf.set_random_seed(seed)
    
    alg_name = config_main['alg_name']
    dir_name = config_main['dir_name']
    model_name = config_main['model_name']
    summarize = config_main['summarize']
    save_period = config_main['save_period']
    
    os.makedirs('../results/%s'%dir_name, exist_ok=True)
    
    N_train = config_alg['N_train']
    N_eval = config_alg['N_eval']
    period = config_alg['period']
    buffer_size = config_alg['buffer_size']
    batch_size = config_alg['batch_size']
    pretrain_episodes = config_alg['pretrain_episodes']
    steps_per_train = config_alg['steps_per_train']
    
    epsilon_start = config_alg['epsilon_start']
    epsilon_end = config_alg['epsilon_end']
    epsilon_div = config_alg['epsilon_div']
    epsilon_step = (epsilon_start - epsilon_end)/float(epsilon_div)
    epsilon = epsilon_start
    
    N_roles = config_h['N_roles']
    steps_per_assign = config_h['steps_per_assign']
    # Each <steps_per_assign> is one "step" for the high-level policy
    # This means we train the high-level policy once for every
    # <steps_per_train> high-level steps
    steps_per_train_h = steps_per_assign * steps_per_train
    
    env = env_wrapper.Env(config_env, config_main)
    
    l_state = env.state_dim
    l_action = env.action_dim
    l_obs = env.obs_dim
    N_home = config_env['num_home_players']
    
    if alg_name == 'qmix':
        alg = alg_qmix.Alg(config_alg, N_home, l_state, l_obs, l_action, config['nn_qmix'])
    elif alg_name == 'hsd-scripted' or alg_name == 'mara-c':
        alg = alg_hsd_scripted.Alg(alg_name, config_alg, N_home, l_state, l_obs, l_action, N_roles, config['nn_hsd_scripted'])
    
    config_proto = tf.ConfigProto()
    config_proto.gpu_options.allow_growth = True
    sess = tf.Session(config=config_proto)
    sess.run(tf.global_variables_initializer())
    
    sess.run(alg.list_initialize_target_ops)
    
    if summarize:
        writer = tf.summary.FileWriter('../results/%s' % dir_name, sess.graph)
    saver = tf.train.Saver(max_to_keep=config_main['max_to_keep'])
    
    # Buffer for high level role assignment policy
    buf_high = replay_buffer.Replay_Buffer(size=buffer_size)
    # Buffer for low level agent policy
    buf_low = replay_buffer.Replay_Buffer(size=buffer_size)
    
    # Logging
    header = "Episode,Step,Step_train,R_avg,R_eval,Steps_per_eps,Opp_win_rate,Win_rate,T_env,T_alg\n"
    with open("../results/%s/log.csv" % dir_name, 'w') as f:
        f.write(header)
    
    t_start = time.time()
    t_env = 0
    t_alg = 0
    
    reward_period = 0
    
    step = 0
    step_train = 0
    step_h = 0
    for idx_episode in range(1, N_train+1):
    
        state_home, state_away, list_obs_home, list_obs_away, done = env.reset()
    
        # Variables with suffix _h are high-level quantities for training the role assignment policy
        # These are the high-level equivalent of the s_t in a usual transition tuple (s_t, a_t, s_{t+1})
        state_home_h, state_away_h, list_obs_home_h, list_obs_away_h = state_home, state_away, list_obs_home, list_obs_away
        # Cumulative discounted reward for high-level policy
        reward_h = 0
        # Action taken by high-level role assignment policy
        roles_int = np.random.randint(0, N_roles, N_home)
    
        reward_episode = 0
        summarized = 0
        summarized_h = 0
        step_episode = 0 # steps within an episode
        while not done:
            
            if step_episode % steps_per_assign == 0:
                if step_episode != 0:
                    # The environment state at this point, e.g. <state_home>,
                    # acts like the "next state" for the high-level policy
                    # All of the intervening environment steps act as a single step for the high-level policy
                    r_discounted = reward_h * (config_alg['gamma']**steps_per_assign)
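                    # hsd-scripted stores per-agent role assignments in the high-level
                    # transition, whereas mara-c stores a single centralized action index.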
                    if alg_name == 'hsd-scripted':
                        buf_high.add( np.array([ state_home_h, np.array(list_obs_home_h), roles_int, r_discounted, state_home, np.array(list_obs_home), done ]) )
                    elif alg_name == 'mara-c':
                        buf_high.add( np.array([ state_home_h, idx_action_centralized, r_discounted, state_home, done ]) )
                step_h += 1
                    
                # Get new role assignment, i.e. take high-level action
                if idx_episode < pretrain_episodes:
                    roles_int = np.random.randint(0, N_roles, N_home)
                    if alg_name == 'mara-c':
                        idx_action_centralized = np.random.randint(0, alg.dim_role_space)
                else:
                    t_alg_start = time.time()
                    if alg_name == 'hsd-scripted':
                        roles_int = alg.assign_roles(list_obs_home, epsilon, sess)
                    elif alg_name == 'mara-c':
                        roles_int, idx_action_centralized = alg.assign_roles_centralized(state_home, epsilon, sess)
                    t_alg += time.time() - t_alg_start
                roles = np.zeros([N_home, N_roles])
                roles[np.arange(N_home), roles_int] = 1
    
                if (idx_episode >= pretrain_episodes) and (step_h % steps_per_train == 0):
                    # Conduct training of high-level policy
                    batch = buf_high.sample_batch(batch_size)
                    t_alg_start = time.time()
                    if summarize and idx_episode % period == 0 and not summarized_h:
                        alg.train_step(sess, batch, step_train, summarize=True, writer=writer)
                        summarized_h = True
                    else:
                        alg.train_step(sess, batch, step_train, summarize=False, writer=None)
                    step_train += 1
                    t_alg += time.time() - t_alg_start
    
                # Update high-level state
                state_home_h, state_away_h, list_obs_home_h, list_obs_away_h = state_home, state_away, list_obs_home, list_obs_away
    
                reward_h = 0
    
            # Take low-level actions, conditioned on roles
            if idx_episode < pretrain_episodes:
                actions_int = env.random_actions()
            else:
                t_alg_start = time.time()
                actions_int = alg.run_actor(list_obs_home, roles, epsilon, sess)
                t_alg += time.time() - t_alg_start
                
            t_env_start = time.time()
            state_home_next, state_away_next, list_obs_home_next, list_obs_away_next, reward, local_rewards, done, info = env.step(actions_int, roles_int)
            t_env += time.time() - t_env_start
    
            step += 1
            step_episode += 1
    
            l_temp = [np.array(list_obs_home), actions_int, local_rewards, np.array(list_obs_home_next), roles]
            a_temp = np.empty(len(l_temp), dtype=object)
            a_temp[:] = l_temp
            buf_low.add( a_temp )
    
            if (idx_episode >= pretrain_episodes) and (step % steps_per_train == 0):
                # Train low-level policies
                batch = buf_low.sample_batch(batch_size)
                t_alg_start = time.time()
                if summarize and idx_episode % period == 0 and not summarized:
                    alg.train_step_low(sess, batch, step_train, summarize=True, writer=writer)
                    summarized = True
                else:
                    alg.train_step_low(sess, batch, step_train, summarize=False, writer=None)
                step_train += 1
                t_alg += time.time() - t_alg_start
    
            state_home = state_home_next
            list_obs_home = list_obs_home_next
            reward_episode += reward
            reward_h += reward
    
            if done:
                # Since the episode is done, we also terminate the current role assignment period,
                # even if not all <steps_per_assign> have been completed
                r_discounted = reward_h * config_alg['gamma']**(step_episode % steps_per_assign)
                if alg_name == 'hsd-scripted':
                    buf_high.add( np.array([ state_home_h, np.array(list_obs_home_h), roles_int, r_discounted, state_home, np.array(list_obs_home), done]) )
                elif alg_name == 'mara-c':
                    buf_high.add( np.array([ state_home_h, idx_action_centralized, r_discounted, state_home, done ]) )
    
        if idx_episode >= pretrain_episodes and epsilon > epsilon_end:
            epsilon -= epsilon_step
    
        reward_period += reward_episode
    
        if idx_episode == 1 or idx_episode % (5*period) == 0:
            print('{:>10s}{:>10s}{:>12s}{:>8s}{:>8s}{:>15s}{:>15s}{:>10s}{:>12s}{:>12s}'.format(*(header.strip().split(','))))
    
        if idx_episode % period == 0:
            # Evaluation episodes
            r_avg_eval, steps_per_episode, win_rate, win_rate_opponent = evaluate.test_hierarchy(alg_name, N_eval, env, sess, alg, steps_per_assign)
            if win_rate >= config_main['save_threshold']:
                saver.save(sess, '../results/%s/%s-%d' % (dir_name, "model_good.ckpt", idx_episode))
    
            s = '%d,%d,%d,%.2f,%.2f,%d,%.2f,%.2f,%.5e,%.5e\n' % (idx_episode, step, step_train, reward_period/float(period), r_avg_eval, steps_per_episode, win_rate_opponent, win_rate, t_env, t_alg)
            with open('../results/%s/log.csv' % dir_name, 'a') as f:
                f.write(s)
            print('{:10d}{:10d}{:12d}{:8.2f}{:8.2f}{:15d}{:15.2f}{:10.2f}{:12.5e}{:12.5e}\n'.format(idx_episode, step, step_train, reward_period/float(period), r_avg_eval, int(steps_per_episode), win_rate_opponent, win_rate, t_env, t_alg))
            reward_period = 0
    
        if idx_episode % save_period == 0:
            saver.save(sess, '../results/%s/%s-%d' % (dir_name, "model.ckpt", idx_episode))
            
    saver.save(sess, '../results/%s/%s' % (dir_name, model_name))
    
    with open('../results/%s/time.txt' % dir_name, 'a') as f:
        f.write('t_env_total,t_env_per_step,t_alg_total,t_alg_per_step\n')
        f.write('%.5e,%.5e,%.5e,%.5e' % (t_env, t_env/step, t_alg, t_alg/step))