Example #1
    def __init__(self, env, args, itr, seed):
        # Set the random seed
        if seed is not None:
            self.setup_seed(seed)
        self.args = args

        # Get the environment
        self.env = env
        # Process (instance) index
        self.pid = itr

        self.replay_buffer = ReplayBuffer(self.args)

        self.win_rates = []
        '''
        Here, episode_reward is the cumulative reward of a single episode,
        episodes_reward collects the cumulative rewards of several episodes, and
        episodes_rewards collects those episode rewards across multiple evaluations.
        '''
        self.episodes_rewards = []
        self.evaluate_itr = []

        self.max_win_rate = 0
        self.time_steps = 0

        # Where results and models are saved; the counter (itr) makes it easy to run several instances at once
        alg_dir = self.args.alg + '_' + str(self.args.epsilon_anneal_steps // 10000) + 'w' + '_' + \
                  str(self.args.target_update_period)
        self.alg_tag = '_' + self.args.optim

        if self.args.her:
            self.alg_tag += str(self.args.her)
            alg_dir += '_her=' + str(self.args.her)

        # self.save_path = self.args.result_dir + '/' + alg_dir + '/' + self.args.map + '/' + itr
        self.save_path = self.args.result_dir + '/' + self.args.map + '/' + alg_dir + '/' + str(itr)
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

        self.args.model_dir = args.model_dir + '/' + args.map + '/' + alg_dir + '/' + str(itr)

        self.agents = Agents(args, itr=itr)
        print('step runner initialized')
        if self.args.her:
            print('using HER')
Example #2
    def __init__(self, params):

        seed = params['general_params']['seed']
        self.__set_seed(seed=seed)

        env_params = params['env_params']
        env_params['seed'] = seed
        self.env = UnityEnv(params=env_params)

        agent_params = params['agent_params']

        self.__num_of_agents = self.env.observation_space.shape[0]
        state_size = self.env.observation_space.shape[1]
        action_size = self.env.action_space_size
        agent_params['num_of_agents'] = self.__num_of_agents
        agent_params['state_size'] = state_size
        agent_params['action_size'] = action_size
        self.agents = Agents(params=agent_params)

        trainer_params = params['trainer_params']
        self.learning_rate_decay = trainer_params['learning_rate_decay']
        self.results_path = trainer_params['results_path']
        self.model_path = trainer_params['model_path']
        self.t_max = trainer_params['t_max']

        self.exploration_noise = UOProcess()

        # data gathering variables
        self.avg_rewards = []
        self.scores = []
        self.score = 0

        self.sigma = 0.5

        print("MADDPG agent.")
        print("Configuration:")
        pprint(params)
        logging.info("Configuration: {}".format(params))
Example #3
    def __init__(self, env, args, itr):
        # Get arguments
        # self.args = get_common_args()
        self.args = args

        # Get the environment
        self.env = env
        # Process (instance) index
        self.pid = itr

        self.agents = Agents(args, itr=itr)
        # Without network reuse there are multiple agents; sharing parameters during training means there is just one network
        # if not self.args.reuse_network:
        #     self.agents = []
        #     for i in range(self.args.n_agents):
        #         self.agents.append(Agents(self.args, i))

        # self.rollout = RollOut(self.agents, self.args)

        self.replay_buffer = ReplayBuffer(self.args)

        self.win_rates = []
        '''
        Here, episode_reward is the cumulative reward of a single episode,
        episodes_reward collects the cumulative rewards of several episodes, and
        episodes_rewards collects those episode rewards across multiple evaluations.
        '''
        self.episodes_rewards = []
        self.evaluate_itr = []

        self.max_win_rate = 0

        # Where results and models are saved; the counter (itr) makes it easy to run several instances at once
        self.save_path = self.args.result_dir + '/' + self.args.alg + '/' + self.args.map + '/' + str(
            itr)
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
        print('runner initialized')
Example #4
def import_agent_file(scenario_settings, con, cur, engine, model_settings,
                      agent_file_status, input_name):
    """
    Generates new agents or uses pre-generated agents from provided .pkl file
    
    Parameters
    ----------
    **scenario_settings** : 'SQL schema'
        Schema of the scenario settings
    **con** : 'SQL connection'
        SQL connection to connect to database
    **cur** : 'SQL cursor'
        Cursor
    **engine** : 'SQL engine'
        SQL engine to interpret SQL query
    **model_settings** : 'object'
        Model settings that apply to all scenarios
    **agent_file_status** : 'attribute'
        Attribute that describes whether to use a pre-generated agent file or create new agents
    **input_name** : 'string'
        .pkl file name substring of pre-generated agent table
    
    Returns
    -------
    **solar_agents** : 'Class'
        Instance of the Agents class with either pre-generated or newly created agent data

    """

    schema = scenario_settings.schema
    input_agent_dir = model_settings.input_agent_dir
    state_to_model = scenario_settings.state_to_model

    ISO_List = ['ERCOT', 'NEISO', 'NYISO', 'CAISO', 'PJM', 'MISO', 'SPP']

    if agent_file_status == 'Use pre-generated Agents':

        userdefined_table_name = "input_" + input_name + "_user_defined"
        scenario_userdefined_name = get_userdefined_scenario_settings(
            schema, userdefined_table_name, con)
        scenario_userdefined_value = scenario_userdefined_name['val'].values[0]

        solar_agents_df = pd.read_pickle(
            os.path.join(input_agent_dir, scenario_userdefined_value + ".pkl"))

        # Non-ISO regions: restrict the pre-generated agents to the modeled states
        if scenario_settings.region not in ISO_List:
            solar_agents_df = solar_agents_df[
                solar_agents_df['state_abbr'].isin(state_to_model)]

        if solar_agents_df.empty:
            raise ValueError(
                'Region not present within pre-generated agent file - Edit Inputsheet'
            )

        solar_agents = Agents(solar_agents_df)

        solar_agents.on_frame(agent_mutation.elec.reassign_agent_tariffs, con)

    else:
        raise ValueError(
            'Generating agents is not supported at this time. Please select "Use pre-generated Agents" in the input sheet'
        )

    return solar_agents
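For orientation, a hedged usage sketch of the function above: the settings objects and database handles are assumed to be created by the surrounding dGen setup code, and the input_name value is a placeholder, not taken from the source.

# Hypothetical call; scenario_settings, con, cur, engine and model_settings
# are assumed to come from the surrounding dGen setup code.
solar_agents = import_agent_file(
    scenario_settings, con, cur, engine, model_settings,
    agent_file_status='Use pre-generated Agents',
    input_name='agent_core_attributes')  # placeholder table-name substring
print('{} agents loaded'.format(len(solar_agents)))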
Example #5
'''
Windows (x86_64): "path/to/Reacher_Windows_x86_64/Reacher.exe"
Linux (x86): "path/to/Reacher_Linux/Reacher.x86"
Linux (x86_64): "path/to/Reacher_Linux/Reacher.x86_64"
'''
import numpy as np
import torch
from unityagents import UnityEnvironment
# Agents is the project's multi-agent DDPG wrapper (its import path is not shown in this snippet)

env = UnityEnvironment(file_name="./Reacher_Linux/Reacher.x86")

brain_name = env.brain_names[0]  # get the default brain
brain = env.brains[brain_name]
env_info = env.reset(train_mode=True)[brain_name]  # reset the environment
num_agents = len(env_info.agents)  # get number of agents
states = env_info.vector_observations  # get the current state
state_size = states.shape[1]
action_size = brain.vector_action_space_size
agents = Agents(
    num_agents=num_agents,  # create RL agents
    state_size=state_size,
    action_size=action_size,
    random_seed=0)

checkpoint = torch.load('solution.pth', map_location={'cuda:0':
                                                      'cpu'})  # load weights
agents.actor_local.load_state_dict(checkpoint['actor_local_state_dict'])
agents.actor_target.load_state_dict(checkpoint['actor_target_state_dict'])
agents.actor_optimizer.load_state_dict(
    checkpoint['actor_optimizer_state_dict'])
agents.critic_local.load_state_dict(checkpoint['critic_local_state_dict'])
agents.critic_target.load_state_dict(checkpoint['critic_target_state_dict'])
agents.critic_optimizer.load_state_dict(
    checkpoint['critic_optimizer_state_dict'])

scores = np.zeros(num_agents)  # initialize the score (for each agent)
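The snippet ends right after initializing the per-agent scores; a minimal inference loop could continue roughly as sketched below. The environment calls follow the unityagents pattern already used above, while the act() method on the Agents object is an assumption (the actual method name is not shown here).

env_info = env.reset(train_mode=False)[brain_name]  # reset for inference
states = env_info.vector_observations
while True:
    actions = agents.act(states)                    # assumed Agents API; adjust to the real method
    env_info = env.step(actions)[brain_name]        # send all actions to the environment
    states = env_info.vector_observations           # next states
    scores += env_info.rewards                      # accumulate per-agent rewards
    if np.any(env_info.local_done):                 # episode finished for any agent
        break
print('Mean score over agents: {:.2f}'.format(np.mean(scores)))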
Example #6
def make_env(n_substeps=3,
             horizon=250,
             deterministic_mode=False,
             n_agents=1,
             env_no=1):

    env = Base(n_agents=n_agents,
               n_substeps=n_substeps,
               horizon=horizon,
               floor_size=10,
               grid_size=50,
               deterministic_mode=deterministic_mode,
               env_no=env_no)

    # Add Walls
    #env.add_module(RandomWalls(grid_size=5, num_rooms=2, min_room_size=5, door_size=5, low_outside_walls=True, outside_wall_rgba="1 1 1 0.1"))

    # Add Agents
    first_agent_placement = uniform_placement
    second_agent_placement = uniform_placement
    agent_placement_fn = [first_agent_placement] + [second_agent_placement]
    env.add_module(Agents(n_agents, placement_fn=agent_placement_fn))

    # Add LidarSites
    n_lidar_per_agent = 1
    visualize_lidar = False
    compress_lidar_scale = None
    if visualize_lidar:
        env.add_module(
            LidarSites(n_agents=n_agents, n_lidar_per_agent=n_lidar_per_agent))

    env.reset()

    keys_self = ['agent_qpos_qvel']
    keys_mask_self = []  #['mask_aa_obs']
    keys_external = []  #['agent_qpos_qvel']
    keys_mask_external = []
    keys_copy = []

    env = AddConstantObservationsWrapper(
        env, new_obs={'agents_health': np.full((n_agents, 1), 100.0)})
    keys_self += ['agents_health']
    env = ProjectileWrapper(env)
    #env = HealthWrapper(env)

    env = SplitMultiAgentActions(env)
    env = DiscretizeActionWrapper(env, 'action_movement')
    env = AgentAgentObsMask2D(env)

    if n_lidar_per_agent > 0:
        env = Lidar(env,
                    n_lidar_per_agent=n_lidar_per_agent,
                    visualize_lidar=visualize_lidar,
                    compress_lidar_scale=compress_lidar_scale)
        keys_copy += ['lidar']
        keys_external += ['lidar']

    env = SplitObservations(env,
                            keys_self + keys_mask_self,
                            keys_copy=keys_copy)
    env = DiscardMujocoExceptionEpisodes(env)

    env = SelectKeysWrapper(env,
                            keys_self=keys_self,
                            keys_external=keys_external,
                            keys_mask=keys_mask_self + keys_mask_external,
                            flatten=False)

    return env
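A hedged usage sketch for make_env: the wrappers above yield a gym-style multi-agent environment with dictionary observations, so it can be exercised with random actions roughly as follows (key names and shapes depend on n_agents and the wrapper configuration).

env = make_env(n_agents=2)
obs = env.reset()
print(list(obs.keys()))                 # observation groups selected by SelectKeysWrapper
for _ in range(10):
    action = env.action_space.sample()  # random joint action
    obs, rew, done, info = env.step(action)
    if done:
        obs = env.reset()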
Example #7
    def __init__(self, agent_id, evaluation):
        Agents.__init__(self, agent_id, evaluation)
        self.wins = {}  # a (agent_id, state): count dict
        self.plays = {}  # a (agent_id, state): count dict
        self.depth = 60
        self.simulation_time = 10
Example #8
    def __init__(self, agent_id, evaluation, depth=2):
        Agents.__init__(self, agent_id, evaluation)
        self.max_depth = depth
Example #9
def main(mode=None, resume_year=None, endyear=None, ReEDS_inputs=None):
    """
    Compute the economic adoption of distributed generation resources on an agent-level basis.

    Model output is saved to a `/runs` file within the dGen directory.
    """
    try:
        # =====================================================================
        # SET UP THE MODEL TO RUN
        # =====================================================================
        # initialize Model Settings object
        # (this controls settings that apply to all scenarios to be executed)
        model_settings = settings.init_model_settings()
        prerun_test.check_dependencies()

        # make output directory
        # create the logger and stamp with git hash
        logger = utilfunc.get_logger(
            os.path.join(model_settings.out_dir, 'dg_model.log'))
        logger.info("Model version is git commit {:}".format(
            model_settings.git_hash))

        # =====================================================================
        # LOOP OVER SCENARIOS
        # =====================================================================

        out_subfolders = {'solar': []}

        for i, scenario_file in enumerate(model_settings.input_scenarios):
            logger.info('============================================')
            logger.info('============================================')
            logger.info("Running Scenario {i} of {n}".format(
                i=i + 1, n=len(model_settings.input_scenarios)))

            # initialize ScenarioSettings object
            # (this controls settings that apply only to this specific scenario)
            scenario_settings = settings.init_scenario_settings(
                scenario_file, model_settings)

            # log summary of high-level scenario settings
            logger.info('Scenario Settings:')
            logger.info('\tScenario Name: %s' %
                        scenario_settings.scenario_name)

            logger.info('\tSectors: %s' %
                        list(scenario_settings.sector_data.keys()))
            logger.info('\tTechnologies: %s' % scenario_settings.techs)
            logger.info(
                '\tYears: %s - %s' %
                (scenario_settings.start_year, scenario_settings.end_year))

            logger.info('Results Path: %s' % (scenario_settings.out_scen_path))

            #==========================================================================================================
            # CREATE AGENTS
            #==========================================================================================================
            logger.info("-------------- Agent Preparation ---------------")

            if scenario_settings.generate_agents:
                logger.info('\tCreating Agents')
                solar_agents = Agents(
                    agent_mutation.init_solar_agents(scenario_settings))
                logger.info('....{} agents in input csv'.format(
                    len(solar_agents)))

                # Write base agents to disk
                solar_agents.df.to_pickle(scenario_settings.out_scen_path +
                                          '/agent_df_base.pkl')
            else:
                logger.info('Loading %s' % scenario_settings.agents_file_name)
                with open(scenario_settings.agents_file_name, "rb") as f:
                    solar_agents = Agents(pickle.load(f))

            # Get set of columns that define agent's immutable attributes
            cols_base = list(solar_agents.df.columns.values)

            #==============================================================================
            # TECHNOLOGY DEPLOYMENT
            #==============================================================================
            logger.info("-------------- Yearly Analysis ---------------")
            complete_df = pd.DataFrame()
            if scenario_settings.techs == ['solar']:
                solar_agents.df['tech'] = 'solar'

                for i, year in enumerate(scenario_settings.model_years):

                    is_first_year = year == model_settings.start_year

                    logger.info('\tWorking on %s' % year)

                    # determine any non-base columns and drop them
                    cols = list(solar_agents.df.columns.values)
                    cols_to_drop = [x for x in cols if x not in cols_base]
                    if len(cols_to_drop) != 0:
                        solar_agents.df.drop(cols_to_drop,
                                             axis=1,
                                             inplace=True)

                    # copy the core agent object and set their year
                    solar_agents.df['year'] = year

                    # get and apply load growth
                    load_growth_yearly = scenario_settings.get_load_growth(
                        year)

                    solar_agents.on_frame(
                        agent_mutation.elec.apply_load_growth,
                        (load_growth_yearly))

                    # Normalize the hourly load profile to updated total load which includes load growth multiplier
                    solar_agents.on_frame(agent_mutation.elec.
                                          apply_scale_normalized_load_profiles)

                    # Get and apply net metering parameters
                    net_metering_yearly = scenario_settings.get_nem_settings(
                        year)
                    solar_agents.on_frame(
                        agent_mutation.elec.apply_export_tariff_params,
                        (net_metering_yearly))

                    # Apply each agent's electricity price change and assumption about increases
                    solar_agents.on_frame(
                        agent_mutation.elec.
                        apply_elec_price_multiplier_and_escalator,
                        [year, scenario_settings.get_rate_escalations()])

                    # Apply PV Specs
                    solar_agents.on_frame(agent_mutation.elec.apply_pv_specs,
                                          scenario_settings.get_pv_specs())
                    solar_agents.on_frame(
                        agent_mutation.elec.apply_storage_specs, [
                            scenario_settings.get_batt_price_trajectories(),
                            year, scenario_settings
                        ])

                    # Apply financial terms
                    solar_agents.on_frame(
                        agent_mutation.elec.apply_financial_params, [
                            scenario_settings.get_financing_terms(),
                            scenario_settings.
                            financial_options['annual_inflation_pct']
                        ])

                    # Apply wholesale electricity prices
                    solar_agents.on_frame(
                        agent_mutation.elec.apply_wholesale_elec_prices,
                        scenario_settings.get_wholesale_elec_prices())

                    # Size S+S system and calculate electric bills
                    if 'ix' not in os.name:
                        cores = None
                    else:
                        cores = model_settings.local_cores

                    solar_agents.on_row(
                        fFuncs.calc_system_size_and_financial_performance,
                        cores=cores)

                    solar_agents.df['agent_id'] = solar_agents.df.index.values

                    # Calculate the financial performance of the S+S systems
                    solar_agents.on_frame(
                        financial_functions.calc_financial_performance)

                    # Calculate Maximum Market Share
                    solar_agents.on_frame(
                        financial_functions.calc_max_market_share,
                        scenario_settings.get_max_market_share())

                    # determine "developable" population
                    solar_agents.on_frame(
                        agent_mutation.elec.
                        calculate_developable_customers_and_load)

                    # Apply market_last_year
                    if is_first_year:
                        solar_agents.on_frame(
                            agent_mutation.elec.estimate_initial_market_shares)
                        market_last_year_df = None
                    else:
                        solar_agents.on_frame(
                            agent_mutation.elec.apply_market_last_year,
                            market_last_year_df)

                    # Calculate diffusion based on economics and bass diffusion
                    solar_agents.df, market_last_year_df = diffusion_functions.calc_diffusion_solar(
                        solar_agents.df, is_first_year,
                        scenario_settings.get_bass_params())

                    # Estimate total generation
                    solar_agents.on_frame(
                        agent_mutation.elec.estimate_total_generation)

                    # Aggregate results
                    scenario_settings.output_batt_dispatch_profiles = False
                    if is_first_year:
                        interyear_results_aggregations = agent_mutation.elec.aggregate_outputs_solar(
                            solar_agents.df, year, is_first_year,
                            scenario_settings)
                    else:
                        interyear_results_aggregations = agent_mutation.elec.aggregate_outputs_solar(
                            solar_agents.df, year, is_first_year,
                            scenario_settings, interyear_results_aggregations)

                    # --- Check to ensure that agent_df isn't growing (i.e. merges are failing silently) ---
                    df_print = solar_agents.df.copy()
                    df_print = df_print.loc[df_print['year'] == year]
                    df_print = df_print.groupby(['sector_abbr'
                                                 ])['pv_kw_cum'].sum()

                    #==========================================================================================================
                    # WRITE AGENT DF AS PICKLES FOR POST-PROCESSING
                    #==========================================================================================================

                    # Write Outputs to the database
                    drop_fields = [
                        'consumption_hourly_initial', 'bill_savings',
                        'consumption_hourly', 'solar_cf_profile',
                        'tariff_dict', 'deprec_sch', 'batt_dispatch_profile'
                    ]  # dropped because these are arrays or json
                    df_write = solar_agents.df.drop(drop_fields, axis=1)

                    write_annual = False
                    if write_annual:
                        df_write.to_pickle(scenario_settings.out_scen_path +
                                           '/agent_df_%s.pkl' % year)

                    if i == 0:
                        complete_df = df_write
                    else:
                        complete_df = pd.concat([complete_df, df_write],
                                                sort=False)

            #==============================================================================
            #    Outputs & Visualization
            #==============================================================================
            logger.info("---------Saving Model Results---------")

            complete_df.to_csv(scenario_settings.out_scen_path +
                               '/agent_outputs.csv')

            logger.info("-------------Model Run Complete-------------")
            logger.info('Completed in: %.1f seconds' %
                        (time.time() - model_settings.model_init))

    except Exception as e:
        if 'logger' in locals():
            logger.error(e.__str__(), exc_info=True)
            logger.info('Error on line {}'.format(
                sys.exc_info()[-1].tb_lineno))
            logger.info('Type of error {}'.format(type(e)))
            logger.info('Error Text: {}'.format(e))
        if 'logger' not in locals():
            raise
    finally:
        if 'logger' in locals():
            utilfunc.shutdown_log(logger)
            utilfunc.code_profiler(model_settings.out_dir)
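Since main() takes only optional arguments, a run with default settings reduces to the usual entry-point sketch below (assumed, not shown in the source).

if __name__ == '__main__':
    main()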
Example #10
    def decode_msg(self, lines):
        '''
        decode message for reinforcement learning

        show:0
        team_l:0
        team_r:0
        ball_x:0.000000, ball_y:0.000000, ball_vx:0.000000, ball_vy:0.000000
        side: 1, num:1, x:-49.000000, y:0.000000, vel_x:0.000000, vel_y:0.000000, kick_count:0,
            stamina, staminaCapacity
        side: 1, num:2, x:-25.000000, y:-5.000000, vel_x:0.000000, vel_y:0.000000, kick_count:0
        side: 1, num:3, x:-25.000000, y:5.000000, vel_x:0.000000, vel_y:0.000000, kick_count:0
        side: 1, num:4, x:-25.000000, y:-10.000000, vel_x:0.000000, vel_y:0.000000, kick_count:0
        side: 1, num:5, x:-25.000000, y:10.000000, vel_x:0.000000, vel_y:0.000000, kick_count:0
        side: 1, num:6, x:-25.000000, y:0.000000, vel_x:0.000000, vel_y:0.000000, kick_count:0
        side: 1, num:7, x:-15.000000, y:-5.000000, vel_x:0.000000, vel_y:0.000000, kick_count:0
        side: 1, num:8, x:-15.000000, y:5.000000, vel_x:0.000000, vel_y:0.000000, kick_count:0
        side: 1, num:9, x:-15.000000, y:-10.000000, vel_x:0.000000, vel_y:0.000000, kick_count:0
        side: 1, num:10, x:-15.000000, y:10.000000, vel_x:0.000000, vel_y:0.000000, kick_count:0
        side: 1, num:11, x:-15.000000, y:0.000000, vel_x:0.000000, vel_y:0.000000, kick_count:0
        side: -1, num:2, x:25.000000, y:5.000000, vel_x:0.000000, vel_y:0.000000, kick_count:0
        side: -1, num:3, x:25.000000, y:-5.000000, vel_x:0.000000, vel_y:0.000000, kick_count:0
        side: -1, num:4, x:25.000000, y:10.000000, vel_x:0.000000, vel_y:0.000000, kick_count:0
        side: -1, num:5, x:25.000000, y:-10.000000, vel_x:0.000000, vel_y:0.000000, kick_count:0
        side: -1, num:6, x:25.000000, y:-0.000000, vel_x:0.000000, vel_y:0.000000, kick_count:0
        side: -1, num:7, x:15.000000, y:5.000000, vel_x:0.000000, vel_y:0.000000, kick_count:0
        side: -1, num:8, x:15.000000, y:-5.000000, vel_x:0.000000, vel_y:0.000000, kick_count:0
        side: -1, num:9, x:15.000000, y:10.000000, vel_x:0.000000, vel_y:0.000000, kick_count:0
        side: -1, num:10, x:15.000000, y:-10.000000, vel_x:0.000000, vel_y:0.000000, kick_count:0
        side: -1, num:11, x:15.000000, y:-0.000000, vel_x:0.000000, vel_y:0.000000, kick_count:0
        '''
        try:
            self.show = int(lines[0].split(':')[-1])
            self.scores = np.array(
                [int(lines[1].split(':')[-1]),
                 int(lines[2].split(':')[-1])])
        except ValueError:
            print('score and show error')
        self.ball = np.array(
            [float(val.split(':')[-1]) for val in lines[3].split(',')])
        try:
            players = []
            for i in range(4, len(lines)):
                player = [
                    float(val.split(':')[-1]) for val in lines[i].split(',')
                ]
                players.append(player)

            self.players = np.vstack(players)
        except ValueError:
            pass
            # print('line error')

        if len(self.agents) < 1:
            self.agents = [
                Agents(self.players, self.ball, i) for i in range(11)
            ]
            self.move_message = self.format_move_message(self.players[:11,
                                                                      2:4])
            # print(self.move_message)

        onball = np.argwhere(self.players[:, -3] - self.kickcount > 0)
        # onball = np.argwhere(np.array([caldist(self.ball, self.players[i,2:4]) for i in range(22)]))
        if len(onball) > 0:
            # self.actioned = False
            for i in range(11):
                self.agents[i].onball = int(onball[0])
            self.onball = int(onball[0])

        self.kickcount = self.players[:, -3]
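As an aside, the 'key:value, key:value' parsing idiom used above can be shown in isolation; the sample line below is taken from the message format documented in the docstring.

import numpy as np

line = 'ball_x:0.000000, ball_y:0.000000, ball_vx:0.000000, ball_vy:0.000000'
ball = np.array([float(field.split(':')[-1]) for field in line.split(',')])
print(ball)  # [0. 0. 0. 0.]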
Example #11
class Runner:
    def __init__(self, env, args, itr, seed):
        # Set the random seed
        if seed is not None:
            self.setup_seed(seed)
        self.args = args

        # Get the environment
        self.env = env
        # Process (instance) index
        self.pid = itr

        self.replay_buffer = ReplayBuffer(self.args)

        self.win_rates = []
        '''
        Here, episode_reward is the cumulative reward of a single episode,
        episodes_reward collects the cumulative rewards of several episodes, and
        episodes_rewards collects those episode rewards across multiple evaluations.
        '''
        self.episodes_rewards = []
        self.evaluate_itr = []

        self.max_win_rate = 0
        self.time_steps = 0

        # Where results and models are saved; the counter (itr) makes it easy to run several instances at once
        alg_dir = self.args.alg + '_' + str(self.args.epsilon_anneal_steps // 10000) + 'w' + '_' + \
                  str(self.args.target_update_period)
        self.alg_tag = '_' + self.args.optim

        if self.args.her:
            self.alg_tag += str(self.args.her)
            alg_dir += '_her=' + str(self.args.her)

        # self.save_path = self.args.result_dir + '/' + alg_dir + '/' + self.args.map + '/' + itr
        self.save_path = self.args.result_dir + '/' + self.args.map + '/' + alg_dir + '/' + str(itr)
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)

        self.args.model_dir = args.model_dir + '/' + args.map + '/' + alg_dir + '/' + str(itr)

        self.agents = Agents(args, itr=itr)
        print('step runner initialized')
        if self.args.her:
            print('using HER')

    @staticmethod
    def setup_seed(seed):
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        np.random.seed(seed)
        random.seed(seed)
        torch.backends.cudnn.deterministic = True

    def generate_episode(self, episode_num, evaluate=False):
        # Prepare to save the evaluation replay
        if self.args.replay_dir != '' and evaluate and episode_num == 0:
            self.env.close()
        # Initialize variables; with HER the goal has to be recorded
        self.env.reset()
        done = False
        info = None
        win = False

        last_action = np.zeros((self.args.n_agents, self.args.n_actions))
        # epsilon decay
        epsilon = 0 if evaluate else self.args.epsilon
        # how epsilon is annealed
        if self.args.epsilon_anneal_scale == 'episode' or \
                (self.args.epsilon_anneal_scale == 'itr' and episode_num == 0):
            epsilon = epsilon - self.args.epsilon_decay if epsilon > self.args.min_epsilon else epsilon

        # Record the information of one episode
        episode_buffer = None
        if not evaluate:
            episode_buffer = {
                'o':
                np.zeros([
                    self.args.episode_limit, self.args.n_agents,
                    self.args.obs_shape
                ]),
                's':
                np.zeros([self.args.episode_limit, self.args.state_shape]),
                'a':
                np.zeros([self.args.episode_limit, self.args.n_agents, 1]),
                'onehot_a':
                np.zeros([
                    self.args.episode_limit, self.args.n_agents,
                    self.args.n_actions
                ]),
                'avail_a':
                np.zeros([
                    self.args.episode_limit, self.args.n_agents,
                    self.args.n_actions
                ]),
                'r':
                np.zeros([self.args.episode_limit, 1]),
                'next_o':
                np.zeros([
                    self.args.episode_limit, self.args.n_agents,
                    self.args.obs_shape
                ]),
                'next_s':
                np.zeros([self.args.episode_limit, self.args.state_shape]),
                'next_avail_a':
                np.zeros([
                    self.args.episode_limit, self.args.n_agents,
                    self.args.n_actions
                ]),
                'done':
                np.ones([self.args.episode_limit, 1]),
                'padded':
                np.ones([self.args.episode_limit, 1])
            }
        # Start running an episode
        states, former_states = [], []
        obs = self.env.get_obs()
        if self.args.her:
            obs = np.concatenate((obs, self.env.goal), axis=1)
        state = self.env.get_state()
        if self.args.her:
            states.append(self.env.state)
            former_states.append(self.env.former_states)
        avail_actions = []
        self.agents.policy.init_hidden(1)
        for agent_id in range(self.args.n_agents):
            avail_action = self.env.get_avail_agent_actions(agent_id)
            avail_actions.append(avail_action)

        episode_reward = 0
        for step in range(self.args.episode_limit):
            if done:
                break
            else:
                actions, onehot_actions = [], []
                for agent_id in range(self.args.n_agents):
                    # avail_action = self.env.get_avail_agent_actions(agent_id)
                    action, _ = self.agents.choose_action(
                        obs[agent_id], last_action[agent_id], agent_id,
                        avail_actions[agent_id], epsilon, evaluate)
                    # One-hot encoding of the chosen action
                    onehot_action = np.zeros(self.args.n_actions)
                    onehot_action[action] = 1
                    onehot_actions.append(onehot_action)
                    # Append to the joint action
                    actions.append(action)
                    # avail_actions.append(avail_action)
                    # Record this action
                    last_action[agent_id] = onehot_action
                # Execute the joint action in the environment
                reward, done, info = self.env.step(actions)
                # Count the time step
                if not evaluate:
                    self.time_steps += 1
                # Get the information after the step
                if not done:
                    next_obs = self.env.get_obs()
                    if self.args.her:
                        next_obs = np.concatenate((next_obs, self.env.goal),
                                                  axis=1)
                    next_state = self.env.get_state()
                    if self.args.her:
                        states.append(self.env.state)
                        former_states.append(self.env.former_states)
                else:
                    next_obs = obs
                    next_state = state
                # Collect the available actions
                next_avail_actions = []
                for agent_id in range(self.args.n_agents):
                    avail_action = self.env.get_avail_agent_actions(agent_id)
                    next_avail_actions.append(avail_action)
                # Store the experience
                if not evaluate:
                    episode_buffer['o'][step] = obs
                    episode_buffer['s'][step] = state
                    episode_buffer['a'][step] = np.reshape(
                        actions, [self.args.n_agents, 1])
                    episode_buffer['onehot_a'][step] = onehot_actions
                    episode_buffer['avail_a'][step] = avail_actions
                    episode_buffer['r'][step] = [reward]
                    episode_buffer['next_o'][step] = next_obs
                    episode_buffer['next_s'][step] = next_state
                    episode_buffer['next_avail_a'][step] = next_avail_actions
                    episode_buffer['done'][step] = [done]
                    episode_buffer['padded'][step] = [0.]

                # Update variables
                episode_reward += reward
                obs = next_obs
                state = next_state
                avail_actions = next_avail_actions
                if self.args.epsilon_anneal_scale == 'step':
                    epsilon = epsilon - self.args.epsilon_decay if epsilon > self.args.min_epsilon else epsilon

        # During training, record the new epsilon
        if not evaluate:
            self.args.epsilon = epsilon
        # Get the battle information
        if info is not None and 'battle_won' in info:
            win = True if done and info['battle_won'] else False
        if evaluate and episode_num == self.args.evaluate_num - 1 and self.args.replay_dir != '':
            self.env.save_replay()
            self.env.close()
        if not evaluate and self.args.her:
            return episode_buffer, states, former_states
        return episode_buffer, episode_reward, win

    def run(self):
        train_steps = 0
        early_stop = 10
        num_eval = 0
        self.max_win_rate = 0
        self.time_steps = 0
        last_test_step = 0
        begin_time = None
        begin_step = None

        # for itr in range(self.args.n_itr):
        while self.time_steps < self.args.max_steps:
            if begin_step is None:
                begin_time = datetime.utcnow().astimezone(
                    timezone(timedelta(hours=8)))
                begin_step = self.time_steps
            # Collect data for n_episodes episodes
            if self.args.her:
                episode_batch, states, former_states = self.generate_episode(0)
                self.her_k(episode_batch, states, former_states)
            else:
                episode_batch, _, _ = self.generate_episode(0)
            for key in episode_batch.keys():
                episode_batch[key] = np.array([episode_batch[key]])
            for e in range(1, self.args.n_episodes):
                if self.args.her:
                    episode, states, former_states = self.generate_episode(e)
                    self.her_k(episode, states, former_states)
                else:
                    episode, _, _ = self.generate_episode(e)

                for key in episode_batch.keys():
                    episode[key] = np.array([episode[key]])
                    episode_batch[key] = np.concatenate(
                        (episode_batch[key], episode[key]), axis=0)

            # Add to the replay buffer
            self.replay_buffer.store(episode_batch)
            # Train. TODO 12.5
            if self.replay_buffer.size < self.args.batch_size * self.args.bs_rate:
                print('replay buffer is not yet batch_size * {} large!'.format(
                    self.args.bs_rate))
                begin_time = None
                begin_step = None
                continue
            for _ in range(self.args.train_steps):
                batch = self.replay_buffer.sample(self.args.batch_size)
                self.agents.train(batch, train_steps)
                train_steps += 1
            # Periodic evaluation
            # if itr % self.args.evaluation_period == 0:
            if (self.time_steps -
                    last_test_step) / self.args.evaluation_steps_period >= 1.0:
                num_eval += 1
                last_test_step = self.time_steps
                print(
                    f'Process {self.pid}: {self.time_steps} step / {self.args.max_steps} steps'
                )
                # print('power: {}'.format(self.agents.policy.power))
                win_rate, episodes_reward = self.evaluate()
                # Save the evaluation results
                self.evaluate_itr.append(self.time_steps)
                self.win_rates.append(win_rate)
                self.episodes_rewards.append(episodes_reward)
                # Models that perform well are saved separately
                if win_rate > self.max_win_rate:
                    self.max_win_rate = win_rate
                    self.agents.policy.save_model(str(win_rate))
                # Do not save every time, to reduce the time cost
                if num_eval % 50 == 0:
                    self.save_results()
                    self.plot()
                    # Record how long the last 50 evaluations took
                    now = datetime.utcnow().astimezone(
                        timezone(timedelta(hours=8)))
                    elapsed_time = now - begin_time
                    expected_remain_time = (elapsed_time / (self.time_steps - begin_step)) * \
                                           (self.args.max_steps - self.time_steps)
                    expected_end_time = now + expected_remain_time
                    print("预计还需: {}".format(str(expected_remain_time)))
                    print("预计结束时间为: {}".format(
                        expected_end_time.strftime("%Y-%m-%d_%H-%M-%S")))
        # Finally, save everything once more
        self.save_results()
        self.plot()
        self.env.close()

    def evaluate(self):
        """
        Get the average win rate and the cumulative reward of each evaluation episode,
        which makes it easy to draw shaded-error plots.
        :return:
        """
        win_number = 0
        episodes_reward = []
        for itr in range(self.args.evaluate_num):
            if self.args.didactic:
                episode_reward, win = self.get_eval_qtot()
            else:
                _, episode_reward, win = self.generate_episode(itr,
                                                               evaluate=True)
            episodes_reward.append(episode_reward)
            if win:
                win_number += 1
        return win_number / self.args.evaluate_num, episodes_reward

    def save_results(self):
        """
        Save the data so that results from different algorithms can later be plotted in one figure.
        :return:
        """
        # Delete any existing .npy files first
        for filename in os.listdir(self.save_path):
            if filename.endswith('.npy'):
                os.remove(self.save_path + '/' + filename)
        np.save(self.save_path + '/evaluate_itr.npy', self.evaluate_itr)
        if self.args.didactic and self.args.power is None and 'strapped' in self.args.alg:
            np.save(self.save_path + '/train_steps.npy',
                    self.agents.policy.train_steps)
            np.save(self.save_path + '/differences.npy',
                    self.agents.policy.differences)
        else:
            np.save(self.save_path + '/win_rates.npy', self.win_rates)
        np.save(self.save_path + '/episodes_rewards.npy',
                self.episodes_rewards)

    def plot(self):
        """
        Plot the results periodically.
        :return:
        """
        fig = plt.figure()
        ax1 = fig.add_subplot(211)
        if self.args.didactic and self.args.power is None and 'strapped' in self.args.alg:
            win_x = np.array(self.agents.policy.train_steps)[:,
                                                             None] / 1000000.
            win_y = np.array(self.agents.policy.differences)[:, None]
            plot_win = pd.DataFrame(np.concatenate((win_x, win_y), axis=1),
                                    columns=['T (mil)', self.args.which_diff])
            sns.lineplot(x="T (mil)",
                         y=self.args.which_diff,
                         data=plot_win,
                         ax=ax1)
        else:
            win_x = np.array(self.evaluate_itr)[:, None] / 1000000.
            win_y = np.array(self.win_rates)[:, None]
            plot_win = pd.DataFrame(np.concatenate((win_x, win_y), axis=1),
                                    columns=['T (mil)', 'Test Win'])
            sns.lineplot(x="T (mil)", y="Test Win", data=plot_win, ax=ax1)

        ax2 = fig.add_subplot(212)
        reward_x = np.repeat(self.evaluate_itr,
                             self.args.evaluate_num)[:, None] / 1000000.
        reward_y = np.array(self.episodes_rewards).flatten()[:, None]
        plot_reward = pd.DataFrame(np.concatenate((reward_x, reward_y),
                                                  axis=1),
                                   columns=['T (mil)', 'Median Test Returns'])
        sns.lineplot(x="T (mil)",
                     y="Median Test Returns",
                     data=plot_reward,
                     ax=ax2,
                     ci='sd',
                     estimator=np.median)
        plt.tight_layout()
        # Format the timestamp, e.g. 2016-03-20_11-45-39
        # tag = self.args.alg + '-' + time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())
        tag = self.args.alg + '_' + str(self.args.target_update_period)
        # if 'averaged' in self.args.alg:
        tag += (self.alg_tag + '_' + datetime.utcnow().astimezone(
            timezone(timedelta(hours=8))).strftime("%Y-%m-%d_%H-%M-%S"))
        # Delete any existing .png files first
        for filename in os.listdir(self.save_path):
            if filename.endswith('.png'):
                os.remove(self.save_path + '/' + filename)
        fig.savefig(self.save_path + "/%s.png" % tag)
        plt.close()

    def get_eval_qtot(self):
        """
        Compute the evaluated Q_tot.
        """
        self.env.reset()

        all_last_action = np.zeros((self.args.n_agents, self.args.n_actions))

        # Start running an episode
        all_obs = self.env.get_obs()
        state = self.env.get_state()
        avail_actions = []
        self.agents.policy.init_hidden(1)
        eval_qs = []
        actions = []
        one_hot_actions = []
        hidden_evals = None
        for agent_idx in range(self.args.n_agents):
            obs = all_obs[agent_idx]
            last_action = all_last_action[agent_idx]
            avail_action = self.env.get_avail_agent_actions(agent_idx)
            avail_actions.append(avail_action)

            onehot_agent_idx = np.zeros(self.args.n_agents)
            onehot_agent_idx[agent_idx] = 1.
            if self.args.last_action:
                # stack horizontally
                obs = np.hstack((obs, last_action))
            if self.args.reuse_network:
                obs = np.hstack((obs, onehot_agent_idx))
            hidden_state = self.agents.policy.eval_hidden[:, agent_idx, :]
            # add a batch dimension
            obs = torch.Tensor(obs).unsqueeze(0)
            # whether to use the GPU
            if self.args.cuda:
                obs = obs.cuda()
                hidden_state = hidden_state.cuda()
            # get Q(s, a)
            qsa, hidden_eval = self.agents.policy.eval_rnn(obs, hidden_state)
            qsa[avail_action == 0.0] = -float("inf")

            eval_qs.append(torch.max(qsa))

            action = torch.argmax(qsa)
            actions.append(action)

            onehot_action = np.zeros(self.args.n_actions)
            onehot_action[action] = 1
            one_hot_actions.append(onehot_action)
            if hidden_evals is None:
                hidden_evals = hidden_eval
            else:
                hidden_evals = torch.cat([hidden_evals, hidden_eval], dim=0)

        s = torch.Tensor(state)
        eval_qs = torch.Tensor(eval_qs).unsqueeze(0)
        actions = torch.Tensor(actions).unsqueeze(0)
        one_hot_actions = torch.Tensor(one_hot_actions).unsqueeze(0)
        hidden_evals = hidden_evals.unsqueeze(0)
        # whether to use the GPU
        if self.args.cuda:
            s = s.cuda()
            eval_qs = eval_qs.cuda()
            actions = actions.cuda()
            one_hot_actions = one_hot_actions.cuda()
            hidden_evals = hidden_evals.cuda()
        # compute Q_tot
        eval_q_total = None
        if self.args.alg == 'qatten':
            eval_q_total, _, _ = self.agents.policy.eval_mix_net(
                eval_qs, s, actions)
        elif self.args.alg == 'qmix' \
                or 'wqmix' in self.args.alg \
                or 'strapped' in self.args.alg:
            eval_q_total = self.agents.policy.eval_mix_net(eval_qs, s)
        elif 'dmaq' in self.args.alg:
            if self.args.alg == "dmaq_qatten":
                ans_chosen, _, _ = self.agents.policy.mixer(eval_qs,
                                                            s,
                                                            is_v=True)
                ans_adv, _, _ = self.agents.policy.mixer(
                    eval_qs,
                    s,
                    actions=one_hot_actions,
                    max_q_i=eval_qs,
                    is_v=False)
                eval_q_total = ans_chosen + ans_adv
            else:
                ans_chosen = self.agents.policy.mixer(eval_qs, s, is_v=True)
                ans_adv = self.agents.policy.mixer(eval_qs,
                                                   s,
                                                   actions=one_hot_actions,
                                                   max_q_i=eval_qs,
                                                   is_v=False)
                eval_q_total = ans_chosen + ans_adv
        elif self.args.alg == 'qtran_base':
            one_hot_actions = one_hot_actions.unsqueeze(0)
            hidden_evals = hidden_evals.unsqueeze(0)
            eval_q_total = self.agents.policy.eval_joint_q(
                s, hidden_evals, one_hot_actions)

        eval_q_total = eval_q_total.squeeze().item()
        return eval_q_total, 0

    def her_k(self, episode, states, former_states):
        import copy
        for _ in range(self.args.her):
            episode_buffer = {
                'o':
                np.zeros([
                    self.args.episode_limit, self.args.n_agents,
                    self.args.obs_shape
                ]),
                's':
                np.zeros([self.args.episode_limit, self.args.state_shape]),
                'a':
                np.zeros([self.args.episode_limit, self.args.n_agents, 1]),
                'onehot_a':
                np.zeros([
                    self.args.episode_limit, self.args.n_agents,
                    self.args.n_actions
                ]),
                'avail_a':
                np.zeros([
                    self.args.episode_limit, self.args.n_agents,
                    self.args.n_actions
                ]),
                'r':
                np.zeros([self.args.episode_limit, 1]),
                'next_o':
                np.zeros([
                    self.args.episode_limit, self.args.n_agents,
                    self.args.obs_shape
                ]),
                'next_s':
                np.zeros([self.args.episode_limit, self.args.state_shape]),
                'next_avail_a':
                np.zeros([
                    self.args.episode_limit, self.args.n_agents,
                    self.args.n_actions
                ]),
                'done':
                np.ones([self.args.episode_limit, 1]),
                'padded':
                np.ones([self.args.episode_limit, 1])
            }
            # Regenerate the goal, order and other episode information
            self.env.reset()
            # Rebuild the whole episode with the newly generated goals
            for i in range(len(episode)):
                reward = self.env.get_reward(states[i], former_states[i])
                done = episode['done'][i]
                if reward >= 0:
                    reward = 0
                    done = True
                episode_buffer['o'][i] = episode['o'][i]
                episode_buffer['o'][i, :, -2:] = np.array(self.env.goal)[:]
                episode_buffer['s'][i] = episode['s'][i]
                episode_buffer['a'][i] = episode['a'][i]
                episode_buffer['onehot_a'][i] = episode['onehot_a'][i]
                episode_buffer['avail_a'][i] = episode['avail_a'][i]
                episode_buffer['r'][i] = [reward]
                episode_buffer['next_o'][i] = episode['next_o'][i]
                episode_buffer['next_o'][i, :,
                                         -2:] = np.array(self.env.goal)[:]
                episode_buffer['next_s'][i] = episode['next_s'][i]
                episode_buffer['next_avail_a'][i] = episode['next_avail_a'][i]
                episode_buffer['done'][i] = [done]
                episode_buffer['padded'][i] = [0.]
                if done:
                    break
            for key in episode_buffer.keys():
                episode_buffer[key] = np.array([episode_buffer[key]])
            self.replay_buffer.store(episode_buffer)
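A hedged usage sketch for the Runner class above: env is assumed to be a SMAC-style environment and args an argument namespace providing the fields referenced in the code (alg, map, n_agents, max_steps and so on); both objects are placeholders supplied by the surrounding project.

runner = Runner(env, args, itr=0, seed=0)
runner.run()                                    # train until args.max_steps, evaluating periodically
win_rate, episodes_reward = runner.evaluate()   # one extra evaluation pass
print('final win rate:', win_rate)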
Example #12
class Trainer:
    def __init__(self, params):

        seed = params['general_params']['seed']
        self.__set_seed(seed=seed)

        env_params = params['env_params']
        env_params['seed'] = seed
        self.env = UnityEnv(params=env_params)

        agent_params = params['agent_params']

        self.__num_of_agents = self.env.observation_space.shape[0]
        state_size = self.env.observation_space.shape[1]
        action_size = self.env.action_space_size
        agent_params['num_of_agents'] = self.__num_of_agents
        agent_params['state_size'] = state_size
        agent_params['action_size'] = action_size
        self.agents = Agents(params=agent_params)

        trainer_params = params['trainer_params']
        self.learning_rate_decay = trainer_params['learning_rate_decay']
        self.results_path = trainer_params['results_path']
        self.model_path = trainer_params['model_path']
        self.t_max = trainer_params['t_max']

        self.exploration_noise = UOProcess()

        # data gathering variables
        self.avg_rewards = []
        self.scores = []
        self.score = 0

        self.sigma = 0.5

        print("MADDPG agent.")
        print("Configuration:")
        pprint(params)
        logging.info("Configuration: {}".format(params))

    def train(self, num_of_episodes):

        logging.info("Training:")
        reward_window = deque(maxlen=100)

        for episode_i in range(1, num_of_episodes):

            states = self.env.reset()
            self.agents.reset(self.sigma)
            scores = np.zeros(self.env.observation_space.shape[0])
            total_loss = 0

            self.sigma *= 0.99

            counter = 0
            for t in range(self.t_max):

                actions = self.agents.choose_action(states)
                next_states, rewards, dones, _ = self.env.step(actions)
                self.agents.step(states, actions, rewards, next_states, dones)
                states = next_states

                # DEBUG
                # logging.info("epsiode: {}, reward: {}, counter: {}, action: {}".
                #              format(episode_i, reward, counter, action))

                total_loss += self.agents.agent_loss
                scores += rewards
                counter += 1
                if any(dones):
                    break

            reward_window.append(np.max(scores))
            self.avg_rewards.append(np.mean(np.array(reward_window)))
            print(
                '\rEpisode {}\tCurrent Score: {:.4f}\tAverage Score: {:.4f} '
                '\t\tTotal loss: {:.2f}\tLearning rate (actor): {:.4f}\tLearning rate (critic): {:.4f}'
                .format(episode_i, np.max(scores), np.mean(reward_window),
                        total_loss, self.agents.learning_rate_actor,
                        self.agents.learning_rate_critic),
                end="")

            logging.info(
                'Episode {}\tCurrent Score: {:.4f}\tAverage Score (over episodes): {:.4f} '
                '\t\tTotal loss: {:.2f}\tLearning rate (actors): {:.4f}\tLearning rate (critic): {:.4f}'
                .format(episode_i, np.max(scores), np.mean(reward_window),
                        total_loss, self.agents.learning_rate_actor,
                        self.agents.learning_rate_critic))

            self.agents.learning_rate_actor *= self.learning_rate_decay
            self.agents.learning_rate_critic *= self.learning_rate_decay
            self.agents.set_learning_rate(self.agents.learning_rate_actor,
                                          self.agents.learning_rate_critic)

            if episode_i % 100 == 0:

                avg_reward = np.mean(np.array(reward_window))
                print("\rEpisode: {}\tAverage total reward: {:.2f}".format(
                    episode_i, avg_reward))

                if avg_reward >= 1.0:
                    print(
                        '\nEnvironment solved in {:d} episodes!\tAverage Score: {:.2f}'
                        .format(episode_i - 100, avg_reward))
                    if not os.path.exists(self.model_path):
                        os.makedirs(self.model_path)

                    t = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M')
                    torch.save(
                        self.agents.get_actor()[0].state_dict(),
                        self.model_path + 'checkpoint_actor1_{}.pth'.format(t))
                    torch.save(
                        self.agents.get_actor()[1].state_dict(),
                        self.model_path + 'checkpoint_actor2_{}.pth'.format(t))
                    torch.save(
                        self.agents.get_critic().state_dict(),
                        self.model_path + 'checkpoint_critic_{}.pth'.format(t))
                    np.array(self.avg_rewards).dump(
                        self.results_path + 'average_rewards_{}.dat'.format(t))

        t = datetime.datetime.today().strftime('%Y-%m-%d_%H-%M')
        # reward_matrix.dump(self.results_path + 'reward_matrix_new_{}.dat'.format(t))
        np.array(self.avg_rewards).dump(self.results_path +
                                        'average_rewards_{}.dat'.format(t))

    def test(self,
             checkpoint_actor1_filename,
             checkpoint_actor2_filename,
             checkpoint_critic_filename,
             time_span=10):
        checkpoint_actor1_path = self.model_path + checkpoint_actor1_filename
        checkpoint_actor2_path = self.model_path + checkpoint_actor2_filename
        checkpoint_critic_path = self.model_path + checkpoint_critic_filename
        self.agents.get_actor()[0].load_state_dict(
            torch.load(checkpoint_actor1_path))
        self.agents.get_actor()[1].load_state_dict(
            torch.load(checkpoint_actor2_path))
        self.agents.get_critic().load_state_dict(
            torch.load(checkpoint_critic_path))
        for t in range(time_span):
            state = self.env.reset(train_mode=False)
            self.score = 0
            #done = False

            while True:
                action = self.agents.choose_action(state, 'test')
                state, reward, done, _ = self.env.step(action)
                self.score += np.array(np.max(reward))
                if any(done):
                    break

            print('\nFinal score:', self.score)

        self.env.close()

    @staticmethod
    def __set_seed(seed):
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        random.seed(seed)
        np.random.seed(seed)
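A hedged usage sketch for the Trainer class above: the params layout is inferred from the constructor, and the env_params / agent_params contents are placeholders that depend on the UnityEnv and Agents implementations.

params = {
    'general_params': {'seed': 0},
    'env_params': {'file_name': './Tennis_Linux/Tennis.x86_64'},  # placeholder; UnityEnv-specific keys
    'agent_params': {},  # num_of_agents / state_size / action_size are filled in by Trainer
    'trainer_params': {
        'learning_rate_decay': 0.995,
        'results_path': './results/',
        'model_path': './models/',
        't_max': 1000,
    },
}
trainer = Trainer(params)
trainer.train(num_of_episodes=2000)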
Example #13
class Runner:
    def __init__(self, env, args, itr):
        # Get arguments
        # self.args = get_common_args()
        self.args = args

        # Get the environment
        self.env = env
        # Process (instance) index
        self.pid = itr

        self.agents = Agents(args, itr=itr)
        # If the network is not reused there are multiple agents; sharing parameters during training means a single network
        # if not self.args.reuse_network:
        #     self.agents = []
        #     for i in range(self.args.n_agents):
        #         self.agents.append(Agents(self.args, i))

        # self.rollout = RollOut(self.agents, self.args)

        self.replay_buffer = ReplayBuffer(self.args)

        self.win_rates = []
        '''
        Here, episode_reward is the cumulative reward of one episode,
        episodes_reward is the cumulative rewards of several episodes,
        and episodes_rewards is the cumulative rewards of several episodes over multiple evaluations
        '''
        self.episodes_rewards = []
        self.evaluate_itr = []

        self.max_win_rate = 0

        # Where to save results and models; the run index makes it easy to run several instances at once
        self.save_path = self.args.result_dir + '/' + self.args.alg + '/' + self.args.map + '/' + str(
            itr)
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
        print('runner initialized')

    def generate_episode(self, episode_num, evaluate=False):
        # Prepare for saving the evaluation replay
        if self.args.replay_dir != '' and evaluate and episode_num == 0:
            self.env.close()
        # Initialize variables
        self.env.reset()
        done = False
        info = None
        win = False

        last_action = np.zeros((self.args.n_agents, self.args.n_actions))
        # epsilon annealing
        epsilon = 0 if evaluate else self.args.epsilon
        # how epsilon is annealed
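        # (epsilon_anneal_scale selects when epsilon is decayed: at the start of
        # every episode ('episode'), only at the first episode of an iteration
        # ('itr'), or after every environment step ('step', handled inside the
        # rollout loop below))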
        if self.args.epsilon_anneal_scale == 'episode' or \
                (self.args.epsilon_anneal_scale == 'itr' and episode_num == 0):
            epsilon = epsilon - self.args.epsilon_decay if epsilon > self.args.min_epsilon else epsilon

        # Record the information of one episode
        episode_buffer = None
        if not evaluate:
            episode_buffer = {
                'o':
                np.zeros([
                    self.args.episode_limit, self.args.n_agents,
                    self.args.obs_shape
                ]),
                's':
                np.zeros([self.args.episode_limit, self.args.state_shape]),
                'a':
                np.zeros([self.args.episode_limit, self.args.n_agents, 1]),
                'onehot_a':
                np.zeros([
                    self.args.episode_limit, self.args.n_agents,
                    self.args.n_actions
                ]),
                'avail_a':
                np.zeros([
                    self.args.episode_limit, self.args.n_agents,
                    self.args.n_actions
                ]),
                'r':
                np.zeros([self.args.episode_limit, 1]),
                'next_o':
                np.zeros([
                    self.args.episode_limit, self.args.n_agents,
                    self.args.obs_shape
                ]),
                'next_s':
                np.zeros([self.args.episode_limit, self.args.state_shape]),
                'next_avail_a':
                np.zeros([
                    self.args.episode_limit, self.args.n_agents,
                    self.args.n_actions
                ]),
                'done':
                np.ones([self.args.episode_limit, 1]),
                'padded':
                np.ones([self.args.episode_limit, 1])
            }
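        # Note: 'done' and 'padded' are initialized to 1 so that timesteps
        # beyond the actual episode length count as padding; steps that are
        # actually executed overwrite these defaults below.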
        # Start rolling out one episode
        obs = self.env.get_obs()
        state = self.env.get_state()
        avail_actions = []
        self.agents.policy.init_hidden(1)
        for agent_id in range(self.args.n_agents):
            avail_action = self.env.get_avail_agent_actions(agent_id)
            avail_actions.append(avail_action)

        episode_reward = 0
        for step in range(self.args.episode_limit):
            if done:
                break
            else:
                actions, onehot_actions = [], []
                for agent_id in range(self.args.n_agents):
                    # avail_action = self.env.get_avail_agent_actions(agent_id)
                    action = self.agents.choose_action(obs[agent_id],
                                                       last_action[agent_id],
                                                       agent_id,
                                                       avail_actions[agent_id],
                                                       epsilon, evaluate)
                    # One-hot encode the chosen action
                    onehot_action = np.zeros(self.args.n_actions)
                    onehot_action[action] = 1
                    onehot_actions.append(onehot_action)
                    # Add to the joint action
                    actions.append(action)
                    # avail_actions.append(avail_action)
                    # Record this action
                    last_action[agent_id] = onehot_action
                # Execute the joint action in the environment
                reward, done, info = self.env.step(actions)
                # Get the information after the transition
                if not done:
                    next_obs = self.env.get_obs()
                    next_state = self.env.get_state()
                else:
                    next_obs = obs
                    next_state = state
                # Collect the next available actions
                next_avail_actions = []
                for agent_id in range(self.args.n_agents):
                    avail_action = self.env.get_avail_agent_actions(agent_id)
                    next_avail_actions.append(avail_action)
                # Store the transition
                if not evaluate:
                    episode_buffer['o'][step] = obs
                    episode_buffer['s'][step] = state
                    episode_buffer['a'][step] = np.reshape(
                        actions, [self.args.n_agents, 1])
                    episode_buffer['onehot_a'][step] = onehot_actions
                    episode_buffer['avail_a'][step] = avail_actions
                    episode_buffer['r'][step] = [reward]
                    episode_buffer['next_o'][step] = next_obs
                    episode_buffer['next_s'][step] = next_state
                    episode_buffer['next_avail_a'][step] = next_avail_actions
                    episode_buffer['done'][step] = [done]
                    episode_buffer['padded'][step] = [0.]

                # Update variables
                episode_reward += reward
                obs = next_obs
                state = next_state
                avail_actions = next_avail_actions
                if self.args.epsilon_anneal_scale == 'step':
                    epsilon = epsilon - self.args.epsilon_decay if epsilon > self.args.min_epsilon else epsilon

        # If training, keep the updated epsilon
        if not evaluate:
            self.args.epsilon = epsilon
        # Get the battle outcome
        if info is not None and 'battle_won' in info:
            win = True if done and info['battle_won'] else False
        if evaluate and episode_num == self.args.evaluate_num - 1 and self.args.replay_dir != '':
            self.env.save_replay()
            self.env.close()
        return episode_buffer, episode_reward, win

    def run(self):
        train_steps = 0
        early_stop = 10
        num_eval = 0
        self.max_win_rate = 0

        for itr in range(self.args.n_itr):
            # Collect data for n_episodes episodes
            episode_batch, _, _ = self.generate_episode(0)
            for key in episode_batch.keys():
                episode_batch[key] = np.array([episode_batch[key]])
            for e in range(1, self.args.n_episodes):
                episode, _, _ = self.generate_episode(e)
                for key in episode_batch.keys():
                    episode[key] = np.array([episode[key]])
                    episode_batch[key] = np.concatenate(
                        (episode_batch[key], episode[key]), axis=0)

            # Add to the replay buffer
            self.replay_buffer.store(episode_batch)
            # Train only once the replay buffer holds enough episodes
            if self.replay_buffer.size < self.args.batch_size * 12.5:
                # print('replay buffer is not yet large enough')
                continue
            for _ in range(self.args.train_steps):
                batch = self.replay_buffer.sample(self.args.batch_size)
                self.agents.train(batch, train_steps)
                # if self.args.reuse_network:
                #     self.agents.train(batch, train_steps)
                # else:
                #     for i in range(self.args.n_agents):
                #         self.agents[i].train(batch, train_steps)
                train_steps += 1
            # Periodic evaluation
            if itr % self.args.evaluation_period == 0:
                num_eval += 1
                print(f'process {self.pid}: {itr} / {self.args.n_itr}')
                win_rate, episodes_reward = self.evaluate()
                # Save the evaluation results
                self.evaluate_itr.append(itr)
                self.win_rates.append(win_rate)
                self.episodes_rewards.append(episodes_reward)
                # Additionally save models that perform well
                if win_rate > self.max_win_rate:
                    self.max_win_rate = win_rate
                    self.agents.policy.save_model(str(win_rate))
                # Do not save every time, to reduce the time spent
                if num_eval % 50 == 0:
                    self.save_results()
                    self.plot()
        # Finally save everything
        self.save_results()
        self.plot()
        self.env.close()

    def evaluate(self):
        """
        Get the average win rate and the per-episode cumulative rewards of the
        evaluation, which makes it easy to plot curves with error shading.
        :return:
        """
        win_number = 0
        episodes_reward = []
        for itr in range(self.args.evaluate_num):
            _, episode_reward, win = self.generate_episode(itr, evaluate=True)
            episodes_reward.append(episode_reward)
            if win:
                win_number += 1
        return win_number / self.args.evaluate_num, episodes_reward

    def save_results(self):
        """
        Save the data so that the results of several algorithms can later be
        plotted in one figure for comparison.
        :return:
        """
        # Delete any previously saved .npy result files
        for filename in os.listdir(self.save_path):
            if filename.endswith('.npy'):
                os.remove(self.save_path + '/' + filename)
        np.save(self.save_path + '/evaluate_itr.npy', self.evaluate_itr)
        np.save(self.save_path + '/win_rates.npy', self.win_rates)
        np.save(self.save_path + '/episodes_rewards.npy',
                self.episodes_rewards)

    def plot(self):
        """
        Plot the results periodically.
        :return:
        """
        fig = plt.figure()
        ax1 = fig.add_subplot(211)
        win_x = np.array(self.evaluate_itr)[:, None]
        win_y = np.array(self.win_rates)[:, None]
        plot_win = pd.DataFrame(np.concatenate((win_x, win_y), axis=1),
                                columns=['evaluate_itr', 'win_rates'])
        sns.lineplot(x="evaluate_itr", y="win_rates", data=plot_win, ax=ax1)

        ax2 = fig.add_subplot(212)
        reward_x = np.repeat(self.evaluate_itr, self.args.evaluate_num)[:,
                                                                        None]
        reward_y = np.array(self.episodes_rewards).flatten()[:, None]
        plot_reward = pd.DataFrame(
            np.concatenate((reward_x, reward_y), axis=1),
            columns=['evaluate_itr', 'episodes_rewards'])
        sns.lineplot(x="evaluate_itr",
                     y="episodes_rewards",
                     data=plot_reward,
                     ax=ax2,
                     ci=68,
                     estimator=np.median)

        # Format the timestamp as e.g. 2016-03-20_11-45-39
        tag = self.args.alg + '-' + time.strftime("%Y-%m-%d_%H-%M-%S",
                                                  time.localtime())
        # Delete any previously saved plot images
        for filename in os.listdir(self.save_path):
            if filename.endswith('.png'):
                os.remove(self.save_path + '/' + filename)
        fig.savefig(self.save_path + "/%s.png" % tag)
        plt.close()
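# A minimal usage sketch (an assumption, not part of the original example):
# the Runner above is normally driven by a launcher that builds a SMAC-style
# environment and an argument namespace, roughly like:
#
# from smac.env import StarCraft2Env  # assumed environment exposing get_obs()/get_state()
# env = StarCraft2Env(map_name=args.map)
# runner = Runner(env, args, itr=0)
# runner.run()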
Exemple #14
0
num_agents = len(env_info.agents)
print('Number of agents:', num_agents)

# size of each action
action_size = brain.vector_action_space_size
print('Size of each action:', action_size)

# examine the state space
states = env_info.vector_observations
state_size = states.shape[1]
print('There are {} agents. Each observes a state with length: {}'.format(
    states.shape[0], state_size))
print('The state for the first agent looks like:', states[0])

agents = Agents(num_agents=num_agents,
                state_size=state_size,
                action_size=action_size,
                random_seed=0)
scores = ddpg(env,
              brain_name,
              agents,
              n_episodes=n_episodes,
              eps_start=eps_start,
              eps_end=eps_end,
              eps_decay=eps_decay,
              resume=resume)

# plot the scores
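# (scores is assumed to hold per-episode, per-agent returns; the mean over the
# last axis below gives the average score across agents for each episode)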
plt.plot(np.arange(1, len(scores) + 1), np.mean(scores, axis=-1))
plt.ylabel('Score')
plt.xlabel('Episode #')
plt.show()
Exemple #15
0
def import_agent_file(scenario_settings, con, cur, engine, model_settings,
                      agent_file_status, input_name):
    """
    Generates new agents or uses pre-generated agents from provided .pkl file
    
    Parameters
    ----------
    **scenario_settings** : 'SQL schema'
        Schema of the scenario settings
    **con** : 'SQL connection'
        SQL connection to connect to database
    **cur** : 'SQL cursor'
        Cursor
    **engine** : 'SQL engine'
        SQL engine to interpret SQL queries
    **model_settings** : 'object'
        Model settings that apply to all scenarios
    **agent_file_status** : 'attribute'
        Attribute describing whether to use a pre-generated agent file or create a new one
    **input_name** : 'string'
        .pkl file name substring of the pre-generated agent table
    
    Returns
    -------
    **agents** : 'Class'
        Instance of the Agents class built from the user's pre-generated agent data

    """

    schema = scenario_settings.schema
    input_agent_dir = model_settings.input_agent_dir
    state_to_model = scenario_settings.state_to_model

    if agent_file_status == 'Use pre-generated Agents':

        userdefined_table_name = 'input_' + input_name + '_user_defined'
        scenario_userdefined_name = get_userdefined_scenario_settings(
            schema, userdefined_table_name, con)
        scenario_userdefined_value = scenario_userdefined_name['val'].values[0]

        agents_df = pd.read_pickle(
            os.path.join(input_agent_dir, scenario_userdefined_value + '.pkl'))
        agents_df = agents_df[agents_df['state_abbr'].isin(state_to_model)]

        if agents_df.empty:
            raise ValueError(
                'Region not present within pre-generated agent file - Edit Inputsheet'
            )

        # Convert dtypes of specific columns to floats
        cols = [
            'customers_in_bin_initial', 'load_kwh_per_customer_in_bin_initial',
            'load_kwh_in_bin_initial', 'max_demand_kw', 'avg_monthly_kwh',
            'cap_cost_multiplier', 'developable_roof_sqft',
            'pct_of_bldgs_developable'
        ]
        agents_df[cols] = agents_df[cols].astype(float)

        agents = Agents(agents_df)

        # Re-assign tariffs to agents with incompatible rate units
        agents.on_frame(agent_mutation.elec.reassign_agent_tariffs, con)

        # For wind: Ingest agent core attributes to database from pickle file
        if scenario_settings.techs in [['wind']]:
            ingest_agent_core_attributes(agents.df, con, cur, engine, schema,
                                         model_settings.role,
                                         scenario_settings.sectors,
                                         scenario_settings.techs)

    else:
        raise ValueError(
            'Generating agents is not supported at this time. Please select "Use pre-generated Agents" in the input sheet'
        )

    return agents
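# A minimal usage sketch (an assumption, not part of the original example):
# with scenario/model settings objects and an open database connection in
# scope, the loader could be called roughly as follows; the input_name value
# is a placeholder.
#
# agents = import_agent_file(scenario_settings, con, cur, engine,
#                            model_settings,
#                            agent_file_status='Use pre-generated Agents',
#                            input_name='<agent_table_substring>')
# print(agents.df.head())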