def env_load_fn(env_name):
  del env_name
  obs_spec = array_spec.BoundedArraySpec((2,), np.int32, -10, 10)
  action_spec = array_spec.BoundedArraySpec((2, 3), np.int32, -10, 10)
  return random_py_environment.RandomPyEnvironment(
      obs_spec, action_spec=action_spec, min_duration=2, max_duration=4)
def __init__(
    self,
    start_state,
    target_state,
    min_observation=MIN_STATE,
    max_observation=MAX_STATE,
    min_action=MIN_VELOCITY,
    max_action=MAX_VELOCITY,
    low_process_noise_var=LOW_PROCESS_NOISE_VAR,
    high_process_noise_var=HIGH_PROCESS_NOISE_VAR,
    gating_bitmap=None,
    velocity_init=0.0,
    delta_time=DELTA_TIME,
    min_acceleration=MIN_ACCELERATION,
    max_acceleration=MAX_ACCELERATION,
):
    # self.gravitational_acceleration = tf.constant(9.81, dtype=float_type)
    self.e3 = tf.constant([[0.0], [0.0], [1.0]], dtype=float_type)
    self.mass = tf.constant(1.0, dtype=float_type)
    self.inertia_matrix = tf.constant([[1.0]], dtype=float_type)
    self.inv_inertia_matrix = 1.0 / self.inertia_matrix

    # simulation parameters
    self.start_state = start_state
    self.target_state = target_state
    self.state_dim = 6
    self.control_dim = 2
    self.state_init = start_state
    self._state = self.state_init
    self.delta_time = delta_time

    # environment parameters
    if isinstance(low_process_noise_var, np.ndarray):
        self.low_process_noise_var = low_process_noise_var
    else:
        print("low_process_noise_var isn't an array, so broadcasting")
        self.low_process_noise_var = low_process_noise_var * np.ones(
            self.state_dim)
    if isinstance(high_process_noise_var, np.ndarray):
        self.high_process_noise_var = high_process_noise_var
    else:
        print("high_process_noise_var isn't an array, so broadcasting")
        self.high_process_noise_var = high_process_noise_var * np.ones(
            self.state_dim)

    # configure action spec (note: the min_action/max_action keyword
    # arguments are overridden by these hard-coded bounds)
    min_action = np.array([-10, -10])
    max_action = np.array([10, 10])
    self._action_spec = array_spec.BoundedArraySpec(
        shape=(self.control_dim,),
        dtype=float_type,
        minimum=min_action,
        maximum=max_action,
        name="action",
    )

    # configure observation spec (note: the min_observation/max_observation
    # keyword arguments are overridden by these hard-coded bounds)
    min_observation = np.array([-3.0, -3.0, -0.5, -0.5, -360, -5])
    max_observation = np.array([3.0, 3.0, 0.5, 0.5, 360, 5])
    self._observation_spec = array_spec.BoundedArraySpec(
        shape=(self.state_dim,),
        dtype=float_type,
        minimum=min_observation,
        maximum=max_observation,
        name="observation",
    )

    self.episode_ended = False

    if gating_bitmap is None:
        resolution = BITMAP_RESOLUTION
        self.gating_bitmap = np.ones([resolution, resolution])
    elif isinstance(gating_bitmap, str):
        self.gating_bitmap = cv2.imread(gating_bitmap, cv2.IMREAD_GRAYSCALE)
        self.gating_bitmap = self.gating_bitmap / 255
    elif isinstance(gating_bitmap, np.ndarray):
        self.gating_bitmap = gating_bitmap
    else:
        raise TypeError(
            "gating_bitmap must be np.ndarray or filepath string for bitmap")
    # TODO check x and y are the right way around
    self.num_pixels = np.array(
        [self.gating_bitmap.shape[1] - 1, self.gating_bitmap.shape[0] - 1])

    self.viewer = EnvRenderer(self)
def test_unbounded(self):
  obs_spec = array_spec.BoundedArraySpec((2, 3), np.int32, -10, 10)
  action_spec = array_spec.ArraySpec((2,), np.int32)
  with self.assertRaisesRegexp(ValueError, 'bounded action specs'):
    env = random_py_environment.RandomPyEnvironment(obs_spec, action_spec)
    env = wrappers.ActionOffsetWrapper(env)
def __init__(  # pylint: disable=W0231
    self,
    alphabet: str,
    starting_seq: str,
    model: flexs.Model,
    landscape: flexs.Landscape,
    max_num_steps: int,
):
    """Initialize DyNA-PPO agent environment.

    Based on this tutorial:
    https://www.mikulskibartosz.name/how-to-create-an-environment-for-a-tensorflow-agent

    Args:
        alphabet: Usually UCGA.
        starting_seq: When initializing the environment, the sequence which
            is initially mutated.
        model: Model which evaluates each sequence.
        landscape: Ground truth landscape that the model approximates.
        max_num_steps: Maximum number of steps before episode is forced to
            terminate. Usually the `model_queries_per_batch`.
    """
    self.alphabet = alphabet

    # model/model/measurements
    self.model = model
    self.landscape = landscape
    self.fitness_model_is_gt = False
    self.previous_fitness = -float("inf")

    self.seq = starting_seq
    self._state = {
        "sequence": s_utils.string_to_one_hot(
            self.seq, self.alphabet).astype(np.float32),
        "fitness": self.model.get_fitness([starting_seq]).astype(np.float32),
    }
    self.episode_seqs = set()  # the sequences seen in the current episode
    self.all_seqs = {}
    self.measured_sequences = {}

    self.lam = 0.1

    # tf_agents environment
    self._action_spec = array_spec.BoundedArraySpec(
        shape=(1,),
        dtype=np.int64,  # `np.integer` is abstract; use a concrete dtype
        minimum=0,
        maximum=len(self.seq) * len(self.alphabet) - 1,
        name="action",
    )
    self._observation_spec = {
        "sequence": array_spec.BoundedArraySpec(
            shape=(len(self.seq), len(self.alphabet)),
            dtype=np.float32,
            minimum=0,
            maximum=1,
        ),
        "fitness": array_spec.ArraySpec(shape=(1,), dtype=np.float32),
    }

    self.num_steps = 0
    self.max_num_steps = max_num_steps
def __init__(self,
             emulator,
             balance,
             logger=None,
             start_time=1581434096,
             test_time=12 * 3600,
             indent=3600,
             period=1.,
             reset=True,
             string_start='',
             orderbook_depth=5,
             action_ratio=0.25,
             return_type='delta',
             pair_list=None,
             asset_list=None):
    super().__init__()
    self.action_ratio = action_ratio
    self.db = DB()
    self.emulator = emulator
    self.logger = logger
    self.indent = indent
    self.return_type = return_type
    self.period = period
    self.start_time = start_time
    self.test_time = test_time
    self.current_data = {}
    self.memory = {}
    self.data = {}
    self.somes = {}
    self.times = {}
    self.current_time = start_time
    self.agent_balance = balance.copy()
    self.start_balance = balance.copy()
    self.max_balance = balance.copy()
    self.currency_number = len(balance)
    self.orderbook_depth = orderbook_depth

    if pair_list is None:
        with open(string_start + 'settings/pairs.txt') as file:
            self.pairs = [a[:-1] for a in file.readlines()]
        self.pair_number = len(self.pairs)
    else:
        self.pairs = pair_list.copy()
        self.pair_number = len(self.pairs)
    if asset_list is None:
        with open(string_start + 'settings/cryptos.txt') as file:
            self.assets = [a[:-1] for a in file.readlines()]
    else:
        self.assets = asset_list.copy()
    assert len(self.agent_balance) == len(
        self.assets), 'Hey friend, what kind of shenanigans are you pulling?'

    n = self.orderbook_depth
    memory_columns = [f'depth_ask_price_{i + 1}' for i in range(n)] + \
                     [f'depth_bid_price_{i + 1}' for i in range(n)] + \
                     [f'depth_ask_quantity_{i + 1}' for i in range(n)] + \
                     [f'depth_bid_quantity_{i + 1}' for i in range(n)]
    self.memory_columns = {
        val: idx for idx, val in enumerate(memory_columns)
    }

    for pair in self.pairs:
        self.memory[pair] = self.db.fetch_pandas(start=start_time - indent,
                                                 end=start_time + test_time,
                                                 pair_names={pair})
        self.times[pair] = self.memory[pair]['time'].copy()
        self.times[pair].index = self.times[pair].apply(
            datetime.datetime.fromtimestamp)
        self.memory[pair].index = self.memory[pair]['time'].apply(
            datetime.datetime.fromtimestamp)
        data = dp.basic_clean(self.memory[pair].copy())
        copy = data.copy()
        some = dp.make_x(copy)
        self.times[pair] = self.times[pair][some.index]
        self.somes[pair] = some

    common_index = self.times[self.pairs[0]].index
    for pair in self.pairs:
        common_index = common_index.intersection(self.times[pair].index)
    self.time = self.times[self.pairs[0]][common_index]

    for pair in self.pairs:
        some = self.somes[pair]
        if reset:
            scaler = StandardScaler()
            ok_cols = list(some.columns)
            scaler.fit(some)
            joblib.dump(
                ok_cols, string_start + 'settings/Env_settings/' + pair +
                '_columns.joblib')
            joblib.dump(
                scaler, string_start + 'settings/Env_settings/' + pair +
                '_scaler.joblib')
        else:
            scaler = joblib.load(string_start + 'settings/Env_settings/' +
                                 pair + '_scaler.joblib')
            ok_cols = joblib.load(string_start + 'settings/Env_settings/' +
                                  pair + '_columns.joblib')
        some = some[ok_cols]
        some = some.loc[common_index]
        self.memory[pair] = self.memory[pair].loc[common_index][
            memory_columns].values
        self.data[pair] = scaler.transform(some)

    time = self.time.reset_index(drop=True)
    current = pd.Series(range(start_time, start_time + test_time))
    timeta = pd.DataFrame(time, columns=['time'])
    timeta['index'] = timeta.index
    curta = pd.DataFrame(current, columns=['time'])
    merged = curta.merge(timeta, how='outer', sort=True)
    merged.ffill(inplace=True)
    final = curta.join(merged.set_index('time'), on='time')
    final = final.set_index('time')
    final['index'] = final['index'].apply(int)
    self.time_to_id = final
    del self.times, self.somes

    self._action_spec = array_spec.BoundedArraySpec(
        shape=(),
        dtype=np.int32,
        minimum=0,
        maximum=self.pair_number * 2,
        name='action')
    obs_shape = self.pair_number * 109 + self.currency_number
    self._observation_spec = array_spec.ArraySpec(shape=(obs_shape,),
                                                  dtype=np.float64,
                                                  name='observation')
    self._episode_ended = False
    init_handle = self.emulator.handle([], self.agent_balance,
                                       self.form_orderbook())
    self.history = [(self.current_time, init_handle['new_usdt'])]
def testCheckArrayMatch(self, dtype):
  spec = array_spec.BoundedArraySpec((2,), dtype, minimum=5, maximum=15)
  self.assertTrue(spec.check_array(np.array([6, 7], dtype)))
  # Bounds should be inclusive.
  self.assertTrue(spec.check_array(np.array([5, 15], dtype)))
def testBoundedArraySpecSample(self, dtype):
  spec = array_spec.BoundedArraySpec((2, 3), dtype, -10, 10)
  sample = array_spec.sample_spec_nest(spec, self.rng)
  self.assertTrue(np.all(sample >= -10))
  self.assertTrue(np.all(sample <= 10))
def __init__(self, fake=False, metrics_key='001'):
    with open('running', 'w') as f:
        f.write(str(os.getpid()))
    self._episode_ended = False
    self.game = serpent.initialize_game('T4TF1')
    game_frame = self.game.screen_regions['GAME_REGION']
    self.width = 10
    self.height = 10
    self.state_shape = (int(self.height / 2), int(self.width / 2), 1)
    self._action_spec = array_spec.BoundedArraySpec(
        shape=(), dtype=np.int32, minimum=0, maximum=1, name='action')
    self._observation_spec = array_spec.BoundedArraySpec(
        shape=self.state_shape, dtype=np.float32, minimum=0.0,
        name='observation')
    self._state = np.zeros(self.state_shape).astype(np.float32)
    if fake:
        return

    self.interrupted = False
    self.game.launch()
    self.game.start_frame_grabber()
    self.input_controller = InputController(game=self.game)
    self.frame_buffer = FrameGrabber.get_frames([0])
    self.frame_buffer = self.extract_game_area(self.frame_buffer)
    self.width = self.frame_buffer[0].shape[1]
    self.height = self.frame_buffer[0].shape[0]
    print('width: %d' % self.width)
    print('height: %d' % self.height)
    # Rebuild the specs: the real frame size is only known once the frame
    # grabber is running.
    self.state_shape = (self.height, self.width, 3)
    self._action_spec = array_spec.BoundedArraySpec(
        shape=(), dtype=np.int32, minimum=0, maximum=1, name='action')
    self._observation_spec = array_spec.BoundedArraySpec(
        shape=self.state_shape, dtype=np.float32, minimum=0.0,
        name='observation')
    self._state = np.zeros(self.state_shape).astype(np.float32)
    # print('created input with pid: %s' % self.input_proc.pid)
    self.sell_keys = [KeyboardKey.KEY_LEFT_SHIFT, KeyboardKey.KEY_LEFT_CTRL,
                      KeyboardKey.KEY_S]
    self.buy_keys = [KeyboardKey.KEY_LEFT_SHIFT, KeyboardKey.KEY_LEFT_CTRL,
                     KeyboardKey.KEY_B]
    self.step_keys = [KeyboardKey.KEY_LEFT_SHIFT, KeyboardKey.KEY_LEFT_CTRL,
                      KeyboardKey.KEY_F]
    self.visual_debugger = VisualDebugger()
    self.scraper = T4Scraper(game=self.game,
                             visual_debugger=self.visual_debugger)
    frame = self.game.grab_latest_frame()
    self.scraper.current_frame = frame
    self.pl = 0
    self.working_trade = 0
    self.current_action = ''
    self.held = False
    self.fill_count = 0
    self.window_controller = WindowController()
    self.window_id = self.window_controller.locate_window(".*Mini-Dow .*")
    # self.window_id = self.window_controller.locate_window(".*S&P .*")
    self.keys = RedisKeys(metrics_key)
    # self.redis = redis.Redis(port=6001)
    self.number_of_trades = 0
    self.number_of_wins = 0
    self.buys = 0
    self.sells = 0
    self.holds = 0
    self.history = list()
    self.actions = 0
    self.last_action = ''
    self.previous_write = -1
    self.get_metadata()
    self.active_frame = None
    self.start_time = time.time()
    self.step_read_time = 0
    self.step_write_time = 0
def action_spec(self):
  spec = self._env.action_spec()
  minimum = np.zeros(shape=spec.shape, dtype=spec.dtype)
  maximum = spec.maximum - spec.minimum
  return array_spec.BoundedArraySpec(spec.shape, spec.dtype,
                                     minimum=minimum, maximum=maximum)
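# A zero-offset action_spec like the one above is only half of the wrapper:
# the matching _step has to shift the agent's zero-based action back into the
# wrapped environment's native range. A minimal sketch, mirroring how
# tf_agents' ActionOffsetWrapper behaves:
def _step(self, action):
  # The agent acts in [0, max - min]; the wrapped env expects [min, max].
  return self._env.step(action + self._env.action_spec().minimum)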
def __init__(self, window_name, render_me=True):
    # game parameters
    self._board_size = 5
    self._max_turns = 400
    if self._max_turns > 20:
        self._frames = 20
    else:
        self._frames = self._max_turns
    self._agent_count = 2
    self._channels = 3
    self._action_def = {
        0: ShipAction.EAST,
        1: ShipAction.NORTH,
        2: "NOTHING",
        3: ShipAction.SOUTH,
        4: ShipAction.WEST
    }

    # runtime parameters
    self.turns_counter = 0
    self.episode_ended = False
    self.total_reward = 0
    self.ships_idle = []
    self.shipyards_idle = []
    self.last_reward = 0
    self.render_step = render_me

    # initialize game
    self.environment = make("halite",
                            configuration={
                                "size": self._board_size,
                                "startingHalite": 1000,
                                "episodeSteps": self._max_turns
                            })
    self.environment.reset(self._agent_count)

    self._action_spec = array_spec.BoundedArraySpec(
        shape=(),
        dtype=np.int32,
        minimum=0,
        maximum=len(self._action_def) - 1,
        name='action')
    self._observation_spec = array_spec.BoundedArraySpec(
        shape=(self._frames, self._board_size, self._board_size,
               self._channels),
        dtype=np.int32,
        minimum=0,
        maximum=1,
        name='observation')
    self.state = np.zeros(
        [self._board_size, self._board_size, self._channels])
    # channel 0 = Halite 0-1
    # channel 1 = Ships (this one-hot; rest are .5)
    # channel 2 = Shipyards (this one-hot; rest are .5)
    self.state_history = [self.state] * self._frames

    # get board
    self.board = self.get_board()
    self.prime_board()
    self.halite_image_render = image_render(self._board_size)
    self.previous_ship_count = 0
def action_spec(self):
  return array_spec.BoundedArraySpec(
      [7], dtype=np.float32, minimum=-1.0, maximum=1.0)
def __init__(self, simulator, discount_factor=1.0):
    self.timestep = None  # Time step counter; set to 0 on reset
    self.simulator = simulator  # Synchronize env <--> simulator
    self._dt = <INSERT_HERE>  # Simulation time step used by the simulator

    # These parameters need to be overwritten
    # 1. Action spec - i.e. what actions are allowed by the environment.
    self._action_spec = array_spec.BoundedArraySpec(
        shape=(<INSERT_HERE>),
        dtype=np.float32,
        minimum=<INSERT_HERE>,
        maximum=<INSERT_HERE>,
        name='action')  # Actions specification

    # 2. Observation spec - i.e. what an agent is allowed to observe in
    # this environment.
    self._observation_spec = array_spec.BoundedArraySpec(
        shape=(<INSERT_HERE>),
        dtype=np.float32,
        minimum=<INSERT_HERE>,
        name='observation')  # States are [x y theta velocity]^T

    # 3. The "reset" state
    self.state0 = <INSERT_HERE>  # Store initial state for resets

    # 4. The general state
    self._state = <INSERT_HERE>

    # 5. Keep track of whether an episode is completed
    self._episode_ended = False

    # 6. Discount factor
    self.discount_factor = discount_factor

def action_spec(self):
    """Get action_spec class attribute.

    Getter method for the action_spec class attribute.

    Returns:
        Returns the action specification for this Python environment class.
    """
    return self._action_spec

def observation_spec(self):
    """Get observation_spec class attribute.

    Getter method for the observation_spec class attribute.

    Returns:
        Returns the observation specification for this Python environment
        class.
    """
    return self._observation_spec

def batch_size(self):
    return 1  # This template models a single, unbatched environment.

def batched(self):
    return self.batch_size() != 1

def _reset(self):
    """Reset the environment back to its default state.

    This method is used for resetting at the end of episodes, and returns
    the environment state to its initialized state.

    Returns:
        A tf-agents function that carries information about resetting
        relevant environment variables back to their default settings.
    """
    self._state = <PICK_RANDOM_STATE>  # Reset this to a random state
    self.timestep = 0  # Reset time step counter
    self._episode_ended = False
    return ts.restart(np.array(self.state0, dtype=np.float32))

def _step(self, action):
    """Main functionality for stepping the RL model in this environment.

    This function lets the agent take an action, then updates the agent's
    state appropriately and computes the agent's reward.

    Arguments:
        action (list): A list corresponding to the action components
            [a1, a2, ..., aN] the agent takes at each time step.

    Returns:
        A tf-agents function that carries information about the current
        observation and discounted reward.
    """
    # If episode is over, after terminating time step, reset environment
    if self._episode_ended:
        return self.reset()

    # Else, step the agent, update state, and compute reward
    position_x, position_y, theta, velocity = \
        self.simulator.state_transition(self._state, action, self._dt)
    position_x, position_y = self.check_bounding_box(
        [position_x, position_y])

    # Update state here
    self._state = <UPDATE_STATE_HERE>

    # Compute reward
    reward = <INSERT_REWARD_COMPUTATION>

    # Check if the episode has ended (self.timestep is a plain int counter,
    # so the terminal check must be written against it or the state)
    if <INSERT_TERMINATION_CONDITION>:
        self._episode_ended = True
        return ts.termination(np.array(self._state, dtype=np.float32),
                              reward=reward)

    # Else, step the time step counter and transition
    self.timestep += 1
    return ts.transition(np.array(self._state, dtype=np.float32),
                         reward=reward,
                         discount=float(self.discount_factor))
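# One way the placeholders above might be filled in, assuming a unicycle-style
# simulator with state [x, y, theta, velocity] and a 2-D action
# [acceleration, steering_rate]. The bounds and reward are illustrative, not
# part of the original template:
#
#   self._dt = 0.1
#   self._action_spec = array_spec.BoundedArraySpec(
#       shape=(2,), dtype=np.float32, minimum=-1.0, maximum=1.0,
#       name='action')
#   self._observation_spec = array_spec.BoundedArraySpec(
#       shape=(4,), dtype=np.float32, minimum=-100.0, name='observation')
#   self.state0 = np.zeros(4, dtype=np.float32)
#   self._state = self.state0
#
# and in _step, after the simulator transition:
#
#   self._state = np.array([position_x, position_y, theta, velocity],
#                          dtype=np.float32)
#   reward = -np.linalg.norm(self._state[:2])  # e.g. drive toward the origin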
def __init__(self,
             global_context_sampling_fn: Callable[[], types.Array],
             arm_context_sampling_fn: Callable[[], types.Array],
             num_actions: int,
             reward_fn: Callable[[types.Array], Sequence[float]],
             batch_size: Optional[int] = 1):
  """Initializes the environment.

  In each round, global context is generated by global_context_sampling_fn,
  per-arm contexts are generated by arm_context_sampling_fn. The two feature
  generating functions should output a single observation, not including
  either the batch_size or the number of actions. The reward_fn function
  takes a global and a per-arm feature, and outputs a possibly random
  reward.

  Example:
    def global_context_sampling_fn():
      return np.random.randint(0, 10, [2])  # 2-dimensional global features.

    def arm_context_sampling_fn():
      return {'armf1': np.random.randint(-3, 4, [3]),    # A dictionary of
              'armf2': np.random.randint(0, 2, [4, 5])}  # arm features.

    def reward_fn(global_obs, arm_obs):
      return sum(global_obs) + arm_obs['armf1'][0] + arm_obs['armf2'][3, 3]

    env = StationaryStochasticStructuredPyEnvironment(
        global_context_sampling_fn,
        arm_context_sampling_fn,
        5,
        reward_fn,
        batch_size=5)

  Args:
    global_context_sampling_fn: A function that outputs a possibly nested
      structure of features. This output is the global context. Its shapes
      and types must be consistent across calls.
    arm_context_sampling_fn: A function that outputs a possibly nested
      structure of features. This output is the per-arm context. Its shapes
      must be consistent across calls.
    num_actions: (int) the number of actions in every sample.
    reward_fn: A function that generates a reward when called with a global
      and a per-arm observation.
    batch_size: The batch size.
  """
  self._global_context_sampling_fn = global_context_sampling_fn
  self._arm_context_sampling_fn = arm_context_sampling_fn
  self._num_actions = num_actions
  self._reward_fn = reward_fn
  self._batch_size = batch_size
  global_example = global_context_sampling_fn()
  arm_example = arm_context_sampling_fn()
  observation_spec = {
      GLOBAL_KEY:
          tf.nest.map_structure(array_spec.ArraySpec.from_array,
                                global_example),
      PER_ARM_KEY:
          array_spec.add_outer_dims_nest(
              tf.nest.map_structure(array_spec.ArraySpec.from_array,
                                    arm_example), (num_actions,))
  }
  action_spec = array_spec.BoundedArraySpec(
      shape=(),
      dtype=np.int32,
      minimum=0,
      maximum=num_actions - 1,
      name='action')
  super(StationaryStochasticStructuredPyEnvironment,
        self).__init__(observation_spec, action_spec)
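# A short usage sketch for the environment defined above, reusing the
# sampling and reward functions from its docstring example. Driving the
# batched bandit py environment with a per-entry integer action array is an
# assumption about typical usage, not code from this module:
env = StationaryStochasticStructuredPyEnvironment(
    global_context_sampling_fn, arm_context_sampling_fn, 5, reward_fn,
    batch_size=2)
time_step = env.reset()
# time_step.observation is a {GLOBAL_KEY: ..., PER_ARM_KEY: ...} nest with a
# leading batch dimension; actions are integer arm indices per batch entry.
time_step = env.step(np.array([0, 3], dtype=np.int32))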
def __init__(self,
             global_context_sampling_fn,
             arm_context_sampling_fn,
             max_num_actions,
             reward_fn,
             num_actions_fn=None,
             batch_size=1,
             variable_action_method=VariableActionMethod.FIXED):
  """Initializes the environment.

  In each round, global context is generated by global_context_sampling_fn,
  per-arm contexts are generated by arm_context_sampling_fn. The reward_fn
  function takes the concatenation of a global and a per-arm feature, and
  outputs a possibly random reward.

  In case `num_actions_fn` is specified, the number of actions will be
  dynamic. The actual number of actions can be encoded in multiple ways,
  specified by `variable_action_method`. The observation spec constructed
  by the environment will also reflect the method used. The list below
  explains how the observations are built for each method.

  The different values of `variable_action_method` and the corresponding
  behavior:

  -- `FIXED` (default): The number of actions per sample is fixed. In this
     case, `num_actions_fn` should be `None`.
  -- `MASK`: The actually available actions are encoded by an action mask
     added to the observation in the format of
     `(observation, [1 1 ... 1 0 ... 0])`. The length of the mask, as well
     as the number of arm observations, is `max_num_actions`.
  -- `NUM_ACTIONS_FEATURE`: An extra feature key `num_actions` is added to
     the observation, with an integer feature value indicating the number
     of available actions. The arm observation tensor has shape
     `[batch_size, max_num_actions, arm_feature_dim]`.
  -- `IN_BATCH_DIM`: The number of actions is folded into the batch
     dimension. In this case, the actual batch size should be 1, and the
     batch dimension is used to list all the actions for a sample. The
     global observation will internally be tiled to match this induced
     batch size. Also note that in this case, the `max_num_actions`
     parameter is ignored.

  Example:
    def global_context_sampling_fn():
      return np.random.randint(0, 10, [2])  # 2-dimensional global features.

    def arm_context_sampling_fn():
      return np.random.randint(-3, 4, [3])  # 3-dimensional arm features.

    def reward_fn(x):
      return sum(x)

    def num_actions_fn():
      return np.random.randint(2, 6)

    env = StationaryStochasticPerArmPyEnvironment(
        global_context_sampling_fn, arm_context_sampling_fn, 5, reward_fn,
        num_actions_fn,
        variable_action_method=VariableActionMethod.NUM_ACTIONS_FEATURE)

  Args:
    global_context_sampling_fn: A function that outputs a random 1d array
      or list of ints or floats. This output is the global context. Its
      shape and type must be consistent across calls.
    arm_context_sampling_fn: A function that outputs a random 1d array or
      list of ints or floats (same type as the output of
      `global_context_sampling_fn`). This output is the per-arm context.
      Its shape must be consistent across calls.
    max_num_actions: (int) the maximum number of actions in every sample.
      If `num_actions_fn` is not set, this many actions are available in
      every time step.
    reward_fn: A function that generates a reward when called with an
      observation.
    num_actions_fn: If set, it should be a function that outputs a single
      integer specifying the number of actions for a given time step. The
      value output by this function will be capped between 1 and
      `max_num_actions`. The number of actions will be encoded based on the
      method specified in `variable_action_method`. The different encodings
      are explained in the documentation above.
    batch_size: The batch size.
    variable_action_method: An instance of `VariableActionMethod`.
      Determines the way variable numbers of actions are handled.
  """
  self._global_context_sampling_fn = global_context_sampling_fn
  self._arm_context_sampling_fn = arm_context_sampling_fn
  self._max_num_actions = max_num_actions
  self._reward_fn = reward_fn
  self._batch_size = batch_size
  self._num_actions_fn = num_actions_fn
  self._variable_action_method = variable_action_method
  observation_spec = self._create_observation_spec()
  action_spec = array_spec.BoundedArraySpec(
      shape=(),
      dtype=np.int32,
      minimum=0,
      maximum=max_num_actions - 1,
      name='action')
  super(StationaryStochasticPerArmPyEnvironment,
        self).__init__(observation_spec, action_spec)
def testNotEqualDifferentMaximum(self):
  spec_1 = array_spec.BoundedArraySpec(
      (1, 2), np.int32, minimum=0.0, maximum=2.0)
  spec_2 = array_spec.BoundedArraySpec(
      (1, 2), np.int32, minimum=[0.0, 0.0], maximum=[1.0, 1.0])
  self.assertNotEqual(spec_1, spec_2)
def __init__(self,
             data_dir: Text,
             rank_k: int,
             batch_size: int = 1,
             num_actions: int = 50,
             csv_delimiter=',',
             name: Optional[Text] = 'movielens_per_arm'):
  """Initializes the Per-arm MovieLens Bandit environment.

  Args:
    data_dir: (string) Directory where the data lies (in text form).
    rank_k: (int) Which rank to use in the matrix factorization. This will
      also be the feature dimension of both the user and the movie features.
    batch_size: (int) Number of observations generated per call.
    num_actions: (int) How many movies to choose from per round.
    csv_delimiter: (string) The delimiter to use in loading the data csv
      file.
    name: (string) The name of this environment instance.
  """
  self._batch_size = batch_size
  self._context_dim = rank_k
  self._num_actions = num_actions

  # Compute the matrix factorization.
  self._data_matrix = dataset_utilities.load_movielens_data(
      data_dir, delimiter=csv_delimiter)
  self._num_users, self._num_movies = self._data_matrix.shape

  # Compute the SVD.
  u, s, vh = np.linalg.svd(self._data_matrix, full_matrices=False)

  # Keep only the largest singular values.
  self._u_hat = u[:, :rank_k].astype(np.float32)
  self._s_hat = s[:rank_k].astype(np.float32)
  self._v_hat = np.transpose(vh[:rank_k]).astype(np.float32)

  self._approx_ratings_matrix = np.matmul(self._u_hat * self._s_hat,
                                          np.transpose(self._v_hat))

  self._action_spec = array_spec.BoundedArraySpec(
      shape=(),
      dtype=np.int32,
      minimum=0,
      maximum=num_actions - 1,
      name='action')
  observation_spec = {
      GLOBAL_KEY:
          array_spec.ArraySpec(shape=[rank_k], dtype=np.float32),
      PER_ARM_KEY:
          array_spec.ArraySpec(
              shape=[num_actions, rank_k], dtype=np.float32),
  }
  self._time_step_spec = ts.time_step_spec(observation_spec)

  self._current_user_indices = np.zeros(batch_size, dtype=np.int32)
  self._previous_user_indices = np.zeros(batch_size, dtype=np.int32)

  self._current_movie_indices = np.zeros([batch_size, num_actions],
                                         dtype=np.int32)
  self._previous_movie_indices = np.zeros([batch_size, num_actions],
                                          dtype=np.int32)

  self._observation = {
      GLOBAL_KEY: np.zeros([batch_size, rank_k]),
      PER_ARM_KEY: np.zeros([batch_size, num_actions, rank_k]),
  }

  super(MovieLensPerArmPyEnvironment, self).__init__(
      observation_spec, self._action_spec, name=name)
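# The observation-sampling step is not shown in this snippet. A hypothetical
# sketch of how an environment like the one above could assemble its per-arm
# observation each round (the name _sample_observation and the random
# sampling scheme are illustrative, not this module's confirmed API):
def _sample_observation(self):
  # Draw a batch of users and, for each, `num_actions` candidate movies.
  user_idx = np.random.randint(self._num_users, size=self._batch_size)
  movie_idx = np.array([
      np.random.choice(self._num_movies, self._num_actions, replace=False)
      for _ in range(self._batch_size)
  ])
  return {
      GLOBAL_KEY: self._u_hat[user_idx],    # [batch_size, rank_k]
      PER_ARM_KEY: self._v_hat[movie_idx],  # [batch_size, num_actions, rank_k]
  }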
def testRepr(self):
  as_string = repr(
      array_spec.BoundedArraySpec(
          (1, 2), np.int32, minimum=73.0, maximum=101.0))
  self.assertIn("101", as_string)
  self.assertIn("73", as_string)
def testInvalidMaximum(self):
  with self.assertRaisesRegexp(ValueError, "not compatible"):
    array_spec.BoundedArraySpec((3, 5), np.uint8, 0, (1, 1, 1))
def testCheckArrayNoMatch(self, array):
  spec = array_spec.BoundedArraySpec((2,), np.int64, minimum=5, maximum=15)
  self.assertFalse(spec.check_array(array))
def testMinLargerThanMax(self):
  with self.assertRaisesRegexp(ValueError,
                               "min has values greater than max"):
    array_spec.BoundedArraySpec((3,), np.uint8, (1, 2, 3), (3, 2, 1))
def __init__(self,
             data_dir: Text,
             rank_k: int,
             batch_size: int = 1,
             num_movies: int = 20,
             csv_delimiter: Text = ',',
             name: Optional[Text] = 'movielens'):
  """Initializes the MovieLens Bandit environment.

  Args:
    data_dir: (string) Directory where the data lies (in text form).
    rank_k: (int) Which rank to use in the matrix factorization.
    batch_size: (int) Number of observations generated per call.
    num_movies: (int) Only the first `num_movies` movies will be used by
      the environment. The rest is cut out from the data.
    csv_delimiter: (string) The delimiter to use in loading the data csv
      file.
    name: The name of this environment instance.
  """
  self._num_actions = num_movies
  self._batch_size = batch_size
  self._context_dim = rank_k

  # Compute the matrix factorization.
  self._data_matrix = dataset_utilities.load_movielens_data(
      data_dir, delimiter=csv_delimiter)
  # Keep only the first items.
  self._data_matrix = self._data_matrix[:, :num_movies]
  # Filter out the users with no items rated.
  nonzero_users = list(
      np.nonzero(np.sum(self._data_matrix, axis=1) > 0.0)[0])
  self._data_matrix = self._data_matrix[nonzero_users, :]
  self._effective_num_users = len(nonzero_users)

  # Compute the SVD.
  u, s, vh = np.linalg.svd(self._data_matrix, full_matrices=False)

  # Keep only the largest singular values.
  self._u_hat = u[:, :rank_k] * np.sqrt(s[:rank_k])
  self._v_hat = np.transpose(
      np.transpose(vh[:rank_k, :]) * np.sqrt(s[:rank_k]))
  self._approx_ratings_matrix = np.matmul(self._u_hat, self._v_hat)

  self._current_users = np.zeros(batch_size)
  self._previous_users = np.zeros(batch_size)

  self._action_spec = array_spec.BoundedArraySpec(
      shape=(),
      dtype=np.int32,
      minimum=0,
      maximum=self._num_actions - 1,
      name='action')
  observation_spec = array_spec.ArraySpec(
      shape=(self._context_dim,), dtype=np.float64, name='observation')
  self._time_step_spec = ts.time_step_spec(observation_spec)
  self._observation = np.zeros((self._batch_size, self._context_dim))

  self._optimal_action_table = np.argmax(
      self._approx_ratings_matrix, axis=1)
  self._optimal_reward_table = np.max(self._approx_ratings_matrix, axis=1)

  super(MovieLensPyEnvironment, self).__init__(
      observation_spec, self._action_spec, name=name)
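# The rank-k reconstruction above is the standard truncated-SVD approximation.
# A small self-contained sketch of the same computation on illustrative data
# (the random matrix stands in for the real ratings matrix):
import numpy as np

ratings = np.random.rand(6, 4)  # hypothetical user-by-movie matrix
rank_k = 2
u, s, vh = np.linalg.svd(ratings, full_matrices=False)
u_hat = u[:, :rank_k] * np.sqrt(s[:rank_k])          # user factors
v_hat = (vh[:rank_k, :].T * np.sqrt(s[:rank_k])).T   # movie factors
approx = u_hat @ v_hat                               # best rank-k fit
# approx[i, j] is the predicted rating used as the bandit reward for
# recommending movie j to user i.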
def testMinMaxAttributes(self):
  spec = array_spec.BoundedArraySpec((1, 2, 3), np.float32, 0, (5, 5, 5))
  self.assertEqual(type(spec.minimum), np.ndarray)
  self.assertEqual(type(spec.maximum), np.ndarray)
def __init__(self,
             global_context_sampling_fn,
             arm_context_sampling_fn,
             max_num_actions,
             reward_fn,
             num_actions_fn=None,
             batch_size=1):
  """Initializes the environment.

  In each round, global context is generated by global_context_sampling_fn,
  per-arm contexts are generated by arm_context_sampling_fn. The reward_fn
  function takes the concatenation of a global and a per-arm feature, and
  outputs a possibly random reward.

  In case `num_actions_fn` is specified, the number of actions will be
  dynamic and a `num_actions` feature key indicates the number of actions
  in any given sample.

  Example:
    def global_context_sampling_fn():
      return np.random.randint(0, 10, [2])  # 2-dimensional global features.

    def arm_context_sampling_fn():
      return np.random.randint(-3, 4, [3])  # 3-dimensional arm features.

    def reward_fn(x):
      return sum(x)

    def num_actions_fn():
      return np.random.randint(2, 6)

    env = StationaryStochasticPerArmPyEnvironment(
        global_context_sampling_fn, arm_context_sampling_fn, 5, reward_fn,
        num_actions_fn)

  Args:
    global_context_sampling_fn: A function that outputs a random 1d array
      or list of ints or floats. This output is the global context. Its
      shape and type must be consistent across calls.
    arm_context_sampling_fn: A function that outputs a random 1d array or
      list of ints or floats (same type as the output of
      `global_context_sampling_fn`). This output is the per-arm context.
      Its shape must be consistent across calls.
    max_num_actions: (int) the maximum number of actions in every sample.
      If `num_actions_fn` is not set, this many actions are available in
      every time step.
    reward_fn: A function that generates a reward when called with an
      observation.
    num_actions_fn: If set, it should be a function that outputs a single
      integer specifying the number of actions for a given time step. The
      value output by this function will be capped between 1 and
      `max_num_actions`. The number of actions will be encoded in the
      observation by the feature key `num_actions`.
    batch_size: The batch size.
  """
  self._global_context_sampling_fn = global_context_sampling_fn
  self._arm_context_sampling_fn = arm_context_sampling_fn
  self._max_num_actions = max_num_actions
  self._reward_fn = reward_fn
  self._batch_size = batch_size
  self._num_actions_fn = num_actions_fn

  observation_spec = {
      GLOBAL_KEY:
          array_spec.ArraySpec.from_array(global_context_sampling_fn()),
      PER_ARM_KEY:
          array_spec.add_outer_dims_nest(
              array_spec.ArraySpec.from_array(arm_context_sampling_fn()),
              (max_num_actions,))
  }
  if self._num_actions_fn is not None:
    num_actions_spec = array_spec.BoundedArraySpec(
        shape=(),
        dtype=np.dtype(type(self._num_actions_fn())),
        minimum=1,
        maximum=max_num_actions)
    observation_spec.update({NUM_ACTIONS_KEY: num_actions_spec})

  action_spec = array_spec.BoundedArraySpec(
      shape=(),
      dtype=np.int32,
      minimum=0,
      maximum=max_num_actions - 1,
      name='action')
  super(StationaryStochasticPerArmPyEnvironment,
        self).__init__(observation_spec, action_spec)
def testNotWriteable(self):
  spec = array_spec.BoundedArraySpec((1, 2, 3), np.float32, 0, (5, 5, 5))
  with self.assertRaisesRegexp(ValueError, "read-only"):
    spec.minimum[0] = -1
  with self.assertRaisesRegexp(ValueError, "read-only"):
    spec.maximum[0] = 100
def observation_spec(self):
    return array_spec.BoundedArraySpec(
        shape=(360,), dtype=np.dtype('float64'), name='observation')
def testEqualBroadcastingBounds(self):
  spec_1 = array_spec.BoundedArraySpec(
      (1, 2), np.int32, minimum=0.0, maximum=1.0)
  spec_2 = array_spec.BoundedArraySpec(
      (1, 2), np.int32, minimum=[0.0, 0.0], maximum=[1.0, 1.0])
  self.assertEqual(spec_1, spec_2)
def setUp(self):
  super(AgentPolicyTest, self).setUp()
  self._action_spec = array_spec.BoundedArraySpec(
      shape=(3,),
      dtype=np.float64,  # `np.float` is removed in modern NumPy
      minimum=[0, 0, 0],
      maximum=[1, 1, 1])
def testReuseSpec(self):
  spec_1 = array_spec.BoundedArraySpec(
      (1, 2), np.int32, minimum=0.0, maximum=1.0)
  spec_2 = array_spec.BoundedArraySpec(spec_1.shape, spec_1.dtype,
                                       spec_1.minimum, spec_1.maximum)
  self.assertEqual(spec_1, spec_2)
def test_continuous(self):
  obs_spec = array_spec.BoundedArraySpec((2, 3), np.int32, -10, 10)
  action_spec = array_spec.BoundedArraySpec((2,), np.float32, -1, 1)
  with self.assertRaisesRegexp(ValueError, 'discrete action specs'):
    env = random_py_environment.RandomPyEnvironment(obs_spec, action_spec)
    env = wrappers.ActionOffsetWrapper(env)
import numpy as np
from random import randint

from tf_agents.specs import array_spec

from generic_environment import GenericEnv
from dqn_agent import DqnAgent

# params
num_episode = 2000  # @param
board_size = 9

# env
dqn = DqnAgent(
    array_spec.BoundedArraySpec(
        shape=(), dtype=np.int32, minimum=0, maximum=3, name='action'),
    array_spec.BoundedArraySpec(
        shape=(2,), dtype=np.int32, minimum=0, maximum=board_size,
        name='observation'),
    np.array([0, 0], dtype=np.int32))  # [row, column]

state = np.array([0, 0], dtype=np.int32)
episode_count = 0
step_count = 0

while episode_count < num_episode:
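    # The loop body is truncated in the original snippet. A hypothetical
    # continuation, purely illustrative (the DqnAgent methods named here are
    # assumptions, not this module's confirmed API):
    #
    #   action = dqn.act(state)                 # pick a move
    #   state, reward, done = env.step(action)  # advance the board
    #   dqn.train(state, reward)                # learn from the result
    #   step_count += 1
    #   if done:
    #       episode_count += 1
    #       state = np.array([0, 0], dtype=np.int32)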