def environment(discrete=True):
    if discrete:
        separators = [
            np.linspace(-.4, .4, num=5 + 1)[1:-1],   # x
            np.linspace(-.05, .9, num=6 + 1)[1:-1],  # y
            np.linspace(-.5, .5, num=5 + 1)[1:-1],   # vel x
            np.linspace(-.8, .8, num=7 + 1)[1:-1],   # vel y
            np.linspace(-.2, .2, num=3 + 1)[1:-1],   # rot
            np.linspace(-.2, .2, num=5 + 1)[1:-1],   # ang vel
            [.5],  # lc
            [.5],  # rc
        ]
        evaluator = gym_evaluator.GymEnvironment("LunarLander-v2", separators=separators)
    else:
        evaluator = gym_evaluator.GymEnvironment("LunarLander-v2")

    evaluator._expert = gym.make("LunarLander-v2")
    evaluator._expert.seed(42)
    evaluator._expert.continuous = not discrete

    def expert_trajectory():
        state, trajectory, done = evaluator._expert.reset(), [], False
        initial_state = evaluator._maybe_discretize(state)
        while not done:
            action = gym.envs.box2d.lunar_lander.heuristic(evaluator._expert, state)
            state, reward, done, _ = evaluator._expert.step(action)
            trajectory.append((action, reward, evaluator._maybe_discretize(state)))
        return initial_state, trajectory
    evaluator.expert_trajectory = expert_trajectory

    return evaluator
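# A minimal usage sketch (not part of the original file): expert trajectories
# can pre-fill a tabular action-value estimate before learning starts. It
# assumes the discrete evaluator exposes `states` and `actions` counts, as the
# tabular templates in this repository do; `q`, the step size 0.1 and the
# discount 0.99 are illustrative only.
env = environment(discrete=True)
q = np.zeros((env.states, env.actions))
for _ in range(100):
    state, trajectory = env.expert_trajectory()
    for action, reward, next_state in trajectory:
        # One-step Q-learning backup along the expert trajectory.
        q[state, action] += 0.1 * (reward + 0.99 * np.max(q[next_state]) - q[state, action])
        state = next_state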
def environment(discrete=True):
    if discrete:
        bins = 12
        separators = [
            np.linspace(-1.2, 0.6, num=bins + 1)[1:-1],    # car position
            np.linspace(-0.07, 0.07, num=bins + 1)[1:-1],  # car velocity
        ]
        return gym_evaluator.GymEnvironment("MountainCarLimit1000-v0", bins, separators)
    return gym_evaluator.GymEnvironment("MountainCarLimit1000-v0")
def environment(discrete=True, tiles=None, verbose=True):
    if discrete:
        bins = 24 if tiles is None or tiles <= 1 else 12 if tiles <= 3 else 8
        separators = [
            np.linspace(-1.2, 0.6, num=bins + 1)[1:-1],    # car position
            np.linspace(-0.07, 0.07, num=bins + 1)[1:-1],  # car velocity
        ]
        return gym_evaluator.GymEnvironment("MountainCarLimit1000-v0",
                                            separators=separators, tiles=tiles, verbose=verbose)
    return gym_evaluator.GymEnvironment("MountainCarLimit1000-v0", verbose=verbose)
def environment(discrete=True):
    if discrete:
        bins = 8
        # Note: CartPole observations are ordered
        # [cart position, cart velocity, pole angle, pole angle velocity].
        separators = [
            np.linspace(-2.4, 2.4, num=bins + 1)[1:-1],  # cart position
            np.linspace(-3, 3, num=bins + 1)[1:-1],      # cart velocity
            np.linspace(-0.5, 0.5, num=bins + 1)[1:-1],  # pole angle
            np.linspace(-2, 2, num=bins + 1)[1:-1],      # pole angle velocity
        ]
        return gym_evaluator.GymEnvironment("CartPole-v1", bins, separators)
    return gym_evaluator.GymEnvironment("CartPole-v1")
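# A short illustration (not in the original files) of what the `separators`
# above do: np.digitize buckets each state component by its separator array,
# yielding a tuple of bin indices, which gym_evaluator then combines into a
# single discrete state index.
import numpy as np

seps = [
    np.linspace(-2.4, 2.4, num=9)[1:-1],   # 8 bins -> 7 inner separators
    np.linspace(-3, 3, num=9)[1:-1],
    np.linspace(-0.5, 0.5, num=9)[1:-1],
    np.linspace(-2, 2, num=9)[1:-1],
]
state = [0.1, -1.2, 0.03, 0.4]
bins = [int(np.digitize(s, sep)) for s, sep in zip(state, seps)]
print(bins)  # [4, 2, 4, 4]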
def environment(cards):
    env_name = "MemoryGame{}-v0".format(cards)
    if env_name not in memory_games:
        gym.envs.register(id=env_name,
                          entry_point=lambda: MemoryGame(cards),
                          max_episode_steps=2 * cards,
                          reward_threshold=0)
        memory_games.add(env_name)
    env = gym_evaluator.GymEnvironment(env_name)
    env._expert = gym.make(env_name)

    def expert_episode():
        state = env._expert.reset()
        episode, seen, done = [], {}, False
        while not done:
            last_action, observation = state
            if observation in seen:
                action = seen.pop(observation)
                if action == last_action - 1:
                    action = cards
            else:
                seen[observation] = last_action
                action = cards
            episode.append((state, action))
            state, _, done, _ = env._expert.step(action)
        episode.append((state, None))
        return episode
    env.expert_episode = expert_episode

    return env
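# A hedged usage sketch (not part of the original file): expert episodes can
# provide supervised targets for a memory-game policy before RL fine-tuning.
# The batching below assumes states are fixed-size tuples, and the
# `model.train_on_batch` call is illustrative only.
env = environment(cards=8)
episode = env.expert_episode()
states, actions = zip(*episode[:-1])  # the final entry carries action None
# e.g., model.train_on_batch(np.array(states), np.array(actions))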
def environment(frame_skip=1):
    if frame_skip not in FRAME_SKIPS:
        raise ValueError("Unsupported frame skip {}, only {} are supported".format(
            frame_skip, list(FRAME_SKIPS)))
    return gym_evaluator.GymEnvironment("CarRacingCustomDrawFrameSkip{}-v0".format(frame_skip))
def environment(cards):
    env_name = "MemoryGame{}-v0".format(cards)
    if env_name not in memory_games:
        gym.envs.register(id=env_name,
                          entry_point=lambda: MemoryGame(cards),
                          max_episode_steps=2 * cards,
                          reward_threshold=0)
        memory_games.add(env_name)
    return gym_evaluator.GymEnvironment(env_name)
def environment():
    env = gym_evaluator.GymEnvironment("CarRacingCustomDraw-v0")

    def step(action, frame_skip=1):
        env._env.unwrapped.frame_skip = frame_skip
        return gym_evaluator.GymEnvironment.step(env, action)
    env.step = step

    return env
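# Usage sketch (not in the original file): the overridden `step` repeats the
# chosen action for `frame_skip` frames. The [steer, gas, brake] action format
# is the standard CarRacing convention and is assumed here.
env = environment()
state = env.reset()
state, reward, done, _ = env.step([0., 1., 0.], frame_skip=4)  # accelerate for 4 frames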
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

# Create the environment
env = gym_evaluator.GymEnvironment(args.env)

# Construct the network
network = Network(env, args)

# Initialize parallel workers by env.parallel_init
states = env.parallel_init(args.workers)

saved_model_path = Path(__file__).parent / 'paac_models_weights'
training = not args.use_pretrained

summary_writer = tf.summary.create_file_writer(
    str(Path(__file__).parent / 'logs' /
        f'train-{datetime.datetime.now().strftime("%Y%m%d-%H%M%S")}'))
summary_writer.set_as_default()
def environment():
    return gym_evaluator.GymEnvironment("CartPolePixels-v0")
def environment(seed=None):
    return gym_evaluator.GymEnvironment("CartPolePixels-v0", seed=seed)
class Network:
    def __init__(self, env, args):
        # TODO: Similarly to reinforce, define two models:
        # - _policy, which predicts a distribution over the actions
        # - _value, which predicts the value function
        # Use independent networks for both of them, each with
        # `args.hidden_layer` neurons in one hidden layer,
        # and train them using Adam with the given `args.learning_rate`.
        raise NotImplementedError()

    def train(self, states, actions, returns):
        states, actions, returns = np.array(states, np.float32), np.array(actions, np.int32), np.array(returns, np.float32)
        # TODO: Train the policy network using the policy gradient theorem
        # and the value network using MSE.

    def predict_actions(self, states):
        states = np.array(states, np.float32)
        return self._policy.predict_on_batch(states)

    def predict_values(self, states):
        states = np.array(states, np.float32)
        return self._value.predict_on_batch(states)[:, 0]


if __name__ == "__main__":
    # Parse arguments
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", default="CartPole-v1", type=str, help="Environment.")
    parser.add_argument("--evaluate_each", default=100, type=int, help="Evaluate each number of batches.")
    parser.add_argument("--evaluate_for", default=10, type=int, help="Evaluate for number of batches.")
    parser.add_argument("--gamma", default=None, type=float, help="Discounting factor.")
    parser.add_argument("--hidden_layer", default=None, type=int, help="Size of hidden layer.")
    parser.add_argument("--learning_rate", default=None, type=float, help="Learning rate.")
    parser.add_argument("--render_each", default=0, type=int, help="Render some episodes.")
    parser.add_argument("--threads", default=1, type=int, help="Maximum number of threads to use.")
    parser.add_argument("--workers", default=1, type=int, help="Number of parallel workers.")
    args = parser.parse_args()

    # Fix random seeds and number of threads
    np.random.seed(42)
    tf.random.set_seed(42)
    tf.config.threading.set_inter_op_parallelism_threads(args.threads)
    tf.config.threading.set_intra_op_parallelism_threads(args.threads)

    # Create the environment
    env = gym_evaluator.GymEnvironment(args.env)

    # Construct the network
    network = Network(env, args)

    # Initialize parallel workers by env.parallel_init
    states = env.parallel_init(args.workers)
    while True:
        # Training
        for _ in range(args.evaluate_each):
            # TODO: Choose actions using network.predict_actions

            # TODO: Perform steps by env.parallel_step

            # TODO: Compute return estimates by
            # - extracting next_states from steps
            # - computing value function approximation in next_states
            # - estimating returns by reward + (0 if done else args.gamma * next_state_value)

            # TODO: Train network using current states, chosen actions and estimated returns
            pass

        # Periodic evaluation
        returns = []
        for _ in range(args.evaluate_for):
            returns.append(0)
            state, done = env.reset(), False
            while not done:
                if args.render_each and env.episode > 0 and env.episode % args.render_each == 0:
                    env.render()
                probabilities = network.predict_actions([state])[0]
                action = np.argmax(probabilities)
                state, reward, done, _ = env.step(action)
                returns[-1] += reward
        print("Evaluation of {} episodes: {}".format(args.evaluate_for, np.mean(returns)))
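# A hedged sketch (not the official solution) of the two TODO models above,
# under the assumption that plain Keras Sequential models suffice for
# CartPole-sized inputs. `build_paac_models` is an illustrative helper name,
# and `--gamma`, `--hidden_layer` and `--learning_rate` must be given concrete
# values on the command line.
import tensorflow as tf

def build_paac_models(env, args):
    policy = tf.keras.Sequential([
        tf.keras.layers.Dense(args.hidden_layer, activation="relu", input_shape=env.state_shape),
        tf.keras.layers.Dense(env.actions, activation="softmax"),
    ])
    policy.compile(optimizer=tf.keras.optimizers.Adam(args.learning_rate),
                   loss="sparse_categorical_crossentropy")
    value = tf.keras.Sequential([
        tf.keras.layers.Dense(args.hidden_layer, activation="relu", input_shape=env.state_shape),
        tf.keras.layers.Dense(1),
    ])
    value.compile(optimizer=tf.keras.optimizers.Adam(args.learning_rate), loss="mse")
    return policy, value

# The train step can then weight the crossentropy by the advantage:
#   advantages = returns - value.predict_on_batch(states)[:, 0]
#   policy.train_on_batch(states, actions, sample_weight=advantages)
#   value.train_on_batch(states, returns)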
import gym
import numpy as np
import gym_evaluator
import _pickle as pickle
import sys

env = gym_evaluator.GymEnvironment('BipedalWalker-v2')
#env = gym.make('BipedalWalker-v2')
np.random.seed(10)

hl_size = 100
version = 1
npop = 50
sigma = 0.1
alpha = 0.03
iter_num = 300
aver_reward = None
allow_writing = True
reload = False

print(hl_size, version, npop, sigma, alpha, iter_num)

if reload:
    model = pickle.load(open('model-pedal%d.p' % version, 'rb'))
else:
    model = {}
    model['W1'] = np.random.randn(24, hl_size) / np.sqrt(24)
    model['W2'] = np.random.randn(hl_size, 4) / np.sqrt(hl_size)
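# A hedged completion sketch: the file sets up a two-layer policy for the
# 24-dimensional BipedalWalker observation and 4-dimensional action, which
# suggests an evolution-strategies loop along these lines (names such as
# `get_action` and `f` are illustrative, not from the original).
def get_action(state, model):
    h = np.tanh(np.dot(state, model['W1']))  # hidden layer
    return np.tanh(np.dot(h, model['W2']))   # actions in [-1, 1]

def f(model):
    # Episode return of the deterministic policy given by `model`.
    state, total, done = env.reset(), 0, False
    while not done:
        state, reward, done, _ = env.step(get_action(state, model))
        total += reward
    return total

for i in range(iter_num):
    # Sample npop Gaussian perturbations of every weight matrix.
    noise = {k: np.random.randn(npop, *v.shape) for k, v in model.items()}
    rewards = np.array([f({k: v + sigma * noise[k][j] for k, v in model.items()})
                        for j in range(npop)])
    advantages = (rewards - rewards.mean()) / (rewards.std() + 1e-8)
    for k in model:
        # ES gradient estimate: average of noise weighted by normalized returns.
        model[k] += alpha / (npop * sigma) * np.dot(noise[k].transpose(1, 2, 0), advantages)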
class Network:
    def __init__(self, env, args):
        # Store the arguments
        self.args = args

        # TODO: Create the actor. The input should be a batch of _sequences_ of
        # states (so the input shape is `[None] + env.state_shape`), each state
        # processed independently by the same network with a dense layer of
        # args.hidden_layer units with ReLU activation, followed by a softmax
        # layer with `env.actions` units.
        #
        # We use sequences of states on the input, because we want to predict
        # probabilities of up to `args.n` following states.
        #
        # We train the actor using sparse categorical crossentropy loss
        # and Adam optimizer with args.learning_rate.

        # TODO: Create the critic. The input should again be a batch of _sequences_
        # of states, each processed independently by a network with a dense layer of
        # args.hidden_layer units with ReLU activation, followed by a dense layer
        # with 1 output and no activation.
        #
        # We train the critic using MSE loss and Adam optimizer with args.learning_rate.
        raise NotImplementedError()

    # Do not change the method signature, as this method is used for testing in ReCodEx.
    @staticmethod
    def vtrace(args, actions, action_probabilities, rewards, actor_probabilities, critic_values):
        """Compute loss for the V-trace algorithm.

        Arguments:
          args: command line arguments
          actions: [batch_size, n] chosen actions
          action_probabilities: [batch_size, n] probabilities of the chosen actions
            under the behaviour policy; guaranteed to be 1 for actions after
            episode termination
          rewards: [batch_size, n] observed rewards; guaranteed to be 0 for
            rewards after episode termination
          actor_probabilities: [batch_size, n, num_actions] probabilities of
            actions under the current (target) policy
          critic_values: [batch_size, n+1] critic estimates of the values of
            encountered states; guaranteed to be 0 for states after episode
            termination
        """
        # TODO: Compute target policy probability of given actions
        # into `actor_action_probabilities`, i.e., symbolically
        #   actor_action_probabilities = actor_probabilities[:, :, actions[:, :]]

        rhos, cs = [], []
        # TODO: Compute clipped rho-s and c-s, as a Python list with
        # args.n elements, each a tensor (values for a whole batch).
        # The values rhos[i] and cs[i] should be the importance sampling
        # ratio for actions[:, i] clipped by `args.clip_rho` and
        # `args.clip_c`, respectively.

        vs = [None] * (args.n + 1)
        # TODO: Compute vs from the last one to the first one.
        # The `vs[args.n]` is just `critic_values[:, args.n]`.
        # The others can be computed recursively as
        #   vs[t] = critic_values[:, t] + delta_t V + gamma * cs[t] * (vs[t+1] - critic_values[:, t+1])

        # TODO: Return a pair with the following elements:
        # - the coefficient for the actor loss, i.e., the product of the importance
        #   sampling factor (rhos[0]) and the estimated q_value
        #   (rewards + gamma * vs[1]) minus the baseline of critic_values
        # - the target for the critic, i.e., vs[0]

    @tf.function
    def train(self, steps, states, actions, action_probabilities, rewards):
        # TODO: Run the actor on the first `args.n` states and the critic on all `args.n+1` states.

        # TODO: Only the first `steps` of `states` are valid (so `steps` might be `args.n+1`
        # if all `states` are non-terminal), so the critic predictions for the
        # states after the `steps` ones must be set to zero.

        # TODO: Run the `vtrace` method, with the last two arguments being the actor
        # and critic predictions, obtaining `actor_weights` and `critic_targets`.
        # TODO: Train the actor, using the first state of every batch instance, with
        # - sparse categorical crossentropy loss, weighted by `actor_weights`,
        # - plus entropy regularization with weight self.args.entropy_regularization.
        #   The entropy of a given categorical distribution `d` is
        #     tf.reduce_sum(-d * tf.math.log(d), axis=-1)

        # TODO: Train the critic using the first state of every batch instance,
        # utilizing MSE loss with `critic_targets` as gold values.
        raise NotImplementedError()

    @tf.function
    def _predict_actions(self, states):
        return self._actor(states)

    def predict_actions(self, states):
        states = np.expand_dims(np.array(states, np.float32), axis=1)
        return self._predict_actions(states).numpy()[:, 0]


if __name__ == "__main__":
    # Parse arguments
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size", default=None, type=int, help="Number of transitions to train on.")
    parser.add_argument("--clip_c", default=1., type=float, help="Clip value for c in V-trace.")
    parser.add_argument("--clip_rho", default=1., type=float, help="Clip value for rho in V-trace.")
    parser.add_argument("--entropy_regularization", default=0.1, type=float, help="Entropy regularization weight.")
    parser.add_argument("--env", default="CartPole-v1", type=str, help="Environment.")
    parser.add_argument("--evaluate_each", default=100, type=int, help="Evaluate each number of episodes.")
    parser.add_argument("--evaluate_for", default=10, type=int, help="Evaluate for number of episodes.")
    parser.add_argument("--gamma", default=None, type=float, help="Discounting factor.")
    parser.add_argument("--hidden_layer", default=None, type=int, help="Size of hidden layer.")
    parser.add_argument("--learning_rate", default=None, type=float, help="Learning rate.")
    parser.add_argument("--n", default=None, type=int, help="Number of steps to use in V-trace.")
    parser.add_argument("--replay_buffer_maxlen", default=None, type=int, help="Replay buffer maxlen.")
    parser.add_argument("--render_each", default=0, type=int, help="Render some episodes.")
    parser.add_argument("--target_return", default=495, type=float, help="Target return.")
    parser.add_argument("--threads", default=1, type=int, help="Maximum number of threads to use.")
    args = parser.parse_args()

    # Fix random seeds and number of threads
    np.random.seed(42)
    tf.random.set_seed(42)
    tf.config.threading.set_inter_op_parallelism_threads(args.threads)
    tf.config.threading.set_intra_op_parallelism_threads(args.threads)

    # Create the environment
    env = gym_evaluator.GymEnvironment(args.env)

    # Construct the network
    network = Network(env, args)

    # Replay memory
    replay_buffer = collections.deque(maxlen=args.replay_buffer_maxlen)
    Transition = collections.namedtuple("Transition", ["state", "action", "action_probability", "reward", "done"])

    def evaluate_episode(evaluating=False):
        rewards = 0
        state, done = env.reset(evaluating), False
        while not done:
            if args.render_each and env.episode > 0 and env.episode % args.render_each == 0:
                env.render()
            probabilities = network.predict_actions([state])[0]
            action = np.argmax(probabilities)
            state, reward, done, _ = env.step(action)
            rewards += reward
        return rewards

    while True:
        # Training
        for _ in range(args.evaluate_each):
            state, done = env.reset(), False
            while not done:
                probabilities = network.predict_actions([state])[0]
                action = np.random.choice(np.arange(len(probabilities)), p=probabilities)
                next_state, reward, done, _ = env.step(action)
                replay_buffer.append(Transition(state, action, probabilities[action], reward, done))
                state = next_state

                if len(replay_buffer) > args.n * args.batch_size:
                    steps = np.zeros((args.batch_size), dtype=np.int32)
                    states = np.zeros([args.batch_size, args.n + 1] + env.state_shape, dtype=np.float32)
                    actions = np.zeros((args.batch_size, args.n), dtype=np.int32)
                    action_probabilities = np.ones((args.batch_size, args.n), dtype=np.float32)
                    rewards = np.zeros((args.batch_size, args.n), dtype=np.float32)
                    # TODO: Prepare a batch.
                    #
                    # Each batch instance is a sequence of `args.n+1` consecutive `states` and
                    # `args.n` consecutive `actions`, `action_probabilities` and `rewards`.
                    # The `steps` indicate how many `states` in range [1,2,...,args.n+1] are valid.
                    #
                    # To generate a batch, sample `args.batch_size` indices from replay_buffer
                    # (ignoring the last `args.n` ones to avoid overflow). Then fill for every
                    # sampled index the consecutive states, actions, action_probabilities and
                    # rewards -- if `done` is not set, all of them are filled and `steps` is
                    # set to `args.n+1`. If `done` is set, only a subset of states, actions,
                    # action_probabilities and rewards are set, and `steps` is set to the
                    # number of valid states (<`args.n+1`).

                    network.train(steps, states, actions, action_probabilities, rewards)

        # Periodic evaluation
        returns = []
        for _ in range(args.evaluate_for):
            returns.append(evaluate_episode())
        print("Evaluation of {} episodes: {}".format(args.evaluate_for, np.mean(returns)))

        if np.mean(returns) >= args.target_return:
            print("Reached mean average return of {}, running final evaluation.".format(np.mean(returns)))
            while True:
                evaluate_episode(True)
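# A hedged sketch (not the reference solution) of the `vtrace` computation
# above, assuming eager TensorFlow tensors and the argument contract from the
# docstring; `vtrace_sketch` is an illustrative name.
import tensorflow as tf

def vtrace_sketch(args, actions, action_probabilities, rewards, actor_probabilities, critic_values):
    # Probability of each chosen action under the current (target) policy.
    actor_action_probabilities = tf.gather(actor_probabilities, actions, batch_dims=2)

    rhos, cs = [], []
    for i in range(args.n):
        ratio = actor_action_probabilities[:, i] / action_probabilities[:, i]
        rhos.append(tf.minimum(ratio, args.clip_rho))
        cs.append(tf.minimum(ratio, args.clip_c))

    # Backward recursion for the v-trace targets.
    vs = [None] * (args.n + 1)
    vs[args.n] = critic_values[:, args.n]
    for t in reversed(range(args.n)):
        delta = rhos[t] * (rewards[:, t] + args.gamma * critic_values[:, t + 1] - critic_values[:, t])
        vs[t] = critic_values[:, t] + delta + args.gamma * cs[t] * (vs[t + 1] - critic_values[:, t + 1])

    # Actor-loss coefficient and critic target, as described in the TODOs.
    actor_weights = rhos[0] * (rewards[:, 0] + args.gamma * vs[1] - critic_values[:, 0])
    return actor_weights, vs[0]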
    def construct(self, args, state_shape, action_components, action_lows, action_highs):
        with self.session.graph.as_default():
            self.states = tf.placeholder(tf.float32, [None] + state_shape)
            self.actions = tf.placeholder(tf.float32, [None, action_components])
            self.returns = tf.placeholder(tf.float32, [None])

            # Actor
            def actor(inputs):
                # TODO: Implement the actor network, starting with `inputs` and returning
                # `action_components` values for each batch example. Usually, one
                # or two hidden layers are employed.
                #
                # Each action_component[i] should be mapped to the range
                # [action_lows[i]..action_highs[i]], for example using tf.nn.sigmoid
                # and suitable rescaling.
                raise NotImplementedError()

            with tf.variable_scope("actor"):
                self.mus = actor(self.states)
            with tf.variable_scope("target_actor"):
                target_actions = actor(self.states)

            # Critic from given actions
            def critic(inputs, actions):
                # TODO: Implement the critic network, starting with `inputs` and `actions`
                # and producing a vector of predicted returns. Usually, `inputs` are fed
                # through a hidden layer first, then concatenated with `actions` and fed
                # through two more hidden layers, before computing the returns.
                raise NotImplementedError()

            with tf.variable_scope("critic"):
                values_of_given = critic(self.states, self.actions)
            with tf.variable_scope("critic", reuse=True):
                values_of_predicted = critic(self.states, self.mus)
            with tf.variable_scope("target_critic"):
                self.target_values = critic(self.states, target_actions)

            # Update ops
            update_target_ops = []
            for target_var, var in zip(tf.global_variables("target_actor") + tf.global_variables("target_critic"),
                                       tf.global_variables("actor") + tf.global_variables("critic")):
                update_target_ops.append(target_var.assign((1. - args.target_tau) * target_var + args.target_tau * var))

            # TODO: Training.
            # Define `actor_loss` and `critic_loss` and then:
            # - train the critic (if required, using critic variables only,
            #   by using the `var_list` argument of `Optimizer.minimize`)
            # - train the actor (if required, using actor variables only,
            #   by using the `var_list` argument of `Optimizer.minimize`)
            # - update the target network variables
            # You can group several operations into one using `tf.group`.
            global_step = tf.train.create_global_step()
            self.training = tf.group(...)
            # Initialize variables
            self.session.run(tf.global_variables_initializer())

    def predict_actions(self, states):
        return self.session.run(self.mus, {self.states: states})

    def predict_values(self, states):
        return self.session.run(self.target_values, {self.states: states})

    def train(self, states, actions, returns):
        self.session.run(self.training, {self.states: states, self.actions: actions, self.returns: returns})


class OrnsteinUhlenbeckNoise:
    """Ornstein-Uhlenbeck process."""

    def __init__(self, shape, mu, theta, sigma):
        self.mu = mu * np.ones(shape)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        self.state = np.copy(self.mu)

    def sample(self):
        self.state += self.theta * (self.mu - self.state) + np.random.normal(scale=self.sigma, size=self.state.shape)
        return self.state


if __name__ == "__main__":
    # Fix random seed
    np.random.seed(42)

    # Parse arguments
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--batch_size", default=None, type=int, help="Batch size.")
    parser.add_argument("--env", default="Pendulum-v0", type=str, help="Environment.")
    parser.add_argument("--evaluate_each", default=100, type=int, help="Evaluate each number of episodes.")
    parser.add_argument("--evaluate_for", default=10, type=int, help="Evaluate for number of episodes.")
    parser.add_argument("--noise_sigma", default=0.2, type=float, help="OU noise sigma.")
    parser.add_argument("--noise_theta", default=0.15, type=float, help="OU noise theta.")
    parser.add_argument("--gamma", default=None, type=float, help="Discounting factor.")
    parser.add_argument("--hidden_layer", default=None, type=int, help="Size of hidden layer.")
    parser.add_argument("--learning_rate", default=None, type=float, help="Learning rate.")
    parser.add_argument("--render_each", default=0, type=int, help="Render some episodes.")
    parser.add_argument("--target_tau", default=None, type=float, help="Target network update weight.")
    parser.add_argument("--threads", default=1, type=int, help="Maximum number of threads to use.")
    args = parser.parse_args()

    # Create the environment
    env = gym_evaluator.GymEnvironment(args.env)
    assert len(env.action_shape) == 1
    action_lows, action_highs = map(np.array, env.action_ranges)

    # Construct the network
    network = Network(threads=args.threads)
    network.construct(args, env.state_shape, env.action_shape[0], action_lows, action_highs)

    # Replay memory; the maxlen parameter can be passed to deque for a size limit,
    # which we however do not need in this simple task.
    replay_buffer = collections.deque()
    Transition = collections.namedtuple("Transition", ["state", "action", "reward", "done", "next_state"])

    def evaluate_episode(evaluating=False):
        rewards = 0
        state, done = env.reset(evaluating), False
        while not done:
            if args.render_each and env.episode > 0 and env.episode % args.render_each == 0:
                env.render()
            action = network.predict_actions([state])[0]
            state, reward, done, _ = env.step(action)
            rewards += reward
        return rewards

    noise = OrnsteinUhlenbeckNoise(env.action_shape[0], 0., args.noise_theta, args.noise_sigma)
    while True:
        # Training
        for _ in range(args.evaluate_each):
            state, done = env.reset(), False
            noise.reset()
            while not done:
                # TODO: Perform an action and store the transition in the replay buffer

                # If the replay_buffer is large enough, perform training
                if len(replay_buffer) >= args.batch_size:
                    batch = np.random.choice(len(replay_buffer), size=args.batch_size, replace=False)
                    states, actions, rewards, dones, next_states = zip(*[replay_buffer[i] for i in batch])
                    # TODO: Perform the training

        # Evaluation
        returns = []
        for _ in range(args.evaluate_for):
            returns.append(evaluate_episode())
        print("Evaluation of {} episodes: {}".format(args.evaluate_for, np.mean(returns)))
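# A hedged sketch (one possible completion, not the reference solution) of the
# two TODOs in the training loop above. It reuses `env`, `network`, `noise`,
# `args` and `Transition` from this file; `action_lows`/`action_highs` come
# from `env.action_ranges` as above, and `collect_and_train` is an
# illustrative helper name.
def collect_and_train(state, replay_buffer):
    # Act with Ornstein-Uhlenbeck exploration noise, clipped to the valid range.
    action = np.clip(network.predict_actions([state])[0] + noise.sample(),
                     action_lows, action_highs)
    next_state, reward, done, _ = env.step(action)
    replay_buffer.append(Transition(state, action, reward, done, next_state))

    # If the replay buffer is large enough, perform one DDPG update.
    if len(replay_buffer) >= args.batch_size:
        batch = np.random.choice(len(replay_buffer), size=args.batch_size, replace=False)
        states, actions, rewards, dones, next_states = zip(*[replay_buffer[i] for i in batch])
        # One-step TD targets bootstrapped from the target actor and critic.
        returns = np.array(rewards) + args.gamma * (1 - np.array(dones)) * network.predict_values(np.array(next_states))
        network.train(np.array(states), np.array(actions), returns)
    return next_state, done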