Code Example #1
  def __init__(self, env):
    self.env = env
    state_shape = self.env.observation_space.shape
    action_dim = self.env.action_space.shape[1]

    # for now, with single machine synchronous training, use a replay memory for training.
    # this replay memory stores states in a Variable (ie potentially in gpu memory)
    # TODO: switch back to async training with multiple replicas (as in drivebot project)
    self.replay_memory = replay_memory.ReplayMemory(opts.replay_memory_size,
                                                    state_shape, action_dim)

    # s1 and s2 placeholders
    batched_state_shape = [None] + list(state_shape)
    s1 = tf.placeholder(shape=batched_state_shape, dtype=tf.float32)
    s2 = tf.placeholder(shape=batched_state_shape, dtype=tf.float32)

    # initialise base models for actor / critic and their corresponding target networks
    # target_actor is never used for online sampling so doesn't need explore noise.
    self.actor = ActorNetwork("actor", s1, action_dim)
    self.critic = CriticNetwork("critic", self.actor)
    self.target_actor = ActorNetwork("target_actor", s2, action_dim)
    self.target_critic = CriticNetwork("target_critic", self.target_actor)

    # setup training ops;
    # training actor requires the critic (for getting gradients)
    # training critic requires target_critic (for RHS of bellman update)
    self.actor.init_ops_for_training(self.critic)
    self.critic.init_ops_for_training(self.target_critic)
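
A minimal NumPy sketch of the Bellman target that the comments above refer to ("training critic requires target_critic (for RHS of bellman update)"). The target actor/critic here are hypothetical stand-in callables, not the ActorNetwork/CriticNetwork API from the snippet, and terminal-state masking is ignored for brevity:

import numpy as np

def bellman_targets(rewards, next_states, target_actor, target_critic, gamma=0.99):
    """y_i = r_i + gamma * Q_target(s2_i, mu_target(s2_i))."""
    next_actions = target_actor(next_states)
    return rewards + gamma * target_critic(next_states, next_actions)

# dummy linear stand-ins, just to show the shapes involved
rng = np.random.default_rng(0)
s2 = rng.normal(size=(4, 3))                                    # batch of 4 next states, state dim 3
r = rng.normal(size=(4, 1))                                     # batch of rewards
mu = lambda s: np.tanh(s @ rng.normal(size=(3, 2)))             # fake target actor (action dim 2)
q = lambda s, a: np.concatenate([s, a], axis=1) @ rng.normal(size=(5, 1))  # fake target critic
print(bellman_targets(r, s2, mu, q).shape)                      # -> (4, 1)
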
Code Example #2
  def __init__(self, env, agent_opts):
    self.env = env
    state_dim = self.env.observation_space.shape[0]
    action_dim = self.env.action_space.shape[1]

    # for now, with single machine synchronous training, use a replay memory for training.
    # TODO: switch back to async training with multiple replicas (as in drivebot project)
    self.replay_memory = replay_memory.ReplayMemory(agent_opts.replay_memory_size, 
                                                    state_dim, action_dim)

    # initialise base models for actor / critic and their corresponding target networks
    # target_actor is never used for online sampling so doesn't need explore noise.
    self.actor = ActorNetwork("actor", state_dim, action_dim,
                              agent_opts.actor_hidden_layers,
                              agent_opts.action_noise_theta,
                              agent_opts.action_noise_sigma,
                              agent_opts.actor_activation_init_magnitude)
          
    self.critic = CriticNetwork("critic", self.actor, 
                                agent_opts.critic_hidden_layers)
    self.target_actor = ActorNetwork("target_actor", state_dim, action_dim,
                                     agent_opts.actor_hidden_layers)
    self.target_critic = CriticNetwork("target_critic", self.target_actor,
                                       agent_opts.critic_hidden_layers)

    # setup training ops;
    # training actor requires the critic (for getting gradients)
    # training critic requires target_critic (for RHS of bellman update)
    self.actor.init_ops_for_training(self.critic,
                                     agent_opts.actor_learning_rate,
                                     agent_opts.actor_gradient_clip)
    self.critic.init_ops_for_training(self.target_critic,
                                      agent_opts.critic_learning_rate,
                                      agent_opts.critic_gradient_clip)
Code Example #3
    def get_batches(self):
        """Yields randomized batches of experiences from epsilon-greedy games.

        Maintains a replay memory at full capacity.
        """

        print("Initializing memory...")
        memory = replay_memory.ReplayMemory()
        while not memory.is_full():
            for experience in self.experience_collector.collect(
                    play.random_strategy):
                memory.add(experience)

        memory.print_stats()

        for i in itertools.count():
            if i < START_DECREASE_EPSILON_GAMES:
                epsilon = 1.0
            else:
                epsilon = max(
                    MIN_EPSILON, 1.0 - (i - START_DECREASE_EPSILON_GAMES) /
                    DECREASE_EPSILON_GAMES)

            strategy = play.make_epsilon_greedy_strategy(
                self.get_q_values, epsilon)

            for experience in self.experience_collector.collect(strategy):
                memory.add(experience)
                batch_experiences = memory.sample(BATCH_SIZE)
                yield self.experiences_to_batches(batch_experiences)
Code Example #4
    def __init__(self, name):
        self.game = DoomGame()
        self.game.load_config(CONFIG_FILE_PATH)
        self.game.set_window_visible(False)
        self.game.set_mode(Mode.PLAYER)
        #         self.game.set_screen_format(ScreenFormat.GRAY8)
        self.game.set_screen_format(ScreenFormat.CRCGCB)
        self.game.set_screen_resolution(ScreenResolution.RES_640X480)
        self.game.init()

        health = self.game.get_game_variable(GameVariable.HEALTH)
        ammo = self.game.get_game_variable(GameVariable.SELECTED_WEAPON_AMMO)
        frag = self.game.get_game_variable(GameVariable.FRAGCOUNT)
        pos_x = self.game.get_game_variable(GameVariable.POSITION_X)
        pos_y = self.game.get_game_variable(GameVariable.POSITION_Y)
        self.reward_gen = RewardGenerater(health, ammo, frag, pos_x, pos_y)

        self.replay_buff = replay_memory.ReplayMemory(
            CAPACITY, data_name="demodata_cig2017.npy")
        self.network = Network()
        self.agent = Agent(self.network, self.replay_buff, self.reward_gen)

        self.local_step = 0

        self.finished = False

        self.name = name
Code Example #5
    def initialize(self):

        if self.input_height != 0:
            self.policy_net = deep_q_network.DQN_Conv(
                self.input_height, self.input_width,
                self.n_actions).double().to(self.device)
            self.target_net = deep_q_network.DQN_Conv(
                self.input_height, self.input_width,
                self.n_actions).double().to(self.device)
        else:
            # if not a convolutional network
            print("Linear Network")
            self.policy_net = deep_q_network.DQN_Linear(
                self.input_width, self.n_actions).to(self.device)
            self.target_net = deep_q_network.DQN_Linear(
                self.input_width, self.n_actions,
                requires_grad=False).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        if self.model_path:
            self.loadModel(self.model_path)
        self.target_net.eval()

        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=1e-5)
        #self.optimizer = optim.RMSprop(self.policy_net.parameters())
        self.memory = replay_memory.ReplayMemory(self.capacity)
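
Example #5 copies the policy weights into the frozen target network once, at initialization. During training this hard copy is usually repeated every fixed number of optimisation steps; a small sketch of that periodic sync, using a TARGET_UPDATE-style interval like the one seen in example #12 below (the helper name is illustrative, not part of the snippet's API):

import torch.nn as nn

def maybe_sync_target(step: int, policy_net: nn.Module, target_net: nn.Module,
                      target_update: int = 10) -> None:
    # Hard update: copy the online (policy) weights into the frozen target net
    # every `target_update` steps.
    if step % target_update == 0:
        target_net.load_state_dict(policy_net.state_dict())
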
Code Example #6
    def __init__(self, candidates, dropout_prob=1.0, batch_size=64, replay_memory_size=10000):
        self._candidates = [candidates[i] for i in range(len(candidates))]
        self._candidate_number = len(candidates) 
        self._dimension = len(candidates[0])
        self._droput_prob = dropout_prob
        self._batch_size = batch_size   
        self._replay_memory_size = replay_memory_size
        self._replay_memory = replay_memory.ReplayMemory(replay_memory_size, [1, self._dimension, self._dimension, 1])
        self._count = 0

        # define a MLP
        self.graph = tf.Graph()
        with self.graph.as_default():
          self.nid = tf.placeholder(tf.int32, [None, 1], name='nid')
          self.doc = tf.placeholder(tf.float32, [None, 5], name='doc')
          self.ctx = tf.placeholder(tf.float32, [None, 5], name='ctx')
          self.label = tf.placeholder(tf.float32, [None, 1], name='label') 

          self.dpp = tf.placeholder(tf.float32, shape=(), name='droput_prob')
          self.batch = tf.placeholder(tf.int64, shape=(), name='batch')

          self.ds = tf.contrib.data.Dataset.from_tensor_slices((self.nid, self.doc, self.ctx, self.label))
          self.ds = self.ds.repeat(1).batch(self.batch)
          self.itr = self.ds.make_initializable_iterator()
          self.nxt = self.itr.get_next()

          with tf.variable_scope('nid_embedding'):
            self.nid_embs = tf.get_variable('embedding', initializer=tf.random_uniform([200, 16], -1.0, 1.0))
            self.nid_emb = tf.nn.relu(tf.reduce_sum(tf.nn.embedding_lookup(self.nid_embs, self.nxt[0]), 1))
            print >> sys.stderr, self.nid_emb.get_shape().as_list()

          with tf.variable_scope('dot'):
            self.dot = tf.reduce_sum(
                tf.multiply(tf.nn.dropout(self.nxt[1], self.dpp), tf.nn.dropout(self.nxt[2], self.dpp)), 
                1, keep_dims=True)
            print >> sys.stderr, self.dot.get_shape().as_list()

          with tf.variable_scope('FC'):
            self.weight = tf.get_variable('weight', [17, 1], tf.float32, tf.random_normal_initializer(stddev=0.05))
            self.bias = tf.get_variable('bias', [1], tf.float32, tf.constant_initializer(0.0))
            self.feats = tf.concat([self.nid_emb, self.dot], 1)
            print >> sys.stderr, self.feats.get_shape().as_list()
            self.fc = tf.matmul(self.feats, self.weight) + self.bias
            print >> sys.stderr, self.fc.get_shape().as_list()
            self.pred_score = tf.nn.sigmoid(self.fc)
            self.max_idx = tf.argmax(self.pred_score)
            print >> sys.stderr, self.pred_score.get_shape().as_list()

          with tf.variable_scope('optimizer'):
            self.t_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=self.nxt[3], logits=self.fc)
            print >> sys.stderr, self.t_loss.get_shape().as_list()
            self.loss = tf.reduce_mean(self.t_loss)
            print >> sys.stderr, self.loss.get_shape().as_list()
            self.optim = tf.train.GradientDescentOptimizer(0.01).minimize(self.loss)

          self.init_var = tf.global_variables_initializer()

        self.sess = tf.Session(graph=self.graph)
        self.sess.run(self.init_var)
Code Example #7
    def test_overfit_simple_artificial_dataset(self):
        input_shape = 1
        batch_size = 10
        num_actions = 2
        num_hidden = 2
        discount = 1
        learning_rate = 1
        update_rule = 'adam'
        freeze_interval = 100
        regularization = 0
        rng = None
        num_hidden_layers = 1
        network = qnetwork.QNetwork(input_shape, batch_size, num_hidden_layers,
                                    num_actions, num_hidden, discount,
                                    learning_rate, regularization, update_rule,
                                    freeze_interval, rng)

        rm = replay_memory.ReplayMemory(batch_size)
        # state 0 to state 1 reward +1
        for idx in range(20):
            state = np.array([0])
            next_state = np.array([1])
            action = 1
            reward = 1
            terminal = 1
            rm.store((state, action, reward, next_state, terminal))

        # state 0 to state 0 reward -1
        for idx in range(20):
            switch = random.randint(0, 1)
            state = np.array([0])
            next_state = np.array([0])
            action = 0
            reward = -1
            terminal = 0
            rm.store((state, action, reward, next_state, terminal))

        print rm.terminal_count
        print_data = False
        l = logger.Logger('test')
        counter = 0
        while True:
            counter += 1
            states, actions, rewards, next_states, terminals = rm.sample_batch(
            )
            loss = network.train(states, actions, rewards, next_states,
                                 terminals)
            l.log_loss(loss)

            if counter % 100 == 0:
                l.log_epoch(counter)
                Q = {}
                s0 = network.get_q_values(np.array([0]))
                Q['s0_a0'] = s0[0]
                Q['s0_a1'] = s0[1]
                s1 = network.get_q_values(np.array([1]))
                Q['s1_a0'] = s1[0]
                Q['s1_a1'] = s1[1]
Code Example #8
File: emsemble.py Project: algum2/mab
 def __init__(self, candidates, emsemble_num=10,dropout_prob=1, batch_size=64, replay_memory_size=10000):
     self._candidates = [candidates[i] for i in range(len(candidates))]
     self._emsemble_num = emsemble_num
     self._candidate_number = len(candidates)
     self._dimension = len(candidates[0])
     self._droput_prob = dropout_prob
     self._batch_size = batch_size
     self._replay_memory_size = replay_memory_size
     self._replay_memory = replay_memory.ReplayMemory(replay_memory_size, [1, self._dimension, self._dimension, 1])
     self._count = 0
     self._model = [rmax.Rmax(self._candidates) for i in range(self._emsemble_num)]
Code Example #9
File: simulate.py Project: wulfebw/acas_xu
def simulate():

    draw = False
    print 'building network...'
    if draw:
        pltAcas = plot_acas_xu.Plot_ACAS_XU(state_generator.RMAX, ICON_FILE, 1)
    sg = state_generator.StateGenerator(state_generator.RMAX,
                                        state_generator.RMIN,
                                        state_generator.VMIN,
                                        state_generator.VMAX, K_SIZE)
    q = qnetwork.QNetwork(state_generator.NUM_INPUTS, replay_memory.BATCH_SIZE,
                          state_generator.NUM_ACTIONS, GAMMA, SOLVER)
    repMem = replay_memory.ReplayMemory()
    count = 0

    dt = state_generator.DT
    dti = state_generator.DTI
    state = sg.randomStateGenerator()
    i = 0
    print 'starting training...'
    while True:

        for j in range(TRAIN_FREQ):
            i += 1
            action = q.getAction(state)
            nextStates, rewards = sg.getNextState(state, action, dt, dti)
            stateNorm, nextStateNorm = sg.normState(state, nextStates)
            repMem.store((stateNorm, action, rewards, nextStateNorm))
            state = nextStates[0]
            count += 1
            if draw:
                pltAcas.updateState(state, action)
                pltAcas.draw()
                time.sleep(0.3)

            if sg.checkRange(state) or i > 100:
                i = 0
                state = sg.randomStateGenerator()

        if count % PRINT_FREQ == 0 and count >= replay_memory.INIT_SIZE:
            print "Samples: %d, Trainings: %d" % (
                count, (count - replay_memory.INIT_SIZE) /
                TRAIN_FREQ), "Loss: %.3e" % q.test(repMem.sample_batch())
            sys.stdout.flush()

        elif (count % 10000 == 0):
            print "Samples: %d" % count
            sys.stdout.flush()

        q.train(repMem.sample_batch())
Code Example #10
    def __init__(self, opts):
        self.opts = opts

        config = tf.ConfigProto()
        #config.gpu_options.allow_growth = True
        #config.log_device_placement = True
        config.gpu_options.per_process_gpu_memory_fraction = 0.5  #opts.gpu_mem_fraction
        self.sess = tf.Session(config=config)

        render_shape = (opts.height, opts.width, 3)
        self.replay_memory = replay_memory.ReplayMemory(
            opts=opts, state_shape=render_shape, action_dim=2, load_factor=1.2)
        if opts.event_log_in:
            self.replay_memory.reset_from_event_log(opts.event_log_in,
                                                    opts.event_log_in_num)

        # s1 and s2 placeholders
        batched_state_shape = [None] + list(render_shape)
        s1 = tf.placeholder(shape=batched_state_shape, dtype=tf.float32)
        s2 = tf.placeholder(shape=batched_state_shape, dtype=tf.float32)

        # initialise base models for value & naf networks. the value subportion of the net is
        # explicitly created separately because it has a target network. note: in the case of
        # --share-input-state-representation the input state network of the value_net will
        # be reused by the naf.l_value and naf.output_actions nets
        self.value_net = models.ValueNetwork("value", s1, opts)
        self.target_value_net = models.ValueNetwork("target_value", s2, opts)
        self.network = models.NafNetwork("naf",
                                         s1,
                                         s2,
                                         self.value_net,
                                         self.target_value_net,
                                         action_dim=2,
                                         opts=opts)

        with self.sess.as_default():
            # setup saver util and either load latest ckpt or init variables
            self.saver_util = None
            if opts.ckpt_dir is not None:
                self.saver_util = util.SaverUtil(self.sess, opts.ckpt_dir,
                                                 opts.ckpt_freq)
            else:
                self.sess.run(tf.initialize_all_variables())
            for v in tf.all_variables():
                print >> sys.stderr, v.name, util.shape_and_product_of(v)

            # setup target network
            self.target_value_net.set_as_target_network_for(
                self.value_net, 0.01)
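
set_as_target_network_for(self.value_net, 0.01) above presumably installs a soft ("Polyak") update of the target value network with tau = 0.01. A plain-NumPy sketch of that update rule for illustration; the real op is built as TensorFlow assign ops inside the model code:

import numpy as np

def soft_update(target_params, source_params, tau=0.01):
    """target <- (1 - tau) * target + tau * source, applied parameter-wise."""
    return [(1.0 - tau) * t + tau * s for t, s in zip(target_params, source_params)]

# toy usage
target = [np.zeros(3)]
source = [np.ones(3)]
target = soft_update(target, source)   # -> [array([0.01, 0.01, 0.01])]
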
Code Example #11
    def test_minibatch_sample_shapes_1D_state(self):
        batch_size = 100
        state_shape = 2
        rm = replay_memory.ReplayMemory(batch_size)
        for idx in range(1000):
            state = np.ones(state_shape)
            action = 0
            reward = 0
            next_state = np.ones(state_shape)
            terminal = 0
            rm.store((state, action, reward, next_state, terminal))

        states, actions, rewards, next_states, terminals = rm.sample_batch()
        self.assertEquals(states.shape, (batch_size, state_shape))
        self.assertEquals(actions.shape, (batch_size, 1))
        self.assertEquals(rewards.shape, (batch_size, 1))
        self.assertEquals(next_states.shape, (batch_size, state_shape))
        self.assertEquals(terminals.shape, (batch_size, 1))
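
This test and example #14 below pin down the contract: store() takes (state, action, reward, next_state, terminal) tuples and sample_batch() returns batch-major state arrays plus (batch_size, 1) columns for action/reward/terminal. A minimal uniform-sampling buffer that satisfies those shapes, written as an illustrative sketch rather than the project's actual implementation (here the single constructor argument serves as both capacity and batch size, as the tests use it):

import random
import numpy as np

class MinimalReplayMemory(object):
    def __init__(self, batch_size):
        self.batch_size = batch_size
        self.buffer = []

    def store(self, experience):
        # experience = (state, action, reward, next_state, terminal)
        self.buffer.append(experience)
        if len(self.buffer) > self.batch_size:
            self.buffer.pop(0)          # drop the oldest entry once full

    def sample_batch(self):
        batch = random.sample(self.buffer, self.batch_size)
        states, actions, rewards, next_states, terminals = zip(*batch)
        return (np.array(states),
                np.array(actions).reshape(-1, 1),
                np.array(rewards).reshape(-1, 1),
                np.array(next_states),
                np.array(terminals).reshape(-1, 1))
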
Code Example #12
    def __init__(self,
                 input_size=10,
                 TICKER='MSFT',
                 BATCH_SIZE=128,
                 GAMMA=0.999,
                 EPS_START=0.9,
                 EPS_END=0.05,
                 EPS_DECAY=200,
                 TARGET_UPDATE=10,
                 REPLAY_MEMORY_CAPACITY=10000,
                 NUM_EPISODES=1,
                 hidden_layer=120,
                 actions=3):

        self.TICKER = TICKER
        self.BATCH_SIZE = BATCH_SIZE
        self.GAMMA = GAMMA
        self.EPS_START = EPS_START
        self.EPS_END = EPS_END
        self.EPS_DECAY = EPS_DECAY
        self.TARGET_UPDATE = TARGET_UPDATE
        self.NUM_EPISODES = NUM_EPISODES
        self.fd = financial_data.financial_data(input_size)
        self.date = self.fd.norm_data_ls[self.fd.ticker_ls.index(TICKER)].date
        self.policy_net = dqn.DQN(input_size, hidden_layer, actions)
        self.target_net = dqn.DQN(input_size, hidden_layer, actions)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.optimizer = optim.RMSprop(self.policy_net.parameters())
        self.memory = replay_memory.ReplayMemory(REPLAY_MEMORY_CAPACITY)
        self.steps_done = 0
        self.episode_durations = []
        self.actions = actions
        self.input_size = input_size
        self.action_index = ['Buy', 'Sell', 'Hold']
        self.reward_list = []
        self.episode_list = []
        self.episode_len = 1200
        self.money = self.fd.norm_data_ls[self.fd.ticker_ls.index(
            TICKER)].Close.values[0] * 20
        self.money_list = []
        self.loss_list = []
        self.action_list = []
Code Example #13
def gather_data(mdp, numTrials=10000, maxIterations=1000):
    mdp.computeStates()
    actions = mdp.actions(None)
    replay = replay_memory.ReplayMemory()

    for trial in range(numTrials):
        state = mdp.start_state
        if replay.isFull():
            break
        for _ in range(maxIterations):
            action = random.choice(actions)
            transitions = mdp.succAndProbReward(state, action)
            if len(transitions) == 0:
                break
            i = sample([prob for newState, prob, reward in transitions])
            newState, prob, reward = transitions[i]
            replay.store((state, action, reward, newState))
            state = newState
    return replay
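
gather_data above relies on an external sample() helper to pick a transition index from the probability weights. A plausible minimal version, assuming it draws index i with probability proportional to probs[i] (the project's actual helper may differ):

import random

def sample(probs):
    """Return an index i drawn with probability proportional to probs[i]."""
    target = random.random() * sum(probs)
    accum = 0.0
    for i, prob in enumerate(probs):
        accum += prob
        if accum >= target:
            return i
    return len(probs) - 1
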
Code Example #14
    def test_minibatch_sample_shapes_multidimensional_state(self):
        batch_size = 100
        state_shape = (1, 2, 2)
        rm = replay_memory.ReplayMemory(batch_size)
        for idx in range(1000):
            state = np.ones(state_shape)
            action = 0
            reward = 0
            next_state = np.ones(state_shape)
            terminal = 0
            rm.store((state, action, reward, next_state, terminal))

        states, actions, rewards, next_states, terminals = rm.sample_batch()
        expected_states_shape = (batch_size, ) + state_shape

        self.assertEquals(states.shape, expected_states_shape)
        self.assertEquals(actions.shape, (batch_size, 1))
        self.assertEquals(rewards.shape, (batch_size, 1))
        self.assertEquals(next_states.shape, expected_states_shape)
        self.assertEquals(terminals.shape, (batch_size, 1))
Code Example #15
 def test_agent(self):
     room_size = 5
     mdp = mdps.MazeMDP(room_size, 1)
     mdp.compute_states()
     mdp.EXIT_REWARD = 1
     mdp.MOVE_REWARD = -0.1
     discount = mdp.get_discount()
     num_actions = len(mdp.get_actions(None))
     network = qnetwork.QNetwork(input_shape=2 * room_size,
                                 batch_size=1,
                                 num_actions=4,
                                 num_hidden=10,
                                 discount=discount,
                                 learning_rate=1e-3,
                                 update_rule='sgd',
                                 freeze_interval=10000,
                                 rng=None)
     p = policy.EpsilonGreedy(num_actions, 0.5, 0.05, 10000)
     rm = replay_memory.ReplayMemory(1)
     log = logger.NeuralLogger(agent_name='QNetwork')
     adapter = state_adapters.CoordinatesToSingleRoomRowColAdapter(
         room_size=room_size)
     a = agent.NeuralAgent(network=network,
                           policy=p,
                           replay_memory=rm,
                           logger=log,
                           state_adapter=adapter)
     num_epochs = 2
     epoch_length = 10
     test_epoch_length = 0
     max_steps = 10
     run_tests = False
     e = experiment.Experiment(mdp,
                               a,
                               num_epochs,
                               epoch_length,
                               test_epoch_length,
                               max_steps,
                               run_tests,
                               value_logging=False)
     e.run()
Code Example #16
  def __init__(self, env):
    self.env = env
    state_shape = self.env.state_shape
    action_dim = self.env.action_space.shape[1]
    self.obj_list = [i for i in range(10)]

    # for now, with single machine synchronous training, use a replay memory for training.
    # TODO: switch back to async training with multiple replicas (as in drivebot project)
    self.replay_memory = replay_memory.ReplayMemory(opts.replay_memory_size,
                                                    state_shape, action_dim, opts)

    # s1 and s2 placeholders
    batched_state_shape = [None] + list(state_shape)
    s1 = tf.placeholder(shape=batched_state_shape, dtype=tf.float32)
    s2 = tf.placeholder(shape=batched_state_shape, dtype=tf.float32)

    if opts.use_full_internal_state:
        temp = [18]
    else:
        temp = [9]

    batched_internal_state_shape = [None] + temp
    internal_state = tf.placeholder(shape=batched_internal_state_shape, dtype=tf.float32)

    temp = [10] # object one hot
    batched_taget_obj_shape = [None] + temp
    target_obj_hot = tf.placeholder(shape=batched_taget_obj_shape, dtype=tf.float32)

    # initialise base models for value & naf networks. the value subportion of the net is
    # explicitly created separately because it has a target network. note: in the case of
    # --share-input-state-representation the input state network of the value_net will
    # be reused by the naf.l_value and naf.output_actions nets
    self.value_net = ValueNetwork("value", s1, internal_state, target_obj_hot, opts.hidden_layers)
    self.target_value_net = ValueNetwork("target_value", s2, internal_state, target_obj_hot, opts.hidden_layers)
    self.naf = NafNetwork("naf", s1, s2,
                          self.value_net, self.target_value_net,
                          internal_state, target_obj_hot,
                          action_dim)
Code Example #17
        self.target_net = None
        self.optimizer = None
        self.steps_done = 0 #ToDo: Save and Load this value

    def initialize(self):
        if self.input_height != 0:
            self.policy_net = deep_q_network.DQN_Conv(self.input_height, self.input_width, self.n_actions).to(self.device)
            self.target_net = deep_q_network.DQN_Conv(self.input_height, self.input_width, self.n_actions).to(self.device)
        else:
            # if not a convolutional network
            self.policy_net = deep_q_network.DQN_Linear(self.input_width, self.n_actions).to(self.device)
            self.target_net = deep_q_network.DQN_Linear(self.input_width, self.n_actions, requires_grad=False).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.optimizer = optim.RMSprop(self.policy_net.parameters())
        self.memory = replay_memory.ReplayMemory(self.capacity)

    def selectAction(self, state):

        # greedy eps algorithm
        sample = random.random()
        eps_threshold = self.eps_end + (self.eps_start - self.eps_end) * \
                        math.exp(-1. * self.steps_done / self.eps_decay)
        self.steps_done += 1
        if sample > eps_threshold:
            with torch.no_grad():
                # t.max(1) will return largest column value of each row.
                # second column on max result is index of where max element was
                # found, so we pick action with the larger expected reward.
                return self.policy_net(state).max(1)[1].view(1, 1)
        else:
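
selectAction() above anneals epsilon exponentially from eps_start toward eps_end with time constant eps_decay. Written as a standalone function, plugging in the default values from example #12 (0.9, 0.05, 200) purely for illustration:

import math

def epsilon_threshold(steps_done, eps_start=0.9, eps_end=0.05, eps_decay=200):
    """eps = eps_end + (eps_start - eps_end) * exp(-steps_done / eps_decay)."""
    return eps_end + (eps_start - eps_end) * math.exp(-1.0 * steps_done / eps_decay)

# epsilon_threshold(0) == 0.9; the value decays toward 0.05 as steps_done grows
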
Code Example #18
    def train(self, sess, train_writer):
        """ Performs Q-learning on the Block World Task. The agent interacts with the
         simulator and performs roll-out followed by MSE updates. """

        start = time.time()

        max_epoch = AbstractLearning.max_epochs
        dataset_size = AbstractLearning.dataset_size
        tuning_size = AbstractLearning.validation_datasize
        train_size = dataset_size - tuning_size
        logger.Log.info("Deep Q-Learning: Max Epoch: " + str(max_epoch) +
                        " Train/Tuning: " + str(train_size) + "/" +
                        str(tuning_size))

        # Saver for logging the model
        saver = tf.train.Saver(max_to_keep=AbstractLearning.models_to_keep)

        # Iteration is the number of parameter update steps performed in the training
        iteration = 0

        # Validation metric
        avg_bisk_metric = self.agent.test(tuning_size)
        min_avg_bisk_metric = avg_bisk_metric
        patience = 0
        max_patience = AbstractLearning.max_patience
        logger.Log.info("Tuning Data: (Before Training) Avg. Bisk Metric: " +
                        str(avg_bisk_metric))

        for epoch in range(1, max_epoch + 1):
            logger.Log.info("=================\n Starting Epoch: " +
                            str(epoch) + "\n=================")
            for data_point in range(1, train_size + 1):

                # Create a queue to handle history of states
                state = collections.deque([], 5)
                # Add the dummy images
                dummy_images = self.model.image_embedder.get_dummy_images()
                [state.append(v) for v in dummy_images]

                # Receive the instruction and the environment
                (_, bisk_metric, current_env, instruction,
                 trajectory) = self.agent.receive_instruction_and_image()
                logger.Log.info("Train Bisk Metric " + str(bisk_metric))
                state.append(current_env)

                ########################
                text_indices = self.q_network.text_embedder.convert_text_to_indices(
                    instruction)
                _, text_embedder_bucket = self.q_network.get_bucket_network(
                    len(text_indices))
                (text_input_word_indices_bucket, text_mask_bucket
                 ) = text_embedder_bucket.pad_and_return_mask(text_indices)
                (text_input_word_indices,
                 text_mask) = self.q_network.text_embedder.pad_and_return_mask(
                     text_indices)
                ########################

                logger.Log.info("=================\n " + str(data_point) +
                                ": Instruction: " + str(instruction) +
                                "\n=================")

                total_reward_episode = 0
                steps = 0
                previous_action = self.q_network.null_previous_action

                # Perform a roll out
                while True:
                    # Compute the qVal of the current state
                    q_val = self.q_network.evaluate_qfunction(
                        state, text_input_word_indices_bucket,
                        text_mask_bucket, previous_action, sess)

                    # take an action using a behaviour policy
                    action_id = self.behaviour_policy.get_action(q_val)
                    action_str = self.agent.message_protocol_kit.encode_action(
                        action_id)
                    logger.Log.debug("Sending Message: " + action_str)
                    self.agent.connection.send_message(action_str)

                    # receive reward and a new environment as response on the completion of action
                    (_, reward, new_env,
                     is_reset) = self.agent.receive_response_and_image()
                    logger.Log.debug("Received reward: " + str(reward))

                    # compute target y = r + gamma * max_a' Q(s', a')
                    copy_state = collections.deque(state, 5)
                    copy_state.append(new_env)
                    q_val_new = self.target_q_network.evaluate_qfunction(
                        copy_state, text_input_word_indices_bucket,
                        text_mask_bucket, previous_action, sess)
                    if self.agent.message_protocol_kit.is_reset_message(
                            is_reset):
                        # Terminal condition
                        y = reward
                    else:
                        y = reward + self.agent.gamma * q_val_new.max()
                    logger.Log.debug("Reward " + str(reward) + " Target " +
                                     str(y) + " max is " +
                                     str(q_val_new.max()) + " current " +
                                     str(q_val[action_id]) + " diff " +
                                     str(y - q_val[action_id]))

                    # add to replay memory
                    replay_memory_item = rm.ReplayMemory(
                        text_input_word_indices,
                        text_mask,
                        state,
                        action_id,
                        reward,
                        new_env,
                        y,
                        previous_action_id=previous_action)
                    self.replay_memory.appendleft(replay_memory_item)
                    state.append(new_env)

                    # Update metric
                    total_reward_episode += reward
                    steps += 1
                    block_id = int(action_id / 4)
                    if action_id == 80:
                        direction_id = 4
                    else:
                        direction_id = action_id % 4
                    previous_action = (direction_id, block_id)

                    # Reset episode
                    if self.agent.message_protocol_kit.is_reset_message(
                            is_reset):
                        logger.Log.debug("Resetting the episode")
                        self.agent.connection.send_message("Ok-Reset")
                        logger.Log.debug("Now waiting for response")

                        # Perform minibatch SGD
                        # Pick a sample using prioritized sweeping and perform backpropagation
                        sample = self.ps.sample(self.replay_memory,
                                                self.batch_size)
                        loss = self.min_loss(sample,
                                             sess,
                                             train_writer,
                                             factorized_actions=False)
                        iteration += 1
                        logger.Log.info("Number of sample " +
                                        str(len(sample)) +
                                        " size of replay memory " +
                                        str(len(self.replay_memory)) +
                                        " loss = " + str(loss))

                        # Decay the epsilon
                        self.behaviour_policy.decay_epsilon()
                        logger.Log.info("Total reward in this episode: " +
                                        str(total_reward_episode))

                        # Print time statistics
                        total_time = time.time() - start
                        logger.Log.info("Total time: " + str(total_time))
                        logger.Log.flush()
                        break

            # Synchronize the target network and main network
            self.copy_variables_to_target_network(sess)

            # Compute validation accuracy
            avg_bisk_metric = self.agent.test(tuning_size)
            logger.Log.info("Tuning Data: (end of epoch " + str(epoch) +
                            ") Avg. Bisk Metric: " + str(avg_bisk_metric) +
                            "Min was " + str(min_avg_bisk_metric))
            # Save the model
            save_path = saver.save(
                sess, "./saved/model_epoch_" + str(epoch) + ".ckpt")
            logger.Log.info("Model saved in file: " + str(save_path))

            if avg_bisk_metric >= min_avg_bisk_metric:
                if patience == max_patience:
                    logger.Log.info(
                        "Max patience reached. Terminating learning after " +
                        str(epoch) + " epochs and " + str(iteration) +
                        " iterations.")
                    break
                else:
                    logger.Log.info(
                        "Tuning accuracy did not improve. Increasing patience to "
                        + str(patience + 1))
                    patience += 1
            else:
                logger.Log.info("Resetting patience to 0")
                patience = 0
            min_avg_bisk_metric = min(min_avg_bisk_metric, avg_bisk_metric)

        logger.Log.close()
Code Example #19
    healths[trans][step] = health
    ammos[trans][step] = ammo
    frags[trans][step] = frag
    deaths[trans][step] = death
    posxs[trans][step] = posx
    posys[trans][step] = posy


if __name__ == "__main__":

    game = initialize_vizdoom("./config/custom_config.cfg")

    n_actions = game.get_available_buttons_size()
    # actions = [list(a) for a in it.product([0, 1], repeat=n_actions)]
    commands = np.eye(n_actions, dtype=np.int32).tolist()
    replaymemory = replay_memory.ReplayMemory(n_transit,
                                              data_name="demodata_cig2017.npy")

    r_gen = reward_generater.reward_generater(game)

    # demo_data = replay_memory.ReplayMemory(resolution,n_transit)

    game.new_episode()

    for i in range(bots_num):
        game.send_game_command("addbot")

    total_reward = 0.0
    death_bias = 0
    for transit in tqdm(range(n_transit)):
        if game.is_episode_finished():
            print(
Code Example #20
    def do_reinforce_learning_self_critical(self):
        """ Performs policy gradient learning using Reinforce on the Block World Task. The agent interacts with the
         simulator and performs roll-out followed by REINFORCE updates. """

        start = time.time()

        max_epoch = 1000
        dataset_size = 667
        tuning_size = int(0.05 * dataset_size)
        train_size = dataset_size - tuning_size
        logger.Log.info("REINFORCE: Max Epoch: " + str(max_epoch) +
                        " Train/Tuning: " + str(train_size) + "/" +
                        str(tuning_size))

        # Saver for logging the model
        saver = tf.train.Saver(max_to_keep=120)

        # Iteration is the number of parameter update steps performed in the training
        iteration = 0

        # Reinforce baseline
        baseline = 0

        # Validation metric
        avg_bisk_metric = self.test(tuning_size)
        min_avg_bisk_metric = avg_bisk_metric
        patience = 0
        max_patience = 1000
        logger.Log.info("Tuning Data: (Before Training) Avg. Bisk Metric: " +
                        str(avg_bisk_metric))

        for epoch in range(1, max_epoch + 1):
            logger.Log.info("=================\n Starting Epoch: " +
                            str(epoch) + "\n=================")
            for data_point in range(1, train_size + 1):

                # Create a queue to handle history of states
                state = collections.deque([], 5)
                # Add the dummy images
                dummy_images = self.image_embedder.get_dummy_images()
                [state.append(v) for v in dummy_images]

                # Receive the instruction and the environment
                (_, _, current_env, instruction,
                 trajectory) = self.receive_instruction_and_image()
                state.append(current_env)
                (text_input_word_indices, text_mask
                 ) = self.text_embedder.get_word_indices_and_mask(instruction)
                logger.Log.info("=================\n " + str(data_point) +
                                ": Instruction: " + str(instruction) +
                                "\n=================")

                block_id = int(trajectory[0] / 4.0)
                total_reward_episode = 0
                steps = 0

                # Reinforce requires sampling from Q-function for the future.
                # So we cannot directly add entries to the global replay memory.
                replay_memory_items = []
                rewards = []

                # Perform a roll out
                while True:
                    # Compute the probability of the current state
                    prob = self.evaluate_qfunction(state,
                                                   text_input_word_indices,
                                                   text_mask)

                    # Sample from the prob. distribution
                    action_id = gp.GenericPolicy.sample_action_from_prob(prob)

                    action_str = self.message_protocol_kit.encode_action_from_pair(
                        block_id, action_id)
                    logger.Log.debug("Sending Message: " + action_str +
                                     " with probability " +
                                     str(prob[action_id]))
                    self.connection.send_message(action_str)

                    # receive reward and a new environment as a response on the completion of action
                    (_, reward, new_env,
                     is_reset) = self.receive_response_and_image()
                    logger.Log.debug("Received reward: " + str(reward))

                    # add to replay memory
                    replay_memory_item = rm.ReplayMemory(
                        text_input_word_indices, text_mask, state, action_id,
                        reward, None, None, prob[action_id])
                    replay_memory_items.append(replay_memory_item)
                    rewards.append(reward)

                    state.append(
                        new_env)  ##### CHECK if state is being overwritten

                    # Update metric
                    total_reward_episode += reward
                    steps += 1

                    # Reset episode
                    if self.message_protocol_kit.is_reset_message(is_reset):
                        logger.Log.debug("Resetting the episode")
                        self.connection.send_message("Ok-Reset")
                        logger.Log.debug("Now waiting for response")

                        # Compute monte carlo q values

                        baseline = self.get_reinforce_self_critical_baseline()
                        logger.Log.info("Reward: " +
                                        " ".join([str(v) for v in rewards]) +
                                        " steps: " + str(steps))
                        logger.Log.info(" Total Reward: " +
                                        str(total_reward_episode) +
                                        ", Self Critical Baseline: " +
                                        str(baseline))

                        # Define the targets
                        for replay_memory_item in replay_memory_items:
                            replay_memory_item.set_target_retroactively(
                                total_reward_episode - baseline)

                        self.replay_memory.clear()
                        for replay_memory_item in replay_memory_items:
                            self.replay_memory.appendleft(replay_memory_item)

                        # Perform minibatch SGD
                        # Pick a sample using prioritized sweeping and perform backpropagation
                        sample = self.ps.sample(self.replay_memory,
                                                self.batch_size)
                        loss = self.min_loss(sample)
                        if np.isnan(loss):
                            logger.Log.error("NaN found. Exiting")
                            exit(0)
                        iteration += 1
                        logger.Log.info("Number of sample " +
                                        str(len(sample)) +
                                        " size of replay memory " +
                                        str(len(self.replay_memory)) +
                                        " loss = " + str(loss))

                        logger.Log.info("Total reward:" +
                                        str(total_reward_episode) +
                                        " Steps: " + str(steps))

                        # Print time statistics
                        total_time = time.time() - start
                        logger.Log.info("Total time: " + str(total_time))
                        logger.Log.flush()
                        break

            # Compute validation accuracy
            avg_bisk_metric = self.test(tuning_size)
            logger.Log.info("Tuning Data: (end of epoch " + str(epoch) +
                            ") Avg. Bisk Metric: " + str(avg_bisk_metric) +
                            "Min was " + str(min_avg_bisk_metric))
            # Save the model
            save_path = saver.save(
                self.sess, "./saved/model_epoch_" + str(epoch) + ".ckpt")
            logger.Log.info("Model saved in file: " + str(save_path))

            if avg_bisk_metric >= min_avg_bisk_metric:
                if patience == max_patience:
                    logger.Log.info(
                        "Max patience reached. Terminating learning after " +
                        str(epoch) + " epochs and " + str(iteration) +
                        " iterations.")
                    break
                else:
                    logger.Log.info(
                        "Tuning accuracy did not improve. Increasing patience to "
                        + str(patience + 1))
                    patience += 1
            else:
                logger.Log.info("Resetting patience to 0")
                patience = 0
            min_avg_bisk_metric = min(min_avg_bisk_metric, avg_bisk_metric)

        logger.Log.close()
Code Example #21
File: test_replay_memory.py Project: tianhan4/hh-DQN
import replay_memory
from collections import namedtuple

Config = namedtuple("Config",["memory_size", "history_length", "batch_size", "state_num"])

config = Config(10,2,5,1)
model = replay_memory.ReplayMemory(config)
for i in range(5):
    model.add(i,i*0.1,i*2,i%4==0)
print(model.states,model.actions,model.terminals,model.rewards)
print("sampling")
for i in range(5):
    print(model.sample_one())
for i in range(8):
    model.add(i,i*0.1,i*2,i%4==0)
print(model.states,model.actions,model.terminals,model.rewards)
print("sampling")
for i in range(5):
    print(model.sample_one())

    

Code Example #22
File: ai.py Project: m4n4n-j/subway-surfers-AI-main
os.environ['KMP_DUPLICATE_LIB_OK']='True'
'''

# Getting the Subway Surfers environment
senv = env()
number_actions = senv.action_space

# Building an AI
cnn = neural_net.CNN(number_actions)
softmax_body = neural_net.SoftmaxBody(T=10)
ai = neural_net.AI(body=softmax_body, brain=cnn)

# Setting up Experience Replay and n_step progress
n_steps = n_step.NStepProgress(ai=ai, env=senv, n_step=7)
memory = replay_memory.ReplayMemory(n_steps=n_steps, capacity=5000)

ma = moving_avg.MA(500)  #Moving average used to grade our model


# Functions to save and load the checkpoints created while training.
def load():
    if os.path.isfile('old_brain.pth'):
        print("=> loading checkpoint... ")
        checkpoint = torch.load('old_brain.pth')
        cnn.load_state_dict(checkpoint['state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("done !")
    else:
        print("no checkpoint found...")
Code Example #23
    def __init__(self, conf):
        self.conf = conf
        self.word_dim = conf['word_dim']
        self.word_size = conf['word_size']
        self.turn_len = conf['turn_len']
        self.dialogue_len = conf['dialogue_len']
        # epsilon setting
        self.ep_start = conf['ep_start']
        self.ep = conf['ep_start']
        self.ep_end = conf['ep_end']
        self.ep_step = conf['ep_step']

        self.discount = conf['discount']
        self.update_freq = conf['update_freq']
        self.max_reward = conf['max_reward']
        self.min_reward = conf['min_reward']
        self.num_actions = conf['num_actions']

        self.batch_size = conf['batch_size']
        self.learn_start = conf['learn_start']

        self.target_q_clone_step = conf['target_q_clone_step']

        self.debug = conf['debug']

        self.num_step = 0
        self.mini_batch_step = 0
        self.last_s = None
        self.last_ask = -1
        self.last_confirm = -1
        self.last_r = None
        self.last_t = None

        self.ask_loss = None
        self.confirm_loss = None
        self.v_ask_avg = 0
        self.v_confirm_avg = 0

        try:
            self.loss_log = open('../data/loss_log', 'w')
        except:
            print("open file failed!")
            sys.exit(1)

        replay_memory_conf = {'replay_memory_size': conf['replay_memory_size'],
                              'learn_start': conf['prioritized_learnt_start'],
                              'batch_size': conf['batch_size'],
                              'word_dim': conf['word_dim'],
                              'debug': conf['debug']}
        self.replay_memory = replay_memory.ReplayMemory(replay_memory_conf)

        embedding_init = np.random.rand(self.word_size, self.word_dim)
        embedding_init[0] *= 0
        embedding_init = embedding_init.astype('float32')

        output_network_conf = {'name': 'output_network',
                               'num_actions': conf['num_actions'],
                               'word_dim': conf['word_dim'],
                               'word_size': conf['word_size'],
                               'turn_len': conf['turn_len'],
                               'dialogue_len': conf['dialogue_len'],
                               'mlp_hidden_unit': conf['mlp_hidden_unit'],
                               'clip_delta': conf['clip_delta'],
                               'lr': conf['lr'],
                               'embedding_init': embedding_init}
        self.output_network = DQN.DQN(output_network_conf)
        target_network_conf = {'name': 'target_network',
                               'num_actions': conf['num_actions'],
                               'word_dim': conf['word_dim'],
                               'word_size': conf['word_size'],
                               'turn_len': conf['turn_len'],
                               'dialogue_len': conf['dialogue_len'],
                               'mlp_hidden_unit': conf['mlp_hidden_unit'],
                               'clip_delta': conf['clip_delta'],
                               'lr': conf['lr'],
                               'embedding_init': embedding_init}
        self.target_network = DQN.DQN(target_network_conf)

        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.75)
        self.sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
        self.writer = tf.train.SummaryWriter('../data/graph_logs', self.sess.graph)
        self.init = tf.initialize_all_variables()
        self.sess.run(self.init)
        # self.sess.run(self.output_network.embedding_init)
        # self.sess.run(self.target_network.embedding_init)
        self.sync = self.sync_func()
        self.sess.run(self.sync)
Code Example #24
File: policy_gradient.py Project: phisad/blocks
    def train(self, sess, train_writer, max_epoch=AbstractLearning.max_epochs, model_name="./model"):
        """ Performs policy gradient learning using Reinforce on the Block World Task. The agent interacts with the
         simulator and performs roll-out followed by REINFORCE updates. """

        start = time.time()

        # Initialization using 2 epochs of MLE
        self.mle_policy_gradient.train(sess, train_writer, max_epoch=2, model_name="./model_mle", terminate=False)
        # Reinitialize the direction parameters
        w1, b1 = self.policy_model.mix_and_gen_prob.get_direction_weights()
        sess.run(tf.initialize_variables([w1, b1]))

        dataset_size = AbstractLearning.dataset_size
        tuning_size = AbstractLearning.validation_datasize
        train_size = dataset_size - tuning_size
        logger.Log.info("REINFORCE: Max Epoch: " + str(max_epoch) + " Train/Tuning: "
                        + str(train_size) + "/" + str(tuning_size))

        # Saver for logging the model
        saver = tf.train.Saver(max_to_keep=AbstractLearning.models_to_keep)

        # Iteration is the number of parameter update steps performed in the training
        iteration = 0

        # Validation metric
        avg_bisk_metric = self.agent.test(tuning_size, oracle=True)
        min_avg_bisk_metric = avg_bisk_metric
        patience = 0
        max_patience = AbstractLearning.max_patience
        logger.Log.info("Tuning Data: (Before Training) Avg. Bisk Metric: " + str(avg_bisk_metric))

        for epoch in range(1, max_epoch + 1):
            logger.Log.info("=================\n Starting Epoch: " + str(epoch) + "\n=================")
            for data_point in range(1, train_size + 1):

                # Create a queue to handle history of states
                state = collections.deque([], 5)
                # Add the dummy images
                dummy_images = self.policy_model.image_embedder.get_padding_images()
                [state.append(v) for v in dummy_images]

                # Receive the instruction and the environment
                (_, _, current_env, instruction, trajectory) = self.agent.receive_instruction_and_image()
                state.append(current_env)

                # Convert text to indices
                text_indices = self.policy_model.text_embedder.convert_text_to_indices(instruction)
                _, text_embedder_bucket = self.policy_model.get_bucket_network(len(text_indices))
                (text_input_word_indices_bucket, text_mask_bucket) = text_embedder_bucket.pad_and_return_mask(
                    text_indices)
                (text_input_word_indices, text_mask) = self.policy_model.text_embedder.pad_and_return_mask(text_indices)

                logger.Log.info("=================\n " + str(data_point) + ": Instruction: "
                                + str(instruction) + "\n=================")

                total_reward_episode = 0
                steps = 0

                # Reinforce requires sampling from Q-function for the future.
                # So we cannot directly add entries to the global replay memory.
                replay_memory_items = []
                rewards = []
                previous_status_code = self.policy_model.null_previous_action

                # Perform a roll out
                while True:
                    # Compute the probability of the current state
                    block_prob, direction_prob = self.policy_model.evaluate_policy(
                        state, text_input_word_indices_bucket, text_mask_bucket,
                        previous_action=previous_status_code, sess=sess)

                    # Sample from the prob. distribution
                    block_id = gp.GenericPolicy.sample_action_from_prob(block_prob)
                    direction_id = gp.GenericPolicy.sample_action_from_prob(direction_prob)

                    action_str = self.agent.message_protocol_kit.encode_action_from_pair(block_id, direction_id)
                    prob_action = block_prob[block_id] * direction_prob[direction_id]
                    logger.Log.debug("Sending Message: " + action_str + " with probability " + str(prob_action))
                    self.agent.connection.send_message(action_str)

                    # receive reward and a new environment as a response on the completion of action
                    (status_code, reward, new_env, is_reset) = self.agent.receive_response_and_image()
                    logger.Log.debug("Received reward: " + str(reward))

                    # add to replay memory
                    replay_memory_item = rm.ReplayMemory(text_input_word_indices, text_mask, state,
                                                         (block_id, direction_id), reward, None, None, prob_action,
                                                         previous_action_id=previous_status_code)
                    replay_memory_items.append(replay_memory_item)
                    rewards.append(reward)
                    state.append(new_env)

                    # Update metric
                    total_reward_episode += reward
                    steps += 1

                    previous_status_code = (direction_id, block_id)

                    # Reset episode
                    if self.agent.message_protocol_kit.is_reset_message(is_reset):
                        logger.Log.debug("Resetting the episode")
                        self.agent.connection.send_message("Ok-Reset")
                        logger.Log.debug("Now waiting for response")

                        if self.total_reward:
                            # Compute monte carlo q values
                            reward_multiplier = [0] * steps
                            for i in range(0, steps):
                                # Q-value approximated by roll-out
                                reward_multiplier[i] = sum(rewards[i:])
                        else:
                            # Use immediate reward only
                            reward_multiplier = rewards

                        # Define the targets
                        for replay_memory_item, cumm_reward in zip(replay_memory_items, reward_multiplier):
                            replay_memory_item.set_target_retroactively(cumm_reward)

                        # Perform 1 iteration of minibatch SGD using backpropagation
                        loss = self.min_loss(replay_memory_items, sess, train_writer)
                        if np.isnan(loss):
                            logger.Log.error("NaN found. Exiting")
                            exit(0)
                        iteration += 1
                        logger.Log.info("Number of samples " + str(len(replay_memory_items)) + " loss = " + str(loss))
                        logger.Log.info("Total reward: " + str(total_reward_episode) + " Steps: " + str(steps))

                        # Print time statistics
                        total_time = time.time() - start
                        logger.Log.info("Total time: " + str(total_time))
                        logger.Log.flush()
                        break

            # Compute validation accuracy
            avg_bisk_metric = self.agent.test(tuning_size)
            logger.Log.info("Tuning Data: (end of epoch " + str(epoch) + ") Avg. Bisk Metric: " +
                            str(avg_bisk_metric) + " Min was " + str(min_avg_bisk_metric))
            # Save the model
            save_path = saver.save(sess, "./saved/" + str(model_name) + "_epoch_" + str(epoch) + ".ckpt")
            logger.Log.info("Model saved in file: " + str(save_path))

            if avg_bisk_metric >= min_avg_bisk_metric:
                if patience == max_patience:
                    logger.Log.info("Max patience reached. Terminating learning after " + str(epoch) +
                                    " epochs and " + str(iteration) + " iterations.")
                    break
                else:
                    logger.Log.info(
                        "Tuning accuracy did not improve. Increasing patience to " + str(patience + 1))
                    patience += 1
            else:
                logger.Log.info("Resetting patience to 0")
                patience = 0
            min_avg_bisk_metric = min(min_avg_bisk_metric, avg_bisk_metric)

        logger.Log.close()
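The roll-out above estimates Monte Carlo Q-values by summing the remaining rewards at every step, which is quadratic in the episode length. As a point of comparison, here is a minimal standalone sketch of the same reward-to-go computation written as a single reverse cumulative sum; `rewards` stands for the per-step reward list collected during the roll-out, and nothing below comes from the project's own modules.

def rewards_to_go(rewards):
    # Equivalent to [sum(rewards[i:]) for i in range(len(rewards))],
    # but computed in one backward pass (undiscounted, as in the example).
    returns = [0.0] * len(rewards)
    running = 0.0
    for i in reversed(range(len(rewards))):
        running += rewards[i]
        returns[i] = running
    return returns

# e.g. rewards_to_go([0.0, 0.0, 1.0]) == [1.0, 1.0, 1.0]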
コード例 #25
0
            time.sleep(900)

    def stop(self):
        """
        Prevents the thread from continuing when called.
        """
        self.stop = True


if __name__ == '__main__':
    global eta, alpha, number_of_turns, data_from_rm, rm, history, net, counter
    path = os.path.dirname(
        os.path.abspath(__file__)
    ) + '/saves/'  # must end with "/" on Linux and with "\" on Windows
    history = hs.History(path + 'History')
    rm = rm.ReplayMemory(path + 'ReplayMemory.rm', 42000)
    Lambda = .0
    eta = .0001
    alpha = .7
    input_layer = 543
    output_layer = (8, af.tanh)
    hidden_layers = [(543, af.tanh), (543, af.tanh), (543, af.tanh)]
    number_of_turns = 500
    data_from_rm = 500
    net = ann.Neural_Network(path + 'DATA', input_layer, output_layer,
                             hidden_layers, Lambda)
    Saver(net, rm).start()
    counter = 1
    while True:
        history.setGame(next(hs.generateName('main_net', 'dummy_net', 1)))
        GAME = PONR(Interface('main net'), Interface('dummy net'))
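The example above launches a background Saver thread that sleeps between checkpoints, and its stop() method overwrites itself with a flag. Below is a hypothetical sketch of the same periodic-saver pattern built on threading.Event; PeriodicSaver and save_all are stand-ins, not the Saver class or persistence code used above.

import threading

class PeriodicSaver(threading.Thread):
    """Hypothetical periodic saver; not the Saver class from the example."""

    def __init__(self, save_all, interval_seconds=900):
        super().__init__(daemon=True)
        self._save_all = save_all
        self._interval = interval_seconds
        self._stop_event = threading.Event()

    def run(self):
        # wait() doubles as an interruptible sleep between checkpoints
        while not self._stop_event.wait(self._interval):
            self._save_all()

    def stop(self):
        self._stop_event.set()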
コード例 #26
0
        # Define critic and dual optimizer
        if AC:
            critic = critic.Critic(DIS_EMBEDDING_DIM,
                                   DIS_HIDDEN_DIM,
                                   VOCAB_SIZE,
                                   MAX_SEQ_LEN,
                                   device=DEVICE).to(DEVICE)
            AC_optimizer = optim.Adagrad([{
                'params': actor.parameters(),
                'lr': ACTOR_LR
            }, {
                'params': critic.parameters(),
                'lr': CRITIC_LR
            }])
            memory = replay_memory.ReplayMemory(CAPACITY_RM)
        # Use optimizer for baseline DP-GAN
        else:
            PG_optimizer = optim.Adagrad(actor.parameters(), ACTOR_LR)

        # Adversarial training loop
        gen_data_loader = iter(load_data())
        gen_data_loader_tf = iter(load_data())
        dis_data_loader = iter(load_data())
        num_batches = int(len(gen_data_loader) / 2)
        N = ADV_TRAIN_EPOCHS * num_batches
        M = 1
        K = 5
        for n in range(N):
            if n % num_batches == 0:
                print('Iteration {}'.format(n))
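The snippet above drives actor and critic with a single Adagrad instance by passing two parameter groups, each with its own learning rate. The sketch below isolates that pattern with two small stand-in modules and illustrative learning rates; it is not the adversarial training code itself.

import torch
import torch.nn as nn
import torch.optim as optim

actor_stub = nn.Linear(8, 4)    # stand-in for the actor / generator
critic_stub = nn.Linear(8, 1)   # stand-in for the critic

joint_optimizer = optim.Adagrad([
    {'params': actor_stub.parameters(), 'lr': 1e-2},
    {'params': critic_stub.parameters(), 'lr': 1e-3},
])

x = torch.randn(16, 8)
loss = actor_stub(x).pow(2).mean() + critic_stub(x).pow(2).mean()
joint_optimizer.zero_grad()
loss.backward()
joint_optimizer.step()  # one step updates both groups, each at its own lr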
コード例 #27
0
        def run(learning_rate, freeze_interval, num_hidden, reg):
            room_size = 5
            num_rooms = 2
            mdp = mdps.MazeMDP(room_size, num_rooms)
            mdp.compute_states()
            mdp.EXIT_REWARD = 1
            mdp.MOVE_REWARD = -0.01
            discount = 1
            num_actions = len(mdp.get_actions(None))
            batch_size = 100
            print('building network...')
            network = qnetwork.QNetwork(input_shape=2 * room_size +
                                        num_rooms**2,
                                        batch_size=batch_size,
                                        num_hidden_layers=2,
                                        num_actions=4,
                                        num_hidden=num_hidden,
                                        discount=discount,
                                        learning_rate=learning_rate,
                                        regularization=reg,
                                        update_rule='adam',
                                        freeze_interval=freeze_interval,
                                        rng=None)
            num_epochs = 50
            epoch_length = 2
            test_epoch_length = 0
            max_steps = 4 * (room_size * num_rooms)**2
            epsilon_decay = (num_epochs * epoch_length * max_steps) / 1.5
            print('building policy...')
            p = policy.EpsilonGreedy(num_actions, 0.5, 0.05, epsilon_decay)
            print('building memory...')
            rm = replay_memory.ReplayMemory(batch_size, capacity=50000)
            print('building logger...')
            log = logger.NeuralLogger(agent_name='QNetwork')
            print('building state adapter...')
            adapter = state_adapters.CoordinatesToRowColRoomAdapter(
                room_size=room_size, num_rooms=num_rooms)
            # adapter = state_adapters.CoordinatesToRowColAdapter(room_size=room_size, num_rooms=num_rooms)
            # adapter = state_adapters.CoordinatesToFlattenedGridAdapter(room_size=room_size, num_rooms=num_rooms)
            # adapter = state_adapters.IdentityAdapter(room_size=room_size, num_rooms=num_rooms)
            # adapter = state_adapters.CoordinatesToSingleRoomRowColAdapter(room_size=room_size)
            print('building agent...')
            a = agent.NeuralAgent(network=network,
                                  policy=p,
                                  replay_memory=rm,
                                  log=log,
                                  state_adapter=adapter)
            run_tests = False
            e = experiment.Experiment(mdp,
                                      a,
                                      num_epochs,
                                      epoch_length,
                                      test_epoch_length,
                                      max_steps,
                                      run_tests,
                                      value_logging=True)
            e.run()

            ak = file_utils.load_key('../access_key.key')
            sk = file_utils.load_key('../secret_key.key')
            bucket = 'hierarchical'
            try:
                aws_util = aws_s3_utility.S3Utility(ak, sk, bucket)
                aws_util.upload_directory(e.agent.logger.log_dir)
            except Exception as e:
                print('error uploading to s3: {}'.format(e))
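The run() above builds its behaviour policy as policy.EpsilonGreedy(num_actions, 0.5, 0.05, epsilon_decay); that class is not shown here, so the helper below is only an assumption about the usual linear schedule implied by those arguments (epsilon decays from 0.5 to 0.05 over epsilon_decay steps and is then held at the floor).

def epsilon_at(step, eps_start=0.5, eps_end=0.05, decay_steps=1.0):
    # Linear interpolation from eps_start to eps_end, clipped at the floor.
    fraction = min(float(step) / float(decay_steps), 1.0)
    return eps_start + fraction * (eps_end - eps_start)

# e.g. epsilon_at(0, decay_steps=1000) == 0.5
#      epsilon_at(500, decay_steps=1000) == 0.275
#      epsilon_at(2000, decay_steps=1000) == 0.05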
コード例 #28
0
    ckpt_path = "./model/"
    game = initialize_vizdoom(config_file_path)

    print("learning rate: %f" % learning_rate)
    print("discount_factor %f" % discount_factor)
    print("resolution:", resolution)
    print("frame_repeat: %d" % frame_repeat)
    print("capacity:", capacity)
    print("batch_size: %d" % batch_size)
    print("screen_format:", game.get_screen_format())
    n_actions = game.get_available_buttons_size()
    actions = np.eye(n_actions, dtype=np.int32).tolist()
    print("action_size : %d" % (n_actions))
    #actions = [list(a) for a in it.product([0,1], repeat=n_actions)]

    replay_memory = replay_memory.ReplayMemory(resolution, capacity)

    session = tf.Session()
    network = network_double.network_simple(session, resolution, n_actions,
                                            learning_rate)
    #network = network.network_simple(session,resolution,n_actions, learning_rate)
    #network = network_contrib.network_contrib(session,resolution,n_actions,learning_rate)

    session.run(tf.global_variables_initializer())
    for epoch in range(n_epoch):

        print("Epoch %d \n -----" % (epoch))
        print("Training Phase")
        train_episodes_finished = 0
        train_scores = []
        total_train_scores = []
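Actions above are encoded as one-hot button presses via np.eye, with the commented-out it.product line as the alternative that enumerates every on/off combination of the available buttons. A short illustration of the difference, with n_buttons as a stand-in value:

import itertools as it
import numpy as np

n_buttons = 3
one_hot_actions = np.eye(n_buttons, dtype=np.int32).tolist()
# [[1, 0, 0], [0, 1, 0], [0, 0, 1]] -- exactly one button per action
combo_actions = [list(a) for a in it.product([0, 1], repeat=n_buttons)]
# 2**3 = 8 combinations, from [0, 0, 0] up to [1, 1, 1]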
コード例 #29
0
ファイル: ml_estimation.py プロジェクト: mnrmja007/blocks
    def train(self, sess, train_writer, max_epoch=AbstractLearning.max_epochs, model_name="./model"):
        """ Performs supervised learning on the Block World Task. The agent interacts with the
         simulator and performs roll-out followed by supervised learning. """

        start = time.time()

        dataset_size = AbstractLearning.dataset_size
        tuning_size = AbstractLearning.validation_datasize
        train_size = dataset_size - tuning_size
        logger.Log.info("Maximum Likelihood: Max Epoch: " + str(max_epoch) + " Train/Tuning: "
                        + str(train_size) + "/" + str(tuning_size))

        # Saver for logging the model
        saver = tf.train.Saver(max_to_keep=AbstractLearning.models_to_keep)

        # Iteration is the number of parameter update steps performed in the training
        iteration = 0

        # Validation metric
        avg_bisk_metric = self.agent.test(tuning_size)
        min_avg_bisk_metric = avg_bisk_metric
        patience = 0
        max_patience = AbstractLearning.max_patience
        logger.Log.info("Tuning Data: (Before Training) Avg. Bisk Metric: " + str(avg_bisk_metric))

        for epoch in range(1, max_epoch + 1):
            logger.Log.info("=================\n Starting Epoch: "
                            + str(epoch) + "\n=================")

            for data_point in range(1, train_size + 1):

                # Create a queue to handle history of states
                state = collections.deque([], 5)
                # Add the dummy images
                dummy_images = self.policy_model.image_embedder.get_dummy_images()
                state.extend(dummy_images)

                # Receive the instruction and the environment
                (_, _, current_env, instruction, trajectory) = self.agent.receive_instruction_and_image()
                state.append(current_env)
                (text_input_word_indices, text_mask) = \
                    self.policy_model.text_embedder.get_word_indices_and_mask(instruction)
                logger.Log.info("=================\n " + str(data_point) + ": Instruction: "
                                + str(instruction) + "\n=================")

                traj_ix = 0
                total_reward_episode = 0
                steps = 0
                previous_action = self.policy_model.null_previous_action
                block_id = int(trajectory[0] / 4.0)

                # Perform a roll out
                while True:
                    # Sample from the prob. distribution
                    action_id = trajectory[traj_ix]
                    traj_ix += 1
                    action_str = self.agent.message_protocol_kit.encode_action(action_id)
                    logger.Log.debug("Sending Message: " + action_str)
                    self.agent.connection.send_message(action_str)

                    # receive reward and a new environment as a response on the completion of action
                    (status_code, reward, new_env, is_reset) = self.agent.receive_response_and_image()
                    logger.Log.debug("Received reward: " + str(reward))

                    # add to replay memory
                    if action_id == 80:
                        direction_id = 4
                    else:
                        direction_id = action_id % 4
                    replay_memory_item = rm.ReplayMemory(text_input_word_indices, text_mask,
                                                         state, (block_id, direction_id), 1.0, new_env, None,
                                                         previous_action_id=previous_action)
                    self.replay_memory.appendleft(replay_memory_item)
                    state.append(new_env)

                    # Update metric
                    total_reward_episode += reward
                    steps += 1
                    previous_action = (direction_id, block_id)

                    # Reset episode
                    if self.agent.message_protocol_kit.is_reset_message(is_reset):
                        logger.Log.debug("Resetting the episode")
                        self.agent.connection.send_message("Ok-Reset")
                        logger.Log.debug("Now waiting for response")

                        # Perform minibatch SGD
                        # Pick a sample using prioritized sweeping and perform backpropagation
                        sample = self.ps.sample(self.replay_memory, self.batch_size)
                        loss = self.min_loss(sample, sess, train_writer)
                        if np.isnan(loss):
                            logger.Log.info("NaN found. Exiting")
                            exit(0)
                        iteration += 1
                        logger.Log.info("Number of samples " + str(len(sample)) + " size of replay memory "
                                        + str(len(self.replay_memory)) + " loss = " + str(loss))

                        logger.Log.info("Total reward: " + str(total_reward_episode) + " Steps: " + str(steps))

                        # Print time statistics
                        total_time = time.time() - start
                        logger.Log.info("Total time: " + str(total_time))

                        logger.Log.flush()
                        break

            # Compute validation accuracy
            avg_bisk_metric = self.agent.test(tuning_size)
            logger.Log.info("Tuning Data: (end of epoch " + str(epoch) + ") Avg. Bisk Metric: "
                            + str(avg_bisk_metric) + " Min was " + str(min_avg_bisk_metric))
            # Save the model
            save_path = saver.save(sess, "./saved/" + str(model_name) + "_epoch_" + str(epoch) + ".ckpt")
            logger.Log.info("Model saved in file: " + str(save_path))

            if avg_bisk_metric >= min_avg_bisk_metric:
                if patience == max_patience:
                    logger.Log.info("Max patience reached. Terminating learning after " + str(epoch) +
                                    " epochs and " + str(iteration) + " iterations.")
                    break
                else:
                    logger.Log.info("Tuning accuracy did not improve. Increasing patience to " + str(patience + 1))
                    patience += 1
            else:
                logger.Log.info("Resetting patience to 0")
                patience = 0
            min_avg_bisk_metric = min(min_avg_bisk_metric, avg_bisk_metric)

        logger.Log.close()
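The epoch loop above ends with a patience check: training stops once the tuning metric has failed to improve for max_patience consecutive epochs. A compact sketch of that bookkeeping with illustrative names, leaving out the checkpointing and logging the example performs:

def update_patience(metric, best_metric, patience, max_patience):
    """Return (best_metric, patience, should_stop) after one epoch."""
    if metric >= best_metric:          # no improvement this epoch
        if patience == max_patience:
            return best_metric, patience, True
        return best_metric, patience + 1, False
    return metric, 0, False            # improved: reset patience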
コード例 #30
0
parser = argparse.ArgumentParser(description='PyTorch DDPG')

parser.add_argument('--env-name',
                    default="Walker2d-v1",
                    metavar='G',
                    help='name of the environment to run')

parser.add_argument('--render',
                    action='store_true',
                    help='render the environment')
args = parser.parse_args()

if __name__ == '__main__':

    env = gym.make(args.env_name)
    mem = replay_memory.ReplayMemory(1000000)
    trainer = train_networks.Training(env.observation_space.shape[0],
                                      env.action_space.shape[0],
                                      env.action_space.high[0], mem)

    # for i_episode in count(1):
    # 	num_episodes = 0
    # 	num_steps = 0
    # 	reward_batch = 0
    # 	while num_steps < 1000:
    num_episodes = 0
    reward_batch = 0

    for i in range(no_of_episodes):
        obs = env.reset()