Code example #1
    def teach(self, num_timesteps=2000):
        chosen_action = 0
        print('Initial Chosen Action:', chosen_action)
        for t in range(num_timesteps):
            slopes = [
                estimate_slope(timesteps, scores) if len(scores) > 1 else 1
                for timesteps, scores in zip(self.timesteps, self.scores)
            ]

            if self.env.signal == 'SPG':
                # SPG signal: the reward is the slope of the chosen task's learning curve
                reward = np.abs(slopes[chosen_action]) if self.abs else slopes[chosen_action]
            elif self.env.signal == 'MPG':
                # MPG signal: the reward is the mean slope across all tasks
                reward = np.mean(np.abs(slopes) if self.abs else slopes)

            # p = self.policy(np.abs(slopes) if self.abs else slopes)
            # the bandit policy maps the reward to the index of the next task;
            # track it so the SPG reward follows the task actually trained on
            chosen_action = self.policy(reward)
            p = np.zeros(self.env.num_actions)
            p[chosen_action] = 1.
            r, train_done, val_done = self.env.step(p)
            if val_done:
                return self.env.model.epochs
            for a, s in enumerate(r):
                if not np.isnan(s):
                    self.scores[a].append(s)
                    self.timesteps[a].append(t)

            if self.writer:
                for i in range(self.env.num_actions):
                    add_summary(self.writer, "slopes/task_%d" % (i + 1), slopes[i], self.env.model.epochs)
                    add_summary(self.writer, "probabilities/task_%d" % (i + 1), p[i], self.env.model.epochs)

        return self.env.model.epochs
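
The teach() methods above call an estimate_slope helper that is not shown in these excerpts. A minimal sketch, assuming it simply fits a least-squares line to the recorded (timestep, score) pairs and returns its slope:

import numpy as np

def estimate_slope(x, y):
    # Least-squares linear fit; the leading coefficient is the slope of
    # the learning curve over the recorded timesteps.
    return np.polyfit(x, y, 1)[0]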
Code example #2
    def teach(self, num_timesteps=2000):
        for t in range(num_timesteps):
            slopes = [
                estimate_slope(timesteps, scores) if len(scores) > 1 else 1
                for timesteps, scores in zip(self.timesteps, self.scores)
            ]
            p = self.policy(np.abs(slopes) if self.abs else slopes)
            r, train_done, val_done = self.env.step(p)
            if val_done:
                return self.env.model.epochs
            for a, s in enumerate(r):
                if not np.isnan(s):
                    self.scores[a].append(s)
                    self.timesteps[a].append(t)

            if self.writer:
                for i in range(self.env.num_actions):
                    add_summary(
                        self.writer,
                        "slopes/task_%d_%d" % (i // self.env.max_digits + 1,
                                               i % self.env.max_digits + 1),
                        slopes[i], self.env.model.epochs)
                    add_summary(
                        self.writer, "probabilities/task_%d_%d" %
                        (i // self.env.max_digits + 1,
                         i % self.env.max_digits + 1), p[i],
                        self.env.model.epochs)

        return self.env.model.epochs
Code example #3
    def teach(self, num_timesteps=2000):
        for t in range(num_timesteps):
            # find slopes for each task
            if len(self.dscores) > 0:
                if isinstance(self.policy, ThompsonPolicy):
                    slopes = [
                        np.random.choice(drs)
                        for drs in np.array(self.dscores).T
                    ]
                else:
                    slopes = np.mean(self.dscores, axis=0)
            else:
                slopes = np.ones(self.env.num_actions)

            p = self.policy(np.abs(slopes) if self.abs else slopes)
            r, train_done, val_done = self.env.step(p)
            if val_done:
                return self.env.model.epochs

            # log delta score
            dr = r - self.prevr
            self.prevr = r
            self.dscores.append(dr)

            if self.writer:
                for i in range(self.env.num_actions):
                    add_summary(self.writer, "slopes/task_%d" % (i + 1),
                                slopes[i], self.env.model.epochs)
                    add_summary(self.writer, "probabilities/task_%d" % (i + 1),
                                p[i], self.env.model.epochs)

        return self.env.model.epochs
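
The self.policy objects in these teachers turn a vector of (absolute) slopes into a sampling distribution over tasks. A minimal Boltzmann-style sketch, with the class name and temperature parameter assumed for illustration rather than taken from the project:

import numpy as np

class BoltzmannPolicy:
    def __init__(self, temperature=1.0):
        self.temperature = temperature

    def __call__(self, values):
        # Softmax over values / temperature: tasks whose learning curves
        # are currently steepest get sampled more often.
        v = np.asarray(values, dtype=np.float64) / self.temperature
        v = v - v.max()  # subtract the max for numerical stability
        e = np.exp(v)
        return e / e.sum()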
Code example #4
    def step(self, train_dist):
        print("Training on", train_dist)
        train_data = self.model.generate_data(train_dist, self.train_size)
        train_data_double = self.model.generate_data(train_dist,
                                                     self.train_size)
        history = self.model.train_epoch(train_data, self.val_data)
        if self.signal == 'SPG':
            accs = self.model.accuracy_per_length(*train_data_double)
        elif self.signal == 'MPG':
            accs = self.model.accuracy_per_length(*self.val_data)

        # history = self.model.train_epoch(train_data, self.val_data)
        # train_accs = self.model.accuracy_per_length(*train_data)
        # val_accs = self.model.accuracy_per_length(*self.val_data)
        print('Accuracies: ', accs)
        train_done = history['full_number_accuracy'][-1] > 0.99
        val_done = history['val_full_number_accuracy'][-1] > 0.99

        if self.writer:
            for k, v in history.items():
                add_summary(self.writer, "model/" + k, v[-1],
                            self.model.epochs)
            for i in range(self.num_actions):
                #add_summary(self.writer, "train_accuracies/task_%d" % (i + 1), train_accs[i], self.model.epochs)
                add_summary(self.writer, "accuracies/task_%d" % (i + 1),
                            accs[i], self.model.epochs)

        return accs, train_done, val_done
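
add_summary appears in every snippet but is not defined here. A minimal sketch, assuming a TensorFlow 1.x tf.summary.FileWriter and scalar values:

import tensorflow as tf

def add_summary(writer, tag, value, step):
    # Log a single scalar under the given tag so it shows up in TensorBoard.
    summary = tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=float(value))])
    writer.add_summary(summary, step)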
Code example #5
    def teach(self, num_timesteps=2000):
        curriculum_step = 0
        for t in range(num_timesteps):
            p = self.curriculum[curriculum_step]
            print(p)
            r, train_done, val_done = self.env.step(p)
            if train_done and curriculum_step < len(self.curriculum) - 1:
                curriculum_step += 1
            if val_done:
                return self.env.model.epochs

            if self.writer:
                for i in range(self.env.num_actions):
                    add_summary(self.writer, "probabilities/task_%d" % (i + 1), p[i], self.env.model.epochs)

        return self.env.model.epochs
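
This teacher walks through a hand-written curriculum, advancing one stage whenever training accuracy clears the threshold in env.step. A hedged usage sketch, with the constructor name and the three-task distributions invented for illustration:

import numpy as np

# Hypothetical curriculum over three tasks: task 1 only, then tasks 1-2,
# then a uniform mixture over all three.
curriculum = [
    np.array([1.0, 0.0, 0.0]),
    np.array([0.5, 0.5, 0.0]),
    np.array([1 / 3, 1 / 3, 1 / 3]),
]
# teacher = CurriculumTeacher(env, curriculum, writer=writer)  # name assumed
# epochs = teacher.teach(num_timesteps=2000)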
Code example #6
    def teach(self, num_timesteps=2000):
        for t in range(num_timesteps // self.window_size):
            p = self.policy(np.abs(self.Q) if self.abs else self.Q)
            scores = [[] for _ in range(len(self.Q))]
            for i in range(self.window_size):
                r, train_done, val_done = self.env.step(p)
                if val_done:
                    return self.env.model.epochs
                for a, score in enumerate(r):
                    if not np.isnan(score):
                        scores[a].append(score)
            s = [
                estimate_slope(list(range(len(sc))), sc) if len(sc) > 1 else 1
                for sc in scores
            ]
            self.Q += self.lr * (s - self.Q)

            if self.writer:
                for i in range(self.env.num_actions):
                    add_summary(self.writer, "Q_values/task_%d" % (i + 1),
                                self.Q[i], self.env.model.epochs)
                    add_summary(self.writer, "slopes/task_%d" % (i + 1), s[i],
                                self.env.model.epochs)
                    add_summary(self.writer, "probabilities/task_%d" % (i + 1),
                                p[i], self.env.model.epochs)

        return self.env.model.epochs
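
The update self.Q += self.lr * (s - self.Q) is an exponential moving average of the per-task slope estimates. A quick numeric check of one update step, with lr assumed to be 0.1:

import numpy as np

Q = np.array([0.0, 0.5])   # current Q-values for two tasks
s = np.array([1.0, 0.5])   # freshly estimated slopes
lr = 0.1
Q += lr * (s - Q)
# Q is now [0.1, 0.5]: each entry moves a fraction lr toward its new slope.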
Code example #7
    def teach(self, num_timesteps=2000):
        for t in range(num_timesteps):
            p = self.policy(np.abs(self.Q) if self.abs else self.Q)
            r, train_done, val_done = self.env.step(p)
            if val_done:
                return self.env.model.epochs
            s = r - self.prevr

            # safeguard against not sampling particular action at all
            s = np.nan_to_num(s)
            self.Q += self.lr * (s - self.Q)
            self.prevr = r

            if self.writer:
                for i in range(self.env.num_actions):
                    add_summary(
                        self.writer,
                        "Q_values/task_%d_%d" % (i // self.env.max_digits + 1,
                                                 i % self.env.max_digits + 1),
                        self.Q[i], self.env.model.epochs)
                    add_summary(
                        self.writer,
                        "slopes/task_%d_%d" % (i // self.env.max_digits + 1,
                                               i % self.env.max_digits + 1),
                        s[i], self.env.model.epochs)
                    add_summary(
                        self.writer, "probabilities/task_%d_%d" %
                        (i // self.env.max_digits + 1,
                         i % self.env.max_digits + 1), p[i],
                        self.env.model.epochs)

        return self.env.model.epochs
Code example #8
    def step(self, train_dist):
        print("Training on", train_dist)
        train_data = self.model.generate_data(train_dist, self.train_size)
        history = self.model.train_epoch(train_data, self.val_data)
        #train_accs = self.model.accuracy_per_length(*train_data)
        val_accs = self.model.accuracy_per_length(*self.val_data)

        train_done = history['full_number_accuracy'][-1] > 0.99
        val_done = history['val_full_number_accuracy'][-1] > 0.99

        if self.writer:
            for k, v in history.items():
                add_summary(self.writer, "model/" + k, v[-1],
                            self.model.epochs)
            for i in range(self.num_actions):
                #add_summary(self.writer, "train_accuracies/task_%d_%d" % (i // self.max_digits + 1, i % self.max_digits + 1), train_accs[i], self.model.epochs)
                add_summary(
                    self.writer, "valid_accuracies/task_%d_%d" %
                    (i // self.max_digits + 1, i % self.max_digits + 1),
                    val_accs[i], self.model.epochs)

        return val_accs, train_done, val_done
Code example #9
def trainer(num_episodes, fifos, shared_buffer, model, memory, writer):
    callbacks = [
        EarlyStopping(monitor='val_loss',
                      min_delta=0.001,
                      patience=5,
                      verbose=1,
                      mode='auto')
    ]
    while num_episodes < args.num_episodes:
        while True:
            # pick random fifo (agent)
            fifo = random.choice(fifos)
            try:
                # wait for a new trajectory and statistics
                trace, reward, rewards, agent_id, hit_probs, avg_lengths, tree_size, entropies, iters_sec = fifo.get(
                    timeout=args.queue_timeout)
                # break out of the infinite loop
                break
            except Empty:
                # just ignore empty fifos
                pass

        num_episodes += 1

        # add samples to replay memory
        # TODO: add_batch would be more efficient?
        for obs, pi in trace:
            memory.add_sample(obs, pi, reward)

        add_summary(writer, "tree/size", tree_size, num_episodes)
        add_summary(writer, "tree/mean_hit_prob", float(np.mean(hit_probs)),
                    num_episodes)
        add_summary(writer, "tree/mean_rollout_len",
                    float(np.mean(avg_lengths)), num_episodes)
        add_summary(writer, "tree/iters_sec", float(np.mean(iters_sec)),
                    num_episodes)
        add_histogram(writer, "tree/hit_probability", hit_probs, num_episodes)
        add_histogram(writer, "tree/rollout_length", avg_lengths, num_episodes)
        add_histogram(writer, "tree/entropies", entropies, num_episodes)
        add_summary(writer, "episode/mean_entropy", float(np.mean(entropies)),
                    num_episodes)
        add_summary(writer, "episode/reward", reward, num_episodes)
        add_summary(writer, "episode/length", len(trace), num_episodes)
        add_summary(writer, "rewards/agent_id", agent_id, num_episodes)
        for i in range(len(rewards)):
            add_summary(writer, "rewards/agent%d" % i, rewards[i],
                        num_episodes)
        add_summary(writer, "replay_memory/size", memory.size, num_episodes)
        add_summary(writer, "replay_memory/count", memory.count, num_episodes)
        add_summary(writer, "replay_memory/current", memory.current,
                    num_episodes)

        #print("Replay memory size: %d, count: %d, current: %d" % (memory.size, memory.count, memory.current))
        X, y, z = memory.dataset()
        assert len(X) != 0

        # reset weights?
        if args.reset_network:
            #model.set_weights(init_weights)
            model = model_from_json(model.to_json())
            model.compile(optimizer='adam',
                          loss=['categorical_crossentropy', 'mse'])
        # train for limited epochs to avoid overfitting?
        history = model.fit(X, [y, z],
                            batch_size=args.batch_size,
                            epochs=args.num_epochs,
                            callbacks=callbacks,
                            validation_split=args.validation_split)
        # log loss values
        for k, v in history.history.items():
            add_summary(writer, "training/" + k, v[-1], num_episodes)
        # share the updated weights with the runner processes
        shared_buffer.raw = pickle.dumps(model.get_weights(),
                                         pickle.HIGHEST_PROTOCOL)
        # save weights
        if num_episodes % args.save_interval == 0:
            model.save(os.path.join(logdir, "model_%d.hdf5" % num_episodes))
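
Both trainer loops rely on a ReplayMemory exposing add_sample, dataset, and size/count/current attributes. A minimal ring-buffer sketch of that interface, assumed for illustration (the project's own implementation may differ):

import numpy as np

class ReplayMemory:
    def __init__(self, capacity=100000):
        self.size = capacity   # maximum number of stored samples
        self.count = 0         # total samples added so far
        self.current = 0       # next write position in the ring buffer
        self._samples = []

    def add_sample(self, obs, pi, reward):
        # Store one (observation, policy target, outcome) triple,
        # overwriting the oldest entry once capacity is reached.
        if len(self._samples) < self.size:
            self._samples.append((obs, pi, reward))
        else:
            self._samples[self.current] = (obs, pi, reward)
        self.count += 1
        self.current = (self.current + 1) % self.size

    def dataset(self):
        # Arrays shaped for model.fit(X, [y, z]).
        obs, pi, z = zip(*self._samples)
        return np.array(obs), np.array(pi), np.array(z)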
Code example #10
File: MCTS.py  Project: AntonPotapchuk/playground
def trainer(num_iters, num_rollouts, model, writer, logdir):
    while num_iters < args.num_iters:
        print("################### ITERATION %d ###################" %
              (num_iters + 1))
        # Stats
        stat_tree_size = []
        stat_hit_probs = []
        stat_avg_lengths = []
        stat_entropies = []
        stat_reward_agent = [[], [], [], []]
        stat_episode_length = []

        memory = ReplayMemory()
        # -------Generate training set based on MCTS-------
        print("Generate dataset")
        # use spawn method for starting subprocesses
        # this seems to be more compatible with TensorFlow?
        ctx = multiprocessing.get_context('spawn')
        # create boolean to signal end
        finished = ctx.Value('i', 0)

        # create fifos and processes for all runners
        print("Creating child processes")
        fifos = []
        model_file, _ = get_model_path(args, logdir)
        for i in range(args.num_runners):
            fifo = ctx.Queue(1)
            fifos.append(fifo)
            process = ctx.Process(target=runner,
                                  args=(i, model_file, fifo, finished, args))
            process.start()

        for i in tqdm(range(num_rollouts)):
            while True:
                # pick random fifo (agent)
                fifo = random.choice(fifos)
                try:
                    # wait for a new trajectory and statistics
                    trace, reward, agent_id, hit_probs, avg_lengths, tree_size, entropies = fifo.get(
                        timeout=1)
                    break
                except Empty:
                    pass
            # save stats
            stat_tree_size.append(tree_size)
            stat_hit_probs.append(np.mean(hit_probs))
            stat_avg_lengths.append(np.mean(avg_lengths))
            stat_entropies.append(np.mean(entropies))
            stat_reward_agent[agent_id].append(reward)
            stat_episode_length.append(len(trace))
            # add samples to replay memory
            for obs, pi in trace:
                memory.add_sample(obs, pi, reward)

        # Kill subprocesses
        finished.value = 1
        print("Finishing")
        # empty queues until all child processes have exited
        while len(multiprocessing.active_children()) > 0:
            for i, fifo in enumerate(fifos):
                if not fifo.empty():
                    fifo.get_nowait()
        print("All childs was killed")
        # -------Train a model-------
        callbacks = [
            EarlyStopping(monitor='loss',
                          min_delta=0,
                          patience=5,
                          verbose=1,
                          mode='auto'),
            ModelCheckpoint(os.path.join(logdir, "model_%d.hdf5" % num_iters),
                            monitor='loss',
                            save_best_only=True),
            ReduceLROnPlateau(monitor='loss', patience=1, factor=0.1)
        ]
        add_summary(writer, "tree/mean_size", np.mean(stat_tree_size),
                    num_iters)
        try:
            add_summary(writer, "tree/mean_hit_prob",
                        float(np.mean(stat_hit_probs)), num_iters)
        except:
            pass
        add_summary(writer, "tree/mean_rollout_len",
                    float(np.mean(stat_avg_lengths)), num_iters)
        add_summary(writer, "episode/mean_entropy",
                    float(np.mean(stat_entropies)), num_iters)
        try:
            add_summary(writer, "episode/reward", np.mean(stat_reward_agent),
                        num_iters)
        except:
            pass
        add_summary(writer, "episode/length", np.mean(stat_episode_length),
                    num_iters)
        add_summary(writer, "rewards/agent_id", agent_id, num_iters)
        for i in range(len(stat_reward_agent)):
            try:
                add_summary(writer, "rewards/agent%d" % i,
                            np.mean(stat_reward_agent[i]), num_iters)
            except:
                pass

        X, y, z = memory.dataset()
        assert len(X) != 0

        # train for limited epochs to avoid overfitting?
        # TODO class weights??
        history = model.fit(X, [y, z],
                            batch_size=args.batch_size,
                            epochs=args.num_epochs,
                            callbacks=callbacks,
                            validation_split=args.validation_split,
                            shuffle=True)
        # log loss values
        for k, v in history.history.items():
            add_summary(writer, "training/" + k, v[-1], num_iters)
        num_iters += 1