Example 1
    def _log_best_network_env_info(self,
                                   net,
                                   summary_writer,
                                   env,
                                   test_env,
                                   gen=1):
        # train
        infos = self.best_network.get_env_info(env, False)
        print('Training Data (best) ->> ', infos)
        for key in infos:
            summary_writer.add_summary(tfSummary(key, float(infos[key])),
                                       global_step=gen)
        infos = net.get_env_info(env, False)
        print('Training Data (latest) ->> ', infos)

        # Test
        if test_env:
            infos = self.best_network.get_env_info(test_env, False)
            print('Test Data (best)->> ', infos)
            for key in infos:
                summary_writer.add_summary(tfSummary('test_' + key,
                                                     float(infos[key])),
                                           global_step=gen)
            infos = net.get_env_info(test_env, False)
            print('Test Data (latest)->> ', infos)
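
All of the examples on this page log scalars through a tfSummary helper passed to summary_writer.add_summary. Its implementation is not shown in the source; the following is a minimal sketch, assuming the TensorFlow 1.x Summary protobuf API.

import tensorflow as tf

def tfSummary(tag, val):
    # Wrap a scalar value in a tf.Summary protobuf so a TF 1.x summary
    # FileWriter can log it via add_summary(summary, global_step).
    return tf.Summary(value=[tf.Summary.Value(tag=tag, simple_value=float(val))])
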
Example 2
    def train(self, env, args, summary_writer):
        results = []

        # First, gather experience
        # tqdm_e = tqdm(range(args.nb_episodes), desc='Score', leave=True, unit=" episodes")
        # for e in tqdm_e:
        for e in range(args.nb_episodes):
            # Reset episode
            time, cumul_reward, done = 0, 0, False
            old_state = env.reset()
            actions, states, rewards = [], [], []
            noise = OrnsteinUhlenbeckProcess(size=self.act_dim)

            while not done:
                if args.render: env.render()
                # env.render()
                # Actor picks an action (following the deterministic policy)

                a = self.policy_action(old_state)
                # Clip continuous values to be valid w.r.t. environment
                a = np.clip(a + noise.generate(time), -self.act_range,
                            self.act_range)
                # Retrieve new state, reward, and whether the state is terminal
                new_state, r, done, _ = env.step(a)
                # Add outputs to memory buffer
                self.memorize(old_state, a, r, done, new_state)
                # Sample experience from buffer
                states, actions, rewards, dones, new_states, _ = self.sample_batch(
                    args.batch_size)
                # Predict target q-values using target networks
                q_values = self.critic.target_predict(
                    [new_states,
                     self.actor.target_predict(new_states)])
                # Compute critic target
                critic_target = self.bellman(rewards, q_values, dones)
                # Train both networks on sampled batch, update target networks
                self.update_models(states, actions, critic_target)
                # Update current state
                old_state = new_state
                cumul_reward += r
                time += 1

            # Gather stats every episode for plotting
            if (args.gather_stats):
                mean, stdev = gather_stats(self, env)
                results.append([e, mean, stdev])

            # Export results for Tensorboard
            score = tfSummary('score', cumul_reward)
            summary_writer.add_summary(score, global_step=e)
            summary_writer.flush()

            print("score", score)

            # Display score
            # tqdm_e.set_description("Score: " + str(cumul_reward))
            # tqdm_e.refresh()

        return results
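
Example 2 draws its exploration noise from an OrnsteinUhlenbeckProcess whose definition is not included. Below is a minimal sketch; the parameter values (theta, mu, sigma) are conventional defaults and an assumption, not taken from the source.

import numpy as np

class OrnsteinUhlenbeckProcess:
    """Temporally correlated Gaussian noise for continuous-action exploration."""

    def __init__(self, size, theta=0.15, mu=0.0, sigma=0.2):
        self.size, self.theta, self.mu, self.sigma = size, theta, mu, sigma
        self.x_prev = np.ones(size) * mu

    def generate(self, t):
        # t is accepted to match the call above; this sketch keeps internal
        # state between calls instead of annealing with t.
        dx = self.theta * (self.mu - self.x_prev) + self.sigma * np.random.randn(self.size)
        self.x_prev = self.x_prev + dx
        return self.x_prev
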
Example 3
    def train(self, env, args, summary_writer):
        """ Main DDQN Training Algorithm
        """

        results = []
        tqdm_e = tqdm(range(args.nb_episodes),
                      desc='Score',
                      leave=True,
                      unit=" episodes")

        for e in tqdm_e:
            # Reset episode
            time, cumul_reward, done = 0, 0, False
            old_state = env.reset()

            while not done:
                if args.render: env.render()
                # Actor picks an action (following the policy)
                a = self.policy_action(old_state)
                # print('action ', a)
                # Retrieve new state, reward, and whether the state is terminal
                new_state, r, done, _ = env.step(a)
                # print('reward', r)
                # Memorize for experience replay
                self.memorize(old_state, a, r, done, new_state)
                # Update current state
                old_state = new_state
                cumul_reward += r
                time += 1
                # Train DDQN and transfer weights to target network
                if (self.buffer.size() > args.batch_size):
                    # print('train agent')
                    self.train_agent(args.batch_size)
                    self.agent.transfer_weights()

            # print('memory buffer:', self.buffer.size())
            # Gather stats every episode for plotting
            if (args.gather_stats):
                mean, stdev = gather_stats(self, env)
                results.append([e, mean, stdev])

            # Export results for Tensorboard
            score = tfSummary('score', cumul_reward)
            summary_writer.add_summary(score, global_step=e)
            summary_writer.flush()

            # Display score
            tqdm_e.set_description("Score: " + str(cumul_reward))
            tqdm_e.refresh()
            # print('e', e)
            if (e % self.save_interval == 0) and (e != 0):
                # print('save')
                self.save_weights(self.export_path, e)

        return results
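
Several examples call gather_stats(self, env) when args.gather_stats is set. A minimal sketch of that helper, assuming it runs a handful of greedy evaluation episodes (the count of 10 is an assumption):

import numpy as np

def gather_stats(agent, env):
    # Run a few evaluation episodes with the current policy and return the
    # mean and standard deviation of the episode scores.
    scores = []
    for _ in range(10):
        state, cumul_r, done = env.reset(), 0.0, False
        while not done:
            action = agent.policy_action(state)
            state, r, done, _ = env.step(action)
            cumul_r += r
        scores.append(cumul_r)
    return np.mean(scores), np.std(scores)
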
Example 4
    def train(self, env, args, summary_writer):
        """ Main A2C Training Algorithm
        """

        results = []

        # Main Loop
        tqdm_e = tqdm(range(args.nb_episodes),
                      desc='Score',
                      leave=True,
                      unit=" episodes")
        for e in tqdm_e:

            # Reset episode
            time, cumul_reward, done = 0, 0, False
            old_state = env.reset()
            actions, states, rewards = [], [], []

            while not done:
                if args.render: env.render()
                # Actor picks an action (following the policy)
                a = self.policy_action(old_state)
                # Retrieve new state, reward, and whether the state is terminal
                new_state, r, done, _ = env.step(a)
                # Memorize (s, a, r) for training
                actions.append(to_categorical(a, self.act_dim))
                rewards.append(r)
                states.append(old_state)
                # Update current state
                old_state = new_state
                cumul_reward += r
                time += 1

            # Train using discounted rewards ie. compute updates
            self.train_models(states, actions, rewards, done)

            # Gather stats every episode for plotting
            if (args.gather_stats):
                mean, stdev = gather_stats(self, env)
                results.append([e, mean, stdev])

            # Export results for Tensorboard
            score = tfSummary('score', cumul_reward)
            summary_writer.add_summary(score, global_step=e)
            summary_writer.flush()

            # Display score
            tqdm_e.set_description("Score: " + str(cumul_reward))
            tqdm_e.refresh()

        return results
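
train_models in Example 4 is expected to turn the collected rewards into discounted returns before computing the actor and critic updates. A minimal sketch of that computation, with gamma as an assumed discount factor:

import numpy as np

def discount(rewards, gamma=0.99):
    # Backward pass: G_t = r_t + gamma * G_{t+1}
    discounted, running = np.zeros(len(rewards)), 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        discounted[t] = running
    return discounted
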
Example 5
    def train(self, env, args, summary_writer):
        """ Main DDQN Training Algorithm                DDQN主要训练算法
        """

        results = []
        tqdm_e = tqdm(range(args.nb_episodes),
                      desc='Score',
                      leave=True,
                      unit=" episodes")

        for e in tqdm_e:
            # Reset episode
            time, cumul_reward, done = 0, 0, False
            old_state = env.reset()

            while not done:
                if args.render: env.render()
                # Actor picks an action (following the policy)
                a = self.policy_action(old_state)
                # Retrieve new state, reward, and whether the state is terminal
                new_state, r, done, _ = env.step(a)
                # Memorize for experience replay
                self.memorize(old_state, a, r, done, new_state)
                # Update current state
                old_state = new_state
                cumul_reward += r
                time += 1
                # Train DDQN and transfer weights to target network
                if (self.buffer.size() > args.batch_size):
                    self.train_agent(args.batch_size)
                    self.agent.transfer_weights()

            # Gather stats every episode for plotting
            if (args.gather_stats):
                mean, stdev = gather_stats(self, env)
                results.append([e, mean, stdev])

            # Export results for Tensorboard
            score = tfSummary('score', cumul_reward)
            summary_writer.add_summary(score, global_step=e)
            summary_writer.flush()

            # Display score
            tqdm_e.set_description("Score: " + str(cumul_reward))
            tqdm_e.refresh()

        return results
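
Examples 3, 5 and 7 end each training step with agent.transfer_weights(), which copies the online network into the target network. A minimal sketch assuming Keras-style models and a soft (Polyak) update; tau is an assumed hyperparameter:

def transfer_weights(model, target_model, tau=0.01):
    # Soft update: target <- tau * online + (1 - tau) * target
    weights, target_weights = model.get_weights(), target_model.get_weights()
    target_model.set_weights([tau * w + (1.0 - tau) * tw
                              for w, tw in zip(weights, target_weights)])
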
Example 6
def training_thread(agent, Nmax, env, action_dim, f, summary_writer, tqdm,
                    render):
    """ Build threads to run shared computation across
    """

    global episode
    while episode < Nmax:

        # Reset episode
        time, cumul_reward, done = 0, 0, False
        old_state = env.reset()
        actions, states, rewards = [], [], []
        while not done and episode < Nmax:
            if render:
                with lock:
                    env.render()
            # Actor picks an action (following the policy)
            a = agent.policy_action(np.expand_dims(old_state, axis=0))
            # Retrieve new state, reward, and whether the state is terminal
            new_state, r, done, _ = env.step(a)
            # Memorize (s, a, r) for training
            actions.append(to_categorical(a, action_dim))
            rewards.append(r)
            states.append(old_state)
            # Update current state
            old_state = new_state
            cumul_reward += r
            time += 1
            # Asynchronous training
            if (time % f == 0 or done):
                lock.acquire()
                agent.train_models(states, actions, rewards, done)
                agent.global_rewards.append(cumul_reward)
                lock.release()
                actions, states, rewards = [], [], []

        # Export results for Tensorboard
        score = tfSummary('score', cumul_reward)
        summary_writer.add_summary(score, global_step=episode)
        summary_writer.flush()
        # Update episode count
        with lock:
            tqdm.set_description("Score: " + str(cumul_reward))
            tqdm.update(1)
            if (episode < Nmax):
                episode += 1
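
training_thread relies on a module-level lock and episode counter and is meant to be run by several workers at once. A minimal sketch of that wiring follows; the make_env factory and the thread count are assumptions, not part of the source.

import threading

lock = threading.Lock()   # guards rendering, model updates, and the episode counter
episode = 0               # shared episode count incremented by every worker

def start_workers(agent, Nmax, make_env, action_dim, f, summary_writer, tqdm_bar,
                  render=False, n_threads=4):
    # make_env is a hypothetical factory returning a fresh environment per worker
    threads = [threading.Thread(target=training_thread,
                                args=(agent, Nmax, make_env(), action_dim, f,
                                      summary_writer, tqdm_bar, render))
               for _ in range(n_threads)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
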
Example 7
    def train(self, env, args, summary_writer, envtest=None):
        """ Main DDQN Training Algorithm
        """

        results = []
        tqdm_e = tqdm(range(args.nb_episodes),
                      desc='Score',
                      leave=True,
                      unit=" episodes")
        epoch = 0
        gross_profit = 0
        WritetoCsvFile("logFile_1.csv", [
            "stage", "file", "history_win", "stop", "usevol", "dueling",
            "traineval", "allprices", "allprices2", "allprices3", "ma1", "ma2",
            "madifference", "hidema", "candlenum", "hidden_dim", "maxProfit",
            "maxLOSS", "avgProfit", "avgLOSS", "countprofit", "countloss",
            "maxdrop", "Total profit", "total_reward", "TRADES", "epoch"
        ])
        WritetoCsvFile("logFileDetail.csv", [
            "stage", "file", "history_win", "stop", "usevol", "dueling",
            "traineval", "allprices", "allprices2", "allprices3", "ma1", "ma2",
            "madifference", "hidema", "candlenum", "hidden_dim", 'maxProfit',
            'maxLOSS', 'avgProfit', 'avgLOSS', 'maxdrop', 'Total profit',
            'gross profit', "total_reward", 'TRADES', 'epoch'
        ])

        for e in tqdm_e:
            # Reset episode
            time, cumul_reward, done = 0, 0, False
            old_state = env.reset()
            ##########################################
            total_reward = 0
            total_profit = 0
            total_loss = 0
            total_profitMax = 0
            total_profitMin = 0
            max_drop = 0
            profitLst = []
            lossLst = []
            trades = 0
            step = 0
            ##########################################

            while not done:
                #if args.render: env.render()
                # Actor picks an action (following the policy)
                a = self.policy_action(old_state)
                # Retrieve new state, reward, and whether the state is terminal
                #new_state, r, done, _ = env.step(a)

                #######################################################
                new_state, r, done, buy, sell, profit = env.step(a)

                total_reward += r
                if profit != 0:
                    trades += 1
                    total_profit += profit
                    if total_profit > total_profitMax:
                        total_profitMax = total_profit
                        total_profitMin = total_profit
                    if total_profit < total_profitMin:
                        total_profitMin = total_profit
                        try:
                            if total_profitMax != 0 and max_drop < (
                                    total_profitMax -
                                    total_profitMin) / total_profitMax:
                                max_drop = (total_profitMax -
                                            total_profitMin) / total_profitMax
                        except:
                            max_drop = 0

                if profit > 0:
                    profitLst.append(profit)
                elif profit < 0:
                    lossLst.append(profit)

                step += 1
                if step % 1500 == 0:
                    print(
                        'maxProfit: {} maxLOSS: {} avgProfit: {:01.2f} avgLOSS: {:01.2f} maxdrop: {:.2%} Total profit: {}/{} TRADES: {}  '
                        .format(np.max(profitLst + [0]),
                                -np.min(lossLst + [0]),
                                np.mean(profitLst + [0]),
                                -np.mean(lossLst + [0]), max_drop,
                                total_profit, gross_profit, trades))

                    WritetoCsvFile("logFileDetail.csv", [
                        "train", args.trainf, args.history_win, args.stop,
                        args.usevol, args.dueling, args.traineval,
                        args.allprices, args.allprices2, args.allprices3,
                        args.ma1, args.ma2, args.madifference, args.hidema,
                        args.candlenum, args.hidden_dim,
                        np.max(profitLst + [0]), -np.min(lossLst + [0]),
                        np.mean(profitLst + [0]), -np.mean(lossLst + [0]),
                        max_drop, total_profit, gross_profit, total_reward,
                        trades, epoch
                    ])
                #done = True if step == len(env.data) - 3 else False
                ######################################################
                # Memorize for experience replay
                self.memorize(old_state, a, r, done, new_state)
                # Update current state
                old_state = new_state
                cumul_reward += r
                time += 1
                # Train DDQN and transfer weights to target network
                if (self.buffer.size() > args.batch_size):
                    self.train_agent(args.batch_size)
                    self.agent.transfer_weights()

            gross_profit += total_profit
            # Gather stats every episode for plotting
            if (args.gather_stats):
                mean, stdev = gather_stats(self, env)
                results.append([e, mean, stdev])

            # Export results for Tensorboard
            score = tfSummary('score', cumul_reward)
            l_profit = tfSummary('profit', total_profit)
            l_aprofit = tfSummary('average profit', np.mean(profitLst + [0]))
            l_aloss = tfSummary('l_aloss', -np.mean(lossLst + [0]))
            l_trades = tfSummary('l_trades', trades)
            summary_writer.add_summary(score, global_step=e)
            summary_writer.add_summary(l_profit, global_step=e)
            summary_writer.add_summary(l_aprofit, global_step=e)
            summary_writer.add_summary(l_aloss, global_step=e)
            summary_writer.add_summary(l_trades, global_step=e)
            summary_writer.flush()

            # Display score
            tqdm_e.set_description("Score: " + str(cumul_reward))
            tqdm_e.refresh()
            self.agent.saveModel("./models/model_ep", "")
            results = [
                np.max(profitLst + [0]), -np.min(lossLst + [0]),
                np.mean(profitLst + [0]), -np.mean(lossLst + [0]),
                len(profitLst),
                len(lossLst), max_drop, total_profit, total_reward, trades
            ]

            WritetoCsvFile("logFile_1.csv", [
                "train", args.trainf, args.history_win, args.stop, args.usevol,
                args.dueling, args.traineval, args.allprices, args.allprices2,
                args.allprices3, args.ma1, args.ma2, args.madifference,
                args.hidema, args.candlenum, args.hidden_dim
            ] + results + [epoch])
            if envtest:  # If a test environment is given, evaluate after every epoch
                newargs = args
                newargs.traineval = False
                self.evaluate(envtest,
                              newargs,
                              summary_writer,
                              model=None,
                              epoch=epoch)

            epoch += 1
        return results
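
Example 7 logs its per-episode statistics through WritetoCsvFile, whose definition is not shown. A minimal sketch, assuming it simply appends one row to a CSV file:

import csv

def WritetoCsvFile(filename, row):
    # Append a single row of values to the given CSV file.
    with open(filename, 'a', newline='') as f:
        csv.writer(f).writerow(row)
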
Example 8
    def train(self, summary_writer):
        env = CarEnv()
        results = []
        i = 0
        # First, gather experience
        tqdm_e = tqdm(range(2000), desc='Score', leave=True, unit=" episodes")
        for e in tqdm_e:
            # Reset episode
            time, cumul_reward, done = 0, 0, False
            old_state = env.reset()
            old_state = np.array(old_state).reshape(40, )
            actions, states, rewards = [], [], []
            noise = OrnsteinUhlenbeckProcess(size=self.act_dim)

            while not done:
                # if args.render: env.render()
                # Actor picks an action (following the deterministic policy)
                a = self.policy_action(old_state)
                # Clip continuous values to be valid w.r.t. environment
                a = np.clip(a + noise.generate(time), -self.act_range,
                            self.act_range)
                a = float(a[0])
                # Retrieve new state, reward, and whether the state is terminal
                new_state, r, done, _ = env.step(a, time)
                print("Now r is {}".format(r))
                # Add outputs to memory buffer
                temp_next = old_state.copy()
                temp_next[:4] = temp_next[4:8]
                temp_next[4:8] = temp_next[8:12]
                temp_next[8:12] = temp_next[12:16]
                temp_next[12:16] = temp_next[16:20]
                temp_next[16:20] = temp_next[20:24]
                temp_next[20:24] = temp_next[24:28]
                temp_next[24:28] = temp_next[28:32]
                temp_next[28:32] = temp_next[32:36]
                temp_next[32:36] = temp_next[36:40]
                temp_next[36:40] = new_state
                temp_next = np.array(temp_next).reshape(40, )
                self.memorize(old_state, a, r, done, temp_next)
                old_state = temp_next.copy()
                cumul_reward += r
                time += 1

            # since episode is over destroying actors in the scenario
            for actor in env.actor_list:
                actor.destroy()
            # Sample experience from buffer; use a throwaway loop variable so the
            # episode counter i (used for the periodic CSV dump below) is not clobbered
            for _ in range(50):
                states, actions, rewards, dones, new_states, _ = self.sample_batch(
                    64)
                # Predict target q-values using target networks
                q_values = self.critic.target_predict(
                    [new_states,
                     self.actor.target_predict(new_states)])
                # Compute critic target
                critic_target = self.bellman(rewards, q_values, dones)
                # Train both networks on sampled batch, update target networks
                self.update_models(states, actions, critic_target)
                print("learning happened")

            # mean, stdev, velocity_data, action_data, vehicle_x_data, vehicle_y_data, obj_x_data, obj_y_data = gather_stats(self, env, velocity_data, action_data, vehicle_x_data, vehicle_y_data, obj_x_data, obj_y_data)
            mean, stdev = gather_stats(self, env)
            results.append([e, mean, stdev])

            # Export results for Tensorboard
            print(cumul_reward)
            score = tfSummary('score', cumul_reward)
            summary_writer.add_summary(score, global_step=e)
            summary_writer.flush()
            # Display score
            tqdm_e.set_description("Score: " + str(cumul_reward))
            tqdm_e.refresh()
            i += 1
            if i % 10 == 0:
                df = pd.DataFrame(np.array(results))
                df.to_csv("DDPG" + "/logs.csv",
                          header=['Episode', 'Mean', 'Stddev'],
                          float_format='%10.5f')

        return results
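
Examples 2 and 8 compute the critic target with self.bellman(rewards, q_values, dones). A minimal sketch of that backup, with gamma as an assumed discount factor:

import numpy as np

def bellman(rewards, q_values, dones, gamma=0.99):
    # y = r + gamma * Q_target(s', mu_target(s')) for non-terminal transitions, else y = r
    rewards = np.asarray(rewards, dtype=float)
    q_values = np.asarray(q_values, dtype=float).flatten()
    dones = np.asarray(dones, dtype=bool)
    return rewards + gamma * q_values * (~dones)
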
Example 9
    def fit(self,
            env,
            summary_writer,
            debug=False,
            num_cpus=4,
            is_market=False,
            env_args={},
            test_env_args=None,
            env_version='v1'):
        stagnation = 1
        best_so_far = 0

        # Init test env
        test_env = None
        if env_version == 'v1':
            test_env = MarketEnvironmentV1(
                **test_env_args) if test_env_args else None
        if env_version == 'v2':
            test_env = MarketEnvironmentV2(
                **test_env_args) if test_env_args else None

        envs = []

        # Create environments for the whole population
        if is_market:
            if env_version == 'v1':
                envs = [
                    MarketEnvironmentV1(**env_args)
                    for i in range(self.population_size)
                ]
            if env_version == 'v2':
                envs = [
                    MarketEnvironmentV2(**env_args)
                    for i in range(self.population_size)
                ]
        else:
            envs = [
                Environment(**env_args) for i in range(self.population_size)
            ]

        # Iterating over all generations
        tqdm_e = tqdm(total=self.generations,
                      desc='Generation',
                      leave=True,
                      unit=" gen")
        for gen_i in range(self.generations):

            # Doing our evaluations
            args = [(self, self.networks[i], envs[i])
                    for i in range(self.population_size)]
            with Pool(num_cpus) as p:
                rewards = np.array(p.map(_run_par_evaluate, args))

            # Tracking best score per generation
            self.fitness.append(np.max(rewards))

            # Selecting the best network
            best_network = np.argmax(rewards)

            # Selecting top n networks
            n = int(self.survival_ratio * self.population_size)
            top_n_index = np.argsort(rewards)[-n:]

            # Creating our child networks
            new_networks = []
            for _ in range(self.population_size - n):
                # origin: 0 -> child from two parents, 1 -> child from one parent,
                # 2 -> reuse another network from the previous generation
                origin = np.random.choice([0, 1, 2],
                                          p=[
                                              self.both_parent_percentage,
                                              self.one_parent_percentage,
                                              1 - self.both_parent_percentage -
                                              self.one_parent_percentage
                                          ])

                # both parents
                if origin == 0:
                    new_net = NeuralNet(parent1=self.networks[random.randint(
                        0,
                        len(top_n_index) - 1)],
                                        parent2=self.networks[random.randint(
                                            0,
                                            len(top_n_index) - 1)],
                                        var=self.mutation_variance)
                # One parent
                elif origin == 1:
                    new_net = NeuralNet(parent1=self.networks[random.randint(
                        0,
                        len(top_n_index) - 1)],
                                        parent2=None,
                                        var=self.mutation_variance)
                else:
                    # Copy another network from the previous generation,
                    # excluding the chosen top-n survivors
                    index = random.randint(0, len(self.networks) - 1)
                    while index in top_n_index:
                        index = random.randint(0, len(self.networks) - 1)
                    new_net = self.networks[index]

                new_networks.append(new_net)

            # Setting our new networks
            maintain_best_n = [self.networks[i] for i in top_n_index]
            self.networks = maintain_best_n + new_networks

            # Export results for Tensorboard
            r_max = rewards.max()
            r_mean = rewards.mean()
            r_std = rewards.std()
            self.insert_info(r_max, r_mean, r_std)
            summary_writer.add_summary(tfSummary('Max rewards', r_max),
                                       global_step=gen_i)
            summary_writer.add_summary(tfSummary('Mean rewards', r_mean),
                                       global_step=gen_i)
            summary_writer.add_summary(tfSummary('STD rewards', r_std),
                                       global_step=gen_i)

            # Update stagnation
            if r_max > best_so_far:
                best_so_far = r_max
                stagnation = 1
            else:
                stagnation += 1

            # Update tqdm
            tqdm_e.set_description('Generation:' + str(gen_i + 1) +
                                   '| Highest Reward:' + str(r_max) +
                                   '| Average Reward:' + str(r_mean) +
                                   '| std Reward: ' + str(r_std) +
                                   '| Stagnation: ' + str(stagnation) +
                                   '| Population size: ' +
                                   str(len(self.networks)))

            # Save current weights
            self.best_network = self.networks[best_network]
            if debug:
                self._log_best_network_env_info(maintain_best_n[0],
                                                summary_writer, envs[0],
                                                test_env, gen_i)
            self.save_weights(gen_i, maintain_best_n[0], self.save_path)

            # Update logs
            summary_writer.flush()
            tqdm_e.update(1)
            tqdm_e.refresh()

            # If training has stagnated for too long, stop early
            if stagnation > 10 and self.stagnation_end: break

        # Close the environments
        [e.close() for e in envs]

        # Returning the best network
        self.best_network = self.networks[best_network]

        return self.global_info
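
fit() evaluates the population in parallel through _run_par_evaluate, which is not shown. A minimal sketch under the assumption that each network is scored by the reward of a single episode; network.predict is an assumed forward-pass API:

def _run_par_evaluate(args):
    # args is the (trainer, network, env) tuple built in fit(); return the
    # episode reward obtained by running the network's policy in its env.
    _trainer, network, env = args
    state, total_reward, done = env.reset(), 0.0, False
    while not done:
        action = network.predict(state)   # assumed forward-pass API
        state, r, done, _ = env.step(action)
        total_reward += r
    return total_reward
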
Example 10
    def train(self, env, args, summary_writer):
        """ Main A2C Training Algorithm
        """
        # self.pretrain_random(env, args, summary_writer)
        results = []
        possible_states = [np.asarray(0), np.asarray(1)]

        # Main Loop
        tqdm_e = tqdm(range(args.nb_episodes),
                      desc='Score',
                      leave=True,
                      unit=" episodes")
        for e in tqdm_e:

            # Reset episode
            time, cumul_reward, done = 0, 0, False
            old_state = env.reset()
            actions, states, rewards = [], [], []

            while not done:
                if args.render: env.render()
                # if (e%64==1)&(e>30):
                #     if args.render: env.render()
                # Actor picks an action (following the policy)

                if e < 30:
                    a = [
                        random.choice(possible_states),
                        random.choice(possible_states),
                        random.choice(possible_states),
                        random.choice(possible_states)
                    ]
                elif np.random.rand() < 0.5:
                    a = [
                        random.choice(possible_states),
                        random.choice(possible_states),
                        random.choice(possible_states),
                        random.choice(possible_states)
                    ]
                else:
                    a = self.policy_action(old_state, e)
                #feedforward
                # Retrieve new state, reward, and whether the state is terminal
                new_state, r, done, _ = env.step(a)

                #self.c_opt([states, discounted_rewards])

                # print(novelty)

                # Memorize (s, a, r) for training
                # actions.append(to_categorical(a, self.act_dim))
                actions.append(a)
                states.append(old_state)

                # compute the novelty
                # print(self.env_dim[1])
                last_state = states[-1].reshape((1, 4, self.env_dim[1]))
                novelty = 0
                # novelty=self.rnd_opt([last_state,last_state])[0]
                rewards.append(r + 0.0001 * novelty)
                # Update current state
                old_state = new_state
                cumul_reward += r + 0.0001 * novelty
                time += 1

            # Train using discounted rewards ie. compute updates
            self.her.add(states, np.asarray(actions), rewards)
            # print(np.asarray(states).shape, np.asarray(actions).shape, np.asarray(rewards).shape)
            # only update every 10 episodes?
            if e > 24:
                for item in self.her.sample():
                    states, actions, rewards, completed = item
                    # print(np.asarray(states).shape, np.asarray(actions).shape, np.asarray(rewards).shape)
                    states = np.asarray(states)[-min(1000, len(rewards)):]
                    actions = np.asarray(actions)[-min(1000, len(rewards)):]
                    rewards = np.asarray(rewards)[-min(1000, len(rewards)):]
                    self.train_models(states, actions, rewards, completed)
                # try:
                #     for item in self.her.sample():
                #         states, actions, rewards, completed=item
                #         print(np.asarray(states).shape, np.asarray(actions).shape, np.asarray(rewards).shape)
                #         states=np.asarray(states)[-max(1000, len(rewards)):]
                #         actions=np.asarray(actions)[-max(1000, len(rewards)):]
                #         rewards=np.asarray(rewards)[-max(1000, len(rewards)):]
                #         self.train_models(states, actions, rewards, completed)
                # except:
                #     print('error training critic')
                #     for item in self.her.sample():
                #         states, actions, rewards, completed=item
                #         self.train_models(states, np.asarray(actions), rewards, done)
                #     # self.train_models(states, np.asarray(actions), rewards, done)

            # Gather stats every episode for plotting
            if (args.gather_stats):
                mean, stdev = gather_stats(self, env)
                results.append([e, mean, stdev])

            # Export results for Tensorboard
            score = tfSummary('score', cumul_reward)
            summary_writer.add_summary(score, global_step=e)
            summary_writer.flush()

            # Display score
            tqdm_e.set_description("Score: " + str(cumul_reward))
            tqdm_e.refresh()

        return results
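
Example 10 stores whole episodes in self.her and replays them with self.her.sample(). The class is not shown; below is a minimal sketch of an episode buffer with that interface. The capacity, the sample size, and the completed flag default are assumptions.

import random

class EpisodeBuffer:
    """Stores full episodes; sample() yields (states, actions, rewards, completed) tuples."""

    def __init__(self, max_episodes=200):
        self.episodes, self.max_episodes = [], max_episodes

    def add(self, states, actions, rewards, completed=True):
        # Keep only the most recent max_episodes episodes
        self.episodes.append((states, actions, rewards, completed))
        self.episodes = self.episodes[-self.max_episodes:]

    def sample(self, k=4):
        return random.sample(self.episodes, min(k, len(self.episodes)))
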
Example 11
    def pretrain_random(self,
                        env,
                        args,
                        summary_writer,
                        train_steps=200,
                        env_steps=100):
        """
        Generate a somewhat random output so that the agent explores.
        """
        results = []

        # Main Loop
        tqdm_e = tqdm(range(train_steps),
                      desc='pretrain',
                      leave=True,
                      unit=" episodes")
        for e in tqdm_e:

            # Reset episode
            time, cumul_reward, done = 0, 0, False
            old_state = env.reset()
            actions, states, rewards = [], [], []
            old_a = np.asarray(np.zeros_like(self.policy_action(old_state, e)))

            while not done:
                # if args.render: env.render()
                # Actor picks an action (following the policy)
                a = self.policy_action(old_state, e)
                # if np.random.rand()<0.1:
                #     print(a)
                # print(a)
                #feedforward
                # Retrieve new state, reward, and whether the state is terminal
                new_state, _, done, _ = env.step(a)

                r = np.random.choice(
                    ((np.asarray(a).reshape(-1) - old_a.reshape(-1))**2)[:2])
                old_a = np.asarray(a)
                #self.c_opt([states, discounted_rewards])

                # Memorize (s, a, r) for training
                actions.append(to_categorical(a, self.act_dim))
                rewards.append(r)
                states.append(old_state)

                # compute the novelty
                last_state = states[-1].reshape((1, 4, 4))
                novelty = self.rnd_opt([last_state, last_state])

                # Update current state
                old_state = new_state
                cumul_reward += r
                time += 1

            # Train using discounted rewards ie. compute updates
            try:
                self.train_models(states, np.asarray(actions), rewards, done)
            except:
                print('error training critic')
                self.train_models(states, np.asarray(actions), rewards, done)

            # Gather stats every episode for plotting
            if (args.gather_stats):
                mean, stdev = gather_stats(self, env)
                results.append([e, mean, stdev])

            # Export results for Tensorboard
            score = tfSummary('score', cumul_reward)
            summary_writer.add_summary(score, global_step=e)
            summary_writer.flush()

            # Display score
            tqdm_e.set_description("Score:{}, Nov.: {}".format(
                str(cumul_reward), novelty))
            tqdm_e.refresh()
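
Examples 10 and 11 mix a novelty bonus from self.rnd_opt into the reward, which looks like Random Network Distillation. A minimal sketch of that idea, assuming Keras-style predictor/target models; the networks themselves are not defined in the source.

import numpy as np

def rnd_novelty(predictor, target, state):
    # Novelty = prediction error of a trainable network against a frozen,
    # randomly initialized target network; unfamiliar states give larger errors.
    pred = predictor.predict(state)
    tgt = target.predict(state)
    return float(np.mean((pred - tgt) ** 2))
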