Code Example #1
File: food_exp_ql.py  Project: Jontahan/kvad
        # Alternate an activation and a linear layer for each hidden size.
        for i in range(1, len(layer_sizes)):
            self.layers.append(nn.ReLU())
            self.layers.append(nn.Linear(layer_sizes[i - 1], layer_sizes[i]))

        self.nn = nn.Sequential(*self.layers)

    def forward(self, x):
        # Flatten the observation and feed it through the stacked layers.
        x = torch.flatten(x)
        return self.nn(x)

gamma = .95
alpha = .002

all_mean_diffs = []

all_states = env.get_all_states()
# Two identically configured tabular agents over the same environment.
ql_a = Qlearning(env, n_states=len(all_states), n_actions=env.action_space.n, plotter=plot, epsilon=1.0, epsilon_decay=lambda e, i: e * .998)
ql_b = Qlearning(env, n_states=len(all_states), n_actions=env.action_space.n, plotter=plot, epsilon=1.0, epsilon_decay=lambda e, i: e * .998)
        
for ne in range(0, 10):
    np.random.seed(10)
    num_episodes = 100 #* ne
    convergence_durations = []
    ql_agents = []
    for i in range(2):
        #print('Simulation {}/{}'.format(i, 50))
        
        ql = Qlearning(env, n_states=len(all_states), n_actions=env.action_space.n, plotter=plot, epsilon=1.0, epsilon_decay=lambda e, i: e * .998)
        #dqn = DQN(env, qnet=net, plotter=plot, render=True, memory_length=2000, gamma=gamma, alpha=alpha, epsilon_start=0.3, caching_interval=3000)

        for e in range(num_episodes):
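
The snippet breaks off at the episode loop, and the Qlearning class itself is not shown. As a rough, hypothetical sketch of the tabular update such an agent typically performs per step, using the gamma=.95 and alpha=.002 values defined above (the function names and the Q-table layout are assumptions, not taken from the project):

import numpy as np

def epsilon_greedy(Q, s, epsilon, n_actions, rng=np.random):
    # Explore with probability epsilon, otherwise act greedily on the row for state s.
    if rng.rand() < epsilon:
        return rng.randint(n_actions)
    return int(np.argmax(Q[s]))

def q_update(Q, s, a, r, s_next, done, gamma=0.95, alpha=0.002):
    # One-step TD target: r + gamma * max_a' Q(s', a'); no bootstrap on terminal states.
    target = r if done else r + gamma * np.max(Q[s_next])
    Q[s, a] += alpha * (target - Q[s, a])

The epsilon_decay=lambda e, i: e * .998 argument above suggests epsilon is multiplied by .998 after each episode, so exploration fades gradually over the 100-episode runs.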
Code Example #2
File: value_comp_test.py  Project: Jontahan/numedal
    his = 50
    # Plot a 50-episode moving average, left-padded with zeros so it
    # lines up with the raw episode durations.
    if len(durations_t) >= his:
        means = durations_t.unfold(0, his, 1).mean(1).view(-1)
        means = torch.cat((torch.zeros(his - 1), means))
        plt.plot(means.numpy(), c='green')

    plt.pause(0.001)


gamma = .95
alpha = .002

all_mean_diffs = []

all_states = env_a.get_all_states()
ql_a = Qlearning(env_a,
                 n_states=len(all_states),
                 n_actions=env_a.action_space.n,
                 plotter=plot,
                 epsilon=1.0,
                 epsilon_decay=lambda e, i: e * .998)
ql_b = Qlearning(env_b,
                 n_states=len(all_states),
                 n_actions=env_b.action_space.n,
                 plotter=plot,
                 epsilon=1.0,
                 epsilon_decay=lambda e, i: e * .998)

for ne in range(0, 20):
    #np.random.seed(10)
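
This snippet is also truncated inside the outer loop. The plotting helper above builds a 50-episode running mean with unfold and left-pads it with zeros so the averaged curve lines up with the raw durations; a small standalone check of that computation (the dummy tensor and the window size of 3 are made up for illustration):

import torch

durations_t = torch.arange(10, dtype=torch.float)       # dummy episode durations
his = 3                                                  # window size (50 in the snippet)
means = durations_t.unfold(0, his, 1).mean(1).view(-1)   # sliding-window averages, length 10 - his + 1
means = torch.cat((torch.zeros(his - 1), means))         # left-pad so the length matches the input
assert means.shape[0] == durations_t.shape[0]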
Code Example #3
import torch
import torch.nn as nn
import torch.nn.functional as F

class PolicyNet(nn.Module):
    def __init__(self):
        super(PolicyNet, self).__init__()
        self.fc1 = nn.Linear(64, 32)
        self.fc2 = nn.Linear(32, 16)
        self.fc3 = nn.Linear(16, 4)
        self.linreg = nn.Linear(64, 4)

    def forward(self, x):
        x = torch.flatten(x)
        # Early return: only the linear mapping is used, so the
        # fc1/fc2/fc3 path below is unreachable dead code.
        return self.linreg(x)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x


all_states = env_a.get_all_states() + env_b.get_all_states()
    

#ql_a = Qlearning(env_a, n_states=len(all_states), n_actions=env_a.action_space.n, plotter=plot, epsilon=1.0, epsilon_decay=lambda e, i: e * .998)
#ql_b = Qlearning(env_b, n_states=len(all_states), n_actions=env_b.action_space.n, plotter=plot, epsilon=1.0, epsilon_decay=lambda e, i: e * .998)
# One DQN agent per environment, each with its own PolicyNet but the same hyperparameters.
dqn_a = DQN(env_a, qnet=PolicyNet().double(), plotter=None, render=False, memory_length=2000, gamma=.99, alpha=.001, epsilon_start=0.1)
dqn_b = DQN(env_b, qnet=PolicyNet().double(), plotter=None, render=False, memory_length=2000, gamma=.99, alpha=.001, epsilon_start=0.1)
#dqn.train(2000, 4, plot=True, verbose=True)

for ne in range(0, 30):
    #np.random.seed(10)
    num_episodes = 100 #* ne
    convergence_durations = []
    ql_agents = []
    
    dqn_a.train(num_episodes, 4, plot=False)
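
Assuming the PolicyNet definition above, a quick shape check of its forward pass; the 8x8 observation shape here is an assumption (only the 64 flattened inputs and 4 outputs are implied by the layer sizes):

import torch

net = PolicyNet().double()
dummy_obs = torch.zeros(8, 8, dtype=torch.double)  # hypothetical board-shaped observation
q_values = net(dummy_obs)                          # flattened to 64 values, mapped to 4 action values
print(q_values.shape)                              # torch.Size([4])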