def step(policy, positions, hidden=None):
    directions = [get_valid_directions(p) for p in positions]
    batch = to_batch(directions)
    if hidden is not None:
        policies, hidden = policy(batch, hidden)
    else:
        policies, hidden = policy(batch)
    # Sample actions (indices) from the current policy
    distributions = torch.distributions.Categorical(policies)
    actions = distributions.sample()
    probs = distributions.log_prob(actions)
    # Translate the actions into strings (left, up, ...)
    actions = [direction_strings[index.item()] for index in actions]
    rewards = torch.zeros((batch_size,), device=device)
    next_positions = []
    # Execute the actions and store the feedback
    for i, (action, position) in enumerate(zip(actions, positions)):
        next_position, reward = move(action, position)
        rewards[i] = reward
        next_positions.append(next_position)
    return next_positions, probs, rewards, hidden
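# A minimal usage sketch (an assumption, not from the source): rolling out the
# recurrent variant of step() for `length` timesteps, threading the hidden
# state through each call. `policy`, `length`, `batch_size`, and the initial
# `positions` are taken from the surrounding training setup.
def rollout(policy, positions, length):
    hidden = None
    all_probs, all_rewards = [], []
    for _ in range(length):
        positions, probs, rewards, hidden = step(policy, positions, hidden)
        all_probs.append(probs)
        all_rewards.append(rewards)
    return positions, all_probs, all_rewards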
def best_move(position):
    best_action = None
    best_value = -10
    for direction in env.get_valid_directions(position):
        target, _ = env.move(direction, position)
        if values[target] > best_value:
            best_action = direction
            best_value = values[target]
    return best_action
def train_with_policy_gradient():
    net = Net()
    optimizer = torch.optim.Adam(net.parameters(), lr=learnrate)
    for epoch in range(epochs):
        positions = [env.entry_id for _ in range(batch_size)]
        probs = torch.empty((length, batch_size), device=device)
        rewards = torch.empty((length, batch_size), device=device)
        #DEBUG
        render_positions = [0 for i in range(36)]
        #/DEBUG
        for step in range(length):
            input = onehots(positions)
            policy = net(input)
            distributions = torch.distributions.Categorical(policy)
            actions = distributions.sample()
            probs_tmp = distributions.log_prob(actions)
            actions = [env.direction_strings[index.item()] for index in actions]
            rew_tmp = torch.zeros((batch_size,), device=device)
            pos_tmp = []
            for i, (action, position) in enumerate(zip(actions, positions)):
                target, _ = env.move(action, position)
                # Reward shaping: the change in state value caused by the move
                rew_tmp[i] = values[target] - values[position]
                pos_tmp.append(target)
                #DEBUG
                render_positions[target] += 1
                #/DEBUG
            probs[step] = probs_tmp
            rewards[step] = rew_tmp
            positions = pos_tmp
        total = torch.zeros((batch_size,), device=device)
        optimizer.zero_grad()
        # REINFORCE objective: discounted sum of reward-weighted log-probabilities
        for step, (p, r) in enumerate(zip(probs, rewards)):
            total = total + discount**step * r * p
        total = torch.sum(total) / batch_size
        loss = -total
        loss.backward()
        optimizer.step()
        #DEBUG
        env.prettyprint(render_positions)
def train_with_crossentropy():
    net = Net()
    optimizer = torch.optim.Adam(net.parameters(), lr=learnrate)
    criterion = torch.nn.NLLLoss()
    for epoch in range(epochs):
        positions = [env.entry_id for _ in range(batch_size)]
        #DEBUG
        render_positions = [0 for i in range(36)]
        #/DEBUG
        for step in range(length):
            input = onehots(positions)
            policy = net(input)
            # The network outputs log-probabilities (as NLLLoss expects),
            # so exponentiate before sampling
            distributions = torch.distributions.Categorical(torch.exp(policy))
            actions = distributions.sample()
            actions = [env.direction_strings[index.item()] for index in actions]
            labels = torch.zeros((batch_size,), device=device, dtype=torch.long)
            pos_tmp = []
            for i, (action, position) in enumerate(zip(actions, positions)):
                target, _ = env.move(action, position)
                pos_tmp.append(target)
                #DEBUG
                render_positions[target] += 1
                #/DEBUG
                # Supervised target: the greedy action under the value table
                best_action_string = best_move(position)
                best_action_index = env.direction_indices[best_action_string]
                labels[i] = best_action_index
            positions = pos_tmp
            optimizer.zero_grad()
            loss = criterion(policy, labels)
            loss.backward()
            optimizer.step()
        #DEBUG
        env.prettyprint(render_positions)
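# A minimal sketch of the policy network (an assumption, not from the source):
# the listings one-hot encode 36 grid positions and index four direction
# strings, so a plausible Net maps 36 inputs to 4 outputs. A LogSoftmax head
# matches the NLLLoss / torch.exp(policy) usage in train_with_crossentropy;
# note that train_with_policy_gradient passes the output to Categorical
# directly and therefore needs probabilities, i.e. a Softmax head instead.
import torch

class Net(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc = torch.nn.Linear(36, 4)
        self.logsoftmax = torch.nn.LogSoftmax(dim=-1)

    def forward(self, x):
        # Returns per-action log-probabilities for each position in the batch
        return self.logsoftmax(self.fc(x))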
def floyd():
    distances = numpy.full((36, 36), 1000)
    for i in range(36):
        distances[i, i] = 0
        dirs = env.get_valid_directions(i)
        neighbors = [env.move(d, i)[0] for d in dirs]
        for j in neighbors:
            distances[i, j] = 1
    # Floyd-Warshall relaxation: i is the intermediate node
    for i in range(36):
        for j in range(36):
            for k in range(36):
                if distances[j, k] > distances[j, i] + distances[i, k]:
                    distances[j, k] = distances[j, i] + distances[i, k]
    return distances
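# A sketch of how the value table used by best_move() and the reward shaping
# above could be derived from floyd() (an assumption; `env.exit_id` is a
# hypothetical name for the goal cell): valuing each cell by its negated
# shortest-path distance to the goal makes moves toward the goal yield
# positive shaped rewards.
distances = floyd()
values = -distances[:, env.exit_id]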
def step(policy, positions):
    batch = to_batch(positions)
    policies = policy(batch)
    # Sample actions (encoded as indices) with the probabilities of the current policy
    distributions = torch.distributions.Categorical(policies)
    actions = distributions.sample()
    probs = distributions.log_prob(actions)
    # Translate the actions into strings (left, up, ...)
    actions = [directions[index.item()] for index in actions]
    rewards = torch.zeros((batch_size,), device=device)
    next_positions = []
    # Execute the actions and store the feedback
    for i, (action, position) in enumerate(zip(actions, positions)):
        next_position, reward = move(action, position)
        rewards[i] = reward
        next_positions.append(next_position)
    return next_positions, probs, rewards
v_l_act, v_r_act, v_t, w_t, err_l, err_r, col, v_l_des, v_r_des = \
    sim.update_wheel_speeds(x_cur, y_cur, theta_cur, motor_firing_rates)
# Save desired and actual speeds
simdata['speed_log'].append((v_l_act, v_r_act))
simdata['desired_speed_log'].append((v_l_des, v_r_des))
# Save motor neuron firing rates
simdata['motor_fr'].append(motor_firing_rates)
# Save linear and angular velocities
simdata['linear_velocity_log'].append(v_t)
simdata['angular_velocity_log'].append(w_t)
# Move robot according to the speeds read out from the motor neurons
x_cur, y_cur, theta_cur = env.move(x_cur, y_cur, theta_cur, v_t, w_t)
nrns_st, rctrs_st = learning.get_spike_times(nrns_sd, rctrs_sd)
rec_nrn_tags, nrn_nrn_tags, rctr_t, nrn_t, pt = learning.get_eligibility_trace(
    nrns_st, rctrs_st, simtime,
    simdata['rctr_nrn_trace'][t], simdata['nrn_nrn_trace'][t])
simdata['rctr_nrn_trace'].append(rec_nrn_tags)
simdata['nrn_nrn_trace'].append(nrn_nrn_tags)
if (t+1) % 10 == 0:
    fitness = ev.get_fitness_value(simdata['speed_log'][t-10:])
    print("Fitness")
    print(fitness)
    print("Reward")
    reward = learning.get_reward(reward, fitness, x_cur, y_cur)
    print(reward)