def step(policy, positions, hidden=None):
    directions = [get_valid_directions(p) for p in positions]
    batch = to_batch(directions)

    if hidden is not None:
        policies, hidden = policy(batch, hidden)
    else:
        policies, hidden = policy(batch)

    # Sample actions (indices) from the current policy
    distributions = torch.distributions.Categorical(policies)
    actions = distributions.sample()
    probs = distributions.log_prob(actions)

    # Convert the actions into strings (left, up, ...)
    actions = [direction_strings[index.item()] for index in actions]
    rewards = torch.zeros((batch_size, ), device=device)
    next_positions = []

    # Execute the actions and store the feedback
    for i, (action, position) in enumerate(zip(actions, positions)):
        next_position, reward = move(action, position)

        rewards[i] = reward
        next_positions.append(next_position)

    return next_positions, probs, rewards, hidden
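
For context, a minimal rollout sketch using this step() (assuming env, policy, batch_size, and length are defined as in the later examples); the recurrent hidden state is threaded between calls:

# Hypothetical rollout loop: collect per-step log-probabilities and rewards
# for a subsequent policy-gradient update.
positions = [env.entry_id for _ in range(batch_size)]
hidden = None
all_probs, all_rewards = [], []

for _ in range(length):
    positions, probs, rewards, hidden = step(policy, positions, hidden)
    all_probs.append(probs)
    all_rewards.append(rewards)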
Example #2
def best_move(position):
    best_action = None
    best_value = float("-inf")

    for direction in env.get_valid_directions(position):
        target, _ = env.move(direction, position)
                    
        if values[target] > best_value:
            best_action = direction
            best_value = values[target]

    return best_action
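
As a usage sketch (a hypothetical greedy rollout, assuming env.entry_id and length as in the other examples), best_move() can be followed from the entry cell; exit detection is omitted since the examples do not expose an exit id:

# Hypothetical greedy rollout under the derived value table.
position = env.entry_id
path = [position]

for _ in range(length):
    action = best_move(position)
    position, _ = env.move(action, position)
    path.append(position)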
Example #3
def train_with_policy_gradient():
    net = Net()
    optimizer = torch.optim.Adam(net.parameters(), lr=learnrate)

    for epoch in range(epochs):
        positions = [env.entry_id for _ in range(batch_size)]
        probs = torch.empty((length, batch_size), device=device)
        rewards = torch.empty((length, batch_size), device=device)

        #DEBUG
        render_positions = [0 for i in range(36)]
        #/DEBUG

        for step in range(length):
            inputs = onehots(positions)
            policy = net(inputs)

            distributions = torch.distributions.Categorical(policy)
            actions = distributions.sample()
            probs_tmp = distributions.log_prob(actions)

            actions = [env.direction_strings[index.item()] for index in actions]
            rew_tmp = torch.zeros((batch_size,), device=device)
            pos_tmp = []

            for i, (action, position) in enumerate(zip(actions, positions)):
                target, _ = env.move(action, position)
                rew_tmp[i] = values[target] - values[position]
                pos_tmp.append(target)

                #DEBUG
                render_positions[target] += 1
                #/DEBUG

            probs[step] = probs_tmp
            rewards[step] = rew_tmp
            positions = pos_tmp

        total = torch.zeros((batch_size,), device=device)
        optimizer.zero_grad()

        for step, (p, r) in enumerate(zip(probs, rewards)):
            total = total + discount**step * r * p

        total = torch.sum(total) / batch_size

        loss = -total
        loss.backward()
        optimizer.step()

        #DEBUG
        env.prettyprint(render_positions)
        #/DEBUG
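
The loss above is the batch mean of -sum_t discount**t * r_t * log pi(a_t | s_t), i.e. each step's log-probability is weighted by that step's discounted immediate reward. A vectorized sketch of the same computation (assuming probs, rewards, discount, length, and device as defined above):

# Equivalent vectorized loss: weight each step's log-probability by its
# discounted immediate reward, sum over time, average over the batch.
discounts = discount ** torch.arange(length, device=device, dtype=torch.float32)
loss = -(discounts[:, None] * rewards * probs).sum(dim=0).mean()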
Example #4
def train_with_crossentropy():
    net = Net()
    optimizer = torch.optim.Adam(net.parameters(), lr=learnrate)
    criterion = torch.nn.NLLLoss()

    for epoch in range(epochs):
        positions = [env.entry_id for _ in range(batch_size)]

        #DEBUG
        render_positions = [0 for i in range(36)]
        #/DEBUG

        for step in range(length):
            inputs = onehots(positions)
            policy = net(inputs)

            distributions = torch.distributions.Categorical(torch.exp(policy))
            actions = distributions.sample()
            actions = [env.direction_strings[index.item()] for index in actions]

            labels = torch.zeros((batch_size,), device=device, dtype=torch.long)
            pos_tmp = []

            for i, (action, position) in enumerate(zip(actions, positions)):
                target, _ = env.move(action, position)
                pos_tmp.append(target)

                #DEBUG
                render_positions[target] += 1
                #/DEBUG
                
                best_action_string = best_move(position)
                best_action_index = env.direction_indices[best_action_string]

                labels[i] = best_action_index

            positions = pos_tmp

            optimizer.zero_grad()
            loss = criterion(policy, labels)
            loss.backward()
            optimizer.step()

            #DEBUG
            env.prettyprint(render_positions)
            #/DEBUG
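
In this cross-entropy variant, policy is passed directly to NLLLoss and exponentiated before building the Categorical distribution, so the network is expected to output log-probabilities (unlike the policy-gradient example above, where Categorical(policy) treats the outputs as probabilities). A minimal sketch of such a network (layer sizes and the 36-cell/4-action layout are assumptions, not taken from the original code):

import torch

class Net(torch.nn.Module):
    # Hypothetical architecture: maps a one-hot cell encoding to
    # log-probabilities over the movement directions.
    def __init__(self, num_cells=36, num_actions=4, hidden_size=64):
        super().__init__()
        self.layers = torch.nn.Sequential(
            torch.nn.Linear(num_cells, hidden_size),
            torch.nn.ReLU(),
            torch.nn.Linear(hidden_size, num_actions),
            torch.nn.LogSoftmax(dim=-1),
        )

    def forward(self, x):
        return self.layers(x)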
Example #5
def floyd():
    distances = numpy.full((36,36), 1000)

    for i in range(36):
        distances[i, i] = 0

        dirs = env.get_valid_directions(i)
        neighbors = [env.move(d, i)[0] for d in dirs]

        for j in neighbors:
            distances[i, j] = 1

    for i in range(36):
        for j in range(36):
            for k in range(36):
                if distances[j, k] > distances[j, i] + distances[i, k]:
                    distances[j, k] = distances[j, i] + distances[i, k]

    return distances
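
One plausible way (an assumption, not shown in the original code) to derive the values table used by best_move() and the reward shaping above is the negative shortest-path distance to a goal cell; exit_id is hypothetical here, since only env.entry_id appears in the examples:

# Hypothetical value table: cells closer to the (assumed) exit get higher
# values, so values[target] - values[position] rewards progress toward it.
distances = floyd()
values = [-int(distances[cell, exit_id]) for cell in range(36)]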
def step(policy, positions):
    batch = to_batch(positions)

    policies = policy(batch)

    # Sample actions (encoded as indices) using the probabilities of the current policy
    distributions = torch.distributions.Categorical(policies)
    actions = distributions.sample()
    probs = distributions.log_prob(actions)

    # Convert the actions into strings (left, up, ...)
    actions = [directions[index.item()] for index in actions]
    rewards = torch.zeros((batch_size, ), device=device)
    next_positions = []

    # Execute the actions and store the feedback
    for i, (action, position) in enumerate(zip(actions, positions)):
        next_position, reward = move(action, position)

        rewards[i] = reward
        next_positions.append(next_position)

    return next_positions, probs, rewards
Example #7
            v_l_act, v_r_act, v_t, w_t, err_l, err_r, col, v_l_des, v_r_des = \
            sim.update_wheel_speeds(x_cur, y_cur, theta_cur, motor_firing_rates)

            # Save desired and actual speeds
            simdata['speed_log'].append((v_l_act, v_r_act))
            simdata['desired_speed_log'].append((v_l_des, v_r_des))
    
            # Save motor neuron firing rates
            simdata['motor_fr'].append(motor_firing_rates)
    
            # Save linear and angular velocities
            simdata['linear_velocity_log'].append(v_t)
            simdata['angular_velocity_log'].append(w_t)
    
            # Move robot according to the read-out speeds from motor neurons
            x_cur, y_cur, theta_cur = env.move(x_cur, y_cur, theta_cur, v_t, w_t)
    
            nrns_st, rctrs_st = learning.get_spike_times(nrns_sd, rctrs_sd)
            rec_nrn_tags, nrn_nrn_tags, rctr_t, nrn_t, pt = learning.get_eligibility_trace(
                        nrns_st, rctrs_st, simtime,
                        simdata['rctr_nrn_trace'][t], simdata['nrn_nrn_trace'][t])
            
            simdata['rctr_nrn_trace'].append(rec_nrn_tags)
            simdata['nrn_nrn_trace'].append(nrn_nrn_tags)
            if (t+1) % 10 == 0:
                fitness = ev.get_fitness_value(simdata['speed_log'][t-10:])
                print "Fitness"
                print fitness
                print "Reward"
                reward = learning.get_reward(reward, fitness, x_cur, y_cur)
                print reward