Example #1
def main():
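    # parse the command-line arguments, load the todo list from the given path, and dispatch the requested action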
    args = parser.get_args()
    path = args["path"]
    if not os.path.exists(path):
        print("File does not exist")
        sys.exit(1)
    todos = utils.load_todos(path)
    utils.get_action(args, todos)
Example #2
def assign_tasks(data):
    # TODO Ask Sadegh about node structure
    current_state = unpack(data)

    pub = rospy.Publisher('task_assigner/assignment', TaskAssignment)
    rospy.init_node('task_assigner')

    with open('mdp_info.json', 'r') as mdp_info_file:
        mdp_info = json.load(mdp_info_file)

        # TODO Make a map service
        with open(WORLDS_DIRECTORY + "world.json", "r") as world_file:
            world = json.load(world_file)

            while not rospy.is_shutdown():
                # TODO Query state from some source
                current_action = utils.get_action(mdp_info, current_state)

                for t, r in current_action:
                    msg = TaskAssignment()
                    msg.robot_id = r.get_id()
                    msg.problem = problem_generator.generate_escort_problem(r, t, world)

                    rospy.loginfo(msg)
                    pub.publish(msg)

                rospy.sleep(TASK_DURATION)
Example #3
    def extract_tree_expression(self, node, index_mark='_'):
        if node is None or node.data is None:
            return self.expression_str
        feature, parentheses, action, wl_scalar, wl_power, parentheses_bias, wl_activation, parentheses_activation, wl_bias, parentheses_power = Individual.get_all_merged_values(
            node.data)

        if parentheses == 1:
            self.expression_str += utils.get_activation(
                parentheses_activation) + '('
        self.expression_str += utils.get_activation(wl_activation)
        self.expression_str += '({}'.format(wl_scalar) + '*'  ## add wl scalar
        self.expression_str += '{}{}{}'.format(index_mark, feature, index_mark)
        self.expression_str += '**{}'.format(wl_power) + '+{}'.format(wl_bias)
        self.expression_str += ')'
        self.expression_str += utils.get_action(action)

        self.expression_str = self.extract_tree_expression(
            node.left, index_mark)
        self.expression_str = self.extract_tree_expression(
            node.right, index_mark)

        if parentheses == 1:
            # put the closing parenthesis before the trailing action
            self.expression_str = (self.expression_str[:-1] +
                                   '+{})'.format(parentheses_bias) +
                                   '**{}'.format(parentheses_power) +
                                   self.expression_str[-1])
        return self.expression_str
Example #4
def _get_path(net, dataset, map, map_index, start_pos, goal_pos,
              max_number_steps):
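    # step the network's predicted actions from start_pos toward goal_pos, returning (path, success)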
    with torch.no_grad():
        success = True
        path = [start_pos]
        pos = start_pos

        for idx in range(max_number_steps):
            # ensure that whole perceptive area lies within grid world
            if (pos[0] >= 3 * map.size()[0] // 4
                    or pos[0] < map.size()[0] // 4
                    or pos[1] >= 3 * map.size()[1] // 4
                    or pos[1] < map.size()[1] // 4):
                return (path, False)

            # reached goal
            if (pos[0] == goal_pos[0] and pos[1] == goal_pos[1]
                    and pos[2] == goal_pos[2]):
                return (path, success)

            if idx > 0:
                # get indices of the cells that contain the wheels
                fl, fr, bl, br = get_wheel_coord(pos, net.rotation_step_size,
                                                 net.leg_x, net.leg_y)
                fl, fr, bl, br = (fl.round().long(), fr.round().long(),
                                  bl.round().long(), br.round().long())

                # check collision for each wheel
                if (map[fl[0], fl[1]] == 1 or map[fr[0], fr[1]] == 1
                        or map[bl[0], bl[1]] == 1 or map[br[0], br[1]] == 1):
                    success = False

            # get net input for current position
            start_orientation = pos[2].to(net.device)
            occ_map, goal_map = dataset.get_inputs((map_index, pos, goal_pos))
            occ_map = occ_map.unsqueeze_(0).to(net.device)
            goal_map = goal_map.unsqueeze_(0).to(net.device)

            # predict next action
            action_vector = net.forward(occ_map, goal_map, start_orientation)
            action = get_action(action_vector[0], dim=3)

            # update position and orientation
            new_pos = pos + action
            if new_pos[2] < 0:
                new_pos[2] += net.num_orientations
            elif new_pos[2] >= net.num_orientations:
                new_pos[2] -= net.num_orientations

            path.append(new_pos)
            pos = new_pos

        if (pos[0] == goal_pos[0] and pos[1] == goal_pos[1]
                and pos[2] == goal_pos[2]):
            # reached goal
            return (path, success)
        else:
            # did not reach goal
            return (path, False)
Example #5
    def choose_action(self, obs):
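        # pick an action id from the stored Q-values for this observation and map it to a concrete action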
        action_id = get_action(self.qval[obs], self.player_lambda)
        self.probs[obs] = np.array(current_probs)
        chosen_action = ID_TO_ACTION[self.player_type][action_id]

        if self.save_history:
            self.history['states'].append(obs)
            self.history['actions'].append(chosen_action)
        return chosen_action
Example #6
def main():
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    img_shape = env.observation_space.shape
    num_actions = 3
    print('image size:', img_shape)
    print('action size:', num_actions)

    net = QNet(num_actions)
    net.load_state_dict(torch.load(args.save_path + 'model.pth'))

    net.to(device)
    net.eval()

    epsilon = 0
    steps = 0

    for e in range(5):
        done = False

        score = 0
        state = env.reset()

        state = pre_process(state)
        state = torch.Tensor(state).to(device)
        history = torch.stack((state, state, state, state))

        for i in range(3):
            action = env.action_space.sample()
            state, reward, done, info = env.step(action)
            state = pre_process(state)
            state = torch.Tensor(state).to(device)
            state = state.unsqueeze(0)
            history = torch.cat((state, history[:-1]), dim=0)

        while not done:
            if args.render:
                env.render()

            steps += 1
            qvalue = net(history.unsqueeze(0))
            action = get_action(0, qvalue, num_actions)

            next_state, reward, done, info = env.step(action + 1)

            next_state = pre_process(next_state)
            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)
            next_history = torch.cat((next_state, history[:-1]), dim=0)

            score += reward
            history = next_history

        print('{} episode | score: {:.2f}'.format(e, score))
Example #7
def run_caesar():
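    # ask whether to encrypt or decrypt, then run the Caesar transform on the cleaned input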
    action = utils.get_action()
    encrypting = action == 'E'
    data = clean_caesar(utils.get_input(binary=False))

    print("* Transform *")
    print("{}crypting {} using Caesar cipher...".format(
        'En' if encrypting else 'De', data))

    output = (encrypt_caesar if encrypting else decrypt_caesar)(data)

    utils.set_output(output)
Example #8
    def test_validate_mask(self):
        env = tf_py_environment.TFPyEnvironment(self.env)
        policy = random_tf_policy.RandomTFPolicy(
            time_step_spec=env.time_step_spec(),
            action_spec=env.action_spec(),
            observation_and_action_constraint_splitter=GameEnv.obs_and_mask_splitter)

        driver = dynamic_step_driver.DynamicStepDriver(env, policy, num_steps=1)
        for i in range(10):
            time_step, _ = driver.run()
            action_step = policy.action(time_step)
            print(utils.get_action(action_step.action.numpy()[0], 3))
Example #9
def run_vigenere():
    action = utils.get_action()
    encrypting = action == 'E'
    data = clean_vigenere(utils.get_input(binary=False))

    print("* Transform *")
    keyword = clean_vigenere(input("Keyword? "))

    print("{}crypting {} using Vigenere cipher and keyword {}...".format(
        'En' if encrypting else 'De', data, keyword))

    output = (encrypt_vigenere if encrypting else decrypt_vigenere)(data,
                                                                    keyword)

    utils.set_output(output)
Example #10
    def choose_action(self, obs):
        features = []
        for action_id in range(2):
            features.append(self.features[-1] + [STATE_TO_ID[obs], action_id])
        self.qval[obs] = self.model(torch.tensor(
            features, dtype=torch.float32)).data.numpy().ravel()

        action_id = get_action(self.qval[obs], self.player_lambda)
        chosen_action = ID_TO_ACTION[self.player_type][action_id]

        if self.save_history:
            self.history['states'].append(obs)
            self.history['actions'].append(chosen_action)
            self.rounds += 1
        return chosen_action
Example #11
def _get_path(net, dataset, map, map_index, start_pos, goal_pos,
              max_number_steps):
    with torch.no_grad():
        success = True
        path = [start_pos]
        pos = start_pos

        for idx in range(max_number_steps):
            # ensure that whole perceptive area lies within grid world
            if (pos[0] >= 3 * map.size()[0] // 4
                    or pos[0] < map.size()[0] // 4
                    or pos[1] >= 3 * map.size()[1] // 4
                    or pos[1] < map.size()[1] // 4):
                return (path, False)

            # reached goal
            if pos[0] == goal_pos[0] and pos[1] == goal_pos[1]:
                return (path, success)

            # check collision
            if map[pos[0], pos[1]] == 1:
                success = False

            # get input maps for current position
            occ_map, goal_map = dataset.get_inputs((map_index, pos, goal_pos))
            occ_map = occ_map.unsqueeze_(0).to(net.device)
            goal_map = goal_map.unsqueeze_(0).to(net.device)

            # predict next action
            action_vector = net.forward(occ_map, goal_map)
            action = get_action(action_vector[0], dim=2)

            # update position
            new_pos = pos + action
            path.append(new_pos)
            pos = new_pos

        if (pos[0] == goal_pos[0] and pos[1] == goal_pos[1]
                and pos[2] == goal_pos[2]):
            # reached goal
            return (path, success)
        else:
            # did not reach goal
            return (path, False)
Example #12
def run_merkle_hellman():
    action = utils.get_action()

    print("* Seed *")
    seed = input("Set Seed [enter for random]: ")
    import random
    random.seed(seed)
    print("* Building private key...")

    private_key = generate_private_key()
    public_key = create_public_key(private_key)

    if action == 'E':  # Encrypt
        data = utils.get_input(binary=True)
        print("* Transform *")
        chunks = encrypt_mh(data, public_key)
        output = ' '.join(map(str, chunks))
    else:  # Decrypt
        data = utils.get_input(binary=False)
        chunks = [int(line.strip()) for line in data.split() if line.strip()]
        print("* Transform *")
        output = decrypt_mh(chunks, private_key)

    utils.set_output(output)
Example #13
def _rollout(net, batch_size=128, validation=True, num_workers=4):
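    # roll out the network on every (map, path) task using a pool of workers and return the success rate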
    with torch.no_grad():
        diff = 0.
        net_length = 0.
        expert_length = 0.

        # load dataset and make it available to all workers
        global rollout_data
        if validation:
            rollout_data = GridDataset_2d(net.size,
                                          data_type='validation',
                                          full_paths=True)
        else:
            rollout_data = GridDataset_2d(net.size,
                                          data_type='evaluation',
                                          full_paths=True)
        iterations = rollout_data.num_examples

        # list of all tasks (describes task through map and path indices)
        open_paths = [(i, j) for i in range(rollout_data.num_examples)
                      for j in range(rollout_data.num_paths_per_map)]
        paths = [[[rollout_data.expert_paths[map_id][path_id][0]]
                  for path_id in range(rollout_data.num_paths_per_map)]
                 for map_id in range(rollout_data.num_examples)]
        success = [[
            False for path_id in range(rollout_data.num_paths_per_map)
        ] for map_id in range(rollout_data.num_examples)]

        path_length = 0
        if not validation:
            print("Starting Rollout-Test.")
            print("Max expert path length:", rollout_data.max_path_length)
            start_time = time.time()

        pool = Pool(processes=num_workers)
        while (len(open_paths) != 0
               and path_length < 2 * rollout_data.max_path_length):
            parameters = []
            # get map indices and current positions for all open paths
            for map_id, path_id in open_paths:
                parameters.append(
                    (map_id, paths[map_id][path_id][-1],
                     rollout_data.expert_paths[map_id][path_id][-1]))

            # get inputs for all open paths
            inputs = pool.map(_get_inputs, parameters)

            path_length += 1
            current_open_task_id = 0

            # predict next step for each open path
            for input_batch in batch(inputs, batch_size):
                # unpack inputs
                occ_maps, goal_maps = zip(*input_batch)
                occ_maps = torch.stack(occ_maps, dim=0).to(net.device)
                goal_maps = torch.stack(goal_maps, dim=0).to(net.device)

                # predict next action
                action_vectors = net.forward(occ_maps, goal_maps)

                for i in range(action_vectors.size(0)):
                    # update positions and paths
                    map_id, path_id = open_paths[current_open_task_id]
                    action = get_action(action_vectors[i], dim=2)
                    pos = paths[map_id][path_id][-1] + action

                    paths[map_id][path_id].append(pos)
                    goal_pos = rollout_data.expert_paths[map_id][path_id][-1]

                    # reached goal
                    if pos[0] == goal_pos[0] and pos[1] == goal_pos[1]:
                        success[map_id][path_id] = True
                        del open_paths[current_open_task_id]
                        continue

                    # check upper border for path length
                    # (to detect oscillation)
                    if path_length > 2 * len(
                            rollout_data.expert_paths[map_id][path_id]):
                        del open_paths[current_open_task_id]
                        continue

                    # ensure that perceptive area lies completely within grid world
                    if (pos[0] >= 3 * rollout_data.grids[map_id].size()[0] // 4
                            or pos[0] < rollout_data.grids[map_id].size()[0] // 4
                            or pos[1] >= 3 * rollout_data.grids[map_id].size()[1] // 4
                            or pos[1] < rollout_data.grids[map_id].size()[1] // 4):
                        del open_paths[current_open_task_id]
                        continue

                    # check collision
                    if rollout_data.grids[map_id][pos[0], pos[1]] == 1:
                        del open_paths[current_open_task_id]
                        continue

                    current_open_task_id += 1

            if not validation:
                if path_length % 20 == 0:
                    print("Computed paths up to length ", path_length)

        pool.close()

        # count successful paths
        num_successful = 0
        for i in range(rollout_data.num_examples):
            for j in range(rollout_data.num_paths_per_map):
                paths[i][j] = torch.stack(paths[i][j], dim=0)
                if success[i][j]:
                    num_successful += 1
                    if not validation:
                        # compare length of network and expert paths
                        diff += get_path_length(
                            paths[i][j], dim=2) - get_path_length(
                                rollout_data.expert_paths[i][j], dim=2)
                        net_length += get_path_length(paths[i][j], dim=2)
                        expert_length += get_path_length(
                            rollout_data.expert_paths[i][j], dim=2)

        if not validation:
            print("Success: ", num_successful / len(rollout_data))
            print("Path length (network): ", net_length)
            print("Path length (expert): ", expert_length)
            print("Average absolute path difference: ", diff / num_successful)
            print("average relative path difference: ",
                  net_length / expert_length)
            print("Duration: ", time.time() - start_time)
            print("")
        return num_successful / len(rollout_data)
Example #14
def train_dqn(episode,
              rand_obs=0,
              rand_act=0,
              noise_obs_level=0.01,
              noise_act_level=0.1):
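    # train a DQN agent, optionally injecting observation noise and/or random actions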
    loss = []
    agent = DQN(env.action_space.n, env.observation_space.shape[0])
    all_actions = []
    all_rand_acts = []
    all_rewards = []
    for e in range(episode):
        curr_acts = []
        curr_rand_acts = []
        curr_rewards = []
        state = env.reset()
        state = np.reshape(state, (1, 8))
        score = 0
        max_steps = 5000
        for i in range(max_steps):
            if rand_obs == 1:
                state = get_observation(state,
                                        option=0,
                                        noise_obs_level=noise_obs_level)
            action = agent.act(state)
            if rand_act == 1:
                action, is_rand = get_action(action)
            else:
                action, is_rand = action, 0
            curr_acts.append(action)
            curr_rand_acts.append(is_rand)
            # env.render()
            next_state, reward, done, _ = env.step(action)
            curr_rewards.append(reward)
            score += reward
            next_state = np.reshape(next_state, (1, 8))
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            agent.replay()
            if done:
                print("episode: {}/{}, score: {}".format(e, episode, score))
                break
        loss.append(score)
        all_actions.append(np.array(curr_acts))
        all_rand_acts.append(np.array(curr_rand_acts))
        all_rewards.append(np.array(curr_rewards))
        # Average score of the last 100 episodes
        is_solved = np.mean(loss[-100:])
        # if is_solved > 50:
        #     print('\n Task Completed! \n')
        #     break
        print("Average over last 100 episode: {0:.2f} \n".format(is_solved))
    # np.savez("./saved/dqn_rand_act_" + str(rand_act) + "_rand_obs_" + str(rand_obs) + ".npz",
    #                       acts=np.array(all_actions),
    #                       rand_actions=np.array(all_rand_acts),
    #                       rewards=np.array(all_rewards),
    #                       scores=np.array(loss))
    # np.savez("./saved_dqn/dqn_rand_act_" + str(rand_act) + "_rand_obs_" + str(rand_obs) + "_noise_obs_lvl_" + str(noise_obs_level) + ".npz",
    #                       acts=np.array(all_actions),
    #                       rand_actions=np.array(all_rand_acts),
    #                       rewards=np.array(all_rewards),
    #                       scores=np.array(loss))
    np.savez("./saved_dqn/dqn_rand_act_" + str(rand_act) + "_rand_obs_" +
             str(rand_obs) + "_noise_act_lvl_" + str(noise_act_level) + ".npz",
             acts=np.array(all_actions),
             rand_actions=np.array(all_rand_acts),
             rewards=np.array(all_rewards),
             scores=np.array(loss))
    return loss
Example #15
def train_a3c(episode, rand_obs=0, rand_act=0):
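    # train an actor-critic policy, optionally perturbing observations and/or actions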
    # Defaults parameters:
    #    gamma = 0.99
    #    lr = 0.02
    #    betas = (0.9, 0.999)
    #    random_seed = 543

    render = False
    gamma = 0.99
    lr = 0.02
    betas = (0.9, 0.999)
    random_seed = 543

    torch.manual_seed(random_seed)

    policy = ActorCritic()
    optimizer = optim.Adam(policy.parameters(), lr=lr, betas=betas)
    print(lr, betas)

    running_reward = 0
    loss_ls = []
    all_actions = []
    all_rand_acts = []
    all_rewards = []
    for i_episode in range(0, episode):
        curr_acts = []
        curr_rand_acts = []
        curr_rewards = []
        state = env.reset()
        score = 0
        for t in range(10000):
            if rand_obs == 1:
                state = get_observation(state, option=1)
            # action = agent.act(state)
            # state = get_observation(state, option=1)
            action = policy(state)
            if rand_act == 1:
                action, is_rand = get_action(action)
            else:
                action, is_rand = action, 0
            curr_acts.append(action)
            curr_rand_acts.append(is_rand)
            # action = get_action(action)
            state, reward, done, _ = env.step(action)
            curr_rewards.append(reward)

            policy.rewards.append(reward)
            running_reward += reward
            score += reward
            if render and i_episode > 1000:
                env.render()
            if done:
                break
        loss_ls.append(score)
        # Updating the policy :
        optimizer.zero_grad()
        loss = policy.calculateLoss(gamma)
        loss.backward()
        optimizer.step()
        policy.clearMemory()
        all_actions.append(np.array(curr_acts))
        all_rand_acts.append(np.array(curr_rand_acts))
        all_rewards.append(np.array(curr_rewards))

        # # saving the model if episodes > 999 OR avg reward > 200
        # if i_episode > 999:
        #     torch.save(policy.state_dict(), './preTrained/LunarLander_{}_{}_{}.pth'.format(lr, betas[0], betas[1]))

        # if running_reward > 4000:
        #     torch.save(policy.state_dict(), './preTrained/LunarLander_{}_{}_{}.pth'.format(lr, betas[0], betas[1]))
        #     print("########## Solved! ##########")
        #     test(name='LunarLander_{}_{}_{}.pth'.format(lr, betas[0], betas[1]))
        #     break

        if i_episode % 20 == 0:
            running_reward = running_reward / 20
            print('Episode {}\tlength: {}\treward: {}'.format(
                i_episode, t, running_reward))
            running_reward = 0
    np.savez("./saved/a3c_rand_act_" + str(rand_act) + "_rand_obs_" +
             str(rand_obs) + ".npz",
             acts=np.array(all_actions),
             rand_actions=np.array(all_rand_acts),
             rewards=np.array(all_rewards),
             scores=np.array(loss_ls))
    return loss_ls
Example #16
# training loop
losses = []
ti = time()
for e in range(epochs):
    n_hands = 0
    winner = None
    actions = np.zeros((2)).astype(int)
    og_states = np.zeros((2, state_cards + 2))
    aux_state = None
    game_over = False
    while winner is None:
        n_hands += 1

        og_states[env.turn, :] = env.get_state().flatten()
        action_A = get_action(model, og_states[env.turn][np.newaxis],
                              env.legal_moves(), epsilon[e])
        actions[env.turn] = action_A
        hand_over, new_state, rewards, winner = env.play_card(action_A)

        if aux_state is not None:
            exp_replay.remember(aux_state[np.newaxis], aux_action,
                                env.get_state(),
                                rewards[int(not env.hand_winner)], game_over)

        og_states[env.turn, :] = env.get_state().flatten()
        action_B = get_action(model, og_states[env.turn][np.newaxis],
                              env.legal_moves(), epsilon[e])
        actions[env.turn] = action_B
        hand_over, new_state, rewards, winner = env.play_card(action_B)

        aux_state = og_states[int(not env.hand_winner)]
Example #17
    def do_episode(self, config):
        """

        :param config:
        :return:
        """

        # Initial values
        done = False
        score_e = 0
        step_e = 0

        # Get epsilon for initial state
        self.update_epsilon_step()

        # Episodic decay (only after linear decay)
        self.update_alpha_episode()
        self.update_epsilon_episode()

        # Get current state s, act based on s
        state = self.discretize_state(self.env.reset())
        action = self.act(state)

        # Continue while not crashed
        all_acts = []
        rand_acts = []
        all_rewards = []
        while not done:

            # Update for other steps
            self.update_alpha_step()
            self.update_epsilon_step()

            # Get next state s' and reward, act based on s'
            state_, reward, done, _ = self.env.step(action)
            if config['rand_obs'] == 1:
                state_ = get_observation(state_, option=1)
            state_ = self.discretize_state(state_)
            action_ = self.act(state_)
            if config['rand_act'] == 1:
                action, is_rand = get_action(action)
            else:
                action, is_rand = action, 0
            all_acts.append(action)
            if is_rand:
                rand_acts.append(1)
            else:
                rand_acts.append(0)
            # Learn
            self.learn(done, state, action, reward, state_, action_)
            all_rewards.append(reward)

            # Set next state and action to current
            state = state_
            action = action_

            # Increment score and steps
            score_e += reward
            step_e += 1
            self.step += 1

        # Append score
        self.score.append(score_e)
        self.score_100.append(score_e)
        self.actions.append(np.array(all_acts))
        self.rand_actions.append(np.array(rand_acts))
        self.rewards.append(np.array(all_rewards))
        mean_score = np.mean(self.score_100)

        # Increment episode
        self.episode += 1
Example #18
                      outputs={
                          'pi': opt_action,
                          'q': opt_action_value
                      })

# Main loop
start_time = time.time()
ep_len, rewd, s = 0, 0.0, env.reset()
for t in range(num_epochs * ep_per_epoch):

    if random_steps >= t:
        a = action_space.sample()
        if random_steps == t:
            print("Finished pure random episodes")
    else:
        a = get_action({s_ph: s.reshape(1, -1)}, opt_action, sess, max_act,
                       action_dim)

    s2, r, done, _ = env.step(a)
    rewd += r
    ep_len += 1
    #env.render()

    # Ignore the done signal if it comes from the end of the episode:
    # d should indicate that the agent died from a bad action,
    # not that the episode simply ran out of time.
    # TODO could send wrong information
    d = False if ep_len == max_ep_len else done

    # Store transition
    buf.store(s, a, r, s2, d)
    s = s2
Example #19
# parameter
datetime_minute_cached = None
position = 1

# order
while ws.ws.sock.connected:
    try:
        if datetime_minute_cached != datetime.now().minute:
            cur_time = datetime.now(pytz.timezone('Asia/Seoul'))

            df = get_minute_data(client,
                                 args.symbol,
                                 minutes=1,
                                 cur_time=cur_time)

            action = get_action(df)

            if action == 1 and position == 1:
                # market_order(client, args.symbol, "buy", args.amount)

                position = -1
                print("BUY")

            elif action == -1 and position == -1:
                # market_order(client, args.symbol, "sell", args.amount)

                position = 1
                print("SELL")

            else:
                print("HOLD")
Example #20
def train_for_n(nb_epoch=5000, BATCH_SIZE=32):
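    # alternate generator and discriminator updates on shuffled mini-batches, with early stopping on each loss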
    for e in tqdm(range(nb_epoch)):
        ### Shuffle and Batch the data
        _random = np.random.randint(0, emb_cs.shape[0], size=BATCH_SIZE)
        _random2 = np.random.randint(0, emb_zh.shape[0], size=BATCH_SIZE)
        if not WORD_ONLY:
            pos_seq_cs_batch = pos_seq_cs[_random]
            pos_seq_zh_batch = pos_seq_zh[_random2]
        emb_cs_batch = emb_cs[_random]
        emb_zh_batch = emb_zh[_random2]
        noise_g = np.random.normal(0,
                                   1,
                                   size=(BATCH_SIZE, MAX_SEQUENCE_LENGTH,
                                         NOISE_SIZE))
        reward_batch = np.zeros((BATCH_SIZE, 1))

        #############################################
        ### Train generator
        #############################################
        for ep in range(1):  # G v.s. D training ratio
            if not WORD_ONLY:
                output_g = generator.predict(
                    [emb_zh_batch, pos_seq_zh_batch, noise_g, reward_batch])
            else:
                output_g = generator.predict(
                    [emb_zh_batch, noise_g, reward_batch])
            action_g, action_one_hot_g = get_action(output_g)
            emb_g = translate(emb_zh_batch, action_g)
            text_g = translate_output(emb_zh_batch, action_g)

            # tag POS
            if not WORD_ONLY:
                pos_seq_g = []
                for line in text_g:
                    words = pseg.cut(line)
                    sub_data = []
                    idx = 0
                    for w in words:
                        if w.flag == "x":
                            idx = 0
                        elif idx == 0:
                            sub_data.append(postag[w.flag])
                            idx = 1
                    pos_seq_g.append(sub_data)

                pos_seq_g = pad_sequences(pos_seq_g,
                                          maxlen=MAX_SEQUENCE_LENGTH,
                                          padding='post',
                                          truncating='post',
                                          value=0)

            one_hot_action = action_one_hot_g.reshape(BATCH_SIZE,
                                                      MAX_SEQUENCE_LENGTH, 2)

            make_trainable(generator, True)

            if not WORD_ONLY:
                reward_batch = discriminator.predict([emb_g, pos_seq_g])[:, 0]
                g_loss = generator.train_on_batch(
                    [emb_zh_batch, pos_seq_zh_batch, noise_g, reward_batch],
                    one_hot_action)
            else:
                reward_batch = discriminator.predict([emb_g])[:, 0]
                g_loss = generator.train_on_batch(
                    [emb_zh_batch, noise_g, reward_batch], one_hot_action)

            losses["g"].append(g_loss)
            write_log(callbacks, log_g, g_loss, len(losses["g"]))
            if g_loss < 0.15:  # early stop
                break

        #############################################
        ### Train discriminator on generated sentence
        #############################################
        X_emb = np.concatenate((emb_cs_batch, emb_g))
        if not WORD_ONLY:
            X_pos = np.concatenate((pos_seq_cs_batch, pos_seq_g))
        y = np.zeros([2 * BATCH_SIZE])
        y[0:BATCH_SIZE] = 0.7 + np.random.random([BATCH_SIZE]) * 0.3
        y[BATCH_SIZE:] = 0 + np.random.random([BATCH_SIZE]) * 0.3

        make_trainable(discriminator, True)
        model.embedding_word.trainable = False
        if not WORD_ONLY:
            model.embedding_pos.trainable = False
        model.g_bi.trainable = False

        for ep in range(1):  # G v.s. D training ratio
            if not WORD_ONLY:
                d_loss = discriminator.train_on_batch([X_emb, X_pos], y)
            else:
                d_loss = discriminator.train_on_batch([X_emb], y)
            losses["d"].append(d_loss)
            write_log(callbacks, log_d, d_loss, len(losses["d"]))
            if d_loss < 0.6:  # early stop
                break

    ### Save model
    generator.save_weights(MODEL_PATH + "gen.mdl")
    discriminator.save_weights(MODEL_PATH + "dis.mdl")
Example #21
callbacks.set_model(generator)
earlystopper = EarlyStopping(monitor='val_loss', patience=2, verbose=1)

### Pre-train the discriminator network ...
print("========== PretrainING Discriminator START!")
t1 = time.time()

noise_g = np.random.normal(0,
                           1,
                           size=(ntrain, MAX_SEQUENCE_LENGTH, NOISE_SIZE))
if not WORD_ONLY:
    output_g = generator.predict([input_g_emb, input_g_pos, noise_g, reward])
else:
    output_g = generator.predict([input_g_emb, noise_g, reward])
action_g, action_one_hot_g = get_action(output_g)
emb_g = translate(input_g_emb, action_g)
text_g = translate_output(input_g_emb, action_g)

if not WORD_ONLY:
    pos_seq_g = []
    for line in text_g:
        words = pseg.cut(line)
        sub_data = []
        idx = 0
        for w in words:
            if w.flag == "x":
                idx = 0
            elif idx == 0:
                sub_data.append(postag[w.flag])
                idx = 1
Example #22
def main():
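    # train the FuN agent; a batch of stored transitions is replayed each time a life is lost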
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    img_shape = env.observation_space.shape
    num_actions = env.action_space.n
    print('image size:', img_shape)
    print('action size:', num_actions)

    net = FuN(num_actions)

    optimizer = optim.RMSprop(net.parameters(), lr=0.00025, eps=0.01)
    writer = SummaryWriter('logs')

    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)
    
    net.to(device)
    net.train()

    
    epsilon = 1.0
    steps = 0
    
    for e in range(10000):
        memory = Memory(capacity=400)
        done = False
        dead = False

        score = 0
        avg_loss = []
        start_life = 6
        state = env.reset()

        state = pre_process(state)
        state = torch.Tensor(state).to(device)
        state = state.permute(2, 0, 1)

        m_hx = torch.zeros(1, 288).to(device)
        m_cx = torch.zeros(1, 288).to(device)
        m_lstm = (m_hx, m_cx)

        w_hx = torch.zeros(1, 288).to(device)
        w_cx = torch.zeros(1, 288).to(device)
        w_lstm = (w_hx, w_cx)

        goals = torch.zeros(1, 288, 1).to(device)

        while not done:
            if args.render:
                env.render()

            steps += 1
            net_output = net(state.unsqueeze(0), m_lstm, w_lstm, goals)
            policy, goal, goals, m_lstm, w_lstm, m_value, w_value, m_state = net_output
            action = get_action(policy, num_actions)
            next_state, reward, done, info = env.step(action)

            next_state = pre_process(next_state)
            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.permute(2, 0, 1)

            if start_life > info['ale.lives']:
                dead = True
                start_life = info['ale.lives']
            
            score += reward
            reward = np.clip(reward, -1, 1)

            mask = 0 if dead else 1

            memory.push(action, reward, mask, goal, policy,
                        m_lstm, w_lstm, m_value, w_value, m_state)

            if dead:
                batch = memory.sample()
                loss = train_model(net, optimizer, batch, args.gamma)
                avg_loss.append(loss.cpu().data)

                dead = False
                m_hx = torch.zeros(1, 288).to(device)
                m_cx = torch.zeros(1, 288).to(device)
                m_lstm = (m_hx, m_cx)

                w_hx = torch.zeros(1, 288).to(device)
                w_cx = torch.zeros(1, 288).to(device)
                w_lstm = (w_hx, w_cx)

                goals = torch.zeros(1, 288, 1).to(device)
                
            state = next_state


        if e % args.log_interval == 0:
            print('{} episode | score: {:.2f} | steps: {} | loss: {:.4f}'.format(
                e, score, steps, np.mean(avg_loss)))
            writer.add_scalar('log/score', float(score), steps)
            writer.add_scalar('log/loss', np.mean(avg_loss), steps)

        if score > args.goal_score:
            ckpt_path = args.save_path + 'model.pth'
            torch.save(net.state_dict(), ckpt_path)
            print('running score exceeds 400 so end')
            break
Example #23
        memory = deque()

        steps = 0
        scores = []
        while steps < 2048:
            episodes += 1
            state = env.reset()
            state = running_state(state)
            score = 0
            for _ in range(10000):
                if episodes % 50 == 0:
                    env.render()

                steps += 1
                mu, std, _ = actor(torch.Tensor(state).unsqueeze(0))
                action = get_action(mu, std)[0]
                next_state, reward, done, _ = env.step(action)
                next_state = running_state(next_state)

                if done:
                    mask = 0
                else:
                    mask = 1

                memory.append([state, action, reward, mask])

                score += reward
                state = next_state

                if done:
                    break
Example #24
def self_play(agent, cur_memory, rank=0):
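    # play self-play games with MCTS policies, storing (state, pi, reward) samples until the memory is full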
    agent.model.eval()
    state_black = deque()
    state_white = deque()
    pi_black = deque()
    pi_white = deque()
    episode = 0
    while True:
        if (episode + 1) % 10 == 0:
            logging.info('Playing Episode {:3}'.format(episode + 1))

        env = game.GameState('text')
        board = np.zeros((BOARD_SIZE, BOARD_SIZE), 'float')
        turn = 0
        root_id = (0, )
        win_index = 0
        time_steps = 0
        action_index = None

        while win_index == 0:
            if PRINT_SELFPLAY and rank == 0:
                utils.render_str(board, BOARD_SIZE, action_index)

            # ====================== start MCTS ============================ #

            if time_steps < TAU_THRES:
                tau = 1
            else:
                tau = 0

            pi = agent.get_pi(root_id, tau, rank)

            # ===================== collect samples ======================== #

            state = utils.get_state_pt(root_id, BOARD_SIZE, IN_PLANES)

            if turn == 0:
                state_black.appendleft(state)
                pi_black.appendleft(pi)
            else:
                state_white.appendleft(state)
                pi_white.appendleft(pi)

            # ======================== get action ========================== #

            action, action_index = utils.get_action(pi)
            root_id += (action_index, )

            # ====================== print evaluation ====================== #

            if PRINT_SELFPLAY and rank == 0:
                with torch.no_grad():
                    state_input = torch.tensor([state]).to(device).float()
                    p, v = agent.model(state_input)
                    p = p.cpu().numpy()[0]
                    v = v.item()

                    print('\nPi:\n{}'.format(
                        pi.reshape(BOARD_SIZE, BOARD_SIZE).round(decimals=2)))
                    print('\nPolicy:\n{}'.format(
                        p.reshape(BOARD_SIZE, BOARD_SIZE).round(decimals=2)))

                if turn == 0:
                    print("\nBlack's win%: {:.2f}%".format((v + 1) / 2 * 100))
                else:
                    print("\nWhite's win%: {:.2f}%".format((v + 1) / 2 * 100))

            # =========================== step ============================= #

            board, _, win_index, turn, _ = env.step(action)
            time_steps += 1

            # ========================== result ============================ #

            if win_index != 0:
                if win_index == 1:
                    reward_black = 1.
                    reward_white = -1.
                    result['Black'] += 1

                elif win_index == 2:
                    reward_black = -1.
                    reward_white = 1.
                    result['White'] += 1

                else:
                    reward_black = 0.
                    reward_white = 0.
                    result['Draw'] += 1

            # ====================== store in memory ======================= #

                while state_black or state_white:
                    if state_black:
                        cur_memory.append(
                            (state_black.pop(), pi_black.pop(), reward_black))
                    if state_white:
                        cur_memory.append(
                            (state_white.pop(), pi_white.pop(), reward_white))

            # =========================  result  =========================== #

                if PRINT_SELFPLAY and rank == 0:
                    utils.render_str(board, BOARD_SIZE, action_index)

                    bw, ww, dr = result['Black'], result['White'], \
                        result['Draw']
                    print('')
                    print('=' * 20, " {:3} Game End   ".format(episode + 1),
                          '=' * 20)
                    print('Black Win: {:3}   '
                          'White Win: {:3}   '
                          'Draw: {:2}   '
                          'Win%: {:.2f}%'.format(bw, ww, dr, (bw + 0.5 * dr) /
                                                 (bw + ww + dr) * 100))
                    print('current memory size:', len(cur_memory))
                episode += 1
                agent.reset()
                if len(cur_memory) >= MEMORY_SIZE:
                    return utils.augment_dataset(cur_memory, BOARD_SIZE)
Example #25
def self_play(n_selfplay):
    global cur_memory, rep_memory
    global Agent

    state_black = deque()
    state_white = deque()
    pi_black = deque()
    pi_white = deque()

    if RESIGN_MODE:
        resign_val_balck = []
        resign_val_white = []
        resign_val = []
        resign_v = -1.0
        n_resign_thres = N_SELFPLAY // 4

    for episode in range(n_selfplay):
        if (episode + 1) % 10 == 0:
            logging.warning('Playing Episode {:3}'.format(episode + 1))

        env = game.GameState('text')
        board = np.zeros((BOARD_SIZE, BOARD_SIZE), 'float')
        turn = 0
        root_id = (0, )
        win_index = 0
        time_steps = 0
        action_index = None

        if RESIGN_MODE:
            resign_index = 0

        while win_index == 0:
            if PRINT_SELFPLAY:
                utils.render_str(board, BOARD_SIZE, action_index)

            # ====================== start MCTS ============================ #

            if time_steps < TAU_THRES:
                tau = 1
            else:
                tau = 0

            pi = Agent.get_pi(root_id, tau)

            # ===================== collect samples ======================== #

            state = utils.get_state_pt(root_id, BOARD_SIZE, IN_PLANES)

            if turn == 0:
                state_black.appendleft(state)
                pi_black.appendleft(pi)
            else:
                state_white.appendleft(state)
                pi_white.appendleft(pi)

            # ======================== get action ========================== #

            action, action_index = utils.get_action(pi)
            root_id += (action_index, )

            # ====================== print evaluation ====================== #

            if PRINT_SELFPLAY:
                Agent.model.eval()
                with torch.no_grad():
                    state_input = torch.tensor([state]).to(device).float()
                    p, v = Agent.model(state_input)
                    p = p.cpu().numpy()[0]
                    v = v.item()

                    print('\nPi:\n{}'.format(
                        pi.reshape(BOARD_SIZE, BOARD_SIZE).round(decimals=2)))
                    print('\nPolicy:\n{}'.format(
                        p.reshape(BOARD_SIZE, BOARD_SIZE).round(decimals=2)))

                if turn == 0:
                    print("\nBlack's win%: {:.2f}%".format((v + 1) / 2 * 100))
                    if RESIGN_MODE:
                        if episode < n_resign_thres:
                            resign_val_balck.append(v)
                        elif v < resign_v:
                            resign_index = 2
                            if PRINT_SELFPLAY:
                                print('"Black Resign!"')
                else:
                    print("\nWhite's win%: {:.2f}%".format((v + 1) / 2 * 100))
                    if RESIGN_MODE:
                        if episode < n_resign_thres:
                            resign_val_white.append(v)
                        elif v < resign_v:
                            resign_index = 1
                            if PRINT_SELFPLAY:
                                print('"White Resign!"')

            # =========================== step ============================= #

            board, _, win_index, turn, _ = env.step(action)
            time_steps += 1

            # ========================== result ============================ #

            if RESIGN_MODE:
                if resign_index != 0:
                    win_index = resign_index
                    result['Resign'] += 1

            if win_index != 0:
                if win_index == 1:
                    reward_black = 1.
                    reward_white = -1.
                    result['Black'] += 1

                    if RESIGN_MODE:
                        if episode < n_resign_thres:
                            for val in resign_val_balck:
                                resign_val.append(val)
                            resign_val_balck.clear()
                            resign_val_white.clear()

                elif win_index == 2:
                    reward_black = -1.
                    reward_white = 1.
                    result['White'] += 1

                    if RESIGN_MODE:
                        if episode < n_resign_thres:
                            for val in resign_val_white:
                                resign_val.append(val)
                            resign_val_white.clear()
                            resign_val_balck.clear()
                else:
                    reward_black = 0.
                    reward_white = 0.
                    result['Draw'] += 1

                    if RESIGN_MODE:
                        if episode < n_resign_thres:
                            for val in resign_val_balck:
                                resign_val.append(val)
                            for val in resign_val_white:
                                resign_val.append(val)
                            resign_val_balck.clear()
                            resign_val_white.clear()

                if RESIGN_MODE:
                    if episode + 1 == n_resign_thres:
                        resign_v = min(resign_val)
                        resign_val.clear()

                    if PRINT_SELFPLAY:
                        print('Resign win%: {:.2f}%'.format(
                            (resign_v + 1) / 2 * 100))

            # ====================== store in memory ======================= #

                while state_black or state_white:
                    if state_black:
                        cur_memory.append(
                            (state_black.pop(), pi_black.pop(), reward_black))
                    if state_white:
                        cur_memory.append(
                            (state_white.pop(), pi_white.pop(), reward_white))

            # =========================  result  =========================== #

                if PRINT_SELFPLAY:
                    utils.render_str(board, BOARD_SIZE, action_index)

                    bw, ww, dr, rs = result['Black'], result['White'], \
                        result['Draw'], result['Resign']
                    print('')
                    print('=' * 20, " {:3} Game End   ".format(episode + 1),
                          '=' * 20)
                    print('Black Win: {:3}   '
                          'White Win: {:3}   '
                          'Draw: {:2}   '
                          'Win%: {:.2f}%'
                          '\nResign: {:2}'.format(bw, ww, dr, (bw + 0.5 * dr) /
                                                  (bw + ww + dr) * 100, rs))
                    print('current memory size:', len(cur_memory))

                Agent.reset()

    rep_memory.extend(utils.augment_dataset(cur_memory, BOARD_SIZE))
Example #26
def main():
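    # train a DQN with a target network and epsilon-greedy exploration, replaying mini-batches from memory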
    env = gym.make(args.env_name)
    env.seed(500)
    torch.manual_seed(500)

    img_shape = env.observation_space.shape
    num_actions = 3
    print('image size:', img_shape)
    print('action size:', num_actions)

    net = QNet(num_actions)
    target_net = QNet(num_actions)
    update_target_model(net, target_net)

    optimizer = optim.RMSprop(net.parameters(), lr=0.00025, eps=0.01)
    writer = SummaryWriter('logs')

    if not os.path.isdir(args.save_path):
        os.makedirs(args.save_path)

    net.to(device)
    target_net.to(device)
    net.train()
    target_net.train()
    memory = Memory(100000)
    running_score = 0
    epsilon = 1.0
    steps = 0

    for e in range(10000):
        done = False
        dead = False

        score = 0
        avg_loss = []
        start_life = 5
        state = env.reset()

        state = pre_process(state)
        state = torch.Tensor(state).to(device)
        history = torch.stack((state, state, state, state))

        for i in range(3):
            action = env.action_space.sample()
            state, reward, done, info = env.step(action)
            state = pre_process(state)
            state = torch.Tensor(state).to(device)
            state = state.unsqueeze(0)
            history = torch.cat((state, history[:-1]), dim=0)

        while not done:
            if args.render:
                env.render()

            steps += 1
            qvalue = net(history.unsqueeze(0))
            action = get_action(epsilon, qvalue, num_actions)

            next_state, reward, done, info = env.step(action + 1)

            next_state = pre_process(next_state)
            next_state = torch.Tensor(next_state).to(device)
            next_state = next_state.unsqueeze(0)
            next_history = torch.cat((next_state, history[:-1]), dim=0)

            if start_life > info['ale.lives']:
                dead = True
                start_life = info['ale.lives']

            score += reward
            reward = np.clip(reward, -1, 1)

            mask = 0 if dead else 1
            memory.push(history.cpu(), next_history.cpu(), action, reward,
                        mask)

            if dead:
                dead = False

            if steps > args.initial_exploration:
                epsilon -= 1e-6
                epsilon = max(epsilon, 0.1)

                batch = memory.sample(args.batch_size)
                loss = train_model(net, target_net, optimizer, batch)

                if steps % args.update_target == 0:
                    update_target_model(net, target_net)
            else:
                loss = 0

            avg_loss.append(loss)
            history = next_history

        if e % args.log_interval == 0:
            print(
                '{} episode | score: {:.2f} | epsilon: {:.4f} | steps: {} | loss: {:.4f}'
                .format(e, score, epsilon, steps, np.mean(avg_loss)))
            writer.add_scalar('log/score', float(score), steps)
            writer.add_scalar('log/loss', np.mean(avg_loss), steps)

        if score > args.goal_score:
            ckpt_path = args.save_path + 'model.pth'
            torch.save(net.state_dict(), ckpt_path)
            print('running score exceeds 400 so end')
            break