Example #1
def train(env, agent, args):
    monitor = Monitor(train=True, spec="-{}".format(args.method))
    monitor.init_log(args.log, "roi_m.{}_e.{}_n.{}".format(args.model, args.env_name, args.name))
    env.reset()

    S = set()
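    # S collects the multi-objective returns recovered so far (grown by update_ccs() below)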

    corWs = queue.Queue()
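    # corWs is a FIFO queue of candidate corner weights still to be trained on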

    # add two extreme points
    corWs.put(FloatTensor([1.0, 0.0]))
    corWs.put(FloatTensor([0.0, 1.0]))

    # outer_loop!
    for _ in range(args.ws):

        print(colored("size of corWs: {}".format(corWs.qsize()), "green"))

        if corWs.qsize() == 0:
            corWs.put(FloatTensor([1.0, 0.0]))
            corWs.put(FloatTensor([0.0, 1.0]))

        corner_w = corWs.get_nowait()
        while not is_corner(corner_w, S) and corWs.qsize()>0:
            corner_w = corWs.get_nowait()
            print(colored("{} left....".format(corWs.qsize()), "green"))
        if not is_corner(corner_w, S):
            print(colored("no more corner w...", "green"))
            print(colored("Final S contains", "green"))
            for s in S:
                print(colored(s, "green"))
            break
        print(colored("solve for w: {}".format(corner_w), "green"))

        for num_eps in range(int(args.episode_num / args.ws)):
            terminal = False
            env.reset()
            loss = 0
            cnt = 0
            tot_reward = 0

            tot_reward_mo = 0

            probe = None
            if args.env_name == "dst":
                probe = corner_w
            elif args.env_name in ['ft', 'ft5', 'ft7']:
                probe = FloatTensor([0.8, 0.2, 0.0, 0.0, 0.0, 0.0])

            while not terminal:
                state = env.observe()
                action = agent.act(state, corner_w)
                agent.w_kept = corner_w
                next_state, reward, terminal = env.step(action)
                if args.log:
                    monitor.add_log(state, action, reward, terminal, agent.w_kept)
                agent.memorize(state, action, next_state, reward, terminal, roi=True)
                loss += agent.learn(corner_w)
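                # hard cap: cut the episode off after 100 steps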
                if cnt > 100:
                    terminal = True
                    agent.reset()
                tot_reward = tot_reward + (probe.cpu().numpy().dot(reward)) * np.power(args.gamma, cnt)

                tot_reward_mo = tot_reward_mo + reward * np.power(args.gamma, cnt)

                cnt = cnt + 1

            _, q = agent.predict(probe)

            if args.env_name == "dst":
                act_1 = q[0, 3]
                act_2 = q[0, 1]
            elif args.env_name in ['ft', 'ft5', 'ft7']:
                act_1 = q[0, 1]
                act_2 = q[0, 0]

            if args.method == "crl-naive":
                act_1 = act_1.data.cpu()
                act_2 = act_2.data.cpu()
            elif args.method == "crl-envelope":
                act_1 = probe.dot(act_1.data)
                act_2 = probe.dot(act_2.data)
            elif args.method == "crl-energy":
                act_1 = probe.dot(act_1.data)
                act_2 = probe.dot(act_2.data)
            print("end of eps %d with total reward (1) %0.2f (%0.2f, %0.2f), the Q is %0.2f | %0.2f; loss: %0.4f" % (
                num_eps,
                tot_reward,
                tot_reward_mo[0],
                tot_reward_mo[1],
                act_1,
                act_2,
                # q__max,
                loss / cnt))
            monitor.update(num_eps,
                           tot_reward,
                           act_1,
                           act_2,
                           #    q__max,
                           loss / cnt)


        # agent.is_train=False
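        # evaluation rollout under corner_w (the commented line above would turn off
        # exploration); its discounted multi-objective return feeds update_ccs() below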
        terminal = False
        env.reset()
        cnt = 0
        tot_reward_mo = 0
        while not terminal:
            state = env.observe()
            action = agent.act(state, corner_w)
            agent.w_kept = corner_w
            next_state, reward, terminal = env.step(action)
            if cnt > 100:
                terminal = True
                agent.reset()
            tot_reward_mo = tot_reward_mo + reward * np.power(args.gamma, cnt)
            cnt = cnt + 1
        agent.is_train=True

        S, corWs = update_ccs(S, corWs, tot_reward_mo)

        print(colored("----------------\n", "red"))
        print(colored("Current S contains", "red"))
        for s in S:
            print(colored(s, "red"))
        print(colored("----------------\n", "red"))

    # if num_eps+1 % 100 == 0:
    # 	agent.save(args.save, args.model+args.name+"_tmp_{}".format(number))
    agent.save(args.save, "roi_m.{}_e.{}_n.{}".format(args.model, args.env_name, args.name))
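
For context, the training loops above and below read only a few fields from args. The following is a minimal, hypothetical sketch of how that argument bundle could be assembled; the values are illustrative rather than taken from the original project, and env and agent are placeholders that must provide the methods the loops call (observe, step, reset, act, memorize, learn, predict, save).

# Hypothetical argument bundle for train(env, agent, args); values are illustrative only.
from types import SimpleNamespace

args = SimpleNamespace(
    method="crl-envelope",   # "crl-naive", "crl-envelope", or "crl-energy"
    model="linear",          # tag used only in log and checkpoint names
    env_name="dst",          # "dst", "ft", "ft5", "ft7" (Example #3 also handles "crp")
    name="run0",             # experiment name suffix
    log="logs/",             # log destination handed to Monitor.init_log
    save="saved/",           # checkpoint directory handed to agent.save
    ws=10,                   # number of outer-loop corner weights (Example #1 only)
    episode_num=2000,        # total training episodes
    gamma=0.99,              # discount factor for the reported return
)
# train(env, agent, args)   # env and agent are supplied by the surrounding project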
Example #2
def train(env, agent, args):
    monitor = Monitor(train=True, spec="-{}".format(args.method))
    monitor.init_log(
        args.log, "m.{}_e.{}_n.{}".format(args.model, args.env_name,
                                          args.name))
    env.reset()
    for num_eps in range(args.episode_num):
        terminal = False
        env.reset()
        loss = 0
        cnt = 0
        tot_reward = 0

        probe = None
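        # probe: fixed preference vector used only to report the scalarized return and probed Q-values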
        if args.env_name == "dst":
            probe = FloatTensor([0.8, 0.2])
        elif args.env_name in ['ft', 'ft5', 'ft7']:
            probe = FloatTensor([0.8, 0.2, 0.0, 0.0, 0.0, 0.0])

        while not terminal:
            state = env.observe()
            action = agent.act(state)
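            # the agent keeps its own preference vector internally (agent.w_kept, logged below)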
            next_state, reward, terminal = env.step(action)
            if args.log:
                monitor.add_log(state, action, reward, terminal, agent.w_kept)
            agent.memorize(state, action, next_state, reward, terminal)
            loss += agent.learn()
            if cnt > 100:
                terminal = True
                agent.reset()
            tot_reward = tot_reward + (
                probe.cpu().numpy().dot(reward)) * np.power(args.gamma, cnt)
            cnt = cnt + 1

        _, q = agent.predict(probe)

        if args.env_name == "dst":
            act_1 = q[0, 3]
            act_2 = q[0, 1]
        elif args.env_name in ['ft', 'ft5', 'ft7']:
            act_1 = q[0, 1]
            act_2 = q[0, 0]

        if args.method == "crl-naive":
            act_1 = act_1.data.cpu()
            act_2 = act_2.data.cpu()
        elif args.method == "crl-envelope":
            act_1 = probe.dot(act_1.data)
            act_2 = probe.dot(act_2.data)
        elif args.method == "crl-energy":
            act_1 = probe.dot(act_1.data)
            act_2 = probe.dot(act_2.data)
        print(
            "end of eps %d with total reward (1) %0.2f, the Q is %0.2f | %0.2f; loss: %0.4f"
            % (
                num_eps,
                tot_reward,
                act_1,
                act_2,
                # q__max,
                loss / cnt))
        monitor.update(
            num_eps,
            tot_reward,
            act_1,
            act_2,
            #    q__max,
            loss / cnt)
    if (num_eps + 1) % 500 == 0:
        agent.save(
            args.save, "m.{}_e.{}_n.{}".format(args.model, args.env_name,
                                               args.name))
Example #3
def train(env, agent, args):
    monitor = Monitor(train=True, spec="-{}".format(args.method))
    monitor.init_log(
        args.log, "m.{}_e.{}_n.{}".format(args.model, args.env_name,
                                          args.name))
    env.reset()
    initial_state = env.observe()
    for num_eps in range(args.episode_num):
        terminal = False
        env.reset()
        loss = 0
        cnt = 0
        act1 = 0
        act2 = 0
        tot_reward = 0
        tot_reward_nc = 0
        tot_reward_dist = 0
        mask = None
        next_mask = None
        probe = None
        if args.env_name == "dst":
            probe = FloatTensor([0.8, 0.2])
        elif args.env_name == "crp":
            probe = FloatTensor([0.5, 0.5])
        elif args.env_name in ['ft', 'ft5', 'ft7']:
            probe = FloatTensor([0.8, 0.2, 0.0, 0.0, 0.0, 0.0])

        while not terminal:
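            # per-step wall-clock timers; their summary print is kept commented out further down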
            t_now = time.time()
            state = env.observe()
            t_obs = time.time() - t_now
            t_now = time.time()
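            # crp exposes a per-state action mask; for the other envs mask stays None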
            if args.env_name == "crp":
                mask = env.env.get_action_out_mask()
            action = agent.act(state, mask=mask)
            t_policy = time.time() - t_now
            t_now = time.time()
            next_state, reward, terminal = env.step(action, step=0.5)
            t_step = time.time() - t_now
            if args.env_name == "crp":
                next_mask = env.env.get_action_out_mask()
            if args.log:
                monitor.add_log(state, action, reward, terminal, agent.w_kept)
            t_now = time.time()
            agent.memorize(state, action, next_state, reward, terminal, mask,
                           next_mask)
            t_mem = time.time() - t_now
            t_now = time.time()
            loss += agent.learn()
            t_learn = time.time() - t_now
            if terminal:
                # terminal = True
                t_now = time.time()
                agent.reset()
                t_reset = time.time() - t_now
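            # undiscounted bookkeeping: scalarized return, per-objective sums, and two crp-specific metrics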
            tot_reward = tot_reward + (probe.cpu().numpy().dot(reward))
            act1 += reward[0]
            act2 += reward[1]
            tot_reward_nc = tot_reward_nc + 1 - reward[0]
            tot_reward_dist = tot_reward_dist + env.env.get_distortion(
                absolute=True, tollerance=0) / 10
            cnt = cnt + 1

        # _, q = agent.predict(probe, initial_state=initial_state)

        # if args.env_name == "dst":
        #     act_1 = q[0, 3]
        #     act_2 = q[0, 1]
        if args.env_name == "crp":
            act_1 = act1
            act_2 = act2
        # elif args.env_name in ['ft', 'ft5', 'ft7']:
        # act_1 = q[0, 1]
        # act_2 = q[0, 0]

        # if args.method == "crl-naive":
        #     act_1 = act_1.data.cpu()
        #     act_2 = act_2.data.cpu()
        # elif args.method == "crl-envelope":
        #     act_1 = probe.dot(act_1.data)
        #     act_2 = probe.dot(act_2.data)
        # elif args.method == "crl-energy":
        #     act_1 = probe.dot(act_1.data)
        #     act_2 = probe.dot(act_2.data)
        print(
            "end of eps %d with total reward (1) %0.2f, the Q is %0.2f | %0.2f; loss: %0.4f;  total_nc: %0.2f; total_dist: %0.2f;beta : %0.2f;eps : %0.2f;"
            % (
                num_eps,
                tot_reward,
                act_1,
                act_2,
                # q__max,
                loss / cnt,
                tot_reward_nc,
                tot_reward_dist,
                agent.beta,
                agent.epsilon))
        # print("t_obs : %0.2f;t_policy : %0.2f;t_step : %0.2f;t_mem : %0.2f;t_learn : %0.2f;t_reset : %0.2f" % (
        #     t_obs,
        #     t_policy,
        #     t_step,
        #     t_mem,
        #     t_learn,
        #     t_reset,))

        monitor.update(
            num_eps,
            tot_reward,
            act_1,
            act_2,
            #    q__max,
            loss / cnt)
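        # every 10 episodes, overwrite the latest checkpoint and keep a coarser snapshot keyed by num_eps // 100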
        if (num_eps) % 10 == 0:
            agent.save(
                args.save, "m.{}_e.{}_n.{}".format(args.model, args.env_name,
                                                   args.name))
            agent.save(
                args.save,
                "m.{}_e.{}_n.{}.ep{}".format(args.model, args.env_name,
                                             args.name, num_eps // 100))