Example #1
def test(env, args):
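    # Build the evaluation network, switch it to eval mode, and load the saved weights.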
    current_model = DQN(env, args).to(args.device)
    current_model.eval()

    load_model(current_model, args)

    episode_reward = 0
    episode_length = 0

    state = env.reset()
    while True:
        if args.render:
            env.render()

        action = current_model.act(
            torch.FloatTensor(state).to(args.device), 0.)

        next_state, reward, done, _ = env.step(action)

        state = next_state
        episode_reward += reward
        episode_length += 1

        if done:
            break

    print("Test Result - Reward {} Length {}".format(episode_reward,
                                                     episode_length))
Example #2
def round_01():
    how_many = 10
    model_file_path = "models\\random_forest\\rf-01-all-music.joblib"
    new_model = True
    models_root_dir = "models\\random_forest"
    model_name = "rf-01-all-music.joblib"
    if new_model and (model_name is None or model_name == ""):
        raise ValueError("Provide a model name")
    elif new_model:
        model = create_a_rf_classifier()
    else:
        model = load_model(model_file_path)
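    # Split the data, train on the training portion, then report accuracy on both splits.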
    train, test, features = get_test_and_train_data(how_many, training_percentage=0.75)
    model = train_model_using_data_from_file(model, features, train)

    pred = predict(model, test, features)
    acc = accuracy_score(test["class"], pred)
    print("Accuracy on test set {}".format(acc))

    pred = predict(model, train, features)
    acc = accuracy_score(train["class"], pred)
    print("Accuracy on training set {}".format(acc))

    if new_model:
        store_model(model, os.path.join(models_root_dir, model_name))
Example #3
def predict_room_price_from_model(conf_model, room_param):
    err = validate_room_param(room_param)
    if err:
        status = {'success': False, 'err': err}
    else:
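        # Standardize the raw room parameters and reshape them into a single-row feature matrix.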
        obj_param = standardize_room_param(room_param)
        model_param_1D = np.array(list(obj_param.values()))
        model_param = model_param_1D.reshape(1, -1)
        price = -1

        # Load the model if it could not be loaded the first time
        if conf_model['reload']:
            conf_model['model'] = utl.load_model(root_path +
                                                 conf_model['path'])
            conf_model['reload'] = False

        if conf_model['model'] is not None:
            try:
                price = conf_model['model'].predict(model_param)
            except Exception as e:
                conf_model['reload'] = True
                print("[Error] : Prediction failed: {}".format(e))
        else:
            conf_model['reload'] = True

        # Check whether the predicted price is valid
        if price <= 0:
            status = {'success': False, 'predict': -1}
        else:
            price = price.flatten()
            price_room = np.float64(price[0])
            str_price = round_currency_up(price_room)
            status = {'success': True, 'predict': str_price}
    return status
Example #4
def check_video():
    # TODO: create general preprocessing function for all user title inputs
    if request.method == "POST":
        title = request.form["title"]
        classifier_type = request.form.get("classifier_type")

        # catch empty titles
        if not title:
            return render_template(
                "main.html"
            )  # add the flash message instead of this for clarity on UI

        # and now we start the magic
        # TODO: show on results page what kind of classifier was used
        if classifier_type == "simple_heuristics":
            model = simple_heuristics
            if model.predict(title):
                return render_template("result_good.html")
            return render_template("result_bad.html")

        else:
            # (bad) linear and logistic regression methods for now
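            # classifier_type is also the identifier handed to load_model for the persisted model.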
            model = load_model(classifier_type)
            result = model.predict(simple_regression_test_processing(title))
            if result >= 36:
                return render_template("result_good.html")
            return render_template("result_bad.html")

    return render_template("main.html")
Example #5
def test(env, args): 
    p1_current_model = DQN(env, args).to(args.device)
    p2_current_model = DQN(env, args).to(args.device)
    p1_policy = Policy(env).to(args.device)
    p2_policy = Policy(env).to(args.device)
    p1_current_model.eval(), p2_current_model.eval()
    p1_policy.eval(), p2_policy.eval()

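    # Load saved weights for both players' best-response (DQN) and average-strategy (Policy) networks.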
    load_model(models={"p1": p1_current_model, "p2": p2_current_model},
               policies={"p1": p1_policy, "p2": p2_policy}, args=args)

    p1_reward_list = []
    p2_reward_list = []
    length_list = []

    for _ in range(30):
        (p1_state, p2_state) = env.reset()
        p1_episode_reward = 0
        p2_episode_reward = 0
        episode_length = 0
        while True:
            if args.render:
                env.render()
                sleep(0.01)

            # Agents follow average strategy
            p1_action = p1_policy.act(torch.FloatTensor(p1_state).to(args.device))
            p2_action = p2_policy.act(torch.FloatTensor(p2_state).to(args.device))

            actions = {"1": p1_action, "2": p2_action}

            (p1_next_state, p2_next_state), reward, done, _ = env.step(actions)

            (p1_state, p2_state) = (p1_next_state, p2_next_state)
            p1_episode_reward += reward[0]
            p2_episode_reward += reward[1]
            episode_length += 1

            if done:
                p1_reward_list.append(p1_episode_reward)
                p2_reward_list.append(p2_episode_reward)
                length_list.append(episode_length)
                break
    
    print("Test Result - Length {:.2f} p1/Reward {:.2f} p2/Reward {:.2f}".format(
        np.mean(length_list), np.mean(p1_reward_list), np.mean(p2_reward_list)))
    
Example #6
def test(env, args):
    from time import sleep

    p1_current_model = DQN(env, args).to(args.device)
    p2_current_model = DQN(env, args).to(args.device)
    p1_current_model.eval()
    p2_current_model.eval()

    load_model(p1_current_model, args, 1)
    load_model(p2_current_model, args, 2)

    p1_reward_list = []
    p2_reward_list = []
    length_list = []

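    # Run 30 greedy (epsilon = 0) evaluation episodes and average the results.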
    for _ in range(30):
        (p1_state, p2_state) = env.reset()
        p1_episode_reward = 0
        p2_episode_reward = 0
        episode_length = 0
        while True:
            if args.render:
                env.render()
                sleep(0.2)

            p1_action = p1_current_model.act(torch.FloatTensor(p1_state).to(args.device), 0.0)
            p2_action = p2_current_model.act(torch.FloatTensor(p2_state).to(args.device), 0.0)

            actions = {"1": p1_action, "2": p2_action}

            (p1_next_state, p2_next_state), reward, done, _ = env.step(actions)

            (p1_state, p2_state) = (p1_next_state, p2_next_state)
            p1_episode_reward += reward[0]
            p2_episode_reward += reward[1]
            episode_length += 1

            if done:
                p1_reward_list.append(p1_episode_reward)
                p2_reward_list.append(p2_episode_reward)
                length_list.append(episode_length)
                break
    
    print("Test Result - Length {:.2f} p1/Reward {:.2f} p2/Reward {:.2f}".format(
        np.mean(length_list), np.mean(p1_reward_list), np.mean(p2_reward_list)))
    
Example #7
def test_round_1():
    model_file_path = "models\\random_forest\\rf-01-all-music-music.joblib"
    model = load_model(model_file_path)
    data = load_data(how_many=4, last=True)
    data = data.astype({'class': str})
    features = data.columns[:705]

    pred = predict(model, data, features)
    acc = accuracy_score(data["class"], pred)
    print("Accuracy on validation set {}".format(acc))
Example #8
    def __init__(self):
        self._relevant_tags = {
            'control': {
                'noun': read_relevant_set('nouns', 'control'),
                'verb': read_relevant_set('verbs', 'control')
            },
            'patients': {
                'noun': read_relevant_set('nouns', 'patients'),
                'verb': read_relevant_set('verbs', 'patients')
            }
        }
        self._reference_tags = {
            'noun': read_reference_set('nouns'),
            'verb': read_reference_set('verbs')
        }

        # get part of speech tags
        self._answers_to_user_id_pos_data = {}
        pos_tags_generator = pos_tags_jsons_generator()
        for answer_num, ans_pos_tags in pos_tags_generator:
            self._answers_to_user_id_pos_data[answer_num] = ans_pos_tags

        # init model
        self._model = load_model('word2vec_dep.pickle')

        # calculate idf scores for words
        self._idf_scores = IdfScores(self.get_documents(), repair_document)
        self._idf_scores.calculate_idf_scores()

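        # Bookkeeping for words missing idf scores or embeddings, and for the tags/modifiers actually used.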
        self.missing_idf = {}
        self.words_without_embeddings = []
        self.missing_words = []
        self.pos_tags_used = {
            'control': {
                'nouns': [],
                'verbs': []
            },
            'patients': {
                'nouns': [],
                'verbs': []
            }
        }
        self.modifiers_used = {
            'control': {
                'noun': [],
                'verb': []
            },
            'patients': {
                'noun': [],
                'verb': []
            }
        }
Example #9
def test_whole(current_model, env, args, num):
    load_model(current_model, args)
    episode_reward = 0
    episode_length = 0

    state = env.reset()
    lives = env.unwrapped.ale.lives()
    live = lives
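    # Play one life at a time: up to 5000 greedy steps, then finish the life with random actions if it has not ended.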
    while live > 0:
        for i in range(5000):
            if args.render:
                env.render()
            if args.noisy:
                current_model.update_noisy_modules()

            action = current_model.act(
                torch.FloatTensor(state).to(args.device), 0.)

            next_state, reward, done, _ = env.step(action)

            state = next_state
            episode_reward += reward
            episode_length += 1
            if done:
                state = env.reset()
                live -= 1
                break

        if not done:
            while not done:
                if args.render:
                    env.render()
                _, _, done, _ = env.step(random.randrange(env.action_space.n))
            state = env.reset()
            live -= 1

    print("Test Result - Reward {} Length {} at {}".format(
        episode_reward, episode_length, num))
    return episode_reward
Example #10
def test(env, args): 
    current_model = DQN(env, args).to(args.device)
    current_model.eval()

    load_model(current_model, args)

    episode_reward = 0
    episode_length = 0

    state_buffer = deque(maxlen=args.action_repeat)
    states_deque = actions_deque = rewards_deque = None
    state, state_buffer = get_initial_state(env, state_buffer, args.action_repeat)
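    # Step all agents greedily, dropping agents that finish, until the environment signals the end.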
    while True:

        action = current_model.act(torch.FloatTensor(state).to(args.device), 0.)
        next_state, reward, done, end = env.step(action, save_screenshots=True)
        add_state(next_state, state_buffer)
        next_state = recent_state(state_buffer)

        state = next_state
        episode_reward += np.array(reward).mean()
        episode_length += 1

        if end:
            break
        # delete the agents that have reached the goal
        r_index = 0
        for r in range(len(done)):
            if done[r] is True:
                state_buffer, states_deque, actions_deque, rewards_deque = \
                    del_record(r_index, state_buffer, states_deque, actions_deque, rewards_deque)
                r_index -= 1
            r_index += 1
        next_state = recent_state(state_buffer)

        state = next_state
    PanicEnv.display(True)
    print("Test Result - Reward {} Length {}".format(episode_reward, episode_length))
    
Example #11
    def calculate_all_scores(self):
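        # Reuse precalculated scores if the csv exists; otherwise load the embedding model, score every user, and cache the results.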
        if os.path.isfile(self.derailment_precalc_scores):
            results_df = pd.read_csv(self.derailment_precalc_scores)
        else:
            self._model = load_model(self._embeddings_path)
            # iterate users
            results = []

            for user_data in tqdm(self._data,
                                  total=len(self._data),
                                  desc="Creating scores csv file"):
                result = self._calc_scores_per_user(user_data)
                results.extend(result)

            columns = [
                "user_id", "label", "answer_num", "valid_words_cnt", "score"
            ]
            results_df = pd.DataFrame(results, columns=columns)
            results_df.to_csv(self.derailment_precalc_scores, index=False)

        self._update_users_data(results_df)
Example #12
def train(env, args, writer):
    p1_current_model = DQN(env, args).to(args.device)
    p1_target_model = DQN(env, args).to(args.device)
    update_target(p1_current_model, p1_target_model)
    p2_current_model = DQN(env, args).to(args.device)
    p2_target_model = DQN(env, args).to(args.device)
    update_target(p2_current_model, p2_target_model)

    if args.noisy:
        p1_current_model.update_noisy_modules()
        p1_target_model.update_noisy_modules()
        p2_current_model.update_noisy_modules()
        p2_target_model.update_noisy_modules()

    if args.load_model and os.path.isfile(args.load_model):
        load_model(p1_current_model, args, 1)
        load_model(p2_current_model, args, 2)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final, args.eps_decay)
    beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames)

    if args.prioritized_replay:
        p1_replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
        p2_replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
    else:
        p1_replay_buffer = ReplayBuffer(args.buffer_size)
        p2_replay_buffer = ReplayBuffer(args.buffer_size)
    
    p1_state_deque = deque(maxlen=args.multi_step)
    p2_state_deque = deque(maxlen=args.multi_step)
    p1_reward_deque = deque(maxlen=args.multi_step)
    p1_action_deque = deque(maxlen=args.multi_step)
    p2_reward_deque = deque(maxlen=args.multi_step)
    p2_action_deque = deque(maxlen=args.multi_step)

    p1_optimizer = optim.Adam(p1_current_model.parameters(), lr=args.lr)
    p2_optimizer = optim.Adam(p2_current_model.parameters(), lr=args.lr)

    length_list = []
    p1_reward_list, p1_loss_list = [], []
    p2_reward_list, p2_loss_list = [], []
    p1_episode_reward, p2_episode_reward = 0, 0
    episode_length = 0

    prev_time = time.time()
    prev_frame = 1

    (p1_state, p2_state) = env.reset()
    for frame_idx in range(1, args.max_frames + 1):
        if args.noisy:
            p1_current_model.sample_noise()
            p1_target_model.sample_noise()
            p2_current_model.sample_noise()
            p2_target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)
        p1_action = p1_current_model.act(torch.FloatTensor(p1_state).to(args.device), epsilon)
        p2_action = p2_current_model.act(torch.FloatTensor(p2_state).to(args.device), epsilon)

        if args.render:
            env.render()

        actions = {"1": p1_action, "2": p2_action}
        (p1_next_state, p2_next_state), reward, done, _ = env.step(actions)


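        # Record each player's transition; with args.negative, every step reward is shifted by -1.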
        p1_state_deque.append(p1_state)
        p2_state_deque.append(p2_state)
        p1_reward_deque.append(reward[0] - 1 if args.negative else reward[0])
        p1_action_deque.append(p1_action)
        p2_reward_deque.append(reward[1] - 1 if args.negative else reward[1])
        p2_action_deque.append(p2_action)

        if len(p1_state_deque) == args.multi_step or done:
            n_reward = multi_step_reward(p1_reward_deque, args.gamma)
            n_state = p1_state_deque[0]
            n_action = p1_action_deque[0]
            p1_replay_buffer.push(n_state, n_action, n_reward, p1_next_state, np.float32(done))

            n_reward = multi_step_reward(p2_reward_deque, args.gamma)
            n_state = p2_state_deque[0]
            n_action = p2_action_deque[0]
            p2_replay_buffer.push(n_state, n_action, n_reward, p2_next_state, np.float32(done))

        (p1_state, p2_state) = (p1_next_state, p2_next_state)
        p1_episode_reward += (reward[0])
        p2_episode_reward += (reward[1])
        if args.negative:
            p1_episode_reward -= 1
            p2_episode_reward -= 1
        episode_length += 1

        if done or episode_length > args.max_episode_length:
            (p1_state, p2_state) = env.reset()
            p1_reward_list.append(p1_episode_reward)
            p2_reward_list.append(p2_episode_reward)
            length_list.append(episode_length)
            writer.add_scalar("data/p1_episode_reward", p1_episode_reward, frame_idx)
            writer.add_scalar("data/p2_episode_reward", p2_episode_reward, frame_idx)
            writer.add_scalar("data/episode_length", episode_length, frame_idx)
            p1_episode_reward, p2_episode_reward, episode_length = 0, 0, 0
            p1_state_deque.clear()
            p2_state_deque.clear()
            p1_reward_deque.clear()
            p2_reward_deque.clear()
            p1_action_deque.clear()
            p2_action_deque.clear()

        if len(p1_replay_buffer) > args.learning_start and frame_idx % args.train_freq == 0:
            beta = beta_by_frame(frame_idx)
            loss = compute_td_loss(p1_current_model, p1_target_model, p1_replay_buffer, p1_optimizer, args, beta)
            p1_loss_list.append(loss.item())
            writer.add_scalar("data/p1_loss", loss.item(), frame_idx)

            loss = compute_td_loss(p2_current_model, p2_target_model, p2_replay_buffer, p2_optimizer, args, beta)
            p2_loss_list.append(loss.item())
            writer.add_scalar("data/p2_loss", loss.item(), frame_idx)

        if frame_idx % args.update_target == 0:
            update_target(p1_current_model, p1_target_model)
            update_target(p2_current_model, p2_target_model)

        if frame_idx % args.evaluation_interval == 0:
            print_log(frame_idx, prev_frame, prev_time, p1_reward_list, length_list, p1_loss_list)
            print_log(frame_idx, prev_frame, prev_time, p2_reward_list, length_list, p2_loss_list)
            p1_reward_list.clear(), p2_reward_list.clear(), length_list.clear()
            p1_loss_list.clear(), p2_loss_list.clear()
            prev_frame = frame_idx
            prev_time = time.time()
            save_model(p1_current_model, args, 1)
            save_model(p2_current_model, args, 2)

    save_model(p1_current_model, args, 1)
    save_model(p2_current_model, args, 2)
Example #13
import gym
from a2c_algorithm_step_update import A2C

import sys
sys.path.append('..')
from common.utils import load_model

# env_name = 'Pendulum-v0'
# env = gym.make(env_name)
# a2c = A2C(env_name, env, is_continue_action_space=True)

env_name = 'MountainCar-v0'
env = gym.make(env_name)
a2c = A2C(env_name, env, is_continue_action_space=False, is_test=True)

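# Restore the best saved actor/critic weights and run rendered evaluation episodes.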
load_model(a2c.actor, 'model_step_update/{}_model/best_actor'.format(env_name))
load_model(a2c.critic,
           'model_step_update/{}_model/best_critic'.format(env_name))
for _ in range(10):
    eval_r = a2c.evaluate(5, is_render=True)
    print('evaluate reward', eval_r)
Example #14
def validate_3DDynamic(env, args):
    current_model = DQN_3D(env, args).to(args.device)
    load_model(current_model, args)
    current_model.update_noisy_modules()
    current_model.eval()

    TEST_EPISODES_PER_PLAN = 200
    NUM_TEST_PLANS = 10

    lowest_reward = 1e4
    highest_iou = 0.0
    lowest_iou = 1.0
    plan = env.plan
    episode_reward = 0
    count_brick_save = None
    count_step_save = None

    fig = plt.figure(figsize=[10, 5])
    ax1 = fig.add_subplot(1, 2, 1, projection='3d')
    ax2 = fig.add_subplot(1, 2, 2)

    cumulative_reward = 0
    cumulative_iou = 0
    best_env_memory = None

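    # Evaluate the selected validation plan(s), tracking the best/worst IoU and the layout that achieved the best IoU.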
    for tests_set in range(3, 4):
        print("Validation Plan: ", tests_set)
        env.set_tests_set(tests_set)
        test_episode_count = 0
        total_reward = 0
        total_iou = 0
        for i in range(TEST_EPISODES_PER_PLAN):
            test_episode_count += 1
            state = env.reset()
            count_brick = 0
            count_step = 0
            while True:
                count_step += 1

                if args.noisy:
                    current_model.reset_parameters()
                    current_model.sample_noise()

                epsilon = 0.0
                with torch.no_grad():
                    action = current_model.act(torch.FloatTensor(state).to(args.device), epsilon)
                if action == 2:
                    count_brick += 1

                next_state, reward, done = env.step(action)

                state = next_state
                episode_reward += reward

                if done:
                    total_reward += episode_reward
                    lowest_reward = min(lowest_reward, episode_reward)

                    environment_memory = env.environment_memory[0, args.half_window_size: 34 - args.half_window_size]
                    iou = env._iou()
                    total_iou += iou
                    if iou > highest_iou:
                        highest_iou = iou
                        best_env_memory = environment_memory
                        count_brick_save = count_brick
                        count_step_save = count_step

                    if iou < lowest_iou:
                        lowest_iou = iou

                    episode_reward = 0
                    break
            print("\tTest Episode: ", test_episode_count, " / {} Average Reward: {} Average IOU: {}"
                  .format(TEST_EPISODES_PER_PLAN, total_reward / test_episode_count, total_iou / test_episode_count))

        avg_reward = total_reward / TEST_EPISODES_PER_PLAN
        avg_iou = total_iou / TEST_EPISODES_PER_PLAN

        cumulative_reward += avg_reward
        cumulative_iou += avg_iou

        print("\tTest Result - Plan: {} Average Reward: {} Lowest Reward: {} Average IOU: {}".format(
            tests_set, avg_reward, lowest_reward, avg_iou))

    avg_reward_allplans = cumulative_reward / NUM_TEST_PLANS
    avg_iou_allplans = cumulative_iou / NUM_TEST_PLANS
    print("\n\tTest Result (over all plans) - Average Reward: {} Lowest Reward: {} Average IOU: {}\n\n".format(
        avg_reward_allplans, lowest_reward, avg_iou_allplans))

    env.render(ax1, ax2, args, best_env_memory, plan, highest_iou, count_step_save,
               count_brick_save, args.load_model, iou_min=lowest_iou, iou_average=avg_iou_allplans,
               iter_times=TEST_EPISODES_PER_PLAN)

    plt.close(fig)
Example #15
def validate_3DStatic(env, args):
    current_model = DQN_3D(env, args).to(args.device)
    current_model.update_noisy_modules()
    load_model(current_model, args)
    current_model.eval()

    NUM_TEST_EPISODES = 500

    lowest_reward = 1e4
    highest_iou = 0.0
    lowest_iou = 1.0
    plan = env.plan
    episode_reward = 0
    count_brick_save = None
    count_step_save = None
    best_env_memory = None

    fig = plt.figure(figsize=[10, 5])
    ax1 = fig.add_subplot(1, 2, 1, projection='3d')
    ax2 = fig.add_subplot(1, 2, 2)

    test_episode_count = 0
    total_reward = 0
    total_iou = 0

    for i in range(NUM_TEST_EPISODES):
        test_episode_count += 1
        state = env.reset()
        count_brick = 0
        count_step = 0

        while True:
            if args.noisy:
                # current_model.reset_parameters()
                current_model.sample_noise()

            count_step += 1

            epsilon = 0.0
            with torch.no_grad():
                action = current_model.act(torch.FloatTensor(state).to(args.device), epsilon)
            if action == 2:
                count_brick += 1

            next_state, reward, done = env.step(action)

            state = next_state
            episode_reward += reward

            if done:
                total_reward += episode_reward
                lowest_reward = min(lowest_reward, episode_reward)

                environment_memory = env.environment_memory[0, args.half_window_size: 34 - args.half_window_size]
                iou = env._iou()
                total_iou += iou
                if iou > highest_iou:
                    highest_iou = iou
                    best_env_memory = environment_memory
                    count_brick_save = count_brick
                    count_step_save = count_step
                if iou < lowest_iou:
                    lowest_iou = iou

                episode_reward = 0
                break

        if test_episode_count % 5 == 0:
            print("\tTest Episode: ", test_episode_count, " / {} Average Reward: {} Average IOU: {}"
                  .format(NUM_TEST_EPISODES, total_reward / test_episode_count, total_iou / test_episode_count))

    avg_reward = total_reward / NUM_TEST_EPISODES
    avg_iou = total_iou / NUM_TEST_EPISODES

    print("\tTest Result - Average Reward: {} Lowest Reward: {} Average IOU: {}".format(
        avg_reward, lowest_reward, avg_iou))

    env.render(ax1, ax2, args, best_env_memory, plan, highest_iou, count_step_save,
               count_brick_save, args.load_model, iou_min=lowest_iou, iou_average=avg_iou, iter_times=NUM_TEST_EPISODES)

    plt.close(fig)
Example #16
def validate_2DDynamic(env, args):
    current_model = DQN_2D(env, args).to(args.device)
    load_model(current_model, args)
    current_model.eval()

    TEST_EPISODES_PER_PLAN = 200
    NUM_TEST_PLANS = 1

    lowest_reward = 1e4
    highest_iou = 0.0
    lowest_iou = 1.0
    episode_reward = 0
    count_brick_save = None
    count_step_save = None

    fig = plt.figure(figsize=(5, 5))
    ax = fig.add_subplot(1, 1, 1)

    cumulative_reward = 0
    cumulative_iou = 0

    for tests_set in range(6, 7):
        print("Validation Plan: ", tests_set)
        env.set_tests_set(tests_set)
        test_episode_count = 0
        total_reward = 0
        total_iou = 0
        for i in range(TEST_EPISODES_PER_PLAN):
            test_episode_count += 1
            state = env.reset()
            plan = env.plan
            count_brick = 0
            count_step = 0

            while True:
                count_step += 1

                epsilon = 0.0
                with torch.no_grad():
                    # print(state)
                    action = current_model.act(torch.FloatTensor(state[0]).to(args.device), epsilon)
                if action == 2:
                    count_brick += 1

                next_state, reward, done = env.step(action)

                state = next_state
                episode_reward += reward

                if done:
                    total_reward += episode_reward
                    lowest_reward = min(lowest_reward, episode_reward)

                    environment_memory = env.environment_memory[0, args.half_window_size: 34 - args.half_window_size]
                    iou = env._iou()
                    print(iou)
                    total_iou += iou
                    if iou > highest_iou:
                        highest_iou = iou
                        # print("NEW HIGH: ", highest_iou)
                        best_env_memory = environment_memory
                        count_brick_save = count_brick
                        count_step_save = count_step
                    if iou < lowest_iou:
                        lowest_iou = iou
                        # print("NEW LOW: ", lowest_iou)

                    episode_reward = 0
                    break
            if test_episode_count % 5 == 0:
                print("\tTest Episode: ", test_episode_count, " / {} Average Reward: {} Average IOU: {}"
                      .format(TEST_EPISODES_PER_PLAN, total_reward / test_episode_count, total_iou / test_episode_count))

        # print(total_iou)
        avg_reward = total_reward / TEST_EPISODES_PER_PLAN
        avg_iou = total_iou / TEST_EPISODES_PER_PLAN

        cumulative_reward += avg_reward
        cumulative_iou += avg_iou

        print("\tTest Result - Plan: {} Average Reward: {} Lowest Reward: {} Average IOU: {}".format(
            tests_set, avg_reward, lowest_reward, avg_iou))

    avg_reward_allplans = cumulative_reward / NUM_TEST_PLANS
    avg_iou_allplans = cumulative_iou / NUM_TEST_PLANS
    print("\n\tTest Result (over all plans) - Average Reward: {} Lowest Reward: {} Average IOU: {}\n\n".format(
        avg_reward_allplans, lowest_reward, avg_iou_allplans))

    env.render(ax, args, best_env_memory, plan, highest_iou, count_step_save,
               count_brick_save, args.load_model, iou_min=lowest_iou, iou_average=avg_iou_allplans,
               iter_times=TEST_EPISODES_PER_PLAN)

    plt.close(fig)
Example #17
def train(env, args, writer):
    current_model = DQN(env, args).to(args.device)
    target_model = DQN(env, args).to(args.device)

    if args.noisy:
        current_model.update_noisy_modules()
        target_model.update_noisy_modules()

    if args.load_model:  # and os.path.isfile(args.load_model)
        load_model(current_model, args)
        load_model(target_model, args)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final,
                                         args.eps_decay)
    beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames)

    if args.prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
    else:
        replay_buffer = ReplayBuffer(args.buffer_size)

    state_buffer = deque(maxlen=args.action_repeat)
    states_deque = [
        deque(maxlen=args.multi_step) for _ in range(args.num_agents)
    ]
    rewards_deque = [
        deque(maxlen=args.multi_step) for _ in range(args.num_agents)
    ]
    actions_deque = [
        deque(maxlen=args.multi_step) for _ in range(args.num_agents)
    ]

    optimizer = optim.Adam(current_model.parameters(), lr=args.lr)

    reward_list, length_list, loss_list = [], [], []
    episode_reward = 0
    episode_length = 0
    episode = 0

    prev_time = time.time()
    prev_frame = 1

    state, state_buffer = get_initial_state(env, state_buffer,
                                            args.action_repeat)
    for frame_idx in range(1, args.max_frames + 1):

        if args.noisy:
            current_model.sample_noise()
            target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)
        action = current_model.act(
            torch.FloatTensor(state).to(args.device), epsilon)

        next_state, reward, done, end = env.step(action,
                                                 save_screenshots=False)
        add_state(next_state, state_buffer)
        next_state = recent_state(state_buffer)

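        # Accumulate multi-step returns per agent and push completed transitions into the replay buffer.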
        for agent_index in range(len(done)):
            states_deque[agent_index].append((state[agent_index]))
            rewards_deque[agent_index].append(reward[agent_index])
            actions_deque[agent_index].append(action[agent_index])
            if len(states_deque[agent_index]
                   ) == args.multi_step or done[agent_index]:
                n_reward = multi_step_reward(rewards_deque[agent_index],
                                             args.gamma)
                n_state = states_deque[agent_index][0]
                n_action = actions_deque[agent_index][0]
                replay_buffer.push(n_state, n_action, n_reward,
                                   next_state[agent_index],
                                   np.float32(done[agent_index]))

        # delete the agents that have reached the goal
        r_index = 0
        for r in range(len(done)):
            if done[r] is True:
                state_buffer, states_deque, actions_deque, rewards_deque = del_record(
                    r_index, state_buffer, states_deque, actions_deque,
                    rewards_deque)
                r_index -= 1
            r_index += 1
        next_state = recent_state(state_buffer)

        state = next_state
        episode_reward += np.array(reward).mean()
        episode_length += 1

        if end:
            if args.save_video and episode % 10 == 0:
                evaluate(env, current_model, args)
            state, state_buffer = get_initial_state(env, state_buffer,
                                                    args.action_repeat)
            reward_list.append(episode_reward)
            length_list.append(episode_length)
            writer.add_scalar("data/episode_reward", episode_reward, frame_idx)
            writer.add_scalar("data/episode_length", episode_length, frame_idx)
            episode_reward, episode_length = 0, 0
            for d in range(len(states_deque)):
                states_deque[d].clear()
                rewards_deque[d].clear()
                actions_deque[d].clear()
            states_deque = [
                deque(maxlen=args.multi_step) for _ in range(args.num_agents)
            ]
            rewards_deque = [
                deque(maxlen=args.multi_step) for _ in range(args.num_agents)
            ]
            actions_deque = [
                deque(maxlen=args.multi_step) for _ in range(args.num_agents)
            ]
            episode += 1

        if len(replay_buffer
               ) > args.learning_start and frame_idx % args.train_freq == 0:
            beta = beta_by_frame(frame_idx)
            loss = compute_td_loss(current_model, target_model,
                                   replay_buffer, optimizer, args, beta)
            loss_list.append(loss.item())
            writer.add_scalar("data/loss", loss.item(), frame_idx)

        if frame_idx % args.update_target == 0:
            update_target(current_model, target_model)

        if frame_idx % args.evaluation_interval == 0:
            print_log(frame_idx, prev_frame, prev_time, reward_list,
                      length_list, loss_list)
            reward_list.clear(), length_list.clear(), loss_list.clear()
            prev_frame = frame_idx
            prev_time = time.time()
            save_model(current_model, args)

    save_model(current_model, args)
Example #18
def train(env, args, writer, datetime):
    best_iou = -1.0
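    # Pick the network architecture that matches the environment's dimensionality (1D/2D/3D).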
    if args.env in ['1DStatic', '1DDynamic']:
        current_model = DQN_1D(env, args).to(args.device)
        target_model = DQN_1D(env, args).to(args.device)
    elif args.env in ['2DStatic', '2DDynamic']:
        current_model = DQN_2D(env, args).to(args.device)
        target_model = DQN_2D(env, args).to(args.device)
    elif args.env in ['3DStatic', '3DDynamic']:
        current_model = DQN_3D(env, args).to(args.device)
        target_model = DQN_3D(env, args).to(args.device)

    if args.noisy:
        current_model.update_noisy_modules()
        target_model.update_noisy_modules()

    if args.load_model and os.path.isfile(args.load_model):
        load_model(current_model, args)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final,
                                         args.eps_decay)
    beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames)

    if args.prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
    else:
        replay_buffer = ReplayBuffer(args.buffer_size)

    state_deque = deque(maxlen=args.multi_step)
    reward_deque = deque(maxlen=args.multi_step)
    action_deque = deque(maxlen=args.multi_step)

    optimizer = optim.Adam(current_model.parameters(), lr=args.lr)

    reward_list, length_list, loss_list = [], [], []
    episode_reward = 0
    episode_length = 0
    episode = 0

    prev_time = time.time()
    prev_frame = 1

    state = env.reset()
    for frame_idx in range(1, args.max_frames + 1):
        if args.render:
            env.render()

        if args.noisy:
            current_model.sample_noise()
            target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)
        action = current_model.act(
            torch.FloatTensor(state).to(args.device), epsilon)
        next_state, reward, done = env.step(action)
        state_deque.append(state)
        reward_deque.append(reward)
        action_deque.append(action)

        if len(state_deque) == args.multi_step or done:
            n_reward = multi_step_reward(reward_deque, args.gamma)
            n_state = state_deque[0]
            n_action = action_deque[0]
            replay_buffer.push(n_state, n_action, n_reward, next_state,
                               np.float32(done))

        state = next_state
        episode_reward += reward
        episode_length += 1

        if done:
            episode += 1

            state = env.reset()
            reward_list.append(episode_reward)
            length_list.append(episode_length)
            writer.add_scalar("Episode_reward/train", episode_reward, episode)
            writer.add_scalar("Episode_length/train", episode_length, episode)
            episode_reward = 0
            episode_length = 0
            state_deque.clear()
            reward_deque.clear()
            action_deque.clear()

        if len(replay_buffer
               ) > args.learning_start and frame_idx % args.train_freq == 0:
            beta = beta_by_frame(frame_idx)
            loss = compute_td_loss(current_model, target_model, replay_buffer,
                                   optimizer, args, beta)
            loss_list.append(loss.item())
            writer.add_scalar("Loss/train", loss.item(), frame_idx)

        if frame_idx % args.update_target == 0:
            update_target(current_model, target_model)

        if frame_idx % args.evaluation_interval == 0:
            print_log(frame_idx, prev_frame, prev_time, reward_list,
                      length_list, loss_list, args)
            reward_list.clear(), length_list.clear(), loss_list.clear()
            prev_frame = frame_idx
            prev_time = time.time()

            best_iou = test(env, args, current_model, best_iou, writer,
                            episode, datetime)
Example #19
import gym
from ddpg_algorithm import DDPG

import sys
sys.path.append('..')
from common.utils import load_model

env_names = ['Pendulum-v0', 'HalfCheetah-v2', 'Hopper-v2']
env_name = env_names[1]
env = gym.make(env_name)
ddpg = DDPG(env_name, env, is_test=True)

load_model(ddpg.actor, 'model/{}_model/best_actor'.format(env_name))
load_model(ddpg.critic, 'model/{}_model/best_critic'.format(env_name))
for _ in range(10):
    eval_r = ddpg.evaluate(1, is_render=True)
    print('evaluate reward', eval_r)
Example #20
def roc_speech_rf():
    model = load_model("models\\random_forest\\rf-01-all-speech.joblib")
    roc(model, features_nr=103, data_type="speech")
Example #21
    words = data['words']
    vectors = []

    for word in words:
        try:
            vectors.append(model[word].tolist())
        except KeyError as e:
            log.error(f'An error occurred in <get_vectors>: \n{e}')

    return json.dumps(vectors)


if __name__ == '__main__':
    parser = optparse.OptionParser()
    parser.add_option('--embeddings_file', action="store")
    parser.add_option('--is_rsdd', action="store_true", default=False)
    options, remainder = parser.parse_args()

    start = datetime.datetime.now()
    print('Start loading FastText word embeddings at {}'.format(start))
    if options.is_rsdd:
        rsdd_data_path = os.path.join('..', DATA_DIR, 'ft_pretrained',
                                      'en_word2vec.pickle')
        with open(rsdd_data_path, 'rb') as f:
            model = pickle.load(f)
    else:
        model = load_model(get_words(), options.embeddings_file)
    end = datetime.datetime.now()
    print('Finished! took: {}'.format(end - start))

    app.run(use_reloader=False, threaded=True)
Example #22
def train(env, args, writer):
    current_model = DQN(env, args).to(args.device)
    target_model = DQN(env, args).to(args.device)

    for para in target_model.parameters():
        para.requires_grad = False

    if args.noisy:
        current_model.update_noisy_modules()
        target_model.update_noisy_modules()
    #target_model.eval()

    if args.load_model and os.path.isfile(args.load_model):
        load_model(current_model, args)

    update_target(current_model, target_model)
    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final,
                                         args.eps_decay)
    beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames)

    if args.prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
        args.buffer_size = replay_buffer.it_capacity
    else:
        replay_buffer = ReplayBuffer(args.buffer_size)

    print_args(args)
    state_deque = deque(maxlen=args.multi_step)
    reward_deque = deque(maxlen=args.multi_step)
    action_deque = deque(maxlen=args.multi_step)

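    # Select the optimizer: Adam (with a configurable epsilon) or LaProp.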
    if args.optim == 'adam':
        optimizer = optim.Adam(current_model.parameters(),
                               lr=args.lr,
                               eps=args.adam_eps,
                               betas=(0.9, args.beta2))
    elif args.optim == 'laprop':
        optimizer = laprop.LaProp(current_model.parameters(),
                                  lr=args.lr,
                                  betas=(0.9, args.beta2))

    reward_list, length_list, loss_list = [], [], []
    episode_reward = 0.
    episode_length = 0

    prev_time = time.time()
    prev_frame = 1

    state = env.reset()
    evaluation_interval = args.evaluation_interval
    for frame_idx in range(1, args.max_frames + 1):
        if args.render:
            env.render()

        if args.noisy:
            current_model.sample_noise()
            target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)
        action = current_model.act(
            torch.FloatTensor(state).to(args.device), epsilon)

        next_state, raw_reward, done, _ = env.step(action)
        if args.clip_rewards:
            reward = np.clip(raw_reward, -1., 1.)
        else:
            reward = raw_reward
        state_deque.append(state)
        reward_deque.append(reward)
        action_deque.append(action)

        if len(state_deque) == args.multi_step or done:
            n_reward = multi_step_reward(reward_deque, args.gamma)
            n_state = state_deque[0]
            n_action = action_deque[0]
            replay_buffer.push(n_state, n_action, n_reward, next_state,
                               np.float32(done))

        state = next_state
        episode_reward += raw_reward
        episode_length += 1

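        # Episodes that run past ~9950 steps are finished off with random actions.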
        if episode_length >= 9950:
            while not done:
                _, _, done, _ = env.step(random.randrange(env.action_space.n))

        if done:
            state = env.reset()
            reward_list.append(episode_reward)
            length_list.append(episode_length)
            if episode_length > 10000:
                print('{:.2f}'.format(episode_reward), end='')
            writer.add_scalar("data/episode_reward", episode_reward, frame_idx)
            writer.add_scalar("data/episode_length", episode_length, frame_idx)
            episode_reward, episode_length = 0., 0
            state_deque.clear()
            reward_deque.clear()
            action_deque.clear()

        if len(replay_buffer
               ) > args.learning_start and frame_idx % args.train_freq == 0:
            beta = beta_by_frame(frame_idx)
            loss = compute_td_loss(current_model, target_model, replay_buffer,
                                   optimizer, args, beta)
            loss_list.append(loss.item())
            writer.add_scalar("data/loss", loss.item(), frame_idx)

        if frame_idx % args.update_target == 0:
            update_target(current_model, target_model)

        if frame_idx % evaluation_interval == 0:
            if len(length_list) > 0:
                print_log(frame_idx, prev_frame, prev_time, reward_list,
                          length_list, loss_list, args)
                reward_list.clear(), length_list.clear(), loss_list.clear()
                prev_frame = frame_idx
                prev_time = time.time()
                save_model(current_model, args)
            else:
                evaluation_interval += args.evaluation_interval
        if frame_idx % 200000 == 0:
            if args.adam_eps == 1.5e-4:
                save_model(current_model,
                           args,
                           name="{}_{}".format(args.optim, frame_idx))
            else:
                save_model(current_model,
                           args,
                           name="{}{:.2e}_{}".format(args.optim, args.adam_eps,
                                                     frame_idx))

    reward_list.append(episode_reward)
    length_list.append(episode_length)
    print_log(frame_idx, prev_frame, prev_time, reward_list, length_list,
              loss_list, args)
    reward_list.clear(), length_list.clear(), loss_list.clear()
    prev_frame = frame_idx
    prev_time = time.time()

    save_model(current_model, args)
Example #23
def train(env, args, writer):
    # RL Model for Player 1
    p1_current_model = DQN(env, args).to(args.device)
    p1_target_model = DQN(env, args).to(args.device)
    update_target(p1_current_model, p1_target_model)

    # RL Model for Player 2
    p2_current_model = DQN(env, args).to(args.device)
    p2_target_model = DQN(env, args).to(args.device)
    update_target(p2_current_model, p2_target_model)

    # SL Model for Player 1, 2
    p1_policy = Policy(env).to(args.device)
    p2_policy = Policy(env).to(args.device)

    if args.load_model and os.path.isfile(args.load_model):
        load_model(models={
            "p1": p1_current_model,
            "p2": p2_current_model
        },
                   policies={
                       "p1": p1_policy,
                       "p2": p2_policy
                   },
                   args=args)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final,
                                         args.eps_decay)

    # Replay Buffer for Reinforcement Learning - Best Response
    p1_replay_buffer = ReplayBuffer(args.buffer_size)
    p2_replay_buffer = ReplayBuffer(args.buffer_size)

    # Reservoir Buffer for Supervised Learning - Average Strategy
    # TODO(Aiden): How to set buffer size of SL?
    p1_reservoir_buffer = ReservoirBuffer(args.buffer_size)
    p2_reservoir_buffer = ReservoirBuffer(args.buffer_size)

    # Deque data structure for multi-step learning
    p1_state_deque = deque(maxlen=args.multi_step)
    p1_reward_deque = deque(maxlen=args.multi_step)
    p1_action_deque = deque(maxlen=args.multi_step)

    p2_state_deque = deque(maxlen=args.multi_step)
    p2_reward_deque = deque(maxlen=args.multi_step)
    p2_action_deque = deque(maxlen=args.multi_step)

    # RL Optimizer for Player 1, 2
    p1_rl_optimizer = optim.Adam(p1_current_model.parameters(), lr=args.lr)
    p2_rl_optimizer = optim.Adam(p2_current_model.parameters(), lr=args.lr)

    # SL Optimizer for Player 1, 2
    # TODO(Aiden): Is it necessary to separate learning rates for RL/SL?
    p1_sl_optimizer = optim.Adam(p1_policy.parameters(), lr=args.lr)
    p2_sl_optimizer = optim.Adam(p2_policy.parameters(), lr=args.lr)

    # Logging
    length_list = []
    p1_reward_list, p1_rl_loss_list, p1_sl_loss_list = [], [], []
    p2_reward_list, p2_rl_loss_list, p2_sl_loss_list = [], [], []
    p1_episode_reward, p2_episode_reward = 0, 0
    tag_interval_length = 0
    prev_time = time.time()
    prev_frame = 1

    # Main Loop
    (p1_state, p2_state) = env.reset()
    for frame_idx in range(1, args.max_frames + 1):
        is_best_response = False
        # TODO(Aiden):
        # Action should be decided by a combination of Best Response and Average Strategy
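        # With probability 1 - eta act from the average-strategy policy; otherwise act epsilon-greedily from the best-response DQN.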
        if random.random() > args.eta:
            p1_action = p1_policy.act(
                torch.FloatTensor(p1_state).to(args.device))
            p2_action = p2_policy.act(
                torch.FloatTensor(p2_state).to(args.device))
        else:
            is_best_response = True
            epsilon = epsilon_by_frame(frame_idx)
            p1_action = p1_current_model.act(
                torch.FloatTensor(p1_state).to(args.device), epsilon)
            p2_action = p2_current_model.act(
                torch.FloatTensor(p2_state).to(args.device), epsilon)

        actions = {"1": p1_action, "2": p2_action}
        (p1_next_state, p2_next_state), reward, done, info = env.step(actions)
        # print(actions)  # {'1': 3, '2': 2}
        # print(p1_next_state) # [[[127 127 .....
        #print(reward, done, info) # [0 0] False None

        # Save current state, reward, action to deque for multi-step learning
        p1_state_deque.append(p1_state)
        p2_state_deque.append(p2_state)

        p1_reward = reward[0] - 1 if args.negative else reward[0]
        p2_reward = reward[1] - 1 if args.negative else reward[1]
        p1_reward_deque.append(p1_reward)
        p2_reward_deque.append(p2_reward)

        p1_action_deque.append(p1_action)
        p2_action_deque.append(p2_action)

        # Store (state, action, reward, next_state) to Replay Buffer for Reinforcement Learning
        if len(p1_state_deque) == args.multi_step or done:
            n_reward = multi_step_reward(p1_reward_deque, args.gamma)
            n_state = p1_state_deque[0]
            n_action = p1_action_deque[0]
            p1_replay_buffer.push(n_state, n_action, n_reward, p1_next_state,
                                  np.float32(done))

            n_reward = multi_step_reward(p2_reward_deque, args.gamma)
            n_state = p2_state_deque[0]
            n_action = p2_action_deque[0]
            p2_replay_buffer.push(n_state, n_action, n_reward, p2_next_state,
                                  np.float32(done))

        # Store (state, action) to Reservoir Buffer for Supervised Learning
        if is_best_response:
            p1_reservoir_buffer.push(p1_state, p1_action)
            p2_reservoir_buffer.push(p2_state, p2_action)

        (p1_state, p2_state) = (p1_next_state, p2_next_state)

        # Logging
        p1_episode_reward += p1_reward
        p2_episode_reward += p2_reward
        tag_interval_length += 1

        if info is not None:
            length_list.append(tag_interval_length)
            tag_interval_length = 0

        # Episode done. Reset environment and clear logging records
        if done or tag_interval_length >= args.max_tag_interval:
            (p1_state, p2_state) = env.reset()
            p1_reward_list.append(p1_episode_reward)
            p2_reward_list.append(p2_episode_reward)
            writer.add_scalar("p1/episode_reward", p1_episode_reward,
                              frame_idx)
            writer.add_scalar("p2/episode_reward", p2_episode_reward,
                              frame_idx)
            writer.add_scalar("data/tag_interval_length", tag_interval_length,
                              frame_idx)
            p1_episode_reward, p2_episode_reward, tag_interval_length = 0, 0, 0
            p1_state_deque.clear(), p2_state_deque.clear()
            p1_reward_deque.clear(), p2_reward_deque.clear()
            p1_action_deque.clear(), p2_action_deque.clear()

        if (len(p1_replay_buffer) > args.rl_start
                and len(p1_reservoir_buffer) > args.sl_start
                and frame_idx % args.train_freq == 0):

            # Update Best Response with Reinforcement Learning
            loss = compute_rl_loss(p1_current_model, p1_target_model,
                                   p1_replay_buffer, p1_rl_optimizer, args)
            p1_rl_loss_list.append(loss.item())
            writer.add_scalar("p1/rl_loss", loss.item(), frame_idx)

            loss = compute_rl_loss(p2_current_model, p2_target_model,
                                   p2_replay_buffer, p2_rl_optimizer, args)
            p2_rl_loss_list.append(loss.item())
            writer.add_scalar("p2/rl_loss", loss.item(), frame_idx)

            # Update Average Strategy with Supervised Learning
            loss = compute_sl_loss(p1_policy, p1_reservoir_buffer,
                                   p1_sl_optimizer, args)
            p1_sl_loss_list.append(loss.item())
            writer.add_scalar("p1/sl_loss", loss.item(), frame_idx)

            loss = compute_sl_loss(p2_policy, p2_reservoir_buffer,
                                   p2_sl_optimizer, args)
            p2_sl_loss_list.append(loss.item())
            writer.add_scalar("p2/sl_loss", loss.item(), frame_idx)

        if frame_idx % args.update_target == 0:
            update_target(p1_current_model, p1_target_model)
            update_target(p2_current_model, p2_target_model)

        # Logging and Saving models
        if frame_idx % args.evaluation_interval == 0:
            print_log(frame_idx, prev_frame, prev_time,
                      (p1_reward_list, p2_reward_list), length_list,
                      (p1_rl_loss_list, p2_rl_loss_list),
                      (p1_sl_loss_list, p2_sl_loss_list))
            p1_reward_list.clear(), p2_reward_list.clear(), length_list.clear()
            p1_rl_loss_list.clear(), p2_rl_loss_list.clear()
            p1_sl_loss_list.clear(), p2_sl_loss_list.clear()
            prev_frame = frame_idx
            prev_time = time.time()
            save_model(models={
                "p1": p1_current_model,
                "p2": p2_current_model
            },
                       policies={
                           "p1": p1_policy,
                           "p2": p2_policy
                       },
                       args=args)

        # Render if rendering argument is on
        if args.render:
            env.render()

    # Save the final models once training is finished
    save_model(models={
        "p1": p1_current_model,
        "p2": p2_current_model
    },
               policies={
                   "p1": p1_policy,
                   "p2": p2_policy
               },
               args=args)
Example #24
def roc_music_rf():
    model = load_model("models\\random_forest\\rf-01-all-music.joblib")
    roc(model)
Example #25
def train(env, args):
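    # Same DQN training loop pattern as earlier examples, but metrics are logged with wandb rather than a TensorBoard writer.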
    # Init WandB
    wandb.init(config=args)

    current_model = DQN(env, args).to(args.device)
    target_model = DQN(env, args).to(args.device)

    if args.noisy:
        current_model.update_noisy_modules()
        target_model.update_noisy_modules()

    if args.load_model and os.path.isfile(args.load_model):
        load_model(current_model, args)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final,
                                         args.eps_decay)
    beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames)

    if args.prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
    else:
        replay_buffer = ReplayBuffer(args.buffer_size)

    state_deque = deque(maxlen=args.multi_step)
    reward_deque = deque(maxlen=args.multi_step)
    action_deque = deque(maxlen=args.multi_step)

    optimizer = optim.Adam(current_model.parameters(), lr=args.lr)

    reward_list, length_list, loss_list = [], [], []
    episode_reward = 0
    episode_length = 0

    prev_time = time.time()
    prev_frame = 1

    state = env.reset()
    for frame_idx in range(1, args.max_frames + 1):
        if args.render:
            env.render()

        if args.noisy:
            current_model.sample_noise()
            target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)
        action = current_model.act(
            torch.FloatTensor(state).to(args.device), epsilon)

        next_state, reward, done, _ = env.step(action)
        state_deque.append(state)
        reward_deque.append(reward)
        action_deque.append(action)

        if len(state_deque) == args.multi_step or done:
            n_reward = multi_step_reward(reward_deque, args.gamma)
            n_state = state_deque[0]
            n_action = action_deque[0]
            replay_buffer.push(n_state, n_action, n_reward, next_state,
                               np.float32(done))

        state = next_state
        episode_reward += reward
        episode_length += 1

        if done:
            state = env.reset()
            reward_list.append(episode_reward)
            length_list.append(episode_length)
            wandb.log({
                'episode_reward': episode_reward,
                'episode_length': episode_length,
            })
            episode_reward, episode_length = 0, 0
            state_deque.clear()
            reward_deque.clear()
            action_deque.clear()

        if len(replay_buffer
               ) > args.learning_start and frame_idx % args.train_freq == 0:
            beta = beta_by_frame(frame_idx)
            loss = compute_td_loss(current_model, target_model, replay_buffer,
                                   optimizer, args, beta)
            loss_list.append(loss.item())
            wandb.log({'loss': loss.item()})

        if frame_idx % args.update_target == 0:
            update_target(current_model, target_model)

        if frame_idx % args.evaluation_interval == 0:
            print_log(frame_idx, prev_frame, prev_time, reward_list,
                      length_list, loss_list)
            reward_list.clear(), length_list.clear(), loss_list.clear()
            prev_frame = frame_idx
            prev_time = time.time()
            save_model(current_model, args)

    save_model(current_model, args)