def test(env, args):
    current_model = DQN(env, args).to(args.device)
    current_model.eval()
    load_model(current_model, args)

    episode_reward = 0
    episode_length = 0

    state = env.reset()
    while True:
        if args.render:
            env.render()

        action = current_model.act(torch.FloatTensor(state).to(args.device), 0.)
        next_state, reward, done, _ = env.step(action)

        state = next_state
        episode_reward += reward
        episode_length += 1

        if done:
            break

    print("Test Result - Reward {} Length {}".format(episode_reward, episode_length))
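# NOTE: a minimal sketch of the save_model/load_model helpers these snippets rely on,
# assuming they simply wrap torch.save/torch.load on a checkpoint path derived from
# args. The directory layout (args.save_dir) and file naming here are assumptions,
# not the project's actual implementation in common/utils.
import os
import torch

def save_model(model, args, name="model"):
    # Hypothetical layout: one .pth file per run under args.save_dir (assumption).
    os.makedirs(args.save_dir, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(args.save_dir, "{}.pth".format(name)))

def load_model(model, args, name="model"):
    # Load weights onto the configured device; caller decides train()/eval() mode.
    path = os.path.join(args.save_dir, "{}.pth".format(name))
    state_dict = torch.load(path, map_location=args.device)
    model.load_state_dict(state_dict)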
def round_01():
    how_many = 10
    model_file_path = "models\\random_forest\\rf-01-all-music.joblib"
    new_model = True
    models_root_dir = "models\\random_forest"
    model_name = "rf-01-all-music.joblib"

    if new_model and (model_name is None or model_name == ""):
        raise ValueError("Provide a model name")
    elif new_model:
        model = create_a_rf_classifier()
    else:
        model = load_model(model_file_path)

    train, test, features = get_test_and_train_data(how_many, training_percentage=0.75)
    model = train_model_using_data_from_file(model, features, train)

    pred = predict(model, test, features)
    acc = accuracy_score(test["class"], pred)
    print("Accuracy on test set {}".format(acc))

    pred = predict(model, train, features)
    acc = accuracy_score(train["class"], pred)
    print("Accuracy on training set {}".format(acc))

    if new_model:
        store_model(model, os.path.join(models_root_dir, model_name))
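# NOTE: a minimal sketch of the joblib-based helpers assumed by round_01
# (create_a_rf_classifier, store_model, load_model); the hyperparameters are
# illustrative placeholders, not the project's actual settings.
import joblib
from sklearn.ensemble import RandomForestClassifier

def create_a_rf_classifier():
    # Illustrative hyperparameters only (assumption).
    return RandomForestClassifier(n_estimators=100, random_state=42)

def store_model(model, path):
    joblib.dump(model, path)

def load_model(path):
    return joblib.load(path)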
def predict_room_price_from_model(conf_model, room_param):
    err = validate_room_param(room_param)
    if err:
        status = {'success': False, 'err': err}
    else:
        obj_param = standardize_room_param(room_param)
        model_param_1D = np.array(list(obj_param.values()))
        model_param = model_param_1D.reshape(1, -1)
        price = -1

        # Load the model if it has not been loaded successfully yet
        if conf_model['reload']:
            conf_model['model'] = utl.load_model(root_path + conf_model['path'])
            conf_model['reload'] = False

        if conf_model['model'] is not None:
            try:
                price = conf_model['model'].predict(model_param)
            except Exception as e:
                conf_model['reload'] = True
                print("[Error] : Prediction failed !!")
        else:
            conf_model['reload'] = True

        # Check whether the predicted price is valid
        if price <= 0:
            status = {'success': False, 'predict': -1}
        else:
            price = price.flatten()
            price_room = np.float64(price[0])
            str_price = round_currency_up(price_room)
            status = {'success': True, 'predict': str_price}

    return status
def check_video():
    # TODO: create a general preprocessing function for all user title inputs
    if request.method == "POST":
        title = request.form["title"]
        classifier_type = request.form.get("classifier_type")

        # catch empty titles
        if not title:
            # add a flash message instead of this for clarity on the UI
            return render_template("main.html")

        # and now we start the magic
        # TODO: show on the results page what kind of classifier was used
        if classifier_type == "simple_heuristics":
            model = simple_heuristics
            if model.predict(title):
                return render_template("result_good.html")
            return render_template("result_bad.html")
        else:
            # (bad) linear and logistic regression methods for now
            model = load_model(classifier_type)
            result = model.predict(simple_regression_test_processing(title))
            if result >= 36:
                return render_template("result_good.html")
            return render_template("result_bad.html")

    return render_template("main.html")
def test(env, args):
    p1_current_model = DQN(env, args).to(args.device)
    p2_current_model = DQN(env, args).to(args.device)
    p1_policy = Policy(env).to(args.device)
    p2_policy = Policy(env).to(args.device)
    p1_current_model.eval(), p2_current_model.eval()
    p1_policy.eval(), p2_policy.eval()
    load_model(models={"p1": p1_current_model, "p2": p2_current_model},
               policies={"p1": p1_policy, "p2": p2_policy}, args=args)

    p1_reward_list = []
    p2_reward_list = []
    length_list = []

    for _ in range(30):
        (p1_state, p2_state) = env.reset()
        p1_episode_reward = 0
        p2_episode_reward = 0
        episode_length = 0

        while True:
            if args.render:
                env.render()
                sleep(0.01)

            # Agents follow the average strategy
            p1_action = p1_policy.act(torch.FloatTensor(p1_state).to(args.device))
            p2_action = p2_policy.act(torch.FloatTensor(p2_state).to(args.device))

            actions = {"1": p1_action, "2": p2_action}
            (p1_next_state, p2_next_state), reward, done, _ = env.step(actions)

            (p1_state, p2_state) = (p1_next_state, p2_next_state)
            p1_episode_reward += reward[0]
            p2_episode_reward += reward[1]
            episode_length += 1

            if done:
                p1_reward_list.append(p1_episode_reward)
                p2_reward_list.append(p2_episode_reward)
                length_list.append(episode_length)
                break

    print("Test Result - Length {:.2f} p1/Reward {:.2f} p2/Reward {:.2f}".format(
        np.mean(length_list), np.mean(p1_reward_list), np.mean(p2_reward_list)))
def test(env, args):
    from time import sleep

    p1_current_model = DQN(env, args).to(args.device)
    p2_current_model = DQN(env, args).to(args.device)
    p1_current_model.eval()
    p2_current_model.eval()
    load_model(p1_current_model, args, 1)
    load_model(p2_current_model, args, 2)

    p1_reward_list = []
    p2_reward_list = []
    length_list = []

    for _ in range(30):
        (p1_state, p2_state) = env.reset()
        p1_episode_reward = 0
        p2_episode_reward = 0
        episode_length = 0

        while True:
            if args.render:
                env.render()
                sleep(0.2)

            p1_action = p1_current_model.act(torch.FloatTensor(p1_state).to(args.device), 0.0)
            p2_action = p2_current_model.act(torch.FloatTensor(p2_state).to(args.device), 0.0)

            actions = {"1": p1_action, "2": p2_action}
            (p1_next_state, p2_next_state), reward, done, _ = env.step(actions)

            (p1_state, p2_state) = (p1_next_state, p2_next_state)
            p1_episode_reward += reward[0]
            p2_episode_reward += reward[1]
            episode_length += 1

            if done:
                p1_reward_list.append(p1_episode_reward)
                p2_reward_list.append(p2_episode_reward)
                length_list.append(episode_length)
                break

    print("Test Result - p1/Reward {} p2/Reward {} Length {}".format(
        np.mean(p1_reward_list), np.mean(p2_reward_list), np.mean(length_list)))
def test_round_1():
    model_file_path = "models\\random_forest\\rf-01-all-music-music.joblib"
    model = load_model(model_file_path)

    data = load_data(how_many=4, last=True)
    data = data.astype({'class': str})
    features = data.columns[:705]

    pred = predict(model, data, features)
    acc = accuracy_score(data["class"], pred)
    print("Accuracy on validation set {}".format(acc))
def __init__(self):
    self._relevant_tags = {
        'control': {
            'noun': read_relevant_set('nouns', 'control'),
            'verb': read_relevant_set('verbs', 'control')
        },
        'patients': {
            'noun': read_relevant_set('nouns', 'patients'),
            'verb': read_relevant_set('verbs', 'patients')
        }
    }
    self._reference_tags = {
        'noun': read_reference_set('nouns'),
        'verb': read_reference_set('verbs')
    }

    # get part-of-speech tags
    self._answers_to_user_id_pos_data = {}
    pos_tags_generator = pos_tags_jsons_generator()
    for answer_num, ans_pos_tags in pos_tags_generator:
        self._answers_to_user_id_pos_data[answer_num] = ans_pos_tags

    # init model
    self._model = load_model('word2vec_dep.pickle')

    # calculate idf scores for words
    self._idf_scores = IdfScores(self.get_documents(), repair_document)
    self._idf_scores.calculate_idf_scores()

    self.missing_idf = {}
    self.words_without_embeddings = []
    self.missing_words = []
    self.pos_tags_used = {
        'control': {'nouns': [], 'verbs': []},
        'patients': {'nouns': [], 'verbs': []}
    }
    self.modifiers_used = {
        'control': {'noun': [], 'verb': []},
        'patients': {'noun': [], 'verb': []}
    }
def test_whole(current_model, env, args, num):
    load_model(current_model, args)

    episode_reward = 0
    episode_length = 0

    state = env.reset()
    lives = env.unwrapped.ale.lives()
    live = lives
    while live > 0:
        for i in range(5000):
            if args.render:
                env.render()
            if args.noisy:
                current_model.update_noisy_modules()

            action = current_model.act(torch.FloatTensor(state).to(args.device), 0.)
            next_state, reward, done, _ = env.step(action)

            state = next_state
            episode_reward += reward
            episode_length += 1

            if done:
                state = env.reset()
                live -= 1
                break

        if not done:
            # Step budget exhausted without losing a life: play random actions until done
            while not done:
                if args.render:
                    env.render()
                _, _, done, _ = env.step(random.randrange(env.action_space.n))
            state = env.reset()
            live -= 1

    print("Test Result - Reward {} Length {} at {}".format(
        episode_reward, episode_length, num))
    return episode_reward
def test(env, args):
    current_model = DQN(env, args).to(args.device)
    current_model.eval()
    load_model(current_model, args)

    episode_reward = 0
    episode_length = 0

    state_buffer = deque(maxlen=args.action_repeat)
    states_deque = actions_deque = rewards_deque = None
    state, state_buffer = get_initial_state(env, state_buffer, args.action_repeat)
    while True:
        action = current_model.act(torch.FloatTensor(state).to(args.device), 0.)
        next_state, _, done, end = env.step(action, save_screenshots=True)
        add_state(next_state, state_buffer)
        next_state = recent_state(state_buffer)
        state = next_state

        if end:
            break

        # delete the agents that have reached the goal
        r_index = 0
        for r in range(len(done)):
            if done[r] is True:
                state_buffer, states_deque, actions_deque, rewards_deque = \
                    del_record(r_index, state_buffer, states_deque, actions_deque, rewards_deque)
                r_index -= 1
            r_index += 1

        next_state = recent_state(state_buffer)
        state = next_state

    PanicEnv.display(True)
    print("Test Result - Reward {} Length {}".format(episode_reward, episode_length))
def calculate_all_scores(self):
    if os.path.isfile(self.derailment_precalc_scores):
        results_df = pd.read_csv(self.derailment_precalc_scores)
    else:
        self._model = load_model(self._embeddings_path)

        # iterate users
        results = []
        for user_data in tqdm(self._data, total=len(self._data),
                              desc="Creating scores csv file"):
            result = self._calc_scores_per_user(user_data)
            results.extend(result)

        columns = ["user_id", "label", "answer_num", "valid_words_cnt", "score"]
        results_df = pd.DataFrame(results, columns=columns)
        results_df.to_csv(self.derailment_precalc_scores, index=False)

    self._update_users_data(results_df)
def train(env, args, writer):
    p1_current_model = DQN(env, args).to(args.device)
    p1_target_model = DQN(env, args).to(args.device)
    update_target(p1_current_model, p1_target_model)

    p2_current_model = DQN(env, args).to(args.device)
    p2_target_model = DQN(env, args).to(args.device)
    update_target(p2_current_model, p2_target_model)

    if args.noisy:
        p1_current_model.update_noisy_modules()
        p1_target_model.update_noisy_modules()
        p2_current_model.update_noisy_modules()
        p2_target_model.update_noisy_modules()

    if args.load_model and os.path.isfile(args.load_model):
        load_model(p1_current_model, args, 1)
        load_model(p2_current_model, args, 2)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final, args.eps_decay)
    beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames)

    if args.prioritized_replay:
        p1_replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
        p2_replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
    else:
        p1_replay_buffer = ReplayBuffer(args.buffer_size)
        p2_replay_buffer = ReplayBuffer(args.buffer_size)

    p1_state_deque = deque(maxlen=args.multi_step)
    p2_state_deque = deque(maxlen=args.multi_step)
    p1_reward_deque = deque(maxlen=args.multi_step)
    p1_action_deque = deque(maxlen=args.multi_step)
    p2_reward_deque = deque(maxlen=args.multi_step)
    p2_action_deque = deque(maxlen=args.multi_step)

    p1_optimizer = optim.Adam(p1_current_model.parameters(), lr=args.lr)
    p2_optimizer = optim.Adam(p2_current_model.parameters(), lr=args.lr)

    length_list = []
    p1_reward_list, p1_loss_list = [], []
    p2_reward_list, p2_loss_list = [], []
    p1_episode_reward, p2_episode_reward = 0, 0
    episode_length = 0

    prev_time = time.time()
    prev_frame = 1

    (p1_state, p2_state) = env.reset()
    for frame_idx in range(1, args.max_frames + 1):
        if args.noisy:
            p1_current_model.sample_noise()
            p1_target_model.sample_noise()
            p2_current_model.sample_noise()
            p2_target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)
        p1_action = p1_current_model.act(torch.FloatTensor(p1_state).to(args.device), epsilon)
        p2_action = p2_current_model.act(torch.FloatTensor(p2_state).to(args.device), epsilon)

        if args.render:
            env.render()

        actions = {"1": p1_action, "2": p2_action}
        (p1_next_state, p2_next_state), reward, done, _ = env.step(actions)

        p1_state_deque.append(p1_state)
        p2_state_deque.append(p2_state)

        if args.negative:
            p1_reward_deque.append(reward[0] - 1)
        else:
            p1_reward_deque.append(reward[0])
        p1_action_deque.append(p1_action)

        if args.negative:
            p2_reward_deque.append(reward[1] - 1)
        else:
            p2_reward_deque.append(reward[1])
        p2_action_deque.append(p2_action)

        if len(p1_state_deque) == args.multi_step or done:
            n_reward = multi_step_reward(p1_reward_deque, args.gamma)
            n_state = p1_state_deque[0]
            n_action = p1_action_deque[0]
            p1_replay_buffer.push(n_state, n_action, n_reward, p1_next_state, np.float32(done))

            n_reward = multi_step_reward(p2_reward_deque, args.gamma)
            n_state = p2_state_deque[0]
            n_action = p2_action_deque[0]
            p2_replay_buffer.push(n_state, n_action, n_reward, p2_next_state, np.float32(done))

        (p1_state, p2_state) = (p1_next_state, p2_next_state)
        p1_episode_reward += reward[0]
        p2_episode_reward += reward[1]
        if args.negative:
            p1_episode_reward -= 1
            p2_episode_reward -= 1
        episode_length += 1

        if done or episode_length > args.max_episode_length:
            (p1_state, p2_state) = env.reset()
            p1_reward_list.append(p1_episode_reward)
            p2_reward_list.append(p2_episode_reward)
            length_list.append(episode_length)
            writer.add_scalar("data/p1_episode_reward", p1_episode_reward, frame_idx)
            writer.add_scalar("data/p2_episode_reward", p2_episode_reward, frame_idx)
            writer.add_scalar("data/episode_length", episode_length, frame_idx)
            p1_episode_reward, p2_episode_reward, episode_length = 0, 0, 0
            p1_state_deque.clear()
            p2_state_deque.clear()
            p1_reward_deque.clear()
            p2_reward_deque.clear()
            p1_action_deque.clear()
            p2_action_deque.clear()

        if len(p1_replay_buffer) > args.learning_start and frame_idx % args.train_freq == 0:
            beta = beta_by_frame(frame_idx)
            loss = compute_td_loss(p1_current_model, p1_target_model, p1_replay_buffer,
                                   p1_optimizer, args, beta)
            p1_loss_list.append(loss.item())
            writer.add_scalar("data/p1_loss", loss.item(), frame_idx)

            loss = compute_td_loss(p2_current_model, p2_target_model, p2_replay_buffer,
                                   p2_optimizer, args, beta)
            p2_loss_list.append(loss.item())
            writer.add_scalar("data/p2_loss", loss.item(), frame_idx)

        if frame_idx % args.update_target == 0:
            update_target(p1_current_model, p1_target_model)
            update_target(p2_current_model, p2_target_model)

        if frame_idx % args.evaluation_interval == 0:
            print_log(frame_idx, prev_frame, prev_time, p1_reward_list, length_list, p1_loss_list)
            print_log(frame_idx, prev_frame, prev_time, p2_reward_list, length_list, p2_loss_list)
            p1_reward_list.clear(), p2_reward_list.clear(), length_list.clear()
            p1_loss_list.clear(), p2_loss_list.clear()
            prev_frame = frame_idx
            prev_time = time.time()
            save_model(p1_current_model, args, 1)
            save_model(p2_current_model, args, 2)

    save_model(p1_current_model, args, 1)
    save_model(p2_current_model, args, 2)
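# NOTE: a minimal sketch of the multi_step_reward and epsilon_scheduler helpers the
# training loops above rely on, assuming a standard n-step discounted return and an
# exponentially annealed epsilon; the project's own implementations may differ.
import math

def multi_step_reward(rewards, gamma):
    # n-step return: r_0 + gamma * r_1 + gamma^2 * r_2 + ...
    ret = 0.
    for idx, reward in enumerate(rewards):
        ret += reward * (gamma ** idx)
    return ret

def epsilon_scheduler(eps_start, eps_final, eps_decay):
    # Returns a function mapping frame index to an exponentially decayed epsilon.
    def function(frame_idx):
        return eps_final + (eps_start - eps_final) * math.exp(-1. * frame_idx / eps_decay)
    return function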
import gym
from a2c_algorithm_step_update import A2C
import sys
sys.path.append('..')
from common.utils import load_model

# env_name = 'Pendulum-v0'
# env = gym.make(env_name)
# a2c = A2C(env_name, env, is_continue_action_space=True)

env_name = 'MountainCar-v0'
env = gym.make(env_name)
a2c = A2C(env_name, env, is_continue_action_space=False, is_test=True)

load_model(a2c.actor, 'model_step_update/{}_model/best_actor'.format(env_name))
load_model(a2c.critic, 'model_step_update/{}_model/best_critic'.format(env_name))

for _ in range(10):
    eval_r = a2c.evaluate(5, is_render=True)
    print('evaluate reward', eval_r)
def validate_3DDynamic(env, args):
    current_model = DQN_3D(env, args).to(args.device)
    load_model(current_model, args)
    current_model.update_noisy_modules()
    current_model.eval()

    TEST_EPISODES_PER_PLAN = 200
    NUM_TEST_PLANS = 10

    lowest_reward = 1e4
    highest_iou = 0.0
    lowest_iou = 1.0
    plan = env.plan
    episode_reward = 0
    count_brick_save = None
    count_step_save = None

    fig = plt.figure(figsize=[10, 5])
    ax1 = fig.add_subplot(1, 2, 1, projection='3d')
    ax2 = fig.add_subplot(1, 2, 2)

    cumulative_reward = 0
    cumulative_iou = 0
    best_env_memory = None

    for tests_set in range(3, 4):
        print("Validation Plan: ", tests_set)
        env.set_tests_set(tests_set)
        test_episode_count = 0
        total_reward = 0
        total_iou = 0

        for i in range(TEST_EPISODES_PER_PLAN):
            test_episode_count += 1
            state = env.reset()
            count_brick = 0
            count_step = 0

            while True:
                count_step += 1
                if args.noisy:
                    current_model.reset_parameters()
                    current_model.sample_noise()

                epsilon = 0.0
                with torch.no_grad():
                    action = current_model.act(torch.FloatTensor(state).to(args.device), epsilon)
                if action == 2:
                    count_brick += 1

                next_state, reward, done = env.step(action)
                state = next_state
                episode_reward += reward

                if done:
                    total_reward += episode_reward
                    lowest_reward = min(lowest_reward, episode_reward)

                    environment_memory = env.environment_memory[
                        0, args.half_window_size: 34 - args.half_window_size]
                    iou = env._iou()
                    total_iou += iou
                    if iou > highest_iou:
                        highest_iou = iou
                        best_env_memory = environment_memory
                        count_brick_save = count_brick
                        count_step_save = count_step
                    if iou < lowest_iou:
                        lowest_iou = iou

                    episode_reward = 0
                    break

            print("\tTest Episode: ", test_episode_count,
                  " / {} Average Reward: {} Average IOU: {}".format(
                      TEST_EPISODES_PER_PLAN,
                      total_reward / test_episode_count,
                      total_iou / test_episode_count))

        avg_reward = total_reward / TEST_EPISODES_PER_PLAN
        avg_iou = total_iou / TEST_EPISODES_PER_PLAN
        cumulative_reward += avg_reward
        cumulative_iou += avg_iou
        print("\tTest Result - Plan: {} Average Reward: {} Lowest Reward: {} Average IOU: {}".format(
            tests_set, avg_reward, lowest_reward, avg_iou))

    avg_reward_allplans = cumulative_reward / NUM_TEST_PLANS
    avg_iou_allplans = cumulative_iou / NUM_TEST_PLANS
    print("\n\tTest Result (over all plans) - Average Reward: {} Lowest Reward: {} Average IOU: {}\n\n".format(
        avg_reward_allplans, lowest_reward, avg_iou_allplans))

    env.render(ax1, ax2, args, best_env_memory, plan, highest_iou, count_step_save,
               count_brick_save, args.load_model, iou_min=lowest_iou,
               iou_average=avg_iou_allplans, iter_times=TEST_EPISODES_PER_PLAN)
    plt.close(fig)
def validate_3DStatic(env, args):
    current_model = DQN_3D(env, args).to(args.device)
    current_model.update_noisy_modules()
    load_model(current_model, args)
    current_model.eval()

    NUM_TEST_EPISODES = 500

    lowest_reward = 1e4
    highest_iou = 0.0
    lowest_iou = 1.0
    plan = env.plan
    episode_reward = 0
    count_brick_save = None
    count_step_save = None
    best_env_memory = None

    fig = plt.figure(figsize=[10, 5])
    ax1 = fig.add_subplot(1, 2, 1, projection='3d')
    ax2 = fig.add_subplot(1, 2, 2)

    test_episode_count = 0
    total_reward = 0
    total_iou = 0

    for i in range(NUM_TEST_EPISODES):
        test_episode_count += 1
        state = env.reset()
        count_brick = 0
        count_step = 0

        while True:
            if args.noisy:
                # current_model.reset_parameters()
                current_model.sample_noise()

            count_step += 1
            epsilon = 0.0
            with torch.no_grad():
                action = current_model.act(torch.FloatTensor(state).to(args.device), epsilon)
            if action == 2:
                count_brick += 1

            next_state, reward, done = env.step(action)
            state = next_state
            episode_reward += reward

            if done:
                total_reward += episode_reward
                lowest_reward = min(lowest_reward, episode_reward)

                environment_memory = env.environment_memory[
                    0, args.half_window_size: 34 - args.half_window_size]
                iou = env._iou()
                total_iou += iou
                if iou > highest_iou:
                    highest_iou = iou
                    best_env_memory = environment_memory
                    count_brick_save = count_brick
                    count_step_save = count_step
                if iou < lowest_iou:
                    lowest_iou = iou

                episode_reward = 0
                break

        if test_episode_count % 5 == 0:
            print("\tTest Episode: ", test_episode_count,
                  " / {} Average Reward: {} Average IOU: {}".format(
                      NUM_TEST_EPISODES,
                      total_reward / test_episode_count,
                      total_iou / test_episode_count))

    avg_reward = total_reward / NUM_TEST_EPISODES
    avg_iou = total_iou / NUM_TEST_EPISODES
    print("\tTest Result - Average Reward: {} Lowest Reward: {} Average IOU: {}".format(
        avg_reward, lowest_reward, avg_iou))

    env.render(ax1, ax2, args, best_env_memory, plan, highest_iou, count_step_save,
               count_brick_save, args.load_model, iou_min=lowest_iou,
               iou_average=avg_iou, iter_times=NUM_TEST_EPISODES)
    plt.close(fig)
def validate_2DDynamic(env, args):
    current_model = DQN_2D(env, args).to(args.device)
    load_model(current_model, args)
    current_model.eval()

    TEST_EPISODES_PER_PLAN = 200
    NUM_TEST_PLANS = 1

    lowest_reward = 1e4
    highest_iou = 0.0
    lowest_iou = 1.0
    episode_reward = 0
    count_brick_save = None
    count_step_save = None

    fig = plt.figure(figsize=(5, 5))
    ax = fig.add_subplot(1, 1, 1)

    cumulative_reward = 0
    cumulative_iou = 0

    for tests_set in range(6, 7):
        print("Validation Plan: ", tests_set)
        env.set_tests_set(tests_set)
        test_episode_count = 0
        total_reward = 0
        total_iou = 0

        for i in range(TEST_EPISODES_PER_PLAN):
            test_episode_count += 1
            state = env.reset()
            plan = env.plan
            count_brick = 0
            count_step = 0

            while True:
                count_step += 1
                epsilon = 0.0
                with torch.no_grad():
                    # print(state)
                    action = current_model.act(torch.FloatTensor(state[0]).to(args.device), epsilon)
                if action == 2:
                    count_brick += 1

                next_state, reward, done = env.step(action)
                state = next_state
                episode_reward += reward

                if done:
                    total_reward += episode_reward
                    lowest_reward = min(lowest_reward, episode_reward)

                    environment_memory = env.environment_memory[
                        0, args.half_window_size: 34 - args.half_window_size]
                    iou = env._iou()
                    print(iou)
                    total_iou += iou
                    if iou > highest_iou:
                        highest_iou = iou
                        # print("NEW HIGH: ", highest_iou)
                        best_env_memory = environment_memory
                        count_brick_save = count_brick
                        count_step_save = count_step
                    if iou < lowest_iou:
                        lowest_iou = iou
                        # print("NEW LOW: ", lowest_iou)

                    episode_reward = 0
                    break

            if test_episode_count % 5 == 0:
                print("\tTest Episode: ", test_episode_count,
                      " / {} Average Reward: {} Average IOU: {}".format(
                          TEST_EPISODES_PER_PLAN,
                          total_reward / test_episode_count,
                          total_iou / test_episode_count))
                # print(total_iou)

        avg_reward = total_reward / TEST_EPISODES_PER_PLAN
        avg_iou = total_iou / TEST_EPISODES_PER_PLAN
        cumulative_reward += avg_reward
        cumulative_iou += avg_iou
        print("\tTest Result - Plan: {} Average Reward: {} Lowest Reward: {} Average IOU: {}".format(
            tests_set, avg_reward, lowest_reward, avg_iou))

    avg_reward_allplans = cumulative_reward / NUM_TEST_PLANS
    avg_iou_allplans = cumulative_iou / NUM_TEST_PLANS
    print("\n\tTest Result (over all plans) - Average Reward: {} Lowest Reward: {} Average IOU: {}\n\n".format(
        avg_reward_allplans, lowest_reward, avg_iou_allplans))

    env.render(ax, args, best_env_memory, plan, highest_iou, count_step_save,
               count_brick_save, args.load_model, iou_min=lowest_iou,
               iou_average=avg_iou_allplans, iter_times=TEST_EPISODES_PER_PLAN)
    plt.close(fig)
def train(env, args, writer):
    current_model = DQN(env, args).to(args.device)
    target_model = DQN(env, args).to(args.device)

    if args.noisy:
        current_model.update_noisy_modules()
        target_model.update_noisy_modules()

    if args.load_model:  # and os.path.isfile(args.load_model)
        load_model(current_model, args)
        load_model(target_model, args)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final, args.eps_decay)
    beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames)

    if args.prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
    else:
        replay_buffer = ReplayBuffer(args.buffer_size)

    state_buffer = deque(maxlen=args.action_repeat)
    states_deque = [deque(maxlen=args.multi_step) for _ in range(args.num_agents)]
    rewards_deque = [deque(maxlen=args.multi_step) for _ in range(args.num_agents)]
    actions_deque = [deque(maxlen=args.multi_step) for _ in range(args.num_agents)]

    optimizer = optim.Adam(current_model.parameters(), lr=args.lr)

    reward_list, length_list, loss_list = [], [], []
    episode_reward = 0
    episode_length = 0
    episode = 0

    prev_time = time.time()
    prev_frame = 1

    state, state_buffer = get_initial_state(env, state_buffer, args.action_repeat)
    for frame_idx in range(1, args.max_frames + 1):
        if args.noisy:
            current_model.sample_noise()
            target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)
        action = current_model.act(torch.FloatTensor(state).to(args.device), epsilon)

        next_state, reward, done, end = env.step(action, save_screenshots=False)
        add_state(next_state, state_buffer)
        next_state = recent_state(state_buffer)

        for agent_index in range(len(done)):
            states_deque[agent_index].append(state[agent_index])
            rewards_deque[agent_index].append(reward[agent_index])
            actions_deque[agent_index].append(action[agent_index])
            if len(states_deque[agent_index]) == args.multi_step or done[agent_index]:
                n_reward = multi_step_reward(rewards_deque[agent_index], args.gamma)
                n_state = states_deque[agent_index][0]
                n_action = actions_deque[agent_index][0]
                replay_buffer.push(n_state, n_action, n_reward, next_state[agent_index],
                                   np.float32(done[agent_index]))

        # delete the agents that have reached the goal
        r_index = 0
        for r in range(len(done)):
            if done[r] is True:
                state_buffer, states_deque, actions_deque, rewards_deque = del_record(
                    r_index, state_buffer, states_deque, actions_deque, rewards_deque)
                r_index -= 1
            r_index += 1

        next_state = recent_state(state_buffer)
        state = next_state
        episode_reward += np.array(reward).mean()
        episode_length += 1

        if end:
            if args.save_video and episode % 10 == 0:
                evaluate(env, current_model, args)

            state, state_buffer = get_initial_state(env, state_buffer, args.action_repeat)
            reward_list.append(episode_reward)
            length_list.append(episode_length)
            writer.add_scalar("data/episode_reward", episode_reward, frame_idx)
            writer.add_scalar("data/episode_length", episode_length, frame_idx)
            episode_reward, episode_length = 0, 0

            for d in range(len(states_deque)):
                states_deque[d].clear()
                rewards_deque[d].clear()
                actions_deque[d].clear()
            states_deque = [deque(maxlen=args.multi_step) for _ in range(args.num_agents)]
            rewards_deque = [deque(maxlen=args.multi_step) for _ in range(args.num_agents)]
            actions_deque = [deque(maxlen=args.multi_step) for _ in range(args.num_agents)]
            episode += 1

        if len(replay_buffer) > args.learning_start and frame_idx % args.train_freq == 0:
            beta = beta_by_frame(frame_idx)
            losses = 0
            for _ in range(1):
                loss = compute_td_loss(current_model, target_model, replay_buffer,
                                       optimizer, args, beta)
                losses += loss.item()
            loss_list.append(losses)
            writer.add_scalar("data/loss", loss.item(), frame_idx)

        if frame_idx % args.update_target == 0:
            update_target(current_model, target_model)

        if frame_idx % args.evaluation_interval == 0:
            print_log(frame_idx, prev_frame, prev_time, reward_list, length_list, loss_list)
            reward_list.clear(), length_list.clear(), loss_list.clear()
            prev_frame = frame_idx
            prev_time = time.time()
            save_model(current_model, args)

    save_model(current_model, args)
def train(env, args, writer, datetime):
    best_iou = -1.0

    if args.env in ['1DStatic', '1DDynamic']:
        current_model = DQN_1D(env, args).to(args.device)
        target_model = DQN_1D(env, args).to(args.device)
    elif args.env in ['2DStatic', '2DDynamic']:
        current_model = DQN_2D(env, args).to(args.device)
        target_model = DQN_2D(env, args).to(args.device)
    elif args.env in ['3DStatic', '3DDynamic']:
        current_model = DQN_3D(env, args).to(args.device)
        target_model = DQN_3D(env, args).to(args.device)

    if args.noisy:
        current_model.update_noisy_modules()
        target_model.update_noisy_modules()

    if args.load_model and os.path.isfile(args.load_model):
        load_model(current_model, args)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final, args.eps_decay)
    beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames)

    if args.prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
    else:
        replay_buffer = ReplayBuffer(args.buffer_size)

    state_deque = deque(maxlen=args.multi_step)
    reward_deque = deque(maxlen=args.multi_step)
    action_deque = deque(maxlen=args.multi_step)

    optimizer = optim.Adam(current_model.parameters(), lr=args.lr)

    reward_list, length_list, loss_list = [], [], []
    episode_reward = 0
    episode_length = 0
    episode = 0

    prev_time = time.time()
    prev_frame = 1

    state = env.reset()
    for frame_idx in range(1, args.max_frames + 1):
        if args.render:
            env.render()

        if args.noisy:
            current_model.sample_noise()
            target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)
        # Note: both branches currently act the same way; kept as in the original
        if args.env in ['2DDynamic']:
            action = current_model.act(torch.FloatTensor(state).to(args.device), epsilon)
        else:
            action = current_model.act(torch.FloatTensor(state).to(args.device), epsilon)

        next_state, reward, done = env.step(action)

        state_deque.append(state)
        reward_deque.append(reward)
        action_deque.append(action)

        if len(state_deque) == args.multi_step or done:
            n_reward = multi_step_reward(reward_deque, args.gamma)
            n_state = state_deque[0]
            n_action = action_deque[0]
            replay_buffer.push(n_state, n_action, n_reward, next_state, np.float32(done))

        state = next_state
        episode_reward += reward
        episode_length += 1

        if done:
            episode += 1
            state = env.reset()
            reward_list.append(episode_reward)
            length_list.append(episode_length)
            writer.add_scalar("Episode_reward/train", episode_reward, episode)
            writer.add_scalar("Episode_length/train", episode_length, episode)
            episode_reward = 0
            episode_length = 0
            state_deque.clear()
            reward_deque.clear()
            action_deque.clear()

        if len(replay_buffer) > args.learning_start and frame_idx % args.train_freq == 0:
            beta = beta_by_frame(frame_idx)
            loss = compute_td_loss(current_model, target_model, replay_buffer,
                                   optimizer, args, beta)
            loss_list.append(loss.item())
            writer.add_scalar("Loss/train", loss.item(), frame_idx)

        if frame_idx % args.update_target == 0:
            update_target(current_model, target_model)

        if frame_idx % args.evaluation_interval == 0:
            print_log(frame_idx, prev_frame, prev_time, reward_list, length_list,
                      loss_list, args)
            reward_list.clear(), length_list.clear(), loss_list.clear()
            prev_frame = frame_idx
            prev_time = time.time()
            best_iou = test(env, args, current_model, best_iou, writer, episode, datetime)
import gym
from ddpg_algorithm import DDPG
import sys
sys.path.append('..')
from common.utils import load_model

env_names = ['Pendulum-v0', 'HalfCheetah-v2', 'Hopper-v2']
env_name = env_names[1]
env = gym.make(env_name)
ddpg = DDPG(env_name, env, is_test=True)

load_model(ddpg.actor, 'model/{}_model/best_actor'.format(env_name))
load_model(ddpg.critic, 'model/{}_model/best_critic'.format(env_name))

for _ in range(10):
    eval_r = ddpg.evaluate(1, is_render=True)
    print('evaluate reward', eval_r)
def roc_speech_rf():
    model = load_model("models\\random_forest\\rf-01-all-speech.joblib")
    roc(model, features_nr=103, data_type="speech")
    # Fragment of the vector-lookup route (the enclosing function definition is not shown here)
    words = data['words']
    vectors = []
    for word in words:
        try:
            vectors.append(model[word].tolist())
        except SystemError as e:
            log.error(f'An error occurred in <get_vectors>: \n{e}')
    return json.dumps(vectors)


if __name__ == '__main__':
    parser = optparse.OptionParser()
    parser.add_option('--embeddings_file', action="store")
    parser.add_option('--is_rsdd', action="store_true", default=False)
    options, remainder = parser.parse_args()

    start = datetime.datetime.now()
    print('Start loading FastText word embeddings at {}'.format(start))
    if options.is_rsdd:
        rsdd_data_path = os.path.join('..', DATA_DIR, 'ft_pretrained', 'en_word2vec.pickle')
        # pickle.load expects a file object, not a path
        with open(rsdd_data_path, 'rb') as f:
            model = pickle.load(f)
    else:
        model = load_model(get_words(), options.embeddings_file)
    end = datetime.datetime.now()
    print('Finished! took: {}'.format(end - start))

    app.run(use_reloader=False, threaded=True)
def train(env, args, writer):
    current_model = DQN(env, args).to(args.device)
    target_model = DQN(env, args).to(args.device)
    for para in target_model.parameters():
        para.requires_grad = False

    if args.noisy:
        current_model.update_noisy_modules()
        target_model.update_noisy_modules()
        # target_model.eval()

    if args.load_model and os.path.isfile(args.load_model):
        load_model(current_model, args)
    update_target(current_model, target_model)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final, args.eps_decay)
    beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames)

    if args.prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
        args.buffer_size = replay_buffer.it_capacity
    else:
        replay_buffer = ReplayBuffer(args.buffer_size)
    print_args(args)

    state_deque = deque(maxlen=args.multi_step)
    reward_deque = deque(maxlen=args.multi_step)
    action_deque = deque(maxlen=args.multi_step)

    if args.optim == 'adam':
        optimizer = optim.Adam(current_model.parameters(), lr=args.lr,
                               eps=args.adam_eps, betas=(0.9, args.beta2))
    elif args.optim == 'laprop':
        optimizer = laprop.LaProp(current_model.parameters(), lr=args.lr,
                                  betas=(0.9, args.beta2))

    reward_list, length_list, loss_list = [], [], []
    episode_reward = 0.
    episode_length = 0

    prev_time = time.time()
    prev_frame = 1

    state = env.reset()
    evaluation_interval = args.evaluation_interval
    for frame_idx in range(1, args.max_frames + 1):
        if args.render:
            env.render()

        if args.noisy:
            current_model.sample_noise()
            target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)
        action = current_model.act(torch.FloatTensor(state).to(args.device), epsilon)

        next_state, raw_reward, done, _ = env.step(action)
        if args.clip_rewards:
            reward = np.clip(raw_reward, -1., 1.)
        else:
            reward = raw_reward

        state_deque.append(state)
        reward_deque.append(reward)
        action_deque.append(action)

        if len(state_deque) == args.multi_step or done:
            n_reward = multi_step_reward(reward_deque, args.gamma)
            n_state = state_deque[0]
            n_action = action_deque[0]
            replay_buffer.push(n_state, n_action, n_reward, next_state, np.float32(done))

        state = next_state
        episode_reward += raw_reward
        episode_length += 1

        if episode_length >= 9950:
            while not done:
                _, _, done, _ = env.step(random.randrange(env.action_space.n))

        if done:
            state = env.reset()
            reward_list.append(episode_reward)
            length_list.append(episode_length)
            if episode_length > 10000:
                print('{:.2f}'.format(episode_reward), end='')
            writer.add_scalar("data/episode_reward", episode_reward, frame_idx)
            writer.add_scalar("data/episode_length", episode_length, frame_idx)
            episode_reward, episode_length = 0., 0
            state_deque.clear()
            reward_deque.clear()
            action_deque.clear()

        if len(replay_buffer) > args.learning_start and frame_idx % args.train_freq == 0:
            beta = beta_by_frame(frame_idx)
            loss = compute_td_loss(current_model, target_model, replay_buffer,
                                   optimizer, args, beta)
            loss_list.append(loss.item())
            writer.add_scalar("data/loss", loss.item(), frame_idx)

        if frame_idx % args.update_target == 0:
            update_target(current_model, target_model)

        if frame_idx % evaluation_interval == 0:
            if len(length_list) > 0:
                print_log(frame_idx, prev_frame, prev_time, reward_list, length_list,
                          loss_list, args)
                reward_list.clear(), length_list.clear(), loss_list.clear()
                prev_frame = frame_idx
                prev_time = time.time()
                save_model(current_model, args)
            else:
                evaluation_interval += args.evaluation_interval

        if frame_idx % 200000 == 0:
            if args.adam_eps == 1.5e-4:
                save_model(current_model, args, name="{}_{}".format(args.optim, frame_idx))
            else:
                save_model(current_model, args,
                           name="{}{:.2e}_{}".format(args.optim, args.adam_eps, frame_idx))

    reward_list.append(episode_reward)
    length_list.append(episode_length)
    print_log(frame_idx, prev_frame, prev_time, reward_list, length_list, loss_list, args)
    reward_list.clear(), length_list.clear(), loss_list.clear()
    prev_frame = frame_idx
    prev_time = time.time()
    save_model(current_model, args)
def train(env, args, writer):
    # RL Model for Player 1
    p1_current_model = DQN(env, args).to(args.device)
    p1_target_model = DQN(env, args).to(args.device)
    update_target(p1_current_model, p1_target_model)

    # RL Model for Player 2
    p2_current_model = DQN(env, args).to(args.device)
    p2_target_model = DQN(env, args).to(args.device)
    update_target(p2_current_model, p2_target_model)

    # SL Model for Player 1, 2
    p1_policy = Policy(env).to(args.device)
    p2_policy = Policy(env).to(args.device)

    if args.load_model and os.path.isfile(args.load_model):
        load_model(models={"p1": p1_current_model, "p2": p2_current_model},
                   policies={"p1": p1_policy, "p2": p2_policy}, args=args)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final, args.eps_decay)

    # Replay Buffer for Reinforcement Learning - Best Response
    p1_replay_buffer = ReplayBuffer(args.buffer_size)
    p2_replay_buffer = ReplayBuffer(args.buffer_size)

    # Reservoir Buffer for Supervised Learning - Average Strategy
    # TODO(Aiden): How to set buffer size of SL?
    p1_reservoir_buffer = ReservoirBuffer(args.buffer_size)
    p2_reservoir_buffer = ReservoirBuffer(args.buffer_size)

    # Deque data structure for multi-step learning
    p1_state_deque = deque(maxlen=args.multi_step)
    p1_reward_deque = deque(maxlen=args.multi_step)
    p1_action_deque = deque(maxlen=args.multi_step)
    p2_state_deque = deque(maxlen=args.multi_step)
    p2_reward_deque = deque(maxlen=args.multi_step)
    p2_action_deque = deque(maxlen=args.multi_step)

    # RL Optimizer for Player 1, 2
    p1_rl_optimizer = optim.Adam(p1_current_model.parameters(), lr=args.lr)
    p2_rl_optimizer = optim.Adam(p2_current_model.parameters(), lr=args.lr)

    # SL Optimizer for Player 1, 2
    # TODO(Aiden): Is it necessary to separate learning rates for RL/SL?
    p1_sl_optimizer = optim.Adam(p1_policy.parameters(), lr=args.lr)
    p2_sl_optimizer = optim.Adam(p2_policy.parameters(), lr=args.lr)

    # Logging
    length_list = []
    p1_reward_list, p1_rl_loss_list, p1_sl_loss_list = [], [], []
    p2_reward_list, p2_rl_loss_list, p2_sl_loss_list = [], [], []
    p1_episode_reward, p2_episode_reward = 0, 0
    tag_interval_length = 0
    prev_time = time.time()
    prev_frame = 1

    # Main Loop
    (p1_state, p2_state) = env.reset()
    for frame_idx in range(1, args.max_frames + 1):
        is_best_response = False
        # TODO(Aiden):
        # Action should be decided by a combination of Best Response and Average Strategy
        if random.random() > args.eta:
            p1_action = p1_policy.act(torch.FloatTensor(p1_state).to(args.device))
            p2_action = p2_policy.act(torch.FloatTensor(p2_state).to(args.device))
        else:
            is_best_response = True
            epsilon = epsilon_by_frame(frame_idx)
            p1_action = p1_current_model.act(torch.FloatTensor(p1_state).to(args.device), epsilon)
            p2_action = p2_current_model.act(torch.FloatTensor(p2_state).to(args.device), epsilon)

        actions = {"1": p1_action, "2": p2_action}
        (p1_next_state, p2_next_state), reward, done, info = env.step(actions)
        # print(actions)             # {'1': 3, '2': 2}
        # print(p1_next_state)       # [[[127 127 .....
        # print(reward, done, info)  # [0 0] False None

        # Save current state, reward, action to deque for multi-step learning
        p1_state_deque.append(p1_state)
        p2_state_deque.append(p2_state)
        p1_reward = reward[0] - 1 if args.negative else reward[0]
        p2_reward = reward[1] - 1 if args.negative else reward[1]
        p1_reward_deque.append(p1_reward)
        p2_reward_deque.append(p2_reward)
        p1_action_deque.append(p1_action)
        p2_action_deque.append(p2_action)

        # Store (state, action, reward, next_state) to Replay Buffer for Reinforcement Learning
        if len(p1_state_deque) == args.multi_step or done:
            n_reward = multi_step_reward(p1_reward_deque, args.gamma)
            n_state = p1_state_deque[0]
            n_action = p1_action_deque[0]
            p1_replay_buffer.push(n_state, n_action, n_reward, p1_next_state, np.float32(done))

            n_reward = multi_step_reward(p2_reward_deque, args.gamma)
            n_state = p2_state_deque[0]
            n_action = p2_action_deque[0]
            p2_replay_buffer.push(n_state, n_action, n_reward, p2_next_state, np.float32(done))

        # Store (state, action) to Reservoir Buffer for Supervised Learning
        if is_best_response:
            p1_reservoir_buffer.push(p1_state, p1_action)
            p2_reservoir_buffer.push(p2_state, p2_action)

        (p1_state, p2_state) = (p1_next_state, p2_next_state)

        # Logging
        p1_episode_reward += p1_reward
        p2_episode_reward += p2_reward
        tag_interval_length += 1
        if info is not None:
            length_list.append(tag_interval_length)
            tag_interval_length = 0

        # Episode done. Reset environment and clear logging records
        if done or tag_interval_length >= args.max_tag_interval:
            (p1_state, p2_state) = env.reset()
            p1_reward_list.append(p1_episode_reward)
            p2_reward_list.append(p2_episode_reward)
            writer.add_scalar("p1/episode_reward", p1_episode_reward, frame_idx)
            writer.add_scalar("p2/episode_reward", p2_episode_reward, frame_idx)
            writer.add_scalar("data/tag_interval_length", tag_interval_length, frame_idx)
            p1_episode_reward, p2_episode_reward, tag_interval_length = 0, 0, 0
            p1_state_deque.clear(), p2_state_deque.clear()
            p1_reward_deque.clear(), p2_reward_deque.clear()
            p1_action_deque.clear(), p2_action_deque.clear()

        if (len(p1_replay_buffer) > args.rl_start and
                len(p1_reservoir_buffer) > args.sl_start and
                frame_idx % args.train_freq == 0):

            # Update Best Response with Reinforcement Learning
            loss = compute_rl_loss(p1_current_model, p1_target_model, p1_replay_buffer,
                                   p1_rl_optimizer, args)
            p1_rl_loss_list.append(loss.item())
            writer.add_scalar("p1/rl_loss", loss.item(), frame_idx)

            loss = compute_rl_loss(p2_current_model, p2_target_model, p2_replay_buffer,
                                   p2_rl_optimizer, args)
            p2_rl_loss_list.append(loss.item())
            writer.add_scalar("p2/rl_loss", loss.item(), frame_idx)

            # Update Average Strategy with Supervised Learning
            loss = compute_sl_loss(p1_policy, p1_reservoir_buffer, p1_sl_optimizer, args)
            p1_sl_loss_list.append(loss.item())
            writer.add_scalar("p1/sl_loss", loss.item(), frame_idx)

            loss = compute_sl_loss(p2_policy, p2_reservoir_buffer, p2_sl_optimizer, args)
            p2_sl_loss_list.append(loss.item())
            writer.add_scalar("p2/sl_loss", loss.item(), frame_idx)

        if frame_idx % args.update_target == 0:
            update_target(p1_current_model, p1_target_model)
            update_target(p2_current_model, p2_target_model)

        # Logging and Saving models
        if frame_idx % args.evaluation_interval == 0:
            print_log(frame_idx, prev_frame, prev_time,
                      (p1_reward_list, p2_reward_list), length_list,
                      (p1_rl_loss_list, p2_rl_loss_list),
                      (p1_sl_loss_list, p2_sl_loss_list))
            p1_reward_list.clear(), p2_reward_list.clear(), length_list.clear()
            p1_rl_loss_list.clear(), p2_rl_loss_list.clear()
            p1_sl_loss_list.clear(), p2_sl_loss_list.clear()
            prev_frame = frame_idx
            prev_time = time.time()
            save_model(models={"p1": p1_current_model, "p2": p2_current_model},
                       policies={"p1": p1_policy, "p2": p2_policy}, args=args)

        # Render if rendering argument is on
        if args.render:
            env.render()

    save_model(models={"p1": p1_current_model, "p2": p2_current_model},
               policies={"p1": p1_policy, "p2": p2_policy}, args=args)
def roc_music_rf():
    model = load_model("models\\random_forest\\rf-01-all-music.joblib")
    roc(model)
def train(env, args):
    # Init WandB
    wandb.init(config=args)

    current_model = DQN(env, args).to(args.device)
    target_model = DQN(env, args).to(args.device)

    if args.noisy:
        current_model.update_noisy_modules()
        target_model.update_noisy_modules()

    if args.load_model and os.path.isfile(args.load_model):
        load_model(current_model, args)

    epsilon_by_frame = epsilon_scheduler(args.eps_start, args.eps_final, args.eps_decay)
    beta_by_frame = beta_scheduler(args.beta_start, args.beta_frames)

    if args.prioritized_replay:
        replay_buffer = PrioritizedReplayBuffer(args.buffer_size, args.alpha)
    else:
        replay_buffer = ReplayBuffer(args.buffer_size)

    state_deque = deque(maxlen=args.multi_step)
    reward_deque = deque(maxlen=args.multi_step)
    action_deque = deque(maxlen=args.multi_step)

    optimizer = optim.Adam(current_model.parameters(), lr=args.lr)

    reward_list, length_list, loss_list = [], [], []
    episode_reward = 0
    episode_length = 0

    prev_time = time.time()
    prev_frame = 1

    state = env.reset()
    for frame_idx in range(1, args.max_frames + 1):
        if args.render:
            env.render()

        if args.noisy:
            current_model.sample_noise()
            target_model.sample_noise()

        epsilon = epsilon_by_frame(frame_idx)
        action = current_model.act(torch.FloatTensor(state).to(args.device), epsilon)

        next_state, reward, done, _ = env.step(action)

        state_deque.append(state)
        reward_deque.append(reward)
        action_deque.append(action)

        if len(state_deque) == args.multi_step or done:
            n_reward = multi_step_reward(reward_deque, args.gamma)
            n_state = state_deque[0]
            n_action = action_deque[0]
            replay_buffer.push(n_state, n_action, n_reward, next_state, np.float32(done))

        state = next_state
        episode_reward += reward
        episode_length += 1

        if done:
            state = env.reset()
            reward_list.append(episode_reward)
            length_list.append(episode_length)
            wandb.log({
                'episode_reward': episode_reward,
                'episode_length': episode_length,
            })
            episode_reward, episode_length = 0, 0
            state_deque.clear()
            reward_deque.clear()
            action_deque.clear()

        if len(replay_buffer) > args.learning_start and frame_idx % args.train_freq == 0:
            beta = beta_by_frame(frame_idx)
            loss = compute_td_loss(current_model, target_model, replay_buffer,
                                   optimizer, args, beta)
            loss_list.append(loss.item())
            wandb.log({'loss': loss.item()})

        if frame_idx % args.update_target == 0:
            update_target(current_model, target_model)

        if frame_idx % args.evaluation_interval == 0:
            print_log(frame_idx, prev_frame, prev_time, reward_list, length_list, loss_list)
            reward_list.clear(), length_list.clear(), loss_list.clear()
            prev_frame = frame_idx
            prev_time = time.time()
            save_model(current_model, args)

    save_model(current_model, args)