Example #1
def main():
    sess = get_session()
    env = atari_env(FLAGS.seed, FLAGS.game)
    action_space = env.action_space.n
    
    logdir = './results/' + FLAGS.game + '_' + FLAGS.arch + '_seed' + str(FLAGS.seed) + '_' + FLAGS.exp + '_' + time.strftime("%d-%m-%Y_%H-%M-%S")
    flags.DEFINE_string('logdir', logdir + '/', 'logdir')
    if not os.path.exists(logdir):
        os.makedirs(logdir)
        
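    # Instantiate the agent selected by FLAGS.arch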
    if FLAGS.arch == 'DQN':
        algo = model.DQN(num_actions=action_space, lr=FLAGS.lr, opt=FLAGS.opt, gamma=FLAGS.gamma, arch=FLAGS.arch)
    elif FLAGS.arch == 'C51':
        algo = model.C51(num_actions=action_space, lr=FLAGS.lr, num_heads=FLAGS.num_heads)
    elif FLAGS.arch == 'QR_DQN':
        algo = model.QR_DQN(num_actions=action_space, lr=FLAGS.lr, num_heads=FLAGS.num_heads)
    elif FLAGS.arch == 'ENS_DQN':
        algo = model.ENS_DQN(num_actions=action_space, lr=FLAGS.lr, num_heads=FLAGS.num_heads)
    elif FLAGS.arch == 'REM_DQN':
        algo = model.REM_DQN(num_actions=action_space, lr=FLAGS.lr, num_heads=FLAGS.num_heads)

    if FLAGS.online:
        dqnsolver = solver.DQNsolver(env, sess, algo, FLAGS)
    else:
        dqnsolver = solver.offlineDQNsolver(env, sess, algo, FLAGS)
    dqnsolver.train()
Example #2
    def __init__(self):
        available_models = ["dqn", "double_dqn", "dueling_double_dqn"]
        if config.rl_model not in available_models:
            raise Exception("specified model is not available.")
        if config.rl_model == "dqn":
            self.model = model.DQN()
        elif config.rl_model == "double_dqn":
            self.model = model.DoubleDQN()
        elif config.rl_model == "dueling_double_dqn":
            self.model = model.DuelingDoubleDQN()
        self.exploration_rate = config.rl_initial_exploration
        self.total_steps = 0
        self.total_steps_overall = 0
        self.total_time = 0
        self.start_time = time.time()
        gui.controller.glue = self
        gui.canvas.glue = self

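        # Per-car state buffer: rl_history_length time steps of 34 features for each car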
        self.state = np.zeros(
            (config.initial_num_cars, config.rl_history_length, 34),
            dtype=np.float32)
        self.prev_state = self.state.copy()
        self.last_action = np.zeros((config.initial_num_cars, ),
                                    dtype=np.uint8)
        self.sum_loss = 0
        self.sum_reward = 0

        self.evaluation_phase = False
        self.population_phase = True
Example #3
    def __init__(self,
                 memory_capacity=1000000,
                 gamma=0.99,
                 input_dims=(4, 84, 84),
                 output_dim=6,
                 lr=0.000025):
        self.device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')

        # DQN model
        self.model = model.DQN(input_dims, output_dim, lr).to(self.device)
        # DQN target
        self.target = model.DQN(input_dims, output_dim, lr).to(self.device)
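        # Start the target network from the same weights as the online network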
        self.target.load_state_dict(self.model.state_dict())

        # Loss function is Huber loss, https://en.wikipedia.org/wiki/Huber_loss, feel free to change to MSE
        self.loss = lambda expected, target: F.smooth_l1_loss(expected, target)

        # Agent's experience replay memory
        self.memory = memory.ExperienceReplay(memory_capacity)

        # gamma hyperparam for calculating loss
        self.gamma = gamma
Example #4
def train():
    # Initialize the network
    dqn = model.DQN()
    print(dqn)

    # # Get the compute device
    # if torch.cuda.is_available():
    #     device = torch.device('cuda:0')
    #     num_gpu = torch.cuda.device_count()
    #     if num_gpu > 1:
    #         dqn = nn.DataParallel(dqn)
    #     print('Using %d GPU...' % num_gpu)
    # else:
    #     device = torch.device('cpu')
    #     print('Using CPU...')
    # # Move the network to the device
    # dqn.to(device)

    print('\nCollecting experience...')
    for epoch in range(400):
        s0 = dataset.env.reset()
        # s0 = s0.to(device)
        ep_r = 0
        while True:
            dataset.env.render()
            a = dqn.choose_action(s0)
            # take action
            s1, r, done, info = dataset.env.step(a)
            # modify the reward, since the raw reward is always 1
            x, x_dot, theta, theta_dot = s1
            r1 = (dataset.env.x_threshold -
                  abs(x)) / dataset.env.x_threshold - 0.8
            r2 = (dataset.env.theta_threshold_radians -
                  abs(theta)) / dataset.env.theta_threshold_radians - 0.5
            r = r1 + r2

            # s0,a,r,s1 = s0.to(device),a.to(device),r.to(device),s1.to(device)

            dqn.store_transition(s0, a, r, s1)

            ep_r += r
            if dqn.memory_counter > dataset.MEMORY_CAPACITY:
                dqn.learn()
                if done:
                    print('Ep:', epoch, '\tEp_r:', round(ep_r, 2))

            if done:
                break
            s0 = s1
Example #5
def train_hierarchical_dqn(episodes, agent):
	'''For comparison with RL agent 
	that learns the overall task'''
	#Create model
	loss = []
	hierarchical_model = md.DQN(2,4,'Hierarchical')
	for epi in range(episodes):
		try:
				#agent.reset(False)
				subgoal_reached = False
				target_count = 0
				agent.reset(True)
				print('spawn succeeded----------')
				state = [0,100,90,0]
				state = np.reshape(state, (1,4))
				score = 0
				max_step = 250
				for i in range(max_step):
					choice = hierarchical_model.act(state)
					action = choose_action_hierarchical(choice)
					print(f"action ------------------> {action}")
					next_state, reward, done, subgoal_reached, _ = agent.step_hierarchical(action,target_count)
					if subgoal_reached:
						target_count = 1
					print(f'obs----------->{next_state}-----reward--- {reward} -----done--{done}----{target_count}-----{subgoal_reached}')
					time.sleep(0.3)
					score += reward
					next_state = np.reshape(next_state, (1, 4))
					hierarchical_model.remember(state, choice, reward, next_state, done)
					state = next_state
					hierarchical_model.replay(done,epi,loss)
					if done:
						print("episode: {}/{}, score: {}".format(epi, episodes, score))
						break
				loss.append(score)
				# Average score of the last 100 episodes
				is_solved = 0
				if len(loss)>=100:
					is_solved = np.mean(loss[-100:])
				if is_solved > 150:
					print('\n Task Completed! \n')
					break
				print("Average over last 100 episode: {0:.2f} \n".format(is_solved))
		finally:
			hierarchical_model.save_model()
			if agent is not None:
				agent.destroy()
				time.sleep(5)
	return loss
Example #6
def train_right_lane_change(episodes,agent):
	#Create model
	loss = []
	right_lane_model = md.DQN(4,4,'Right_Lane')
	for epi in range(episodes):
		try:
				agent.reset(True)
				# State space: normal distance, y difference and x difference
				state = [5,5,5,5]
				state = np.reshape(state, (1,4))
				score = 0
				max_step = 1000
				for i in range(max_step):
					choice = right_lane_model.act(state)
					action = choose_action_rightlanechange(choice)
					if(epi>=epi_count and epi_count<210):
						if(agent.lane_id_ego!=agent.lane_id_target and action[1]<0):
							action[1] =  0.21
							choice = 0
						elif agent.lane_id_ego==agent.lane_id_target:
							if(agent.yaw_vehicle>1):
								action[1] = -0.14
								choice = 1
					print(f"action 222------------------> {action}")
					next_state, reward, done, _ = agent.step_rightlanechange(action)
					time.sleep(0.1)
					score += reward
					next_state = np.reshape(next_state, (1, 4))
					right_lane_model.remember(state, choice, reward, next_state, done)
					state = next_state
					right_lane_model.replay(done,epi,loss)
					if done:
						print("episode: {}/{}, score: {}".format(epi, episodes, score))
						break
				loss.append(score)
				# Average score of the last 100 episodes
				is_solved = 0
				if len(loss)>=100:
					is_solved = np.mean(loss[-100:])
				if is_solved > 1050:
					print('\n Task Completed! \n')
					break
				print("Average over last 100 episode: {0:.2f} \n".format(is_solved))
		finally:
			right_lane_model.save_model()
			if agent is not None:
				agent.destroy()
				time.sleep(1)
	return loss
Example #7
def train_overall_dqn(episodes, agent):
	'''For comparison with RL agent 
	that learns the overall task'''
	#Create model
	loss = []
	overall_model = md.DQN(4,3,'Overall')
	for epi in range(episodes):
		try:
				#agent.reset(False)
				agent.reset(True)
				print('spawn succeeded----------')
				state = [0,50,0]
				state = np.reshape(state, (1,3))
				score = 0
				max_step = 100
				for i in range(max_step):
					choice = overall_model.act(state)
					action = choose_action_overall(choice)
					print(f"action ------------------> {action}")
					next_state, reward, done, _ = agent.step_overall(action)
					print(f'obs----------->{next_state}-----reward--- {reward} -----done--{done}')
					time.sleep(0.5)
					score += reward
					next_state = np.reshape(next_state, (1, 3))
					overall_model.remember(state, choice, reward, next_state, done)
					state = next_state
					overall_model.replay(done,epi,loss)
					if done:
						print("episode: {}/{}, score: {}".format(epi, episodes, score))
						break
				loss.append(score)
				# Average score of the last 100 episodes
				is_solved = 0
				if len(loss)>=100:
					is_solved = np.mean(loss[-100:])
				if is_solved > 500:
					print('\n Task Completed! \n')
					break
				print("Average over last 100 episode: {0:.2f} \n".format(is_solved))
		finally:
			overall_model.save_model()
			if agent is not None:
				agent.destroy()
				time.sleep(5)
	return loss
Example #8
def train_left_DQN(episodes,agent):
	#Create model
	loss = []
	left_turn_model = md.DQN(4,4,'Left_Turn')  #corrected from 2 to 3
	for epi in range(episodes):
		try:
				#agent.reset(False)
				agent.reset(True)
				traffic_light = None
				print('spawn succeeded----------')
				#Get the first state
				state = [50,90,0,0]
				state = np.reshape(state, (1,4))
				score = 0
				max_step = 200
				for i in range(max_step):
					choice = left_turn_model.act(state)
					action = choose_action_leftturn(choice)
					print(f"action ------------------> {action}")
					next_state, reward, done, _ = agent.step_leftturn(action)
					print(f'obs----------->{next_state}-----reward--- {reward} -----done--{done}')
					time.sleep(0.5)
					score += reward
					next_state = np.reshape(next_state, (1, 4))
					left_turn_model.remember(state, choice, reward, next_state, done)
					state = next_state
					left_turn_model.replay(done,epi,loss)
					if done:
						print("episode: {}/{}, score: {}".format(epi, episodes, score))
						break
				loss.append(score)
				# Average score of the last 100 episodes
				is_solved = 0
				if len(loss)>=100:
					is_solved = np.mean(loss[-100:])
				if is_solved > 800:
					print('\n Task Completed! \n')
					break
				print("Average over last 100 episode: {0:.2f} \n".format(is_solved))
		finally:
			left_turn_model.save_model()
			if agent is not None:
				agent.destroy()
				time.sleep(1)
	return loss
Example #9
def demo(num_episode=1):
    eps = 0.01
    env_raw = make_atari(args.env_name)
    env = wrap_deepmind(env_raw)
    c, h, w = m.fp(env.reset()).shape
    n_actions = env.action_space.n
    policy_net = m.DQN(h, w, n_actions, device).to(device)
    if device == "cuda":
        policy_net.load_state_dict(
            torch.load("models/" +
                       args.env_name.replace("NoFrameskip-v4", "") +
                       "_best.pth"))
    else:
        policy_net.load_state_dict(torch.load("models/"+args.env_name.replace("NoFrameskip-v4","")+\
            "_best.pth", map_location=torch.device('cpu')))
    policy_net.eval()
    sa = m.ActionSelector(eps, eps, policy_net, 100, n_actions, device)
    q = deque(maxlen=5)
    e_rewards = []
    for eee in range(num_episode):
        print("Demo episode %d/%d" % (eee + 1, num_episode) + "...")
        env.reset()
        e_reward = 0
        for _ in range(5):  # no-op
            n_frame, _, done, _ = env.step(0)
            n_frame = m.fp(n_frame)
            q.append(n_frame)

        while not done:
            if num_episode <= 1:
                env.render()
                time.sleep(0.02)
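            # Use the four most recent frames (dropping the oldest of the five queued) as the network input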
            state = torch.cat(list(q))[1:].unsqueeze(0)
            action, eps = sa.select_action(state, False)
            n_frame, reward, done, _ = env.step(action)
            n_frame = m.fp(n_frame)
            q.append(n_frame)
            e_reward += reward

        e_rewards.append(e_reward)
    avg_reward = float(sum(e_rewards)) / float(num_episode)
    env.close()
    print("Average reward of " + args.env_name + " is %.1f" % (avg_reward))
    print("Average std of " + args.env_name + " is %.1f" % (np.std(e_rewards)))
Example #10
def train_straight_DQN(episodes,agent):
	#Create model
	loss = []
	straight_model = md.DQN(5,4,'Straight_Model')
	for epi in range(episodes):
		try:
				agent.reset(True)
				#Get the first state (speed, distance from junction)
				state = [0,100,0,15]
				state = np.reshape(state, (1,4))
				score = 0
				max_step = 500
				for i in range(max_step):
					choice = straight_model.act(state)
					action = choose_action_straight(choice)
					print(f"action ------------------> {action}")
					next_state, reward, done, _ = agent.step_straight(action)
					print(f'obs----------->{next_state}-----reward--- {reward} -----done--{done}')
					time.sleep(0.5)
					score += reward
					next_state = np.reshape(next_state, (1, 4))
					straight_model.remember(state, choice, reward, next_state, done)
					state = next_state
					straight_model.replay(done,epi,loss)
					if done:
						print("episode: {}/{}, score: {}".format(epi, episodes, score))
						break
				loss.append(score)
				# Average score of the last 100 episodes
				is_solved = np.mean(loss[-100:])
				if is_solved > 1000:
					print('\n Task Completed! \n')
					break
				print("Average over last 100 episode: {0:.2f} \n".format(is_solved))
		finally:
			straight_model.save_model()
			if agent is not None:
				agent.destroy()
				time.sleep(1)
	return loss
Example #11
def train_left_lane_change(episodes,agent):
	#Create model
	loss = []
	left_lane_model = md.DQN(4,4,'Left_Lane')
	for epi in range(episodes):
		try:
				agent.reset(True)
				# State space: normal distance, y difference and x difference
				state = [5,5,5,5]
				state = np.reshape(state, (1,4))
				score = 0
				max_step = 1000
				for i in range(max_step):
					choice = left_lane_model.act(state)
					action = choose_action_leftlanechange(choice)
					next_state, reward, done, _ = agent.step_leftlanechange(action)
					time.sleep(0.1)
					score += reward
					next_state = np.reshape(next_state, (1, 4))
					left_lane_model.remember(state, choice, reward, next_state, done)
					state = next_state
					left_lane_model.replay(done,epi,loss)
					if done:
						print("episode: {}/{}, score: {}".format(epi, episodes, score))
						break
				loss.append(score)
				# Average score of the last 100 episodes
				is_solved = 0
				if len(loss)>=100:
					is_solved = np.mean(loss[-100:])
				if is_solved > 1200:
					print('\n Task Completed! \n')
					break
				print("Average over last 100 episode: {0:.2f} \n".format(is_solved))
		finally:
			left_lane_model.save_model()
			if agent is not None:
				agent.destroy()
				time.sleep(10)
	return loss
Example #12
def train(params):
    global_step = tf.contrib.framework.get_or_create_global_step()
    #initialize model, environment and experience---------
    env = md.Environment(params['frame_skip'], params['game_name'])
    mod = md.DQN(params)
    args = [
        params['load_prev'], params['input_size'], params['frame_stack'],
        params['max_epi'], params['replay_start'], params['exp_file']
    ]
    exp = md.Experience(*args)  #get all arguments from args list
    #-----------------------------------------------------

    #-----Part 1---------
    frame_stack_ph = tf.placeholder(tf.uint8, [params['frame_stack']] + params['orig_inp'])
    #frame stack placeholder
    preprocess = mod.preprocess(frame_stack_ph)
    #preprocessed input
    israndom_ph = tf.placeholder(tf.bool)
    #placeholder for getting random action
    action = mod.get_action([preprocess], israndom=israndom_ph)
    #keep in mind that action is of size [1]
    #here, should run action and store experience into Experience data

    #----Part 2----------
    #get batch of state,action,reward,new state,done
    state_shape = [params['batch_size']] + params['input_size'] + [params['frame_stack']]
    #shape that state (and new state) is in
    state_ph = tf.placeholder(tf.uint8, shape=state_shape)  #state placeholder
    action_ph = tf.placeholder(tf.int64, shape=[params['batch_size']])
    reward_ph = tf.placeholder(tf.float32, shape=[params['batch_size']])
    new_state_ph = tf.placeholder(tf.uint8, shape=state_shape)
    done_ph = tf.placeholder(tf.bool, shape=[params['batch_size']])
    batch_ph = [state_ph, action_ph, reward_ph, new_state_ph, done_ph]
    #batch_ph is not a placeholder itself, but a collection of placeholders
    train_opt = mod.train(global_step, batch_ph)
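    # switch_params() returns assign ops; the loop below runs them every params['target_update'] steps to update the target network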
    assign_list = mod.switch_params()

    #------training session-----------
    if params['load_prev']:
        saver = tf.train.Saver()
        ckpt = tf.train.get_checkpoint_state(params['checkpoint_dir'])
    with tf.train.MonitoredTrainingSession(
            checkpoint_dir=params['checkpoint_dir']) as sess:
        if params['load_prev']:
            saver.restore(sess, ckpt.model_checkpoint_path)
        #document steps
        eps_step = 0  #number of episodes that passed
        time_step = 0  #steps after experience replay has started
        total_start_time = time.time()
        total_step = 0  #steps in total
        while eps_step <= params['step_cap']:
            mod.init_frame_stack()
            #initialize frame stack
            x1 = env.reset()  # start the environment and get the initial observation
            eps_run_time = time.time()  #start the runtime of an episode
            step_in_ep = 0  #steps passed in current episode
            mod.add_frame(x1)  #add initial observation into the stack
            total_r = 0
            while True:
                #part 1---------
                experience_dict = {}
                israndom_val = random.random() <= mod.rand_act_prob[0]
                #get a random bool
                experience_dict['state'], [experience_dict['action']] = sess.run(
                    [preprocess, action],
                    feed_dict={frame_stack_ph: mod.get_stack(),
                               israndom_ph: israndom_val})
                #get state and action values
                if mod.rand_act_prob[0] > params['rand_action'][1]:
                    mod.rand_act_prob[0] -= mod.rand_act_prob[1]
                new_unprocessed_state_val, experience_dict['reward'], experience_dict['done'] = env.run(
                    experience_dict['action'])
                mod.add_frame(new_unprocessed_state_val)
                experience_dict['new_state'] = sess.run(
                    preprocess, feed_dict={frame_stack_ph: mod.get_stack()})
                exp.add_exp(experience_dict)
                #add experience
                #part 2---------
                batch_val = exp.get_batch(params['batch_size'])
                if batch_val is not None:
                    sess.run([train_opt],
                             feed_dict={
                                 batch_ph[i]: batch_val[i]
                                 for i in range(len(batch_ph))
                             })
                    if not time_step % params['target_update']:
                        sess.run(assign_list)
                    time_step += 1
                total_step += 1
                step_in_ep += 1
                total_r += experience_dict['reward']
                if experience_dict['done']:
                    cur_eps_run_time = ut.timer(time.time() - eps_run_time)
                    total_run_time = ut.timer(time.time() - total_start_time)
                    string = "episodes ran: %d,steps ran in episode: %d, Total steps taken: %d,reward: %.4f,episode run time:%s,total run time:%s"
                    print(string % (eps_step, step_in_ep, total_step, total_r,
                                    cur_eps_run_time, total_run_time))
                    break
            eps_step += 1
Example #13
# Main function
if __name__ == '__main__':
    # set unity environment path (file_name)
    env = UnityEnvironment(file_name=config.env_name)
    # env = UnityEnvironment(file_name=config.env_name, worker_id=np.random.randint(100000))

    # setting brain for unity
    default_brain = env.brain_names[0]
    brain = env.brains[default_brain]

    train_mode = config.train_mode

    device = config.device

    model_ = model.DQN(config.action_size, "main").to(device)
    target_model_ = model.DQN(config.action_size, "target").to(device)

    model_RND = model.RND(config.action_size, "RND").to(device)
    models = [model_, model_RND]

    # Extract only the RND model parameters that should be trained, so they can be passed to the optimizer
    param_active_list = []
    param_frozen_list = []
    for name, param in model_RND.named_parameters():

        if str(name).startswith('model_active'):
            param_active_list.append(param)
        elif str(name).startswith('model_frozen'):
            param.requires_grad = False
            param_frozen_list.append(param)
Example #14
                        required=True,
                        help="Enter Model File")
    parser.add_argument(
        "--env",
        default=DEFAULT_ENV_NAME,
        help="Enter environment name, default: {}".format(DEFAULT_ENV_NAME))
    parser.add_argument("-r",
                        "--record",
                        help="Enter directory to store recorded video")
    args = parser.parse_args()

    env = wrappers.make_env(args.env)
    if args.record:
        env = gym.wrappers.Monitor(env, args.record)

    net = model.DQN(env.observation_space.shape, env.action_space.n)
    net.load_state_dict(torch.load(args.model))

    state = env.reset()
    total_reward = 0.0

    while True:
        start_time = time.time()
        env.render()
        state_v = torch.tensor(np.array([state], copy=False))
        q_vals = net(state_v).data.numpy()[0]
        action = np.argmax(q_vals)

        state, reward, done, _ = env.step(action)
        total_reward += reward
        if done:
            "batch_size" : 32,
            "replay_initial" : 10000,
            "capacity" : 100000,
            "reward_steps" : 1,
        },
    }

    params = HYPERPARAMS["breakout"]

    scores, eps_history = [], []

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    env = wrappers.make_env(params["env_name"])

    policy_network = model.DQN(env.observation_space.shape, env.action_space.n).to(device)
    target_network = ptan.agent.TargetNet(policy_network)
    optimizer = optim.Adam(policy_network.parameters(), lr=params["learning_rate"])

    action_selector = ptan.actions.EpsilonGreedyActionSelector(epsilon=params["eps_start"])
    agent = ptan.agent.DQNAgent(policy_network, action_selector, device)
    epsilon_tracker = EpsilonTracker(action_selector, params)

    exp_source  = ptan.experience.ExperienceSourceFirstLast(env, agent,
                  gamma=params["gamma"], steps_count=params["reward_steps"])
    buffer = ptan.experience.ExperienceReplayBuffer(exp_source, buffer_size=params["capacity"]) 
    writer = SummaryWriter("run")

    current_step = 0

    with utils.RewardTracker(writer, params) as reward_tracker:
Example #16
        if render:
            time_to_sleep = wait_time - (time.time() - start_time)
            if time_to_sleep > 0:
                time.sleep(time_to_sleep)

    return total_reward


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda",
                        default=False,
                        action="store_true",
                        help="Render on graphics card(cuda:0).")
    parser.add_argument("--env",
                        default=ENV_NAME,
                        help="Name of the environment, default=" + ENV_NAME)
    parser.add_argument("-m", "--model", help="DQN")
    args = parser.parse_args()

    device = torch.device(GRAPHICS_CARD if args.cuda else "cpu")

    env = wrappers.make_atari(args.env)
    env = wrappers.wrap_deepmind(env, False, False, True)

    net = model.DQN(4, env.action_space.n).to(device)
    net.load_state_dict(torch.load(args.model))

    score = play(env, net, True, device)
    print(f"Score: {score}")
Example #17
    output = DQN(x_stack)
    loss = criterion(output, y_stack)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss


env = gym.make('CartPole-v0')

input_size = env.observation_space.shape[0]
output_size = env.action_space.n
model = model.DQN(input_size, output_size, [10])

criterion = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# Make replay buffer
REPLAY_MEMORY_SIZE = 50000
replay_buffer = deque(maxlen=REPLAY_MEMORY_SIZE)

gamma = 0.9
num_episodes = 5000
rList = []
loss_list = []
model.train()

for i in range(num_episodes):
Example #18
# 3. environment reset
env_name = args.env_id.replace(
    "NoFrameskip-v4",
    "") if "NoFrameskip-v4" in args.env_id else args.env_id.replace(
        "-ramNoFrameskip-v4", "")
env_raw = make_atari(args.env_id)
env = wrap_deepmind(env_raw,
                    frame_stack=False,
                    episode_life=True,
                    clip_rewards=True)

c, h, w = m.fp(env.reset()).shape
n_actions = env.action_space.n

# 4. Network reset
policy_net = m.DQN(h, w, n_actions, device).to(device)
target_net = m.DQN(h, w, n_actions, device).to(device)
policy_net.apply(
    policy_net.init_weights
)  # apply() runs init_weights on every submodule, so init_weights needs no change if the model structure changes
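# Copy the initialized weights into the target network; eval() keeps it in inference mode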
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

# 5. DQN hyperparameters
BATCH_SIZE = 32
GAMMA = 0.99
EPS_START = 1.
EPS_END = 0.1
EPS_DECAY = 1000000
TARGET_UPDATE = 10000
NUM_STEPS = 15000000
Example #19
        "--env",
        default=ENV_NAME,
        type=str,
        help="Name of environment, Default: {}".format(ENV_NAME))
    parser.add_argument(
        "--reward",
        type=float,
        default=MEAN_REWARD_BOUND,
        help="Mean reward boundary to stop training, Default: {:.2f}".format(
            MEAN_REWARD_BOUND))

    args = parser.parse_args()
    device = torch.device("cuda" if args.gpu else "cpu")

    env = wrappers.make_env(args.env)
    net = model.DQN(env.observation_space.shape, env.action_space.n).to(device)
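    # Separate target network with the same architecture, used to compute the TD targets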
    tgt_net = model.DQN(env.observation_space.shape,
                        env.action_space.n).to(device)

    writer = SummaryWriter(logdir="logs", comment="-" + args.env)
    # print(net)

    buffer = ExperienceBuffer(REPLAY_SIZE)
    agent = Agent(env, buffer)

    epsilon = EPSILON_START
    optimizer = optim.Adam(net.parameters(), lr=LEARNING_RATE)

    total_rewards = []
    frame_idx = 0
    best_mean_reward = None
Example #20
    def run_model(self, obs):
        data = Data('day', 'minute', '1', 'true', self.ticker, '10')
        prices = data.get_prices_formatted()

        env = enviornment.StocksEnv(prices)

        net = model.DQN(env.observation_space.shape[0], env.action_space.n)
        net.load_state_dict(torch.load('RL\saves\(episode-60800.000.data'))
        total_reward = 0.00
        total_balance = 10000
        step_idx = 0

        balance = []
        rewards = []
        profit = []

        epochs = self.epoch
        epoch_step = 1

        while epoch_step <= epochs:
            obs = env.reset()
            while True:
                step_idx += 1
                obs_v = torch.tensor([obs])
                out_v = net(obs_v)
                action_idx = out_v.max(dim=1)[1].item()

                if np.random.random() < self.epsilon:
                    action_idx = env.action_space.sample()

                action = enviornment.Actions(action_idx)

                if action == enviornment.Actions.Buy and not env._state.have_position:
                    start_price = env._state.curr_close()
                    total_balance -= start_price
                    balance.append(total_balance)

                obs, reward, done, _ = env.step(action_idx)

                total_reward += reward
                rewards.append(total_reward)

                if step_idx % 100 == 0:
                    print("Epoch %d, Step_idx: %d reward = %.3f" %
                          (epoch_step, step_idx, total_reward))

                if done:
                    profit_received = (env._state.curr_close() -
                                       start_price) / start_price
                    profit.append(profit_received)
                    total_balance += env._state.curr_close()
                    balance.append(total_balance)
                    break
            epoch_step += 1

        file_name_profit = 'RL\logs\profits\profit_%s_%s.txt' % (
            self.ticker, str(time.time()))
        file_name_reward = 'RL\logs\\rewards\\reward_{}_{}.txt'.format(
            self.ticker, str(time.time()))

        file_name_balance = 'RL\logs\\balances\\balance_{}_{}.txt'.format(
            self.ticker, str(time.time()))

        with open(file_name_profit, 'w') as f:
            f.writelines('%s,' % x for x in profit)

        with open(file_name_reward, 'w') as f:
            f.writelines('%s,' % x for x in rewards)

        with open(file_name_balance, 'w') as f:
            f.writelines('%s,' % x for x in balance)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--cuda", default=False, action="store_true",
                        help="Enable cuda")
    parser.add_argument("--env", default=ENV_NAME,
                        help="Name of the environment, default=" + ENV_NAME)
    args = parser.parse_args()
    device = torch.device(GRAPHICS_CARD if args.cuda else "cpu")

    env = wrappers.make_atari(args.env)
    env = wrappers.wrap_deepmind(env, episode_life=False, frame_stack=True)
    exp_buffer = ExperienceBuffer(REPLAY_MEMORY_SIZE)
    agent = Agent(env, exp_buffer)

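    # Online and target networks; the target starts as a copy of the online network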
    net = model.DQN(AGENT_HIST_LENGTH, env.action_space.n).to(device)
    tgt_net = model.DQN(AGENT_HIST_LENGTH, env.action_space.n).to(device)
    tgt_net.load_state_dict(net.state_dict())

    criterion = nn.MSELoss()
    optimizer = optim.RMSprop(net.parameters(), lr=LEARNING_RATE,
                              momentum=GRAD_MOMENTUM, eps=MIN_SQ_GRAD)

    writer = SummaryWriter(comment="-" + args.env)

    remaining_time_buffer = collections.deque(maxlen=100)
    last_100_rewards_training = collections.deque(maxlen=100)
    last_100_rewards_test = collections.deque(maxlen=100)

    episode_idx = 0
    frame_idx = 0