Example #1
    def __init__(self, state_size, action_size, seed, network):
        """Initialize an Agent object.
        
        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.network = network

        # Q-Network
        if self.network == "duel":
            self.qnetwork_local = DuelingDQN(state_size, action_size,
                                             seed).to(device)
            self.qnetwork_target = DuelingDQN(state_size, action_size,
                                              seed).to(device)
            self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                        lr=LR)

        else:
            self.qnetwork_local = DQN(state_size, action_size, seed).to(device)
            self.qnetwork_target = DQN(state_size, action_size,
                                       seed).to(device)
            self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                        lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
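The snippet above shows only the constructor; agents of this shape typically blend the target network toward the local one after each learning step. A minimal sketch of that soft update, assuming a Polyak factor `tau` (the constant itself is not shown in the snippet):

    def soft_update(self, local_model, target_model, tau):
        # Polyak averaging: theta_target <- tau * theta_local + (1 - tau) * theta_target
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)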
Example #2
    def __init__(self, args, env):
        self.action_space = env.action_space()
        self.atoms = args.atoms
        self.Vmin = args.V_min
        self.Vmax = args.V_max
        self.support = torch.linspace(args.V_min, args.V_max, self.atoms).to(device=args.device)  # Support (range) of z
        self.delta_z = (args.V_max - args.V_min) / (self.atoms - 1)
        self.batch_size = args.batch_size
        self.n = args.multi_step
        self.discount = args.discount
        self.norm_clip = args.norm_clip

        self.online_net = DQN(args, self.action_space).to(device=args.device)
        if args.model:  # Load pretrained model if provided
            if os.path.isfile(args.model):
                state_dict = torch.load(args.model, map_location='cpu')  # Always load tensors onto CPU by default, will shift to GPU if necessary
                if 'conv1.weight' in state_dict.keys():
                    for old_key, new_key in (('conv1.weight', 'convs.0.weight'), ('conv1.bias', 'convs.0.bias'), ('conv2.weight', 'convs.2.weight'), ('conv2.bias', 'convs.2.bias'), ('conv3.weight', 'convs.4.weight'), ('conv3.bias', 'convs.4.bias')):
                        state_dict[new_key] = state_dict[old_key]  # Re-map state dict for old pretrained models
                        del state_dict[old_key]  # Delete old keys for strict load_state_dict
                self.online_net.load_state_dict(state_dict)
                print("Loading pretrained model: " + args.model)
            else:  # Raise error if incorrect model path provided
                raise FileNotFoundError(args.model)

        self.online_net.train()

        self.target_net = DQN(args, self.action_space).to(device=args.device)
        self.update_target_net()
        self.target_net.train()
        for param in self.target_net.parameters():
            param.requires_grad = False

        self.optimiser = optim.Adam(self.online_net.parameters(), lr=args.learning_rate, eps=args.adam_eps)
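`update_target_net()` is called here but not shown; in Rainbow-style agents like this it is usually a hard copy of the online weights. A sketch under that assumption, using the attribute names from the snippet:

    def update_target_net(self):
        # Hard update: overwrite the target network with the online network's weights.
        self.target_net.load_state_dict(self.online_net.state_dict())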
Example #3
 def __init__(self, time_step, split, lr):
     self.dataset = Dataset(T=time_step,
                            split_ratio=split,
                            binary_file=config.BINARY_DATASET)
     self.policy_net_encoder = AttnEncoder(
         input_size=self.dataset.get_num_features(),
         hidden_size=config.ENCODER_HIDDEN_SIZE,
         time_step=time_step)
     self.policy_net_decoder = AttnDecoder(
         code_hidden_size=config.ENCODER_HIDDEN_SIZE,
         hidden_size=config.DECODER_HIDDEN_SIZE,
         time_step=time_step)
     self.policy_net = DQN(self.policy_net_encoder, self.policy_net_decoder)
     self.target_net_encoder = AttnEncoder(
         input_size=self.dataset.get_num_features(),
         hidden_size=config.ENCODER_HIDDEN_SIZE,
         time_step=time_step)
     self.target_net_decoder = AttnDecoder(
         code_hidden_size=config.ENCODER_HIDDEN_SIZE,
         hidden_size=config.DECODER_HIDDEN_SIZE,
         time_step=time_step)
     self.target_net = DQN(self.target_net_encoder, self.target_net_decoder)
     if torch.cuda.is_available():
         self.policy_net_encoder = self.policy_net_encoder.cuda()
         self.policy_net_decoder = self.policy_net_decoder.cuda()
         self.target_net_encoder = self.target_net_encoder.cuda()
         self.target_net_decoder = self.target_net_decoder.cuda()
         self.policy_net = self.policy_net.cuda()
         self.target_net = self.target_net.cuda()
     self.memory = ReplayMemory(config.MEMORY_CAPACITY)
     self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=lr)
Example #4
    def __init__(self, args, env):
        self.action_space = env.action_space()
        self.atoms = args.atoms  # size of value distribution.
        self.Vmin = args.V_min
        self.Vmax = args.V_max
        self.support = torch.linspace(args.V_min, args.V_max,
                                      self.atoms).to(device=args.device)
        self.delta_z = (args.V_max - args.V_min) / (self.atoms - 1)
        self.batch_size = args.batch_size
        self.n = args.multi_step
        self.discount = args.discount

        self.online_net = DQN(args, self.action_space).to(
            device=args.device)  # greedily selects the action.
        if args.model and os.path.isfile(args.model):
            # state_dict: Python dictionary that maps each layer to its parameters.
            self.online_net.load_state_dict(
                torch.load(args.model, map_location='cpu'))
        self.online_net.train()

        self.target_net = DQN(args, self.action_space).to(
            device=args.device)  # used to compute target Q-values.
        self.update_target_net()  # set it to the parameters of the online network.
        self.target_net.train()
        for param in self.target_net.parameters():
            # not updated through backpropagation.
            param.requires_grad = False

        self.optimiser = optim.Adam(self.online_net.parameters(),
                                    lr=args.lr,
                                    eps=args.adam_eps)
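Because this is a distributional (C51-style) agent, greedy action selection sums the support over the predicted atom probabilities. A hedged sketch of such an act() method, assuming `state` is already a tensor on the right device and the network returns per-action atom probabilities:

    def act(self, state):
        # Expected value per action: Q(s, a) = sum_i p_i(s, a) * z_i
        with torch.no_grad():
            probs = self.online_net(state.unsqueeze(0))   # (1, actions, atoms)
            q_values = (probs * self.support).sum(2)      # (1, actions)
            return q_values.argmax(1).item()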
Example #5
    def __init__(self):
        """
        initializes all the class variables
        """
        self.env = gym.make('CartPole-v0').unwrapped
        self.resize = T.Compose([
            T.ToPILImage(),
            T.Resize(40, interpolation=Image.CUBIC),
            T.ToTensor()
        ])
        self.env.reset()
        init_screen = self.get_screen()
        self.env.reset()
        _, _, screen_height, screen_width = init_screen.shape

        # Get number of actions from gym action space
        self.n_actions = self.env.action_space.n

        self.policy_net = DQN(screen_height, screen_width,
                              self.n_actions).to(device)
        self.target_net = DQN(screen_height, screen_width,
                              self.n_actions).to(device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=0.0001)
        self.memory = PriortizedReplayMemory(10000)
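`get_screen()` is referenced above but not shown. A simplified sketch of what it might look like, assuming numpy is imported as `np` and skipping the cart-centred cropping used in the official PyTorch DQN tutorial:

    def get_screen(self):
        # Render the frame as HWC uint8, convert to CHW float in [0, 1],
        # resize via self.resize, and add a batch dimension -> (1, C, H, W).
        screen = self.env.render(mode='rgb_array').transpose((2, 0, 1))
        screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
        screen = torch.from_numpy(screen)
        return self.resize(screen).unsqueeze(0).to(device)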
Example #6
    def __init__(self, args, env):
        self.action_space = env.action_space()
        self.atoms = args.atoms
        self.Vmin = args.V_min
        self.Vmax = args.V_max
        self.support = torch.linspace(args.V_min, args.V_max, self.atoms).to(
            device=args.device)  # Support (range) of z
        self.delta_z = (args.V_max - args.V_min) / (self.atoms - 1)
        self.batch_size = args.batch_size
        self.n = args.multi_step
        self.discount = args.discount

        self.online_net = DQN(args, self.action_space).to(device=args.device)
        if args.model:  # Load pretrained model if provided
            if os.path.isfile(args.model):
                self.online_net.load_state_dict(
                    torch.load(args.model, map_location='cpu')
                )  # Always load tensors onto CPU by default, will shift to GPU if necessary
                print("Loading pretrained model: " + args.model)
            else:  # Raise error if incorrect model path provided
                raise FileNotFoundError(args.model)

        self.online_net.train()

        self.target_net = DQN(args, self.action_space).to(device=args.device)
        self.update_target_net()
        self.target_net.train()
        for param in self.target_net.parameters():
            param.requires_grad = False

        self.optimiser = optim.Adam(self.online_net.parameters(),
                                    lr=args.learning_rate,
                                    eps=args.adam_eps)
Example #7
    def __init__(self, args, state_size, action_size):
        """Initialize an Agent object.
        
        Params
        ======
            args (class defined on the notebook): A set of parameters that will define the agent hyperparameters
            state_size (int): dimension of each state
            action_size (int): dimension of each action
        """
        self.state_size = state_size
        self.action_size = action_size
        self.params = args

        # Deep Q-Network
        if args.use_NoisyNet:
            self.DQN_local = DQN_NoisyNet(args, state_size,
                                          action_size).to(args.device)
            self.DQN_target = DQN_NoisyNet(args, state_size,
                                           action_size).to(args.device)
        else:
            self.DQN_local = DQN(args, state_size, action_size).to(args.device)
            self.DQN_target = DQN(args, state_size,
                                  action_size).to(args.device)

        self.optimizer = optim.Adam(self.DQN_local.parameters(),
                                    lr=args.lr,
                                    eps=args.adam_eps)

        # Replay memory
        self.memory = ReplayBuffer(args, action_size)
        # Initialize time step (for updating every args.target_update steps)
        self.t_step = 0
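The constructor tracks `t_step` "for updating every args.target_update steps"; a hypothetical step() consistent with that bookkeeping, assuming `args` also carries a `batch_size` and the ReplayBuffer exposes add(), sample() and __len__, with a learn() method defined elsewhere:

    def step(self, state, action, reward, next_state, done):
        # Store the transition, then learn once enough samples are buffered.
        self.memory.add(state, action, reward, next_state, done)
        self.t_step = (self.t_step + 1) % self.params.target_update
        if self.t_step == 0 and len(self.memory) > self.params.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)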
Example #8
 def __init__(self, args, obs):
     self.net = DQN(args.n_obs, args.n_action)
     self.target_net = DQN(args.n_obs, args.n_action)
     if os.path.isfile('./weights/ckpt.pth'):
         self.net.load_state_dict(torch.load('./weights/ckpt.pth'))
         self.target_net.load_state_dict(torch.load('./weights/ckpt.pth'))
     self.device = torch.device(
         'cuda' if torch.cuda.is_available() else 'cpu')
     self.state_preproc = StatePreproc(self.device)
     self.n_action = args.n_action
     self.gamma = args.gamma
     self.max_grad_norm = args.max_grad_norm
     self.num_procs = args.num_procs
     self.memory = ReplayBuffer(args)
     self.optimizer = torch.optim.Adam(self.net.parameters(),
                                       lr=args.lr,
                                       betas=(0.9, 0.99))
     self.criterion = torch.nn.MSELoss()
     # log
     self.log_episode_rewards = torch.zeros(self.num_procs,
                                            device=self.device,
                                            dtype=torch.float)
     self.episode_rewards = deque([0] * 100, maxlen=100)
     self.episode = 1
     self.init(obs)
     # eval
     self.test_episode = args.test_episode
Example #9
    def __init__(self, args, env):
        self.action_space = env.action_space()
        self.atoms = args.atoms
        self.Vmin = args.V_min
        self.Vmax = args.V_max
        self.support = torch.linspace(args.V_min, args.V_max, self.atoms).to(
            device=args.device)  # Support (range) of z
        self.delta_z = (args.V_max - args.V_min) / (self.atoms - 1)
        self.batch_size = args.batch_size
        self.n = args.multi_step
        self.discount = args.discount

        self.online_net = DQN(args, self.action_space).to(device=args.device)
        if args.model and os.path.isfile(args.model):
            # Always load tensors onto CPU by default, will shift to GPU if necessary
            self.online_net.load_state_dict(
                torch.load(args.model, map_location='cpu'))
        self.online_net.train()

        self.target_net = DQN(args, self.action_space).to(device=args.device)
        self.update_target_net()
        self.target_net.train()
        for param in self.target_net.parameters():
            param.requires_grad = False

        self.optimiser = optim.Adam(self.online_net.parameters(),
                                    lr=args.lr,
                                    eps=args.adam_eps)
Example #10
    def __init__(self,
                 state_size: int,
                 action_size: int,
                 replay_buffer: ReplayMemory,
                 seed: int,
                 batch_size=BATCH_SIZE,
                 update_every=UPDATE_EVERY,
                 tau=TAU,
                 gamma=GAMMA):
        """Initialize the agent"""

        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)
        self.batch_size = batch_size
        self.tau = tau
        self.update_target_every = update_every
        self.gamma = gamma

        self.qnet_local = DQN(state_size, action_size, seed).to(device)
        self.qnet_target = DQN(state_size, action_size, seed).to(device)

        self.optimizer = optim.Adam(self.qnet_local.parameters(), lr=LR)
        self.max_gradient_norm = float('inf')

        self.memory = replay_buffer

        self.t_step = 0
Example #11
def test():
    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, OBS_NUM, BUN_NUM, show_game=False)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, CHANNEL, NUM_ACTION)

    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state('model')
    saver.restore(sess, ckpt.model_checkpoint_path)

    total_succ = 0
    for episode in range(10000):
        terminal = False
        total_reward = 0

        state = game.reset()
        brain.init_state(state)

        step = 0
        while not terminal and step <= 200:
            action = brain.get_action()
            state, reward, terminal, succ = game.step(action)
            if terminal and succ:
                total_succ += 1
            step += 1

    print(total_succ)
Example #12
	def __init__(self, action_set, train=True, load_path=None):
		#1. Initialize agent params
		self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
		self.action_set = action_set
		self.action_number = len(action_set)
		self.steps_done = 0
		self.epsilon = Config.EPS_START
		self.episode_durations = []

		#2. Build networks
		self.policy_net = DQN().to(self.device)
		self.target_net = DQN().to(self.device)
		
		self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=Config.LEARNING_RATE)

		if not train:		
			self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=0)	
			self.policy_net.load(load_path, optimizer=self.optimizer)
			self.policy_net.eval()

		self.target_net.load_state_dict(self.policy_net.state_dict())
		self.target_net.eval()

		#3. Create Prioritized Experience Replay Memory
		self.memory = Memory(Config.MEMORY_SIZE)
Example #13
    def __init__(self, action_size):
        self.action_size = action_size

        # These are hyper parameters for the DQN
        self.discount_factor = 0.99
        self.epsilon = 1.0
        self.epsilon_min = 0.01
        self.explore_step = 500000
        self.epsilon_decay = (self.epsilon -
                              self.epsilon_min) / self.explore_step
        self.train_start = 100000
        self.update_target = 1000

        # Generate the memory
        self.memory = ReplayMemory()

        # Create the policy net and the target net
        self.policy_net = DQN(action_size)
        self.policy_net.to(device)
        self.target_net = DQN(action_size)
        self.target_net.to(device)

        self.optimizer = optim.Adam(params=self.policy_net.parameters(),
                                    lr=learning_rate)
        self.scheduler = optim.lr_scheduler.StepLR(
            self.optimizer,
            step_size=scheduler_step_size,
            gamma=scheduler_gamma)

        # Initialize the target network to the policy net's weights
        ### CODE ###
        self.update_target_net()
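With the linear epsilon schedule defined above, action selection is typically epsilon-greedy. A hypothetical get_action() consistent with those hyperparameters, assuming numpy, random and torch are imported and `device` is defined as in the rest of the snippet:

    def get_action(self, state):
        # Random action with probability epsilon, otherwise greedy w.r.t. the policy net.
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        with torch.no_grad():
            state = torch.FloatTensor(state).unsqueeze(0).to(device)
            return self.policy_net(state).argmax(1).item()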
Example #14
def test(env, args):
    current_model = DQN(env, args).to(args.device)
    current_model.eval()

    load_model(current_model, args)

    episode_reward = 0
    episode_length = 0

    state = env.reset()
    while True:
        if args.render:
            env.render()

        action = current_model.act(
            torch.FloatTensor(state).to(args.device), 0.)

        next_state, reward, done, _ = env.step(action)

        state = next_state
        episode_reward += reward
        episode_length += 1

        if done:
            break

    print("Test Result - Reward {} Length {}".format(episode_reward,
                                                     episode_length))
Example #15
 def __init__(self, env, args):
     super(DQNTrainer, self).__init__()
     self.model = DQN(env, args, Nash=False).to(args.device)
     self.target = DQN(env, args, Nash=False).to(args.device)
     self.replay_buffer = ReplayBuffer(args.buffer_size)
     self.optimizer = optim.Adam(self.model.parameters(), lr=args.lr)
     self.args = args
Example #16
    def __init__(self, state_size, action_size, config=RLConfig()):
        self.seed = random.seed(config.seed)
        self.state_size = state_size
        self.action_size = action_size
        self.batch_size = config.batch_size
        self.batch_indices = torch.arange(config.batch_size).long().to(device)
        self.samples_before_learning = config.samples_before_learning
        self.learn_interval = config.learning_interval
        self.parameter_update_interval = config.parameter_update_interval
        self.per_epsilon = config.per_epsilon
        self.tau = config.tau
        self.gamma = config.gamma

        if config.useDuelingDQN:
            self.qnetwork_local = DuelingDQN(state_size, action_size,
                                             config.seed).to(device)
            self.qnetwork_target = DuelingDQN(state_size, action_size,
                                              config.seed).to(device)
        else:
            self.qnetwork_local = DQN(state_size, action_size,
                                      config.seed).to(device)
            self.qnetwork_target = DQN(state_size, action_size,
                                       config.seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(),
                                    lr=config.learning_rate)

        self.doubleDQN = config.useDoubleDQN
        self.usePER = config.usePER
        if self.usePER:
            self.memory = PrioritizedReplayBuffer(config.buffer_size,
                                                  config.per_alpha)
        else:
            self.memory = ReplayBuffer(config.buffer_size)

        self.t_step = 0
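Since the config toggles double DQN, the target computation inside learn() would branch on `self.doubleDQN`. A hedged sketch of such a helper (the method name and tensor shapes are illustrative, not from the snippet; `dones` is assumed to be a float tensor):

    def compute_targets(self, rewards, next_states, dones):
        # Double DQN: the local net picks the next action, the target net evaluates it.
        with torch.no_grad():
            if self.doubleDQN:
                next_actions = self.qnetwork_local(next_states).argmax(1, keepdim=True)
                q_next = self.qnetwork_target(next_states).gather(1, next_actions)
            else:
                q_next = self.qnetwork_target(next_states).max(1, keepdim=True)[0]
        return rewards + self.gamma * q_next * (1 - dones)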
Example #17
  def __init__(self, args, env):
    self.action_space = env.action_space()
    self.atoms = args.atoms
    self.Vmin = args.V_min
    self.Vmax = args.V_max
    self.support = torch.linspace(args.V_min, args.V_max, args.atoms)  # Support (range) of z
    self.delta_z = (args.V_max - args.V_min) / (args.atoms - 1)
    self.batch_size = args.batch_size
    self.n = args.multi_step
    self.discount = args.discount
    self.priority_exponent = args.priority_exponent
    self.max_gradient_norm = args.max_gradient_norm

    self.policy_net = DQN(args, self.action_space)
    if args.model and os.path.isfile(args.model):
      self.policy_net.load_state_dict(torch.load(args.model))
    self.policy_net.train()

    self.target_net = DQN(args, self.action_space)
    self.update_target_net()
    self.target_net.eval()

    self.optimiser = optim.Adam(self.policy_net.parameters(), lr=args.lr, eps=args.adam_eps)
    if args.cuda:
      self.policy_net.cuda()
      self.target_net.cuda()
      self.support = self.support.cuda()
Example #18
def main(_):
    game = Game(screen_width, screen_height, show_game=not FLAGS.train)
    state = game.get_state()
    brain = DQN(n_action, screen_width, screen_height, state)

    while 1:
        game.reset()
        gameover = FLAGS.train

        print (" Avg. Reward: %d, Total Game: %d" % (
                    game.total_reward / game.total_game, game.total_game))

        while not gameover:
            # Use the DQN model to decide which action to take.
            action = brain.get_action(FLAGS.train)

            # Advance the game with the chosen action and get back the reward and whether it is over.
            reward, gameover = game.proceed(np.argmax(action))

            # Fetch the current state after the action chosen above.
            # The state is the screen layout of size screen_width x screen_height.
            state = game.get_state()

            # Train the DQN.
            brain.step(state, action, reward, gameover)

            # When not in training mode, show the game at a speed humans can follow.
            if not FLAGS.train:
                time.sleep(0.3)
Example #19
    def __init__(self, learner, actor_idx, epsilon):
        # environment initialization
        import gym
        import minerl
        self.actor_idx = actor_idx
        self.env = gym.make("MineRLTreechop-v0")
        self.port_number = int("12340") + actor_idx
        print("actor environment %d initialize successfully" % self.actor_idx)
        self.shared_network_cpu = ray.get(learner.get_network.remote())
        # self.shared_memory = ray.get(shared_memory_id)
        # print("shared memory assign successfully")

        # network initalization
        self.actor_network = DQN(19).cpu()
        self.actor_target_network = DQN(19).cpu()
        self.actor_network.load_state_dict(self.shared_network_cpu.state_dict())
        self.actor_target_network.load_state_dict(self.actor_network.state_dict())
        print("actor network %d initialize successfully" % self.actor_idx)

        self.initialized = False
        self.epi_counter = 0
        # exploring info
        self.epsilon = epsilon
        self.max_step = 100
        self.local_buffer_size = 100
        self.local_buffer = deque(maxlen=self.local_buffer_size)

        project_name = 'apex_dqfd_Actor%d' %(actor_idx)
        wandb.init(project=project_name, entity='neverparadise')
Example #20
    def __init__(self, action_set, train=True, load_path=None):
        #1. Initialize agent params
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.action_set = action_set
        self.action_number = len(action_set)
        self.steps_done = 0
        self.epsilon = Config.EPS_START
        self.episode_durations = []

        print('LOAD PATH    --  agent.init:', load_path)
        time.sleep(2)

        #2. Build networks
        self.policy_net = DQN().to(self.device)
        self.target_net = DQN().to(self.device)
        
        self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=Config.LEARNING_RATE)

        if not train:
            print('entrou no not train')        
            self.optimizer = optim.RMSprop(self.policy_net.parameters(), lr=0)    
            self.policy_net.load(load_path, optimizer=self.optimizer)
            self.policy_net.eval()

        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.memory = ReplayMemory(1000)
Example #21
    def __init__(self):
        self.device = args.device
        self.batch_size = args.batch_size
        self.lr = args.lr
        self.history_size = args.history_size
        self.replay_size = args.replay_size
        self.width = args.width
        self.height = args.height
        self.hidden_size = args.hidden_size
        self.action_size = args.action_size
        self.update_cycle = args.update_cycle
        self.log_interval = args.log_interval
        self.actor_num = args.actor_num
        self.alpha = 0.7
        self.beta_init = 0.4
        self.beta = self.beta_init
        self.beta_increment = 1e-6
        self.e = 1e-6
        self.dis = 0.99
        self.start_epoch = 0
        self.mainDQN = DQN(self.history_size, self.hidden_size,
                           self.action_size).to(self.device)
        self.targetDQN = DQN(self.history_size, self.hidden_size,
                             self.action_size).to(self.device)
        self.update_target_model()
        self.optimizer = optim.Adam(self.mainDQN.parameters(), lr=args.lr)
        self.replay_memory = deque(maxlen=self.replay_size)
        self.priority = deque(maxlen=self.replay_size)

        if args.load_model != '000000000000':
            self.log = args.log_directory + args.load_model + '/'
            args.time_stamp = args.load_model[:12]
            args.start_epoch = self.load_model()
        self.log = args.log_directory + args.time_stamp + config + '/'
        self.writer = SummaryWriter(self.log)
Example #22
    def __init__(self):
        self.mode = "train"
        with open("config.yaml") as reader:
            self.config = yaml.safe_load(reader)
        print(self.config)
        self.load_config()

        self.online_net = DQN(config=self.config,
                              word_vocab=self.word_vocab,
                              char_vocab=self.char_vocab,
                              answer_type=self.answer_type)
        self.target_net = DQN(config=self.config,
                              word_vocab=self.word_vocab,
                              char_vocab=self.char_vocab,
                              answer_type=self.answer_type)
        self.online_net.train()
        self.target_net.train()
        self.update_target_net()
        for param in self.target_net.parameters():
            param.requires_grad = False

        if self.use_cuda:
            self.online_net.cuda()
            self.target_net.cuda()

        self.naozi = ObservationPool(capacity=self.naozi_capacity)
        # optimizer
        self.optimizer = torch.optim.Adam(
            self.online_net.parameters(),
            lr=self.config['training']['optimizer']['learning_rate'])
        self.clip_grad_norm = self.config['training']['optimizer'][
            'clip_grad_norm']
Example #23
 def __init__(self):
     # build models
     self.Qt = DQN(in_channels=5, num_actions=18)  # Controller Q network
     self.Qt_t = DQN(in_channels=5,
                     num_actions=18)  # Controller target network
     # self.meta_controller = Model(in_channels=4, num_actions=10)
     self.Q = None  # Meta-Controller Q network
     self.Q_t = None  # Meta-Controller target network
Example #24
def main():
    max_episodes = 5000
    replay_buffer = deque()

    with tf.Session() as sess:
        mainDQN = DQN(sess, input_size, output_size, name='main')
        targetDQN = DQN(sess, input_size, output_size, name='target')
        tf.global_variables_initializer().run()

        copy_ops = get_copy_var_ops(dest_scope_name='target', src_scope_name='main')

        sess.run(copy_ops)

        for episode in range(max_episodes):
            e = 1. / ((episode / 10) + 1)
            done = False
            step_count = 0

            state = env.reset()

            while not done:
                if np.random.rand(1) < e:
                    action = env.action_space.sample()

                else:
                    action = np.argmax(mainDQN.predict(state))

                next_state, reward, done, _ = env.step(action)

                if done:
                    reward = -100

                replay_buffer.append((state, action, reward, next_state, done))

                if len(replay_buffer) > REPLAY_MEMORY:
                    replay_buffer.popleft()

                state = next_state
                step_count += 1
                if step_count > 10000:
                    break

            print("Episode: {} Steps: {}".format(episode, step_count))

            if step_count > 10000:
                pass

            if episode % 10 == 1:
                for _ in range(50):
                    minibatch = random.sample(replay_buffer, 10)
                    loss, _ = replay_train(mainDQN, targetDQN, minibatch)

                print("Loss: ", loss)

                sess.run(copy_ops)

        bot_play(mainDQN)
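`get_copy_var_ops()` builds the ops that sync the 'target' network with 'main' but is not shown above. A TF1-style sketch of such a helper, assuming both networks were built under the given variable scopes:

def get_copy_var_ops(dest_scope_name, src_scope_name):
    # Copy every trainable variable from src_scope_name into dest_scope_name.
    op_holder = []
    src_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=src_scope_name)
    dest_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=dest_scope_name)
    for src_var, dest_var in zip(src_vars, dest_vars):
        op_holder.append(dest_var.assign(src_var.value()))
    return op_holder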
Example #25
    def __init__(self, config: Config):
        self.config = config
        self.is_training = True
        self.buffer = ReplayBuffer(self.config.max_buff)

        self.model = DQN(self.config.state_dim, self.config.action_dim).cuda()
        self.model_optim = Adam(self.model.parameters(), lr=self.config.learning_rate)

        if self.config.use_cuda:
            self.cuda()
Example #26
 def __init__(self, env, model, optimizer, criterion, reward_func, config):
     super(DQNSolver, self).__init__(env, model, optimizer, criterion, reward_func, config)
     self.init_eps, self.final_eps, self.eps_step = config.init_eps, config.final_eps, config.eps_step
     self.target = DQN(in_c=config.in_c, num_actions=config.num_actions).to(self.device)
     self.target.load_state_dict(self.model.state_dict())
     self.batch_size, self.num_actions = config.batch_size, config.num_actions
     self.reward_mean, self.reward_list = None, deque(maxlen=config.display_interval)
     self.epsilon = self.init_eps
     if config.visdom:
         self._build_visdom()
Example #27
    def __init__(self, args, env):
        self.action_space = env.action_space()
        self.atoms = args.atoms
        self.Vmin = args.V_min
        self.Vmax = args.V_max
        self.support = torch.linspace(args.V_min, args.V_max, self.atoms).to(device=args.device)  # Support (range) of z
        self.delta_z = (args.V_max - args.V_min) / (self.atoms - 1)
        self.batch_size = args.batch_size
        self.n = args.multi_step
        self.discount = args.discount
        self.saved_model_path = args.saved_model_path
        self.experiment = args.experiment
        self.plots_path = args.plots_path
        self.data_save_path = args.data_save_path


        self.online_net = DQN(args, self.action_space).to(device=args.device)
        if args.model and os.path.isfile(args.model):
            # Always load tensors onto CPU by default, will shift to GPU if necessary
            self.online_net.load_state_dict(torch.load(args.model, map_location='cpu'))
        self.online_net.train()

        self.target_net = DQN(args, self.action_space).to(device=args.device)
        self.update_target_net()
        self.target_net.train()
        for param in self.target_net.parameters():
            param.requires_grad = False

        self.optimiser = optim.Adam(self.online_net.parameters(), lr=args.lr, eps=args.adam_eps)

        # list of layers:
        self.online_net_layers = [self.online_net.conv1,
                                  self.online_net.conv2,
                                  self.online_net.conv3,
                                  self.online_net.fc_h_v,
                                  self.online_net.fc_h_a,
                                  self.online_net.fc_z_v,
                                  self.online_net.fc_z_a
                                  ]

        self.target_net_layers = [self.target_net.conv1,
                                  self.target_net.conv2,
                                  self.target_net.conv3,
                                  self.target_net.fc_h_v,
                                  self.target_net.fc_h_a,
                                  self.target_net.fc_z_v,
                                  self.target_net.fc_z_a
                                  ]

        # freeze all layers except the last, and reinitialize last
        if args.freeze_layers > 0:
            self.freeze_layers(args.freeze_layers)

        if args.reinitialize_layers > 0:
            self.reinit_layers(args.reinitialize_layers)
Example #28
    def __init__(self, param_server, batch_size, num_channels, num_actions):
        self.learner_network = DQN(num_channels, num_actions).cuda().float()
        self.learner_target_network = DQN(num_channels,
                                          num_actions).cuda().float()
        self.count = 0
        self.batch_size = batch_size
        self.writer = SummaryWriter(f'runs/apex/learner')

        self.lr = LR
        self.optimizer = optim.Adam(self.learner_network.parameters(), self.lr)
        self.param_server = param_server
Example #29
    def __init__(self,
                 env,
                 taskCount=100,
                 alpha=0.4,
                 hiddenSize=500,
                 perfix=''):
        self.GAMMA = 0.99
        self.epsilon = 0.3
        self.epsilon_end = 0.05
        self.epsilon_decay = 200

        self.update_step = 20
        self.memory_size = 2000
        self.max_epoch = 500
        self.batch_size = 32

        # self.max_epoch = 500
        # self.batch_size = 1
        # self.memory_size = 1
        # self.update_step = 1

        self.hiddenSize = hiddenSize
        # self.save_path = '../Model/' + str(taskCount) + '-' + str(alpha) + perfix +'.pth'
        self.save_path = '../Model/' + perfix + '-' + str(
            taskCount) + '-' + str(alpha) + '.pth'

        # Variables
        self.var_phi = autograd.Variable(torch.Tensor(6), volatile=True)

        # For training
        self.var_batch_phi = autograd.Variable(torch.Tensor(
            self.batch_size, 6))
        self.var_batch_a = autograd.Variable(torch.LongTensor(
            self.batch_size, 1),
                                             requires_grad=False)
        self.var_batch_r = autograd.Variable(torch.Tensor(self.batch_size, 1))
        self.var_batch_phi_next = autograd.Variable(
            torch.Tensor(self.batch_size, 6))
        self.var_batch_r_mask = autograd.Variable(torch.Tensor(
            self.batch_size, 1),
                                                  requires_grad=False)

        self.MP = MemoryReplay(self.memory_size, self.batch_size)
        self.dqn = DQN(hiddenSize=self.hiddenSize)
        self.target_dqn = DQN(hiddenSize=self.hiddenSize)
        self.target_dqn.load_state_dict(self.dqn.state_dict())

        self.optimz = optim.RMSprop(self.dqn.parameters(),
                                    lr=0.00025,
                                    alpha=0.9,
                                    eps=1e-02,
                                    momentum=0.0)

        self.env = env
Example #30
def run():
    policy_net = DQN(num_channels, 19).cuda()
    target_net = DQN(num_channels, 19).cuda()
    optimizer = optim.Adam(policy_net.parameters(), LR)
    memory = Memory(50000)
    env = gym.make(ENV_NAME)
    env.make_interactive(port=6666, realtime=False)
    max_epi = 100
    n_step = 2
    update_period = 10
    gamma = 0.99

    total_steps = 0
    epsilon = 0.95
    endEpsilon = 0.01
    stepDrop = (epsilon - endEpsilon) / max_epi

    for num_epi in range(max_epi):
        obs = env.reset()
        state = converter(ENV_NAME, obs).cuda()
        state = state.float()
        done = False
        total_reward = 0
        steps = 0
        if epsilon > endEpsilon:
            epsilon -= stepDrop

        while not done:
            steps += 1
            total_steps += 1
            a_out = policy_net.sample_action(state, epsilon)
            action_index = a_out
            action = make_19action(env, action_index)
            obs_prime, reward, done, info = env.step(action)

            total_reward += reward

            if done:
                print("%d episode is done" % num_epi)
                print("total rewards : %d " % total_reward)
                writer.add_scalar('Rewards/train', total_reward, num_epi)
                break

            state_prime = converter(ENV_NAME, obs_prime).cuda()
            append_sample(memory, policy_net, target_net, state, action_index,
                          reward, state_prime, done)
            state = state_prime

            if memory.size() > 1000:
                update_network(policy_net, target_net, memory, 2, optimizer,
                               total_steps)

            if total_steps % 2000 == 0:
                update_target(policy_net, target_net)
Example #31
def replay():
    print('Waking up the brain cells..')
    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=True)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    saver = tf.train.Saver()
    ckpt = tf.train.get_checkpoint_state('model')
    saver.restore(sess, ckpt.model_checkpoint_path)

    # Start the game.
    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        state = game.reset()
        brain.init_state(state)

        while not terminal:
            action = brain.get_action()

            # Advance the game with the chosen action and get back the reward and whether it is over.
            state, reward, terminal = game.step(action)
            total_reward += reward

            brain.remember(state, action, reward, terminal)

            # Show the game at a speed humans can follow.
            time.sleep(0.3)

        print('Games played: %d, score: %d' % (episode + 1, total_reward))
Example #32
def replay():
	print('wake up the brain...')
	sess = tf.Session()

	game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=True)
	brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

	saver = tf.train.Saver()
	ckpt = tf.train.get_checkpoint_state('model')
	saver.restore(sess, ckpt.model_checkpoint_path)

	for episode in range(MAX_EPISODE):
		terminal = False
		total_reward = 0

		state = game.reset()
		brain.init_state(state)

		while not terminal:
			action = brain.get_action()

			state, reward, terminal = game.step(action)
			total_reward += reward

			brain.remember(state, action, reward, terminal)

			time.sleep(0.3)

		print('episode: %d, score: %d' % (episode + 1, total_reward))
Example #33
def main():
    env = gym.make(config.ENV_NAME)
    agent = DQN(env)
    optimizer = optim.Adam(agent.parameters(), lr=0.001)
    finished = False

    for epoch in range(config.EPOCHS):
        state = env.reset()
        for step in range(config.ITERATIONS):
            action = agent.get_action(state, 'egreedy')
            next_state, reward, done, _ = env.step(action[0, 0])
            if done:
                reward = -1
            agent.replay_memory.push(Transition(
                config.FloatTensor([state]),
                action,
                config.FloatTensor([reward]),
                config.FloatTensor([next_state]) if not done else None))
            state = next_state
            if len(agent.replay_memory) >= config.BATCH_SIZE:
                batch = agent.replay_memory.sample(config.BATCH_SIZE)
                batch = Transition(*zip(*batch))
                non_final_mask = config.ByteTensor(
                    [s is not None for s in batch.next_state])
                non_final_next_state_batch = Variable(torch.cat([
                    s for s in batch.next_state if s is not None]))

                state_batch = Variable(torch.cat(batch.state),
                                       requires_grad=False)
                action_batch = Variable(torch.cat(batch.action).view(-1, 1),
                                        requires_grad=False)
                reward_batch = Variable(torch.cat(batch.reward),
                                        requires_grad=False)

                q_values = agent(state_batch).gather(1, action_batch)
                s_values = Variable(torch.zeros(config.BATCH_SIZE).type(
                    config.FloatTensor), requires_grad=False)
                s_values[non_final_mask] = agent(
                    non_final_next_state_batch).max(1)[0]
                expected_q_values = config.GAMMA * s_values + reward_batch
                loss = F.smooth_l1_loss(torch.sum(q_values),
                                        torch.sum(expected_q_values))
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
            if done:
                break
        agent.epsilon = config.EPSILON_START - epoch / config.EPOCHS * (
            config.EPSILON_START - config.EPSILON_END)
        if epoch % config.TEST_INTERVAL == 0:
            sum_reward = 0
            for _epoch in range(config.TEST_EPOCHS):
                epoch_reward = 0
                state = env.reset()
                for step in range(config.TEST_ITERATIONS):
                    # env.render()
                    action = agent.get_action(state)  # Default
                    state, reward, done, _ = env.step(action[0, 0])
                    if done:
                        break
                    epoch_reward += reward
                sum_reward += epoch_reward
            avg_reward = sum_reward / config.TEST_EPOCHS
            print('Epoch: {}, Average Reward: {}'.format(epoch, avg_reward))
            print('Current Epsilon:', agent.epsilon)
            if avg_reward > 195:
                finished = True
        if finished:
            break

    while True:
        state = env.reset()
        round_reward = 0
        for step in range(config.TEST_ITERATIONS):
            env.render()
            action = agent.get_action(state)  # Default
            state, reward, done, _ = env.step(action[0, 0])
            if done:
                break
            round_reward += reward
        print('Round reward:', round_reward)
Example #34
def train():
    print('Waking up the brain cells..')
    sess = tf.Session()

    game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=False)
    brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

    rewards = tf.placeholder(tf.float32, [None])
    tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    writer = tf.summary.FileWriter('logs', sess.graph)
    summary_merged = tf.summary.merge_all()

    # Initialize the target network.
    brain.update_target_network()

    # Epsilon controls when the next action is chosen by the DQN rather than at random.
    epsilon = 1.0
    # Frame counter
    time_step = 0
    total_reward_list = []

    # Start the game.
    for episode in range(MAX_EPISODE):
        terminal = False
        total_reward = 0

        # Reset the game and get the current state.
        # The state is the screen layout of size screen_width x screen_height.
        state = game.reset()
        brain.init_state(state)

        while not terminal:
            # With probability epsilon choose a random action,
            # otherwise let the DQN pick the action.
            # Early in training the network has learned little, so actions are
            # mostly random; the random fraction shrinks until it is rarely used.
            if np.random.rand() < epsilon:
                action = random.randrange(NUM_ACTION)
            else:
                action = brain.get_action()

            # Start shrinking epsilon only after some episodes have passed,
            # since nothing has been learned at the very beginning.
            if episode > OBSERVE:
                epsilon -= 1 / 1000

            # Advance the game with the chosen action and get back the reward and whether it is over.
            state, reward, terminal = game.step(action)
            total_reward += reward

            # Store the current state in the brain's memory.
            # The stored states are used for training and for deciding the next action.
            brain.remember(state, action, reward, terminal)

            if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
                # Train the DQN.
                brain.train()

            if time_step % TARGET_UPDATE_INTERVAL == 0:
                # Update the target network.
                brain.update_target_network()

            time_step += 1

        print('Games played: %d, score: %d' % (episode + 1, total_reward))

        total_reward_list.append(total_reward)

        if episode % 10 == 0:
            summary = sess.run(summary_merged, feed_dict={rewards: total_reward_list})
            writer.add_summary(summary, time_step)
            total_reward_list = []

        if episode % 100 == 0:
            saver.save(sess, 'model/dqn.ckpt', global_step=time_step)
Example #35
def train():
	print('wake up the brain...')
	sess = tf.Session()

	game = Game(SCREEN_WIDTH, SCREEN_HEIGHT, show_game=False)
	brain = DQN(sess, SCREEN_WIDTH, SCREEN_HEIGHT, NUM_ACTION)

	rewards = tf.placeholder(tf.float32, [None])
	tf.summary.scalar('avg.reward/ep.', tf.reduce_mean(rewards))

	saver = tf.train.Saver()
	sess.run(tf.global_variables_initializer())

	writer = tf.summary.FileWriter('logs', sess.graph)
	summary_merged = tf.summary.merge_all()

	brain.update_target_network()

	epsilon = 1.0
	time_step = 0
	total_reward_list = []

	for episode in range(MAX_EPISODE):
		terminal = False
		total_reward = 0

		state = game.reset()
		brain.init_state(state)

		while not terminal:
			if np.random.rand() < epsilon:
				action = random.randrange(NUM_ACTION)
			else:
				action = brain.get_action()

			if episode > OBSERVE:
				epsilon -= 1 / 1000.

			state, reward, terminal = game.step(action)
			total_reward += reward

			brain.remember(state, action, reward, terminal)

			if time_step > OBSERVE and time_step % TRAIN_INTERVAL == 0:
				brain.train()

			if time_step % TARGET_UPDATE_INTERVAL == 0:
				brain.update_target_network()

			time_step += 1

		print('episode: %d, score: %d' % (episode + 1, total_reward))

		total_reward_list.append(total_reward)

		if episode % 10 == 0:
			summary = sess.run(summary_merged, feed_dict={rewards: total_reward_list})
			writer.add_summary(summary, time_step)
			total_reward_list = []

		if episode % 100 == 0:
			saver.save(sess, 'model/dqn.ckpt', global_step=time_step)