Example No. 1
    def __init__(self, device, data):
        self.data = data
        self.actor = Actor().to(device)
        self.critic = Critic().to(device)
        #self.ctarget = Critic().to(device)
        self.actor_opt = torch.optim.Adam(itertools.chain(
            self.actor.parameters()),
                                          lr=0.0001,
                                          betas=(0.0, 0.9))
        self.critic_opt = torch.optim.Adam(itertools.chain(
            self.critic.parameters()),
                                           lr=0.001,
                                           betas=(0.0, 0.9))

        def init_weights(m):
            if isinstance(m, nn.Linear) or isinstance(m, nn.Conv2d):
                torch.nn.init.xavier_uniform_(m.weight.data)

        self.actor.apply(init_weights)
        self.critic.apply(init_weights)
        #self.ctarget.apply(init_weights)

        self.device = device
        self.memory = ReplayMemory(1000, device=device)
        self.batch_size = 5
        self.GAMMA = 0.99
        self.count = 0
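Note that `itertools.chain(self.actor.parameters())` above chains a single iterator, so it is effectively redundant; the idiom pays off when one optimizer should step the parameters of several modules at once. A minimal sketch of that pattern (the two `nn.Linear` modules are illustrative only):

import itertools

import torch
import torch.nn as nn

encoder = nn.Linear(8, 16)
head = nn.Linear(16, 2)
# A single Adam instance that updates the parameters of both modules.
opt = torch.optim.Adam(itertools.chain(encoder.parameters(), head.parameters()),
                       lr=0.0001,
                       betas=(0.0, 0.9))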
Example No. 2
	def __init__(self, session=None, arguments=None):

		self.sess = session
		self.args = arguments

		# Initialize Gym environment.
		self.environment = gym.make(self.args.env)        

		if self.args.env=='MountainCarContinuous-v0':
			input_dimensions = 2
			output_dimensions = 1
		elif self.args.env=='InvertedPendulum-v2':
			input_dimensions = 4
			output_dimensions = 1
		elif self.args.env=='FetchReach-v0':
			input_dimensions = 16
			output_dimensions = 4
		elif self.args.env=='FetchPush-v0':
			input_dimensions = 31
			output_dimensions = 4

		# Initialize a policy network.
		self.ACModel = ActorCriticModel(input_dimensions,output_dimensions,number_layers=4,hidden_units=40,sess=session,to_train=self.args.train, env=self.args.env)

		# Create the actual network
		if self.args.weights:
			self.ACModel.create_policy_network(session, pretrained_weights=self.args.weights,to_train=self.args.train)
		else:
			self.ACModel.create_policy_network(session, to_train=self.args.train)

		# Initialize a replay memory.
		self.memory = ReplayMemory()

		# Create a trainer instance. 
		self.trainer = Trainer(sess=session,policy=self.ACModel, environment=self.environment, memory=self.memory,args=self.args)
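The hard-coded dimension table above can usually be read straight off the Gym spaces; the Fetch robotics environments expose Dict observations (which is presumably why their sizes are written out by hand here), but for the Box-space environments a sketch like this works:

import gym

env = gym.make('MountainCarContinuous-v0')
input_dimensions = env.observation_space.shape[0]   # 2 for MountainCarContinuous-v0
output_dimensions = env.action_space.shape[0]       # 1 for MountainCarContinuous-v0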
Example No. 3
    def __init__(self, device, actionsize):
        self.samplenet = DQN(actionsize).to(device)
        self.targetnet = DQN(actionsize).to(device)
        self.opt = torch.optim.Adam(itertools.chain(self.samplenet.parameters()),
                                    lr=0.00001,
                                    betas=(0.0, 0.9))
        self.device = device
        self.memory = ReplayMemory(1000, device=device)
        self.BATCH_SIZE = 10
        self.GAMMA = 0.99
        self.count = 0
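The `samplenet`/`targetnet` pair and the `count` attribute suggest the usual hard target-network synchronisation; a minimal sketch of that update (the function name and the interval are assumptions, since the example's training step is not shown):

import torch.nn as nn


def maybe_sync_target(samplenet: nn.Module, targetnet: nn.Module,
                      step: int, interval: int = 100) -> None:
    # Copy the online network's weights into the target network every `interval` steps.
    if step > 0 and step % interval == 0:
        targetnet.load_state_dict(samplenet.state_dict())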
Example No. 4
def get_player(pred_model: keras.Model, strat_model: keras.Model) -> RnnPlayer:
    small_replay_memory = ReplayMemory(1)
    small_rnn_replay_memory = RnnReplayMemory(1)

    prediction_network = PredictionNetwork(pred_model, small_replay_memory, 1,
                                           False)
    strategy_network = RnnStrategyNetwork(strat_model, small_rnn_replay_memory,
                                          1, False)

    return RnnPlayer(prediction_network, strategy_network, 0.0, 0.0)
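A possible usage sketch, assuming the prediction and strategy models were saved earlier as Keras `.h5` files (the paths are placeholders; Example No. 11 below loads models the same way):

import keras

pred_model = keras.models.load_model('prediction.h5')   # placeholder path
strat_model = keras.models.load_model('strategy.h5')     # placeholder path
player = get_player(pred_model, strat_model)              # frozen, non-exploring player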
Example No. 5
    def __init__(self, **config):
        self.config = config
        self.n_actions = self.config["n_actions"]
        self.state_shape = self.config["state_shape"]
        self.batch_size = self.config["batch_size"]
        self.gamma = self.config["gamma"]
        self.initial_mem_size_to_train = self.config[
            "initial_mem_size_to_train"]
        torch.manual_seed(self.config["seed"])

        if torch.cuda.is_available():
            torch.backends.cudnn.deterministic = True
            torch.backends.cudnn.benchmark = False
            torch.cuda.empty_cache()
            torch.cuda.manual_seed(self.config["seed"])
            torch.cuda.manual_seed_all(self.config["seed"])
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")

        self.memory = ReplayMemory(self.config["mem_size"],
                                   self.config["alpha"], self.config["seed"])
        self.v_min = self.config["v_min"]
        self.v_max = self.config["v_max"]
        self.n_atoms = self.config["n_atoms"]
        self.support = torch.linspace(self.v_min, self.v_max,
                                      self.n_atoms).to(self.device)
        self.delta_z = (self.v_max - self.v_min) / (self.n_atoms - 1)
        self.offset = torch.linspace(0, (self.batch_size - 1) * self.n_atoms, self.batch_size).long() \
            .unsqueeze(1).expand(self.batch_size, self.n_atoms).to(self.device)

        self.n_step = self.config["n_step"]
        self.n_step_buffer = deque(maxlen=self.n_step)

        self.online_model = Model(self.state_shape, self.n_actions,
                                  self.n_atoms, self.support,
                                  self.device).to(self.device)
        self.target_model = Model(self.state_shape, self.n_actions,
                                  self.n_atoms, self.support,
                                  self.device).to(self.device)
        self.hard_update_target_network()

        self.optimizer = Adam(self.online_model.parameters(),
                              lr=self.config["lr"],
                              eps=self.config["adam_eps"])
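The `support`, `delta_z`, and `offset` buffers built above are the ingredients of the categorical (C51) projection of the target distribution onto the fixed atom support; a minimal sketch of how they are typically combined (the function name and tensor shapes are assumptions, since the agent's training step is not shown, and the l == u edge case of the full algorithm is omitted):

import torch


def project_distribution(next_probs, rewards, dones, support,
                         v_min, v_max, delta_z, gamma, offset):
    # next_probs: (batch, n_atoms) probabilities of the next-state distribution,
    # rewards/dones: (batch, 1), support: (n_atoms,), offset: (batch, n_atoms).
    projected = torch.zeros_like(next_probs)
    # Bellman-update every atom and clamp it back onto [v_min, v_max].
    tz = (rewards + (1.0 - dones) * gamma * support).clamp(v_min, v_max)
    b = (tz - v_min) / delta_z
    l, u = b.floor().long(), b.ceil().long()
    # Split each atom's probability mass between its two neighbouring atoms;
    # `offset` turns per-row atom indices into indices into the flattened tensor.
    projected.view(-1).index_add_(0, (l + offset).view(-1),
                                  (next_probs * (u.float() - b)).view(-1))
    projected.view(-1).index_add_(0, (u + offset).view(-1),
                                  (next_probs * (b - l.float())).view(-1))
    return projected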
Example No. 6
    def __init__(self, session=None, arguments=None):

        self.sess = session
        self.args = arguments

        # Initialize Gym environment.
        self.environment = gym.make(self.args.env)

        if self.args.env == 'FetchReach-v0':
            input_dimensions = 16
        elif self.args.env == 'FetchPush-v0':
            input_dimensions = 31

        output_dimensions = 4

        # Initialize a policy network.
        # self.ACModel = ActorCriticModel(input_dimensions,output_dimensions,sess=session,to_train=self.args.train)
        self.PolicyModel = DAggerPolicy(input_dimensions,
                                        output_dimensions,
                                        name_scope='PolicyModel',
                                        sess=session,
                                        to_train=self.args.train)

        # Create the actual network
        if self.args.weights:
            self.PolicyModel.create_policy_network(
                session,
                pretrained_weights=self.args.weights,
                to_train=self.args.train)
        else:
            self.PolicyModel.create_policy_network(session,
                                                   to_train=self.args.train)

        # Initialize a replay memory.
        self.memory = ReplayMemory()

        # Create a trainer instance.
        self.trainer = Trainer(sess=session,
                               policy=self.PolicyModel,
                               environment=self.environment,
                               memory=self.memory,
                               args=self.args)
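Since this constructor wires a `DAggerPolicy` to a replay memory and a trainer, the surrounding training code presumably follows the standard DAgger loop; a generic sketch of one such iteration under the old 4-tuple Gym step API (the helper names `policy_action`, `expert_action`, and `dataset` are assumptions, not part of the example):

import numpy as np


def dagger_iteration(env, policy_action, expert_action, dataset, beta=0.5, horizon=200):
    # One DAgger iteration: roll out a mixture of expert and learner actions,
    # but always label the visited states with the expert's action.
    state = env.reset()
    for _ in range(horizon):
        action = expert_action(state) if np.random.rand() < beta else policy_action(state)
        dataset.append((state, expert_action(state)))
        state, _, done, _ = env.step(action)
        if done:
            break
    return dataset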
Example No. 7
def main():
    # create replay memories
    pred_memories = [ReplayMemory(prediction_replay_memory_size) for _ in range(1)]
    strat_memories = [RnnReplayMemory(strategy_replay_memory_size) for _ in range(1)]

    # create Networks
    pred_networks = [
        PredictionNetwork(prediction_resnet(), pred_memories[0], prediction_net_batch_size, True),
    ]
    print(pred_networks[0]._neural_network.summary())
    strat_networks = [
        RnnStrategyNetwork(strategy_deep_lstm_resnet(), strat_memories[0], strategy_net_batch_size, True),
    ]
    print(strat_networks[0]._neural_network.summary())

    # make pairs of the networks
    networks = list(sum(zip(pred_networks, strat_networks), ()))

    # give each network a name
    pred_network_names = [
        'normal_prediction'
    ]
    strat_network_names = [
        'normal_strategy'
    ]

    # make the same pairs as above
    network_names = list(sum(zip(pred_network_names, strat_network_names), ()))

    # create players
    players = [
        [RnnPlayer(pred_networks[0], strat_networks[0], prediction_exploration_rate, strategy_exploration_rate)
         for _ in range(4)],
    ]

    # flatten players
    players = sum(players, [])

    # create one PlayerInterlayer for each player
    players = [
        [RnnPlayerInterlayer(player, normal_pred_y_func, normal_strat_y_func) for player in players]
    ]
    players = sum(players, [])

    # create one Sitting
    sitting = Sitting(debugging)
    last_stop = datetime.datetime.now()
    r = random.Random()
    with open('stats_dev.txt', 'w') as f:
        f.write("// interval to print stats: " + str(interval_to_print_stats) + "\n")
        total_diff = 0
        total_losses = [0.0 for _ in range(len(networks))]
        for i in range(start_offset, total_rounds, 10):
            sitting.set_players(r.sample(players, 4))
            for _ in range(10):
                total_diff += sitting.play_full_round()
            i += 9
            if only_train_in_turn:
                index_to_train = i // turn_size % len(networks)
                total_losses[index_to_train] += networks[index_to_train].train()
            else:
                for net_i, network in enumerate(networks):
                    total_losses[net_i] += network.train()
            if (i + 1) % interval_to_print_stats == 0:
                print(str(i + 1), "rounds have been played")
                avg = total_diff / 4 / interval_to_print_stats
                print("Average difference of one player:\t", avg)
                losses_string = ', '.join([str(l) for l in np.array(total_losses) / interval_to_print_stats])
                print("The losses are:\t", losses_string)
                print("It took:", datetime.datetime.now() - last_stop)
                last_stop = datetime.datetime.now()
                print('')
                f.write(str(i + 1) + "\n")
                f.write(str(avg) + "\n")
                f.write(losses_string + "\n")
                total_diff = 0
                total_losses = [0.0 for _ in range(len(networks))]
            if (i + 1) % rounds_until_save == 0:
                for keras_net, net_name in zip(networks, network_names):
                    if 'random' in net_name:
                        continue
                    elif 'pred' in net_name:
                        full_name = prediction_save_path
                    elif 'strat' in net_name:
                        full_name = strategy_save_path
                    else:
                        assert 0, net_name
                    full_name += net_name + '_' + str(i + 1) + '.h5'
                    keras_net.save_network(full_name)
            if i + 1 == round_when_adding_players:
                print('adding players')
                # add 2 more normal players
                nps = [RnnPlayer(networks[-2], networks[-1], prediction_exploration_rate, strategy_exploration_rate)
                       for _ in range(2)]
                inps = [RnnPlayerInterlayer(nps[i], normal_pred_y_func, normal_strat_y_func) for i in range(2)]
                players += inps

                # add 2 static versions of the current normal player
                pred_mem = ReplayMemory(1)
                strat_mem = RnnReplayMemory(1)
                pred_net = load_model(prediction_save_path + 'normal_prediction_' + str(i + 1) + '.h5')
                strat_net = load_model(strategy_save_path + 'normal_strategy_' + str(i + 1) + '.h5')
                p_net = PredictionNetwork(pred_net, pred_mem, 1, False)
                s_net = RnnStrategyNetwork(strat_net, strat_mem, 1, False)
                ps = [RnnPlayer(p_net, s_net, 0.02, 0.02) for _ in range(2)]
                ips = [RnnPlayerInterlayer(ps[i], normal_pred_y_func, normal_strat_y_func) for i in range(2)]
                players += ips
Example No. 8
    def __init__(self,
                 observation_space,
                 action_space,
                 device,
                 gamma=0.99,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 batch_size=64,
                 memory_size=50000,
                 tau=1e-3,
                 weight_decay=1e-2,
                 writer=None,
                 is_image=False):
        super(DdpgAgent, self).__init__()
        self.num_state = observation_space.shape[0]
        self.num_action = action_space.shape[0]
        self.state_mean = None
        self.state_halfwidth = None
        if abs(observation_space.high[0]) != math.inf:
            self.state_mean = 0.5 * (observation_space.high +
                                     observation_space.low)
            self.state_halfwidth = 0.5 * (observation_space.high -
                                          observation_space.low)
        self.gamma = gamma
        self.batch_size = batch_size
        self.device = device
        self.actor = ActorNetwork(self.num_state,
                                  action_space,
                                  device,
                                  is_image=is_image).to(self.device)
        self.actor_target = ActorNetwork(self.num_state,
                                         action_space,
                                         device,
                                         is_image=is_image).to(self.device)
        self.actor_target.load_state_dict(self.actor.state_dict())
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=actor_lr)
        self.critic = CriticNetwork(self.num_state,
                                    action_space,
                                    device,
                                    is_image=is_image).to(self.device)
        self.critic_target = CriticNetwork(self.num_state,
                                           action_space,
                                           device,
                                           is_image=is_image).to(self.device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(),
                                           lr=critic_lr,
                                           weight_decay=weight_decay)
        self.memory = ReplayMemory(observation_space,
                                   action_space,
                                   device,
                                   num_state=self.num_state,
                                   memory_size=memory_size,
                                   is_image=is_image)
        self.criterion = nn.SmoothL1Loss()
        self.device = torch.device(
            "cuda:0" if torch.cuda.is_available() else "cpu")
        self.tau = tau
        self.writer = writer
        self.update_step = 0
        self.is_image = is_image
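With `tau=1e-3` and separate target networks, this DDPG agent presumably relies on the standard Polyak (soft) target update; a minimal sketch of that step (the function name is an assumption, since the example's update code is not shown):

import torch
import torch.nn as nn


def soft_update(target: nn.Module, source: nn.Module, tau: float) -> None:
    # target <- tau * source + (1 - tau) * target, applied parameter-wise.
    with torch.no_grad():
        for t_param, s_param in zip(target.parameters(), source.parameters()):
            t_param.data.mul_(1.0 - tau).add_(tau * s_param.data)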
Example No. 9
    def __init__(
            self,
            observation_space,
            action_space,
            device,
            gamma=0.995,
            actor_lr=5e-4,
            critic_lr=5e-4,
            batch_size=128,
            memory_size=50000,
            tau=5e-3,
            weight_decay=1e-2,
            sigma=0.2,
            noise_clip=0.5,
            alpha=0.2,
            alpha_lr=3e-4,
            rollout_length=2048,
            lambda_=0.95,
            beta_clone=1.0,
            coef_ent=0.01,
            num_updates=32,
            policy_epoch=1,
            value_epoch=1,
            aux_num_updates=6,
            aux_epoch_batch=64,
            max_grad_norm=0.5,
            aux_critic_loss_coef=1.0,
            clip_eps=0.2,
            writer=None,
            is_image=False,
            clip_aux_critic_loss=None,
            clip_aux_multinet_critic_loss=None,
            multipleet_upadte_clip_grad_norm=None,
            summary_interval=1,
            debug_no_aux_phase=False):
        super(PpgAgent, self).__init__()
        self.action_mean = (0.5 * (action_space.high + action_space.low))[0]
        self.action_halfwidth = (0.5 * (action_space.high - action_space.low))[0]
        self.num_state = observation_space.shape[0]
        self.num_action = action_space.shape[0]
        self.state_mean = None
        self.state_halfwidth = None
        if abs(observation_space.high[0]) != math.inf:
            self.state_mean = 0.5 * (observation_space.high + observation_space.low)
            self.state_halfwidth = 0.5 * (observation_space.high - observation_space.low)
        self.gamma = gamma
        self.batch_size = batch_size
        self.device = device
        self.multipleNet = MultipleNetwork(self.num_state, action_space, device, is_image=is_image).to(self.device)
        self.multipleNet_target = MultipleNetwork(self.num_state, action_space, device, is_image=is_image).to(self.device)
        self.multipleNet_target.load_state_dict(self.multipleNet.state_dict())
        self.multipleNet_optimizer = optim.Adam(self.multipleNet.parameters(), lr=actor_lr)

        self.critic = CriticNetwork(self.num_state, action_space, device, is_image=is_image).to(self.device)
        self.critic_target = CriticNetwork(self.num_state, action_space, device, is_image=is_image).to(self.device)
        self.critic_target.load_state_dict(self.critic.state_dict())
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=critic_lr, weight_decay=weight_decay)

        self.alpha = alpha
        self.log_alpha = torch.zeros(1, requires_grad=True, device=self.device)
        self.log_alpha_optimizer = optim.Adam([self.log_alpha], lr=alpha_lr)

        self.memory = ReplayMemory(observation_space, action_space, device, num_state=self.num_state, memory_size=memory_size, is_image=is_image)
        self.criterion = nn.MSELoss()
        self.device = device
        self.tau = tau
        self.writer = writer
        self.is_image = is_image
        self.sigma = sigma
        self.noise_clip = noise_clip
        self.rollout_length = rollout_length
        self.lambda_ = lambda_
        self.coef_ent = coef_ent
        self.aux_critic_loss_coef = aux_critic_loss_coef
        self.max_grad_norm = max_grad_norm
        self.aux_num_updates = aux_num_updates
        self.clip_eps = clip_eps
        self.beta_clone = beta_clone
        self.policy_epoch = policy_epoch
        self.value_epoch = value_epoch
        self.num_updates = num_updates
        self.aux_epoch_batch = aux_epoch_batch
        self.clip_aux_critic_loss = clip_aux_critic_loss
        self.clip_aux_multinet_critic_loss = clip_aux_multinet_critic_loss
        self.multipleet_upadte_clip_grad_norm = multipleet_upadte_clip_grad_norm
        self.summary_interval = summary_interval
        self.debug_no_aux_phase = debug_no_aux_phase
        self.update_step = 0
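The `clip_eps` and `coef_ent` arguments are the usual PPO clipping range and entropy coefficient, which PPG reuses during its policy phase; a minimal sketch of the clipped surrogate loss they typically feed (the function name and tensor shapes are assumptions, since the agent's update code is not shown):

import torch


def clipped_policy_loss(new_log_probs, old_log_probs, advantages,
                        entropy=None, clip_eps=0.2, coef_ent=0.01):
    # Probability ratio between the current policy and the rollout policy.
    ratio = torch.exp(new_log_probs - old_log_probs)
    surr1 = ratio * advantages
    surr2 = torch.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * advantages
    loss = -torch.min(surr1, surr2).mean()
    if entropy is not None:
        # Entropy bonus encourages exploration, weighted by coef_ent.
        loss = loss - coef_ent * entropy.mean()
    return loss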
Example No. 10
TARGET_UPDATE = 10
learningrate = 0.001

# Creates the environment, search strategy and agent
env = EnvironmentManager(device, "CarRacing-v0", actionDict)
strat = EpsilonGreedyStrategy(EPS_END, EPS_END, EPS_DECAY)
agent = Agent(strat, env.num_actions_available(), device)

# Creates the policy and target network
policy_net = DQN(env.get_screen_height(), env.get_screen_width(), env.num_actions_available(), n_latent_var).to(device)
target_net = DQN(env.get_screen_height(), env.get_screen_width(), env.num_actions_available(), n_latent_var).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.Adam(policy_net.parameters(), lr=learningrate)
memory = ReplayMemory(10000)

InputLayer = keras.layers.Input(batch_shape=(None, 224, 224, 3))
road = keras.applications.MobileNetV2(input_tensor=InputLayer, weights=None, classes=2)
Nadam = keras.optimizers.Nadam(lr=0.001, beta_1=0.9, beta_2=0.999)
road.compile(optimizer=Nadam, loss='mean_squared_error', metrics=['accuracy'])
road.load_weights('Unitygym.h5')
print("Loaded keras weights")

writer = open("DQNRoad.csv", mode="a")

def runner(num_episodes, max_timestep, BATCH_SIZE, env):
    episodeRew = []
    for i_episode in range(num_episodes):
        # Initialize the environment and state
        env.reset()
Example No. 11

def print_play_message(card_tnr: np.ndarray):
    output_str = "The Network plays: "
    if card_tnr[1] == 0:
        tmp = rank_strings[3:4] + rank_strings[5:6] + rank_strings[:3] + rank_strings[4:5] + rank_strings[6:]
        output_str += tmp[card_tnr[0]]
    else:
        output_str += rank_strings[card_tnr[0]]
    output_str += suit_strings[card_tnr[1]]
    print(output_str)


assert len(sys.argv) == 3, sys.argv

pred_memory = ReplayMemory(1)
strat_memory = RnnReplayMemory(1)

pred_network = PredictionNetwork(keras.models.load_model(sys.argv[1]), pred_memory, batch_size=1, can_train=False)
strat_network = RnnStrategyNetwork(keras.models.load_model(sys.argv[2]), strat_memory, batch_size=1, can_train=False)

player = RnnPlayer(pred_network, strat_network, 0, 0)

absolute_position = int(input("What is the index of the player? "))
assert 0 <= absolute_position < 4, "the given absolute position is " + str(absolute_position)
player_inter = RnnPlayerInterlayer(player, sum, sum)
player_inter.set_absolute_position(absolute_position)

# get the trump suit
trump_string = None
while trump_string not in suit_strings: