Example #1
def collect_data():
    env = Osillator()
    EP_NUM = 1500
    data_set = []
    for ep in range(EP_NUM):
        ep_loss = 0
        state = env.reset()
        for t in range(env.max_iteration):
            state = torch.from_numpy(state).float().to(device)
            action = ppo.choose_action(state.cpu().data.numpy(), False)
            with torch.no_grad():
                ca1 = model_1(state)
                ca2 = model_2(state)
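            # Weighted combination of the two controller networks' outputs; the PPO action provides the mixing weights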
            control_action = ca1 * action[0] + ca2 * action[1]

            next_state, reward, done = env.step(
                control_action.cpu().data.numpy()[0])
            data_set.append([
                state.cpu().data.numpy()[0],
                state.cpu().data.numpy()[1],
                np.clip(control_action.cpu().data.numpy()[0], -1, 1)
            ])
            state = next_state
            if done:
                break
        print(t)
    return np.array(data_set)
Example #2
def train_weight_adapter_DDPG(EP_NUM=2000):
	mkdir('./adapter_soft')
	env = Osillator()
	scores_deque = deque(maxlen=100)
	scores = []

	for ep in range(EP_NUM):
		state = env.reset()
		agent.reset()
		score = 0
		for t in range(200):
			action = agent.act(state)
			ca1 = model_1(torch.from_numpy(state).float().to(device)).cpu().data.numpy()[0]
			ca2 = model_2(torch.from_numpy(state).float().to(device)).cpu().data.numpy()[0]
			control_action = action[0]*ca1 + action[1]*ca2
			next_state, _, done = env.step(control_action, smoothness=0.5)
			reward = 5
			reward -= weight * abs(control_action) * 20
			reward -= 1 / weight * (abs(next_state[0]) + abs(next_state[1]))
			if done and t < 95:
				reward -= 100
			agent.step(state, action, reward, next_state, done, t)
			score += reward
			state = next_state            
			if done:
				break
		scores_deque.append(score)
		scores.append(score)
		score_average = np.mean(scores_deque)
		if ep % 1 == 0:
			print('\rEpisode {}, Average Score: {:.2f}, Current Score: {:.2f}, Max: {:.2f}, Min: {:.2f}, Epsilon: {:.2f}, Memory: {:.1f}'\
				  .format(ep, score_average, scores[-1], np.max(scores), np.min(scores), agent.epsilon, len(agent.memory)), end="\n")     
		if ep > 0 and ep % 100 == 0:
			torch.save(agent.actor_local.state_dict(), './adapter_soft/adapter_'+str(ep)+'_'+str(weight)+ '.pth')
Example #3
def collect_data(adapter_name, INDI_NAME):
    assert EXP1 == True
    env = Osillator()
    model = Weight_adapter(2, 2).to(device)
    model.load_state_dict(torch.load(adapter_name))
    EP_NUM = 1500
    data_set = []
    for ep in range(EP_NUM):
        ep_loss = 0
        state = env.reset()
        for t in range(env.max_iteration):
            state = torch.from_numpy(state).float().to(device)
            action = model(state).cpu().data.numpy()
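            # The trained weight adapter outputs two mixing weights, one per controller network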
            with torch.no_grad():
                ca1 = model_1(state)
                ca2 = model_2(state)
            control_action = ca1 * action[0] + ca2 * action[1]

            next_state, reward, done = env.step(
                control_action.cpu().data.numpy()[0])
            data_set.append([
                state.cpu().data.numpy()[0],
                state.cpu().data.numpy()[1],
                control_action.cpu().data.numpy()[0]
            ])
            state = next_state
            if done:
                break
        print(ep_loss, t)
    return np.array(data_set)
Example #4
def train():
    env = Osillator()
    state_dim = 2
    action_dim = 2

    # reproducible
    # env.seed(RANDOMSEED)
    np.random.seed(RANDOMSEED)
    torch.manual_seed(RANDOMSEED)

    ppo = PPO(state_dim, action_dim, method=METHOD)
    global all_ep_r, update_plot, stop_plot
    all_ep_r = []
    for ep in range(EP_MAX):
        s = env.reset()
        ep_r = 0
        t0 = time.time()
        for t in range(EP_LEN):
            if RENDER:
                env.render()
            a = ppo.choose_action(s)
            u = gene_u(s, a, model_1, model_2)
            s_, _, done = env.step(u)
            # print(s, a, s_, r, done)
            # assert False
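            # Reward shaping: fixed per-step bonus minus a weighted fuel-cost term and a distance-from-origin penalty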
            r = 10
            r -= WEIGHT * abs(np.clip(u, -1, 1)) * 20
            r -= 1 / WEIGHT * (abs(s_[0]) + abs(s_[1]))
            if done and t < 95:
                r -= 100
            ppo.store_transition(
                s, a, r
            )  # useful for pendulum since the nets are very small; normalization makes it easier to learn
            s = s_
            ep_r += r

            # update ppo
            if len(ppo.state_buffer) == BATCH_SIZE:
                ppo.finish_path(s_, done)
                ppo.update()
            # if done:
            #     break
        ppo.finish_path(s_, done)
        print(
            'Episode: {}/{}  | Episode Reward: {:.4f}  | Running Time: {:.4f}'.
            format(ep + 1, EP_MAX, ep_r,
                   time.time() - t0))
        if ep == 0:
            all_ep_r.append(ep_r)
        else:
            all_ep_r.append(all_ep_r[-1] * 0.9 + ep_r * 0.1)
        if PLOT_RESULT:
            update_plot.set()
        if (ep + 1) % 500 == 0 and ep >= 3000:
            ppo.save_model(path='ppo', ep=ep, weight=WEIGHT)
    if PLOT_RESULT:
        stop_plot.set()
    env.close()
Example #5
def train_switcher_DDQN():
    mkdir('./adapter_ab')
    env = Osillator()
    model = DQN(2, 2).to(device)
    target_model = DQN(2, 2).to(device)
    optimizer = optim.Adam(model.parameters())
    EP_NUM = 2001
    frame_idx = 0
    fuel_list = []
    ep_reward = deque(maxlen=100)

    for ep in range(EP_NUM):
        state = env.reset()
        ep_r = 0
        for t in range(200):
            state = torch.from_numpy(state).float().to(device)
            epsilon = epsilon_by_frame(frame_idx)
            action = model.act(state, epsilon)
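            # The DDQN action selects which controller network is applied at this step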
            with torch.no_grad():
                if action == 0:
                    control_action = model_1(state).cpu().data.numpy()[0]
                elif action == 1:
                    control_action = model_2(state).cpu().data.numpy()[0]
                else:
                    assert False
                    control_action = 0
            next_state, _, done = env.step(control_action)
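            # Reward: fixed bonus minus weighted fuel cost, with a large penalty for failing before step 190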
            reward = 2
            reward -= weight * abs(control_action) * 20
            if done and t < 190:
                reward -= 100
            replay_buffer.push(state.cpu().numpy(), action, reward, next_state,
                               done)
            fuel_list.append(abs(control_action) * 20)
            state = next_state
            ep_r += reward
            frame_idx += 1
            if len(replay_buffer) > batch_size:
                loss = compute_td_loss(model, target_model, batch_size,
                                       optimizer)
            if frame_idx % 100 == 0:
                update_target(model, target_model)
            if done:
                break
        ep_reward.append(ep_r)
        print('epoch:', ep, 'reward:', ep_r, 'average reward:',
              np.mean(ep_reward), 'fuel cost:', sum(fuel_list[-t - 1:]),
              'epsilon:', epsilon, len(replay_buffer))
        if ep >= 100 and ep % 100 == 0:
            torch.save(
                model.state_dict(),
                './adapter_ab/ddqn_' + str(ep) + '_' + str(weight) + '.pth')
Example #6
def distill(adapter_name, INDI_NAME):
    optimizer = torch.optim.SGD(Individual.parameters(),
                                lr=0.001,
                                momentum=0.9)
    loss_func = torch.nn.MSELoss()
    env = Osillator()
    model = DQN(2, 2).to(device)
    EP_NUM = 500

    model.load_state_dict(torch.load(adapter_name))

    for ep in range(EP_NUM):
        ep_loss = 0
        state = env.reset()
        for t in range(env.max_iteration):
            state = torch.from_numpy(state).float().to(device)
            action = model.act(state, epsilon=0)
            with torch.no_grad():
                if action == 0:
                    control_action = model_1(state)
                elif action == 1:
                    control_action = model_2(state)

            control_action.requires_grad = False
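            # Distillation: train the Individual (student) network to imitate the selected controller's output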
            prediction = Individual(state)
            loss = loss_func(prediction, control_action)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            ep_loss += loss.item()

            next_state, reward, done = env.step(
                control_action.cpu().data.numpy()[0])
            state = next_state
            if done:
                break
        print(ep_loss)
    torch.save(Individual.state_dict(), INDI_NAME)
Example #7
def test(adapter_name=None, state_list=None, renew=False, mode='switch', INDI_NAME=None):
	print(mode)
	env = Osillator()
	EP_NUM = 500
	if mode == 'switch':
		model = DQN(2, 2).to(device)
		model.load_state_dict(torch.load(adapter_name))
	if mode == 'weight':
		model = Weight_adapter(2, 2).to(device)
		model.load_state_dict(torch.load(adapter_name))
	if mode == 'individual':
		Individual.load_state_dict(torch.load(INDI_NAME))
	if renew:
		state_list = []
	fuel_list = []
	ep_reward = []
	trajectory = []
	safe = []
	unsafe = []
	control_action_list = []
	for ep in range(EP_NUM):
		if renew:
			state = env.reset()
			state_list.append(state)
		else:
			assert len(state_list) == EP_NUM
			state = env.reset(state_list[ep][0], state_list[ep][1])
		ep_r = 0
		fuel = 0
		if ep == 0:
			trajectory.append(state)
		for t in range(env.max_iteration):
			state = torch.from_numpy(state).float().to(device)
			if mode == 'switch':
				action = model.act(state, epsilon=0)
				with torch.no_grad():
					if action == 0:
						control_action = model_1(state).cpu().data.numpy()[0]
					elif action == 1:
						control_action = model_2(state).cpu().data.numpy()[0]
					else:
						assert False
						control_action = 0
			elif mode == 'ppo':
				action = ppo.choose_action(state.cpu().data.numpy(), True)
				ca1 = model_1(state).cpu().data.numpy()[0]
				ca2 = model_2(state).cpu().data.numpy()[0]
				control_action = action[0]*ca1 + action[1]*ca2
				if ep == 0:
					print(t, state, control_action, action, ca1, ca2)				

			elif mode == 'average':
				ca1 = model_1(state).cpu().data.numpy()[0]
				ca2 = model_2(state).cpu().data.numpy()[0]
				control_action = (ca1 + ca2)/2
			elif mode == 'planning':
				ca1 = model_1(state).cpu().data.numpy()[0]
				ca2 = model_2(state).cpu().data.numpy()[0]
				control_action = plan(state, ca1, ca2) 

			elif mode == 'd1':
				control_action = model_1(state).cpu().data.numpy()[0]
				if ep == 0:
					print(state, control_action)

			elif mode == 'd2':
				control_action = model_2(state).cpu().data.numpy()[0]
				
			elif mode == 'individual':
				if ATTACK:
					delta, original = fgsm(Individual, state)
					# delta = torch.from_numpy(np.random.uniform(low=-SCALE, high=SCALE, size=state.shape)).float().to(device)
					control_action = Individual(state+delta).cpu().data.numpy()[0]
				else:
					control_action = Individual(state).cpu().data.numpy()[0]

			next_state, reward, done = env.step(control_action)
			control_action = np.clip(control_action, -1, 1)
			fuel += abs(control_action) * 20
			state = next_state
			if ep == 0:
				trajectory.append(state)
				control_action_list.append(control_action)
			ep_r += reward
			if done:
				break
		
		ep_reward.append(ep_r)
		if t >= 95:
			fuel_list.append(fuel)
			safe.append(state_list[ep])
		else:
			print(ep, state_list[ep])
			unsafe.append(state_list[ep])
	safe = np.array(safe)
	unsafe = np.array(unsafe)
	np.save('./plot/'+mode+'_safe.npy', safe)
	np.save('./plot/'+mode+'_unsafe.npy', unsafe)
	return ep_reward, np.array(fuel_list), state_list, np.array(control_action_list)
Example #8
def test(adapter_name=None,
         state_list=None,
         renew=False,
         mode='switch',
         INDI_NAME=None):
    print(mode)
    env = Osillator()
    model = DQN(2, 2).to(device)
    EP_NUM = 500
    if mode == 'switch':
        model.load_state_dict(torch.load(adapter_name))
    if mode == 'individual':
        Individual.load_state_dict(torch.load(INDI_NAME))
    if renew:
        state_list = []
    fuel_list = []
    ep_reward = []
    trajectory = []
    for ep in range(EP_NUM):
        if renew:
            state = env.reset()
            state_list.append(state)
        else:
            assert len(state_list) == EP_NUM
            state = env.reset(state_list[ep][0], state_list[ep][1])
        ep_r = 0
        fuel = 0
        if ep == 0:
            trajectory.append(state)
        for t in range(env.max_iteration):
            state = torch.from_numpy(state).float().to(device)
            # flag = where_inv(state.cpu().numpy())
            if mode == 'switch':
                action = model.act(state, epsilon=0)
                with torch.no_grad():
                    if action == 0:
                        control_action = model_1(state).cpu().data.numpy()[0]
                    elif action == 1:
                        control_action = model_2(state).cpu().data.numpy()[0]
                    else:
                        assert False
                        control_action = 0
                if ep == 0:
                    print(t, state, action, control_action * 20)

            elif mode == 'd1':
                control_action = model_1(state).cpu().data.numpy()[0]

            elif mode == 'd2':
                control_action = model_2(state).cpu().data.numpy()[0]

            elif mode == 'individual':
                control_action = Individual(state).cpu().data.numpy()[0]

            next_state, reward, done = env.step(control_action)
            fuel += abs(control_action) * 20
            state = next_state
            if ep == 0:
                trajectory.append(state)
            ep_r += reward
            if done:
                break

        ep_reward.append(ep_r)
        if t >= 95:
            fuel_list.append(fuel)
        else:
            print(ep, state_list[ep])
        if ep == 0:
            trajectory = np.array(trajectory)
            # plt.figure()
            plt.plot(trajectory[:, 0], trajectory[:, 1], label=mode)
            plt.legend()
            plt.savefig('trajectory.png')
    return ep_reward, np.array(fuel_list), state_list
Example #9
def test(adapter_name=None,
         state_list=None,
         renew=False,
         mode='switch',
         INDI_NAME=None):
    print(mode)
    env = Osillator()
    EP_NUM = 1
    if mode == 'switch':
        model = DQN(2, 2).to(device)
        model.load_state_dict(torch.load(adapter_name))
    if mode == 'weight':
        model = Weight_adapter(2, 2).to(device)
        model.load_state_dict(torch.load(adapter_name))
    if mode == 'individual':
        Individual.load_state_dict(torch.load(INDI_NAME))
    if renew:
        state_list = []
    fuel_list = []
    ep_reward = []
    trajectory = []
    safe = []
    unsafe = []
    control_action_list = []
    for ep in range(EP_NUM):
        if renew:
            state = env.reset()
            state_list.append(state)
        else:
            assert len(state_list) == EP_NUM
            state = env.reset(state_list[ep][0], state_list[ep][1])
        ep_r = 0
        fuel = 0
        if ep == 0:
            trajectory.append(state)
        for t in range(env.max_iteration):
            # attack happens here
            # state += np.random.uniform(low=-0.35, high=0.35, size=state.shape)
            state = torch.from_numpy(state).float().to(device)
            if mode == 'switch':
                action = model.act(state, epsilon=0)
                with torch.no_grad():
                    if action == 0:
                        control_action = model_1(state).cpu().data.numpy()[0]
                    elif action == 1:
                        control_action = model_2(state).cpu().data.numpy()[0]
                    else:
                        assert False
                        control_action = 0
            elif mode == 'weight':
                action = model(state).cpu().data.numpy()
                ca1 = model_1(state).cpu().data.numpy()[0]
                ca2 = model_2(state).cpu().data.numpy()[0]
                control_action = action[0] * ca1 + action[1] * ca2
                if ep == 0:
                    print(t, state, control_action, action, ca1, ca2)
            elif mode == 'average':
                ca1 = model_1(state).cpu().data.numpy()[0]
                ca2 = model_2(state).cpu().data.numpy()[0]
                control_action = (ca1 + ca2) / 2
            elif mode == 'planning':
                ca1 = model_1(state).cpu().data.numpy()[0]
                ca2 = model_2(state).cpu().data.numpy()[0]
                control_action = plan(state, ca1, ca2)

            elif mode == 'd1':
                control_action = model_1(state).cpu().data.numpy()[0]

            elif mode == 'd2':
                control_action = model_2(state).cpu().data.numpy()[0]

            elif mode == 'individual':
                # delta, original = fgsm(Individual, state)
                # if ep == 0:
                # 	print(delta, original)
                # control_action = Individual(state+delta).cpu().data.numpy()[0]
                control_action = Individual(state).cpu().data.numpy()[0]

            next_state, reward, done = env.step(control_action)
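            # Clip the logged action to [-1, 1] and accumulate fuel usage (|u| * 20)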
            control_action = np.clip(control_action, -1, 1)
            fuel += abs(control_action) * 20
            state = next_state
            if ep == 0:
                trajectory.append(state)
                control_action_list.append(control_action)
            ep_r += reward
            if done:
                break

        ep_reward.append(ep_r)
        if t >= 95:
            fuel_list.append(fuel)
            safe.append(state_list[ep])
        else:
            print(ep, state_list[ep])
            unsafe.append(state_list[ep])
        if ep == 0:
            trajectory = np.array(trajectory)
            # plt.figure()
            plt.plot(trajectory[:, 0], trajectory[:, 1], label=mode)
            plt.legend()
            plt.savefig('trajectory.png')
    # safe = np.array(safe)
    # unsafe = np.array(unsafe)
    # plt.figure()
    # plt.scatter(safe[:, 0], safe[:, 1], c='green')
    # plt.scatter(unsafe[:, 0], unsafe[:, 1], c='red')
    # plt.savefig('./safe_sample_plot/'+ mode +'.png')
    return ep_reward, np.array(fuel_list), state_list, np.array(
        control_action_list)
Example #10
#     thread = threading.Thread(target=train)
#     thread.daemon = True
#     thread.start()
#     if PLOT_RESULT:
#         drawer = Drawer()
#         drawer.plot()
#         drawer.save()
#     thread.join()
train()
assert False
# test
env = Osillator()
state_dim = 2
action_dim = 2
ppo = PPO(state_dim, action_dim, method=METHOD)
ppo.load_model()
mean_epoch_reward = 0
for _ in range(TEST_EP):
    state = env.reset()
    for i in range(EP_LEN):
        if RENDER:
            env.render()
        action = ppo.choose_action(state, True)
        u = gene_u(state, action, model_1, model_2)
        next_state, reward, done = env.step(u)
        mean_epoch_reward += reward
        state = next_state
        if done:
            break
print(mean_epoch_reward / TEST_EP)
env.close()