Example #1
    def __init__(
            self,
            action_space,
            discount_factor=.99,  # gamma
    ):
        self.rpm = rpm(100000)  # 100k history
        self.plotter = plotter(num_lines=2)
        self.render = True
        self.training = True

        num_of_actions = 8
        self.outputdims = num_of_actions
        self.discount_factor = discount_factor

        ids, ods = None, num_of_actions

        self.actor = self.create_actor_network(ids, ods)
        self.critic = self.create_critic_network(ids, ods)
        self.actor_target = self.create_actor_network(ids, ods)
        self.critic_target = self.create_critic_network(ids, ods)

        self.feed, self.joint_inference, sync_target = self.train_step_gen()

        sess = ct.get_session()
        sess.run(tf.global_variables_initializer())

        sync_target()
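
None of these examples shows the `rpm` class itself; they only construct it with a capacity and, later on this page, call `add`, `size` and `sample_batch` on it. A minimal stand-in with that interface might look like the sketch below (an assumption for illustration, not the actual implementation):

import random
from collections import deque

import numpy as np


class rpm:
    """Illustrative replay memory matching the calls used in these examples."""

    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)  # oldest entries are evicted first

    def add(self, experience):
        # experience is e.g. [state, action, reward, done, next_state]
        self.buffer.append(experience)

    def size(self):
        return len(self.buffer)

    def sample_batch(self, batch_size):
        batch = random.sample(list(self.buffer), batch_size)
        # transpose to [states, actions, rewards, dones, next_states]
        return [np.array(column) for column in zip(*batch)]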
Example #2
	def __init__(self,
		observation_space_dims,
		discount_factor,
		nb_actions = 19,
		rpm_size = 1000000,
		train_mult = 5
		):

		self.training = True
		self.discount_factor = discount_factor
		#self.noise_source = one_fsq_noise()
		#self.train_counter = 0
		self.train_multiplier = train_mult



		self.rpm = rpm(rpm_size)

		# Deal only with the continuous space for now...
		self.inputdims = observation_space_dims
		self.outputdims = nb_actions

		def clamper(actions):

			return np.clip(actions, a_max = 1.0 , a_min = 0.0)

		self.clamper = clamper

		ids, ods = self.inputdims, self.outputdims
Example #3
    def __init__(self, **kwargs):
        self.lr_act = 0.0038
        print(self.lr_act)
        self.lr_crit = 0
        self.batch_size = 64
        self.atoms = 80
        self.actions = 3
        self.channels = 9
        self.gamma = 0.65
        self.lambdaEntrop = 0.99
        print(self.lambdaEntrop)
        self.lambdaCrit = 0.41667
        self.weightDecay = False
        self.actor = CNNBase(self.channels, self.actions, self.atoms)
        self.optimizer_actor = optim.RMSprop(
            self.actor.parameters(), lr=self.lr_act, alpha=0.88, eps=1e-5
        )  # commented-out alternatives: alpha=0.99, eps=1e-5, weight_decay=self.weightDecay
        self.memory = rpm(250000)
        self.maxReward = 0
        self.minFrame = 0
        self.AveRew = 0
        self.bestEps = 0
        self.ModUpdate = 0
        self.Good = False
        self.maxSteps = 360
Example #4
def train():
    env = gym.make(GAME).unwrapped
    all_ep_r = []
    memory = rpm(1000000)

    agent = PPO(state_space=S_DIM, action_space=A_DIM, max_episode_num=EP_MAX, episode_lens=EP_LEN,
                discount_factor=GAMMA, actor_learning_rate=A_LR, critic_learning_rate=C_LR,
                mini_batch_size=MINI_BATCH_SIZE, epochs=EPOCHS)

    # load weights
    # agent.load_weights(SAVE_INDEX)

    # run(env, agent)
    for i in range(EP_MAX):
        [steps, episode_r, c_time, aloss, closs] = execute_one_episode(env, agent, memory)
        print('Ep: %4d' % i, "|Ep_r: %i" % episode_r, '|aloss: %8.4f' % aloss, '|closs: %8.4f' % closs,
              '|steps: %4d' % steps, '|time: %6.4f' % c_time)

        if i == 0:
            all_ep_r.append(episode_r)
        else:
            all_ep_r.append(all_ep_r[-1] * 0.9 + episode_r * 0.1)

    # create_path('weights/' + SAVE_INDEX)
    agent.save_weights(SAVE_INDEX)

    plt.plot(np.arange(len(all_ep_r)), all_ep_r)
    plt.xlabel('Episode')
    plt.ylabel('Moving averaged episode reward')

    create_path('weights/' + SAVE_INDEX + '/figure')
    plt.savefig('weights/' + SAVE_INDEX + '/figure/fig.png')
    plt.show()
Example #5
	def __init__(self,
		observation_space_dims,
		discount_factor,
		nb_actions = 19,
		rpm_size = 1500000,
		train_mult = 10
		):

		self.training = True
		self.discount_factor = discount_factor
		#self.noise_source = one_fsq_noise()
		#self.train_counter = 0
		self.train_multiplier = train_mult



		self.rpm = rpm(rpm_size)

		# Deal only with the continuous space for now...
		self.inputdims = observation_space_dims
		self.outputdims = nb_actions

		def clamper(actions):

			return np.clip(actions, a_max = 1.0 , a_min = 0.0)

		self.clamper = clamper

		ids, ods = self.inputdims, self.outputdims

		#with tf.device('/device:GPU:0'):

		self.actor = self.create_actor_network(ids,ods)
		self.critic = self.create_critic_network(ids,ods)
		self.actor_target = self.create_actor_network(ids,ods)
		self.critic_target = self.create_critic_network(ids,ods)


		self.feed, self.joint_inference, sync_target = self.train_step_gen()


		sess = ct.get_session()
		sess.run(tf.global_variables_initializer())

		sync_target()

		import threading as th
		self.lock = th.Lock()

		self.reward_plotter = plt.figure()
		self.reward_collector = []
		self.learn_reward_collector = []

		self.phased_noise_anneal_duration = 100
Example #6
    def __init__(self, nb_status, nb_actions, args):
        self.num_actor = 3

        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.discrete = args.discrete

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'use_bn': args.bn
        }
        self.actors = [
            Actor(self.nb_status, self.nb_actions)
            for _ in range(self.num_actor)
        ]
        self.actor_targets = [
            Actor(self.nb_status, self.nb_actions)
            for _ in range(self.num_actor)
        ]
        self.actor_optims = [
            Adam(self.actors[i].parameters(), lr=args.prate)
            for i in range(self.num_actor)
        ]

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        for i in range(self.num_actor):
            hard_update(
                self.actor_targets[i],
                self.actors[i])  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

        # Create replay buffer
        self.memory = rpm(
            args.rmsize
        )  # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        #
        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.use_cuda = args.cuda
        #
        if self.use_cuda: self.cuda()
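
`hard_update` is not shown on this page either; in DDPG-style code it conventionally copies every parameter of the online network into the target network, which is what the comment "Make sure target is with the same weight" refers to. A sketch of that convention, assuming standard PyTorch modules:

import torch


def hard_update(target, source):
    # Copy each parameter of `source` into `target` so both networks start
    # from identical weights.
    with torch.no_grad():
        for target_param, source_param in zip(target.parameters(),
                                              source.parameters()):
            target_param.copy_(source_param)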
Example #7
    def __init__(self, nb_status, nb_actions, args, writer):
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.discrete = args.discrete
        self.pic = args.pic
        self.writer = writer
        self.select_time = 0
        if self.pic:
            self.nb_status = args.pic_status

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'use_bn': args.bn,
            'init_method': args.init_method
        }
        if args.pic:
            self.cnn = CNN(1, args.pic_status)
            self.cnn_target = CNN(1, args.pic_status)
            self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate)
        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target,
                    self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)
        if args.pic:
            hard_update(self.cnn_target, self.cnn)

        #Create replay buffer
        self.memory = rpm(
            args.rmsize
        )  # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        #
        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.use_cuda = args.cuda
        #
        if self.use_cuda: self.cuda()
Example #8
    def __init__(self, nb_status, nb_actions, args, writer):
        self.clip_actor_grad = args.clip_actor_grad
        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.discrete = args.discrete
        self.pic = args.pic
        self.writer = writer
        self.select_time = 0        
        if self.pic:
            self.nb_status = args.pic_status
        
        # Create Actor and Critic Network
        net_cfg = {
            'hidden1':args.hidden1, 
            'hidden2':args.hidden2, 
            'use_bn':args.bn,
            'init_method':args.init_method
        }
        if args.pic:
            self.cnn = CNN(1, args.pic_status)
            self.cnn_target = CNN(1, args.pic_status)
            self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate)
        self.actor = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_status, self.nb_actions, **net_cfg)
        self.actor_optim  = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim  = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target, self.actor) # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)
        if args.pic:
            hard_update(self.cnn_target, self.cnn)
        
        #Create replay buffer
        self.memory = rpm(args.rmsize) # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        # 
        self.epsilon = 1.0
        self.s_t = None # Most recent state
        self.a_t = None # Most recent action
        self.use_cuda = args.cuda
        # 
        if self.use_cuda: self.cuda()
Example #9
    def __init__(self, nb_states, nb_actions, args, discrete, use_cuda=False):

        if args.seed > 0:
            self.seed(args.seed)

        self.nb_states = nb_states
        self.nb_actions = nb_actions
        self.discrete = discrete

        # Create Actor and Critic Network
        net_cfg = {
            'hidden1': args.hidden1,
            'hidden2': args.hidden2,
            'init_w': args.init_w
        }
        self.actor = Actor(self.nb_states * args.window_length,
                           self.nb_actions, **net_cfg)
        self.actor_target = Actor(self.nb_states * args.window_length,
                                  self.nb_actions, **net_cfg)
        self.actor_optim = Adam(self.actor.parameters(), lr=args.prate)

        self.critic = Critic(self.nb_states * args.window_length,
                             self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_states * args.window_length,
                                    self.nb_actions, **net_cfg)
        self.critic_optim = Adam(self.critic.parameters(), lr=args.rate)

        hard_update(self.actor_target,
                    self.actor)  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)

        #Create replay buffer
        self.memory = rpm(
            args.rmsize
        )  # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.bsize
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        #
        self.epsilon = 1.0
        self.s_t = None  # Most recent state
        self.a_t = None  # Most recent action
        self.is_training = True
        self.use_cuda = use_cuda
        #
        if self.use_cuda: self.cuda()
Example #10
    def __init__(self, nb_status, nb_actions, args):
        self.num_actor = 3

        self.nb_status = nb_status * args.window_length
        self.nb_actions = nb_actions
        self.discrete = args.discrete
        self.pic = args.pic
        if self.pic:
            self.nb_status = args.pic_status
        
        # Create Actor and Critic Network
        net_cfg = {
            'hidden1':args.hidden1, 
            'hidden2':args.hidden2, 
            'use_bn':args.bn
        }
        if args.pic:
            self.cnn = CNN(3, args.pic_status)
            self.cnn_optim = Adam(self.cnn.parameters(), lr=args.crate)
        self.actors = [Actor(self.nb_status, self.nb_actions) for _ in range(self.num_actor)]
        self.actor_targets = [Actor(self.nb_status, self.nb_actions) for _ in
                              range(self.num_actor)]
        self.actor_optims = [Adam(self.actors[i].parameters(), lr=args.prate) for i in range(self.num_actor)]

        self.critic = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_target = Critic(self.nb_status, self.nb_actions, **net_cfg)
        self.critic_optim  = Adam(self.critic.parameters(), lr=args.rate)

        for i in range(self.num_actor):
            hard_update(self.actor_targets[i], self.actors[i])  # Make sure target is with the same weight
        hard_update(self.critic_target, self.critic)
        
        #Create replay buffer
        self.memory = rpm(args.rmsize) # SequentialMemory(limit=args.rmsize, window_length=args.window_length)
        self.random_process = Myrandom(size=nb_actions)

        # Hyper-parameters
        self.batch_size = args.batch_size
        self.tau = args.tau
        self.discount = args.discount
        self.depsilon = 1.0 / args.epsilon

        # 
        self.epsilon = 1.0
        self.s_t = None # Most recent state
        self.a_t = None # Most recent action
        self.use_cuda = args.cuda
        # 
        if self.use_cuda: self.cuda()
Example #11
    def __init__(self, **kwargs):
        self.lr = 3e-4
        self.updata_time = 0
        self.batch_size = 64
        self.gamma = 0.999
        self.epsilon = 1.0
        self.Vmin = -160
        self.Vmax = 160
        self.atoms = 51
        self.actions = 7
        self.policy = DQN(10, self.actions, self.atoms)
        self.target = DQN(10, self.actions, self.atoms)
        self.reward = []
        self.memory = rpm(500000)
        self.target.load_state_dict(self.policy.state_dict())
        self.optimizer_policy = optim.Adam(self.policy.parameters(),
                                           lr=self.lr)
        self.support = torch.linspace(self.Vmin, self.Vmax,
                                      self.atoms).to(device)
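
Example #11 sets up a distributional (C51-style) DQN: `support` holds the 51 fixed atom values between Vmin and Vmax. Assuming the network outputs a probability distribution over those atoms for each action, the scalar Q-value used for action selection is the expectation over the support, for instance:

import torch

atoms, Vmin, Vmax = 51, -160, 160
support = torch.linspace(Vmin, Vmax, atoms)

# Dummy network output: probabilities over the atoms for 7 actions.
probs = torch.softmax(torch.randn(1, 7, atoms), dim=-1)
q_values = (probs * support).sum(dim=-1)   # expected value, shape [1, 7]
action = q_values.argmax(dim=-1)           # greedy action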
Example #12
    def __init__(self, args):
        self.rpm = rpm(1000000)
        self.render = True
        self.training = True
        self.noise_source = one_fsq_noise()

        self.train_multiplier = args.train_multiplier
        self.inputdims = args.observation_space_dims

        low = 0.0
        high = 1.0
        num_of_actions = args.action_space
        self.action_bias = high / 2.0 + low / 2.0
        self.action_multiplier = high - self.action_bias

        def clamper(actions):
            return np.clip(actions, a_max=high, a_min=low)

        self.clamper = clamper

        self.outputdims = args.action_space
        self.discount_factor = args.gamma
        ids, ods = self.inputdims, self.outputdims
        print('inputdims:{}, outputdims:{}'.format(ids, ods))

        self.actor = models.create_actor_network(ids, ods).cuda()
        self.critic = models.create_critic_network(ids, ods).cuda()
        self.actor_target = models.create_actor_network(ids, ods).cuda()
        self.critic_target = models.create_critic_network(ids, ods).cuda()
        self.critic_criterion = nn.MSELoss().cuda()

        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=1e-4)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=3e-4)

        self.plot_epoch = [0]
        self.plot_reward = [0]
        import threading as th
        self.lock = th.Lock()
Example #13
        button_value = pbThread.get()
        lcd_clean()

    if(button_value == 0):
        #E0
        a = '    {0:8s}    '.format(time.strftime("%H:%M:%S", time.gmtime()))
        b = '{}'.format(time.strftime("%a, %d %b %Y", time.gmtime()))
        lcd_string = '{}\n{}'.format(a,b)

    elif(button_value == 1):
        #E1
        lcd_string = '{3:16s}\n{0:1d}    {1:4.0f}    {2:3.0f}'.format(
								s_struct.s_TelemInfo['mGear'],
								s_struct.s_TelemInfo['mEngineRPM'],
								s_struct.s_TelemInfo['mVelocity'],
								rpm.rpm(s_struct.s_TelemInfo['mEngineRPM'])
        )

    elif(button_value == 2):
        #E4
        lcd_string = 'T {0:2.3f} B {1:2.3f}\nS {2:2.3f} C {3:2.3f}'.format(
                                s_struct.s_TelemInfo['mUnfilteredThrottle'],
                                s_struct.s_TelemInfo['mUnfilteredBrake'],
                                s_struct.s_TelemInfo['mUnfilteredSteering'],
                                s_struct.s_TelemInfo['mUnfilteredClutch']
        )


    elif(button_value == 3):
        lcd_string = '{0:16s}\n{1:16s}'.format(
                               string_shift(s_struct.s_TelemInfo['mVehicleName']),
Example #14
	data, addr = sc.recvfrom(512)
	unpacked_data = unpack('64s64s2l12f', data)

	s_TelemInfo['mVehicleName'] = unpacked_data[0].split(b'\x00')[0].decode()
	s_TelemInfo['mTrackName'] = unpacked_data[1].split(b'\x00')[0].decode()
	s_TelemInfo['mLapNumber'] = unpacked_data[2]
	s_TelemInfo['mGear'] = unpacked_data[3]
	s_TelemInfo['mDeltaTime'] = unpacked_data[4]
	s_TelemInfo['mLapStartET'] = unpacked_data[5]
	s_TelemInfo['mEngineRPM'] = unpacked_data[6]
	s_TelemInfo['mEngineWaterTemp'] = unpacked_data[7]
	s_TelemInfo['mEngineOilTemp'] = unpacked_data[8]
	s_TelemInfo['mClutchRPM'] = unpacked_data[9]
	s_TelemInfo['mUnfilteredThrottle'] = unpacked_data[10]
	s_TelemInfo['mUnfilteredBrake'] = unpacked_data[11]
	s_TelemInfo['mUnfilteredSteering'] = unpacked_data[12]
	s_TelemInfo['mUnfilteredClutch'] = unpacked_data[13]
	s_TelemInfo['mSteeringArmForce'] = unpacked_data[14]
	s_TelemInfo['mVelocity'] = unpacked_data[15]

	lcd_string = '{3:16s}\n{0:1d}    {1:4.0f}    {2:3.0f}'.format(
								s_TelemInfo['mGear'],
								s_TelemInfo['mEngineRPM'],
								s_TelemInfo['mVelocity'],
								rpm.rpm(s_TelemInfo['mEngineRPM'])
								)

	lcd.message(lcd_string)
	lcd.cmd(0x2)
	print(lcd_string)
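
The receiver above assumes a fixed telemetry layout: two 64-byte strings, two longs and twelve floats, in that order. That layout can be sanity-checked by round-tripping a dummy packet through the same format string (all values below are made up for illustration):

from struct import calcsize, pack, unpack

fmt = '64s64s2l12f'
packet = pack(
    fmt,
    b'TestCar',        # mVehicleName, padded to 64 bytes
    b'TestTrack',      # mTrackName, padded to 64 bytes
    3,                 # mLapNumber
    4,                 # mGear
    *([0.0] * 12),     # the twelve float fields
)
assert len(packet) == calcsize(fmt)

fields = unpack(fmt, packet)
print(fields[0].split(b'\x00')[0].decode())  # 'TestCar'
print(fields[3])                             # 4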
Example #15
    def __init__(
        self,
        observation_space_dims,
        action_space,
        stack_factor=1,
        discount_factor=.99,  # gamma
        # train_skip_every=1,
        train_multiplier=1,
    ):
        self.rpm = rpm(1000000)  # 1M history
        self.plotter = plotter(num_lines=3)
        self.render = True
        self.training = True
        self.noise_source = one_fsq_noise()
        self.train_counter = 0
        # self.train_skip_every = train_skip_every
        self.train_multiplier = train_multiplier
        self.observation_stack_factor = stack_factor

        self.inputdims = observation_space_dims * self.observation_stack_factor
        # assume observation_space is continuous

        self.is_continuous = True if isinstance(action_space, Box) else False

        if self.is_continuous:  # if action space is continuous

            low = action_space.low
            high = action_space.high

            num_of_actions = action_space.shape[0]

            self.action_bias = high / 2. + low / 2.
            self.action_multiplier = high - self.action_bias

            # say high,low -> [2,7], then bias -> 4.5
            # mult = 2.5. then [-1,1] multiplies 2.5 + bias 4.5 -> [2,7]

            def clamper(actions):
                return np.clip(actions,
                               a_max=action_space.high,
                               a_min=action_space.low)

            self.clamper = clamper
        else:
            num_of_actions = action_space.n

            self.action_bias = .5
            self.action_multiplier = .5  # map (-1,1) into (0,1)

            def clamper(actions):
                return np.clip(actions, a_max=1., a_min=0.)

            self.clamper = clamper

        self.outputdims = num_of_actions
        self.discount_factor = discount_factor
        ids, ods = self.inputdims, self.outputdims
        print('inputdims:{}, outputdims:{}'.format(ids, ods))

        self.actor = self.create_actor_network(ids, ods)
        self.critic = self.create_critic_network(ids, ods)
        self.actor_target = self.create_actor_network(ids, ods)
        self.critic_target = self.create_critic_network(ids, ods)

        # print(self.actor.get_weights())
        # print(self.critic.get_weights())

        self.feed, self.joint_inference, sync_target = self.train_step_gen()

        sess = ct.get_session()
        sess.run(tf.global_variables_initializer())

        sync_target()

        import threading as th
        self.lock = th.Lock()

        if not hasattr(self, 'wavegraph'):
            num_waves = self.outputdims * 2 + 1

            def rn():
                r = np.random.uniform()
                return 0.2 + r * 0.4

            colors = []
            for i in range(num_waves - 1):
                color = [rn(), rn(), rn()]
                colors.append(color)
            colors.append([0.2, 0.5, 0.9])
            self.wavegraph = wavegraph(num_waves, 'actions/noises/Q',
                                       np.array(colors))
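
The bias/multiplier arithmetic in Example #15 maps a tanh-style network output in [-1, 1] onto the environment's action range, and `clamper` guards against noise pushing actions outside it. A quick numeric check of the [2, 7] case mentioned in the comment (bounds chosen only to mirror that comment):

import numpy as np

low, high = 2.0, 7.0
action_bias = high / 2. + low / 2.        # 4.5
action_multiplier = high - action_bias    # 2.5

raw = np.array([-1.0, 0.0, 1.0, 1.3])     # 1.3 simulates noise overshoot
scaled = raw * action_multiplier + action_bias
clamped = np.clip(scaled, a_min=low, a_max=high)
print(scaled)   # 2.0, 4.5, 7.0, 7.75
print(clamped)  # 7.75 is clipped back to 7.0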
Example #16
    def __init__(
        self,
        observation_space,
        action_space,
        stack_factor=1,
        discount_factor=.99,  # gamma
        train_skip_every=1,
    ):
        self.rpm = rpm(1000000)  # 1M history
        self.render = True
        self.noise_source = one_fsq_noise()
        self.train_counter = 0
        self.train_skip_every = train_skip_every
        self.observation_stack_factor = stack_factor

        self.inputdims = observation_space.shape[
            0] * self.observation_stack_factor
        # assume observation_space is continuous

        self.is_continuous = True if isinstance(action_space, Box) else False

        if self.is_continuous:  # if action space is continuous

            low = action_space.low
            high = action_space.high

            num_of_actions = action_space.shape[0]

            self.action_bias = high / 2. + low / 2.
            self.action_multiplier = high - self.action_bias

            # say high,low -> [2,7], then bias -> 4.5
            # mult = 2.5. then [-1,1] multiplies 2.5 + bias 4.5 -> [2,7]

            def clamper(actions):
                return np.clip(actions,
                               a_max=action_space.high,
                               a_min=action_space.low)

            self.clamper = clamper
        else:
            num_of_actions = action_space.n

            self.action_bias = .5
            self.action_multiplier = .5  # map (-1,1) into (0,1)

            def clamper(actions):
                return np.clip(actions, a_max=1., a_min=0.)

            self.clamper = clamper

        self.outputdims = num_of_actions
        self.discount_factor = discount_factor
        ids, ods = self.inputdims, self.outputdims
        print('inputdims:{}, outputdims:{}'.format(ids, ods))

        self.actor = self.create_actor_network(ids, ods)
        self.critic = self.create_critic_network(ids, ods)
        self.actor_target = self.create_actor_network(ids, ods)
        self.critic_target = self.create_critic_network(ids, ods)

        # print(self.actor.get_weights())
        # print(self.critic.get_weights())

        self.feed, self.joint_inference, sync_target = self.train_step_gen()

        sess = ct.get_session()
        sess.run(tf.global_variables_initializer())

        sync_target()
class Game(object):
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    from keras import backend as K
    K.set_session(sess)

    mx = 0
    LOG = 0
    trainnum = 0
    modelcnt = 0
    noiselevel = 0.5
    rpm = rpm(2000000)
    TAU = 0.001
    lr_actor = 3e-4
    lr_critic = 3e-4
    train_interval = 1
    train_times = 100
    action_dim = 18
    state_dim = 76
    max_steps = 1000 // 4
    cnt = 0
    GAMMA = 0.96
    BATCH_SIZE = 128
    log_path = './logs'

    import threading as th
    lock = th.Lock()

    actor = ActorNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU,
                         lr_actor)
    critic = CriticNetwork(sess, state_dim, action_dim, BATCH_SIZE, TAU,
                           lr_critic)

    callback = TensorBoard(log_path)
    callback.set_model(critic.model)

    def write_log(self, callback, names, logs, batch_no):
        output = open('logs/data.txt', 'w')
        output.write(str(self.LOG) + ' ' + str(self.trainnum))
        output.close()
        for name, value in zip(names, itertools.repeat(logs)):
            summary = tf.Summary()
            summary_value = summary.value.add()
            summary_value.simple_value = value
            summary_value.tag = name
            callback.writer.add_summary(summary, batch_no)
            callback.writer.flush()
        callback = TensorBoard(self.log_path)

    def play(self, env, cnt):
        episode_memory = []
        step = 0
        s_t = env.reset()
        total_reward = 0.
        sp = 0.
        noise_t = np.zeros([1, self.action_dim])
        a_t = np.zeros([1, self.action_dim])
        noise = self.noiselevel
        self.noiselevel = noise * 0.999
        for j in range(self.max_steps):
            self.lock.acquire()
            global graph
            with graph.as_default():
                a_t_original = self.actor.model.predict(np.array([s_t]))
            self.lock.release()
            noise = noise * 0.98
            if cnt % 3 == 0:
                if j % 5 == 0:
                    noise_t[0] = np.random.randn(self.action_dim) * noise
            elif cnt % 3 == 1:
                if j % 5 == 0:
                    noise_t[0] = np.random.randn(self.action_dim) * noise * 2
            else:
                noise_t = np.zeros([1, self.action_dim])
            a_t = a_t_original + noise_t
            for i in range(self.action_dim):
                if (a_t[0][i] > 1):
                    a_t[0][i] = 1
                elif (a_t[0][i] < 0):
                    a_t[0][i] = 0
            ob, r_t, done, _, pen = env.step(a_t[0])
            s_t1 = ob
            episode_memory.append([s_t, a_t[0], r_t - pen, done, s_t1])
            total_reward += r_t
            sp += pen
            s_t = s_t1
            step += 1
            if done or step == 1000 / 4 - 1:
                if total_reward > self.mx:
                    self.mx = total_reward
                print("Episode", cnt, "Step", step, "Reward", total_reward,
                      "max", self.mx, "penalty", sp)
                train_names = ['reward']
                self.lock.acquire()
                self.LOG = self.LOG + 1
                self.write_log(self.callback, train_names, total_reward,
                               self.LOG)
                self.lock.release()
                break
        self.lock.acquire()
        for i in range(step):
            self.rpm.add(episode_memory[i])
        self.lock.release()

    def playonce(self, env, T):
        from multi import fastenv
        fenv = fastenv(env, 4)
        self.play(fenv, T)
        env.rel()
        del fenv

    def play_ignore(self, env, T):
        import threading as th
        try:
            t = th.Thread(target=self.playonce, args=(
                env,
                T,
            ))
            t.setDaemon(True)
            t.start()
        except:
            print("startfail")

    def playifavailable(self, T):
        while True:
            remote_env = farmer.acq_env()
            if remote_env == False:
                pass
            else:
                self.play_ignore(remote_env, T)
                break

    def train(self):
        memory = self.rpm
        if memory.size() < self.BATCH_SIZE:
            return
        global graph
        loss = 0
        for T in range(self.train_times):
            [states, actions, rewards, dones,
             new_states] = memory.sample_batch(self.BATCH_SIZE)
            y_t = np.asarray([0.0] * self.BATCH_SIZE)
            rewards = np.concatenate(rewards)
            self.lock.acquire()
            with graph.as_default():
                target_q_values = self.critic.target_model.predict(
                    [new_states,
                     self.actor.target_model.predict(new_states)])
            target_q_values = target_q_values.reshape(
                [1, target_q_values.shape[0]])[0]
            for k in range(self.BATCH_SIZE):
                if dones[k]:
                    y_t[k] = rewards[k]
                else:
                    y_t[k] = rewards[k] + self.GAMMA * target_q_values[k]
            with graph.as_default():
                self.critic.model.optimizer.learning_rate = self.lr_critic
                logs = self.critic.model.train_on_batch([states, actions], y_t)
                a_for_grad = self.actor.model.predict(states)
                grads = self.critic.gradients(states, a_for_grad)
                self.actor.train(states, grads, learning_rate=self.lr_actor)
                self.actor.target_train()
                self.critic.target_train()
            train_names = ['train_loss']
            self.write_log(self.callback, train_names, logs, self.trainnum)
            self.trainnum = self.trainnum + 1
            loss = loss + logs
            self.lock.release()
        print("train", memory.size(), loss)

    def save(self):
        self.modelcnt = self.modelcnt + 1
        self.actor.target_model.save_weights("logs/actormodel.h5",
                                             overwrite=True)
        self.critic.target_model.save_weights("logs/criticmodel.h5",
                                              overwrite=True)
        self.actor.target_model.save_weights("logs/actormodel{}.h5".format(
            self.modelcnt))
        self.critic.target_model.save_weights("logs/criticmodel{}.h5".format(
            self.modelcnt))
        print("save")

    def pre(self):
        print("Now we load the weight")
        try:
            input = open('logs/data.txt', 'r')
            self.LOG, self.trainnum = map(int, input.read().split(' '))
            print("LOG", self.LOG, "trainnum", self.trainnum)
            input.close()
            print("log found")
            self.critic.model.load_weights("logs/criticmodel.h5")
            self.critic.target_model.load_weights("logs/criticmodel.h5")
            self.actor.model.load_weights("logs/actormodel.h5")
            self.actor.target_model.load_weights("logs/actormodel.h5")
            print("Weight load successfully")
            self.rpm.load('logs/rpm.pickle')
            print("rpm success")
        except:
            if self.LOG > 0:
                print("Load fault")
                return False
            else:
                print("A new experiment")
        return True

    def run(self):
        np.random.seed(23333)
        episode_count = 10000
        reward = 0
        done = False
        LOSS = 0

        for T in range(50):
            self.playifavailable(T)
        for T in range(episode_count):
            self.train()
            self.playifavailable(T)
            if np.mod(T, 100) == 0 and T >= 100:
                self.save()
        print("Finish.")
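
The per-sample loop in Example #16's `train` builds the usual TD target: the raw reward at terminal transitions, and reward plus the discounted target-critic estimate otherwise. A vectorized NumPy equivalent of that loop, shown only as a sketch with the same variable names:

import numpy as np


def td_targets(rewards, dones, target_q_values, gamma=0.96):
    # Terminal transitions keep the raw reward; the rest bootstrap from the
    # target critic's estimate of the next state-action value.
    rewards = np.asarray(rewards, dtype=np.float64)
    dones = np.asarray(dones, dtype=bool)
    target_q_values = np.asarray(target_q_values, dtype=np.float64)
    return np.where(dones, rewards, rewards + gamma * target_q_values)


# e.g. td_targets([1., 2.], [False, True], [10., 10.]) gives [10.6, 2.0]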