Code Example #1
	def __init__(self, env, discount = 0.99, model_filename = None, history_filename = None):
		self.env = env
		self.discount = discount
		self.model_filename = model_filename
		self.history_filename = history_filename

		from keras.optimizers import SGD
		self.model = MarketPolicyGradientModelBuilder(self.model_filename).getModel()
		sgd = SGD(lr = 0.1, decay = 1e-6, momentum = 0.9, nesterov = True)
		self.model.compile(loss='mse', optimizer='rmsprop')
Code Example #2
	def __init__(self, env, discount = 0.99, model_filename = None, history_filename = None, max_memory=100):
		self.env = env
		self.discount = discount
		self.model_filename = model_filename
		self.history_filename = history_filename

		self.max_memory = max_memory 

		# SGD is constructed but not used
		from keras.optimizers import SGD
		self.model = MarketPolicyGradientModelBuilder(self.model_filename).getModel()
		sgd = SGD(lr = 0.1, decay = 1e-6, momentum = 0.9, nesterov = True)
		# rmsprop is used instead
		#self.model.compile(loss='mse', optimizer='rmsprop')
		self.model.compile(loss='binary_crossentropy', optimizer='rmsprop')
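
The comments in this variant note that the SGD object is constructed but never used, and that the loss is switched from mean squared error to binary cross-entropy. As a quick illustration of that choice, here is a standalone sketch with made-up numbers (not code from the original repository): cross-entropy penalizes a confidently wrong probability far more sharply than MSE does.

import numpy as np

def mse(y_true, y_pred):
    return float(np.mean((y_true - y_pred) ** 2))

def binary_crossentropy(y_true, y_pred, eps=1e-7):
    y_pred = np.clip(y_pred, eps, 1 - eps)
    return float(np.mean(-(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))))

y_true = np.array([1.0])                    # label built from the sampled action
for p in (0.5, 0.1, 0.01):                  # increasingly confident wrong predictions
    print(p, mse(y_true, np.array([p])), binary_crossentropy(y_true, np.array([p])))
# MSE saturates toward 1.0 while cross-entropy grows without bound (0.69 -> 2.30 -> 4.61),
# giving a stronger training signal on badly mispredicted steps.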
Code Example #3
class PolicyGradient:
    def __init__(self,
                 env,
                 env_test,
                 discount=0.99,
                 model_filename=None,
                 history_filename=None):
        self.env = env
        self.env_test = env_test
        self.discount = discount
        self.model_filename = model_filename
        self.history_filename = history_filename

        from keras.optimizers import SGD
        self.model = MarketPolicyGradientModelBuilder(self.model_filename).getModel()
        sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
        self.model.compile(loss='mse', optimizer='rmsprop')

        self.test_avg_reward_sum = 0
        self.avg_reward_sum = 0

    def discount_rewards(self, r):
        discounted_r = np.zeros_like(r)
        running_add = 0
        r = r.flatten()

        for t in reversed(range(0, r.size)):
            if r[t] != 0:
                running_add = 0

            running_add = running_add * self.discount + r[t]
            discounted_r[t] = running_add

        return discounted_r

    def paper(self, code):
        def take_position():
            date2position = {}
            game_over = False
            observation = self.env_test._reset(code)
            while not game_over:
                aprob = self.model.predict(observation)[0]
                if aprob.shape[0] > 1:
                    action = np.random.choice(self.env_test.action_space.n,
                                              1,
                                              p=aprob / np.sum(aprob))[0]
                else:
                    action = 0 if np.random.uniform() < aprob else 1
                observation, reward, game_over, info = self.env_test.step(action)
                date2position[info['dt']] = action
            return date2position

        def cum_return(sym, row):
            date = row['date']
            close_rel = row['close_rel']
            if date in date2position:
                position = date2position[date]
                if 1 == position:
                    result['cum'] *= close_rel
                    return result['cum']
                elif 0 == position:
                    result['cum'] *= 2 - close_rel
                    return result['cum']
            else:
                print(date, 'not in date2position')
                return result['cum']

        def cum_return_bh(sym, row):  ## buy and hold
            def bought_every_day(sym, current_day):
                return 1

            position = bought_every_day(sym, row['date'])
            if 1 == position:
                result['cum'] *= row['close_rel']
            elif 0 == position:
                pass
            elif -1 == position:
                pass  # no short
            return result['cum']

        def cum_return_sh(sym, row):  # short and hold
            def bought_every_day(sym, current_day):
                return -1

            position = bought_every_day(sym, row['date'])
            if 1 == position:
                result['cum'] *= row['close_rel']
            elif 0 == position:
                pass
            elif -1 == position:
                result['cum'] *= 2 - row['close_rel']
            return result['cum']
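        # Added note (not in the original source): close_rel = close_t / close_{t-1},
        # so a LONG day multiplies the cumulative curve by close_rel, while a SHORT day
        # uses the one-day short return 1 - (close_rel - 1) = 2 - close_rel.
        # For example, a 2% drop (close_rel = 0.98) grows the short curve by a factor of 1.02.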

        df = pd.read_csv(os.path.join(local_path, 'data', '%s.csv' % code))
        start = self.env_test.startDate
        end = self.env_test.endDate
        df = df[(df.date >= start) & (df.date < end)]
        dates = pd.to_datetime(df.date, format='%Y-%m-%d')
        df['close_rel'] = (df.close / df.close.shift(1)).fillna(1.0)

        date2position = take_position()
        result = {'cum': 1}
        df_cum = df.apply(lambda x: cum_return('^DJI', x), axis=1)
        import matplotlib.pyplot as plt
        plt.plot(dates, df_cum)
        df_cum = df.apply(lambda x: cum_return_bh('^DJI', x), axis=1)
        plt.plot(dates, df_cum)
        df_cum = df.apply(lambda x: cum_return_sh('^DJI', x), axis=1)
        plt.plot(dates, df_cum)
        plt.show()

    def test(self, e, code, verbose=False):
        env_test = self.env_test
        model = self.model
        env_test._reset(code)
        observation = env_test._reset(code)
        game_over = False
        reward_sum = 0
        while not game_over:
            aprob = model.predict(observation)[0]
            if aprob.shape[0] > 1:
                action = np.random.choice(env_test.action_space.n,
                                          1,
                                          p=aprob / np.sum(aprob))[0]
            else:
                action = 0 if np.random.uniform() < aprob else 1

            observation, reward, game_over, info = env_test.step(action)
            reward_sum += float(reward)
            if verbose > 0:
                if env_test.actions[action] == "LONG" or env_test.actions[
                        action] == "SHORT":
                    color = bcolors.FAIL if env_test.actions[
                        action] == "LONG" else bcolors.OKBLUE
                    print("%s:\t%s\t%.2f\t%.2f\t" %
                          (info["dt"], color + env_test.actions[action] +
                           bcolors.ENDC, reward_sum, info["cum"]) +
                          ("\t".join([
                              "%s:%.2f" % (l, i)
                              for l, i in zip(env_test.actions, aprob.tolist())
                          ])))

        self.test_avg_reward_sum = self.test_avg_reward_sum * 0.99 + reward_sum * 0.01
        toPrint = "%d\t%s\t%s\t%.2f\t%.2f" % (
            e, info["code"],
            (bcolors.FAIL if reward_sum >= 0 else bcolors.OKBLUE) +
            ("%.2f" % reward_sum) + bcolors.ENDC, info["cum"],
            self.test_avg_reward_sum)
        return toPrint

    def train(self, max_episode=1000000, max_path_length=200, verbose=True):
        env = self.env
        model = self.model

        for e in range(max_episode):
            from random import random
            code = self.env.targetCodes[int(random() *
                                            len(self.env.targetCodes))]
            env._reset(code)
            observation = env._reset(code)
            game_over = False
            reward_sum = 0

            inputs = []
            outputs = []
            predicteds = []
            rewards = []

            while not game_over:
                aprob = model.predict(observation)[0]
                inputs.append(observation)
                predicteds.append(aprob)

                if aprob.shape[0] > 1:
                    action = np.random.choice(self.env.action_space.n,
                                              1,
                                              p=aprob / np.sum(aprob))[0]

                    y = np.zeros([self.env.action_space.n])
                    y[action] = 1.

                    outputs.append(y)
                else:
                    action = 0 if np.random.uniform() < aprob else 1

                    y = [float(action)]
                    outputs.append(y)

                observation, reward, game_over, info = self.env.step(action)
                reward_sum += float(reward)

                rewards.append(float(reward))

                if verbose > 0:
                    if env.actions[action] == "LONG" or env.actions[
                            action] == "SHORT":
                        color = bcolors.FAIL if env.actions[
                            action] == "LONG" else bcolors.OKBLUE
                        print("%s:\t%s\t%.2f\t%.2f\t" %
                              (info["dt"], color + env.actions[action] +
                               bcolors.ENDC, reward_sum, info["cum"]) +
                              ("\t".join([
                                  "%s:%.2f" % (l, i)
                                  for l, i in zip(env.actions, aprob.tolist())
                              ])))

            self.avg_reward_sum = self.avg_reward_sum * 0.99 + reward_sum * 0.01
            toPrint = "%d\t%s\t%s\t%.2f\t%.2f" % (
                e, info["code"],
                (bcolors.FAIL if reward_sum >= 0 else bcolors.OKBLUE) +
                ("%.2f" % reward_sum) + bcolors.ENDC, info["cum"],
                self.avg_reward_sum)
            print(toPrint, '\t', self.test(e, code))
            if self.history_filename != None:
                os.system("echo %s >> %s" % (toPrint, self.history_filename))

            dim = len(inputs[0])
            inputs_ = [[] for i in range(dim)]
            for obs in inputs:
                for i, block in enumerate(obs):
                    inputs_[i].append(block[0])
            inputs_ = [np.array(inputs_[i]) for i in range(dim)]

            outputs_ = np.vstack(outputs)
            predicteds_ = np.vstack(predicteds)
            rewards_ = np.vstack(rewards)

            discounted_rewards_ = self.discount_rewards(rewards_)
            #discounted_rewards_ -= np.mean(discounted_rewards_)
            discounted_rewards_ /= np.std(discounted_rewards_)

            #outputs_ *= discounted_rewards_
            for i, r in enumerate(zip(rewards, discounted_rewards_)):
                reward, discounted_reward = r

                if verbose > 1:
                    print(outputs_[i], end=' ')

                #outputs_[i] = 0.5 + (2 * outputs_[i] - 1) * discounted_reward
                if discounted_reward < 0:
                    outputs_[i] = 1 - outputs_[i]
                    outputs_[i] = outputs_[i] / sum(outputs_[i])
                outputs_[i] = np.minimum(
                    1,
                    np.maximum(
                        0, predicteds_[i] + (outputs_[i] - predicteds_[i]) *
                        abs(discounted_reward)))

                if verbose > 1:
                    print(predicteds_[i], outputs_[i], reward,
                          discounted_reward)

            model.fit(inputs_, outputs_, nb_epoch=1, verbose=0, shuffle=True)
            model.save_weights(self.model_filename)
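
The update step above turns each sampled action into a soft regression target: when the discounted reward is negative, the one-hot label is inverted and renormalized, and the final target is the prediction nudged toward that label in proportion to |discounted_reward|, clipped to [0, 1]. A minimal standalone sketch of just this adjustment, with illustrative numbers (not from the repository):

import numpy as np

def adjust_target(predicted, y, discounted_reward):
    # Same rule as the loop above: invert the label on losing steps, then move
    # the prediction toward the label by |discounted_reward|, clipped to [0, 1].
    y = np.asarray(y, dtype=float)
    if discounted_reward < 0:
        y = 1 - y
        y = y / y.sum()
    return np.minimum(1, np.maximum(0, predicted + (y - predicted) * abs(discounted_reward)))

predicted = np.array([0.7, 0.3])                  # policy output for [LONG, SHORT]
print(adjust_target(predicted, [1., 0.], 0.5))    # profitable LONG -> [0.85 0.15]
print(adjust_target(predicted, [1., 0.], -0.5))   # losing LONG     -> [0.35 0.65]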
Code Example #4
class PolicyGradient:
    def __init__(self,
                 env,
                 discount=0.99,
                 model_filename=None,
                 history_filename=None):
        self.env = env
        self.discount = discount
        self.model_filename = model_filename
        self.history_filename = history_filename
        self.model = MarketPolicyGradientModelBuilder(
            model_filename).getModel()
        sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
        self.model.compile(loss='mse', optimizer='rmsprop')

    def discount_rewards(self, r):
        discounted_r = zeros_like(r)
        running_add = 0
        r = r.flatten()

        for t in reversed(range(0, r.size)):
            if r[t] != 0:
                running_add = 0

            running_add = running_add * self.discount + r[t]
            discounted_r[t] = running_add

        return discounted_r

    def train(self, max_episode=10, max_path_length=200, verbose=0):
        env = self.env
        model = self.model
        avg_reward_sum = 0.

        for e in range(max_episode):
            env._reset()
            observation = env._reset()
            game_over = False
            reward_sum = 0

            inputs = []
            outputs = []
            predicteds = []
            rewards = []

            while not game_over:
                aprob = model.predict(observation)[0]
                inputs.append(observation)
                predicteds.append(aprob)

                if aprob.shape[0] > 1:
                    action = random.choice(self.env.action_space.n,
                                           1,
                                           p=aprob / sum(aprob))[0]

                    y = zeros([self.env.action_space.n])
                    y[action] = 1.

                    outputs.append(y)
                else:
                    action = 0 if random.uniform() < aprob else 1

                    y = [float(action)]
                    outputs.append(y)

                observation, reward, game_over, info = self.env._step(action)
                reward_sum += float(reward)

                rewards.append(float(reward))

                if verbose > 0:
                    if env.actions[action] == "LONG" or env.actions[
                            action] == "SHORT":
                        color = bcolors.FAIL if env.actions[
                            action] == "LONG" else bcolors.OKBLUE
                        print("%s:\t%s\t%.2f\t%.2f\t" %
                              (info["dt"], color + env.actions[action] +
                               bcolors.ENDC, reward_sum, info["cum"]) +
                              ("\t".join([
                                  "%s:%.2f" % (l, i)
                                  for l, i in zip(env.actions, aprob.tolist())
                              ])))

            avg_reward_sum = avg_reward_sum * 0.99 + reward_sum * 0.01
            toPrint = "%d\t%s\t%s\t%.2f\t%.2f" % (
                e, info["code"],
                (bcolors.FAIL if reward_sum >= 0 else bcolors.OKBLUE) +
                ("%.2f" % reward_sum) + bcolors.ENDC, info["cum"],
                avg_reward_sum)
            print(toPrint)
            if self.history_filename != None:
                system("echo %s >> %s" % (toPrint, self.history_filename))

            dim = len(inputs[0])
            inputs_ = [[] for i in range(dim)]
            for obs in inputs:
                for i, block in enumerate(obs):
                    inputs_[i].append(block[0])
            inputs_ = [array(inputs_[i]) for i in range(dim)]

            outputs_ = vstack(outputs)
            predicteds_ = vstack(predicteds)
            rewards_ = vstack(rewards)

            discounted_rewards_ = self.discount_rewards(rewards_)
            #discounted_rewards_ -= mean(discounted_rewards_)
            discounted_rewards_ /= std(discounted_rewards_)

            #outputs_ *= discounted_rewards_
            for i, r in enumerate(zip(rewards, discounted_rewards_)):
                reward, discounted_reward = r

                if verbose > 1:
                    print(outputs_[i], end=' ')

                #outputs_[i] = 0.5 + (2 * outputs_[i] - 1) * discounted_reward
                if discounted_reward < 0:
                    outputs_[i] = 1 - outputs_[i]
                    outputs_[i] = outputs_[i] / sum(outputs_[i])
                outputs_[i] = minimum(
                    1,
                    maximum(
                        0, predicteds_[i] + (outputs_[i] - predicteds_[i]) *
                        abs(discounted_reward)))

                if verbose > 1:
                    print(predicteds_[i], outputs_[i], reward,
                          discounted_reward)

            model.fit(inputs_, outputs_, nb_epoch=1, verbose=0, shuffle=True)
            model_json = model.to_json()
            with open(join(BASE_DIR, "models", self.model_filename + ".json"),
                      "w") as json_file:
                json_file.write(model_json)
            model.save_weights(
                join(BASE_DIR, "models", self.model_filename + ".h5"))
Code Example #5
class PolicyGradient:
    def __init__(self,
                 env,
                 discount=0.99,
                 model_filename=None,
                 history_filename=None):
        self.env = env
        self.discount = discount
        self.model_filename = model_filename
        self.history_filename = history_filename

        from keras.optimizers import SGD
        self.model = MarketPolicyGradientModelBuilder(self.model_filename).getModel()
        # self.model = MarketPolicyGradientModelBuilder().buildModel()
        sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
        self.model.compile(loss='mse', optimizer='rmsprop')

    def discount_rewards(self, r):
        discounted_r = np.zeros_like(r)
        running_add = 0
        r = r.flatten()

        for t in reversed(np.arange(0, r.size)):
            if r[t] != 0:
                running_add = 0

            running_add = running_add * self.discount + r[t]
            discounted_r[t] = running_add

        return discounted_r

    def train(self,
              max_episode=1e1,
              max_path_length=200,
              threshold=0.5,
              verbose=0):
        env = self.env
        model = self.model
        avg_reward_sum = 0.

        for e in np.arange(max_episode):
            env.reset()
            observation = env.reset()
            # print('observation[0].shape:', '\n', observation[0].shape)
            # print('observation[1].shape:', '\n', observation[1].shape)
            # print('observation[1]:', '\n', observation[1])

            game_over = False
            reward_sum = 0
            last_y = np.array([0, 1])

            inputs = []
            outputs = []
            predicteds = []
            rewards = []
            count = 0
            date_list = []
            value_list = []
            benchmark_list = []
            predict_summary = []
            while not game_over:
                # count += 1
                # print('count:',count)
                aprob = model.predict(observation)[0]
                # print('aprob:', '\n', aprob)
                # print('aprob_shape:', '\n', aprob.shape)
                # print('aprob[0]:', '\n', aprob[0])
                # print('aprob[1]:', '\n', aprob[1])
                inputs.append(observation)
                predicteds.append(aprob)

                if aprob.shape[0] > 1:
                    if max(aprob) > threshold:
                        action = np.argsort(aprob)[-1]
                        # action = np.random.choice(self.env.action_space.n, 1, p = aprob / np.sum(aprob))[0]
                        self.env.last_action = action
                        y = np.zeros([self.env.action_space.n])
                        y[action] = 1.
                        # print('action:', action)
                        # print('y:', y)
                        last_y = y.copy()
                        outputs.append(y)
                    else:
                        action = 2
                        outputs.append(last_y)
                else:
                    action = 0 if np.random.uniform() < aprob else 1

                    y = [float(action)]
                    outputs.append(y)

                predict_summary.append(max(aprob))
                observation, reward, game_over, info = self.env.step(action)
                # print('observation[0]:', observation[0])
                reward_sum += float(reward)
                #print('reward_sum:','\n',reward_sum)
                rewards.append(float(reward))
                #print('rewards:','\n',rewards)
                date_list.append(info["dt"])
                value_list.append(info["rat"])
                benchmark_list.append(info["cum"])

                if verbose > 0:
                    if action == 2:
                        color = bcolors.OKBLUE if aprob[0] == max(
                            aprob) else bcolors.FAIL
                        print("%s:\t%s\t%.2f\t%.2f\t%.2f\t" %
                              (info["dt"], color + "HOLD!!!" + bcolors.ENDC,
                               reward_sum, info["cum"], info["rat"]) +
                              ("\t".join([
                                  "%s:%.2f" % (l, i)
                                  for l, i in zip(env.actions, aprob.tolist())
                              ])))
                    elif env.actions[action] == "LONG" or env.actions[
                            action] == "SHORT":
                        color = bcolors.FAIL if env.actions[
                            action] == "LONG" else bcolors.OKBLUE
                        print("%s:\t%s\t%.2f\t%.2f\t%.2f\t" %
                              (info["dt"], color + env.actions[action] +
                               bcolors.ENDC, reward_sum, info["cum"],
                               info["rat"]) + ("\t".join([
                                   "%s:%.2f" % (l, i)
                                   for l, i in zip(env.actions, aprob.tolist())
                               ])))

            avg_reward_sum = avg_reward_sum * 0.99 + reward_sum * 0.01
            fc = bcolors.FAIL if info["cum"] >= 1 else bcolors.OKBLUE
            fr = bcolors.FAIL if info["rat"] >= 1 else bcolors.OKBLUE
            bw = bcolors.ENDC
            toPrint = "%d\t\t%s\t%.2f\t%s\t%s\t%.2f" % (
                e, info["code"], reward_sum, fc +
                ("%.2f" % info["cum"]) + bw, fr +
                ("%.2f" % info["rat"]) + bw, avg_reward_sum)
            # toPrint = "%d\t\t%s\t%s\t%.2f\t%.2f\t%.2f" % (e, info["code"], (bcolors.FAIL if reward_sum >= 0 else bcolors.OKBLUE) + ("%.2f" % reward_sum) + bcolors.ENDC, info["cum"], info["rat"], avg_reward_sum)
            if self.history_filename != None:
                os.system("echo %s >> %s" % (toPrint, self.history_filename))
            print(toPrint)
            # print('avg_reward_sum', '\n', avg_reward_sum)
            # print('env.actions:',env.actions)
            M.plot_trade_summary(indices=date_list,
                                 value=value_list,
                                 benchmark=benchmark_list)
            plt.hist(predict_summary, bins=200)
            plt.show()

            dim = len(inputs[0])
            inputs_ = [[] for i in np.arange(dim)]
            for obs in inputs:
                for i, block in enumerate(obs):
                    inputs_[i].append(block[0])
            inputs_ = [np.array(inputs_[i]) for i in np.arange(dim)]
            #print('inputs:', '\n', inputs[0][1])
            outputs_ = np.vstack(outputs)
            # print('outputs_:', '\n', outputs_)
            predicteds_ = np.vstack(predicteds)
            rewards_ = np.vstack(rewards)

            discounted_rewards_ = self.discount_rewards(rewards_)
            #discounted_rewards_ -= np.mean(discounted_rewards_)
            discounted_rewards_ /= np.std(discounted_rewards_)

            #outputs_ *= discounted_rewards_
            for i, r in enumerate(zip(rewards, discounted_rewards_)):
                reward, discounted_reward = r

                if verbose > 1:
                    print(outputs_[i], )

                #outputs_[i] = 0.5 + (2 * outputs_[i] - 1) * discounted_reward
                if discounted_reward < 0:
                    outputs_[i] = 1 - outputs_[i]
                    outputs_[i] = outputs_[i] / sum(outputs_[i])
                outputs_[i] = np.minimum(
                    1,
                    np.maximum(
                        0, predicteds_[i] + (outputs_[i] - predicteds_[i]) *
                        abs(discounted_reward)))

                if verbose > 1:
                    print(predicteds_[i], outputs_[i], reward,
                          discounted_reward)

            # print('inputs_:', '\n', inputs_[0].shape)
            # print('inputs_:', '\n', inputs_[1].shape)
            # print('inputs_:', '\n', inputs_[1])
            # print('outputs_:', '\n', outputs_)
            # print('layers:', '\n', model.layers)
            model.fit(inputs_, outputs_, epochs=1, verbose=0, shuffle=True)
            model.save_weights('model_1.h5')
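
Compared with the earlier variants, this one only acts when the policy is confident: if the largest action probability exceeds threshold it takes the argmax greedily (the sampling line is commented out), otherwise it emits action 2, which the verbose output labels HOLD, and reuses the previous one-hot target last_y. A small sketch of just that selection rule, with illustrative shapes and numbers (not from the repository):

import numpy as np

def select_action(aprob, last_y, threshold=0.5, hold_action=2):
    # Greedy when confident; HOLD (and keep the previous label) otherwise.
    if aprob.max() > threshold:
        action = int(np.argsort(aprob)[-1])   # argmax, as in the code above
        y = np.zeros_like(aprob)
        y[action] = 1.
        return action, y
    return hold_action, last_y

last_y = np.array([0., 1.])
print(select_action(np.array([0.8, 0.2]), last_y))   # -> (0, array([1., 0.]))
print(select_action(np.array([0.5, 0.5]), last_y))   # -> (2, array([0., 1.]))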
Code Example #6
class PolicyGradient:

	def __init__(self, env, discount = 0.99, model_filename = None, history_filename = None, max_memory=100):
		self.env = env
		self.discount = discount
		self.model_filename = model_filename
		self.history_filename = history_filename

		self.max_memory = max_memory 

		# SGD is constructed but not used
		from keras.optimizers import SGD
		self.model = MarketPolicyGradientModelBuilder(self.model_filename).getModel()
		sgd = SGD(lr = 0.1, decay = 1e-6, momentum = 0.9, nesterov = True)
		# rmsprop is used instead
		#self.model.compile(loss='mse', optimizer='rmsprop')
		self.model.compile(loss='binary_crossentropy', optimizer='rmsprop')

	# More detailed explanation: https://blog.csdn.net/heyc861221/article/details/80132054
	def discount_rewards(self, r):
		discounted_r = np.zeros_like(r)
		running_add = 0
		r = r.flatten()

		# work backwards, from the last step to the first
		for t in reversed(range(0, r.size)):
			# TODO: why is running_add reset to 0?
			# This follows the Pong example: in that game a reward only arrives when a round ends.
			# It resets running_add at each round boundary, i.e. whenever the reward is nonzero.
			# Reset the running sum at a game boundary.
#			if r[t] != 0:
#				running_add = 0

			# Expanded, working backwards: at step t, running_add accumulates
			# running_add = r[t] + discount * r[t+1] + discount^2 * r[t+2] + ...
			# i.e. the discounted return sum_n discount^n * r[t+n] from the formula.
			running_add = running_add * self.discount + r[t]
			discounted_r[t] = running_add

		return discounted_r
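	# Quick numeric check of the expansion above (added illustration, discount = 0.99, reset disabled here):
	#   r = [1, 0, 2]  ->  t=2: 2.0;  t=1: 0 + 0.99*2.0 = 1.98;  t=0: 1 + 0.99*1.98 = 2.9602
	#   so discount_rewards returns [2.9602, 1.98, 2.0].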

	def train(self, max_episode = 10, max_path_length = 200, verbose = 0):
		env = self.env
		model = self.model
		avg_reward_sum = 0.

		#f_eps = open("episode.csv","w")
		#write_eps = csv.write(f_eps)

		for e in range(max_episode):
			env._reset()
			observation = env._reset()
			game_over = False
			reward_sum = 0

			inputs = []
			outputs = []
			predicteds = []
			rewards = []

			#f_iter = open("episode_{0}.csv".format(e),"w")
			#write_iter = csv.writer(f_iter)
			f_episode = "episode_{0}.csv".format(e)
			os.system("rm -rf {0}".format(f_episode))

			while not game_over:

				aprob = model.predict(observation)[0]
				inputs.append(observation)
				predicteds.append(aprob)
				
				if aprob.shape[0] > 1:
					action = np.random.choice(self.env.action_space.n, 1, p = aprob / np.sum(aprob))[0]

					y = np.zeros([self.env.action_space.n])
					y[action] = 1.

					outputs.append(y)
				else:
					#action = 0 if np.random.uniform() < aprob else 1

					# if aprob == 1.0, reduce it,
					# because np.random.uniform() returns values in [0, 1)
					m_aprob = 0.9 if aprob == 1.0 else aprob
					action = 0 if np.random.uniform() < m_aprob else 1

					y = [float(action)]
					outputs.append(y)

				observation, reward, actual_reward, game_over, info = self.env._step(action)
				reward_sum += float(actual_reward)

				rewards.append(float(reward))

				# check memory for RNN model
				if len(inputs) > self.max_memory:
					del inputs[0]
					del outputs[0]
					del predicteds[0]
					del rewards[0]


				if verbose > 0:
					if env.actions[action] == "LONG" or env.actions[action] == "SHORT":
					#if env.actions[action] == "LONG" or env.actions[action] == "SHORT" or env.actions[action] == "HOLD":
						color = bcolors.FAIL if env.actions[action] == "LONG" else bcolors.OKBLUE
						print ("%s:\t%s\t%.2f\t%.2f\t" % (info["dt"], color + env.actions[action] + bcolors.ENDC, reward_sum, info["cum"]) + ("\t".join(["%s:%.2f" % (l, i) for l, i in zip(env.actions, aprob.tolist())])))
					#write_iter.writerow("%s:\t%s\t%.2f\t%.2f\t" % (info["dt"], env.actions[action], reward_sum, info["cum"]) + ("\t".join(["%s:%.2f" % (l, i) for l, i in zip(env.actions, aprob.tolist())])))
					os.system("echo %s >> %s" % ("%s:\t%s\t%.2f\t%.2f\t" % (info["dt"], env.actions[action], reward_sum, info["cum"]) + ("\t".join(["%s:%.2f" % (l, i) for l, i in zip(env.actions, aprob.tolist())])),
							  f_episode))

			#write_iter.close()

				avg_reward_sum = avg_reward_sum * 0.99 + reward_sum * 0.01
				toPrint = "%d\t%s\t%s\t%.2f\t%.2f" % (e, info["code"], (bcolors.FAIL if reward_sum >= 0 else bcolors.OKBLUE) + ("%.2f" % reward_sum) + bcolors.ENDC, info["cum"], avg_reward_sum)
				print (toPrint)
				if self.history_filename != None:
					os.system("echo %s >> %s" % (toPrint, self.history_filename))


				dim = len(inputs[0])
				inputs_ = [[] for i in range(dim)]
				for obs in inputs:
					for i, block in enumerate(obs):
						inputs_[i].append(block[0])
				inputs_ = [np.array(inputs_[i]) for i in range(dim)]

				outputs_ = np.vstack(outputs)
				predicteds_ = np.vstack(predicteds)
				rewards_ = np.vstack(rewards)

				discounted_rewards_ = self.discount_rewards(rewards_)
				# TODO: skipping the mean subtraction should also work.
				# After shifting by the mean, a small negative value can turn positive, i.e. its sign flips.
				#discounted_rewards_ -= np.mean(discounted_rewards_)
				if np.std(discounted_rewards_) != 0.:
					discounted_rewards_ /= np.std(discounted_rewards_)

				#outputs_ *= discounted_rewards_
				for i, r in enumerate(zip(rewards, discounted_rewards_)):
					reward, discounted_reward = r

					if verbose > 1:
#						print (outputs_[i],)
						print (outputs_[i],)
					
					#outputs_[i] = 0.5 + (2 * outputs_[i] - 1) * discounted_reward
					# Adjust the outputs: when reward < 0 (a losing step), invert all of the outputs.
					# 
					#if discounted_reward < 0:
					#	outputs_[i] = 1 - outputs_[i]
					#	outputs_[i] = outputs_[i] / sum(outputs_[i])

					# The gradient of log-softmax after differentiation?
					# http://vsooda.github.io/2017/03/14/softmax-logistic/
					# For a softmax layer, the backward gradient is simply the predicted probability minus the label.
					#outputs_[i] = np.minimum(1, np.maximum(0, predicteds_[i] + (outputs_[i] - predicteds_[i]) * abs(discounted_reward)))
					outputs_[i] = np.minimum(1, np.maximum(0, predicteds_[i] + (outputs_[i] - predicteds_[i]) * discounted_reward))

					if verbose > 0:
						print (predicteds_[i], outputs_[i], reward, discounted_reward)

				print("fit model input.shape %s, output.shape %s" %( [inputs_[i].shape for i in range(len(inputs_))], outputs_.shape))
				
				np.set_printoptions(linewidth=200, suppress=True)
				print("currentTargetIndex:", env.currentTargetIndex)
				#print(inputs_)
				model.fit(inputs_, outputs_, nb_epoch = 1, verbose = 0, shuffle = True)
				model.save_weights(self.model_filename)
Code Example #7
class PolicyGradient:

	def __init__(self, env, discount = 0.99, model_filename = None, history_filename = None):
		self.env = env
		self.discount = discount
		self.model_filename = model_filename
		self.history_filename = history_filename

		from keras.optimizers import SGD
		self.model = MarketPolicyGradientModelBuilder(self.model_filename).getModel()
		sgd = SGD(lr = 0.1, decay = 1e-6, momentum = 0.9, nesterov = True)
		self.model.compile(loss='mse', optimizer='rmsprop')

	def discount_rewards(self, r):
		discounted_r = np.zeros_like(r)
		running_add = 0
		r = r.flatten()

		for t in reversed(xrange(0, r.size)):
			if r[t] != 0:
				running_add = 0

			running_add = running_add * self.discount + r[t]
			discounted_r[t] = running_add

		return discounted_r

	def train(self, max_episode = 1000, max_path_length = 200, verbose = 0):
		env = self.env
		model = self.model
		avg_reward_sum = 0.
		target_close = env.get_close()

		for e in xrange(max_episode):
			env.reset()
			observation = env.reset()
			game_over = False
			reward_sum = 0
			cum_profit = {}
			pre_action = {}
			inputs = []
			outputs = []
			predicteds = []
			rewards = []

			while not game_over:
				aprob = model.predict(observation)[0]
				inputs.append(observation)
				predicteds.append(aprob) 
				
				if aprob.shape[0] > 1:
					action = np.random.choice(self.env.action_space.n, 1, p = aprob / np.sum(aprob))[0]
					y = np.zeros([self.env.action_space.n])
					y[action] = 1.
					outputs.append(y)
				else:
					action = 0 if np.random.uniform() < aprob else 1

					y = [float(action)]
					outputs.append(y)

				observation, reward, game_over, info = self.env.step(action)
				reward_sum += float(reward)
				cum_profit[info['dt']] = reward_sum
				rewards.append(float(reward))

				if verbose > 0:
					if env.actions[action] == "LONG" or env.actions[action] == "SHORT":
						pre_action[info['dt']] = env.actions[action]
						color = bcolors.FAIL if env.actions[action] == "LONG" else bcolors.OKBLUE
						print "%s:\t%s\t%d\t%.2f\t%.2f\t" % (info["dt"], color + env.actions[action] \
							+ bcolors.ENDC, info['correct_action'], reward_sum, info["cum"]) + \
						("\t".join(["%s:%.2f" % (l, i) for l, i in zip(env.actions, aprob.tolist())]))

			avg_reward_sum = avg_reward_sum * 0.99 + reward_sum * 0.01
			toPrint = "%d\t%s\t%.2f\t%.2f" % (e,  (bcolors.FAIL if reward_sum >= 0 else bcolors.OKBLUE) + \
				("%.2f" % reward_sum) + bcolors.ENDC, info["cum"], avg_reward_sum)
			print toPrint
			if self.history_filename != None:
				os.system("echo %s >> %s" % (toPrint, self.history_filename))


			dim = len(inputs[0])
			inputs_ = [[] for i in xrange(dim)]
			for obs in inputs:
				for i, block in enumerate(obs):
					inputs_[i].append(block[0])
			inputs_ = [np.array(inputs_[i]) for i in xrange(dim)]

			outputs_ = np.vstack(outputs)  # 0 or 1
			predicteds_ = np.vstack(predicteds)  # probability
			rewards_ = np.vstack(rewards)

			discounted_rewards_ = self.discount_rewards(rewards_)
			#discounted_rewards_ -= np.mean(discounted_rewards_)
			discounted_rewards_ /= np.std(discounted_rewards_)

			#outputs_ *= discounted_rewards_
			for i, r in enumerate(zip(rewards, discounted_rewards_)):
				reward, discounted_reward = r

				if verbose > 1:
					print outputs_[i],
				
				#outputs_[i] = 0.5 + (2 * outputs_[i] - 1) * discounted_reward
				if discounted_reward < 0:
					outputs_[i] = 1 - outputs_[i]
					outputs_[i] = outputs_[i] / sum(outputs_[i])
				outputs_[i] = np.minimum(1, np.maximum(0, predicteds_[i] + \
					(outputs_[i] - predicteds_[i]) * abs(discounted_reward)))

				if verbose > 1:
					print predicteds_[i], outputs_[i], reward, discounted_reward

			model.fit(inputs_, outputs_, nb_epoch = 1, verbose = 0, shuffle = True)

			if(e % 5 == 0 and e != 0):
				test_util.plot_profit(cum_profit, target_close, pre_action, "pg_train_"+str(e))
				test_util.get_test_performance(e,'model_pg.h5', model)
				model.save_weights("model_pg.h5" if self.model_filename == None else self.model_filename, overwrite=True)