def step(self, env: Environment) -> Tuple[EpisodeStep, Reward]:
    # Query the environment and run the observation through the policy network
    state = env.get_observation()
    action_vector = self.net(torch.FloatTensor(state))
    # dim=0: action_vector is a 1-D tensor for a single observation
    softmax = nn.Softmax(dim=0)
    act_probs = softmax(action_vector).detach().numpy()
    # Sample an action from the policy distribution
    action = np.random.choice(act_probs.size, p=act_probs)
    episode_step = EpisodeStep(state=state, action=action)
    reward: Reward = env.action(action)
    return episode_step, reward
def play_episode(self, env: Environment) -> Episode:
    env.reset()
    episode_steps = []
    total_reward: Reward = 0.0
    # Step through the environment until the episode terminates
    while not env.is_done():
        episode_step, reward = self.step(env)
        episode_steps.append(episode_step)
        total_reward += reward
    episode = Episode(steps=episode_steps, reward=total_reward)
    return episode
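# --- Illustrative usage (not from the original snippet) -------------------
# The two methods above presumably belong to an agent whose policy network is
# `self.net` and that returns Episode/EpisodeStep records. A minimal, hedged
# sketch of how such episodes might be collected and filtered by total reward
# (e.g. in a cross-entropy-method style loop) follows; the helper names,
# batch size, and percentile are assumptions for illustration only.
import numpy as np


def collect_batch(agent, env, batch_size=16):
    """Play batch_size episodes using the agent's play_episode() method."""
    return [agent.play_episode(env) for _ in range(batch_size)]


def filter_batch(batch, percentile=70):
    """Keep only episodes whose total reward clears the given percentile."""
    rewards = [episode.reward for episode in batch]
    reward_bound = np.percentile(rewards, percentile)
    return [episode for episode in batch if episode.reward >= reward_bound]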
def train():
    env = Environment()
    player = Player()
    for episode in range(2):
        env.start()
        while True:
            actions = player.act(env.state)
            reward = env.apply(actions)
            player.learn(reward)
            if env.done:
                break
    return
class Core:
    def __init__(self):
        self.game_count = 0
        self.events = Events()
        self.text_renderer = TextRenderer()
        # empty declarations for linting
        self.balls = None
        self.env = None
        return

    def new_game(self):
        self.game_count += 1
        self.balls = self.new_balls()
        self.env = Environment(self.balls)

    def new_balls(self):
        return [Ball() for _ in range(settings.num_balls)]

    def update(self):
        self.events.update()
        settings.update(self.events)
        # only cycle through balls alive in the environment for optimization
        for ball in self.env.balls:
            ball.update(self.events)
        self.env.update(self.events)

    def game_over(self):
        return self.env.game_over()

    def draw(self):
        surface = self.env.get_surface()
        if self.events.info:
            surface.blit(self.get_info_surface(), (0, 0))
        return surface

    def get_info_surface(self):
        texts = [
            " Game: {}".format(self.game_count),
            " Score: {}".format(self.env.score),
            " Alive: {}".format(self.env.num_alive)
        ]
        return self.text_renderer.texts_to_surface(texts)
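# --- Illustrative usage (not from the original snippet) -------------------
# A minimal sketch of a main loop that could drive the Core class above; the
# pygame window size, frame rate, and import path are assumptions for
# illustration. Core's own Events() object is assumed to handle input, so
# this loop only advances the game and draws each frame.
import pygame

from core import Core  # hypothetical module path


def main():
    pygame.init()
    screen = pygame.display.set_mode((800, 600))
    clock = pygame.time.Clock()

    core = Core()
    core.new_game()

    while not core.game_over():
        core.update()
        # Core.draw() returns a surface that is blitted to the window
        screen.blit(core.draw(), (0, 0))
        pygame.display.flip()
        clock.tick(60)

    pygame.quit()


if __name__ == "__main__":
    main()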
def __init__(self, size=0, mountpoint=False, logger=False, environ=False):
    """
    Initialize the helper, creating a logger and environment
    if none are supplied.
    """
    #####
    # Version/timestamp is
    # <YYYY><MM><DD>.<HH><MM><SS>.<microseconds>
    # in UTC time
    self.module_version = '20160224.032043.009191'
    if not logger:
        self.logger = CyLogger()
    else:
        self.logger = logger
    if not environ:
        self.environ = Environment()
    else:
        self.environ = environ
    self.chkApp = CheckApplicable(self.environ, self.logger)
def deploy_bcf(config, fuel_cluster_id):
    # Deploy setup node
    Helper.safe_print("Start to prepare setup node\n")
    env = Environment(config, fuel_cluster_id)
    Helper.common_setup_node_preparation(env)

    # Generate detailed node information
    Helper.safe_print("Start to setup Big Cloud Fabric\n")
    nodes_yaml_config = None
    if 'nodes' in config:
        nodes_yaml_config = config['nodes']
    node_dic = Helper.load_nodes(nodes_yaml_config, env)

    # Generate scripts for each node
    for hostname, node in node_dic.iteritems():
        if node.os == const.CENTOS:
            Helper.generate_scripts_for_centos(node)
        elif node.os == const.UBUNTU:
            Helper.generate_scripts_for_ubuntu(node)
        with open(const.LOG_FILE, "a") as log_file:
            log_file.write(str(node))
        if node.skip:
            Helper.safe_print("skip node %(hostname)s due to %(error)s\n" %
                              {'hostname': hostname, 'error': node.error})
            continue
        node_q.put(node)

    # Use multiple threads to setup nodes
    for i in range(const.MAX_WORKERS):
        t = threading.Thread(target=worker_setup_node)
        t.daemon = True
        t.start()
    node_q.join()

    Helper.safe_print("Big Cloud Fabric deployment finished! "
                      "Check %(log)s on each node for details.\n" %
                      {'log': const.LOG_FILE})
def __init__(self, conf, parent=None):
    """
    Initialization method...

    @author: Roy Nielsen
    """
    super(VirtualMachineBuilder, self).__init__(parent)
    self.ui = Ui_MainWindow()
    self.ui.setupUi(self)

    #####
    # initialization of class variables.
    self.conf = conf
    self.conf.loggerSelf()
    self.logger = self.conf.getLogger()
    self.environ = Environment()
    #self.logger = self.conf.get_logger()
    self.logger.log(lp.DEBUG, str(self.logger))
    self.runWith = RunWith(self.logger)
    self.libc = getLibc(self.logger)

    #####
    # Set label states
    self.ui.packerLabel.setText(
        "( <a href='https://www.packer.io'>https://www.packer.io</a> - "
        "Download and install packer separately )")
    self.ui.boxcutterLabel.setText(
        "( <a href='https://github.com/boxcutter'>https://github.com/boxcutter</a> - "
        "Clone repos separately )")

    #####
    # Handle button box
    self.ui.buttonBox.button(
        QtWidgets.QDialogButtonBox.Close).clicked.connect(
            self.closeApplication)
    self.ui.buttonBox.button(
        QtWidgets.QDialogButtonBox.Ok).clicked.connect(self.processVm)

    #####
    # Rename Save button
    self.ui.buttonBox.button(
        QtWidgets.QDialogButtonBox.Save).setText("Configure Repos")
    btn = self.ui.buttonBox.button(QtWidgets.QDialogButtonBox.Save)
    btn.clicked.connect(self.configureRepos)

    #####
    # Rename Apply button
    self.ui.buttonBox.button(
        QtWidgets.QDialogButtonBox.Apply).setText("Install packer")
    btnTwo = self.ui.buttonBox.button(QtWidgets.QDialogButtonBox.Apply)
    btnTwo.clicked.connect(self.installPacker)
    btnTwo.hide()

    self.chkApp = CheckApplicable(self.environ, self.logger)
    self.macOsBlackListApplicable = {
        'type': 'black',
        'os': {'Mac OS X': ['10.0.0', 'r', '20.12.10']}
    }
    self.linuxWhitelistApplicable = {'type': 'white', 'family': 'linux'}
    self.freebsdWhitelistApplicable = {'type': 'white', 'family': 'freebsd'}
    self.macosWhitelistApplicable = {'type': 'white', 'family': 'darwin'}
    #openbsdWhitelistApplicable = {}
    #windowsWhitelistApplicable = {}

    #####
    # Set up the configure dialog
    self.configRepos = ConfigureRepos(self.conf)
    self.configRepos.setWindowTitle("Configure Repos")

    #####
    # Connect the configure 'done' signal to the refreshFamilyComboBox slot
    self.configRepos.doneConfigure.connect(self.refreshFamilyComboBox)

    #####
    # Signal/slot to deal with osFamily combo box change
    self.ui.osFamily.currentIndexChanged.connect(self.osFamilySelected)

    self.refreshFamilyComboBox()
    self.osFamilySelected(0)

    self.logger.log(lp.DEBUG, "Done with VirtualMachineBuilder init...")
def __init__(self, conf, parent=None):
    """
    Initialization method...

    @author: Roy Nielsen
    """
    super(ConfigureRepos, self).__init__(parent)
    self.ui = Ui_Dialog()
    self.ui.setupUi(self)

    #####
    # initialization of class variables.
    self.conf = conf
    self.environ = Environment()
    self.conf.loggerSelf()
    self.logger = self.conf.getLogger()
    #self.logger = self.conf.get_logger()
    self.logger.log(lp.DEBUG, str(self.logger))
    self.runWith = RunWith(self.logger)
    self.libc = getLibc(self.logger)

    self.chkApp = CheckApplicable(self.environ, self.logger)
    macOsWhiteListApplicable = {
        'type': 'white',
        'os': {'Mac OS X': ['10.0.0', 'r', '20.12.10']}
    }

    #####
    # Handle button box
    self.ui.buttonBox.button(
        QtWidgets.QDialogButtonBox.Cancel).clicked.connect(self.close)
    self.ui.buttonBox.button(
        QtWidgets.QDialogButtonBox.Ok).clicked.connect(self.okDone)

    #####
    # Handle other buttons
    self.ui.downloadReposButton.clicked.connect(self.downloadRepos)
    self.ui.prepareIsoButton.clicked.connect(self.prepareIso)
    self.ui.gitResetHardButton.clicked.connect(self.resetRepos)
    self.ui.gitPullButton.clicked.connect(self.updateRepos)

    if self.chkApp.isApplicable(macOsWhiteListApplicable):
        self.ui.prepareIsoButton.clicked.connect(self.prepareIso)
    else:
        self.ui.prepareIsoButton.hide()
        self.ui.macosCheckBox.hide()

    #####
    # default boxcutter repo path
    self.reposRoot = self.conf.getRepoRoot()

    #####
    # Future features
    self.ui.winCheckBox.hide()
    self.ui.label_2.hide()
    self.ui.leReposPath.hide()
    self.ui.proxyButton.hide()

    self.git = "/usr/bin/git"

    #####
    # repos
    self.repos2process = []

    self.getSelected()
def deploy_bcf(config, mode, fuel_cluster_id, rhosp, tag, cleanup,
               verify, verify_only, skip_ivs_version_check,
               certificate_dir, certificate_only, generate_csr,
               support, upgrade_dir, offline_dir, sriov):
    # Deploy setup node
    safe_print("Start to prepare setup node\n")
    env = Environment(config, mode, fuel_cluster_id, rhosp, tag, cleanup,
                      skip_ivs_version_check, certificate_dir, upgrade_dir,
                      offline_dir, sriov)
    Helper.common_setup_node_preparation(env)
    controller_nodes = []

    # Generate detailed node information
    safe_print("Start to setup Big Cloud Fabric\n")
    nodes_yaml_config = config['nodes'] if 'nodes' in config else None
    node_dic = Helper.load_nodes(nodes_yaml_config, env)

    if upgrade_dir:
        return upgrade_bcf(node_dic)

    if sriov:
        return setup_sriov(node_dic)

    if generate_csr:
        safe_print("Start to generate csr for virtual switches.\n")
        # create ~/csr and ~/key directory
        Helper.run_command_on_local("mkdir -p %s" % const.CSR_DIR)
        Helper.run_command_on_local("mkdir -p %s" % const.KEY_DIR)
        for hostname, node in node_dic.iteritems():
            if node.skip:
                safe_print("skip node %(fqdn)s due to %(error)s\n" %
                           {'fqdn': node.fqdn, 'error': node.error})
                continue
            if node.tag != node.env_tag:
                safe_print("skip node %(fqdn)s due to mismatched tag\n" %
                           {'fqdn': node.fqdn})
                continue
            if (node.deploy_mode == const.T6 and
                    node.role == const.ROLE_COMPUTE):
                Helper.generate_csr(node)
        safe_print("Finish generating csr for virtual switches.\n")
        return

    # copy neutron config from neutron server to setup node
    for hostname, node in node_dic.iteritems():
        if node.role == const.ROLE_NEUTRON_SERVER:
            controller_nodes.append(node)
    Helper.copy_neutron_config_from_controllers(controller_nodes)

    # check if vlan is the tenant network type for fuel environment
    if not Helper.check_if_vlan_is_used(controller_nodes):
        safe_print("tenant network type is not vlan. Stop deploying.\n")
        return

    # prepare keystone client from /etc/neutron/api-paste.ini
    # Helper.prepare_keystone_client(controller_nodes)

    # Generate scripts for each node
    for hostname, node in node_dic.iteritems():
        if support:
            support_node_q.put(node)
        if node.skip:
            safe_print("skip node %(fqdn)s due to %(error)s\n" %
                       {'fqdn': node.fqdn, 'error': node.error})
            continue
        if node.tag != node.env_tag:
            safe_print("skip node %(fqdn)s due to mismatched tag\n" %
                       {'fqdn': node.fqdn})
            continue
        if node.os == const.CENTOS:
            Helper.generate_scripts_for_centos(node)
        elif node.os == const.UBUNTU:
            Helper.generate_scripts_for_ubuntu(node)
        elif node.os == const.REDHAT:
            Helper.generate_scripts_for_redhat(node)
        if node.role == const.ROLE_NEUTRON_SERVER:
            controller_node_q.put(node)
        else:
            # python doesn't have deep copy for Queue, hence add to all
            node_q.put(node)
            verify_node_q.put(node)
            if (node.deploy_mode == const.T6 and
                    node.role == const.ROLE_COMPUTE):
                certify_node_q.put(node)
        if node.rhosp:
            Helper.chmod_node(node)

    with open(const.LOG_FILE, "a") as log_file:
        version = Helper.run_command_on_local("pip show bosi")
        log_file.write(str(version))
        for hostname, node in node_dic.iteritems():
            log_file.write(str(node))

    if support:
        safe_print("Start to collect logs.\n")
        # copy installer logs to ~/support
        Helper.run_command_on_local("mkdir -p %s" % const.SUPPORT_DIR)
        Helper.run_command_on_local("cp -r %(src)s %(dst)s" % {
            "src": const.LOG_FILE,
            "dst": const.SUPPORT_DIR})
        Helper.run_command_on_local(
            "cp -r %(setup_node_dir)s/%(generated_script_dir)s %(dst)s" % {
                "setup_node_dir": env.setup_node_dir,
                "generated_script_dir": const.GENERATED_SCRIPT_DIR,
                "dst": const.SUPPORT_DIR})
        for i in range(const.MAX_WORKERS):
            t = threading.Thread(target=support_node_setup,
                                 args=(support_node_q,))
            t.daemon = True
            t.start()
        support_node_q.join()
        # compress ~/support
        Helper.run_command_on_local("cd /tmp; tar -czf support.tar.gz support")
        safe_print("Finish collecting logs. logs are at /tmp/support.tar.gz.\n")
        return

    # in case of verify_only or certificate_only, do not deploy
    if (not verify_only) and (not certificate_only):
        # Use single thread to setup controller nodes
        t = threading.Thread(target=worker_setup_node,
                             args=(controller_node_q,))
        t.daemon = True
        t.start()
        controller_node_q.join()

        # Use multiple threads to setup compute nodes
        for i in range(const.MAX_WORKERS):
            t = threading.Thread(target=worker_setup_node, args=(node_q,))
            t.daemon = True
            t.start()
        node_q.join()

        sorted_time_dict = OrderedDict(
            sorted(time_dict.items(), key=lambda x: x[1]))
        for fqdn, h_time in sorted_time_dict.items():
            safe_print("node: %(fqdn)s, time: %(time).2f\n" %
                       {'fqdn': fqdn, 'time': h_time})
        safe_print("Big Cloud Fabric deployment finished! "
                   "Check %(log)s on each node for details.\n" %
                   {'log': const.LOG_FILE})

    if certificate_dir or certificate_only:
        # certify each node
        safe_print("Start to certify virtual switches.\n")
        for i in range(const.MAX_WORKERS):
            t = threading.Thread(target=certify_node_setup,
                                 args=(certify_node_q,))
            t.daemon = True
            t.start()
        certify_node_q.join()
        safe_print('Certifying virtual switches done.\n')

    if verify or verify_only:
        # verify each node and post results
        safe_print("Verifying deployment for all compute nodes.\n")
        for i in range(const.MAX_WORKERS):
            t = threading.Thread(target=verify_node_setup,
                                 args=(verify_node_q,))
            t.daemon = True
            t.start()
        verify_node_q.join()

        # print status
        # success nodes
        safe_print('Deployed successfully to: \n')
        for node_element in node_pass:
            safe_print(node_element + '\n')
        # failed nodes
        safe_print('Deployment to following failed: \n')
        for node_element in node_fail:
            safe_print(str(node_element) + ' : ' +
                       str(node_fail[node_element]) + '\n')
from lib.agent import Agent
from lib.environment import Environment

num_actions = 12

agent = Agent(num_actions)
environment = Environment()

done = 0
environment.start()
while done != 1:
    action = agent.choose_action()
    environment.sendAction(action)
    reward, done = environment.getState()
environment.exit()
def __init__(self, **kwargs):
    """
    Variables that can be passed in:
    logger
    userName
    userShell
    userComment
    userUid
    userPriGid
    userHomeDir
    """
    if 'logDispatcher' not in kwargs:
        raise ValueError(
            "Variable 'logDispatcher' a required parameter for " +
            str(self.__class__.__name__))
    else:
        self.logger = kwargs.get('logDispatcher')

    if 'userName' not in kwargs:
        self.userName = ""
    else:
        self.userName = kwargs.get('userName')

    if 'userShell' not in kwargs:
        self.userShell = "/bin/bash"
    else:
        self.userShell = kwargs.get('userShell')

    if 'userComment' not in kwargs:
        self.userComment = ""
    else:
        self.userComment = kwargs.get('userComment')

    if 'userUid' not in kwargs:
        self.userUid = 10000
    else:
        self.userUid = kwargs.get('userUid')

    if 'userPriGid' not in kwargs:
        self.userPriGid = 20
    else:
        self.userPriGid = kwargs.get('userPriGid')

    if 'userHomeDir' not in kwargs:
        self.userHomeDir = ""
    else:
        self.userHomeDir = kwargs.get('userHomeDir')

    self.module_version = '20160225.125554.540679'

    #####
    # Acquire the environment
    self.environ = Environment()

    #####
    # THIS IS A LIBRARY, SO LOGS SHOULD BE INITIALIZED ELSEWHERE...
    # self.logger.initializeLogs()
    self.logger.log(lp.INFO, "Logger: " + str(self.logger))

    #####
    # Initialize the RunWith helper for executing shelled out commands.
    self.runWith = RunWith(self.logger)
def run(self):
    """
    Run the model
    """
    with tf.device(self.device):
        # The learning rate is sampled from a
        # log-uniform distribution between
        # 0.0001 and 0.005. Then, it is
        # decayed linearly to 0 progressively
        # during training
        initial_learning_rate = log_uniform(self.initial_alpha_low,
                                            self.initial_alpha_high,
                                            0.5)

        # Whether to terminate, pause or keep training
        self.stop = False
        self.terminate = False

        # Initialize global time step
        self.global_t = 0

        # Number of actions the agent can take
        action_size = Environment.get_action_size(self.env)

        # Initialize the shared/global network
        self.global_network = A3C(action_size,
                                  thread_index=-1,
                                  device=self.device)
        # Build computational graph
        self.global_network._create_network()

        # Placeholder for the Trainers
        self.trainers = []
        learning_rate_input = tf.placeholder("float")

        # Initialize the RMSPROP object for the updates
        grad_applier = RMSPropApplier(learning_rate_input)

        # Build the agents
        for i in range(self.parallel_size):
            trainer = Trainer(thread_index=i,
                              global_network=self.global_network,
                              initial_learning_rate=initial_learning_rate,
                              grad_applier=grad_applier,
                              learning_rate=learning_rate_input)
            if i == 0:
                trainer.show_env = True
            self.trainers.append(trainer)

        # Initialize Session
        self.sess = tf.Session()
        self.sess.run(tf.global_variables_initializer())

        # Params for logging scores in Tensorboard
        self.score_input = tf.placeholder(tf.int32)
        tf.summary.scalar("score", self.score_input)
        self.summary_op = tf.summary.merge_all()

        # sess.graph contains the graph definition;
        # that enables the Graph Visualizer. To start
        # Tensorboard run the following command:
        # $ tensorboard --logdir=path/to/LOG_FILE
        self.summary_writer = tf.summary.FileWriter(LOG_FILE,
                                                    graph=self.sess.graph)

        # Parameters for saving the global network params
        self.saver = tf.train.Saver(var_list=self.global_network.get_vars(),
                                    max_to_keep=1)

        # Set next checkpoint
        self.next_checkpoint = self.checkpoint_interval

        # Set next log point
        self.next_log = self.logging_interval

        # -----------
        # RUN THREADS
        # -----------
        self.train_threads = []
        for i in range(self.parallel_size):
            self.train_threads.append(
                threading.Thread(target=self.train, args=(i, True)))

        for t in self.train_threads:
            t.start()
# A file is created at the path given by the path parameter.
# op.createFile(path="lib", fileName="test.swift", content="hello")

# A folder is created under the default path given in __init__.
# op.createFolder(folderName="hello/1")

# path is valid
# print(op.isExist("/Users/umut/Desktop/Architecture/CodeGenerationCore/lib"))

# append content to a file
# op.appendFile(fileName="test.swift", content="\nworld")

# logging is disabled from here on
# Environment.Shared().online()
print(type(Environment.Shared().online()))

# remove file
# op.removeFile(fileName=fileName)
# op.createFile(fileName=fileName, content="hello")

# print(MESSAGE.ERROR)
# print(DEV_ENV.LOCAL)
# print(CODE.SLASH)

# log samples
# Log.i(message=MESSAGE.INFO)
# Log.s(MESSAGE.SUCCESS)
# Log.e(MESSAGE.ERROR)
import numpy as np
import sys
import tensorflow as tf

from lib.agent import DeepQLearningAgent
from lib.environment import Environment
from lib.experience import Experience

tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.FATAL)

f = open("trace.txt", "w")

print(f'starting training')

accuracy = 0
num_actions = 5
num_episodes = 100000

environment = Environment()
agent = DeepQLearningAgent(environment.frameset_size, num_actions,
                           environment.hot_encode_action_size)
agent.saveModel(0)
agent.loadModel('policy_network_model_0.h5')

for episode in range(1, num_episodes):
    done = 0
    rewards_current_episode = 0
    position_current_episode = 0
    level_position = 0
    last_frameset = np.zeros(environment.frameset_size)
    last_last_actions = np.zeros(environment.hot_encode_action_size)
    environment.start()
    first_random_action = np.random.choice(range(agent.num_actions))
    environment.sendAction(first_random_action)
# Author: Umut Boz
# Copyright (c) 2020, OneframeMobile, KoçSistem
# Email: [email protected]
############################################################
# Version: 0.1.0
############################################################

from lib.enums import MessageType
from lib.enums import CodeLine
from lib.log import Log
from lib.environment import Environment
from lib.httpOperation3 import HttpOperation3

MESSAGE = MessageType()
CODE = CodeLine()

# close log
Environment.Shared().online()

url = "https://petstore.swagger.io/v2/swagger.json"

op = HttpOperation3()
jsonData = op.request(url=url).jsonParse()

op2 = HttpOperation3(url=url)
jsonData = op2.request().jsonParse()

op3 = HttpOperation3()
print(op3.fetch(url=url))

print(jsonData["swagger"])
class Trainer(object):
    """
    Class for Training a Local Network / ONE agent
    """

    def __init__(self, thread_index, global_network, initial_learning_rate,
                 learning_rate, grad_applier, show_env=False,
                 local_t_max=20, max_global_time_step=10 * 10**7,
                 gamma=0.99, save_interval_step=100 * 1000,
                 env='Breakout-v0', device='/cpu:0'):
        self.thread_index = thread_index
        self.learning_rate = learning_rate
        self.env = env

        # Whether to render the environment
        # or not during training (default is
        # True for one of the agents) - change
        # this in main.py
        self.show_env = show_env

        # Discount factor for the reward
        self.gamma = gamma

        # Number of "epochs"
        self.max_global_time_step = max_global_time_step

        # Number of steps for the LSTM
        self.local_t_max = local_t_max

        # Number of actions the agent can take
        self.action_size = Environment.get_action_size(env)
        self.local_network = A3C(self.action_size, self.thread_index, device)
        self.global_network = global_network

        # Build computational graph
        self.local_network._create_network()

        # Build computational graph for the losses
        # and gradients
        self.local_network.prepare_a3c_loss()
        self.apply_gradients = grad_applier.minimize_local(
            self.local_network.a3c_loss,
            global_network.get_vars(),
            self.local_network.get_vars())

        # Sync the weights of the local network with those
        # of the main network
        self.sync = self.local_network.sync_from(global_network)

        # Initialize time step, learning rate, etc
        self.local_t = 0
        self.initial_learning_rate = initial_learning_rate
        self.episode_reward = 0

    def build_environment(self):
        """
        Create the environment
        """
        self.environment = Environment(self.env, show_env=self.show_env)

    def stop(self):
        """
        Terminate the environment
        """
        self.environment.stop()

    def _record_score(self, sess, summary_writer, summary_op, score_input,
                      score, global_t):
        """
        Save Score to Tensorboard
        """
        summary_str = sess.run(summary_op, feed_dict={score_input: score})
        summary_writer.add_summary(summary_str, global_t)
        # Write to disk
        summary_writer.flush()

    def choose_action(self, pi_values):
        """
        Sample from the learned policy distribution

        :param pi_values: Probability distribution over all actions
        """
        return np.random.choice(range(len(pi_values)), p=pi_values)

    def concat_action_reward(self, action, action_size, reward):
        """
        Return one hot vectored action and reward.
        """
        action_reward = np.zeros([action_size + 1], dtype='float32')
        action_reward[action] = 1.0
        action_reward[-1] = float(reward)
        return action_reward

    def _decay_learning_rate(self, global_time_step):
        """
        Decay the learning rate linearly
        """
        time_left = self.max_global_time_step - global_time_step
        learning_rate = self.initial_learning_rate * time_left \
            / self.max_global_time_step
        # Clip learning rate at 0.0
        if learning_rate < 0.0:
            learning_rate = 0.0
        return learning_rate

    def _process_a3c(self, sess, global_t, summary_writer, summary_op,
                     score_input):
        """
        Process max_local_t steps/frames in the A3C network

        :param sess: TensorFlow session object
        :param global_t: Global time step (number of steps
            processed by the global/shared network)
        """
        # States of the LSTM
        states = []
        last_action_rewards = []
        actions = []
        rewards = []
        values = []

        # Synchronize with global network
        sess.run(self.sync)

        # Initial local time step
        self.local_t = 0

        # Whether we hit a terminal state or not
        terminal_end = False

        start_lstm_state = self.local_network.lstm_state_out

        # Loops local_t_max time steps
        for _ in range(self.local_t_max):
            last_action = self.environment.last_action
            last_reward = self.environment.last_reward
            last_action_reward = self.concat_action_reward(
                last_action, self.action_size, last_reward)

            # Compute policy and value function
            pi_, value_ = self.local_network.run_pi_value(
                sess, self.environment.last_state, last_action_reward)

            # Pick an action given the new computed policy
            action = self.choose_action(pi_)

            # Append results to placeholders...
            states.append(self.environment.last_state)
            last_action_rewards.append(last_action_reward)
            actions.append(action)
            values.append(value_)

            # Process next action
            new_state, reward, terminal = self.environment.process(action)

            rewards.append(reward)
            self.episode_reward += reward

            self.local_t += 1

            if terminal:
                # Environment hit a terminal state
                terminal_end = True

                # ----------------
                # PRINT STATISTICS
                # ----------------
                print('Time step: %5d k - Score: %3d' %
                      (global_t / 1000, self.episode_reward))

                self._record_score(sess, summary_writer, summary_op,
                                   score_input, self.episode_reward,
                                   global_t)

                # If we hit a terminal state, then the
                # reward is set to 0, else, it is set
                # to the value function
                self.episode_reward = 0
                self.environment.reset()
                self.local_network.reset_state()
                break

        # ---------
        # BACK-PROP
        # ---------
        # We discount the rewards from t - 1 to t_start. At
        # time step t the reward is either 0 (if terminal state)
        # or V (non terminal state)
        R = 0.0
        if not terminal_end:
            R = self.local_network.run_last_value(sess, new_state,
                                                  last_action_reward)

        # Reverse placeholders
        actions.reverse()
        states.reverse()
        rewards.reverse()
        values.reverse()

        # To compute the gradients we compute a minibatch of
        # length local_t_max
        batch_s = []
        batch_a = []
        batch_adv = []
        batch_R = []

        # For printing
        R_non_discounted = R

        # Discounting...
        for (ai, ri, si, Vi) in zip(actions, rewards, states, values):
            R = ri + self.gamma * R
            adv = R - Vi
            a = np.array([0] * self.action_size)
            a[ai] = 1.0

            batch_s.append(si)
            batch_a.append(a)
            # Convert np.array -> float because
            # the advantage and reward placeholders
            # expect shape [None, ] not [None, 1]
            batch_adv.append(float(adv))
            batch_R.append(float(R))

        batch_s.reverse()
        batch_a.reverse()
        batch_adv.reverse()
        batch_R.reverse()

        # Decay learning rate
        cur_learning_rate = self._decay_learning_rate(global_t)

        # Create feed_dict for gradient_applier
        feed_dict = {
            self.local_network.input: batch_s,
            self.local_network.last_action_reward: last_action_rewards,
            self.local_network.a: batch_a,
            self.local_network.adv: batch_adv,
            self.local_network.R: batch_R,
            self.local_network.lstm_state: start_lstm_state,
            self.learning_rate: cur_learning_rate
        }

        # compute gradients and update weights
        sess.run(self.apply_gradients, feed_dict=feed_dict)

        """
        # ----------------
        # PRINT STATISTICS
        # ----------------
        # Compute losses
        total_loss, policy_loss, value_loss = \
            self.local_network.run_losses(sess, feed_dict)
        total_loss = np.mean(total_loss)
        policy_loss = np.mean(policy_loss)
        value_loss = np.mean(value_loss)

        if global_t % 1000 == 0:
            print('Time Step: %6d k Reward: %3d - Total Loss: %.4f - '
                  'Policy Loss: %.4f - Value Loss: %.4f' %
                  (global_t / 1000, float(R_non_discounted), total_loss,
                   policy_loss, value_loss))

            # Save to log file
            with open(LOG_FILE, 'a') as f:
                f.write('Reward: %3d - Total Loss: %.4f - Policy Loss: %.4f '
                        '- Value Loss: %.4f \n' %
                        (float(R), total_loss, policy_loss, value_loss))
        """

        # Return the number of steps taken
        # to update global_time_steps
        return self.local_t
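# --- Illustrative usage (not from the original class) ---------------------
# Trainer._process_a3c() advances at most local_t_max steps and returns the
# number of steps actually taken, so a per-thread driver can accumulate that
# into a shared global step counter. The runner object, its stop flag, and
# any checkpoint/saving logic are assumptions for illustration; this is a
# sketch of the pattern, not the project's actual train() method.
def train(self, thread_index, build_env=True):
    trainer = self.trainers[thread_index]
    if build_env:
        trainer.build_environment()

    while self.global_t < trainer.max_global_time_step and not self.stop:
        # Each call runs up to local_t_max steps and returns how many steps
        # it actually took; that count advances the shared global_t.
        diff_t = trainer._process_a3c(self.sess, self.global_t,
                                      self.summary_writer, self.summary_op,
                                      self.score_input)
        self.global_t += diff_t

    trainer.stop()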