def train_step(self, cases, weights, caching):
    """Run one optimisation step on ``cases`` weighted by ``weights``.

    Three situations are handled:

    * no cases: only ``self._increment_step`` is run;
    * no batch limit configured, or the cases fit in one batch:
      ``self._take_step`` is computed on everything in a single call;
    * otherwise: ``self._grad_tensors`` are computed per slice of
      ``self._max_batch_size`` cases, summed element-wise, fed through
      ``self._combined_grad_placeholders`` into ``self._apply_gradients``
      and finally ``self._increment_step`` is run.

    Progress messages go to ``sys.stderr``; they are written with
    ``sys.stderr.write`` so the function behaves the same on Python 2 and 3.

    Args:
        cases: sequence of training cases.
        weights: sequence of per-case weights, same length as ``cases``.
        caching: forwarded to ``self.compute`` on the single-batch path;
            must be falsy when the cases have to be split into slices.

    Raises:
        ValueError: if ``cases`` and ``weights`` differ in length.
        AssertionError: if ``caching`` is truthy on the multi-slice path.
    """
    if len(cases) != len(weights):
        raise ValueError('cases and weights must have the same length.')

    if len(cases) == 0:
        sys.stderr.write(" WARNING: Zero cases \033[F\n")
        # still increment the step
        sess = tf.get_default_session()
        sess.run(self._increment_step)
        return

    sys.stderr.write(" Updating ({} cases) \033[F\n".format(len(cases)))

    if not self._max_batch_size or len(cases) <= self._max_batch_size:
        self.compute(self._take_step, cases, weights, caching)
        return

    # Too many cases for one batch: accumulate gradients slice by slice.
    assert not caching
    batch_size = self._max_batch_size
    grads = None
    slice_starts = range(0, len(cases), batch_size)
    progress_desc = 'Computing gradients ({} cases)'.format(len(cases))
    for start in verboserate(slice_starts, desc=progress_desc):
        stop = start + batch_size
        grads_slice = self.compute(
            self._grad_tensors, cases[start:stop], weights[start:stop], False)
        if grads is None:
            grads = grads_slice
        else:
            # Distinct index name so the slice offset is never shadowed.
            for j in range(len(self._grad_tensors)):
                grads[j] += grads_slice[j]

    sess = tf.get_default_session()
    feed_dict = dict(zip(self._combined_grad_placeholders, grads))
    sess.run(self._apply_gradients, feed_dict)
    sess.run(self._increment_step)
def main(args):
    """Freeze the model stored in ``args.model_dir`` into ``args.output_file``.

    The metagraph is imported into a fresh graph, variables are initialised
    and then restored from the checkpoint, the graph definition is frozen
    via ``freeze_graph_def`` with ``'embeddings'`` as output, and the
    serialized result is written to ``args.output_file``.
    """
    expanded_dir = os.path.expanduser(args.model_dir)
    with tf.Graph().as_default(), tf.Session() as sess:
        # Load the model metagraph and checkpoint
        print('Model directory: %s' % args.model_dir)
        meta_file, ckpt_file = facenet.get_model_filenames(expanded_dir)
        print('Metagraph file: %s' % meta_file)
        print('Checkpoint file: %s' % ckpt_file)

        meta_path = os.path.join(expanded_dir, meta_file)
        saver = tf.train.import_meta_graph(meta_path, clear_devices=True)
        for make_initializer in (tf.global_variables_initializer,
                                 tf.local_variables_initializer):
            tf.get_default_session().run(make_initializer())
        ckpt_path = os.path.join(expanded_dir, ckpt_file)
        saver.restore(tf.get_default_session(), ckpt_path)

        # Retrieve the protobuf graph definition and freeze it
        graph_def = sess.graph.as_graph_def()
        frozen_def = freeze_graph_def(sess, graph_def, 'embeddings')

        # Serialize and dump the output graph to the filesystem
        with tf.gfile.GFile(args.output_file, 'wb') as out:
            out.write(frozen_def.SerializeToString())
        print("%d ops in the final graph: %s" % (len(frozen_def.node), args.output_file))
def get_session():
    """Get the globally defined TensorFlow session.

    If no default session exists, a new ``tf.InteractiveSession`` is created;
    otherwise the current default session is used. Either way the result is
    stored in the module-level ``_ED_SESSION``. When keras can be imported,
    the session is also registered with its backend via ``K.set_session``.

    ``sys.stderr`` is pointed at ``os.devnull`` only for the duration of the
    keras import; it is restored (and the devnull handle closed) on every
    exit path, including unexpected exceptions.

    Returns:
      _ED_SESSION: the session now stored in the global.
    """
    global _ED_SESSION
    default_session = tf.get_default_session()
    if default_session is None:
        _ED_SESSION = tf.InteractiveSession()
    else:
        _ED_SESSION = default_session

    import os
    saved_stderr = sys.stderr
    have_keras = False
    try:
        with open(os.devnull, 'w') as devnull:
            sys.stderr = devnull  # suppress output printed by the keras import
            try:
                from keras import backend as K
                have_keras = True
            finally:
                # Restore before the handle is closed so stderr never points
                # at a closed file.
                sys.stderr = saved_stderr
    except ImportError:
        have_keras = False

    if have_keras:
        K.set_session(_ED_SESSION)
    return _ED_SESSION
def train(self, obs, actions, gaes, rewards, v_preds_next):
    """Run ``self.train_op`` once in the default session.

    ``obs`` is fed to both ``self.Policy.obs`` and ``self.Old_Policy.obs``;
    the remaining arguments are fed to the placeholders of the same name.
    """
    session = tf.get_default_session()
    feeds = {
        self.Policy.obs: obs,
        self.Old_Policy.obs: obs,
        self.actions: actions,
        self.rewards: rewards,
        self.v_preds_next: v_preds_next,
        self.gaes: gaes,
    }
    session.run(self.train_op, feed_dict=feeds)
def get_session():
    """Return the TF session to be used by the backend.

    A default TensorFlow session takes precedence when one is available.
    Otherwise the module-level ``_SESSION`` is returned, being created first
    if it does not exist yet. On creation, soft placement is always enabled
    and, when the ``OMP_NUM_THREADS`` environment variable is set to a
    non-empty value, it is used as ``intra_op_parallelism_threads``.

    Note that you can manually set the global session via
    `K.set_session(sess)`.
    """
    global _SESSION
    default_session = tf.get_default_session()
    if default_session is not None:
        return default_session
    if _SESSION is not None:
        return _SESSION

    omp_threads = os.environ.get("OMP_NUM_THREADS")
    if omp_threads:
        config = tf.ConfigProto(intra_op_parallelism_threads=int(omp_threads),
                                allow_soft_placement=True)
    else:
        config = tf.ConfigProto(allow_soft_placement=True)
    _SESSION = tf.Session(config=config)
    return _SESSION
def fit(self, xs, ys):
    """Fit the model to ``(xs, ys)`` and log the loss before and after.

    When ``self.normalize_inputs`` is set, the input mean/std variables are
    re-assigned from ``xs`` first. The trust-region optimizer is used only
    when ``self.use_trust_region`` is set and a previous fit has already
    happened; it additionally receives ``self.f_prob(xs)`` as input.
    """
    if self.normalize_inputs:
        # recompute normalizing constants for inputs
        new_mean = np.mean(xs, axis=0, keepdims=True)
        new_std = np.std(xs, axis=0, keepdims=True) + 1e-8
        assign_ops = [
            tf.assign(self.x_mean_var, new_mean),
            tf.assign(self.x_std_var, new_std),
        ]
        tf.get_default_session().run(tf.group(*assign_ops))

    trust_region_step = self.use_trust_region and self.first_optimized
    if trust_region_step:
        inputs = [xs, ys, self.f_prob(xs)]
        optimizer = self.tr_optimizer
    else:
        inputs = [xs, ys]
        optimizer = self.optimizer

    loss_before = optimizer.loss(inputs)
    prefix = self.name + "_" if self.name else ""
    logger.record_tabular(prefix + 'LossBefore', loss_before)
    optimizer.optimize(inputs)
    loss_after = optimizer.loss(inputs)
    logger.record_tabular(prefix + 'LossAfter', loss_after)
    logger.record_tabular(prefix + 'dLoss', loss_before - loss_after)
    self.first_optimized = True
def restore_trainer(self, filename):
    '''
    Restore the training progress (including the model)

    Args:
        filename: path the model is restored from; the second saver
            restores from ``filename + '_trainvars'``
    '''
    model_path = filename
    trainvars_path = filename + '_trainvars'
    self.modelsaver.restore(tf.get_default_session(), model_path)
    self.saver.restore(tf.get_default_session(), trainvars_path)
def test_logging_trainable(self):
    """LoggingTrainable('foo') should log a message mentioning the variable."""
    graph = tf.Graph()
    with graph.as_default() as g, self.test_session(g):
        foo = tf.Variable(tf.constant(42.0), name='foo')
        foo.initializer.run()
        coefficient = tf.constant(1.0)
        scaled = tf.mul(foo, coefficient)
        loss = tf.sub(scaled, tf.constant(1.0))
        optimizer = tf.train.GradientDescentOptimizer(0.5)
        train_step = optimizer.minimize(loss)
        tf.get_default_session().run(train_step)
        monitor = learn.monitors.LoggingTrainable('foo')
        self._run_monitor(monitor)
        self.assertRegexpMatches(str(self.logged_message), foo.name)
def test_lookup_activations(self):
    """Each looked-up activation must change the value of a -1 input."""
    x = tf.constant(-1.0, shape=[2, 2])
    with self.test_session():
        for activation_name in ('relu', 'prelu', 'selu', 'crelu'):
            activated = ops.lookup(activation_name)(x)
            # Some activations may create variables; initialise before eval.
            init_op = tf.global_variables_initializer()
            tf.get_default_session().run(init_op)
            before = x.eval()[0][0]
            after = activated.eval()[0][0]
            self.assertNotEqual(before, after)
def fit(self, paths, policy=None, batch_size=32, max_itrs=100, logger=None, lr=1e-3, **kwargs):
    """Train the discriminator on sampled vs. expert paths.

    Each iteration draws ``batch_size`` samples from ``paths`` and from
    ``self.expert_trajs``, stacks them (sampled first, expert second, with
    labels 0 and 1 respectively) and runs ``self.step`` with learning rate
    ``lr``. If ``logger`` is truthy, diagnostics over the full sampled and
    expert data are recorded after training.

    Returns:
        The mean loss popped at the most recent heartbeat of the iterator.
    """
    path_keys = ('observations', 'actions', 'a_logprobs')
    self.eval_expert_probs(paths, policy, insert=True)
    self.eval_expert_probs(self.expert_trajs, policy, insert=True)
    obs, acts, path_probs = self.extract_paths(paths, keys=path_keys)
    expert_obs, expert_acts, expert_probs = self.extract_paths(
        self.expert_trajs, keys=path_keys)

    # Train discriminator
    for it in TrainingIterator(max_itrs, heartbeat=5):
        pol_obs, pol_act, pol_lprobs = self.sample_batch(
            obs, acts, path_probs, batch_size=batch_size)
        exp_obs, exp_act, exp_lprobs = self.sample_batch(
            expert_obs, expert_acts, expert_probs, batch_size=batch_size)

        # Sampled half is labelled 0, expert half is labelled 1.
        labels = np.concatenate(
            [np.zeros((batch_size, 1)), np.ones((batch_size, 1))], axis=0)
        obs_batch = np.concatenate([pol_obs, exp_obs], axis=0)
        act_batch = np.concatenate([pol_act, exp_act], axis=0)
        stacked_lprobs = np.concatenate([pol_lprobs, exp_lprobs], axis=0)
        lprobs_batch = np.expand_dims(stacked_lprobs, axis=1).astype(np.float32)

        train_feed = {
            self.act_t: act_batch,
            self.obs_t: obs_batch,
            self.labels: labels,
            self.lprobs: lprobs_batch,
            self.lr: lr,
        }
        loss, _ = tf.get_default_session().run(
            [self.loss, self.step], feed_dict=train_feed)
        it.record('loss', loss)
        if it.heartbeat:
            print(it.itr_message())
            mean_loss = it.pop_mean('loss')
            print('\tLoss:%f' % mean_loss)

    if logger:
        diagnostics = [self.energy, self.value_fn, self.d_tau]

        energy, logZ, dtau = tf.get_default_session().run(
            diagnostics,
            feed_dict={self.act_t: acts, self.obs_t: obs,
                       self.lprobs: np.expand_dims(path_probs, axis=1)})
        for key, value in (
                ('IRLLogZ', np.mean(logZ)),
                ('IRLAverageEnergy', np.mean(energy)),
                ('IRLAverageLogPtau', np.mean(-energy-logZ)),
                ('IRLAverageLogQtau', np.mean(path_probs)),
                ('IRLMedianLogQtau', np.median(path_probs)),
                ('IRLAverageDtau', np.mean(dtau))):
            logger.record_tabular(key, value)

        energy, logZ, dtau = tf.get_default_session().run(
            diagnostics,
            feed_dict={self.act_t: expert_acts, self.obs_t: expert_obs,
                       self.lprobs: np.expand_dims(expert_probs, axis=1)})
        for key, value in (
                ('IRLAverageExpertEnergy', np.mean(energy)),
                ('IRLAverageExpertLogPtau', np.mean(-energy-logZ)),
                ('IRLAverageExpertLogQtau', np.mean(expert_probs)),
                ('IRLMedianExpertLogQtau', np.median(expert_probs)),
                ('IRLAverageExpertDtau', np.mean(dtau))):
            logger.record_tabular(key, value)
    return mean_loss
def get_session():
    """Return the module-level session, creating it on first use.

    On first use the current default TensorFlow session is adopted when one
    exists; otherwise a new ``tf.Session`` is created. The result is cached
    in the global ``_session`` and returned on every later call.
    """
    global _session
    if _session is not None:
        return _session

    # Build/retrieve the session since none has been cached yet
    default_session = tf.get_default_session()
    _session = default_session if default_session is not None else tf.Session()
    return _session
def test_preserves_existing_session(self):
    """Calling ``self._square`` must leave the active default session intact."""
    with tf.Session() as sess:
        total_op = tf.reduce_sum([2, 2])
        self.assertIs(sess, tf.get_default_session())

        squared = self._square(123)
        expected_square = 123 * 123
        self.assertEqual(expected_square, squared)

        # The session from the enclosing ``with`` is still the default one
        # and can still run ops created before the call.
        self.assertIs(sess, tf.get_default_session())
        total = sess.run(total_op)
        self.assertEqual(total, 4)
def zero_model_gradient_accumulators(cls) -> None:
    """Run the ``zero_model_gradient_accumulators`` op of every model scope.

    The operations are looked up by name in the default graph and executed
    together in the default session.
    """
    scope_names = (
        'empty_statistic',
        'move_rate',
        'game_state_as_update',
        'updated_statistic',
        'updated_update',
        'cost_function',
    )
    zero_operations = []
    for scope_name in scope_names:
        op_name = '{}/zero_model_gradient_accumulators'.format(scope_name)
        zero_operations.append(
            tf.get_default_graph().get_operation_by_name(op_name))
    tf.get_default_session().run(zero_operations)
def _predict_hashtag_probs(model_dir, use_emb_model, use_char_model, hashtag_dir, hashtag_names):
    """Load one HumorPredictor into a cleared Keras session and run it on every hashtag.

    Returns three parallel lists (one entry per hashtag): output
    probabilities, first tweet ids and second tweet ids.
    """
    K.clear_session()
    K.set_session(tf.get_default_session())
    predictor = humor_predictor.HumorPredictor(
        model_dir, use_emb_model=use_emb_model, use_char_model=use_char_model)

    output_probs, first_ids, second_ids = [], [], []
    for hashtag_name in hashtag_names:
        _, np_output_prob, _, first_tweet_ids, second_tweet_ids = \
            predictor(hashtag_dir, hashtag_name)
        output_probs.append(np_output_prob)
        first_ids.append(first_tweet_ids)
        second_ids.append(second_tweet_ids)
    return output_probs, first_ids, second_ids


def predict_with_three_models_on_hashtags(hashtag_dir, hashtag_emb_dir, trial_hashtag_names, labels_exist=True):
    """Collect per-hashtag predictions from the three humor models.

    The embedding+character, embedding-only and character-only models are
    loaded and run one after another (the Keras session is cleared before
    each). For every hashtag their output probabilities are reshaped to
    column vectors and concatenated along axis 1, in that model order.

    Args:
        hashtag_dir: directory passed to each predictor.
        hashtag_emb_dir: directory passed to ``load_hashtag_data`` when
            labels are loaded.
        trial_hashtag_names: sequence of hashtag names to predict on.
        labels_exist: when true, labels are loaded for every hashtag.

    Returns:
        ``(all_predictions, hashtag_labels, per_hashtag_first_tweet_ids,
        per_hashtag_second_tweet_ids)``. ``hashtag_labels`` is ``None`` when
        ``labels_exist`` is false. The tweet-id lists come from the first
        (embedding+character) model run.
    """
    emb_char_predictions, per_hashtag_first_tweet_ids, per_hashtag_second_tweet_ids = \
        _predict_hashtag_probs(EMB_CHAR_HUMOR_MODEL_DIR, True, True,
                               hashtag_dir, trial_hashtag_names)
    emb_predictions, _, _ = _predict_hashtag_probs(
        EMB_HUMOR_MODEL_DIR, True, False, hashtag_dir, trial_hashtag_names)
    char_predictions, _, _ = _predict_hashtag_probs(
        CHAR_HUMOR_MODEL_DIR, False, True, hashtag_dir, trial_hashtag_names)

    all_predictions = [
        np.concatenate(
            [np.reshape(emb_char, [-1, 1]),
             np.reshape(emb, [-1, 1]),
             np.reshape(char, [-1, 1])],
            axis=1)
        for emb_char, emb, char in zip(
            emb_char_predictions, emb_predictions, char_predictions)
    ]

    hashtag_labels = None
    if labels_exist:
        hashtag_labels = []
        for hashtag_name in trial_hashtag_names:
            # Parenthesised single-argument form prints identically on
            # Python 2 and Python 3.
            print('Loading label for hashtag %s' % hashtag_name)
            _, _, np_labels, _, _, _ = \
                load_hashtag_data(hashtag_emb_dir, hashtag_name)
            hashtag_labels.append(np_labels)

    return all_predictions, hashtag_labels, per_hashtag_first_tweet_ids, per_hashtag_second_tweet_ids
def main(args):
    """Freeze the model in ``args.model_dir`` into ``args.output_file``.

    Steps performed inside a fresh graph and session:

    1. import the metagraph, initialise variables and restore the checkpoint;
    2. rewrite batch-norm related nodes in the graph definition
       (``RefSwitch`` -> ``Switch`` with ``/read`` appended to ``moving_``
       inputs, ``AssignSub`` -> ``Sub``, ``AssignAdd`` -> ``Add``);
    3. whitelist every node whose name starts with ``InceptionResnetV1``,
       ``embeddings`` or ``phase_train`` and convert those variables to
       constants, with ``embeddings`` as the output node;
    4. serialize the resulting graph definition to ``args.output_file``.
    """
    with tf.Graph().as_default():
        with tf.Session() as sess:
            # Load the model metagraph and checkpoint
            print('Model directory: %s' % args.model_dir)
            model_dir_exp = os.path.expanduser(args.model_dir)
            meta_file, ckpt_file = facenet.get_model_filenames(model_dir_exp)

            print('Metagraph file: %s' % meta_file)
            print('Checkpoint file: %s' % ckpt_file)

            saver = tf.train.import_meta_graph(
                os.path.join(model_dir_exp, meta_file), clear_devices=True)
            tf.get_default_session().run(tf.global_variables_initializer())
            tf.get_default_session().run(tf.local_variables_initializer())
            saver.restore(tf.get_default_session(),
                          os.path.join(model_dir_exp, ckpt_file))

            # Retrieve the protobuf graph definition and fix the batch norm nodes
            gd = sess.graph.as_graph_def()
            for node in gd.node:
                if node.op == 'RefSwitch':
                    node.op = 'Switch'
                    # ``range`` (not ``xrange``) so this runs on Python 2 and 3.
                    for index in range(len(node.input)):
                        if 'moving_' in node.input[index]:
                            node.input[index] = node.input[index] + '/read'
                elif node.op == 'AssignSub':
                    node.op = 'Sub'
                    if 'use_locking' in node.attr:
                        del node.attr['use_locking']
                elif node.op == 'AssignAdd':
                    node.op = 'Add'
                    if 'use_locking' in node.attr:
                        del node.attr['use_locking']

            # Get the list of important nodes
            output_node_names = 'embeddings'
            whitelist_prefixes = ('InceptionResnetV1', 'embeddings', 'phase_train')
            whitelist_names = []
            for node in gd.node:
                if node.name.startswith(whitelist_prefixes):
                    print(node.name)
                    whitelist_names.append(node.name)

            # Replace all the variables in the graph with constants of the same values
            output_graph_def = graph_util.convert_variables_to_constants(
                sess, gd, output_node_names.split(","),
                variable_names_whitelist=whitelist_names)

            # Serialize and dump the output graph to the filesystem
            with tf.gfile.GFile(args.output_file, 'wb') as f:
                f.write(output_graph_def.SerializeToString())
            print("%d ops in the final graph." % len(output_graph_def.node))
def start(self):
    """Start every thread in ``self.threads``.

    When ``self._need_default_sess`` is set, a default TensorFlow session
    must be active at call time; otherwise an ``AssertionError`` is raised
    before any thread is started.
    """
    if self._need_default_sess:
        default_session = tf.get_default_session()
        assert default_session is not None, (
            "Not session is bind to predictors, "
            "MultiThreadAsyncPredictor.start() has to be called under a default session!")
    for worker in self.threads:
        worker.start()
def traced_run(fetches):
    """Run ``fetches`` with tracing and dump timeline files under ``./data``.

    Two files are written into ``<cwd>/data`` (created if missing), both
    named after the task type, task id and the global ``timeline_counter``:
    a Chrome trace (``timeline_*.json``) and the raw step stats
    (``stepstats_*.pbtxt``). The counter is incremented afterwards.

    Args:
        fetches: passed unchanged to ``Session.run`` of the default session.

    Returns:
        Whatever ``Session.run`` returned for ``fetches``.

    Raises:
        OSError: if the output directory cannot be created.
    """
    global timeline_counter
    from tensorflow.python.client import timeline

    run_metadata = tf.RunMetadata()
    config = load_config()
    log_fn = "%s-%s-%s" % (config.task_type, config.task_id, timeline_counter)
    sess = tf.get_default_session()

    # Create the directory without going through a shell, so paths with
    # spaces or shell metacharacters are handled correctly.
    root = os.getcwd() + "/data"
    try:
        os.makedirs(root)
    except OSError:
        if not os.path.isdir(root):
            raise

    # NOTE(review): ``run_options`` is not defined in this function; it is
    # presumably a module-level tf.RunOptions -- confirm.
    results = sess.run(fetches, options=run_options, run_metadata=run_metadata)

    tl = timeline.Timeline(step_stats=run_metadata.step_stats)
    ctf = tl.generate_chrome_trace_format(show_memory=True, show_dataflow=False)
    with open(root + "/timeline_%s.json" % (log_fn,), "w") as trace_file:
        trace_file.write(ctf)
    with open(root + "/stepstats_%s.pbtxt" % (log_fn,), "w") as stats_file:
        stats_file.write(str(run_metadata.step_stats))

    timeline_counter += 1
    return results
def applyOptimizer(self, opt, steps=5, is_sparse=False):
    """Apply ``opt`` for ``steps`` updates to two zero-initialised variables.

    Dense mode uses gradients ``[0.1, 0.2]`` and ``[0.01, 0.02]``; sparse
    mode uses single-row ``tf.IndexedSlices`` gradients. The variables are
    asserted to be all zeros before the first update.

    Returns:
        The values of both variables after the updates.
    """
    if not is_sparse:
        var0 = tf.Variable([0.0, 0.0])
        var1 = tf.Variable([0.0, 0.0])
        grads0 = tf.constant([0.1, 0.2])
        grads1 = tf.constant([0.01, 0.02])
        expected_initial = [0.0, 0.0]
    else:
        var0 = tf.Variable([[0.0], [0.0]])
        var1 = tf.Variable([[0.0], [0.0]])
        grads0 = tf.IndexedSlices(tf.constant([0.1], shape=[1, 1]),
                                  tf.constant([0]),
                                  tf.constant([2, 1]))
        grads1 = tf.IndexedSlices(tf.constant([0.02], shape=[1, 1]),
                                  tf.constant([1]),
                                  tf.constant([2, 1]))
        expected_initial = [[0.0], [0.0]]

    grads_and_vars = zip([grads0, grads1], [var0, var1])
    update = opt.apply_gradients(grads_and_vars)
    tf.initialize_all_variables().run()

    sess = tf.get_default_session()
    fetches = [var0, var1]
    v0_val, v1_val = sess.run(fetches)
    self.assertAllClose(expected_initial, v0_val)
    self.assertAllClose(expected_initial, v1_val)

    # Run the optimizer for a few steps
    step = 0
    while step < steps:
        update.run()
        step += 1

    v0_val, v1_val = sess.run(fetches)
    return v0_val, v1_val
def __init__(self, batch=64, use_cpus=False):
    """Build a ``resnet101`` model fed with synthetic images and labels.

    Args:
        batch: number of synthetic images/labels per step.
        use_cpus: selects the ``NHWC`` data format when true, ``NCHW``
            otherwise.

    Sets ``self.loss``, ``self.optimizer`` and ``self.variables``.
    """
    # Synthetic image should be within [0, 255].
    images = tf.truncated_normal(
        [batch, 224, 224, 3],
        dtype=tf.float32,
        mean=127,
        stddev=60,
        name='synthetic_images')

    # Minor hack to avoid H2D copy when using synthetic data
    inputs = tf.contrib.framework.local_variable(
        images, name='gpu_cached_images')
    labels = tf.random_uniform(
        [batch],
        minval=0,
        maxval=999,
        dtype=tf.int32,
        name='synthetic_labels')

    model = model_config.get_model_config("resnet101", MockDataset())
    data_format = "NHWC" if use_cpus else "NCHW"
    logits, _ = model.build_network(inputs, data_format=data_format)
    per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits, labels=labels)

    # Implement model interface
    self.loss = tf.reduce_mean(per_example_loss, name='xentropy-loss')
    self.optimizer = tf.train.GradientDescentOptimizer(1e-6)
    self.variables = ray_tf_utils.TensorFlowVariables(
        self.loss, tf.get_default_session())
def get_global_step_value():
    """
    Returns:
        int: global_step value in current graph and session
    """
    session = tf.get_default_session()
    step_var = get_global_step_var()
    return tf.train.global_step(session, step_var)
def _trigger_epoch(self):
    """Save a checkpoint and point a ``latest`` symlink at it.

    The meta graph is exported once (on the first call that gets that far);
    subsequent checkpoints are saved without it. ``OSError``/``IOError``
    raised anywhere in the process are logged and swallowed.
    """
    try:
        if not self.meta_graph_written:
            meta_name = 'graph-{}.meta'.format(logger.get_time_str())
            self.saver.export_meta_graph(
                os.path.join(logger.LOG_DIR, meta_name),
                collection_list=self.graph.get_all_collection_keys())
            self.meta_graph_written = True

        self.saver.save(
            tf.get_default_session(),
            self.path,
            global_step=self.global_step,
            write_meta_graph=False)

        # create a symbolic link for the latest model
        newest = self.saver.last_checkpoints[-1]
        ckpt_dir, ckpt_name = os.path.split(newest)
        link_path = os.path.join(ckpt_dir, 'latest')
        try:
            os.unlink(link_path)
        except OSError:
            # nothing to remove (or removal failed); symlink below decides
            pass
        os.symlink(ckpt_name, link_path)
    except (OSError, IOError):
        # disk error sometimes.. just ignore it
        logger.exception("Exception in ModelSaver.trigger_epoch!")
def test_outputs(self, model, inputs, output_tensors, outputs):
    """Check that ``model.compute`` reproduces ``outputs`` to 4 decimals.

    ``inputs`` is an ``(args, kwargs)`` pair forwarded to ``model.compute``
    after ``output_tensors``.
    """
    guarantee_initialized_variables(tf.get_default_session())
    positional, keyword = inputs
    computed = model.compute(output_tensors, *positional, **keyword)
    assert_array_collections_equal(outputs, computed, decimal=4)
def get_param_shapes(self, **tags):
    """Return the shapes of the parameters selected by ``tags``.

    Results are cached in ``self._cached_param_shapes`` under the tag items
    sorted by tag name, so keyword order does not matter. On a cache miss
    the parameters are evaluated in the default session to read their
    shapes.
    """
    cache_key = tuple(sorted(tags.items(), key=lambda item: item[0]))
    cache = self._cached_param_shapes
    if cache_key in cache:
        return cache[cache_key]

    params = self.get_params(**tags)
    values = tf.get_default_session().run(params)
    shapes = [value.shape for value in values]
    cache[cache_key] = shapes
    return shapes
def __init__(self):
    """Load MNIST (with retries) and build the network, loss and accuracy.

    The dataset download is attempted up to 10 times with a 5 second pause
    after each failure; if the last attempt still failed a ``ValueError``
    carrying the underlying error is raised.
    """
    # Import data
    last_error = None
    for _attempt in range(10):
        try:
            self.mnist = input_data.read_data_sets(
                "/tmp/tensorflow/mnist/input_data", one_hot=True)
        except Exception as e:
            last_error = e
            time.sleep(5)
        else:
            last_error = None
            break
    if last_error:
        raise ValueError("Failed to import data", last_error)

    # Set seed and build layers
    tf.set_random_seed(0)
    self.x = tf.placeholder(tf.float32, [None, 784], name="x")
    self.y_ = tf.placeholder(tf.float32, [None, 10], name="y_")
    y_conv, self.keep_prob = deepnn(self.x)

    # Need to define loss and optimizer attributes
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
        labels=self.y_, logits=y_conv)
    self.loss = tf.reduce_mean(cross_entropy)
    self.optimizer = tf.train.AdamOptimizer(1e-4)
    self.variables = ray_tf_utils.TensorFlowVariables(
        self.loss, tf.get_default_session())

    # For evaluating test accuracy
    predicted_class = tf.argmax(y_conv, 1)
    true_class = tf.argmax(self.y_, 1)
    correct_prediction = tf.equal(predicted_class, true_class)
    self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
def eval_value(self, state):
    """Evaluate ``self.value_tensor`` for a single state.

    A leading axis is added to ``state`` before it is fed to
    ``self.state_input``; element ``[0, 0]`` of the result is returned.
    """
    session = tf.get_default_session()
    batched_state = state[newaxis, :, :]
    feeds = {self.state_input: batched_state}
    values = session.run(self.value_tensor, feed_dict=feeds)
    return values[0, 0]
def callback(self, data):
    """Classify an incoming image and publish the prediction and an overlay.

    The two highest-scoring classes and the softmax-weighted mean of
    ``angles_list`` are published on ``self.pub1``; a mono8 image with a
    line rotated by that angle is published on ``self.pub2``. A dot is
    written to stdout for every processed message.
    """
    with session.as_default():
        assert tf.get_default_session() is session

        raw = data.data.reshape(image_size, image_size).astype(np.float32)
        input_image = np.flipud(raw).reshape(-1, image_size, image_size, 1)
        out_class, out_angle = test_model(input_image)
        pre_class = tf.nn.softmax(out_class)
        pre_angle = tf.nn.softmax(out_angle).eval()
        angle = np.sum(np.multiply(pre_angle, angles_list))/np.sum(pre_angle)

        # Rank labels by ascending probability; best is last.
        class_probs = pre_class.eval()[0]
        pre_dict = dict(zip(list(range(num_labels)), class_probs))
        ranked = sorted(pre_dict.items(), key=operator.itemgetter(1))
        best_label, best_prob = ranked[-1]
        second_label, second_prob = ranked[-2]
        name1 = name2string[value2name[best_label]]
        value1 = str(best_prob)
        name2 = name2string[value2name[second_label]]
        value2 = str(second_prob)

        pre = PredictionMSG()
        pre.name1 = name1
        pre.value1 = float(value1)
        pre.name2 = name2
        pre.value2 = float(value2)
        pre.angle = angle
        self.pub1.publish(pre)

        image = ((input_image.reshape(image_size, image_size) + 0.65)*255).astype(np.uint8)
        theta = math.radians(angle)
        cos_t = math.cos(theta)
        sin_t = math.sin(theta)
        pt1x = int(self.pt1x * cos_t + self.pt1y * -sin_t) + 40
        pt1y = int(self.pt1x * sin_t + self.pt1y * cos_t) + 40
        pt2x = int(self.pt2x * cos_t + self.pt2y * -sin_t) + 40
        pt2y = int(self.pt2x * sin_t + self.pt2y * cos_t) + 40
        cv2.line(image, (pt1x, pt1y), (pt2x, pt2y), 255, 2)
        ros_image = self.bridge.cv2_to_imgmsg(image, encoding="mono8")
        self.pub2.publish(ros_image)

        sys.stdout.write(".")
        sys.stdout.flush()
def get_grad(self, obs, actions, gaes, rewards, v_preds_next):
    """Evaluate ``self.gradients`` in the default session and return them.

    ``obs`` is fed to both ``self.Policy.obs`` and ``self.Old_Policy.obs``;
    the remaining arguments are fed to the placeholders of the same name.
    """
    session = tf.get_default_session()
    feeds = {
        self.Policy.obs: obs,
        self.Old_Policy.obs: obs,
        self.actions: actions,
        self.rewards: rewards,
        self.v_preds_next: v_preds_next,
        self.gaes: gaes,
    }
    return session.run(self.gradients, feed_dict=feeds)
def validate_probtype(probtype, pdparam):
    """Statistically sanity-check a probability distribution type.

    Using 100000 samples drawn with parameters ``pdparam`` it asserts that
    (1) the mean negative log-likelihood matches the reported entropy and
    (2) the reported KL to a perturbed distribution matches its sampled
    estimate, each within three standard errors.
    """
    n_samples = 100000

    # Check to see if mean negative log likelihood == differential entropy
    param_vals = np.repeat(pdparam[None, :], n_samples, axis=0)
    param_ph = probtype.param_placeholder([n_samples])
    sample_ph = probtype.sample_placeholder([n_samples])
    dist = probtype.pdfromflat(param_ph)
    calcloglik = U.function([sample_ph, param_ph], dist.logp(sample_ph))
    calcent = U.function([param_ph], dist.entropy())
    sample_vals = tf.get_default_session().run(
        dist.sample(), feed_dict={param_ph: param_vals})
    logliks = calcloglik(sample_vals, param_vals)
    entval_ll = - logliks.mean()  #pylint: disable=E1101
    entval_ll_stderr = logliks.std() / np.sqrt(n_samples)  #pylint: disable=E1101
    entval = calcent(param_vals).mean()  #pylint: disable=E1101
    entropy_gap = np.abs(entval - entval_ll)
    assert entropy_gap < 3 * entval_ll_stderr  # within 3 sigmas

    # Check to see if kldiv[p,q] = - ent[p] - E_p[log q]
    other_param_ph = probtype.param_placeholder([n_samples])
    other_dist = probtype.pdfromflat(other_param_ph)
    perturbed = pdparam + np.random.randn(pdparam.size) * 0.1
    other_param_vals = np.repeat(perturbed[None, :], n_samples, axis=0)
    calckl = U.function([param_ph, other_param_ph], dist.kl(other_dist))
    klval = calckl(param_vals, other_param_vals).mean()  #pylint: disable=E1101
    logliks = calcloglik(sample_vals, other_param_vals)
    klval_ll = - entval - logliks.mean()  #pylint: disable=E1101
    klval_ll_stderr = logliks.std() / np.sqrt(n_samples)  #pylint: disable=E1101
    kl_gap = np.abs(klval - klval_ll)
    assert kl_gap < 3 * klval_ll_stderr  # within 3 sigmas
    print('ok on', probtype, pdparam)
def fit(self, trajs, batch_size=32, max_itrs=100, **kwargs):
    """Train the discriminator on sampled vs. expert state-action pairs.

    Each iteration stacks ``batch_size`` sampled pairs (label 0) on top of
    ``batch_size`` expert pairs (label 1) and runs ``self.step`` with a
    learning rate of ``1e-3``.

    Returns:
        The mean loss popped at the most recent heartbeat of the iterator.
    """
    obs, acts = self.extract_paths(trajs)
    expert_obs, expert_acts = self.expert_trajs

    # Train discriminator
    for it in TrainingIterator(max_itrs, heartbeat=5):
        pol_obs, pol_act = self.sample_batch(obs, acts, batch_size=batch_size)
        exp_obs, exp_act = self.sample_batch(
            expert_obs, expert_acts, batch_size=batch_size)

        # Sampled half is labelled 0, expert half is labelled 1.
        labels = np.concatenate(
            [np.zeros((batch_size, 1)), np.ones((batch_size, 1))], axis=0)
        obs_batch = np.concatenate([pol_obs, exp_obs], axis=0)
        act_batch = np.concatenate([pol_act, exp_act], axis=0)

        train_feed = {
            self.act_t: act_batch,
            self.obs_t: obs_batch,
            self.labels: labels,
            self.lr: 1e-3,
        }
        loss, _ = tf.get_default_session().run(
            [self.loss, self.step], feed_dict=train_feed)
        it.record('loss', loss)
        if it.heartbeat:
            print(it.itr_message())
            mean_loss = it.pop_mean('loss')
            print('\tLoss:%f' % mean_loss)
    return mean_loss
def fit(self, xs, ys):
    """Fit the regressor to ``(xs, ys)`` and log optimisation statistics.

    Normalisation statistics for inputs and/or outputs are re-assigned from
    the data when the corresponding flag is set. With a trust region the
    optimizer additionally receives the current means and log-stds from
    ``self._f_pdists(xs)`` and the constraint value is logged as ``MeanKL``.
    """
    sess = tf.get_default_session()

    def refresh_stats(mean_var, std_var, data):
        # recompute normalizing constants from the given data
        sess.run([
            tf.assign(mean_var, np.mean(data, axis=0, keepdims=True)),
            tf.assign(std_var, np.std(data, axis=0, keepdims=True) + 1e-8),
        ])

    if self._normalize_inputs:
        refresh_stats(self._x_mean_var, self._x_std_var, xs)
    if self._normalize_outputs:
        refresh_stats(self._y_mean_var, self._y_std_var, ys)

    inputs = [xs, ys]
    if self._use_trust_region:
        old_means, old_log_stds = self._f_pdists(xs)
        inputs = inputs + [old_means, old_log_stds]

    loss_before = self._optimizer.loss(inputs)
    prefix = self._name + "_" if self._name else ""
    logger.record_tabular(prefix + 'LossBefore', loss_before)
    self._optimizer.optimize(inputs)
    loss_after = self._optimizer.loss(inputs)
    logger.record_tabular(prefix + 'LossAfter', loss_after)
    if self._use_trust_region:
        mean_kl = self._optimizer.constraint_val(inputs)
        logger.record_tabular(prefix + 'MeanKL', mean_kl)
    logger.record_tabular(prefix + 'dLoss', loss_before - loss_after)
def main():
    """Alternate TRPO generator updates with discriminator updates (GAIL-style).

    Builds a training and an evaluation environment, a Gaussian MLP policy,
    a value function, a TRPO learner and a discriminator from ``FLAGS``,
    loads and subsamples the expert dataset, then loops: optionally
    evaluate, run ``FLAGS.GAIL.g_iters`` generator updates using the
    discriminator's reward, run ``FLAGS.GAIL.d_iters`` discriminator
    updates, and save the policy/value-function/normalizer state.
    """
    FLAGS.set_seed()
    FLAGS.freeze()

    # Evaluation environment uses a different seed (offset by 1000).
    env = create_env(FLAGS.env.id, seed=FLAGS.seed, log_dir=FLAGS.log_dir, absorbing_state=FLAGS.GAIL.learn_absorbing, rescale_action=FLAGS.env.rescale_action)
    env_eval = create_env(FLAGS.env.id, seed=FLAGS.seed + 1000, log_dir=FLAGS.log_dir, absorbing_state=FLAGS.GAIL.learn_absorbing, rescale_action=FLAGS.env.rescale_action)
    dim_state = env.observation_space.shape[0]
    dim_action = env.action_space.shape[0]

    normalizers = Normalizers(dim_action=dim_action, dim_state=dim_state)
    policy = GaussianMLPPolicy(dim_state, dim_action, FLAGS.TRPO.policy_hidden_sizes, normalizer=normalizers.state)
    vfn = MLPVFunction(dim_state, FLAGS.TRPO.vf_hidden_sizes, normalizers.state)
    algo = TRPO(vfn=vfn, policy=policy, dim_state=dim_state, dim_action=dim_action, **FLAGS.TRPO.algo.as_dict())

    # The same rate is given to the discriminator and used below to
    # subsample the expert transitions.
    subsampling_rate = env.max_episode_steps // FLAGS.GAIL.trajectory_size
    discriminator = Discriminator(dim_state, dim_action, normalizers=normalizers, subsampling_rate=subsampling_rate, **FLAGS.GAIL.discriminator.as_dict())

    tf.get_default_session().run(tf.global_variables_initializer())

    # load expert dataset
    expert_dataset = load_expert_dataset(FLAGS.GAIL.buf_load)
    expert_reward = expert_dataset.get_average_reward()
    logger.info('Expert Reward %f', expert_reward)
    if FLAGS.GAIL.learn_absorbing:
        expert_dataset.add_absorbing_states(env)
    expert_dataset.subsample_trajectories(FLAGS.GAIL.traj_limit)
    logger.info('Original dataset size {}'.format(len(expert_dataset)))
    expert_dataset.subsample_transitions(subsampling_rate)
    logger.info('Subsampled dataset size {}'.format(len(expert_dataset)))

    # Only these modules are written out by the np.save calls below; the
    # discriminator is not included.
    saver = nn.ModuleDict({'policy': policy, 'vfn': vfn, 'normalizers': normalizers})
    runner = Runner(env, max_steps=env.max_episode_steps, gamma=FLAGS.TRPO.gamma, lambda_=FLAGS.TRPO.lambda_,
                    add_absorbing_state=FLAGS.GAIL.learn_absorbing)
    print(saver)

    max_ent_coef = FLAGS.TRPO.algo.ent_coef
    # One outer iteration covers g_iters rollouts of rollout_samples each.
    for t in range(0, FLAGS.GAIL.total_timesteps, FLAGS.TRPO.rollout_samples * FLAGS.GAIL.g_iters):
        time_st = time.time()
        if t % FLAGS.GAIL.eval_freq == 0:
            eval_returns, eval_lengths = evaluate(policy, env_eval)
            log_kvs(prefix='Evaluate', kvs=dict(iter=t, episode=dict(returns=np.mean(eval_returns), lengths=int(np.mean(eval_lengths)))))

        # Generator
        generator_dataset = None
        for n_update in range(FLAGS.GAIL.g_iters):
            data, ep_infos = runner.run(policy, FLAGS.TRPO.rollout_samples)
            if FLAGS.TRPO.normalization:
                normalizers.state.update(data.state)
                normalizers.action.update(data.action)
                normalizers.diff.update(data.next_state - data.state)
            # One-off consistency check on the very first rollout: wherever
            # a step is neither done nor timed out, its next_state must
            # equal the state of the following step in the same env.
            if t == 0 and n_update == 0 and not FLAGS.GAIL.learn_absorbing:
                data_ = data.copy()
                data_ = data_.reshape([FLAGS.TRPO.rollout_samples // env.n_envs, env.n_envs])
                for e in range(env.n_envs):
                    samples = data_[:, e]
                    masks = 1 - (samples.done | samples.timeout)[..., np.newaxis]
                    masks = masks[:-1]
                    assert np.allclose(samples.state[1:] * masks, samples.next_state[:-1] * masks)
            # NOTE(review): this advances the outer loop variable so that
            # logging/saving below see the running sample count; the ``for``
            # statement re-assigns ``t`` at the start of the next iteration.
            t += FLAGS.TRPO.rollout_samples
            # The reward used for the policy update comes from the
            # discriminator and overwrites ``data.reward``.
            data.reward = discriminator.get_reward(data.state, data.action)
            advantages, values = runner.compute_advantage(vfn, data)
            train_info = algo.train(max_ent_coef, data, advantages, values)
            fps = int(FLAGS.TRPO.rollout_samples / (time.time() - time_st))
            train_info['reward'] = np.mean(data.reward)
            train_info['fps'] = fps
            log_kvs(prefix='TRPO', kvs=dict(iter=t, **train_info))

            # Keep the most recent rollout for the discriminator phase.
            generator_dataset = data

        # Discriminator
        for n_update in range(FLAGS.GAIL.d_iters):
            batch_size = FLAGS.GAIL.d_batch_size
            d_train_infos = dict()
            for generator_subset in generator_dataset.iterator(batch_size):
                expert_batch = expert_dataset.sample(batch_size)
                # NOTE(review): the comprehension variable is also named
                # ``t``; on Python 3 it is local to the comprehension and
                # does not clobber the loop counter -- confirm the target
                # interpreter.
                expert_state = np.stack([t.obs for t in expert_batch])
                expert_action = np.stack([t.action for t in expert_batch])
                expert_mask = np.stack([t.mask for t in expert_batch]).flatten() if FLAGS.GAIL.learn_absorbing else None
                train_info = discriminator.train(
                    expert_state, expert_action, generator_subset.state, generator_subset.action, expert_mask,
                )
                # Collect every reported metric so it can be averaged below.
                for k, v in train_info.items():
                    if k not in d_train_infos:
                        d_train_infos[k] = []
                    d_train_infos[k].append(v)
            d_train_infos = {k: np.mean(v) for k, v in d_train_infos.items()}
            # Only the metrics of the last discriminator pass are logged.
            if n_update == FLAGS.GAIL.d_iters - 1:
                log_kvs(prefix='Discriminator', kvs=dict(iter=t, **d_train_infos))

        if t % FLAGS.TRPO.save_freq == 0:
            np.save('{}/stage-{}'.format(FLAGS.log_dir, t), saver.state_dict())
            np.save('{}/final'.format(FLAGS.log_dir), saver.state_dict())
    # Final snapshot once training has finished.
    np.save('{}/final'.format(FLAGS.log_dir), saver.state_dict())
def init_tf(config_dict=None):
    """Create a default TensorFlow session unless one already exists.

    When no default session is active, the graph-level random seed is set
    from NumPy's global RNG and ``create_session`` is called with
    ``force_as_default=True``. Nothing happens if a default session exists.

    Args:
        config_dict: options forwarded to ``create_session``. ``None``
            (the default) means an empty dict; a fresh dict is used on every
            call so no state is shared between calls.
    """
    if tf.get_default_session() is None:
        tf.set_random_seed(np.random.randint(1 << 31))
        session_config = {} if config_dict is None else config_dict
        create_session(session_config, force_as_default=True)
def load_model(model_uri, tf_sess=None):
    """
    Load an MLflow model that contains the TensorFlow flavor from the specified path.

    *With TensorFlow version <2.0.0, this method must be called within a TensorFlow graph context.*

    :param model_uri: The location, in URI format, of the MLflow model. For example:

                      - ``/Users/me/path/to/local/model``
                      - ``relative/path/to/local/model``
                      - ``s3://my_bucket/path/to/model``
                      - ``runs:/<mlflow_run_id>/run-relative/path/to/model``
                      - ``models:/<model_name>/<model_version>``
                      - ``models:/<model_name>/<stage>``

                      For more information about supported URI schemes, see
                      `Referencing Artifacts <https://www.mlflow.org/docs/latest/concepts.html#
                      artifact-locations>`_.

    :param tf_sess: The TensorFlow session in which to load the model. If using TensorFlow
                    version >= 2.0.0, this argument is ignored. If using TensorFlow <2.0.0, if no
                    session is passed to this function, MLflow will attempt to load the model using
                    the default TensorFlow session. If no default session is available, then the
                    function raises an exception.
    :return: For TensorFlow < 2.0.0, a TensorFlow signature definition of type:
             ``tensorflow.core.protobuf.meta_graph_pb2.SignatureDef``. This defines the input and
             output tensors for model inference.
             For TensorFlow >= 2.0.0, A callable graph (tf.function) that takes inputs and
             returns inferences.

    >>> import mlflow.tensorflow
    >>> import tensorflow as tf
    >>> tf_graph = tf.Graph()
    >>> tf_sess = tf.Session(graph=tf_graph)
    >>> with tf_graph.as_default():
    >>>     signature_def = mlflow.tensorflow.load_model(model_uri="model_uri",
    >>>                                                  tf_sess=tf_sess)
    >>>     input_tensors = [tf_graph.get_tensor_by_name(input_signature.name)
    >>>                      for _, input_signature in signature_def.inputs.items()]
    >>>     output_tensors = [tf_graph.get_tensor_by_name(output_signature.name)
    >>>                       for _, output_signature in signature_def.outputs.items()]
    """
    if LooseVersion(tensorflow.__version__) < LooseVersion('2.0.0'):
        # TF 1.x: a session is mandatory; fall back to the default one.
        if not tf_sess:
            tf_sess = tensorflow.get_default_session()
            if not tf_sess:
                raise MlflowException("No TensorFlow session found while calling load_model(). " +
                                      "You can set the default Tensorflow session before calling" +
                                      " load_model via `session.as_default()`, or directly pass " +
                                      "a session in which to load the model via the tf_sess " +
                                      "argument.")
    else:
        # TF 2.x: an explicitly passed session only triggers a warning.
        if tf_sess:
            warnings.warn("A TensorFlow session was passed into load_model, but the " +
                          "currently used version is TF 2.0 where sessions are deprecated. " +
                          "The tf_sess argument will be ignored.", FutureWarning)
    local_model_path = _download_artifact_from_uri(artifact_uri=model_uri)
    tf_saved_model_dir, tf_meta_graph_tags, tf_signature_def_key =\
        _get_and_parse_flavor_configuration(model_path=local_model_path)
    return _load_tensorflow_saved_model(tf_saved_model_dir=tf_saved_model_dir,
                                        tf_meta_graph_tags=tf_meta_graph_tags,
                                        tf_signature_def_key=tf_signature_def_key,
                                        tf_sess=tf_sess)
def set_discount_d_loss_factor(self, value=.1):
    """Run ``self.set_discount_d_loss_fac_op`` with ``value`` fed in.

    Must be called while a default TensorFlow session is active.

    Raises:
        ValueError: if there is no default session.
    """
    session = tf.get_default_session()
    if session is not None:
        feeds = {self.D_loss_factor_ph: value}
        session.run(self.set_discount_d_loss_fac_op, feed_dict=feeds)
        return
    raise ValueError("in the with tf.Session() as sess block")
def run_optimize(*, feed_dict):
    """Run ``optimize_op`` in the default session with ``feed_dict``.

    ``feed_dict`` is keyword-only and must contain an entry for ``lr``
    (both ``lr`` and ``optimize_op`` come from the enclosing scope).
    Returns whatever ``Session.run`` returns for ``optimize_op``.
    """
    assert lr in feed_dict, 'feed_dict need to contain learning rate.'
    session = tf.get_default_session()
    result = session.run(optimize_op, feed_dict)
    return result
def start(self):
    """Remember the current default session, then delegate to the parent ``start``."""
    default_session = tf.get_default_session()
    # Stored before calling the parent, exactly as the original ordering did.
    self._sess = default_session
    super().start()
def __init__(self,
             observation_space,
             action_space,
             config,
             existing_inputs=None):
    """
    Build the PPO policy graph in the current default graph/session.

    Creates (or reuses) the input placeholders, the policy model, the
    optional separate value-function model, the PPO loss, and then runs
    ``tf.global_variables_initializer()`` in ``self.sess``.

    Arguments:
        observation_space: Environment observation space specification.
        action_space: Environment action space specification.
        config (dict): Configuration values for PPO graph. Merged on top
            of ``ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG``.
        existing_inputs (list): Optional list of tuples that specify the
            placeholders upon which the graph should be built upon.
            The first 8 entries are unpacked as the loss placeholders,
            entries ``[8:-1]`` as the state inputs and the last entry as
            the sequence lengths.
    """
    # User-supplied values override the defaults.
    config = dict(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, **config)
    # NOTE(review): this is None when no default session is active; the
    # later self.sess.run(...) call would then fail - confirm callers
    # always construct this object inside a session context.
    self.sess = tf.get_default_session()
    self.action_space = action_space
    self.config = config
    self.kl_coeff_val = self.config["kl_coeff"]
    self.kl_target = self.config["kl_target"]
    dist_cls, logit_dim = ModelCatalog.get_action_dist(
        action_space, self.config["model"])

    if existing_inputs:
        # Reuse placeholders supplied by the caller instead of creating new ones.
        obs_ph, value_targets_ph, adv_ph, act_ph, \
            logits_ph, vf_preds_ph, prev_actions_ph, prev_rewards_ph = \
            existing_inputs[:8]
        existing_state_in = existing_inputs[8:-1]
        existing_seq_lens = existing_inputs[-1]
    else:
        obs_ph = tf.placeholder(tf.float32,
                                name="obs",
                                shape=(None, ) + observation_space.shape)
        adv_ph = tf.placeholder(tf.float32,
                                name="advantages",
                                shape=(None, ))
        act_ph = ModelCatalog.get_action_placeholder(action_space)
        logits_ph = tf.placeholder(tf.float32,
                                   name="logits",
                                   shape=(None, logit_dim))
        vf_preds_ph = tf.placeholder(tf.float32,
                                     name="vf_preds",
                                     shape=(None, ))
        value_targets_ph = tf.placeholder(tf.float32,
                                          name="value_targets",
                                          shape=(None, ))
        prev_actions_ph = ModelCatalog.get_action_placeholder(action_space)
        prev_rewards_ph = tf.placeholder(tf.float32, [None],
                                         name="prev_reward")
        existing_state_in = None
        existing_seq_lens = None
    self.observations = obs_ph
    self.prev_actions = prev_actions_ph
    self.prev_rewards = prev_rewards_ph

    # (batch key, placeholder) pairs handed to TFPolicyGraph as loss_inputs.
    # The order matches the unpacking of existing_inputs[:8] above.
    self.loss_in = [
        (SampleBatch.CUR_OBS, obs_ph),
        (Postprocessing.VALUE_TARGETS, value_targets_ph),
        (Postprocessing.ADVANTAGES, adv_ph),
        (SampleBatch.ACTIONS, act_ph),
        (BEHAVIOUR_LOGITS, logits_ph),
        (SampleBatch.VF_PREDS, vf_preds_ph),
        (SampleBatch.PREV_ACTIONS, prev_actions_ph),
        (SampleBatch.PREV_REWARDS, prev_rewards_ph),
    ]
    self.model = ModelCatalog.get_model(
        {
            "obs": obs_ph,
            "prev_actions": prev_actions_ph,
            "prev_rewards": prev_rewards_ph,
            "is_training": self._get_is_training_placeholder(),
        },
        observation_space,
        action_space,
        logit_dim,
        self.config["model"],
        state_in=existing_state_in,
        seq_lens=existing_seq_lens)

    # KL Coefficient
    # Non-trainable scalar variable, initialised from config["kl_coeff"].
    self.kl_coeff = tf.get_variable(initializer=tf.constant_initializer(
        self.kl_coeff_val),
                                    name="kl_coeff",
                                    shape=(),
                                    trainable=False,
                                    dtype=tf.float32)

    self.logits = self.model.outputs
    curr_action_dist = dist_cls(self.logits)
    self.sampler = curr_action_dist.sample()
    if self.config["use_gae"]:
        if self.config["vf_share_layers"]:
            self.value_function = self.model.value_function()
        else:
            vf_config = self.config["model"].copy()
            # Do not split the last layer of the value function into
            # mean parameters and standard deviation parameters and
            # do not make the standard deviations free variables.
            vf_config["free_log_std"] = False
            if vf_config["use_lstm"]:
                vf_config["use_lstm"] = False
                logger.warning(
                    "It is not recommended to use a LSTM model with "
                    "vf_share_layers=False (consider setting it to True). "
                    "If you want to not share layers, you can implement "
                    "a custom LSTM model that overrides the "
                    "value_function() method.")
            # Separate value network with a single output unit.
            with tf.variable_scope("value_function"):
                self.value_function = ModelCatalog.get_model(
                    {
                        "obs": obs_ph,
                        "prev_actions": prev_actions_ph,
                        "prev_rewards": prev_rewards_ph,
                        "is_training": self._get_is_training_placeholder(),
                    }, observation_space, action_space, 1,
                    vf_config).outputs
                self.value_function = tf.reshape(self.value_function, [-1])
    else:
        # Without GAE the value function is a zero tensor with one entry
        # per row of obs_ph.
        self.value_function = tf.zeros(shape=tf.shape(obs_ph)[:1])

    if self.model.state_in:
        # Model exposes state inputs: build the mask from the sequence
        # lengths and flatten it.
        max_seq_len = tf.reduce_max(self.model.seq_lens)
        mask = tf.sequence_mask(self.model.seq_lens, max_seq_len)
        mask = tf.reshape(mask, [-1])
    else:
        mask = tf.ones_like(adv_ph, dtype=tf.bool)

    self.loss_obj = PPOLoss(action_space,
                            value_targets_ph,
                            adv_ph,
                            act_ph,
                            logits_ph,
                            vf_preds_ph,
                            curr_action_dist,
                            self.value_function,
                            self.kl_coeff,
                            mask,
                            entropy_coeff=self.config["entropy_coeff"],
                            clip_param=self.config["clip_param"],
                            vf_clip_param=self.config["vf_clip_param"],
                            vf_loss_coeff=self.config["vf_loss_coeff"],
                            use_gae=self.config["use_gae"])

    # Both base-class initialisers are invoked explicitly (no super() chain).
    LearningRateSchedule.__init__(self, self.config["lr"],
                                  self.config["lr_schedule"])
    TFPolicyGraph.__init__(
        self,
        observation_space,
        action_space,
        self.sess,
        obs_input=obs_ph,
        action_sampler=self.sampler,
        action_prob=curr_action_dist.sampled_action_prob(),
        loss=self.loss_obj.loss,
        model=self.model,
        loss_inputs=self.loss_in,
        state_inputs=self.model.state_in,
        state_outputs=self.model.state_out,
        prev_action_input=prev_actions_ph,
        prev_reward_input=prev_rewards_ph,
        seq_lens=self.model.seq_lens,
        max_seq_len=config["model"]["max_seq_len"])

    self.sess.run(tf.global_variables_initializer())
    self.explained_variance = explained_variance(value_targets_ph,
                                                 self.value_function)
    # Tensors exposed as training statistics.
    self.stats_fetches = {
        "cur_kl_coeff": self.kl_coeff,
        "cur_lr": tf.cast(self.cur_lr, tf.float64),
        "total_loss": self.loss_obj.loss,
        "policy_loss": self.loss_obj.mean_policy_loss,
        "vf_loss": self.loss_obj.mean_vf_loss,
        "vf_explained_var": self.explained_variance,
        "kl": self.loss_obj.mean_kl,
        "entropy": self.loss_obj.mean_entropy
    }
def __call__(self, theta):
    """Run ``self.op`` in the default session, feeding ``theta`` into ``self.theta``."""
    session = tf.get_default_session()
    feeds = {self.theta: theta}
    session.run(self.op, feed_dict=feeds)
def save(self):
    """Write a checkpoint via ``save_checkpoint`` using the default session.

    Passes this object's ``saver``, ``checkpoint_dir`` and ``global_step``.
    """
    session = tf.get_default_session()
    save_checkpoint(
        session,
        self.saver,
        self.checkpoint_dir,
        self.global_step,
    )
def have_data_for_dequeue(self):
    """Return the result of ``self.have_more`` evaluated with the default session."""
    session = tf.get_default_session()
    return self.have_more(session)
def learn(env,
          eval_env,
          policy_func,
          reward_giver,
          expert_dataset,
          rank,
          pretrained,
          pretrained_weight,
          *,
          g_step,
          d_step,
          entcoeff,
          save_per_iter,
          ckpt_dir,
          log_dir,
          timesteps_per_batch,
          evaluation_freq,
          task_name,
          gamma,
          lam,
          max_kl,
          cg_iters,
          cg_damping=1e-2,
          vf_stepsize=3e-4,
          d_stepsize=3e-4,
          vf_iters=3,
          num_epochs=1000,
          callback=None):
    """Alternate TRPO policy updates with discriminator updates for ``num_epochs`` epochs.

    Each epoch performs ``g_step`` policy ("G") updates, each on a freshly
    sampled trajectory segment, followed by minibatch updates of
    ``reward_giver`` ("D") on the generated and expert data.

    Args:
        env: environment used for rollouts; supplies the observation/action spaces.
        eval_env: environment passed to ``evaluate_policy``.
        policy_func: callable ``(name, ob_space, ac_space, ...)`` building a policy.
        reward_giver: discriminator; must provide ``get_trainable_variables``,
            ``loss_name`` and ``lossandgrad``.
        expert_dataset: object providing ``get_next_batch(n)``.
        rank: ignored - immediately overwritten with the MPI rank.
        pretrained: unused in this function.
        pretrained_weight: optional checkpoint loaded into ``pi`` before training.
        g_step / d_step: policy updates per epoch / divisor used to derive the
            discriminator minibatch size.
        entcoeff: entropy bonus coefficient.
        save_per_iter: a checkpoint is written (on rank 0) every this many epochs.
        ckpt_dir / task_name: checkpoint location is ``ckpt_dir/task_name``.
        log_dir: unused (logger configuration is commented out).
        timesteps_per_batch: passed to the trajectory generator and evaluation.
        evaluation_freq: evaluate every this many policy updates.
        gamma, lam: passed to ``add_vtarg_and_adv``.
        max_kl: KL trust-region bound.
        cg_iters, cg_damping: conjugate-gradient settings.
        vf_stepsize, vf_iters: value-function Adam step size and passes.
        d_stepsize: discriminator Adam step size.
        num_epochs: number of outer epochs.
        callback: optional callable invoked as ``callback(locals(), globals())``
            at the start of each epoch.
    """
    # configure log
    # logger.configure(dir=log_dir)
    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)

    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi",
                     ob_space,
                     ac_space,
                     reuse=(pretrained_weight is not None))
    oldpi = policy_func("oldpi", ob_space, ac_space)
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = entcoeff * meanent

    vferr = tf.reduce_mean(tf.square(pi.vpred - ret))

    ratio = tf.exp(pi.pd.logp(ac) -
                   oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = pi.get_trainable_variables()
    var_list = [
        v for v in all_var_list
        if v.name.startswith(("pi/pol", "pi/logstd"))
    ]
    vf_var_list = [v for v in all_var_list if v.name.startswith("pi/vff")]
    # assert len(var_list) == len(vf_var_list) + 1
    d_adam = MpiAdam(reward_giver.get_trainable_variables())
    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32,
                                  shape=[None],
                                  name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    # Slice the flat tangent vector back into one tensor per variable.
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(klgrads, tangents)
    ])  # pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses +
                                     [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        # Only rank 0 prints timing information; other ranks just run the body.
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        # Average an ndarray across all MPI workers.
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()
    # Make every worker start from rank 0's parameters.
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    d_adam.sync()
    vfadam.sync()
    if rank == 0:
        print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     reward_giver,
                                     timesteps_per_batch,
                                     stochastic=True)

    # These locals are kept (even where unused here) because
    # callback(locals(), globals()) exposes them to the caller.
    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards
    true_rewbuffer = deque(maxlen=40)

    g_loss_stats = stats(loss_names)
    d_loss_stats = stats(reward_giver.loss_name)
    ep_stats = stats(["True_rewards", "Rewards", "Episode_length"])
    # if provide pretrained weight
    if pretrained_weight is not None:
        U.load_state(pretrained_weight, var_list=pi.get_variables())

    for epoch in range(num_epochs):
        if callback:
            callback(locals(), globals())

        # Save model
        if rank == 0 and iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
            fname = os.path.join(ckpt_dir, task_name)
            os.makedirs(os.path.dirname(fname), exist_ok=True)
            saver = tf.train.Saver()
            saver.save(tf.get_default_session(), fname)

        logger.log("********** Epoch %i ************" % epoch)

        def fisher_vector_product(p):
            # fvpargs is bound inside the g_step loop below before this is called.
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        # ------------------ Update G ------------------
        logger.log("Optimizing Policy...")
        total_obs = []
        total_acs = []
        total_ep_rets = []
        total_ep_lens = []
        total_ep_true_rets = []
        for g_step_num in range(g_step):
            with timed("sampling"):
                seg = seg_gen.__next__()

            # Add seg into total_seg
            total_obs.append(seg["ob"])
            total_acs.append(seg["ac"])
            total_ep_rets.append(seg["ep_rets"])
            total_ep_lens.append(seg["ep_lens"])
            total_ep_true_rets.append(seg["ep_true_rets"])

            add_vtarg_and_adv(seg, gamma, lam)
            # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
            ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
                "tdlamret"]
            vpredbefore = seg[
                "vpred"]  # predicted value function before update
            atarg = (atarg - atarg.mean()) / atarg.std(
            )  # standardized advantage function estimate

            if hasattr(pi, "ob_rms"):
                pi.ob_rms.update(ob)  # update running mean/std for policy

            args = seg["ob"], seg["ac"], atarg
            # Every 5th sample is used for the Fisher-vector product.
            fvpargs = [arr[::5] for arr in args]

            assign_old_eq_new(
            )  # set old parameter values to new parameter values
            with timed("computegrad"):
                *lossbefore, g = compute_lossandgrad(*args)
            lossbefore = allmean(np.array(lossbefore))
            g = allmean(g)
            if np.allclose(g, 0):
                logger.log("Got zero gradient. not updating")
            else:
                with timed("cg"):
                    stepdir = cg(fisher_vector_product,
                                 g,
                                 cg_iters=cg_iters,
                                 verbose=rank == 0)
                assert np.isfinite(stepdir).all()
                shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
                lm = np.sqrt(shs / max_kl)
                # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
                fullstep = stepdir / lm
                expectedimprove = g.dot(fullstep)
                surrbefore = lossbefore[0]
                stepsize = 1.0
                thbefore = get_flat()
                # Backtracking line search: halve the step up to 10 times.
                for _ in range(10):
                    thnew = thbefore + fullstep * stepsize
                    set_from_flat(thnew)
                    meanlosses = surr, kl, *_ = allmean(
                        np.array(compute_losses(*args)))
                    improve = surr - surrbefore
                    logger.log("Expected: %.3f Actual: %.3f" %
                               (expectedimprove, improve))
                    if not np.isfinite(meanlosses).all():
                        logger.log("Got non-finite value of losses -- bad!")
                    elif kl > max_kl * 1.5:
                        logger.log("violated KL constraint. shrinking step.")
                    elif improve < 0:
                        logger.log("surrogate didn't improve. shrinking step.")
                    else:
                        logger.log("Stepsize OK!")
                        break
                    stepsize *= .5
                else:
                    logger.log("couldn't compute a good step")
                    set_from_flat(thbefore)
                if nworkers > 1 and iters_so_far % 20 == 0:
                    # Sanity check that all workers hold identical parameters.
                    paramsums = MPI.COMM_WORLD.allgather(
                        (thnew.sum(),
                         vfadam.getflat().sum()))  # list of tuples
                    assert all(
                        np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

            with timed("vf"):
                for _ in range(vf_iters):
                    for (mbob, mbret) in dataset.iterbatches(
                        (seg["ob"], seg["tdlamret"]),
                            include_final_partial_batch=False,
                            batch_size=128):
                        # NOTE(review): this checks "obs_rms" while the policy
                        # update above checks "ob_rms" - confirm which attribute
                        # the policy actually exposes.
                        if hasattr(pi, "obs_rms"):
                            pi.obs_rms.update(
                                mbob)  # update running mean/std for policy
                        g = allmean(compute_vflossandgrad(mbob, mbret))
                        vfadam.update(g, vf_stepsize)

            # Evaluate current policy
            if (g_step * epoch + g_step_num) % evaluation_freq == 0:
                evaluate_policy(pi, reward_giver, eval_env,
                                g_step * epoch + g_step_num,
                                timesteps_per_batch, tstart)

        # ------------------ Update D ------------------
        logger.log("Optimizing Discriminator...")
        total_obs = np.vstack(total_obs)
        total_acs = np.vstack(total_acs)
        total_ep_rets = np.concatenate(total_ep_rets)
        total_ep_lens = np.concatenate(total_ep_lens)
        total_ep_true_rets = np.concatenate(total_ep_true_rets)

        logger.log(fmt_row(13, reward_giver.loss_name))
        # NOTE(review): this batch is fetched and then overwritten inside the
        # loop; kept because the call may advance the dataset's position.
        ob_expert, ac_expert = expert_dataset.get_next_batch(len(total_obs))
        batch_size = len(total_obs) // d_step
        d_losses = [
        ]  # list of tuples, each of which gives the loss for a minibatch
        for ob_batch, ac_batch in dataset.iterbatches(
            (total_obs, total_acs),
                include_final_partial_batch=False,
                batch_size=batch_size):
            ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob_batch))
            # Update running mean/std for reward_giver
            if hasattr(reward_giver, "obs_rms"):
                reward_giver.obs_rms.update(
                    np.concatenate((ob_batch, ob_expert), 0))
            *newlosses, g = reward_giver.lossandgrad(ob_batch, ac_batch,
                                                     ob_expert, ac_expert)
            d_adam.update(allmean(g), d_stepsize)
            d_losses.append(newlosses)
        logger.log(fmt_row(13, np.mean(d_losses, axis=0)))

        # Advance the iteration counter so the `% save_per_iter` and `% 20`
        # checks above are periodic instead of firing on every epoch.
        iters_so_far += 1
def one_more_enqueue_is_enough(self):
    """Evaluate ``self.one_more_enqueue_is_enough_op`` in the default session and return it."""
    return tf.get_default_session().run(self.one_more_enqueue_is_enough_op)
def get_session(config=None):
    """Return the default session, creating one from ``config`` if none exists.

    A newly created session is made the default (``make_default=True``).
    """
    current = tf.get_default_session()
    if current is not None:
        return current
    return make_session(config=config, make_default=True)
def __call__(self):
    """Evaluate ``self.op`` in the default session and return the result."""
    session = tf.get_default_session()
    return session.run(self.op)
def deep_q_learning(sess,
                    env,
                    q_estimator,
                    target_estimator,
                    state_processor,
                    num_episodes,
                    experiment_dir,
                    replay_memory_size=500000,
                    replay_memory_init_size=50000,
                    update_target_estimator_every=10000,
                    discount_factor=0.99,
                    epsilon_start=1.0,
                    epsilon_end=0.1,
                    epsilon_decay_steps=500000,
                    batch_size=32,
                    record_video_every=50):
    """
    Q-Learning algorithm for off-policy TD control using Function Approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    This is a generator: after every episode it yields the running totals,
    and the full statistics object is the generator's return value.

    Args:
        sess: Tensorflow Session object. Used for every TensorFlow call in
            this function, including checkpoint saving.
        env: OpenAI environment
        q_estimator: Estimator object used for the q values
        target_estimator: Estimator object used for the targets
        state_processor: A StateProcessor object
        num_episodes: Number of episodes to run for
        experiment_dir: Directory to save Tensorflow summaries in
        replay_memory_size: Size of the replay memory
        replay_memory_init_size: Number of random experiences to sample when
          initializing the replay memory.
        update_target_estimator_every: Copy parameters from the Q estimator to the
          target estimator every N steps
        discount_factor: Gamma discount factor
        epsilon_start: Chance to sample a random action when taking an action.
          Epsilon is decayed over time and this is the start value
        epsilon_end: The final minimum value of epsilon after decaying is done
        epsilon_decay_steps: Number of steps to decay epsilon over
        batch_size: Size of batches to sample from the replay memory
        record_video_every: Record a video every N episodes

    Yields:
        ``(total_t, EpisodeStats)`` after each episode, where the stats cover
        the episodes completed so far.

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    Transition = namedtuple(
        "Transition", ["state", "action", "reward", "next_state", "done"])

    # The replay memory
    replay_memory = []

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    # Create directories for checkpoints and summaries
    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    checkpoint_path = os.path.join(checkpoint_dir, "model")
    monitor_path = os.path.join(experiment_dir, "monitor")

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    if not os.path.exists(monitor_path):
        os.makedirs(monitor_path)

    saver = tf.train.Saver()
    # Load a previous checkpoint if we find one
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print("Loading model checkpoint {}...\n".format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)

    total_t = sess.run(tf.contrib.framework.get_global_step())

    # The epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

    # The policy we're following
    policy = make_epsilon_greedy_policy(q_estimator, len(VALID_ACTIONS))

    # Populate the replay memory with initial experience
    print("Populating replay memory...")
    state = env.reset()
    state = state_processor.process(sess, state)
    # The network input is a stack of 4 frames; start with 4 copies of the first.
    state = np.stack([state] * 4, axis=2)
    for i in range(replay_memory_init_size):
        action_probs = policy(sess, state,
                              epsilons[min(total_t, epsilon_decay_steps - 1)])
        action = np.random.choice(np.arange(len(action_probs)),
                                  p=action_probs)
        next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
        next_state = state_processor.process(sess, next_state)
        # Drop the oldest frame and append the newest one.
        next_state = np.append(state[:, :, 1:],
                               np.expand_dims(next_state, 2),
                               axis=2)
        replay_memory.append(
            Transition(state, action, reward, next_state, done))
        if done:
            state = env.reset()
            state = state_processor.process(sess, state)
            state = np.stack([state] * 4, axis=2)
        else:
            state = next_state

    # Record videos
    # Use the gym env Monitor wrapper
    env = Monitor(env,
                  directory=monitor_path,
                  resume=True,
                  video_callable=lambda count: count % record_video_every == 0)

    for i_episode in range(num_episodes):

        # Save the current checkpoint.
        # Use the explicit session: tf.get_default_session() is None when
        # `sess` has not been installed as the default.
        saver.save(sess, checkpoint_path)

        # Reset the environment
        state = env.reset()
        state = state_processor.process(sess, state)
        state = np.stack([state] * 4, axis=2)
        loss = None

        # One step in the environment
        for t in itertools.count():

            # Epsilon for this time step
            epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)]

            # Add epsilon to Tensorboard
            episode_summary = tf.Summary()
            episode_summary.value.add(simple_value=epsilon, tag="epsilon")
            q_estimator.summary_writer.add_summary(episode_summary, total_t)

            # Maybe update the target estimator
            if total_t % update_target_estimator_every == 0:
                copy_model_parameters(sess, q_estimator, target_estimator)
                print("\nCopied model parameters to target network.")

            # Print out which step we're on, useful for debugging.
            print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format(
                t, total_t, i_episode + 1, num_episodes, loss),
                  end="")
            sys.stdout.flush()

            # Take a step
            action_probs = policy(sess, state, epsilon)
            action = np.random.choice(np.arange(len(action_probs)),
                                      p=action_probs)
            next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
            next_state = state_processor.process(sess, next_state)
            next_state = np.append(state[:, :, 1:],
                                   np.expand_dims(next_state, 2),
                                   axis=2)

            # If our replay memory is full, pop the first element
            if len(replay_memory) == replay_memory_size:
                replay_memory.pop(0)

            # Save transition to replay memory
            replay_memory.append(
                Transition(state, action, reward, next_state, done))

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # Sample a minibatch from the replay memory
            samples = random.sample(replay_memory, batch_size)
            states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(
                np.array, zip(*samples))

            # Calculate q values and targets (Double DQN):
            # actions are chosen by the online network, values come from the
            # target network.
            q_values_next = q_estimator.predict(sess, next_states_batch)
            best_actions = np.argmax(q_values_next, axis=1)
            q_values_next_target = target_estimator.predict(
                sess, next_states_batch)
            targets_batch = reward_batch + np.invert(done_batch).astype(np.float32) * \
                discount_factor * q_values_next_target[np.arange(batch_size), best_actions]

            # Perform gradient descent update
            states_batch = np.array(states_batch)
            loss = q_estimator.update(sess, states_batch, action_batch,
                                      targets_batch)

            if done:
                break

            state = next_state
            total_t += 1

        # Add summaries to tensorboard
        episode_summary = tf.Summary()
        episode_summary.value.add(
            simple_value=stats.episode_rewards[i_episode],
            node_name="episode_reward",
            tag="episode_reward")
        episode_summary.value.add(
            simple_value=stats.episode_lengths[i_episode],
            node_name="episode_length",
            tag="episode_length")
        q_estimator.summary_writer.add_summary(episode_summary, total_t)
        q_estimator.summary_writer.flush()

        yield total_t, plotting.EpisodeStats(
            episode_lengths=stats.episode_lengths[:i_episode + 1],
            episode_rewards=stats.episode_rewards[:i_episode + 1])

    env.monitor.close()
    return stats
# Small demonstration script: prints tf.get_default_session() in three
# situations to show when a Session is (and is not) the default session.
# NOTE(review): the original indentation of this snippet was lost; the
# nesting below (sess2 created at top level, outside the first `with`)
# is assumed - confirm against the original file.
import tensorflow as tf

# Case 1: inside `with tf.Session() as sess:`.
with tf.Session() as sess:
    print("1111111111111111111111111111111")
    print(tf.get_default_session())

# Case 2: a session that is created but never entered as a context manager.
sess2 = tf.Session()
print("2222222222222222222222222222222222")
print(tf.get_default_session())

# Case 3: a session created first and entered afterwards via `with sess3:`.
sess3 = tf.Session()
with sess3:
    print("33333333333333333333333333333333")
    print(tf.get_default_session())
def run(*args, **kwargs):
    """Forward all arguments to ``run`` on the default session and return its result."""
    session = tf.get_default_session()
    return session.run(*args, **kwargs)
def build_model(hps, kind="train", datasets=None):
    """Builds a model from either random initialization, or saved parameters.

    Also writes the hyperparameters to
    ``<hps.lfads_save_dir>/hyperparameters-<step>.txt`` as JSON, where
    ``<step>`` is the numeric suffix of the checkpoint path (``0`` when no
    checkpoint state exists or the path has no such suffix).

    Args:
      hps: The hyper parameters for the model.
      kind: (optional) The kind of model to build.  Training vs inference require
        different graphs.
      datasets: The datasets structure (see top of lfads.py).

    Returns:
      an LFADS model.
    """

    # "write_model_params" needs the training graph.
    build_kind = kind
    if build_kind == "write_model_params":
        build_kind = "train"
    with tf.variable_scope("LFADS", reuse=None):
        model = LFADS(hps, kind=build_kind, datasets=datasets)

    if not os.path.exists(hps.lfads_save_dir):
        print("Save directory %s does not exist, creating it." %
              hps.lfads_save_dir)
        os.makedirs(hps.lfads_save_dir)

    # An empty load name means "use the latest training checkpoint".
    cp_pb_ln = hps.checkpoint_pb_load_name
    cp_pb_ln = 'checkpoint' if cp_pb_ln == "" else cp_pb_ln
    if cp_pb_ln == 'checkpoint':
        print("Loading latest training checkpoint in: ", hps.lfads_save_dir)
        saver = model.seso_saver
    elif cp_pb_ln == 'checkpoint_lve':
        print("Loading lowest validation checkpoint in: ", hps.lfads_save_dir)
        saver = model.lve_saver
    else:
        print("Loading checkpoint: ", cp_pb_ln, ", in: ", hps.lfads_save_dir)
        saver = model.seso_saver

    ckpt = tf.train.get_checkpoint_state(hps.lfads_save_dir,
                                         latest_filename=cp_pb_ln)

    session = tf.get_default_session()
    print("ckpt: ", ckpt)
    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
        saver.restore(session, ckpt.model_checkpoint_path)
    else:
        print("Created model with fresh parameters.")
        if kind in [
                "posterior_sample_and_average", "posterior_push_mean",
                "prior_sample", "write_model_params"
        ]:
            # These kinds are only meaningful on a trained model.
            print("Possible error!!! You are running ", kind,
                  " on a newly initialized model!")
            # cannot print ckpt.model_check_point path if no ckpt
            print("Are you sure a checkpoint in ", hps.lfads_save_dir,
                  " exists?")

        tf.global_variables_initializer().run()

    # Derive the "-<step>" suffix for the hyperparameter file name.  Fall back
    # to "-0" when there is no checkpoint state or the path has no suffix.
    train_step_str = '-0'
    if ckpt:
        step_match = re.search(r'-[0-9]+$', ckpt.model_checkpoint_path)
        if step_match is not None:
            train_step_str = step_match.group()

    fname = 'hyperparameters' + train_step_str + '.txt'
    hp_fname = os.path.join(hps.lfads_save_dir, fname)
    hps_for_saving = jsonify_dict(hps)
    utils.write_data(hp_fname, hps_for_saving, use_json=True)

    return model
def _run(self):
    """Run every stage of the curriculum in order.

    For each stage this creates a fresh TensorFlow graph and session,
    builds the environment and updater, optionally loads weights according
    to ``cfg.load_path``, runs ``self._run_stage`` and then records results,
    saves the final weights and (if possible) evaluates the best hypothesis.
    Stops early when a stage fails to reach its stopping-criteria threshold
    unless ``cfg.power_through`` is set.
    """
    print(cfg.to_string())

    threshold_reached = True
    self.global_step = 0
    self.n_global_experiences = 0
    # Shallow copy so popping stages does not modify self.curriculum.
    self.curriculum_remaining = self.curriculum + []
    self.curriculum_complete = []

    stage_idx = 0
    while self.curriculum_remaining:
        print("\n" + "=" * 50)
        self.timestamp("Starting stage {}".format(stage_idx))
        print("\n")

        if cfg.start_tensorboard:
            restart_tensorboard(self.experiment_store.path, cfg.tbport, cfg.reload_interval)

        stage_config = self.curriculum_remaining.pop(0)
        stage_config = Config(stage_config)

        self.data.start_stage(stage_idx, stage_config)

        with ExitStack() as stack:

            # --------------- Stage set-up -------------------

            print("\n" + "-" * 10 + " Stage set-up " + "-" * 10)

            print("\nNew config values for this stage are: \n{}\n".format(pformat(stage_config)))

            stack.enter_context(stage_config)

            stage_prepare_func = cfg.get("stage_prepare_func", None)
            if callable(stage_prepare_func):
                stage_prepare_func()  # Modify the stage config in arbitrary ways before starting stage

            self.mpi_context.start_stage()

            # Configure and create session and graph for stage.
            session_config = tf.ConfigProto()
            session_config.intra_op_parallelism_threads = cfg.get('intra_op_parallelism_threads', 0)
            session_config.inter_op_parallelism_threads = cfg.get('inter_op_parallelism_threads', 0)
            session_config.log_device_placement = cfg.get('log_device_placement', 0)

            if cfg.use_gpu:
                per_process_gpu_memory_fraction = getattr(cfg, 'per_process_gpu_memory_fraction', None)
                if per_process_gpu_memory_fraction:
                    session_config.gpu_options.per_process_gpu_memory_fraction = per_process_gpu_memory_fraction

                gpu_allow_growth = getattr(cfg, 'gpu_allow_growth', None)
                if gpu_allow_growth:
                    session_config.gpu_options.allow_growth = gpu_allow_growth

            if cfg.use_gpu:
                print("Using GPU if available.")
                print("Using {}% of GPU memory.".format(
                    100 * session_config.gpu_options.per_process_gpu_memory_fraction))
                print("Allowing growth of GPU memory: {}".format(session_config.gpu_options.allow_growth))

            graph = tf.Graph()
            sess = tf.Session(graph=graph, config=session_config)

            # This HAS to come after the creation of the session, otherwise
            # it allocates all GPU memory if using the GPU.
            print("\nAvailable devices: ")
            from tensorflow.python.client import device_lib
            print(device_lib.list_local_devices())

            if not cfg.use_gpu:
                print("Not using GPU.")
                stack.enter_context(graph.device("/cpu:0"))

            stack.enter_context(graph.as_default())
            stack.enter_context(sess)
            stack.enter_context(sess.as_default())

            # Set the seed for the stage. Notice we generate a new tf seed for each stage.
            tf_seed = gen_seed()
            print("Setting tensorflow seed to generated seed: {}\n".format(tf_seed))
            tf.set_random_seed(tf_seed)

            # Set limit on CPU RAM for the stage
            cpu_ram_limit_mb = cfg.get("cpu_ram_limit_mb", None)
            if cpu_ram_limit_mb is not None:
                stack.enter_context(memory_limit(cfg.cpu_ram_limit_mb))

            print("Building env...\n")

            # Maybe build env
            if stage_idx == 0 or not cfg.preserve_env:
                if getattr(self, 'env', None):
                    self.env.close()

                self.env = cfg.build_env()

            if hasattr(self.env, "print_memory_footprint"):
                self.env.print_memory_footprint()

            print("\nDone building env.\n")
            print("Building updater...\n")

            import warnings
            with warnings.catch_warnings():
                warnings.simplefilter('once')

                if cfg.n_procs > 1:
                    updater = cfg.get_updater(self.env, mpi_context=self.mpi_context)
                else:
                    updater = cfg.get_updater(self.env)
                updater.stage_idx = stage_idx
                updater.exp_dir = self.exp_dir

                updater.build_graph()
                print("\nDone building updater.\n")

            walk_variable_scopes(max_depth=3)

            # Maybe initialize network weights.
            # Let a *path_specification* be one of three things:
            #     1. An integer specifying a stage to load the best hypothesis from.
            #     2. A string of format: "stage_idx,kind" where `stage_idx` specifies a stage to load from
            #        and `kind` is either "final" or "best", specifying whether to load final or best
            #        hypothesis from that stage.
            #     3. A path on the filesystem that gives a prefix for a tensorflow checkpoint file to load from.
            #
            # Then cfg.load_path can either be a path_specification itself, in which case all variables
            # in the network will be loaded from that path_specification, or a dictionary mapping from
            # variable scope names to path specifications, in which case all variables in each supplied
            # variable scope name will be loaded from the path_specification paired with that scope name.
            load_path = cfg.load_path
            if load_path is not None:
                if isinstance(load_path, (str, int)):
                    load_path = {"": load_path}

                load_path = dict(load_path)

                # Sort in increasing order, so that it if one variable scope lies within another scope,
                # the outer scope gets loaded before the inner scope, rather than having the outer scope
                # wipe out the inner scope.
                items = sorted(load_path.items())

                for var_scope, path in items:
                    variables = {v.name: v for v in trainable_variables(var_scope, for_opt=False)}
                    if not variables:
                        print("No variables to load in scope {}.".format(str(var_scope)))
                        continue

                    saver = tf.train.Saver(variables)

                    load_stage, kind = None, None

                    if isinstance(path, int):
                        load_stage = path
                        kind = "best"
                    elif isinstance(path, str):
                        try:
                            split = path.split(',')
                            load_stage = int(split[0])
                            # "stage_idx,kind" -> use the given kind; a bare
                            # "stage_idx" defaults to the best hypothesis.
                            kind = split[1] if len(split) > 1 else 'best'
                            assert kind in 'best final'.split(), "path={}".format(path)
                        except Exception:
                            # Not a stage specification: treat `path` as a
                            # checkpoint prefix on the filesystem.
                            load_stage, kind = None, None

                    if load_stage is not None:
                        if stage_idx == 0:
                            print(
                                "Not loading var scope \"{}\" from stage {}, "
                                "currently in stage 0.".format(var_scope, load_stage))
                            continue
                        else:
                            key = kind + '_path'
                            # Exclude the record of the stage currently running.
                            completed_history = self.data.history[:-1]
                            path = completed_history[load_stage][key]

                    path = os.path.realpath(path)

                    saver.restore(tf.get_default_session(), path)

                    print("Loading var scope \"{}\" from {}.".format(var_scope, path))
            else:
                print("Using a fresh set of weights, not loading anything.")

            tf.train.get_or_create_global_step()
            # Only initialize what was not restored above.
            sess.run(uninitialized_variables_initializer())
            sess.run(tf.assert_variables_initialized())

            for hook in cfg.hooks:
                assert isinstance(hook, Hook)
                hook.start_stage(self, updater, stage_idx)

            threshold_reached = False
            reason = None

            try:
                # --------------- Run stage -------------------

                start = time.time()
                phys_memory_before = memory_usage(physical=True)
                gpu_memory_before = gpu_memory_usage()

                threshold_reached, reason = self._run_stage(stage_idx, updater)

            except KeyboardInterrupt:
                reason = "User interrupt"

            except NotImplementedError as e:
                # There is a bug in pdb_postmortem that prevents instances of `NotImplementedError`
                # from being handled properly, so replace it with an instance of `Exception`.
                if cfg.robust:
                    traceback.print_exc()
                    reason = "Exception occurred ({})".format(repr(e))
                else:
                    raise Exception("NotImplemented") from e

            except Exception as e:
                reason = "Exception occurred ({})".format(repr(e))
                if cfg.robust:
                    traceback.print_exc()
                else:
                    raise

            except Alarm:
                reason = "Time limit exceeded"
                raise

            finally:
                # Runs even when the exception is re-raised, so the stage is
                # always recorded and its final weights saved.
                phys_memory_after = memory_usage(physical=True)
                gpu_memory_after = gpu_memory_usage()

                self.data.record_values_for_stage(
                    stage_duration=time.time()-start,
                    phys_memory_before_mb=phys_memory_before,
                    phys_memory_delta_mb=phys_memory_after - phys_memory_before,
                    gpu_memory_before_mb=gpu_memory_before,
                    gpu_memory_delta_mb=gpu_memory_after - gpu_memory_before
                )

                self.data.record_values_for_stage(reason=reason)

                print("\n" + "-" * 10 + " Optimization complete " + "-" * 10)
                print("\nReason: {}.\n".format(reason))

                final_path = self.data.path_for('weights/final_for_stage_{}'.format(stage_idx))
                final_path = cfg.get('save_path', final_path)
                final_path = updater.save(tf.get_default_session(), final_path)
                self.data.record_values_for_stage(final_path=final_path)

                # --------------- Maybe render performance of best hypothesis -------------------

                do_final_testing = (
                    "Exception occurred" not in reason
                    and reason != "Time limit exceeded"
                    and 'best_path' in self.data.current_stage_record)

                if do_final_testing:
                    try:
                        print("\n" + "-" * 10 + " Final testing/rendering " + "-" * 10)

                        print("Best hypothesis for this stage was found on "
                              "step (l: {best_local_step}, g: {best_global_step}) "
                              "with stopping criteria ({sc_name}) of {best_stopping_criteria}.".format(
                                  sc_name=self.stopping_criteria_name, **self.data.current_stage_record))

                        best_path = self.data.current_stage_record['best_path']
                        print("Loading best hypothesis for this stage "
                              "from file {}...".format(best_path))
                        updater.restore(sess, best_path)

                        test_record = updater.evaluate(cfg.batch_size, mode="test")

                        for hook in cfg.hooks:
                            if hook.call_per_timestep and hook.final:
                                hook_record = hook.step(self, updater)

                                if hook_record:
                                    assert len(hook_record) == 1
                                    for k, d in dict(hook_record).items():
                                        test_record.update(d)

                        self.data.record_values_for_stage(
                            **{'_test_' + k: v for k, v in test_record.items()})

                        if cfg.render_step > 0 and cfg.render_hook is not None:
                            print("Rendering...")
                            cfg.render_hook(updater)
                            print("Done rendering.")

                    except BaseException:
                        # Final testing is best-effort; never let it mask the
                        # outcome of the stage itself.
                        print("Exception occurred while performing final testing/rendering: ")
                        traceback.print_exc()

                else:
                    print("\n" + "-" * 10 + " Skipping final testing/rendering " + "-" * 10)

                # --------------- Finish up the stage -------------------

                self.data.end_stage(updater.n_updates)

                print("\n" + "-" * 10 + " Running end-of-stage hooks " + "-" * 10 + "\n")
                for hook in cfg.hooks:
                    hook.end_stage(self, stage_idx)

                print()
                self.timestamp("Done stage {}".format(stage_idx))
                print("=" * 50)

        stage_idx += 1
        self.curriculum_complete.append(stage_config)

        if not (threshold_reached or cfg.power_through):
            # stage_idx was already advanced; report the stage that just ran.
            print("Failed to reach stopping criteria threshold on stage {} "
                  "of the curriculum, terminating.".format(stage_idx - 1))
            break
def _run_stage(self, stage_idx, updater):
    """ Run main training loop for a stage of the curriculum.

    Every iteration of the loop, in order: dumps data on checkpoint steps,
    runs the per-timestep hooks, optionally renders and displays a summary,
    optionally evaluates (tracking the best hypothesis, early stopping and
    the stopping-criteria threshold), performs one update when
    `cfg.do_train` is set, and finally stores the data gathered in the step.

    Parameters
    ----------
    stage_idx:
        Index of the current stage; used to name the saved weights and
        passed along when storing step data.
    updater:
        Object that is trained. This method reads `n_updates`,
        `n_experiences` and `stopping_criteria` from it and calls its
        `evaluate`, `update` and `save` methods.

    Returns
    -------
    threshold_reached: bool
        Result of the most recent comparison of the stopping criteria against
        `cfg.threshold` (False if no such comparison took place).
    reason: str
        Human-readable description of why the loop terminated.

    Raises
    ------
    Exception
        If the stopping criteria specification does not contain both a name
        and a direction, or if the direction contains neither "max" nor "min".

    """
    threshold_reached = False
    reason = "NotStarted"

    # Parse stopping criteria, set up early stopping
    sc_spec = cfg.get("stopping_criteria", None)
    if not sc_spec:
        sc_spec = updater.stopping_criteria

    if isinstance(sc_spec, str):
        sc_spec = sc_spec.split(",")

    # A spec such as "loss" (no direction) used to surface as a bare IndexError.
    if len(sc_spec) < 2:
        raise Exception(
            "Stopping criteria must specify both a name and a direction "
            "(e.g. `name,min`), got: {}".format(sc_spec))

    self.stopping_criteria_name = sc_spec[0]
    sc_direction = sc_spec[1]
    if "max" in sc_direction:
        self.maximize_sc = True
    elif "min" in sc_direction:
        self.maximize_sc = False
    else:
        raise Exception("Ambiguous stopping criteria specification: {}".format(sc_direction))

    early_stop = EarlyStopHook(patience=cfg.patience, maximize=self.maximize_sc)

    # Start stage
    print("\n" + "-" * 10 + " Training begins " + "-" * 10)
    self.timestamp("")
    print()

    total_hooks_time = 0.0
    time_per_hook = 0.0

    total_eval_time = 0.0
    time_per_eval = 0.0

    total_train_time = 0.0
    time_per_example = 0.0
    time_per_update = 0.0

    n_eval = 0

    while True:
        # Check whether to keep training
        if updater.n_updates >= cfg.max_steps:
            reason = "Maximum number of steps-per-stage reached"
            break

        if updater.n_experiences >= cfg.max_experiences:
            reason = "Maximum number of experiences-per-stage reached"
            break

        local_step = updater.n_updates
        global_step = self.global_step

        if local_step > 0 and local_step % cfg.checkpoint_step == 0:
            self.data.dump_data(local_step)

        evaluate = (local_step % cfg.eval_step) == 0
        display = (local_step % cfg.display_step) == 0
        render = (cfg.render_step > 0
                  and (local_step % cfg.render_step) == 0
                  and (local_step > 0 or cfg.render_first))

        # (mode, record) pairs collected during this step; merged per mode below.
        data_to_store = []

        # --------------- Run hooks -------------------

        hooks_start = time.time()

        for hook in cfg.hooks:
            if not hook.call_per_timestep:
                continue

            # On step 0 only `hook.initial` decides; afterwards the hook runs every `hook.n` steps.
            # Plain boolean logic (instead of `|=`) also tolerates a non-bool `hook.initial`.
            run_hook = (
                (local_step == 0 and hook.initial)
                or (local_step > 0 and local_step % hook.n == 0))

            if run_hook:
                hook_record = hook.step(self, updater, local_step)

                if hook_record:
                    data_to_store.extend(dict(hook_record).items())

        hooks_duration = time.time() - hooks_start

        if render and cfg.render_hook is not None:
            print("Rendering...")
            cfg.render_hook(updater)
            print("Done rendering.")

        if display:
            print("Displaying...")
            self.data.summarize_current_stage(
                local_step, global_step, updater.n_experiences, self.n_global_experiences)
            print("\nMy PID: {}\n".format(os.getpid()))
            print("Physical memory use: {}mb".format(memory_usage(physical=True)))
            print("Virtual memory use: {}mb".format(memory_usage(physical=False)))

            print("Avg time per update: {}s".format(time_per_update))
            print("Avg time per eval: {}s".format(time_per_eval))
            print("Avg time for hooks: {}s".format(time_per_hook))

            if cfg.use_gpu:
                print(nvidia_smi())

        # --------------- Possibly evaluate -------------------

        if evaluate:
            print("Evaluating...")
            eval_start_time = time.time()
            val_record = updater.evaluate(cfg.batch_size, mode="val")
            eval_duration = time.time() - eval_start_time
            print("Done evaluating")

            val_record["duration"] = eval_duration

            n_eval += 1
            total_eval_time += eval_duration
            time_per_eval = total_eval_time / n_eval

            data_to_store.append(("val", val_record))

            if self.stopping_criteria_name not in val_record:
                print("Stopping criteria {} not in record returned "
                      "by updater, using 0.0.".format(self.stopping_criteria_name))

            # Kept under its own name so the parsed specification above is not clobbered.
            sc_value = val_record.get(self.stopping_criteria_name, 0.0)
            new_best, stop = early_stop.check(sc_value, local_step, val_record)

            if new_best:
                print("Storing new best on step (l={}, g={}), "
                      "constituting (l={}, g={}) experiences, "
                      "with stopping criteria ({}) of {}.".format(
                          local_step, global_step,
                          updater.n_experiences, self.n_global_experiences,
                          self.stopping_criteria_name, sc_value))

                best_path = self.data.path_for(
                    'weights/best_of_stage_{}'.format(stage_idx))
                best_path = cfg.get('save_path', best_path)

                weight_start = time.time()
                best_path = updater.save(tf.get_default_session(), best_path)

                print("Done saving weights, took {} seconds".format(time.time() - weight_start))

                self.data.record_values_for_stage(
                    best_path=best_path, best_global_step=global_step)
                self.data.record_values_for_stage(
                    **{'best_' + k: v for k, v in early_stop.best.items()})

            if stop:
                print("Early stopping triggered.")
                reason = "Early stopping triggered"
                break

            if self.maximize_sc:
                threshold_reached = sc_value >= cfg.threshold
            else:
                threshold_reached = sc_value <= cfg.threshold

            if threshold_reached:
                reason = "Stopping criteria threshold reached"
                break

        # --------------- Perform an update -------------------

        if cfg.do_train:
            # Progress messages and the memory probe only happen every 100th step.
            verbose_step = local_step % 100 == 0

            if verbose_step:
                print("Running update step {}...".format(local_step))

            update_start_time = time.time()

            _old_n_experiences = updater.n_experiences

            update_record = updater.update(cfg.batch_size)

            update_duration = time.time() - update_start_time
            update_record["train"]["duration"] = update_duration

            if verbose_step:
                print("Done update step.")

                start = time.time()
                update_record["train"]["memory_physical_mb"] = memory_usage(physical=True)
                update_record["train"]["memory_virtual_mb"] = memory_usage(physical=False)
                update_record["train"]["memory_gpu_mb"] = gpu_memory_usage()
                print("Memory check duration: {}".format(time.time() - start))

            data_to_store.extend(dict(update_record).items())

            n_experiences_delta = updater.n_experiences - _old_n_experiences
            self.n_global_experiences += n_experiences_delta

            total_train_time += update_duration
            time_per_example = total_train_time / updater.n_experiences
            time_per_update = total_train_time / updater.n_updates

            total_hooks_time += hooks_duration
            time_per_hook = total_hooks_time / updater.n_updates

        # --------------- Store data -------------------

        # Merge all records produced for the same mode during this step.
        records = defaultdict(dict)
        for mode, r in data_to_store:
            records[mode].update(r)

        self.data.store_step_data_and_summaries(
            stage_idx, local_step, global_step, updater.n_experiences, self.n_global_experiences,
            **records)

        self.data.record_values_for_stage(
            time_per_example=time_per_example,
            time_per_update=time_per_update,
            time_per_eval=time_per_eval,
            time_per_hook=time_per_hook,
            n_steps=local_step,
            n_experiences=updater.n_experiences,
        )

        self.global_step += 1

        # If `do_train` is False, we do no training and evaluate
        # exactly once, so only one iteration is required.
        if not cfg.do_train:
            reason = "`do_train` set to False"
            break

    return threshold_reached, reason
def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
             nsteps, ent_coef, vf_coef, max_grad_norm):
    """Build the acting/training policies, the loss graph and the train/save/load closures.

    Two policy instances are created in the default session: an acting one
    (`nbatch_act`, 1 step, `reuse=False`) and a training one (`nbatch_train`,
    `nsteps`, `reuse=True`). The total loss is
    `pg_loss - entropy * ent_coef + vf_loss * vf_coef`; its gradients with
    respect to the trainable variables are optionally clipped by global norm
    (`max_grad_norm`) and applied with Adam.

    The following attributes are set on the instance: `train`, `train_model`,
    `act_model`, `step`, `value`, `initial_state`, `save`, `load` and
    `loss_names`. All global variables are initialized before returning.
    """
    sess = tf.get_default_session()

    # Echo every constructor argument, one per line, in declaration order.
    for ctor_arg in (policy, ob_space, ac_space, nbatch_act, nbatch_train,
                     nsteps, ent_coef, vf_coef, max_grad_norm):
        print(ctor_arg)

    act_model = policy(sess, ob_space, ac_space, nbatch_act, 1, reuse=False)
    train_model = policy(sess, ob_space, ac_space, nbatch_train, nsteps, reuse=True)

    # Placeholders (created in a fixed order so auto-generated op names are stable).
    actions_ph = train_model.pdtype.sample_placeholder([None])
    adv_ph = tf.placeholder(tf.float32, [None])
    returns_ph = tf.placeholder(tf.float32, [None])
    old_neglogpac_ph = tf.placeholder(tf.float32, [None])
    old_vpred_ph = tf.placeholder(tf.float32, [None])
    lr_ph = tf.placeholder(tf.float32, [])
    cliprange_ph = tf.placeholder(tf.float32, [])

    neglogpac = train_model.pd.neglogp(actions_ph)
    entropy = tf.reduce_mean(train_model.pd.entropy())

    # Value loss: the larger of the unclipped and clipped squared errors.
    vpred = train_model.vf
    vpred_clipped = old_vpred_ph + tf.clip_by_value(
        train_model.vf - old_vpred_ph, - cliprange_ph, cliprange_ph)
    vf_err_unclipped = tf.square(vpred - returns_ph)
    vf_err_clipped = tf.square(vpred_clipped - returns_ph)
    vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_err_unclipped, vf_err_clipped))

    # Policy loss: the larger of the unclipped and ratio-clipped terms.
    ratio = tf.exp(old_neglogpac_ph - neglogpac)
    pg_unclipped = -adv_ph * ratio
    pg_clipped = -adv_ph * tf.clip_by_value(ratio, 1.0 - cliprange_ph, 1.0 + cliprange_ph)
    pg_loss = tf.reduce_mean(tf.maximum(pg_unclipped, pg_clipped))

    # Diagnostics returned by `train` alongside the losses.
    approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - old_neglogpac_ph))
    clipfrac = tf.reduce_mean(
        tf.to_float(tf.greater(tf.abs(ratio - 1.0), cliprange_ph)))

    loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef

    with tf.variable_scope('model'):
        params = tf.trainable_variables()
    grads = tf.gradients(loss, params)
    if max_grad_norm is not None:
        grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
    grads_and_vars = list(zip(grads, params))
    trainer = tf.train.AdamOptimizer(learning_rate=lr_ph, epsilon=1e-5)
    _train = trainer.apply_gradients(grads_and_vars)

    def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
        """Run one optimization step and return the five loss/diagnostic values."""
        raw_advs = returns - values
        # Normalize the advantages; the epsilon guards against a zero std.
        norm_advs = (raw_advs - raw_advs.mean()) / (raw_advs.std() + 1e-8)
        td_map = {
            train_model.X: obs,
            actions_ph: actions,
            adv_ph: norm_advs,
            returns_ph: returns,
            lr_ph: lr,
            cliprange_ph: cliprange,
            old_neglogpac_ph: neglogpacs,
            old_vpred_ph: values,
        }
        if states is not None:
            td_map[train_model.S] = states
            td_map[train_model.M] = masks
        fetched = sess.run(
            [pg_loss, vf_loss, entropy, approxkl, clipfrac, _train], td_map)
        # The last entry is the result of the train op; callers only get the metrics.
        return fetched[:-1]

    def save(save_path):
        """Dump the current values of all parameters to `save_path` with joblib."""
        joblib.dump(sess.run(params), save_path)

    def load(load_path):
        """Assign the parameter values stored at `load_path` to the model variables."""
        stored_values = joblib.load(load_path)
        assign_ops = [
            variable.assign(stored)
            for variable, stored in zip(params, stored_values)
        ]
        sess.run(assign_ops)
        # If you want to load weights, also save/load observation scaling inside VecNormalize

    self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']
    self.train = train
    self.train_model = train_model
    self.act_model = act_model
    self.step = act_model.step
    self.value = act_model.value
    self.initial_state = act_model.initial_state
    self.save = save
    self.load = load
    tf.global_variables_initializer().run(session=sess)  # pylint: disable=E1101
def run_apply_grads(*, grads, lr):
    """Run `apply_grads_op` in the default session, feeding the given gradients.

    `grads` is paired element-wise with `grad_placeholders` to build the feed
    dictionary. Returns whatever the session's `run` call returns.
    """
    feed_dict = dict(zip(grad_placeholders, grads))
    # NOTE(review): the keyword argument `lr` shadows any outer name `lr`, so the
    # key used here is the learning-rate value itself, not a placeholder.
    # Presumably a learning-rate placeholder was intended -- confirm against the enclosing scope.
    feed_dict[lr] = lr
    session = tf.get_default_session()
    return session.run(apply_grads_op, feed_dict=feed_dict)
def evaluate_softmax(X_data):
    """Evaluate `soft_max` in the default session with `X_data` fed to `x`."""
    return tf.get_default_session().run(soft_max, feed_dict={x: X_data})
def initialize(self, checkpoints=None, reset=False, reset_learning_rate=False,
               max_to_keep=1, keep_every_n_hours=0, sess=None, whitelist=None,
               blacklist=None, **kwargs):
    """
    Create the saver, initialize all variables, load pre-trained embeddings
    and restore parameters from checkpoints.

    :param checkpoints: list of checkpoints to load (instead of latest checkpoint)
    :param reset: don't load latest checkpoint, reset learning rate and global step
    :param reset_learning_rate: reset the learning rate to its initial value
    :param max_to_keep: keep this many latest checkpoints at all times
    :param keep_every_n_hours: and keep checkpoints every n hours
        (`None` or a non-positive value disables this)
    :param sess: session to use (defaults to the current default session)
    :param whitelist: path to a text file whose stripped lines are passed to
        `load_checkpoint` as `whitelist`
    :param blacklist: path to a text file whose stripped lines are passed to
        `load_checkpoint` as `blacklist`; 'dropout_keep_prob' is always added,
        'learning_rate' and 'global_step' depending on the reset flags
    :param kwargs: only 'variable_mapping' and 'reverse_mapping' are read and
        forwarded to `load_checkpoint`
    """
    sess = sess or tf.get_default_session()

    # Test for None first: `None <= 0` raises a TypeError on Python 3.
    if keep_every_n_hours is None or keep_every_n_hours <= 0:
        keep_every_n_hours = float('inf')

    self.saver = tf.train.Saver(
        max_to_keep=max_to_keep,
        keep_checkpoint_every_n_hours=keep_every_n_hours,
        sharded=False)

    sess.run(tf.global_variables_initializer())

    # load pre-trained embeddings
    for encoder_or_decoder, vocab in zip(self.encoders + self.decoders, self.vocabs):
        if not encoder_or_decoder.embedding_file:
            continue

        utils.log('loading embeddings from: {}'.format(
            encoder_or_decoder.embedding_file))
        embeddings = {}
        with open(encoder_or_decoder.embedding_file, encoding="utf-8") as embedding_file:
            for line in embedding_file:
                word, vector = line.split(' ', 1)
                if word in vocab.vocab:
                    embeddings[word] = np.array(
                        list(map(float, vector.split())))

        # A sample standard deviation needs at least two vectors; with fewer the
        # standardization below would divide by zero and write NaNs into the model.
        if len(embeddings) < 2:
            utils.log('not enough known words in {}, skipping'.format(
                encoder_or_decoder.embedding_file))
            continue

        # standardize (mean of 0, std of 0.01), independently for each dimension.
        # The division by (n - 1) belongs inside the square root, which is what
        # `ddof=1` computes.
        matrix = np.stack(list(embeddings.values()))
        mean = matrix.mean(axis=0)
        std = matrix.std(axis=0, ddof=1)
        for key in embeddings:
            embeddings[key] = 0.01 * (embeddings[key] - mean) / std

        # change TensorFlow variable's value
        with tf.variable_scope(tf.get_variable_scope(), reuse=True):
            embedding_var = tf.get_variable('embedding_' + encoder_or_decoder.name)
            # Evaluate in `sess`: it is not necessarily the default session.
            embedding_value = embedding_var.eval(session=sess)
            for word, i in vocab.vocab.items():
                if word in embeddings:
                    embedding_value[i] = embeddings[word]
            sess.run(embedding_var.assign(embedding_value))

    if whitelist:
        with open(whitelist, encoding="utf-8") as f:
            whitelist = [line.strip() for line in f]
    if blacklist:
        with open(blacklist, encoding="utf-8") as f:
            blacklist = [line.strip() for line in f]
    else:
        blacklist = []

    blacklist.append('dropout_keep_prob')

    if reset_learning_rate or reset:
        blacklist.append('learning_rate')
    if reset:
        blacklist.append('global_step')

    params = {
        k: kwargs.get(k)
        for k in ('variable_mapping', 'reverse_mapping')
    }

    if checkpoints and len(self.models) > 1:
        # one checkpoint per model, loaded with the prefix 'model_<i>' (1-based)
        assert len(self.models) == len(checkpoints)
        for i, checkpoint in enumerate(checkpoints, 1):
            load_checkpoint(sess, None, checkpoint, blacklist=blacklist,
                            whitelist=whitelist, prefix='model_{}'.format(i),
                            **params)
    elif checkpoints:  # load partial checkpoints
        for checkpoint in checkpoints:  # checkpoint files to load
            load_checkpoint(sess, None, checkpoint, blacklist=blacklist,
                            whitelist=whitelist, **params)
    elif not reset:
        load_checkpoint(sess, self.checkpoint_dir, blacklist=blacklist,
                        whitelist=whitelist, **params)

    utils.debug('global step: {}'.format(self.global_step.eval(session=sess)))
    utils.debug('baseline step: {}'.format(self.baseline_step.eval(session=sess)))
def reset_graph():
    """Close the default session, if there is one, then reset the default graph."""
    active_session = tf.get_default_session()
    if active_session:
        active_session.close()

    tf.reset_default_graph()
def post_update(self, feed_dict, context):
    """Advance the step counter, running `target_agent_update` once it exceeds `self.parameter`.

    While `self.steps_since_target_update` is not greater than
    `self.parameter` the counter is just incremented. Otherwise
    `self.target_agent_update` is run in the default session and the counter
    is reset to 0. `feed_dict` and `context` are not used.
    """
    if not (self.steps_since_target_update > self.parameter):
        self.steps_since_target_update += 1
        return

    tf.get_default_session().run(self.target_agent_update)
    self.steps_since_target_update = 0
def decay_learning_rate(self):
    """Run `self.learning_rate_decay` in the default session.

    Raises ValueError when there is no default session or when
    `self.learning_rate_decay` is None.
    """
    session = tf.get_default_session()
    ready = session is not None and self.learning_rate_decay is not None
    if not ready:
        raise ValueError('need session learning rate decay op')

    session.run(self.learning_rate_decay)
def post_update(self, feed_dict, context):
    """Run `self.target_agent_update` in the default session.

    `feed_dict` and `context` are not used.
    """
    session = tf.get_default_session()
    session.run(self.target_agent_update)
def get_session():
    """
    Fetch TensorFlow's current default session.

    :return: the result of `tf.get_default_session()`
    """
    current_session = tf.get_default_session()
    return current_session
def run(self):
    """Play episodes in an `EnvBreakoutWrapper` environment and train `self.a3cnet` on them.

    Episodes are played until the module-level counter `episode` reaches
    `args.episodes`. Each episode is split into sequences of at most
    `args.seq_length` steps; after every sequence the discounted utilities
    are computed and one training step is run. When an episode ends, its
    total reward is appended to the module-level `scores` and the mean policy
    loss, value loss and entropy of its training steps are appended to the
    module-level `stats` (0 when no training step was run).

    Raises AssertionError if `self.sess` cannot be made the default session,
    or if the network returns a policy that has negative entries or does not
    sum to 1.
    """
    global episode, scores, stats

    args = self.args
    sess = self.sess
    net = self.a3cnet
    seq_length = args.seq_length
    env = EnvBreakoutWrapper()

    # Per-sequence buffers, reused (overwritten) for every sequence.
    states = np.zeros((seq_length, *net.state_shape), dtype=np.float32)
    actions = np.zeros((seq_length, ), dtype=np.int32)
    rewards = np.zeros((seq_length, ), dtype=np.float32)
    discounted_utilities = np.zeros((seq_length, ), dtype=np.float32)

    with sess.as_default():
        assert tf.get_default_session() is sess, 'session mismatch'

        while episode < args.episodes:
            total_rewards = 0
            episode_completed = False
            eps_idx = 0
            lives = 5

            # Loss statistics are gathered over the whole episode. Creating the
            # lists per sequence would leave at most one element for the mean below.
            policy_loss_list, value_loss_list, entropy_list = [], [], []

            # reset environment
            env.reset()
            env.next_life()
            lstm_state = net.zero_lstm_state()

            while not episode_completed:
                # save the LSTM state at the beginning of a sequence
                lstm_state_seq_start = lstm_state

                # simulate a sequence of steps
                for seq_idx in range(seq_length):
                    # determine policy
                    feed_dict = {
                        net.state: env.state[np.newaxis, :],
                        net.ph_lstm_state: lstm_state,
                    }
                    res = sess.run(
                        [net.policy, net.value, net.final_lstm_state],
                        feed_dict=feed_dict)
                    policy = res[0][0][0]  # [][batch][time]
                    value = res[1][0][0][0]  # [][batch][time][]
                    lstm_state = res[2]

                    assert all(policy >= 0), "policy not >0 {}".format(policy)
                    # The tolerance was 1e4, which made this check impossible to fail.
                    assert abs(sum(policy) - 1) < 1e-4, "sum policy not 1 {}".format(
                        sum(policy))

                    # select action according to policy
                    action = np.random.choice(net.action_size, p=policy)

                    # execute action, get reward and next state
                    reward, done, info = env.step(action)

                    # save state, action, reward
                    states[seq_idx, :] = env.prev_state
                    actions[seq_idx] = action
                    rewards[seq_idx] = reward

                    # check if dead
                    if lives > info['ale.lives']:
                        # has died
                        lives = info['ale.lives']
                        # NOTE(review): this -1 is assigned after `rewards[seq_idx]` was
                        # stored, so it only affects `total_rewards`, not the training
                        # targets -- confirm that this is intended.
                        reward = -1
                        env.next_life()

                    # update sum_of_rewards
                    total_rewards += reward

                    # end episode if done
                    if done:
                        break

                seq_end = seq_idx + 1
                eps_idx += seq_idx + 1

                # calculate discounted utilities
                if not done:
                    # bootstrap from the value estimate of the state after the sequence
                    feed_dict = {
                        net.state: env.state[np.newaxis, :],
                        net.ph_lstm_state: lstm_state,
                    }
                    res = sess.run(net.value, feed_dict=feed_dict)
                    # NOTE(review): only the first axis is indexed here, whereas the
                    # in-loop value above uses [0][0][0] -- verify the resulting shape.
                    value = res[0]
                    running_sum = value
                elif seq_idx > 0:
                    seq_end = seq_idx
                    running_sum = value
                else:
                    episode_completed = True
                    break  # nothing to train on

                for reverse_idx in range(seq_idx, -1, -1):
                    running_sum = args.gamma * running_sum + rewards[reverse_idx]
                    discounted_utilities[reverse_idx] = running_sum

                # train
                feed_dict = {
                    net.state: states[:seq_end, ...],
                    net.actions: actions[:seq_end],
                    net.value_target: discounted_utilities[:seq_end, ...],
                    net.lr: args.lr,
                    net.coeff_p: args.coeff_p,
                    net.coeff_v: args.coeff_v,
                    net.coeff_h: args.coeff_h,
                    net.ph_lstm_state: lstm_state_seq_start,
                }
                # Only the tensors that are used below are fetched.
                loss, _, policy_loss, value_loss, entropy = sess.run(
                    [net.loss, net.train_step, net.policy_loss,
                     net.value_loss, net.entropy],
                    feed_dict=feed_dict)

                policy_loss_list.append(policy_loss)
                value_loss_list.append(value_loss)
                entropy_list.append(entropy)

                if (eps_idx // seq_length) % 32 == 0:
                    print(
                        'policy_loss={:2f}, value_loss={:2f}, entropy={:2f}, loss={:2f} '
                        .format(policy_loss, value_loss, entropy, loss),
                        end='')
                    print(actions[:seq_end])

                # determine whether episode completed
                if done or eps_idx > args.max_episode_length:
                    episode_completed = True

            # save sum_of_rewards
            scores.append(total_rewards)
            stats['policy_loss'].append(
                np.mean(policy_loss_list) if policy_loss_list else 0)
            stats['value_loss'].append(
                np.mean(value_loss_list) if value_loss_list else 0)
            stats['entropy'].append(
                np.mean(entropy_list) if entropy_list else 0)

            episode += 1