Example #1
    def train_step(self, cases, weights, caching):
        if len(cases) != len(weights):
            raise ValueError('cases and weights must have the same length.')

        if len(cases) == 0:
            #logging.warn('Training on zero cases.')
            print >> sys.stderr, " WARNING: Zero cases   \033[F"
            # still increment the step
            sess = tf.get_default_session()
            sess.run(self._increment_step)
        elif not self._max_batch_size or len(cases) <= self._max_batch_size:
            print >> sys.stderr, " Updating ({} cases)   \033[F".format(len(cases))
            self.compute(self._take_step, cases, weights, caching)
        else:
            print >> sys.stderr, " Updating ({} cases)   \033[F".format(len(cases))
            assert not caching
            grads = None
            slices = range(0, len(cases), self._max_batch_size)
            for i in verboserate(slices, desc='Computing gradients ({} cases)'.format(len(cases))):
                cases_slice = cases[i:i + self._max_batch_size]
                weights_slice = weights[i:i + self._max_batch_size]
                grads_slice = self.compute(self._grad_tensors,
                                           cases_slice, weights_slice, False)
                if grads is None:
                    grads = grads_slice
                else:
                    for j in xrange(len(self._grad_tensors)):
                        grads[j] += grads_slice[j]
            sess = tf.get_default_session()
            feed_dict = dict(zip(self._combined_grad_placeholders, grads))
            sess.run(self._apply_gradients, feed_dict)
            sess.run(self._increment_step)
Example #2
def main(args):
    with tf.Graph().as_default():
        with tf.Session() as sess:
            # Load the model metagraph and checkpoint
            print('Model directory: %s' % args.model_dir)
            meta_file, ckpt_file = facenet.get_model_filenames(os.path.expanduser(args.model_dir))
            
            print('Metagraph file: %s' % meta_file)
            print('Checkpoint file: %s' % ckpt_file)

            model_dir_exp = os.path.expanduser(args.model_dir)
            saver = tf.train.import_meta_graph(os.path.join(model_dir_exp, meta_file), clear_devices=True)
            tf.get_default_session().run(tf.global_variables_initializer())
            tf.get_default_session().run(tf.local_variables_initializer())
            saver.restore(tf.get_default_session(), os.path.join(model_dir_exp, ckpt_file))
            
            # Retrieve the protobuf graph definition and fix the batch norm nodes
            input_graph_def = sess.graph.as_graph_def()
            
            # Freeze the graph def
            output_graph_def = freeze_graph_def(sess, input_graph_def, 'embeddings')

        # Serialize and dump the output graph to the filesystem
        with tf.gfile.GFile(args.output_file, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print("%d ops in the final graph: %s" % (len(output_graph_def.node), args.output_file))
Example #3
def get_session():
  """Get the globally defined TensorFlow session.

  If the session is not already defined, then the function will create
  a global session.

  Returns:
    _ED_SESSION: tf.InteractiveSession.
  """
  global _ED_SESSION
  if tf.get_default_session() is None:
    _ED_SESSION = tf.InteractiveSession()
  else:
    _ED_SESSION = tf.get_default_session()

  save_stderr = sys.stderr
  try:
    import os
    sys.stderr = open(os.devnull, 'w')  # suppress keras import
    from keras import backend as K
    sys.stderr = save_stderr
    have_keras = True
  except ImportError:
    sys.stderr = save_stderr
    have_keras = False
  if have_keras:
    K.set_session(_ED_SESSION)

  return _ED_SESSION
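
A minimal usage sketch for the `get_session` helper above (illustrative, not part of the original source): because `tf.InteractiveSession` installs itself as the default session, tensors built afterwards can be run through the returned session or evaluated directly.

import tensorflow as tf

sess = get_session()                   # creates a tf.InteractiveSession on first call
x = tf.constant([1.0, 2.0, 3.0])
print(sess.run(tf.reduce_sum(x)))      # 6.0
print(tf.reduce_sum(x).eval())         # equivalent, since sess is the default session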
Example #4
 def train(self, obs, actions, gaes, rewards, v_preds_next):
     tf.get_default_session().run(self.train_op, feed_dict={self.Policy.obs: obs,
                                                            self.Old_Policy.obs: obs,
                                                            self.actions: actions,
                                                            self.rewards: rewards,
                                                            self.v_preds_next: v_preds_next,
                                                            self.gaes: gaes})
Example #5
def get_session():
    """Returns the TF session to be used by the backend.

    If a default TensorFlow session is available, we will return it.

    Else, we will return the global Keras session.

    If no global Keras session exists at this point:
    we will create a new global session.

    Note that you can manually set the global session
    via `K.set_session(sess)`.
    """
    global _SESSION
    if tf.get_default_session() is not None:
        return tf.get_default_session()
    if _SESSION is None:
        if not os.environ.get("OMP_NUM_THREADS"):
            _SESSION = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        else:
            nb_thread = int(os.environ.get("OMP_NUM_THREADS"))
            _SESSION = tf.Session(
                config=tf.ConfigProto(intra_op_parallelism_threads=nb_thread, allow_soft_placement=True)
            )
    return _SESSION
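
A hedged usage sketch for the backend helper above: `OMP_NUM_THREADS` is only read when the global session is first created, and later calls reuse the cached session (assuming no other default session is active).

import os
os.environ["OMP_NUM_THREADS"] = "4"   # read once, when the global session is built
sess = get_session()                  # tf.Session with intra_op_parallelism_threads=4
assert get_session() is sess          # cached in _SESSION and reused afterwards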
Example #6
 def fit(self, xs, ys):
     if self.normalize_inputs:
         # recompute normalizing constants for inputs
         new_mean = np.mean(xs, axis=0, keepdims=True)
         new_std = np.std(xs, axis=0, keepdims=True) + 1e-8
         tf.get_default_session().run(tf.group(
             tf.assign(self.x_mean_var, new_mean),
             tf.assign(self.x_std_var, new_std),
         ))
     if self.use_trust_region and self.first_optimized:
         old_prob = self.f_prob(xs)
         inputs = [xs, ys, old_prob]
         optimizer = self.tr_optimizer
     else:
         inputs = [xs, ys]
         optimizer = self.optimizer
     loss_before = optimizer.loss(inputs)
     if self.name:
         prefix = self.name + "_"
     else:
         prefix = ""
     logger.record_tabular(prefix + 'LossBefore', loss_before)
     optimizer.optimize(inputs)
     loss_after = optimizer.loss(inputs)
     logger.record_tabular(prefix + 'LossAfter', loss_after)
     logger.record_tabular(prefix + 'dLoss', loss_before - loss_after)
     self.first_optimized = True
Example #7
    def restore_trainer(self, filename):
        '''
        Load the training progress (including the model)

        Args:
            filename: path from which the model and training progress will be restored
        '''

        self.modelsaver.restore(tf.get_default_session(), filename)
        self.saver.restore(tf.get_default_session(), filename + '_trainvars')
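
A short, hypothetical usage sketch (the `trainer` object and checkpoint path are illustrative assumptions): because both savers restore into `tf.get_default_session()`, the call has to happen inside a session context.

with tf.Session().as_default():
    trainer.restore_trainer('checkpoints/model-10000')  # also loads 'checkpoints/model-10000_trainvars'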
Example #8
 def test_logging_trainable(self):
   with tf.Graph().as_default() as g, self.test_session(g):
     var = tf.Variable(tf.constant(42.0), name='foo')
     var.initializer.run()
     cof = tf.constant(1.0)
     loss = tf.sub(tf.mul(var, cof), tf.constant(1.0))
     train_step = tf.train.GradientDescentOptimizer(0.5).minimize(loss)
     tf.get_default_session().run(train_step)
     self._run_monitor(learn.monitors.LoggingTrainable('foo'))
     self.assertRegexpMatches(str(self.logged_message), var.name)
Example #9
 def test_lookup_activations(self):
     x = tf.constant(-1.0, shape=[2, 2])
     with self.test_session():
         activations = ['relu','prelu','selu','crelu']
         for activation in activations:
             activation = ops.lookup(activation)(x)
             
             tf.get_default_session().run(tf.global_variables_initializer())
         
             self.assertNotEqual(x.eval()[0][0], activation.eval()[0][0])
Example #10
    def fit(self, paths, policy=None, batch_size=32, max_itrs=100, logger=None, lr=1e-3, **kwargs):
        #self._compute_path_probs(paths, insert=True)
        self.eval_expert_probs(paths, policy, insert=True)
        self.eval_expert_probs(self.expert_trajs, policy, insert=True)
        obs, acts, path_probs = self.extract_paths(paths, keys=('observations', 'actions', 'a_logprobs'))
        expert_obs, expert_acts, expert_probs = self.extract_paths(self.expert_trajs, keys=('observations', 'actions', 'a_logprobs'))

        # Train discriminator
        for it in TrainingIterator(max_itrs, heartbeat=5):
            obs_batch, act_batch, lprobs_batch = \
                self.sample_batch(obs, acts, path_probs, batch_size=batch_size)

            expert_obs_batch, expert_act_batch, expert_lprobs_batch = \
                self.sample_batch(expert_obs, expert_acts, expert_probs, batch_size=batch_size)

            labels = np.zeros((batch_size*2, 1))
            labels[batch_size:] = 1.0
            obs_batch = np.concatenate([obs_batch, expert_obs_batch], axis=0)
            act_batch = np.concatenate([act_batch, expert_act_batch], axis=0)
            lprobs_batch = np.expand_dims(np.concatenate([lprobs_batch, expert_lprobs_batch], axis=0), axis=1).astype(np.float32)

            loss, _ = tf.get_default_session().run([self.loss, self.step], feed_dict={
                self.act_t: act_batch,
                self.obs_t: obs_batch,
                self.labels: labels,
                self.lprobs: lprobs_batch,
                self.lr: lr
            })

            it.record('loss', loss)
            if it.heartbeat:
                print(it.itr_message())
                mean_loss = it.pop_mean('loss')
                print('\tLoss:%f' % mean_loss)
        if logger:
            energy, logZ, dtau = tf.get_default_session().run([self.energy, self.value_fn, self.d_tau],
                                                        feed_dict={self.act_t: acts, self.obs_t: obs,
                                                                   self.lprobs: np.expand_dims(path_probs, axis=1)})
            logger.record_tabular('IRLLogZ', np.mean(logZ))
            logger.record_tabular('IRLAverageEnergy', np.mean(energy))
            logger.record_tabular('IRLAverageLogPtau', np.mean(-energy-logZ))
            logger.record_tabular('IRLAverageLogQtau', np.mean(path_probs))
            logger.record_tabular('IRLMedianLogQtau', np.median(path_probs))
            logger.record_tabular('IRLAverageDtau', np.mean(dtau))

            energy, logZ, dtau = tf.get_default_session().run([self.energy, self.value_fn, self.d_tau],
                                                        feed_dict={self.act_t: expert_acts, self.obs_t: expert_obs,
                                                                   self.lprobs: np.expand_dims(expert_probs, axis=1)})
            logger.record_tabular('IRLAverageExpertEnergy', np.mean(energy))
            logger.record_tabular('IRLAverageExpertLogPtau', np.mean(-energy-logZ))
            logger.record_tabular('IRLAverageExpertLogQtau', np.mean(expert_probs))
            logger.record_tabular('IRLMedianExpertLogQtau', np.median(expert_probs))
            logger.record_tabular('IRLAverageExpertDtau', np.mean(dtau))
        return mean_loss
Example #11
def get_session():
    global _session

    # Build/retrieve the session if it doesn't exist
    if _session is None:
        if tf.get_default_session() is not None:
            _session = tf.get_default_session()
        else:
            _session = tf.Session()

    return _session
Example #12
  def test_preserves_existing_session(self):
    with tf.Session() as sess:
      op = tf.reduce_sum([2, 2])
      self.assertIs(sess, tf.get_default_session())

      result = self._square(123)
      self.assertEqual(123 * 123, result)

      self.assertIs(sess, tf.get_default_session())
      number_of_lights = sess.run(op)
      self.assertEqual(number_of_lights, 4)
Example #13
    def zero_model_gradient_accumulators(cls) -> None:
        zero_operations = [
            tf.get_default_graph().get_operation_by_name(
                '{}/zero_model_gradient_accumulators'.format(
                    variable_scope_name))
            for variable_scope_name in [
                'empty_statistic',
                'move_rate',
                'game_state_as_update',
                'updated_statistic',
                'updated_update',
                'cost_function']]

        tf.get_default_session().run(zero_operations)
Example #14
def predict_with_three_models_on_hashtags(hashtag_dir, hashtag_emb_dir, trial_hashtag_names, labels_exist=True):
    # eval_hashtag_names = get_hashtag_file_names(SEMEVAL_HUMOR_EVAL_DIR)
    emb_char_predictions = []
    emb_predictions = []
    char_predictions = []
    per_hashtag_first_tweet_ids = []
    per_hashtag_second_tweet_ids = []
    K.clear_session()
    K.set_session(tf.get_default_session())
    hp1 = humor_predictor.HumorPredictor(EMB_CHAR_HUMOR_MODEL_DIR, use_emb_model=True, use_char_model=True)
    for trial_hashtag_name in trial_hashtag_names:
        np_predictions, np_output_prob, np_labels, first_tweet_ids, second_tweet_ids = hp1(hashtag_dir,
                                                                                          trial_hashtag_name)
        emb_char_predictions.append(np_output_prob)
        per_hashtag_first_tweet_ids.append(first_tweet_ids)
        per_hashtag_second_tweet_ids.append(second_tweet_ids)

    K.clear_session()
    K.set_session(tf.get_default_session())
    hp2 = humor_predictor.HumorPredictor(EMB_HUMOR_MODEL_DIR, use_emb_model=True, use_char_model=False)
    for trial_hashtag_name in trial_hashtag_names:
        np_predictions, np_output_prob, np_labels, first_tweet_ids, second_tweet_ids = hp2(hashtag_dir,
                                                                                          trial_hashtag_name)
        emb_predictions.append(np_output_prob)

    K.clear_session()
    K.set_session(tf.get_default_session())
    hp3 = humor_predictor.HumorPredictor(CHAR_HUMOR_MODEL_DIR, use_emb_model=False, use_char_model=True)

    for trial_hashtag_name in trial_hashtag_names:
        np_predictions, np_output_prob, np_labels, first_tweet_ids, second_tweet_ids = hp3(hashtag_dir,
                                                                                       trial_hashtag_name)
        char_predictions.append(np_output_prob)

    all_predictions = []
    for i in range(len(trial_hashtag_names)):
        hashtag_all_predictions = np.concatenate(
            [np.reshape(emb_char_predictions[i], [-1, 1]), np.reshape(emb_predictions[i], [-1, 1]), np.reshape(char_predictions[i], [-1, 1])], axis=1)
        all_predictions.append(hashtag_all_predictions)

    hashtag_labels = None
    if labels_exist:
        hashtag_labels = []
        for hashtag_name in trial_hashtag_names:
            print 'Loading label for hashtag %s' % hashtag_name
            np_first_tweets, np_second_tweets, np_labels, first_tweet_ids, second_tweet_ids, np_hashtag = \
                load_hashtag_data(hashtag_emb_dir, hashtag_name)
            hashtag_labels.append(np_labels)

    return all_predictions, hashtag_labels, per_hashtag_first_tweet_ids, per_hashtag_second_tweet_ids
Example #15
def main(args):
    with tf.Graph().as_default():
        with tf.Session() as sess:
            # Load the model metagraph and checkpoint
            print('Model directory: %s' % args.model_dir)
            meta_file, ckpt_file = facenet.get_model_filenames(os.path.expanduser(args.model_dir))
            
            print('Metagraph file: %s' % meta_file)
            print('Checkpoint file: %s' % ckpt_file)

            model_dir_exp = os.path.expanduser(args.model_dir)
            saver = tf.train.import_meta_graph(os.path.join(model_dir_exp, meta_file), clear_devices=True)
            tf.get_default_session().run(tf.global_variables_initializer())
            tf.get_default_session().run(tf.local_variables_initializer())
            saver.restore(tf.get_default_session(), os.path.join(model_dir_exp, ckpt_file))
            
            # Retrieve the protobuf graph definition and fix the batch norm nodes
            gd = sess.graph.as_graph_def()
            for node in gd.node:            
                if node.op == 'RefSwitch':
                    node.op = 'Switch'
                    for index in xrange(len(node.input)):
                        if 'moving_' in node.input[index]:
                            node.input[index] = node.input[index] + '/read'
                elif node.op == 'AssignSub':
                    node.op = 'Sub'
                    if 'use_locking' in node.attr: del node.attr['use_locking']
                elif node.op == 'AssignAdd':
                    node.op = 'Add'
                    if 'use_locking' in node.attr: del node.attr['use_locking']
            
            # Get the list of important nodes
            output_node_names = 'embeddings'
            whitelist_names = []
            for node in gd.node:
                if node.name.startswith('InceptionResnetV1') or node.name.startswith('embeddings') or node.name.startswith('phase_train'):
                    print(node.name)
                    whitelist_names.append(node.name)

            # Replace all the variables in the graph with constants of the same values
            output_graph_def = graph_util.convert_variables_to_constants(
                sess, gd, output_node_names.split(","),
                variable_names_whitelist=whitelist_names)

        # Serialize and dump the output graph to the filesystem
        with tf.gfile.GFile(args.output_file, 'wb') as f:
            f.write(output_graph_def.SerializeToString())
        print("%d ops in the final graph." % len(output_graph_def.node))
Example #16
 def start(self):
     if self._need_default_sess:
         assert tf.get_default_session() is not None, \
             "Not session is bind to predictors, " \
             "MultiThreadAsyncPredictor.start() has to be called under a default session!"
     for t in self.threads:
         t.start()
Example #17
def traced_run(fetches):
  """Runs fetches, dumps timeline files in current directory."""

  global timeline_counter
  run_metadata = tf.RunMetadata()

  config = load_config()
  log_fn = "%s-%s-%s"%(config.task_type, config.task_id, timeline_counter)
  sess = tf.get_default_session()
  
  root = os.getcwd()+"/data"
  os.system('mkdir -p '+root)
  
  from tensorflow.python.client import timeline

  # `run_options` is assumed to be defined at module level (e.g. tf.RunOptions with FULL_TRACE).
  results = sess.run(fetches,
                     options=run_options,
                     run_metadata=run_metadata)
  tl = timeline.Timeline(step_stats=run_metadata.step_stats)
  ctf = tl.generate_chrome_trace_format(show_memory=True,
                                          show_dataflow=False)
  open(root+"/timeline_%s.json"%(log_fn,), "w").write(ctf)
  open(root+"/stepstats_%s.pbtxt"%(log_fn,), "w").write(str(
    run_metadata.step_stats))
  timeline_counter+=1
  return results
Example #18
  def applyOptimizer(self, opt, steps=5, is_sparse=False):
    if is_sparse:
      var0 = tf.Variable([[0.0], [0.0]])
      var1 = tf.Variable([[0.0], [0.0]])
      grads0 = tf.IndexedSlices(tf.constant([0.1], shape=[1, 1]),
                                tf.constant([0]),
                                tf.constant([2, 1]))
      grads1 = tf.IndexedSlices(tf.constant([0.02], shape=[1, 1]),
                                tf.constant([1]),
                                tf.constant([2, 1]))
    else:
      var0 = tf.Variable([0.0, 0.0])
      var1 = tf.Variable([0.0, 0.0])
      grads0 = tf.constant([0.1, 0.2])
      grads1 = tf.constant([0.01, 0.02])

    update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
    tf.initialize_all_variables().run()

    sess = tf.get_default_session()
    v0_val, v1_val = sess.run([var0, var1])
    if is_sparse:
      self.assertAllClose([[0.0], [0.0]], v0_val)
      self.assertAllClose([[0.0], [0.0]], v1_val)
    else:
      self.assertAllClose([0.0, 0.0], v0_val)
      self.assertAllClose([0.0, 0.0], v1_val)

    # Run Ftrl for a few steps
    for _ in range(steps):
      update.run()

    v0_val, v1_val = sess.run([var0, var1])
    return v0_val, v1_val
Example #19
    def __init__(self, batch=64, use_cpus=False):

        image_shape = [batch, 224, 224, 3]
        labels_shape = [batch]

        # Synthetic image should be within [0, 255].
        images = tf.truncated_normal(
            image_shape,
            dtype=tf.float32,
            mean=127,
            stddev=60,
            name='synthetic_images')

        # Minor hack to avoid H2D copy when using synthetic data
        inputs = tf.contrib.framework.local_variable(
            images, name='gpu_cached_images')
        labels = tf.random_uniform(
            labels_shape,
            minval=0,
            maxval=999,
            dtype=tf.int32,
            name='synthetic_labels')

        model = model_config.get_model_config("resnet101", MockDataset())
        logits, aux = model.build_network(
            inputs, data_format=use_cpus and "NHWC" or "NCHW")
        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits, labels=labels)

        # Implement model interface
        self.loss = tf.reduce_mean(loss, name='xentropy-loss')
        self.optimizer = tf.train.GradientDescentOptimizer(1e-6)

        self.variables = ray_tf_utils.TensorFlowVariables(
            self.loss, tf.get_default_session())
Example #20
def get_global_step_value():
    """
    Returns:
        int: global_step value in current graph and session"""
    return tf.train.global_step(
        tf.get_default_session(),
        get_global_step_var())
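
A hedged sketch of calling the helper above; `get_global_step_var()` is the companion function referenced in the code and is assumed to create the global-step variable on first use.

with tf.Session().as_default() as sess:
    get_global_step_var()                        # assumed to build the global-step variable
    sess.run(tf.global_variables_initializer())
    print(get_global_step_value())               # 0 before any training step has run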
Example #21
    def _trigger_epoch(self):
        try:
            if not self.meta_graph_written:
                self.saver.export_meta_graph(
                        os.path.join(logger.LOG_DIR,
                            'graph-{}.meta'.format(logger.get_time_str())),
                        collection_list=self.graph.get_all_collection_keys())
                self.meta_graph_written = True
            self.saver.save(
                tf.get_default_session(),
                self.path,
                global_step=self.global_step,
                write_meta_graph=False)

            # create a symbolic link for the latest model
            latest = self.saver.last_checkpoints[-1]
            basename = os.path.basename(latest)
            linkname = os.path.join(os.path.dirname(latest), 'latest')
            try:
                os.unlink(linkname)
            except OSError:
                pass
            os.symlink(basename, linkname)
        except (OSError, IOError):   # disk error sometimes.. just ignore it
            logger.exception("Exception in ModelSaver.trigger_epoch!")
Example #22
 def test_outputs(self, model, inputs, output_tensors, outputs):
     """Test for correct output."""
     sess = tf.get_default_session()
     guarantee_initialized_variables(sess)
     args, kwargs = inputs
     test_outputs = model.compute(output_tensors, *args, **kwargs)
     assert_array_collections_equal(outputs, test_outputs, decimal=4)
Example #23
 def get_param_shapes(self, **tags):
     tag_tuple = tuple(sorted(list(tags.items()), key=lambda x: x[0]))
     if tag_tuple not in self._cached_param_shapes:
         params = self.get_params(**tags)
         param_values = tf.get_default_session().run(params)
         self._cached_param_shapes[tag_tuple] = [val.shape for val in param_values]
     return self._cached_param_shapes[tag_tuple]
Example #24
    def __init__(self):
        # Import data
        error = None
        for _ in range(10):
            try:
                self.mnist = input_data.read_data_sets(
                    "/tmp/tensorflow/mnist/input_data", one_hot=True)
                error = None
                break
            except Exception as e:
                error = e
                time.sleep(5)
        if error:
            raise ValueError("Failed to import data", error)

        # Set seed and build layers
        tf.set_random_seed(0)

        self.x = tf.placeholder(tf.float32, [None, 784], name="x")
        self.y_ = tf.placeholder(tf.float32, [None, 10], name="y_")
        y_conv, self.keep_prob = deepnn(self.x)

        # Need to define loss and optimizer attributes
        self.loss = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(
                labels=self.y_, logits=y_conv))
        self.optimizer = tf.train.AdamOptimizer(1e-4)
        self.variables = ray_tf_utils.TensorFlowVariables(
            self.loss, tf.get_default_session())

        # For evaluating test accuracy
        correct_prediction = tf.equal(
            tf.argmax(y_conv, 1), tf.argmax(self.y_, 1))
        self.accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
Example #25
	def eval_value(self, state):
		sess = tf.get_default_session()
		state_batch = state[newaxis, :, :]
		value_batch = sess.run(self.value_tensor,
			feed_dict={self.state_input: state_batch})
		value = value_batch[0, 0]
		return value
Example #26
 def callback(self,data):
     with session.as_default():
         assert tf.get_default_session() is session
         input_image = np.flipud(data.data.reshape(image_size,image_size).astype(np.float32)).reshape(-1,image_size,image_size,1)
         out_class, out_angle = test_model(input_image)
         pre_class = tf.nn.softmax(out_class)
         pre_angle = tf.nn.softmax(out_angle).eval()
         angle = np.sum(np.multiply(pre_angle, angles_list))/np.sum(pre_angle)
         pre_dict = dict(zip(list(range(num_labels)),pre_class.eval()[0]))
         sorted_pre_dict = sorted(pre_dict.items(), key=operator.itemgetter(1))
         name1 = value2name[sorted_pre_dict[-1][0]]
         name1 = name2string[name1]
         value1 = str(sorted_pre_dict[-1][1])
         name2 = value2name[sorted_pre_dict[-2][0]]
         name2 = name2string[name2]
         value2 = str(sorted_pre_dict[-2][1])
         pre = PredictionMSG()
         pre.name1, pre.value1, pre.name2, pre.value2, pre.angle = name1, float(value1), name2, float(value2), angle
         self.pub1.publish(pre)
         image = ((input_image.reshape(image_size,image_size) + 0.65)*255).astype(np.uint8)
         pt1x = int(self.pt1x * math.cos(math.radians(angle)) + self.pt1y * -math.sin(math.radians(angle))) + 40
         pt1y = int(self.pt1x * math.sin(math.radians(angle)) + self.pt1y * math.cos(math.radians(angle))) + 40
         pt2x = int(self.pt2x * math.cos(math.radians(angle)) + self.pt2y * -math.sin(math.radians(angle))) + 40
         pt2y = int(self.pt2x * math.sin(math.radians(angle)) + self.pt2y * math.cos(math.radians(angle))) + 40
         cv2.line(image,(pt1x,pt1y),(pt2x,pt2y),255,2)
         ros_image = self.bridge.cv2_to_imgmsg(image, encoding="mono8")
         self.pub2.publish(ros_image)
         sys.stdout.write(".")
         sys.stdout.flush()
Example #27
 def get_grad(self, obs, actions, gaes, rewards, v_preds_next):
     return tf.get_default_session().run(self.gradients, feed_dict={self.Policy.obs: obs,
                                                                    self.Old_Policy.obs: obs,
                                                                    self.actions: actions,
                                                                    self.rewards: rewards,
                                                                    self.v_preds_next: v_preds_next,
                                                                    self.gaes: gaes})
Example #28
def validate_probtype(probtype, pdparam):
    N = 100000
    # Check to see if mean negative log likelihood == differential entropy
    Mval = np.repeat(pdparam[None, :], N, axis=0)
    M = probtype.param_placeholder([N])
    X = probtype.sample_placeholder([N])
    pd = probtype.pdfromflat(M)
    calcloglik = U.function([X, M], pd.logp(X))
    calcent = U.function([M], pd.entropy())
    Xval = tf.get_default_session().run(pd.sample(), feed_dict={M:Mval})
    logliks = calcloglik(Xval, Mval)
    entval_ll = - logliks.mean() #pylint: disable=E1101
    entval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101
    entval = calcent(Mval).mean() #pylint: disable=E1101
    assert np.abs(entval - entval_ll) < 3 * entval_ll_stderr # within 3 sigmas

    # Check to see if kldiv[p,q] = - ent[p] - E_p[log q]
    M2 = probtype.param_placeholder([N])
    pd2 = probtype.pdfromflat(M2)
    q = pdparam + np.random.randn(pdparam.size) * 0.1
    Mval2 = np.repeat(q[None, :], N, axis=0)
    calckl = U.function([M, M2], pd.kl(pd2))
    klval = calckl(Mval, Mval2).mean() #pylint: disable=E1101
    logliks = calcloglik(Xval, Mval2)
    klval_ll = - entval - logliks.mean() #pylint: disable=E1101
    klval_ll_stderr = logliks.std() / np.sqrt(N) #pylint: disable=E1101
    assert np.abs(klval - klval_ll) < 3 * klval_ll_stderr # within 3 sigmas
    print('ok on', probtype, pdparam)
Example #29
    def fit(self, trajs, batch_size=32, max_itrs=100, **kwargs):
        obs, acts = self.extract_paths(trajs)
        expert_obs, expert_acts = self.expert_trajs

        # Train discriminator
        for it in TrainingIterator(max_itrs, heartbeat=5):
            obs_batch, act_batch = self.sample_batch(obs, acts, batch_size=batch_size)
            expert_obs_batch, expert_act_batch = self.sample_batch(expert_obs, expert_acts, batch_size=batch_size)
            labels = np.zeros((batch_size*2, 1))
            labels[batch_size:] = 1.0
            obs_batch = np.concatenate([obs_batch, expert_obs_batch], axis=0)
            act_batch = np.concatenate([act_batch, expert_act_batch], axis=0)

            loss, _ = tf.get_default_session().run([self.loss, self.step], feed_dict={
                self.act_t: act_batch,
                self.obs_t: obs_batch,
                self.labels: labels,
                self.lr: 1e-3
            })

            it.record('loss', loss)
            if it.heartbeat:
                print(it.itr_message())
                mean_loss = it.pop_mean('loss')
                print('\tLoss:%f' % mean_loss)
        return mean_loss
Example #30
 def fit(self, xs, ys):
     sess = tf.get_default_session()
     if self._normalize_inputs:
         # recompute normalizing constants for inputs
         sess.run([
             tf.assign(self._x_mean_var, np.mean(xs, axis=0, keepdims=True)),
             tf.assign(self._x_std_var, np.std(xs, axis=0, keepdims=True) + 1e-8),
         ])
     if self._normalize_outputs:
         # recompute normalizing constants for outputs
         sess.run([
             tf.assign(self._y_mean_var, np.mean(ys, axis=0, keepdims=True)),
             tf.assign(self._y_std_var, np.std(ys, axis=0, keepdims=True) + 1e-8),
         ])
     if self._use_trust_region:
         old_means, old_log_stds = self._f_pdists(xs)
         inputs = [xs, ys, old_means, old_log_stds]
     else:
         inputs = [xs, ys]
     loss_before = self._optimizer.loss(inputs)
     if self._name:
         prefix = self._name + "_"
     else:
         prefix = ""
     logger.record_tabular(prefix + 'LossBefore', loss_before)
     self._optimizer.optimize(inputs)
     loss_after = self._optimizer.loss(inputs)
     logger.record_tabular(prefix + 'LossAfter', loss_after)
     if self._use_trust_region:
         logger.record_tabular(prefix + 'MeanKL', self._optimizer.constraint_val(inputs))
     logger.record_tabular(prefix + 'dLoss', loss_before - loss_after)
Example #31
File: test.py Project: liziniu/RLX
def main():
    FLAGS.set_seed()
    FLAGS.freeze()

    env = create_env(FLAGS.env.id,
                     seed=FLAGS.seed,
                     log_dir=FLAGS.log_dir,
                     absorbing_state=FLAGS.GAIL.learn_absorbing,
                     rescale_action=FLAGS.env.rescale_action)
    env_eval = create_env(FLAGS.env.id,
                          seed=FLAGS.seed + 1000,
                          log_dir=FLAGS.log_dir,
                          absorbing_state=FLAGS.GAIL.learn_absorbing,
                          rescale_action=FLAGS.env.rescale_action)
    dim_state = env.observation_space.shape[0]
    dim_action = env.action_space.shape[0]

    normalizers = Normalizers(dim_action=dim_action, dim_state=dim_state)
    policy = GaussianMLPPolicy(dim_state,
                               dim_action,
                               FLAGS.TRPO.policy_hidden_sizes,
                               normalizer=normalizers.state)
    vfn = MLPVFunction(dim_state, FLAGS.TRPO.vf_hidden_sizes,
                       normalizers.state)
    algo = TRPO(vfn=vfn,
                policy=policy,
                dim_state=dim_state,
                dim_action=dim_action,
                **FLAGS.TRPO.algo.as_dict())

    subsampling_rate = env.max_episode_steps // FLAGS.GAIL.trajectory_size
    discriminator = Discriminator(dim_state,
                                  dim_action,
                                  normalizers=normalizers,
                                  subsampling_rate=subsampling_rate,
                                  **FLAGS.GAIL.discriminator.as_dict())

    tf.get_default_session().run(tf.global_variables_initializer())

    # load expert dataset
    expert_dataset = load_expert_dataset(FLAGS.GAIL.buf_load)
    expert_reward = expert_dataset.get_average_reward()
    logger.info('Expert Reward %f', expert_reward)
    if FLAGS.GAIL.learn_absorbing:
        expert_dataset.add_absorbing_states(env)
    expert_dataset.subsample_trajectories(FLAGS.GAIL.traj_limit)
    logger.info('Original dataset size {}'.format(len(expert_dataset)))
    expert_dataset.subsample_transitions(subsampling_rate)
    logger.info('Subsampled dataset size {}'.format(len(expert_dataset)))

    saver = nn.ModuleDict({
        'policy': policy,
        'vfn': vfn,
        'normalizers': normalizers
    })
    runner = Runner(env,
                    max_steps=env.max_episode_steps,
                    gamma=FLAGS.TRPO.gamma,
                    lambda_=FLAGS.TRPO.lambda_,
                    add_absorbing_state=FLAGS.GAIL.learn_absorbing)
    print(saver)

    max_ent_coef = FLAGS.TRPO.algo.ent_coef
    for t in range(0, FLAGS.GAIL.total_timesteps,
                   FLAGS.TRPO.rollout_samples * FLAGS.GAIL.g_iters):
        time_st = time.time()
        if t % FLAGS.GAIL.eval_freq == 0:
            eval_returns, eval_lengths = evaluate(policy, env_eval)
            log_kvs(prefix='Evaluate',
                    kvs=dict(iter=t,
                             episode=dict(returns=np.mean(eval_returns),
                                          lengths=int(np.mean(eval_lengths)))))

        # Generator
        generator_dataset = None
        for n_update in range(FLAGS.GAIL.g_iters):
            data, ep_infos = runner.run(policy, FLAGS.TRPO.rollout_samples)
            if FLAGS.TRPO.normalization:
                normalizers.state.update(data.state)
                normalizers.action.update(data.action)
                normalizers.diff.update(data.next_state - data.state)
            if t == 0 and n_update == 0 and not FLAGS.GAIL.learn_absorbing:
                data_ = data.copy()
                data_ = data_.reshape(
                    [FLAGS.TRPO.rollout_samples // env.n_envs, env.n_envs])
                for e in range(env.n_envs):
                    samples = data_[:, e]
                    masks = 1 - (samples.done | samples.timeout)[...,
                                                                 np.newaxis]
                    masks = masks[:-1]
                    assert np.allclose(samples.state[1:] * masks,
                                       samples.next_state[:-1] * masks)
            t += FLAGS.TRPO.rollout_samples
            data.reward = discriminator.get_reward(data.state, data.action)
            advantages, values = runner.compute_advantage(vfn, data)
            train_info = algo.train(max_ent_coef, data, advantages, values)
            fps = int(FLAGS.TRPO.rollout_samples / (time.time() - time_st))
            train_info['reward'] = np.mean(data.reward)
            train_info['fps'] = fps
            log_kvs(prefix='TRPO', kvs=dict(iter=t, **train_info))

            generator_dataset = data

        # Discriminator
        for n_update in range(FLAGS.GAIL.d_iters):
            batch_size = FLAGS.GAIL.d_batch_size
            d_train_infos = dict()
            for generator_subset in generator_dataset.iterator(batch_size):
                expert_batch = expert_dataset.sample(batch_size)
                expert_state = np.stack([t.obs for t in expert_batch])
                expert_action = np.stack([t.action for t in expert_batch])
                expert_mask = np.stack([
                    t.mask for t in expert_batch
                ]).flatten() if FLAGS.GAIL.learn_absorbing else None
                train_info = discriminator.train(
                    expert_state,
                    expert_action,
                    generator_subset.state,
                    generator_subset.action,
                    expert_mask,
                )
                for k, v in train_info.items():
                    if k not in d_train_infos:
                        d_train_infos[k] = []
                    d_train_infos[k].append(v)
            d_train_infos = {k: np.mean(v) for k, v in d_train_infos.items()}
            if n_update == FLAGS.GAIL.d_iters - 1:
                log_kvs(prefix='Discriminator',
                        kvs=dict(iter=t, **d_train_infos))

        if t % FLAGS.TRPO.save_freq == 0:
            np.save('{}/stage-{}'.format(FLAGS.log_dir, t), saver.state_dict())
            np.save('{}/final'.format(FLAGS.log_dir), saver.state_dict())
    np.save('{}/final'.format(FLAGS.log_dir), saver.state_dict())
Example #32
def init_tf(config_dict=dict()):
    if tf.get_default_session() is None:
        tf.set_random_seed(np.random.randint(1 << 31))
        create_session(config_dict, force_as_default=True)
Example #33
def load_model(model_uri, tf_sess=None):
    """
    Load an MLflow model that contains the TensorFlow flavor from the specified path.

    *With TensorFlow version <2.0.0, this method must be called within a TensorFlow graph context.*

    :param model_uri: The location, in URI format, of the MLflow model. For example:

                      - ``/Users/me/path/to/local/model``
                      - ``relative/path/to/local/model``
                      - ``s3://my_bucket/path/to/model``
                      - ``runs:/<mlflow_run_id>/run-relative/path/to/model``
                      - ``models:/<model_name>/<model_version>``
                      - ``models:/<model_name>/<stage>``

                      For more information about supported URI schemes, see
                      `Referencing Artifacts <https://www.mlflow.org/docs/latest/concepts.html#
                      artifact-locations>`_.


    :param tf_sess: The TensorFlow session in which to load the model. If using TensorFlow
                    version >= 2.0.0, this argument is ignored. If using TensorFlow <2.0.0, if no
                    session is passed to this function, MLflow will attempt to load the model using
                    the default TensorFlow session.  If no default session is available, then the
                    function raises an exception.
    :return: For TensorFlow < 2.0.0, a TensorFlow signature definition of type:
             ``tensorflow.core.protobuf.meta_graph_pb2.SignatureDef``. This defines the input and
             output tensors for model inference.
             For TensorFlow >= 2.0.0, A callable graph (tf.function) that takes inputs and
             returns inferences.

    >>> import mlflow.tensorflow
    >>> import tensorflow as tf
    >>> tf_graph = tf.Graph()
    >>> tf_sess = tf.Session(graph=tf_graph)
    >>> with tf_graph.as_default():
    >>>     signature_definition = mlflow.tensorflow.load_model(model_uri="model_uri",
    >>>                            tf_sess=tf_sess)
    >>>     input_tensors = [tf_graph.get_tensor_by_name(input_signature.name)
    >>>                      for _, input_signature in signature_def.inputs.items()]
    >>>     output_tensors = [tf_graph.get_tensor_by_name(output_signature.name)
    >>>                       for _, output_signature in signature_def.outputs.items()]
    """

    if LooseVersion(tensorflow.__version__) < LooseVersion('2.0.0'):
        if not tf_sess:
            tf_sess = tensorflow.get_default_session()
            if not tf_sess:
                raise MlflowException("No TensorFlow session found while calling load_model()." +
                                      "You can set the default Tensorflow session before calling" +
                                      " load_model via `session.as_default()`, or directly pass " +
                                      "a session in which to load the model via the tf_sess " +
                                      "argument.")

    else:
        if tf_sess:
            warnings.warn("A TensorFlow session was passed into load_model, but the " +
                          "currently used version is TF 2.0 where sessions are deprecated. " +
                          "The tf_sess argument will be ignored.", FutureWarning)
    local_model_path = _download_artifact_from_uri(artifact_uri=model_uri)
    tf_saved_model_dir, tf_meta_graph_tags, tf_signature_def_key =\
        _get_and_parse_flavor_configuration(model_path=local_model_path)
    return _load_tensorflow_saved_model(tf_saved_model_dir=tf_saved_model_dir,
                                        tf_meta_graph_tags=tf_meta_graph_tags,
                                        tf_signature_def_key=tf_signature_def_key,
                                        tf_sess=tf_sess)
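
A hedged sketch of the TF 1.x default-session path described in the docstring (no `tf_sess` argument is passed; the model URI is a placeholder):

import mlflow.tensorflow
import tensorflow as tf

graph = tf.Graph()
with graph.as_default(), tf.Session(graph=graph).as_default():
    signature_def = mlflow.tensorflow.load_model(model_uri="model_uri")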
Example #34
 def set_discount_d_loss_factor(self, value=.1):
     sess = tf.get_default_session()
     if sess is None:
         raise ValueError("in the with tf.Session() as sess block")
     sess.run(self.set_discount_d_loss_fac_op, feed_dict={self.D_loss_factor_ph: value})
Example #35
 def run_optimize(*, feed_dict):
     assert lr in feed_dict, 'feed_dict needs to contain the learning rate.'
     return tf.get_default_session().run(optimize_op, feed_dict)
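
A hypothetical call sketch: `lr` is the learning-rate placeholder captured by the closure above, while the other placeholders and batch arrays are illustrative assumptions.

feed = {lr: 3e-4, obs_ph: batch_obs, actions_ph: batch_actions}  # obs_ph/actions_ph assumed
result = run_optimize(feed_dict=feed)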
Example #36
 def start(self):
     self._sess = tf.get_default_session()
     super().start()
Example #37
    def __init__(self,
                 observation_space,
                 action_space,
                 config,
                 existing_inputs=None):
        """
        Arguments:
            observation_space: Environment observation space specification.
            action_space: Environment action space specification.
            config (dict): Configuration values for PPO graph.
            existing_inputs (list): Optional list of tuples that specify the
                placeholders upon which the graph should be built.
        """
        config = dict(ray.rllib.agents.ppo.ppo.DEFAULT_CONFIG, **config)
        self.sess = tf.get_default_session()
        self.action_space = action_space
        self.config = config
        self.kl_coeff_val = self.config["kl_coeff"]
        self.kl_target = self.config["kl_target"]
        dist_cls, logit_dim = ModelCatalog.get_action_dist(
            action_space, self.config["model"])

        if existing_inputs:
            obs_ph, value_targets_ph, adv_ph, act_ph, \
                logits_ph, vf_preds_ph, prev_actions_ph, prev_rewards_ph = \
                existing_inputs[:8]
            existing_state_in = existing_inputs[8:-1]
            existing_seq_lens = existing_inputs[-1]
        else:
            obs_ph = tf.placeholder(tf.float32,
                                    name="obs",
                                    shape=(None, ) + observation_space.shape)
            adv_ph = tf.placeholder(tf.float32,
                                    name="advantages",
                                    shape=(None, ))
            act_ph = ModelCatalog.get_action_placeholder(action_space)
            logits_ph = tf.placeholder(tf.float32,
                                       name="logits",
                                       shape=(None, logit_dim))
            vf_preds_ph = tf.placeholder(tf.float32,
                                         name="vf_preds",
                                         shape=(None, ))
            value_targets_ph = tf.placeholder(tf.float32,
                                              name="value_targets",
                                              shape=(None, ))
            prev_actions_ph = ModelCatalog.get_action_placeholder(action_space)
            prev_rewards_ph = tf.placeholder(tf.float32, [None],
                                             name="prev_reward")
            existing_state_in = None
            existing_seq_lens = None
        self.observations = obs_ph
        self.prev_actions = prev_actions_ph
        self.prev_rewards = prev_rewards_ph

        self.loss_in = [
            (SampleBatch.CUR_OBS, obs_ph),
            (Postprocessing.VALUE_TARGETS, value_targets_ph),
            (Postprocessing.ADVANTAGES, adv_ph),
            (SampleBatch.ACTIONS, act_ph),
            (BEHAVIOUR_LOGITS, logits_ph),
            (SampleBatch.VF_PREDS, vf_preds_ph),
            (SampleBatch.PREV_ACTIONS, prev_actions_ph),
            (SampleBatch.PREV_REWARDS, prev_rewards_ph),
        ]
        self.model = ModelCatalog.get_model(
            {
                "obs": obs_ph,
                "prev_actions": prev_actions_ph,
                "prev_rewards": prev_rewards_ph,
                "is_training": self._get_is_training_placeholder(),
            },
            observation_space,
            action_space,
            logit_dim,
            self.config["model"],
            state_in=existing_state_in,
            seq_lens=existing_seq_lens)

        # KL Coefficient
        self.kl_coeff = tf.get_variable(initializer=tf.constant_initializer(
            self.kl_coeff_val),
                                        name="kl_coeff",
                                        shape=(),
                                        trainable=False,
                                        dtype=tf.float32)

        self.logits = self.model.outputs
        curr_action_dist = dist_cls(self.logits)
        self.sampler = curr_action_dist.sample()
        if self.config["use_gae"]:
            if self.config["vf_share_layers"]:
                self.value_function = self.model.value_function()
            else:
                vf_config = self.config["model"].copy()
                # Do not split the last layer of the value function into
                # mean parameters and standard deviation parameters and
                # do not make the standard deviations free variables.
                vf_config["free_log_std"] = False
                if vf_config["use_lstm"]:
                    vf_config["use_lstm"] = False
                    logger.warning(
                        "It is not recommended to use a LSTM model with "
                        "vf_share_layers=False (consider setting it to True). "
                        "If you want to not share layers, you can implement "
                        "a custom LSTM model that overrides the "
                        "value_function() method.")
                with tf.variable_scope("value_function"):
                    self.value_function = ModelCatalog.get_model(
                        {
                            "obs": obs_ph,
                            "prev_actions": prev_actions_ph,
                            "prev_rewards": prev_rewards_ph,
                            "is_training": self._get_is_training_placeholder(),
                        }, observation_space, action_space, 1,
                        vf_config).outputs
                    self.value_function = tf.reshape(self.value_function, [-1])
        else:
            self.value_function = tf.zeros(shape=tf.shape(obs_ph)[:1])

        if self.model.state_in:
            max_seq_len = tf.reduce_max(self.model.seq_lens)
            mask = tf.sequence_mask(self.model.seq_lens, max_seq_len)
            mask = tf.reshape(mask, [-1])
        else:
            mask = tf.ones_like(adv_ph, dtype=tf.bool)

        self.loss_obj = PPOLoss(action_space,
                                value_targets_ph,
                                adv_ph,
                                act_ph,
                                logits_ph,
                                vf_preds_ph,
                                curr_action_dist,
                                self.value_function,
                                self.kl_coeff,
                                mask,
                                entropy_coeff=self.config["entropy_coeff"],
                                clip_param=self.config["clip_param"],
                                vf_clip_param=self.config["vf_clip_param"],
                                vf_loss_coeff=self.config["vf_loss_coeff"],
                                use_gae=self.config["use_gae"])

        LearningRateSchedule.__init__(self, self.config["lr"],
                                      self.config["lr_schedule"])
        TFPolicyGraph.__init__(
            self,
            observation_space,
            action_space,
            self.sess,
            obs_input=obs_ph,
            action_sampler=self.sampler,
            action_prob=curr_action_dist.sampled_action_prob(),
            loss=self.loss_obj.loss,
            model=self.model,
            loss_inputs=self.loss_in,
            state_inputs=self.model.state_in,
            state_outputs=self.model.state_out,
            prev_action_input=prev_actions_ph,
            prev_reward_input=prev_rewards_ph,
            seq_lens=self.model.seq_lens,
            max_seq_len=config["model"]["max_seq_len"])

        self.sess.run(tf.global_variables_initializer())
        self.explained_variance = explained_variance(value_targets_ph,
                                                     self.value_function)
        self.stats_fetches = {
            "cur_kl_coeff": self.kl_coeff,
            "cur_lr": tf.cast(self.cur_lr, tf.float64),
            "total_loss": self.loss_obj.loss,
            "policy_loss": self.loss_obj.mean_policy_loss,
            "vf_loss": self.loss_obj.mean_vf_loss,
            "vf_explained_var": self.explained_variance,
            "kl": self.loss_obj.mean_kl,
            "entropy": self.loss_obj.mean_entropy
        }
Example #38
 def __call__(self, theta):
     tf.get_default_session().run(self.op, feed_dict={self.theta: theta})
Example #39
 def save(self):
     save_checkpoint(tf.get_default_session(), self.saver,
                     self.checkpoint_dir, self.global_step)
Example #40
 def have_data_for_dequeue(self):
   return self.have_more(tf.get_default_session())
Example #41
def learn(env,
          eval_env,
          policy_func,
          reward_giver,
          expert_dataset,
          rank,
          pretrained,
          pretrained_weight,
          *,
          g_step,
          d_step,
          entcoeff,
          save_per_iter,
          ckpt_dir,
          log_dir,
          timesteps_per_batch,
          evaluation_freq,
          task_name,
          gamma,
          lam,
          max_kl,
          cg_iters,
          cg_damping=1e-2,
          vf_stepsize=3e-4,
          d_stepsize=3e-4,
          vf_iters=3,
          num_epochs=1000,
          callback=None):

    # configure log
    # logger.configure(dir=log_dir)

    nworkers = MPI.COMM_WORLD.Get_size()
    rank = MPI.COMM_WORLD.Get_rank()
    np.set_printoptions(precision=3)
    # Setup losses and stuff
    # ----------------------------------------
    ob_space = env.observation_space
    ac_space = env.action_space
    pi = policy_func("pi",
                     ob_space,
                     ac_space,
                     reuse=(pretrained_weight is not None))
    oldpi = policy_func("oldpi", ob_space, ac_space)
    atarg = tf.placeholder(
        dtype=tf.float32,
        shape=[None])  # Target advantage function (if applicable)
    ret = tf.placeholder(dtype=tf.float32, shape=[None])  # Empirical return

    ob = U.get_placeholder_cached(name="ob")
    ac = pi.pdtype.sample_placeholder([None])

    kloldnew = oldpi.pd.kl(pi.pd)
    ent = pi.pd.entropy()
    meankl = tf.reduce_mean(kloldnew)
    meanent = tf.reduce_mean(ent)
    entbonus = entcoeff * meanent

    vferr = tf.reduce_mean(tf.square(pi.vpred - ret))

    ratio = tf.exp(pi.pd.logp(ac) -
                   oldpi.pd.logp(ac))  # advantage * pnew / pold
    surrgain = tf.reduce_mean(ratio * atarg)

    optimgain = surrgain + entbonus
    losses = [optimgain, meankl, entbonus, surrgain, meanent]
    loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"]

    dist = meankl

    all_var_list = pi.get_trainable_variables()
    var_list = [
        v for v in all_var_list
        if v.name.startswith("pi/pol") or v.name.startswith("pi/logstd")
    ]
    vf_var_list = [v for v in all_var_list if v.name.startswith("pi/vff")]
    # assert len(var_list) == len(vf_var_list) + 1
    d_adam = MpiAdam(reward_giver.get_trainable_variables())
    vfadam = MpiAdam(vf_var_list)

    get_flat = U.GetFlat(var_list)
    set_from_flat = U.SetFromFlat(var_list)
    klgrads = tf.gradients(dist, var_list)
    flat_tangent = tf.placeholder(dtype=tf.float32,
                                  shape=[None],
                                  name="flat_tan")
    shapes = [var.get_shape().as_list() for var in var_list]
    start = 0
    tangents = []
    for shape in shapes:
        sz = U.intprod(shape)
        tangents.append(tf.reshape(flat_tangent[start:start + sz], shape))
        start += sz
    gvp = tf.add_n([
        tf.reduce_sum(g * tangent)
        for (g, tangent) in zipsame(klgrads, tangents)
    ])  # pylint: disable=E1111
    fvp = U.flatgrad(gvp, var_list)

    assign_old_eq_new = U.function(
        [], [],
        updates=[
            tf.assign(oldv, newv)
            for (oldv,
                 newv) in zipsame(oldpi.get_variables(), pi.get_variables())
        ])
    compute_losses = U.function([ob, ac, atarg], losses)
    compute_lossandgrad = U.function([ob, ac, atarg], losses +
                                     [U.flatgrad(optimgain, var_list)])
    compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp)
    compute_vflossandgrad = U.function([ob, ret],
                                       U.flatgrad(vferr, vf_var_list))

    @contextmanager
    def timed(msg):
        if rank == 0:
            print(colorize(msg, color='magenta'))
            tstart = time.time()
            yield
            print(
                colorize("done in %.3f seconds" % (time.time() - tstart),
                         color='magenta'))
        else:
            yield

    def allmean(x):
        assert isinstance(x, np.ndarray)
        out = np.empty_like(x)
        MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM)
        out /= nworkers
        return out

    U.initialize()
    th_init = get_flat()
    MPI.COMM_WORLD.Bcast(th_init, root=0)
    set_from_flat(th_init)
    d_adam.sync()
    vfadam.sync()
    if rank == 0:
        print("Init param sum", th_init.sum(), flush=True)

    # Prepare for rollouts
    # ----------------------------------------
    seg_gen = traj_segment_generator(pi,
                                     env,
                                     reward_giver,
                                     timesteps_per_batch,
                                     stochastic=True)

    episodes_so_far = 0
    timesteps_so_far = 0
    iters_so_far = 0
    tstart = time.time()
    lenbuffer = deque(maxlen=40)  # rolling buffer for episode lengths
    rewbuffer = deque(maxlen=40)  # rolling buffer for episode rewards
    true_rewbuffer = deque(maxlen=40)

    g_loss_stats = stats(loss_names)
    d_loss_stats = stats(reward_giver.loss_name)
    ep_stats = stats(["True_rewards", "Rewards", "Episode_length"])
    # if provide pretrained weight
    if pretrained_weight is not None:
        U.load_state(pretrained_weight, var_list=pi.get_variables())

    for epoch in range(num_epochs):
        if callback: callback(locals(), globals())

        # Save model
        if rank == 0 and iters_so_far % save_per_iter == 0 and ckpt_dir is not None:
            fname = os.path.join(ckpt_dir, task_name)
            os.makedirs(os.path.dirname(fname), exist_ok=True)
            saver = tf.train.Saver()
            saver.save(tf.get_default_session(), fname)

        logger.log("********** Epoch %i ************" % epoch)

        def fisher_vector_product(p):
            return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p

        # ------------------ Update G ------------------
        logger.log("Optimizing Policy...")
        total_obs = []
        total_acs = []
        total_ep_rets = []
        total_ep_lens = []
        total_ep_true_rets = []
        for g_step_num in range(g_step):
            with timed("sampling"):
                seg = seg_gen.__next__()

            # Add seg into total_seg
            total_obs.append(seg["ob"])
            total_acs.append(seg["ac"])
            total_ep_rets.append(seg["ep_rets"])
            total_ep_lens.append(seg["ep_lens"])
            total_ep_true_rets.append(seg["ep_true_rets"])

            add_vtarg_and_adv(seg, gamma, lam)
            # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets))
            ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[
                "tdlamret"]
            vpredbefore = seg[
                "vpred"]  # predicted value function before update
            atarg = (atarg - atarg.mean()) / atarg.std(
            )  # standardized advantage function estimate

            if hasattr(pi, "ob_rms"):
                pi.ob_rms.update(ob)  # update running mean/std for policy

            args = seg["ob"], seg["ac"], atarg
            fvpargs = [arr[::5] for arr in args]

            assign_old_eq_new(
            )  # set old parameter values to new parameter values
            with timed("computegrad"):
                *lossbefore, g = compute_lossandgrad(*args)
            lossbefore = allmean(np.array(lossbefore))
            g = allmean(g)
            if np.allclose(g, 0):
                logger.log("Got zero gradient. not updating")
            else:
                with timed("cg"):
                    stepdir = cg(fisher_vector_product,
                                 g,
                                 cg_iters=cg_iters,
                                 verbose=rank == 0)
                assert np.isfinite(stepdir).all()
                shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
                lm = np.sqrt(shs / max_kl)
                # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g))
                fullstep = stepdir / lm
                expectedimprove = g.dot(fullstep)
                surrbefore = lossbefore[0]
                stepsize = 1.0
                thbefore = get_flat()
                for _ in range(10):
                    thnew = thbefore + fullstep * stepsize
                    set_from_flat(thnew)
                    meanlosses = surr, kl, *_ = allmean(
                        np.array(compute_losses(*args)))
                    improve = surr - surrbefore
                    logger.log("Expected: %.3f Actual: %.3f" %
                               (expectedimprove, improve))
                    if not np.isfinite(meanlosses).all():
                        logger.log("Got non-finite value of losses -- bad!")
                    elif kl > max_kl * 1.5:
                        logger.log("violated KL constraint. shrinking step.")
                    elif improve < 0:
                        logger.log("surrogate didn't improve. shrinking step.")
                    else:
                        logger.log("Stepsize OK!")
                        break
                    stepsize *= .5
                else:
                    logger.log("couldn't compute a good step")
                    set_from_flat(thbefore)
                if nworkers > 1 and iters_so_far % 20 == 0:
                    paramsums = MPI.COMM_WORLD.allgather(
                        (thnew.sum(),
                         vfadam.getflat().sum()))  # list of tuples
                    assert all(
                        np.allclose(ps, paramsums[0]) for ps in paramsums[1:])
            with timed("vf"):
                for _ in range(vf_iters):
                    for (mbob, mbret) in dataset.iterbatches(
                        (seg["ob"], seg["tdlamret"]),
                            include_final_partial_batch=False,
                            batch_size=128):
                        if hasattr(pi, "obs_rms"):
                            pi.obs_rms.update(
                                mbob)  # update running mean/std for policy
                        g = allmean(compute_vflossandgrad(mbob, mbret))
                        vfadam.update(g, vf_stepsize)

            # Evaluate current policy
            if (g_step * epoch + g_step_num) % evaluation_freq == 0:
                evaluate_policy(pi, reward_giver, eval_env,
                                g_step * epoch + g_step_num,
                                timesteps_per_batch, tstart)

        #  ------------------ Update D ------------------
        logger.log("Optimizing Discriminator...")
        total_obs = np.vstack(total_obs)
        total_acs = np.vstack(total_acs)
        total_ep_rets = np.concatenate(total_ep_rets)
        total_ep_lens = np.concatenate(total_ep_lens)
        total_ep_true_rets = np.concatenate(total_ep_true_rets)

        logger.log(fmt_row(13, reward_giver.loss_name))
        ob_expert, ac_expert = expert_dataset.get_next_batch(len(total_obs))
        batch_size = len(total_obs) // d_step
        d_losses = [
        ]  # list of tuples, each of which gives the loss for a minibatch
        for ob_batch, ac_batch in dataset.iterbatches(
            (total_obs, total_acs),
                include_final_partial_batch=False,
                batch_size=batch_size):
            ob_expert, ac_expert = expert_dataset.get_next_batch(len(ob_batch))
            # Update running mean/std for reward_giver
            if hasattr(reward_giver, "obs_rms"):
                reward_giver.obs_rms.update(
                    np.concatenate((ob_batch, ob_expert), 0))
            *newlosses, g = reward_giver.lossandgrad(ob_batch, ac_batch,
                                                     ob_expert, ac_expert)
            d_adam.update(allmean(g), d_stepsize)
            d_losses.append(newlosses)
        logger.log(fmt_row(13, np.mean(d_losses, axis=0)))
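The natural-gradient step above solves F·x = g with conjugate gradient, using compute_fvp as a matrix-free Fisher-vector product. Below is a minimal NumPy sketch of that solve; the explicit matrix A is only a stand-in for the Fisher operator, which the real code never forms.

import numpy as np

def conjugate_gradient(mvp, b, iters=10, tol=1e-10):
    """Solve A x = b given only the matrix-vector product mvp(v) = A v."""
    x = np.zeros_like(b)
    r = b.copy()                      # residual (x starts at zero)
    p = r.copy()                      # search direction
    rdotr = r.dot(r)
    for _ in range(iters):
        Ap = mvp(p)
        alpha = rdotr / p.dot(Ap)
        x += alpha * p
        r -= alpha * Ap
        new_rdotr = r.dot(r)
        if new_rdotr < tol:
            break
        p = r + (new_rdotr / rdotr) * p
        rdotr = new_rdotr
    return x

A = np.array([[4.0, 1.0], [1.0, 3.0]])   # stand-in SPD "Fisher matrix"
g = np.array([1.0, 2.0])
stepdir = conjugate_gradient(lambda v: A.dot(v), g)

The resulting step direction is then rescaled by sqrt(shs / max_kl) and shrunk by the backtracking line search in the loop above, so the accepted update approximately respects the KL trust region.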
Example #42
0
 def one_more_enqueue_is_enough(self):
   tf_session = tf.get_default_session()
   return tf_session.run(self.one_more_enqueue_is_enough_op)
Example #43
0
def get_session(config=None):
    """Get default session or create one with a given config"""
    sess = tf.get_default_session()
    if sess is None:
        sess = make_session(config=config, make_default=True)
    return sess
Example #44
0
 def __call__(self):
     return tf.get_default_session().run(self.op)
Example #45
0
def deep_q_learning(sess,
                    env,
                    q_estimator,
                    target_estimator,
                    state_processor,
                    num_episodes,
                    experiment_dir,
                    replay_memory_size=500000,
                    replay_memory_init_size=50000,
                    update_target_estimator_every=10000,
                    discount_factor=0.99,
                    epsilon_start=1.0,
                    epsilon_end=0.1,
                    epsilon_decay_steps=500000,
                    batch_size=32,
                    record_video_every=50):
    """
    Q-Learning algorithm for off-policy TD control using Function Approximation.
    Finds the optimal greedy policy while following an epsilon-greedy policy.

    Args:
        sess: Tensorflow Session object
        env: OpenAI environment
        q_estimator: Estimator object used for the q values
        target_estimator: Estimator object used for the targets
        state_processor: A StateProcessor object
        num_episodes: Number of episodes to run for
        experiment_dir: Directory to save Tensorflow summaries in
        replay_memory_size: Size of the replay memory
        replay_memory_init_size: Number of random experiences to sample when initializing
          the replay memory.
        update_target_estimator_every: Copy parameters from the Q estimator to the 
          target estimator every N steps
        discount_factor: Gamma discount factor
        epsilon_start: Chance to sample a random action when taking an action.
          Epsilon is decayed over time and this is the start value
        epsilon_end: The final minimum value of epsilon after decaying is done
        epsilon_decay_steps: Number of steps to decay epsilon over
        batch_size: Size of batches to sample from the replay memory
        record_video_every: Record a video every N episodes

    Returns:
        An EpisodeStats object with two numpy arrays for episode_lengths and episode_rewards.
    """

    Transition = namedtuple(
        "Transition", ["state", "action", "reward", "next_state", "done"])

    # The replay memory
    replay_memory = []

    # Keeps track of useful statistics
    stats = plotting.EpisodeStats(episode_lengths=np.zeros(num_episodes),
                                  episode_rewards=np.zeros(num_episodes))

    # Create directories for checkpoints and summaries
    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    checkpoint_path = os.path.join(checkpoint_dir, "model")
    monitor_path = os.path.join(experiment_dir, "monitor")

    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    if not os.path.exists(monitor_path):
        os.makedirs(monitor_path)

    saver = tf.train.Saver()
    # Load a previous checkpoint if we find one
    latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
    if latest_checkpoint:
        print("Loading model checkpoint {}...\n".format(latest_checkpoint))
        saver.restore(sess, latest_checkpoint)

    total_t = sess.run(tf.contrib.framework.get_global_step())

    # The epsilon decay schedule
    epsilons = np.linspace(epsilon_start, epsilon_end, epsilon_decay_steps)

    # The policy we're following
    policy = make_epsilon_greedy_policy(q_estimator, len(VALID_ACTIONS))

    # Populate the replay memory with initial experience
    print("Populating replay memory...")
    state = env.reset()
    state = state_processor.process(sess, state)
    state = np.stack([state] * 4, axis=2)
    for i in range(replay_memory_init_size):
        action_probs = policy(sess, state,
                              epsilons[min(total_t, epsilon_decay_steps - 1)])
        action = np.random.choice(np.arange(len(action_probs)), p=action_probs)
        next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
        next_state = state_processor.process(sess, next_state)
        next_state = np.append(state[:, :, 1:],
                               np.expand_dims(next_state, 2),
                               axis=2)
        replay_memory.append(
            Transition(state, action, reward, next_state, done))
        if done:
            state = env.reset()
            state = state_processor.process(sess, state)
            state = np.stack([state] * 4, axis=2)
        else:
            state = next_state

    # Record videos
    # Use the gym env Monitor wrapper
    env = Monitor(env,
                  directory=monitor_path,
                  resume=True,
                  video_callable=lambda count: count % record_video_every == 0)

    for i_episode in range(num_episodes):

        # Save the current checkpoint
        saver.save(tf.get_default_session(), checkpoint_path)

        # Reset the environment
        state = env.reset()
        state = state_processor.process(sess, state)
        state = np.stack([state] * 4, axis=2)
        loss = None

        # One step in the environment
        for t in itertools.count():

            # Epsilon for this time step
            epsilon = epsilons[min(total_t, epsilon_decay_steps - 1)]

            # Add epsilon to Tensorboard
            episode_summary = tf.Summary()
            episode_summary.value.add(simple_value=epsilon, tag="epsilon")
            q_estimator.summary_writer.add_summary(episode_summary, total_t)

            # Maybe update the target estimator
            if total_t % update_target_estimator_every == 0:
                copy_model_parameters(sess, q_estimator, target_estimator)
                print("\nCopied model parameters to target network.")

            # Print out which step we're on, useful for debugging.
            print("\rStep {} ({}) @ Episode {}/{}, loss: {}".format(
                t, total_t, i_episode + 1, num_episodes, loss),
                  end="")
            sys.stdout.flush()

            # Take a step
            action_probs = policy(sess, state, epsilon)
            action = np.random.choice(np.arange(len(action_probs)),
                                      p=action_probs)
            next_state, reward, done, _ = env.step(VALID_ACTIONS[action])
            next_state = state_processor.process(sess, next_state)
            next_state = np.append(state[:, :, 1:],
                                   np.expand_dims(next_state, 2),
                                   axis=2)

            # If our replay memory is full, pop the first element
            if len(replay_memory) == replay_memory_size:
                replay_memory.pop(0)

            # Save transition to replay memory
            replay_memory.append(
                Transition(state, action, reward, next_state, done))

            # Update statistics
            stats.episode_rewards[i_episode] += reward
            stats.episode_lengths[i_episode] = t

            # Sample a minibatch from the replay memory
            samples = random.sample(replay_memory, batch_size)
            states_batch, action_batch, reward_batch, next_states_batch, done_batch = map(
                np.array, zip(*samples))

            # Calculate q values and targets (Double DQN)
            q_values_next = q_estimator.predict(sess, next_states_batch)
            best_actions = np.argmax(q_values_next, axis=1)
            q_values_next_target = target_estimator.predict(
                sess, next_states_batch)
            targets_batch = reward_batch + np.invert(done_batch).astype(np.float32) * \
                discount_factor * q_values_next_target[np.arange(batch_size), best_actions]

            # Perform gradient descent update
            states_batch = np.array(states_batch)
            loss = q_estimator.update(sess, states_batch, action_batch,
                                      targets_batch)

            if done:
                break

            state = next_state
            total_t += 1

        # Add summaries to tensorboard
        episode_summary = tf.Summary()
        episode_summary.value.add(
            simple_value=stats.episode_rewards[i_episode],
            node_name="episode_reward",
            tag="episode_reward")
        episode_summary.value.add(
            simple_value=stats.episode_lengths[i_episode],
            node_name="episode_length",
            tag="episode_length")
        q_estimator.summary_writer.add_summary(episode_summary, total_t)
        q_estimator.summary_writer.flush()

        yield total_t, plotting.EpisodeStats(
            episode_lengths=stats.episode_lengths[:i_episode + 1],
            episode_rewards=stats.episode_rewards[:i_episode + 1])

    env.monitor.close()
    return stats
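The target computation inside the loop above is the Double-DQN rule: the online q_estimator picks the greedy action and the target_estimator evaluates it. A standalone NumPy restatement with made-up numbers:

import numpy as np

discount_factor = 0.99
reward_batch = np.array([1.0, 0.0])
done_batch = np.array([False, True])
q_values_next = np.array([[0.2, 0.7], [0.5, 0.1]])         # online network
q_values_next_target = np.array([[0.3, 0.6], [0.4, 0.2]])  # target network

best_actions = np.argmax(q_values_next, axis=1)            # [1, 0]
targets_batch = reward_batch + np.invert(done_batch).astype(np.float32) * \
    discount_factor * q_values_next_target[np.arange(2), best_actions]
# -> [1.0 + 0.99 * 0.6, 0.0] = [1.594, 0.0]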
Example #46
0
import tensorflow as tf

with tf.Session() as sess:
    print("1111111111111111111111111111111")
    print(tf.get_default_session())

sess2 = tf.Session()
print("2222222222222222222222222222222222")
print(tf.get_default_session())

sess3 = tf.Session()
with sess3:
    print("33333333333333333333333333333333")
    print(tf.get_default_session())
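As the example illustrates, tf.get_default_session() only returns a session while one is installed as the default: inside the first with block and inside "with sess3:" it returns that session, while after a with block exits, and for sess2 (created but never installed), it returns None. A closely related pattern, not shown above, is Session.as_default(), which installs the default without closing the session when the block exits; a minimal sketch:

import tensorflow as tf

sess = tf.Session()
with sess.as_default():
    assert tf.get_default_session() is sess   # default inside the block
assert tf.get_default_session() is None       # no longer the default ...
sess.close()                                  # ... but still open until closed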
Example #47
0
def run(*args, **kwargs): # Run the specified ops in the default session.
    return tf.get_default_session().run(*args, **kwargs)
Example #48
0
def build_model(hps, kind="train", datasets=None):
    """Builds a model from either random initialization, or saved parameters.

  Args:
    hps: The hyper parameters for the model.
    kind: (optional) The kind of model to build.  Training vs inference require
      different graphs.
    datasets: The datasets structure (see top of lfads.py).

  Returns:
    an LFADS model.
  """

    build_kind = kind
    if build_kind == "write_model_params":
        build_kind = "train"
    with tf.variable_scope("LFADS", reuse=None):
        model = LFADS(hps, kind=build_kind, datasets=datasets)

    if not os.path.exists(hps.lfads_save_dir):
        print("Save directory %s does not exist, creating it." %
              hps.lfads_save_dir)
        os.makedirs(hps.lfads_save_dir)

    cp_pb_ln = hps.checkpoint_pb_load_name
    cp_pb_ln = 'checkpoint' if cp_pb_ln == "" else cp_pb_ln
    if cp_pb_ln == 'checkpoint':
        print("Loading latest training checkpoint in: ", hps.lfads_save_dir)
        saver = model.seso_saver
    elif cp_pb_ln == 'checkpoint_lve':
        print("Loading lowest validation checkpoint in: ", hps.lfads_save_dir)
        saver = model.lve_saver
    else:
        print("Loading checkpoint: ", cp_pb_ln, ", in: ", hps.lfads_save_dir)
        saver = model.seso_saver

    ckpt = tf.train.get_checkpoint_state(hps.lfads_save_dir,
                                         latest_filename=cp_pb_ln)

    session = tf.get_default_session()
    print("ckpt: ", ckpt)
    if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
        print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
        saver.restore(session, ckpt.model_checkpoint_path)
    else:
        print("Created model with fresh parameters.")
        if kind in [
                "posterior_sample_and_average", "posterior_push_mean",
                "prior_sample", "write_model_params"
        ]:
            print("Possible error!!! You are running ", kind, " on a newly \
      initialized model!")
            # cannot print ckpt.model_checkpoint_path if there is no ckpt
            print("Are you sure a checkpoint in ", hps.lfads_save_dir,
                  " exists?")

        tf.global_variables_initializer().run()

    if ckpt:
        train_step_str = re.search('-[0-9]+$',
                                   ckpt.model_checkpoint_path).group()
    else:
        train_step_str = '-0'

    fname = 'hyperparameters' + train_step_str + '.txt'
    hp_fname = os.path.join(hps.lfads_save_dir, fname)
    hps_for_saving = jsonify_dict(hps)
    utils.write_data(hp_fname, hps_for_saving, use_json=True)

    return model
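Because build_model restores or initializes weights through tf.get_default_session(), it presumably has to be called with a session already installed as the default. A hedged usage sketch; hps and datasets are assumed to come from the surrounding LFADS setup code, which is not shown here:

import tensorflow as tf

with tf.Graph().as_default(), tf.Session().as_default():
    model = build_model(hps, kind="train", datasets=datasets)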
Example #49
0
File: train.py Project: alcinos/dps
    def _run(self):
        print(cfg.to_string())

        threshold_reached = True
        self.global_step = 0
        self.n_global_experiences = 0
        self.curriculum_remaining = self.curriculum + []
        self.curriculum_complete = []

        stage_idx = 0
        while self.curriculum_remaining:
            print("\n" + "=" * 50)
            self.timestamp("Starting stage {}".format(stage_idx))
            print("\n")

            if cfg.start_tensorboard:
                restart_tensorboard(self.experiment_store.path, cfg.tbport, cfg.reload_interval)

            stage_config = self.curriculum_remaining.pop(0)
            stage_config = Config(stage_config)

            self.data.start_stage(stage_idx, stage_config)

            with ExitStack() as stack:

                # --------------- Stage set-up -------------------

                print("\n" + "-" * 10 + " Stage set-up " + "-" * 10)

                print("\nNew config values for this stage are: \n{}\n".format(pformat(stage_config)))
                stack.enter_context(stage_config)

                stage_prepare_func = cfg.get("stage_prepare_func", None)
                if callable(stage_prepare_func):
                    stage_prepare_func()  # Modify the stage config in arbitrary ways before starting stage

                self.mpi_context.start_stage()

                # Configure and create session and graph for stage.
                session_config = tf.ConfigProto()
                session_config.intra_op_parallelism_threads = cfg.get('intra_op_parallelism_threads', 0)
                session_config.inter_op_parallelism_threads = cfg.get('inter_op_parallelism_threads', 0)
                session_config.log_device_placement = cfg.get('log_device_placement', 0)

                if cfg.use_gpu:
                    per_process_gpu_memory_fraction = getattr(cfg, 'per_process_gpu_memory_fraction', None)
                    if per_process_gpu_memory_fraction:
                        session_config.gpu_options.per_process_gpu_memory_fraction = per_process_gpu_memory_fraction

                    gpu_allow_growth = getattr(cfg, 'gpu_allow_growth', None)
                    if gpu_allow_growth:
                        session_config.gpu_options.allow_growth = gpu_allow_growth

                if cfg.use_gpu:
                    print("Using GPU if available.")
                    print("Using {}% of GPU memory.".format(
                        100 * session_config.gpu_options.per_process_gpu_memory_fraction))
                    print("Allowing growth of GPU memory: {}".format(session_config.gpu_options.allow_growth))

                graph = tf.Graph()
                sess = tf.Session(graph=graph, config=session_config)

                # This HAS to come after the creation of the session, otherwise
                # it allocates all GPU memory if using the GPU.
                print("\nAvailable devices: ")
                from tensorflow.python.client import device_lib
                print(device_lib.list_local_devices())

                if not cfg.use_gpu:
                    print("Not using GPU.")
                    stack.enter_context(graph.device("/cpu:0"))

                stack.enter_context(graph.as_default())
                stack.enter_context(sess)
                stack.enter_context(sess.as_default())

                # Set the seed for the stage. Notice we generate a new tf seed for each stage.
                tf_seed = gen_seed()
                print("Setting tensorflow seed to generated seed: {}\n".format(tf_seed))
                tf.set_random_seed(tf_seed)

                # Set limit on CPU RAM for the stage
                cpu_ram_limit_mb = cfg.get("cpu_ram_limit_mb", None)
                if cpu_ram_limit_mb is not None:
                    stack.enter_context(memory_limit(cfg.cpu_ram_limit_mb))

                print("Building env...\n")

                # Maybe build env
                if stage_idx == 0 or not cfg.preserve_env:
                    if getattr(self, 'env', None):
                        self.env.close()

                    self.env = cfg.build_env()

                if hasattr(self.env, "print_memory_footprint"):
                    self.env.print_memory_footprint()

                print("\nDone building env.\n")
                print("Building updater...\n")

                import warnings
                with warnings.catch_warnings():
                    warnings.simplefilter('once')

                    if cfg.n_procs > 1:
                        updater = cfg.get_updater(self.env, mpi_context=self.mpi_context)
                    else:
                        updater = cfg.get_updater(self.env)

                    updater.stage_idx = stage_idx
                    updater.exp_dir = self.exp_dir

                    updater.build_graph()
                    print("\nDone building updater.\n")

                walk_variable_scopes(max_depth=3)

                # Maybe initialize network weights.
                # Let a *path_specification* be one of three things:
                #     1. An integer specifying a stage to load the best hypothesis from.
                #     2. A string of format: "stage_idx,kind" where `stage_idx` specifies a stage to load from
                #        and `kind` is either "final" or "best", specifying whether to load final or best
                #        hypothesis from that stage.
                #     3. A path on the filesystem that gives a prefix for a tensorflow checkpoint file to load from.
                #
                # Then cfg.load_path can either be a path_specification itself, in which case all variables
                # in the network will be loaded from that path_specification, or a dictionary mapping from
                # variable scope names to path specifications, in which case all variables in each supplied
                # variable scope name will be loaded from the path_specification paired with that scope name.
                load_path = cfg.load_path
                if load_path is not None:
                    if isinstance(load_path, str) or isinstance(load_path, int):
                        load_path = {"": load_path}

                    load_path = dict(load_path)

                    # Sort in increasing order, so that if one variable scope lies within another,
                    # the outer scope gets loaded before the inner scope, rather than having the outer scope
                    # wipe out the inner scope.
                    items = sorted(load_path.items())

                    for var_scope, path in items:
                        variables = {v.name: v for v in trainable_variables(var_scope, for_opt=False)}
                        if not variables:
                            print("No variables to load in scope {}.".format(str(var_scope)))
                            continue

                        saver = tf.train.Saver(variables)

                        load_stage, kind = None, None

                        if isinstance(path, int):
                            load_stage = path
                            kind = "best"
                        elif isinstance(path, str):
                            try:
                                split = path.split(',')
                                load_stage = int(split[0])
                                kind = split[1] if len(split) > 1 else 'best'
                                assert kind in 'best final'.split(), "path={}".format(path)
                            except Exception:
                                load_stage, kind = None, None

                        if load_stage is not None:
                            if stage_idx == 0:
                                print(
                                    "Not loading var scope \"{}\" from stage {}, "
                                    "currently in stage 0.".format(var_scope, load_stage))
                                continue
                            else:
                                key = kind + '_path'
                                completed_history = self.data.history[:-1]
                                path = completed_history[load_stage][key]

                        path = os.path.realpath(path)

                        saver.restore(tf.get_default_session(), path)

                        print("Loading var scope \"{}\" from {}.".format(var_scope, path))
                else:
                    print("Using a fresh set of weights, not loading anything.")

                tf.train.get_or_create_global_step()
                sess.run(uninitialized_variables_initializer())
                sess.run(tf.assert_variables_initialized())

                for hook in cfg.hooks:
                    assert isinstance(hook, Hook)
                    hook.start_stage(self, updater, stage_idx)

                threshold_reached = False
                reason = None

                try:
                    # --------------- Run stage -------------------

                    start = time.time()
                    phys_memory_before = memory_usage(physical=True)
                    gpu_memory_before = gpu_memory_usage()

                    threshold_reached, reason = self._run_stage(stage_idx, updater)

                except KeyboardInterrupt:
                    reason = "User interrupt"

                except NotImplementedError as e:
                    # There is a bug in pdb_postmortem that prevents instances of `NotImplementedError`
                    # from being handled properly, so replace it with an instance of `Exception`.
                    if cfg.robust:
                        traceback.print_exc()
                        reason = "Exception occurred ({})".format(repr(e))
                    else:
                        raise Exception("NotImplemented") from e

                except Exception as e:
                    reason = "Exception occurred ({})".format(repr(e))
                    if cfg.robust:
                        traceback.print_exc()
                    else:
                        raise

                except Alarm:
                    reason = "Time limit exceeded"
                    raise

                finally:
                    phys_memory_after = memory_usage(physical=True)
                    gpu_memory_after = gpu_memory_usage()

                    self.data.record_values_for_stage(
                        stage_duration=time.time()-start,
                        phys_memory_before_mb=phys_memory_before,
                        phys_memory_delta_mb=phys_memory_after - phys_memory_before,
                        gpu_memory_before_mb=gpu_memory_before,
                        gpu_memory_delta_mb=gpu_memory_after - gpu_memory_before
                    )

                    self.data.record_values_for_stage(reason=reason)

                    print("\n" + "-" * 10 + " Optimization complete " + "-" * 10)
                    print("\nReason: {}.\n".format(reason))

                    final_path = self.data.path_for('weights/final_for_stage_{}'.format(stage_idx))
                    final_path = cfg.get('save_path', final_path)
                    final_path = updater.save(tf.get_default_session(), final_path)
                    self.data.record_values_for_stage(final_path=final_path)

                    # --------------- Maybe render performance of best hypothesis -------------------

                    do_final_testing = (
                        "Exception occurred" not in reason
                        and reason != "Time limit exceeded"
                        and 'best_path' in self.data.current_stage_record)

                    if do_final_testing:
                        try:
                            print("\n" + "-" * 10 + " Final testing/rendering " + "-" * 10)

                            print("Best hypothesis for this stage was found on "
                                  "step (l: {best_local_step}, g: {best_global_step}) "
                                  "with stopping criteria ({sc_name}) of {best_stopping_criteria}.".format(
                                      sc_name=self.stopping_criteria_name, **self.data.current_stage_record))

                            best_path = self.data.current_stage_record['best_path']
                            print("Loading best hypothesis for this stage "
                                  "from file {}...".format(best_path))
                            updater.restore(sess, best_path)

                            test_record = updater.evaluate(cfg.batch_size, mode="test")

                            for hook in cfg.hooks:
                                if hook.call_per_timestep and hook.final:
                                    hook_record = hook.step(self, updater)

                                    if hook_record:
                                        assert len(hook_record) == 1
                                        for k, d in dict(hook_record).items():
                                            test_record.update(d)

                            self.data.record_values_for_stage(
                                **{'_test_' + k: v for k, v in test_record.items()})

                            if cfg.render_step > 0 and cfg.render_hook is not None:
                                print("Rendering...")
                                cfg.render_hook(updater)
                                print("Done rendering.")

                        except BaseException:
                            print("Exception occurred while performing final testing/rendering: ")
                            traceback.print_exc()

                    else:
                        print("\n" + "-" * 10 + " Skipping final testing/rendering " + "-" * 10)

                    # --------------- Finish up the stage -------------------

                    self.data.end_stage(updater.n_updates)

                    print("\n" + "-" * 10 + " Running end-of-stage hooks " + "-" * 10 + "\n")
                    for hook in cfg.hooks:
                        hook.end_stage(self, stage_idx)

                    print()
                    self.timestamp("Done stage {}".format(stage_idx))
                    print("=" * 50)

                    stage_idx += 1
                    self.curriculum_complete.append(stage_config)

                if not (threshold_reached or cfg.power_through):
                    print("Failed to reach stopping criteria threshold on stage {} "
                          "of the curriculum, terminating.".format(stage_idx))
                    break
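The long comment inside _run describes three accepted forms for cfg.load_path. A few hedged illustrations of those forms; all paths and scope names below are hypothetical:

load_path = 2                                   # form 1: best hypothesis from stage 2
load_path = "2,final"                           # form 2: final hypothesis from stage 2
load_path = "/tmp/run/weights/best_of_stage_2"  # form 3: checkpoint prefix on disk

# Or a dict mapping variable-scope names to path specifications:
load_path = {
    "encoder": 0,                               # encoder weights from stage 0 (best)
    "decoder": "/tmp/pretrained/decoder",       # decoder weights from a checkpoint prefix
}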
Example #50
0
File: train.py Project: alcinos/dps
    def _run_stage(self, stage_idx, updater):
        """ Run main training loop for a stage of the curriculum. """

        threshold_reached = False
        reason = "NotStarted"

        # Parse stopping criteria, set up early stopping
        stopping_criteria = cfg.get("stopping_criteria", None)
        if not stopping_criteria:
            stopping_criteria = updater.stopping_criteria

        if isinstance(stopping_criteria, str):
            stopping_criteria = stopping_criteria.split(",")

        self.stopping_criteria_name = stopping_criteria[0]
        if "max" in stopping_criteria[1]:
            self.maximize_sc = True
        elif "min" in stopping_criteria[1]:
            self.maximize_sc = False
        else:
            raise Exception("Ambiguous stopping criteria specification: {}".format(stopping_criteria[1]))

        early_stop = EarlyStopHook(patience=cfg.patience, maximize=self.maximize_sc)

        # Start stage
        print("\n" + "-" * 10 + " Training begins " + "-" * 10)
        self.timestamp("")
        print()

        total_hooks_time = 0.0
        time_per_hook = 0.0

        total_eval_time = 0.0
        time_per_eval = 0.0

        total_train_time = 0.0
        time_per_example = 0.0
        time_per_update = 0.0

        n_eval = 0

        while True:
            # Check whether to keep training
            if updater.n_updates >= cfg.max_steps:
                reason = "Maximum number of steps-per-stage reached"
                break

            if updater.n_experiences >= cfg.max_experiences:
                reason = "Maximum number of experiences-per-stage reached"
                break

            local_step = updater.n_updates
            global_step = self.global_step

            if local_step > 0 and local_step % cfg.checkpoint_step == 0:
                self.data.dump_data(local_step)

            evaluate = (local_step % cfg.eval_step) == 0
            display = (local_step % cfg.display_step) == 0
            render = (cfg.render_step > 0
                      and (local_step % cfg.render_step) == 0
                      and (local_step > 0 or cfg.render_first))

            data_to_store = []

            # --------------- Run hooks -------------------

            hooks_start = time.time()

            for hook in cfg.hooks:
                if hook.call_per_timestep:
                    run_hook = local_step == 0 and hook.initial
                    run_hook |= local_step > 0 and local_step % hook.n == 0

                    if run_hook:
                        hook_record = hook.step(self, updater, local_step)

                        if hook_record:
                            data_to_store.extend(dict(hook_record).items())

            hooks_duration = time.time() - hooks_start

            if render and cfg.render_hook is not None:
                print("Rendering...")
                cfg.render_hook(updater)
                print("Done rendering.")

            if display:
                print("Displaying...")
                self.data.summarize_current_stage(
                    local_step, global_step, updater.n_experiences, self.n_global_experiences)
                print("\nMy PID: {}\n".format(os.getpid()))
                print("Physical memory use: {}mb".format(memory_usage(physical=True)))
                print("Virtual memory use: {}mb".format(memory_usage(physical=False)))

                print("Avg time per update: {}s".format(time_per_update))
                print("Avg time per eval: {}s".format(time_per_eval))
                print("Avg time for hooks: {}s".format(time_per_hook))

                if cfg.use_gpu:
                    print(nvidia_smi())

            # --------------- Possibly evaluate -------------------

            if evaluate:
                print("Evaluating...")
                eval_start_time = time.time()
                val_record = updater.evaluate(cfg.batch_size, mode="val")
                eval_duration = time.time() - eval_start_time
                print("Done evaluating")

                val_record["duration"] = eval_duration

                n_eval += 1
                total_eval_time += eval_duration
                time_per_eval = total_eval_time / n_eval

                data_to_store.append(("val", val_record))

                if self.stopping_criteria_name not in val_record:
                    print("Stopping criteria {} not in record returned "
                          "by updater, using 0.0.".format(self.stopping_criteria_name))

                stopping_criteria = val_record.get(self.stopping_criteria_name, 0.0)
                new_best, stop = early_stop.check(stopping_criteria, local_step, val_record)

                if new_best:
                    print("Storing new best on step (l={}, g={}), "
                          "constituting (l={}, g={}) experiences, "
                          "with stopping criteria ({}) of {}.".format(
                              local_step, global_step,
                              updater.n_experiences, self.n_global_experiences,
                              self.stopping_criteria_name, stopping_criteria))

                    best_path = self.data.path_for(
                        'weights/best_of_stage_{}'.format(stage_idx))
                    best_path = cfg.get('save_path', best_path)

                    weight_start = time.time()
                    best_path = updater.save(tf.get_default_session(), best_path)

                    print("Done saving weights, took {} seconds".format(time.time() - weight_start))

                    self.data.record_values_for_stage(
                        best_path=best_path, best_global_step=global_step)
                    self.data.record_values_for_stage(
                        **{'best_' + k: v for k, v in early_stop.best.items()})

                if stop:
                    print("Early stopping triggered.")
                    reason = "Early stopping triggered"
                    break

                if self.maximize_sc:
                    threshold_reached = stopping_criteria >= cfg.threshold
                else:
                    threshold_reached = stopping_criteria <= cfg.threshold

                if threshold_reached:
                    reason = "Stopping criteria threshold reached"
                    break

            # --------------- Perform an update -------------------

            if cfg.do_train:
                if local_step % 100 == 0:
                    print("Running update step {}...".format(local_step))

                update_start_time = time.time()

                _old_n_experiences = updater.n_experiences

                update_record = updater.update(cfg.batch_size)

                update_duration = time.time() - update_start_time
                update_record["train"]["duration"] = update_duration

                if local_step % 100 == 0:
                    print("Done update step.")

                if local_step % 100 == 0:
                    start = time.time()
                    update_record["train"]["memory_physical_mb"] = memory_usage(physical=True)
                    update_record["train"]["memory_virtual_mb"] = memory_usage(physical=False)
                    update_record["train"]["memory_gpu_mb"] = gpu_memory_usage()
                    print("Memory check duration: {}".format(time.time() - start))

                data_to_store.extend(dict(update_record).items())

                n_experiences_delta = updater.n_experiences - _old_n_experiences
                self.n_global_experiences += n_experiences_delta

                total_train_time += update_duration
                time_per_example = total_train_time / updater.n_experiences
                time_per_update = total_train_time / updater.n_updates

                total_hooks_time += hooks_duration
                time_per_hook = total_hooks_time / updater.n_updates

            # --------------- Store data -------------------

            records = defaultdict(dict)
            for mode, r in data_to_store:
                records[mode].update(r)

            self.data.store_step_data_and_summaries(
                stage_idx, local_step, global_step,
                updater.n_experiences, self.n_global_experiences,
                **records)

            self.data.record_values_for_stage(
                time_per_example=time_per_example,
                time_per_update=time_per_update,
                time_per_eval=time_per_eval,
                time_per_hook=time_per_hook,
                n_steps=local_step,
                n_experiences=updater.n_experiences,
            )

            self.global_step += 1

            # If `do_train` is False, we do no training and evaluate
            # exactly once, so only one iteration is required.
            if not cfg.do_train:
                reason = "`do_train` set to False"
                break

        return threshold_reached, reason
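The stopping criteria in _run_stage are given as a "name,direction" string (or an equivalent two-element sequence): the first element is the key looked up in the validation record, the second decides whether EarlyStopHook maximizes or minimizes it. A hedged sketch; the metric names are hypothetical:

cfg.stopping_criteria = "reward_per_ep,max"   # track a metric to maximize
cfg.stopping_criteria = "loss,min"            # or one to minimize

name, direction = "reward_per_ep,max".split(",")
maximize = "max" in direction                 # mirrors the parsing above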
Example #51
0
    def __init__(self, *, policy, ob_space, ac_space, nbatch_act, nbatch_train,
                nsteps, ent_coef, vf_coef, max_grad_norm):
        sess = tf.get_default_session()
        print(policy)
        print(ob_space)
        print(ac_space)
        print(nbatch_act)
        print(nbatch_train)
        print(nsteps)
        print(ent_coef)
        print(vf_coef)
        print(max_grad_norm)

        act_model = policy(sess, ob_space, ac_space, nbatch_act, 1, reuse=False)
        train_model = policy(sess, ob_space, ac_space, nbatch_train, nsteps, reuse=True)

        A = train_model.pdtype.sample_placeholder([None])
        ADV = tf.placeholder(tf.float32, [None])
        R = tf.placeholder(tf.float32, [None])
        OLDNEGLOGPAC = tf.placeholder(tf.float32, [None])
        OLDVPRED = tf.placeholder(tf.float32, [None])
        LR = tf.placeholder(tf.float32, [])
        CLIPRANGE = tf.placeholder(tf.float32, [])

        neglogpac = train_model.pd.neglogp(A)
        entropy = tf.reduce_mean(train_model.pd.entropy())

        vpred = train_model.vf
        vpredclipped = OLDVPRED + tf.clip_by_value(train_model.vf - OLDVPRED, - CLIPRANGE, CLIPRANGE)
        vf_losses1 = tf.square(vpred - R)
        vf_losses2 = tf.square(vpredclipped - R)
        vf_loss = .5 * tf.reduce_mean(tf.maximum(vf_losses1, vf_losses2))
        ratio = tf.exp(OLDNEGLOGPAC - neglogpac)
        pg_losses = -ADV * ratio
        pg_losses2 = -ADV * tf.clip_by_value(ratio, 1.0 - CLIPRANGE, 1.0 + CLIPRANGE)
        pg_loss = tf.reduce_mean(tf.maximum(pg_losses, pg_losses2))
        approxkl = .5 * tf.reduce_mean(tf.square(neglogpac - OLDNEGLOGPAC))
        clipfrac = tf.reduce_mean(tf.to_float(tf.greater(tf.abs(ratio - 1.0), CLIPRANGE)))
        loss = pg_loss - entropy * ent_coef + vf_loss * vf_coef
        with tf.variable_scope('model'):
            params = tf.trainable_variables()
        grads = tf.gradients(loss, params)
        if max_grad_norm is not None:
            grads, _grad_norm = tf.clip_by_global_norm(grads, max_grad_norm)
        grads = list(zip(grads, params))
        trainer = tf.train.AdamOptimizer(learning_rate=LR, epsilon=1e-5)
        _train = trainer.apply_gradients(grads)

        def train(lr, cliprange, obs, returns, masks, actions, values, neglogpacs, states=None):
            advs = returns - values
            advs = (advs - advs.mean()) / (advs.std() + 1e-8)
            td_map = {train_model.X:obs, A:actions, ADV:advs, R:returns, LR:lr,
                    CLIPRANGE:cliprange, OLDNEGLOGPAC:neglogpacs, OLDVPRED:values}
            if states is not None:
                td_map[train_model.S] = states
                td_map[train_model.M] = masks
            return sess.run([pg_loss, vf_loss, entropy, approxkl, clipfrac, _train], td_map)[:-1]
        self.loss_names = ['policy_loss', 'value_loss', 'policy_entropy', 'approxkl', 'clipfrac']

        def save(save_path):
            ps = sess.run(params)
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            #print(loaded_params)
            restores = []
            for p, loaded_p in zip(params, loaded_params):
                restores.append(p.assign(loaded_p))
            sess.run(restores)
            # If you want to load weights, also save/load observation scaling inside VecNormalize

        self.train = train
        self.train_model = train_model
        self.act_model = act_model
        self.step = act_model.step
        self.value = act_model.value
        self.initial_state = act_model.initial_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess) #pylint: disable=E1101
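The loss built above is PPO's clipped surrogate objective: maximizing min(r·A, clip(r, 1-ε, 1+ε)·A) is written as minimizing mean(max(-A·r, -A·clip(r))). A NumPy restatement on made-up numbers:

import numpy as np

cliprange = 0.2
adv = np.array([1.0, -0.5])
ratio = np.exp(np.array([0.1, -0.3]))          # exp(old_neglogp - new_neglogp)

pg_losses = -adv * ratio
pg_losses2 = -adv * np.clip(ratio, 1.0 - cliprange, 1.0 + cliprange)
pg_loss = np.mean(np.maximum(pg_losses, pg_losses2))

The approxkl and clipfrac tensors in the same graph are the usual PPO diagnostics: an estimate of how far the policy has moved and the fraction of ratios that hit the clip boundary.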
Example #52
0
 def run_apply_grads(*, grads, lr_value):
     # feed each gradient array into its placeholder; `lr` is assumed to be a
     # learning-rate placeholder defined in the enclosing scope
     feed_dict = {p: g for p, g in zip(grad_placeholders, grads)}
     feed_dict[lr] = lr_value
     return tf.get_default_session().run(apply_grads_op, feed_dict=feed_dict)
Example #53
0
def evaluate_softmax(X_data):
    sess = tf.get_default_session()
    somax = sess.run(soft_max, feed_dict={x: X_data})
    return somax
Example #54
0
    def initialize(self,
                   checkpoints=None,
                   reset=False,
                   reset_learning_rate=False,
                   max_to_keep=1,
                   keep_every_n_hours=0,
                   sess=None,
                   whitelist=None,
                   blacklist=None,
                   **kwargs):
        """
        :param checkpoints: list of checkpoints to load (instead of latest checkpoint)
        :param reset: don't load latest checkpoint, reset learning rate and global step
        :param reset_learning_rate: reset the learning rate to its initial value
        :param max_to_keep: keep this many latest checkpoints at all times
        :param keep_every_n_hours: and keep checkpoints every n hours
        """
        sess = sess or tf.get_default_session()

        if keep_every_n_hours is None or keep_every_n_hours <= 0:
            keep_every_n_hours = float('inf')

        self.saver = tf.train.Saver(
            max_to_keep=max_to_keep,
            keep_checkpoint_every_n_hours=keep_every_n_hours,
            sharded=False)

        sess.run(tf.global_variables_initializer())

        # load pre-trained embeddings
        for encoder_or_decoder, vocab in zip(self.encoders + self.decoders,
                                             self.vocabs):
            if encoder_or_decoder.embedding_file:
                utils.log('loading embeddings from: {}'.format(
                    encoder_or_decoder.embedding_file))
                embeddings = {}
                with open(encoder_or_decoder.embedding_file,
                          encoding="utf-8") as embedding_file:
                    for line in embedding_file:
                        word, vector = line.split(' ', 1)
                        if word in vocab.vocab:
                            embeddings[word] = np.array(
                                list(map(float, vector.split())))
                # standardize (mean of 0, std of 0.01)
                mean = sum(embeddings.values()) / len(embeddings)
                std = np.sqrt(
                    sum((value - mean)**2
                        for value in embeddings.values()) /
                    (len(embeddings) - 1))
                for key in embeddings:
                    embeddings[key] = 0.01 * (embeddings[key] - mean) / std

                # change TensorFlow variable's value
                with tf.variable_scope(tf.get_variable_scope(), reuse=True):
                    embedding_var = tf.get_variable('embedding_' +
                                                    encoder_or_decoder.name)
                    embedding_value = embedding_var.eval()
                    for word, i in vocab.vocab.items():
                        if word in embeddings:
                            embedding_value[i] = embeddings[word]
                    sess.run(embedding_var.assign(embedding_value))

        if whitelist:
            with open(whitelist, encoding="utf-8") as f:
                whitelist = list(line.strip() for line in f)
        if blacklist:
            with open(blacklist, encoding="utf-8") as f:
                blacklist = list(line.strip() for line in f)
        else:
            blacklist = []

        blacklist.append('dropout_keep_prob')

        if reset_learning_rate or reset:
            blacklist.append('learning_rate')
        if reset:
            blacklist.append('global_step')

        params = {
            k: kwargs.get(k)
            for k in ('variable_mapping', 'reverse_mapping')
        }

        if checkpoints and len(self.models) > 1:
            assert len(self.models) == len(checkpoints)
            for i, checkpoint in enumerate(checkpoints, 1):
                load_checkpoint(sess,
                                None,
                                checkpoint,
                                blacklist=blacklist,
                                whitelist=whitelist,
                                prefix='model_{}'.format(i),
                                **params)
        elif checkpoints:  # load partial checkpoints
            for checkpoint in checkpoints:  # checkpoint files to load
                load_checkpoint(sess,
                                None,
                                checkpoint,
                                blacklist=blacklist,
                                whitelist=whitelist,
                                **params)
        elif not reset:
            load_checkpoint(sess,
                            self.checkpoint_dir,
                            blacklist=blacklist,
                            whitelist=whitelist,
                            **params)

        utils.debug('global step: {}'.format(self.global_step.eval()))
        utils.debug('baseline step: {}'.format(self.baseline_step.eval()))
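The embedding loading step above rescales the pre-trained vectors to roughly zero mean and a standard deviation of 0.01 (per dimension, across the vocabulary) before assigning them into the embedding variable. A toy NumPy version of that rescaling; the two word vectors are made up:

import numpy as np

embeddings = {"cat": np.array([1.0, 2.0]), "dog": np.array([3.0, 0.0])}
mean = sum(embeddings.values()) / len(embeddings)
std = np.sqrt(sum((v - mean) ** 2 for v in embeddings.values())
              / (len(embeddings) - 1))
scaled = {w: 0.01 * (v - mean) / std for w, v in embeddings.items()}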
Example #55
0
def reset_graph():
    """Closes the current default session and resets the graph."""
    sess = tf.get_default_session()
    if sess:
        sess.close()
    tf.reset_default_graph()
Example #56
0
 def post_update(self, feed_dict, context):
     if self.steps_since_target_update > self.parameter:
         tf.get_default_session().run(self.target_agent_update)
         self.steps_since_target_update = 0
     else:
         self.steps_since_target_update += 1
Example #57
0
 def decay_learning_rate(self):
     sess = tf.get_default_session()
     if sess is None or self.learning_rate_decay is None:
         raise ValueError('need a default session and a learning rate decay op')
     sess.run(self.learning_rate_decay)
Example #58
0
 def post_update(self, feed_dict, context):
     tf.get_default_session().run(self.target_agent_update)
Example #59
0
def get_session():
    """
    Returns recently made TensorFlow session
    :return: tf.Session()
    """
    return tf.get_default_session()
Example #60
0
    def run(self):
        args = self.args
        sess = self.sess

        seq_length = args.seq_length

        env = EnvBreakoutWrapper()

        states = np.zeros((seq_length, *self.a3cnet.state_shape),
                          dtype=np.float32)
        actions = np.zeros((seq_length, ), dtype=np.int32)
        rewards = np.zeros((seq_length, ), dtype=np.float32)
        values = np.zeros((seq_length, ), dtype=np.float32)
        discounted_utilities = np.zeros((seq_length, ), dtype=np.float32)

        with sess.as_default():
            # with tqdm(desc='Episode', total=args.episodes, unit=' episodes') as pbar:
            assert tf.get_default_session() is sess, 'session mismatch'
            # for episode in range(args.episodes):
            global episode
            global scores
            global stats

            while episode < args.episodes:
                total_rewards = 0
                episode_completed = False
                eps_idx = 0
                lives = 5
                # reset environment
                env.reset()
                env.next_life()
                lstm_state = self.a3cnet.zero_lstm_state()
                while not episode_completed:
                    # gather loss statistics
                    policy_loss_list, value_loss_list, entropy_list = [], [], []
                    # save the LSTM state at the beginning of a sequence
                    lstm_state_seq_start = lstm_state
                    # simulate a sequence of steps
                    for seq_idx in range(seq_length):
                        # determine policy
                        feed_dict = {
                            self.a3cnet.state: env.state[np.newaxis, :],
                            self.a3cnet.ph_lstm_state: lstm_state
                        }
                        res = sess.run([
                            self.a3cnet.policy, self.a3cnet.value,
                            self.a3cnet.final_lstm_state
                        ],
                                       feed_dict=feed_dict)
                        policy = res[0][0][0]  # [][batch][time]
                        value = res[1][0][0][0]  # [][batch][time][]
                        lstm_state = res[2]
                        assert all(
                            policy >= 0), "policy not >= 0 {}".format(policy)
                        assert abs(sum(policy) -
                                   1) < 1e-4, "sum policy not 1 {}".format(
                                       sum(policy))
                        # select action according to policy
                        action = np.random.choice(self.a3cnet.action_size,
                                                  p=policy)
                        # execute action, get reward and next state
                        reward, done, info = env.step(action)
                        # save state, action, reward
                        states[seq_idx, :] = env.prev_state
                        actions[seq_idx] = action
                        rewards[seq_idx] = reward
                        values[seq_idx] = value
                        # check if dead
                        if lives > info['ale.lives']:  # has died
                            lives = info['ale.lives']
                            reward = -1
                            env.next_life()
                        # update sum_of_rewards
                        total_rewards += reward
                        # end episode if done
                        if done:
                            break

                    seq_end = (seq_idx + 1)
                    eps_idx += (seq_idx + 1)

                    # calculate discounted utilities
                    if not done:
                        feed_dict = {
                            self.a3cnet.state: env.state[np.newaxis, :],
                            self.a3cnet.ph_lstm_state: lstm_state
                        }
                        res = sess.run(self.a3cnet.value, feed_dict=feed_dict)
                        value = res[0]
                        running_sum = value
                    elif seq_idx > 0:
                        seq_end = seq_idx
                        running_sum = value
                    else:
                        episode_completed = True
                        break  # nothing to train on
                    for reverse_idx in range(seq_idx, -1, -1):
                        running_sum = args.gamma * running_sum + rewards[
                            reverse_idx]
                        discounted_utilities[reverse_idx] = running_sum

                    # train
                    feed_dict = {
                        self.a3cnet.state:
                        states[:seq_end, ...],
                        self.a3cnet.actions:
                        actions[:seq_end],
                        self.a3cnet.value_target:
                        discounted_utilities[:seq_end, ...],
                        self.a3cnet.lr:
                        args.lr,
                        self.a3cnet.coeff_p:
                        args.coeff_p,
                        self.a3cnet.coeff_v:
                        args.coeff_v,
                        self.a3cnet.coeff_h:
                        args.coeff_h,
                        self.a3cnet.ph_lstm_state:
                        lstm_state_seq_start
                    }
                    run_result = sess.run([
                        self.a3cnet.loss, self.a3cnet.train_step,
                        self.a3cnet.policy_loss, self.a3cnet.value_loss,
                        self.a3cnet.entropy, self.a3cnet.value,
                        self.a3cnet.value_target, self.a3cnet.policy,
                        self.a3cnet.logits, self.a3cnet.H
                    ],
                                          feed_dict=feed_dict)

                    loss, policy_loss, value_loss, entropy = run_result[
                        0], run_result[2], run_result[3], run_result[4]
                    value, value_target = run_result[5], run_result[6]
                    policy = run_result[7]
                    logits = run_result[8]
                    H = run_result[9]
                    policy_loss_list.append(policy_loss)
                    value_loss_list.append(value_loss)
                    entropy_list.append(entropy)
                    if (eps_idx // seq_length) % 32 == 0:
                        print(
                            'policy_loss={:2f}, value_loss={:2f}, entropy={:2f}, loss={:2f} '
                            .format(policy_loss, value_loss, entropy, loss),
                            end='')
                        print(actions[:seq_end])
                        # print(policy[0][0])
                        # print(logits[0][0])
                        # print(H)
                    # print(rewards[:seq_end])
                    # print(discounted_utilities[:seq_end])
                    # print(values[:seq_end])

                    # print('value ', value)
                    # print('value_target ', value_target)

                    # determine whether episode completed
                    if done or eps_idx > args.max_episode_length:
                        episode_completed = True

                # save sum_of_rewards
                scores.append(total_rewards)
                stats['policy_loss'].append(
                    np.mean(policy_loss_list
                            ) if len(policy_loss_list) > 0 else 0)
                stats['value_loss'].append(
                    np.mean(value_loss_list) if len(value_loss_list) > 0 else 0
                )
                stats['entropy'].append(
                    np.mean(entropy_list) if len(entropy_list) > 0 else 0)
                episode += 1
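The return targets in the worker above are n-step discounted rewards, seeded with the critic's value estimate for the state after the sequence when the episode has not yet ended (and with nothing to bootstrap when it has). A small NumPy sketch of that backward accumulation, on made-up rewards:

import numpy as np

gamma = 0.99
rewards = np.array([0.0, 1.0, 0.0])
bootstrap_value = 0.5            # critic's estimate for the next state

discounted = np.zeros_like(rewards)
running_sum = bootstrap_value
for i in reversed(range(len(rewards))):
    running_sum = gamma * running_sum + rewards[i]
    discounted[i] = running_sum
# discounted == [gamma**3 * 0.5 + gamma * 1.0,
#                gamma**2 * 0.5 + 1.0,
#                gamma * 0.5]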