def _make(self, hp, global_step=None):
  ts = NS()
  ts.global_step = global_step
  ts.x = tf.placeholder(dtype=tf.int32, name="x")
  ts.seq = self.model.make_training_graph(x=ts.x, length=self.segment_length)
  ts.final_state = ts.seq.final_state
  ts.loss = ts.seq.loss
  ts.error = ts.seq.error
  ts.learning_rate = tf.Variable(hp.initial_learning_rate, dtype=tf.float32,
                                 trainable=False, name="learning_rate")
  ts.decay_op = tf.assign(ts.learning_rate, ts.learning_rate * hp.decay_rate)
  ts.optimizer = tf.train.AdamOptimizer(ts.learning_rate)
  ts.params = tf.trainable_variables()
  print [param.name for param in ts.params]
  ts.gradients = tf.gradients(
      ts.loss, ts.params,
      # secret memory-conserving sauce
      aggregation_method=tf.AggregationMethod.EXPERIMENTAL_TREE)
  # every trainable parameter should receive a gradient
  loose_params = [param for param, gradient
                  in util.equizip(ts.params, ts.gradients)
                  if gradient is None]
  if loose_params:
    raise ValueError("loose parameters: %s" %
                     " ".join(param.name for param in loose_params))
  # tensorflow fails miserably to compute gradient for these,
  # so add the weight-decay contribution to the gradients by hand
  for reg_var in tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES):
    ts.gradients[ts.params.index(reg_var)] += (
        hp.weight_decay *
        tf.gradients(tf.sqrt(tf.reduce_sum(reg_var**2)), [reg_var])[0])
  ts.clipped_gradients, _ = tf.clip_by_global_norm(ts.gradients, hp.clip_norm)
  ts.training_op = ts.optimizer.apply_gradients(
      util.equizip(ts.clipped_gradients, ts.params),
      global_step=ts.global_step)
  ts.summaries = [
      tf.summary.scalar("loss_train", ts.loss),
      tf.summary.scalar("error_train", ts.error),
      tf.summary.scalar("learning_rate", ts.learning_rate)]
  for parameter, gradient in util.equizip(ts.params, ts.gradients):
    ts.summaries.append(
        tf.summary.scalar("meanlogabs_%s" % parameter.name,
                          tfutil.meanlogabs(parameter)))
    ts.summaries.append(
        tf.summary.scalar("meanlogabsgrad_%s" % parameter.name,
                          tfutil.meanlogabs(gradient)))
  return ts
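# Aside on the weight-decay step in _make above: for a variable v, the
# gradient of its L2 norm sqrt(sum(v**2)) is v / ||v||, so each matching
# gradient gets hp.weight_decay * v / ||v|| added to it. A minimal
# standalone check of that identity (illustrative sketch only, not part
# of this class):
#
#   v = tf.Variable([3.0, 4.0])
#   g = tf.gradients(tf.sqrt(tf.reduce_sum(v**2)), [v])[0]
#   # after initialization, g evaluates to [0.6, 0.8], i.e. v / 5.0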
def run(self, session, examples, max_step_count=None, hp=None, aggregates=None):
  aggregates = NS(aggregates or {})
  for key in "loss error".split():
    if key not in aggregates:
      aggregates[key] = util.MeanAggregate()
  tensors = self.tensors.Extract(*[key for key in aggregates.Keys()])
  tensors.Update(self.tensors.Extract("final_state.model"))
  state = NS(step=0, model=self.model.initial_state(hp.batch_size))
  try:
    for batch in util.batches(examples, hp.batch_size):
      for segment in util.segments(batch, hp.segment_length,
                                   overlap=hp.chunk_size):
        # StopIteration doubles as a break out of both loops
        if max_step_count is not None and state.step >= max_step_count:
          raise StopIteration()
        x, = util.examples_as_arrays(segment)
        feed_dict = {self.tensors.x: x.T}
        feed_dict.update(self.model.feed_dict(state.model))
        values = NS.FlatCall(
            ft.partial(session.run, feed_dict=feed_dict), tensors)
        for key in aggregates.Keys():
          aggregates[key].add(values[key])
        sys.stderr.write(".")
        # carry the recurrent state across segments
        state.model = values.final_state.model
        state.step += 1
  except StopIteration:
    pass
  sys.stderr.write("\n")
  values = NS((key, aggregate.value)
              for key, aggregate in aggregates.Items())
  values.summaries = [
      tf.Summary.Value(tag="%s_valid" % key, simple_value=values[key])
      for key in "loss error".split()]
  print "### evaluation loss %6.5f error %6.5f" % (values.loss, values.error)
  if np.isnan(values.loss):
    raise ValueError("loss has become NaN")
  return values
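# Hypothetical driver sketch for the evaluation loop above; the hp fields
# mirror the ones this method reads, but the surrounding setup (how
# `evaluator` and `examples` are constructed) is an assumption:
#
#   hp = NS(batch_size=32, segment_length=100, chunk_size=50)
#   with tf.Session() as session:
#     session.run(tf.global_variables_initializer())
#     values = evaluator.run(session, examples, max_step_count=100, hp=hp)
#     print "loss", values.loss, "error", values.error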