def test_graph_input_order():
    in1 = tx.Input(n_units=1, name="in1", dtype=tf.float32, constant=False)
    in2 = tx.Input(n_units=1, name="in2", dtype=tf.float32, constant=False)
    in12 = tx.Add(in1, in2)
    in3 = tx.Constant(tf.ones(shape=[1], dtype=tf.float32))
    in123 = tx.Add(in12, in3)

    graph = tx.Graph.build(inputs=None, outputs=in123)

    # print("\n")
    # for layer, p in graph.dependency_iter().items():
    #     print(layer.name)
    #     print(p)
    print(list(map(lambda x: x.name, graph.in_nodes)))
def _build_graph(self, layer, previous_state):
    with layer_scope(self):
        if previous_state is None:
            input_batch = tf.shape(layer.tensor)[0]
            zero_state = tf.zeros([input_batch, self.n_units])
            self.previous_state = tx.TensorLayer(zero_state, self.n_units)

        if self.share_state_with is None:
            # weights applied to the current input and to the previous state for the r and u gates
            # the bias could be added at the end, but this way a single bias is defined per gate
            self.r_current_w = tx.Linear(layer, self.n_units, bias=True,
                                         weight_init=self.init, name="r_current_w")
            self.r_recurrent_w = tx.Linear(self.previous_state, self.n_units, bias=False,
                                           weight_init=self.recurrent_init, name="r_recurrent_w")

            self.u_current_w = tx.Linear(layer, self.n_units, bias=True,
                                         weight_init=self.init, name="u_current_w")
            self.u_recurrent_w = tx.Linear(self.previous_state, self.n_units, bias=False,
                                           weight_init=self.recurrent_init, name="u_recurrent_w")

            # candidate state weights
            self.current_w = tx.Linear(layer, self.n_units, bias=True,
                                       weight_init=self.init, name="current_w")
            self.recurrent_w = tx.Linear(self.previous_state, self.n_units, bias=False,
                                         weight_init=self.recurrent_init, name="recurrent_w")

            # candidate kernel: current-input linear layer followed by the cell activation
            kernel_act = tx.Activation(self.current_w, self.activation)
            self.kernel = tx.Compose(self.current_w, kernel_act)
            # recurrent kernel mirrors the candidate kernel on the previous-state path
            self.recurrent_kernel = self.recurrent_w
        else:
            self.kernel = self.share_state_with.kernel.reuse_with(layer)
            self.recurrent_kernel = self.share_state_with.recurrent_kernel.reuse_with(self.previous_state)

        # reset gate: sigmoid of the summed current and recurrent contributions plus a single bias
        r_state = tx.Add(self.r_current_w, self.r_recurrent_w)
        r_state = tx.Bias(r_state)
        r_gate = tx.Activation(r_state, fn=tx.sigmoid, name="r_gate")

        # Gated recurrent unit (GRU) with n_units cells
        return self.kernel.tensor + self.recurrent_kernel.tensor
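# Illustrative sketch (not part of the cell above): the standard GRU equations that the
# builder assembles out of tx.Linear / tx.Add / tx.Activation layers, written in plain
# TensorFlow so the gating is explicit. The weight and bias names (w_*, u_*, b_*) are
# made up for this example; it assumes `import tensorflow as tf` at module level, as in
# the surrounding code.
def gru_reference_step(x, h_prev, w_r, u_r, b_r, w_u, u_u, b_u, w_c, u_c, b_c):
    """One GRU step: x is [batch, n_inputs], h_prev is [batch, n_units]."""
    r = tf.sigmoid(tf.matmul(x, w_r) + tf.matmul(h_prev, u_r) + b_r)   # reset gate
    u = tf.sigmoid(tf.matmul(x, w_u) + tf.matmul(h_prev, u_u) + b_u)   # update gate
    c = tf.tanh(tf.matmul(x, w_c) + r * tf.matmul(h_prev, u_c) + b_c)  # candidate state
    # one common convention: the update gate interpolates between old and candidate state
    return u * h_prev + (1.0 - u) * c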
def test_merge_add_shape():
    x1 = tx.Input([[2.]], n_units=1, name="x1")
    x2 = tx.Input([[2.]], n_units=1, name="x2")

    add = tx.Add(x1, x2)
    assert len(add.shape) == 2
    assert add.shape[-1] == 1
    assert add.shape[0] is None
def test_module_reuse_order():
    x1 = tx.Input([[2.]], n_units=1, name="x1")
    x2 = tx.Input([[2.]], n_units=1, name="x2")
    x3 = tx.Input([[1.]], n_units=1, name="x3")

    h = tx.Add(x2, x3)
    y = tx.Add(x1, h)

    module = tx.Module(inputs=[x1, x2, x3], output=y)

    x1_ = tx.Constant([[2.]], name="x1b")
    x2_ = tx.Constant([[2.]], name="x2b")
    m2 = module.reuse_with(x1_, x2_)

    m1 = module()
    m2 = m2()

    assert tx.tensor_equal(m1, m2)
def test_build_graph():
    x1 = tx.Input(n_units=1000, constant=False, dtype=tf.float32)
    x2 = tx.Input(init_value=tf.ones([1, 3]), dtype=tf.float32, constant=True)

    y10 = tx.Linear(x1, n_units=3)
    y11 = tx.Activation(y10)
    y1 = tx.Module(x1, y11)
    y2 = tx.Add(y1, x2)
    output = y2

    graph = Graph.build(inputs=None, outputs=[y1, y2])
    # the module condenses 2 nodes, so the graph has 4 nodes instead of 6
    assert len(graph.nodes) == 4

    @tf.function
    def simple_graph(in0):
        x1.value = in0
        return y2()

    simple_graph_2 = Graph.build(inputs=[x1, x2], outputs=y2)
    simple_graph_2 = tf.function(simple_graph_2)
    g = Graph.build(inputs=[x1, x2], outputs=y2)
    y2fn = y2.as_function()
    data = tf.ones([256, 1000])
    x1.value = data

    compiled_fn = g.as_function(ord_inputs=x1, ord_outputs=output)

    assert tx.tensor_equal(compiled_fn(data), y2fn())
    assert tx.tensor_equal(compiled_fn(data), simple_graph_2()[0])

    from timeit import timeit

    def update_run():
        x1.value = tf.random.uniform([256, 1000])
        return y2fn()

    n = 1000
    t_update_run = timeit(update_run, number=n)
    t_generated = timeit(lambda: compiled_fn(tf.random.uniform([256, 1000])), number=n)
    t_compile_value_set = timeit(lambda: simple_graph(tf.random.uniform([256, 1000])), number=n)
    t_graph_call_tf = timeit(lambda: simple_graph_2(tf.random.uniform([256, 1000])), number=n)

    assert t_generated < t_update_run
    assert t_generated < t_compile_value_set
    assert t_generated < t_graph_call_tf
    assert t_update_run > t_compile_value_set

    o1 = compiled_fn(tf.random.uniform([256, 1000]))
    o2 = compiled_fn(tf.random.uniform([256, 1000]))

    assert not tx.tensor_equal(o1, o2)
def test_dependency_iter():
    """ Dependency iterator after adding leaves to the graph """
    x1 = tx.Input(n_units=2, name="x1", constant=False)
    x2 = tx.Input(n_units=2, name="x2", constant=False)

    y1 = tx.Linear(x2, 2, name="y1")
    y2 = tx.Linear(y1, 2, name="y2")
    y3 = tx.Linear(x1, 2, name="y3")

    graph = Graph.build(inputs=[x1, x2], outputs=[y2, y3])
    dep = graph.dependency_iter()
    dep_iter = list(dep)

    assert sorted(dep.values())
    assert dep_iter[0] is x1
    assert dep_iter[1] is x2
    assert y1 in dep_iter[-2:]
    assert y2 in dep_iter[-2:]

    # ANOTHER GRAPH
    x1 = tx.Input(n_units=1, name="x1")
    x2 = tx.Input(n_units=1, name="x2")
    x3 = tx.Input(n_units=1, name="x3")

    h = tx.Add(x1, x2, name="h")
    y = tx.Add(x3, h, name="y")

    g = Graph.build(inputs=None, outputs=y)
    priorities = g.dependency_iter()

    assert priorities[y] == (2, 0)
    assert priorities[x1] == (0, 1)
    assert priorities[y] > priorities[h]
def test_multi_output_graph():
    data1 = [[1., 1.]]
    data2 = [[2., 1.]]

    in1 = tx.Input(data1, 2, name="in1", constant=False)
    in2 = tx.Input(data2, 2, name="in2")

    linear1 = tx.Linear(in1, 1)
    linear2 = tx.Linear(tx.Add(in1, in2), 1)

    graph = tx.Graph.build(inputs=None, outputs=[linear1, linear2])

    result1 = graph()
    assert len(result1) == 2

    graph2 = tx.Graph.build(inputs=None, outputs=[linear2])
    result2 = graph2()
    assert len(result2) == 1

    assert tx.tensor_equal(result2[0], result1[-1])
def test_module_gate():
    """ Module + Gate Integration """
    x1 = tx.Input([[1, 1, 1, 1]], n_units=4, dtype=tf.float32)
    x2 = tx.Input([[1, 1]], n_units=2, dtype=tf.float32)
    x1 = tx.Add(x1, x1)

    gate = tx.Gate(input_layer=x1, gate_input=x2, gate_fn=tf.sigmoid)
    gate_module = tx.Module([x1, x2], gate)

    x3 = tx.Input([[1, 1, 1, 1]], n_units=4, dtype=tf.float32)
    x4 = tx.Input([[1, 1]], n_units=2, dtype=tf.float32)

    m2 = gate_module.reuse_with(x3, x4)

    result1 = gate_module()
    result2 = m2()
    result3 = gate_module.compute(x3, x4)

    assert tx.tensor_equal(result1, result2 * 2)
    assert tx.tensor_equal(result2, result3)
def test_graph_as_function():
    data = [[1., 2.]]
    in1 = tx.Input(n_units=1, name="in1", dtype=tf.float32, constant=False)
    in2 = tx.Input(n_units=1, name="in2", dtype=tf.float32, constant=False)
    in3 = tx.Constant(tf.ones(shape=[1], dtype=tf.float32))

    in12 = tx.Add(in1, in2, in3)

    graph = tx.Graph.build(inputs=[in1, in2, in3], outputs=in12)

    fn = graph.as_function_v2(ord_inputs=[in1, in2, in3],
                              stateful_inputs=True,
                              compile=False,
                              fn_name="add")
    # fn = graph.as_function_v2(stateful_inputs=True, compile=False)

    # TODO the function should convert its inputs to tensors
    #  to make sure lists are not passed around
    assert fn(np.array([[1.]], dtype=np.float32),
              np.array([[1.]], dtype=np.float32)) == [[3]]
    assert fn() == [[3]]
    assert fn([[1.]], [[2.]]) == [[4]]
    assert fn() == [[4]]
    assert fn([[2.]]) == [[5]]
def __init__(self,
             ctx_size,
             vocab_size,
             embed_dim,
             embed_init=tx.random_uniform(minval=-0.01, maxval=0.01),
             x_to_f_init=tx.random_uniform(minval=-0.01, maxval=0.01),
             logit_init=tx.random_uniform(minval=-0.01, maxval=0.01),
             embed_share=True,
             use_gate=True,
             use_hidden=False,
             h_dim=100,
             h_activation=tx.elu,
             h_init=tx.he_normal_init(),
             h_to_f_init=tx.random_uniform(minval=-0.01, maxval=0.01),
             use_dropout=True,
             embed_dropout=False,
             keep_prob=0.95,
             l2_loss=False,
             l2_loss_coef=1e-5,
             use_nce=False,
             nce_samples=100):

    # GRAPH INPUTS
    run_inputs = tx.Input(ctx_size, dtype=tf.int32, name="input")
    loss_inputs = tx.Input(n_units=1, dtype=tf.int32, name="target")
    eval_inputs = loss_inputs

    # RUN GRAPH
    # if I create a scope here the Tensorboard graph will be a mess to read
    # because it groups everything by nested scope names
    # instead, creating different scopes for train and eval only
    # keeps the graph readable because it allows us to use the same names
    # under different scopes while still sharing variables
    var_reg = []

    with tf.name_scope("run"):
        feature_lookup = tx.Lookup(run_inputs, ctx_size, [vocab_size, embed_dim], embed_init, name="lookup")
        var_reg.append(feature_lookup.weights)
        feature_lookup = feature_lookup.as_concat()

        if use_gate or use_hidden:
            hl = tx.Linear(feature_lookup, h_dim, h_init, bias=True, name="h_linear")
            ha = tx.Activation(hl, h_activation, name="h_activation")
            h = tx.Compose(hl, ha, name="hidden")
            var_reg.append(hl.weights)

        features = feature_lookup
        if use_gate:
            gate_w = tx.Linear(h, ctx_size, bias=True)
            gate = tx.Gate(features, gate_input=gate_w)
            # gate = tx.Module([h, features], gate)
            features = gate
            var_reg.append(gate_w.weights)

        x_to_f = tx.Linear(features, embed_dim, x_to_f_init, bias=True, name="x_to_f")
        var_reg.append(x_to_f.weights)
        f_prediction = x_to_f

        if use_hidden:
            h_to_f = tx.Linear(h, embed_dim, h_to_f_init, bias=True, name="h_to_f")
            var_reg.append(h_to_f.weights)
            f_prediction = tx.Add(x_to_f, h_to_f, name="f_predicted")

        # RI DECODING ===============================================
        shared_weights = tf.transpose(feature_lookup.weights) if embed_share else None
        logit_init = logit_init if not embed_share else None
        run_logits = tx.Linear(f_prediction, vocab_size, logit_init, shared_weights, bias=True, name="logits")
        if not embed_share:
            var_reg.append(run_logits.weights)

        y_prob = tx.Activation(run_logits, tx.softmax)

    # TRAIN GRAPH ===============================================
    with tf.name_scope("train"):
        if use_dropout and embed_dropout:
            feature_lookup = feature_lookup.reuse_with(run_inputs)
            features = tx.Dropout(feature_lookup, probability=keep_prob)
        else:
            features = feature_lookup

        if use_gate or use_hidden:
            if use_dropout:
                h = h.reuse_with(features)
                h = tx.Dropout(h, probability=keep_prob)

            if use_gate:
                gate_w = gate_w.reuse_with(h)
                features = gate.reuse_with(layer=features, gate_input=gate_w)

            f_prediction = x_to_f.reuse_with(features)

            if use_hidden:
                h_to_f = h_to_f.reuse_with(h)
                if use_dropout:
                    h_to_f = tx.Dropout(h_to_f, probability=keep_prob)
                f_prediction = tx.Add(f_prediction, h_to_f)
        else:
            f_prediction = f_prediction.reuse_with(features)

        train_logits = run_logits.reuse_with(f_prediction)

        if use_nce:
            # uniform sampling gets good enough results if enough samples are used
            # but we can load the empirical unigram distribution
            # or learn the unigram distribution during training
            sampled_values = uniform_sampler(loss_inputs.tensor, 1, nce_samples, True, vocab_size)
            train_loss = tf.nn.nce_loss(weights=tf.transpose(train_logits.weights),
                                        biases=train_logits.bias,
                                        inputs=f_prediction.tensor,
                                        labels=loss_inputs.tensor,
                                        num_sampled=nce_samples,
                                        num_classes=vocab_size,
                                        num_true=1,
                                        sampled_values=sampled_values)
        else:
            one_hot = tx.dense_one_hot(column_indices=loss_inputs.tensor, num_cols=vocab_size)
            train_loss = tx.categorical_cross_entropy(one_hot, train_logits.tensor)

        train_loss = tf.reduce_mean(train_loss)

        if l2_loss:
            losses = [tf.nn.l2_loss(var) for var in var_reg]
            train_loss = train_loss + l2_loss_coef * tf.add_n(losses)

    # EVAL GRAPH ===============================================
    with tf.name_scope("eval"):
        one_hot = tx.dense_one_hot(column_indices=eval_inputs.tensor, num_cols=vocab_size)
        eval_loss = tx.categorical_cross_entropy(one_hot, run_logits.tensor)
        eval_loss = tf.reduce_mean(eval_loss)

    # SETUP MODEL CONTAINER ====================================
    super().__init__(run_inputs=run_inputs, run_outputs=y_prob,
                     train_inputs=run_inputs, train_outputs=y_prob,
                     eval_inputs=run_inputs, eval_outputs=y_prob,
                     train_out_loss=train_loss, train_in_loss=loss_inputs,
                     eval_out_score=eval_loss, eval_in_score=eval_inputs)
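# Hedged sketch (not part of the model above): the NCE branch uses a uniform candidate
# sampler, and the comment there notes that an empirical unigram distribution could be
# used instead. One way to do that with the TensorFlow API is
# tf.random.fixed_unigram_candidate_sampler; `unigram_counts` is a hypothetical list of
# per-word counts ordered by vocabulary id, introduced only for this example.
def empirical_unigram_sampler(labels, vocab_size, num_sampled, unigram_counts):
    # labels: int64 tensor of shape [batch_size, num_true] with the target word ids
    # the returned sampled_values tuple can be passed to tf.nn.nce_loss via sampled_values=
    return tf.random.fixed_unigram_candidate_sampler(
        true_classes=labels,
        num_true=1,
        num_sampled=num_sampled,
        unique=True,
        range_max=vocab_size,
        distortion=0.75,          # values < 1.0 flatten the empirical distribution
        unigrams=unigram_counts)  # counts (or probabilities) per vocabulary id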
def __init__(self,
             ctx_size,
             vocab_size,
             k_dim,
             ri_tensor: RandomIndexTensor,
             embed_dim,
             embed_init=tx.random_uniform(minval=-0.01, maxval=0.01),
             x_to_f_init=tx.random_uniform(minval=-0.01, maxval=0.01),
             logit_init=tx.random_uniform(minval=-0.01, maxval=0.01),
             embed_share=True,
             logit_bias=False,
             use_gate=True,
             use_hidden=False,
             h_dim=100,
             h_activation=tx.elu,
             h_init=tx.he_normal_init(),
             h_to_f_init=tx.random_uniform(minval=-0.01, maxval=0.01),
             use_dropout=True,
             embed_dropout=False,
             keep_prob=0.95,
             l2_loss=False,
             l2_loss_coef=1e-5):

    # GRAPH INPUTS
    run_inputs = tx.Input(ctx_size, dtype=tf.int32, name="input")
    loss_inputs = tx.Input(n_units=1, dtype=tf.int32, name="target")
    eval_inputs = loss_inputs

    # RUN GRAPH =====================================================
    var_reg = []

    with tf.name_scope("run"):
        # RI ENCODING ===============================================
        # convert ids to ris: gather a set of random indexes based on the ids in a sequence
        # ri_layer = tx.TensorLayer(ri_tensor, n_units=k_dim)
        # ri_inputs = tx.gather_sparse(ri_layer.tensor, run_inputs.tensor)
        with tf.name_scope("ri_encode"):
            # used to compute logits
            if isinstance(ri_tensor, RandomIndexTensor):
                ri_layer = tx.TensorLayer(ri_tensor.to_sparse_tensor(), k_dim)

                ri_inputs = ri_tensor.gather(run_inputs.tensor)
                ri_inputs = ri_inputs.to_sparse_tensor()
                ri_inputs = tx.TensorLayer(ri_inputs, k_dim)
            else:
                ri_layer = tx.TensorLayer(ri_tensor, k_dim)
                ri_inputs = tx.gather_sparse(ri_layer.tensor, run_inputs.tensor)
                ri_inputs = tx.TensorLayer(ri_inputs, k_dim)

        # use those sparse indexes to lookup a set of features based on the ri values
        feature_lookup = tx.Lookup(ri_inputs, ctx_size, [k_dim, embed_dim], embed_init, name="lookup")
        var_reg.append(feature_lookup.weights)
        feature_lookup = feature_lookup.as_concat()
        # ===========================================================

        if use_gate or use_hidden:
            hl = tx.Linear(feature_lookup, h_dim, h_init, bias=True, name="h_linear")
            ha = tx.Activation(hl, h_activation, name="h_activation")
            h = tx.Compose(hl, ha, name="hidden")
            var_reg.append(hl.weights)

        features = feature_lookup
        if use_gate:
            features = tx.Gate(features, ctx_size, gate_input=h)
            gate = features
            var_reg.append(features.gate_weights)

        x_to_f = tx.Linear(features, embed_dim, x_to_f_init, bias=True, name="x_to_f")
        var_reg.append(x_to_f.weights)
        f_prediction = x_to_f

        if use_hidden:
            h_to_f = tx.Linear(h, embed_dim, h_to_f_init, bias=True, name="h_to_f")
            var_reg.append(h_to_f.weights)
            f_prediction = tx.Add(x_to_f, h_to_f, name="f_predicted")

        # RI DECODING ===============================================
        shared_weights = feature_lookup.weights if embed_share else None
        logit_init = logit_init if not embed_share else None
        # embedding feature vectors for all words: shape [vocab_size, embed_dim]
        # later, for NCE we don't need to get all the features
        all_embeddings = tx.Linear(ri_layer, embed_dim, logit_init, shared_weights, name="logits", bias=False)

        # dot product of f_prediction . all_embeddings with a bias for each target word
        run_logits = tx.Linear(f_prediction,
                               n_units=vocab_size,
                               shared_weights=all_embeddings.tensor,
                               transpose_weights=True,
                               bias=logit_bias)

        if not embed_share:
            var_reg.append(all_embeddings.weights)

        # ===========================================================
        run_embed_prob = tx.Activation(run_logits, tx.softmax)

    # TRAIN GRAPH ===================================================
    with tf.name_scope("train"):
        if use_dropout and embed_dropout:
            feature_lookup = feature_lookup.reuse_with(ri_inputs)
            features = tx.Dropout(feature_lookup, probability=keep_prob)
        else:
            features = feature_lookup

        if use_gate or use_hidden:
            if use_dropout:
                h = h.reuse_with(features)
                h = tx.Dropout(h, probability=keep_prob)

            if use_gate:
                features = gate.reuse_with(features, gate_input=h)

            f_prediction = x_to_f.reuse_with(features)

            if use_hidden:
                h_to_f = h_to_f.reuse_with(h)
                if use_dropout:
                    h_to_f = tx.Dropout(h_to_f, probability=keep_prob)
                f_prediction = tx.Add(f_prediction, h_to_f)
        else:
            f_prediction = f_prediction.reuse_with(features)

        # all_embeddings, from which these logits are computed, is already defined above,
        # so reusing the run logits here is safe
        train_logits = run_logits.reuse_with(f_prediction)
        train_embed_prob = tx.Activation(train_logits, tx.softmax, name="train_output")

        one_hot = tx.dense_one_hot(column_indices=loss_inputs.tensor, num_cols=vocab_size)
        train_loss = tx.categorical_cross_entropy(one_hot, train_logits.tensor)
        train_loss = tf.reduce_mean(train_loss)

        if l2_loss:
            losses = [tf.nn.l2_loss(var) for var in var_reg]
            train_loss = train_loss + l2_loss_coef * tf.add_n(losses)

    # EVAL GRAPH ===============================================
    with tf.name_scope("eval"):
        one_hot = tx.dense_one_hot(column_indices=eval_inputs.tensor, num_cols=vocab_size)
        eval_loss = tx.categorical_cross_entropy(one_hot, run_logits.tensor)
        eval_loss = tf.reduce_mean(eval_loss)

    # SETUP MODEL CONTAINER ====================================
    super().__init__(run_inputs=run_inputs, run_outputs=run_embed_prob,
                     train_inputs=run_inputs, train_outputs=train_embed_prob,
                     eval_inputs=run_inputs, eval_outputs=run_embed_prob,
                     train_out_loss=train_loss, train_in_loss=loss_inputs,
                     eval_out_score=eval_loss, eval_in_score=eval_inputs)