def architecture_children(self):
    # TODO set LRN n = num_filters / 8 + 1
    nodes = [
        # NOTE: not explicitly giving the first conv a pad of "same",
        # since the first conv can have any output shape
        tn.DnnConv2DWithBiasNode(self.name + "_conv0"),
        tn.IdentityNode(self.name + "_z0"),
        tn.ReLUNode(self.name + "_z0_relu"),
        lrn.LocalResponseNormalizationNode(self.name + "_z0_lrn"),
        tn.IdentityNode(self.name + "_x0"),
    ]
    for t in range(1, self.steps + 1):
        nodes += [
            tn.DnnConv2DWithBiasNode(self.name + "_conv%d" % t,
                                     stride=(1, 1),
                                     pad="same"),
            tn.ElementwiseSumNode(
                self.name + "_sum%d" % t,
                [tn.ReferenceNode(self.name + "_sum%d_curr" % t,
                                  reference=self.name + "_conv%d" % t),
                 tn.ReferenceNode(self.name + "_sum%d_prev" % t,
                                  reference=self.name + "_z0")]),
            tn.IdentityNode(self.name + "_z%d" % t),
            tn.ReLUNode(self.name + "_z%d_relu" % t),
            lrn.LocalResponseNormalizationNode(self.name + "_z%d_lrn" % t),
            tn.IdentityNode(self.name + "_x%d" % t),
        ]
    return [tn.SequentialNode(self.name + "_sequential", nodes)]

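# My reading of the graph above (annotation, not from the original source):
# with z_0 = conv_0(input) and x_0 = LRN(relu(z_0)), each step t computes
#     z_t = conv_t(x_{t-1}) + z_0
#     x_t = LRN(relu(z_t))
# i.e. every step feeds the initial pre-activation z_0 back in as a
# residual-style skip connection, as in recurrent convolutional layers.
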
def architecture_children(self):
    gate_node = tn.SequentialNode(
        self.name + "_gate_seq",
        [batch_fold.AddAxisNode(self.name + "_add_axis",
                                axis=2),
         batch_fold.FoldUnfoldAxisIntoBatchNode(
             self.name + "_batch_fold",
             # NOTE: using dnn conv, since pooling is normally strided
             # and the normal conv is slow with strides
             tn.DnnConv2DWithBiasNode(self.name + "_conv",
                                      num_filters=1),
             axis=1),
         batch_fold.RemoveAxisNode(self.name + "_remove_axis",
                                   axis=2),
         tn.SigmoidNode(self.name + "_gate_sigmoid")])

    inverse_gate_node = tn.SequentialNode(
        self.name + "_max_gate",
        [tn.ReferenceNode(self.name + "_gate_ref",
                          reference=gate_node.name),
         tn.MultiplyConstantNode(self.name + "_invert", value=-1),
         tn.AddConstantNode(self.name + "_add1", value=1)])

    mean_node = tn.ElementwiseProductNode(
        self.name + "_mean_product",
        [tn.MeanPool2DNode(self.name + "_mean_pool"),
         gate_node])

    max_node = tn.ElementwiseProductNode(
        self.name + "_max_product",
        [tn.MaxPool2DNode(self.name + "_max_pool"),
         inverse_gate_node])

    return [tn.ElementwiseSumNode(self.name + "_sum",
                                  [mean_node, max_node])]

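# A minimal numpy sketch (mine, not from the original code) of the blend
# the node above computes: a learned gate g in (0, 1) mixes mean pooling
# and max pooling, out = g * mean_pool(x) + (1 - g) * max_pool(x).
# The scalar g below is a stand-in; in the node, the gate is the sigmoid
# of a 1-filter convolution, computed per spatial position.
import numpy as np

x = np.random.randn(4, 4)
g = 0.3  # stand-in for sigmoid(conv(x))
# 2x2 non-overlapping pooling windows
mean_pooled = x.reshape(2, 2, 2, 2).mean(axis=(1, 3))
max_pooled = x.reshape(2, 2, 2, 2).max(axis=(1, 3))
out = g * mean_pooled + (1 - g) * max_pooled
assert out.shape == (2, 2)
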
def architecture_children(self):
    children = self.raw_children()
    gate = children["gate"]
    transform = children["transform"]

    # prepare gates
    transform_gate = tn.SequentialNode(
        self.name + "_transformgate",
        [gate,
         # add initial value as bias instead
         # TODO parameterize
         tn.AddConstantNode(self.name + "_biastranslation", value=-4),
         tn.SigmoidNode(self.name + "_transformgatesigmoid")])
    # carry gate = 1 - transform gate
    carry_gate = tn.SequentialNode(
        self.name + "_carrygate",
        [tn.ReferenceNode(self.name + "_transformgateref",
                          reference=transform_gate.name),
         tn.MultiplyConstantNode(self.name + "_invert", value=-1),
         tn.AddConstantNode(self.name + "_add", value=1)])

    # combine with gates
    gated_transform = tn.ElementwiseProductNode(
        self.name + "_gatedtransform",
        [transform_gate, transform])
    gated_carry = tn.ElementwiseProductNode(
        self.name + "_gatedcarry",
        [carry_gate, tn.IdentityNode(self.name + "_carry")])
    res = tn.ElementwiseSumNode(self.name + "_res",
                                [gated_carry, gated_transform])
    return [res]

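# For reference (my annotation): this builds the highway-network
# combination
#     y = T(x) * H(x) + (1 - T(x)) * x
# where T is the transform gate and H the transform child. The -4 bias
# added before the sigmoid pushes T toward 0 at initialization
# (sigmoid(-4) ~ 0.018), so the layer starts out close to the identity.
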
def test_batch_normalization_node():
    network = tn.AdamNode(
        "adam",
        {"subtree": tn.SequentialNode(
            "seq",
            [tn.InputNode("x", shape=(None, 10)),
             batch_normalization.BatchNormalizationNode("bn"),
             tn.DenseNode("d", num_units=1)]),
         "cost": tn.TotalCostNode(
             "cost",
             {"target": tn.InputNode("y", shape=(None, 1)),
              "pred": tn.ReferenceNode("pred_ref", reference="d")},
             cost_function=treeano.utils.squared_error)}).network()
    fn = network.function(["x", "y"], ["cost"], include_updates=True)
    # inputs deliberately far from zero mean / unit variance,
    # to exercise the normalization
    x = 100 + 100 * np.random.randn(100, 10).astype(fX)
    y = np.random.randn(100, 1).astype(fX)
    prev_cost = fn(x, y)[0]
    for _ in range(3):
        cost = fn(x, y)[0]
        assert cost < prev_cost
        prev_cost = cost

def test_reference_node():
    network = tn.SequentialNode("s", [
        tn.InputNode("input1", shape=(3, 4, 5)),
        tn.InputNode("input2", shape=(5, 4, 3)),
        tn.ReferenceNode("ref", reference="input1"),
    ]).network()
    fn = network.function(["input1"], ["ref"])
    x = np.random.randn(3, 4, 5).astype(fX)
    np.testing.assert_allclose(fn(x)[0], x)

def test_affine_spatial_transformer_node_build():
    localization_network = tn.HyperparameterNode(
        "loc",
        tn.SequentialNode(
            "loc_seq",
            [tn.DenseNode("loc_fc1", num_units=50),
             tn.ReLUNode("loc_relu3"),
             tn.DenseNode("loc_fc2",
                          num_units=6,
                          inits=[treeano.inits.ZeroInit()])]),
        num_filters=32,
        filter_size=(5, 5),
        pool_size=(2, 2),
    )

    model = tn.HyperparameterNode(
        "model",
        tn.SequentialNode(
            "seq",
            [tn.InputNode("x", shape=(None, 1, 60, 60)),
             spatial_transformer.AffineSpatialTransformerNode(
                 "st",
                 localization_network,
                 output_shape=(20, 20)),
             tn.DenseNode("fc1"),
             tn.ReLUNode("relu1"),
             tn.DropoutNode("do1"),
             tn.DenseNode("fc2", num_units=10),
             tn.SoftmaxNode("pred"),
             ]),
        num_filters=32,
        filter_size=(3, 3),
        pool_size=(2, 2),
        num_units=256,
        dropout_probability=0.5,
        inits=[treeano.inits.HeNormalInit()],
    )

    with_updates = tn.HyperparameterNode(
        "with_updates",
        tn.AdamNode(
            "adam",
            {"subtree": model,
             "cost": tn.TotalCostNode("cost", {
                 "pred": tn.ReferenceNode("pred_ref", reference="model"),
                 "target": tn.InputNode("y", shape=(None,), dtype="int32")})}),
        cost_function=treeano.utils.categorical_crossentropy_i32,
    )
    network = with_updates.network()
    network.build()  # build eagerly to share weights

def GradNetOptimizerInterpolationNode(name, children, early, late, **kwargs):
    """
    interpolates updates from 2 optimizer nodes

    NOTE: this is a hack to take in node constructors as arguments
    """
    assert set(children.keys()) == {"subtree", "cost"}
    subtree = children["subtree"]
    cost = children["cost"]
    cost_ref = tn.ReferenceNode(name + "_costref", reference=cost.name)
    late_subtree = tn.UpdateScaleNode(name + "_late_update_scale", subtree)
    late_node = late(name + "_late",
                     {"subtree": late_subtree, "cost": cost})
    early_subtree = tn.UpdateScaleNode(name + "_early_update_scale",
                                       late_node)
    early_node = early(name + "_early",
                       {"subtree": early_subtree, "cost": cost_ref})
    # NOTE: need separate node to forward hyperparameter
    return _GradNetOptimizerInterpolationNode(name, early_node, **kwargs)

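# How the interpolation is wired (my annotation, consistent with the test
# for this node): both optimizers see the same cost (the late one directly,
# the early one through cost_ref), and the two UpdateScaleNodes scale the
# update deltas so that, for a gate g exposed through the "late_gate"
# hyperparameter, the effective update is approximately
#     delta = (1 - g) * delta_early + g * delta_late
# g = 0 uses only the early optimizer, g = 1 only the late one.
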
def test_save_last_inputs_and_networks():
    class StateDiffNode(treeano.NodeImpl):

        def compute_output(self, network, in_vw):
            foo_vw = network.create_vw("foo",
                                       shape=(),
                                       is_shared=True,
                                       tags={"parameter", "weight"},
                                       inits=[])
            network.create_vw("default",
                              variable=abs(in_vw.variable -
                                           foo_vw.variable),
                              shape=())

    network = tn.AdamNode(
        "adam",
        {"subtree": tn.SequentialNode(
            "s",
            [tn.InputNode("i", shape=()),
             StateDiffNode("ss")]),
         "cost": tn.ReferenceNode("r", reference="s")}).network()
    # eagerly create shared variables
    network.build()

    save_handler = canopy.handlers.save_last_inputs_and_networks(5)
    fn = canopy.handlers.handled_fn(network,
                                    [save_handler],
                                    {"x": "i"},
                                    {"out": "s"},
                                    include_updates=True)
    inputs = [{"x": treeano.utils.as_fX(np.random.randn())}
              for _ in range(10)]
    outputs = [fn(i) for i in inputs]
    nt.assert_equal(save_handler.inputs_, inputs[-5:])

    # PY3: calling list on zip to make it eager
    # otherwise, save_handler.value_dicts_ looks at the mutating
    # value dicts
    for value_dict, i, o in list(zip(save_handler.value_dicts_,
                                     inputs[-5:],
                                     outputs[-5:])):
        canopy.network_utils.load_value_dict(network, value_dict)
        nt.assert_equal(o, fn(i))

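# My reading of the assertions above (annotation, not from the original
# source): save_last_inputs_and_networks(5) keeps the 5 most recent input
# dicts in .inputs_, plus a value dict (parameter snapshot) for each of
# those calls in .value_dicts_; restoring a snapshot with load_value_dict
# makes the network reproduce the output of the corresponding call.
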
def test_anrat_node():
    network = tn.AdamNode(
        "adam",
        {"subtree": tn.InputNode("x", shape=(None, 1)),
         "cost": anrat.ANRATNode(
             "cost",
             {"target": tn.InputNode("y", shape=(None, 1)),
              "pred": tn.ReferenceNode("pred_ref",
                                       reference="x")})}).network()
    fn = network.function(["x", "y"], ["cost"], include_updates=True)
    for x_raw, y_raw in [(3.4, 2), (4.2, 4.2)]:
        x = np.array([[x_raw]], dtype=fX)
        y = np.array([[y_raw]], dtype=fX)
        prev_cost = fn(x, y)[0]
        for _ in range(3):
            cost = fn(x, y)[0]
            assert cost < prev_cost
            prev_cost = cost

def test_grad_net_optimizer_interpolation_node():
    class StateNode(treeano.NodeImpl):
        input_keys = ()

        def compute_output(self, network):
            network.create_vw(
                name="default",
                shape=(),
                is_shared=True,
                tags=["parameter"],
                inits=[],
            )

    def updater(const):
        class UpdaterNode(treeano.nodes.updates.StandardUpdatesNode):

            def _new_update_deltas(self, network, vws, grads):
                return treeano.UpdateDeltas({vw.variable: const
                                             for vw in vws})

        return UpdaterNode

    network = tn.SharedHyperparameterNode(
        "n",
        gradnet.GradNetOptimizerInterpolationNode(
            "g",
            {"subtree": StateNode("s"),
             "cost": tn.ReferenceNode("r", reference="s")},
            early=updater(-1),
            late=updater(1)),
        hyperparameter="late_gate").network()
    fn1 = network.function([("n", "hyperparameter")], [],
                           include_updates=True)
    fn2 = network.function([], ["n"])
    # each call roughly applies delta = (1 - gate) * (-1) + gate * (+1)
    # to the shared state, starting from 0
    gates_and_answers = [(0, -1),
                         (0.25, -1.5),
                         (1, -0.5),
                         (1, 0.5)]
    for gate, ans in gates_and_answers:
        fn1(gate)
        np.testing.assert_allclose(ans, fn2()[0], rtol=1e-1)

    pool_stride=(2, 2),
    pool_pad=(1, 1),
    inits=[treeano.inits.OrthogonalInit()],
)

with_updates = tn.HyperparameterNode(
    "with_updates",
    tn.AdamNode(
        "adam",
        {"subtree": model,
         "cost": tn.TotalCostNode(
             "cost",
             {"pred": tn.ReferenceNode("pred_ref", reference="model"),
              "target": tn.InputNode("y", shape=(None,), dtype="int32")},
         )}),
    cost_function=treeano.utils.categorical_crossentropy_i32,
)
network = with_updates.network()
network.build()  # build eagerly to share weights

valid_fn = canopy.handled_fn(network, [
    canopy.handlers.time_call(key="valid_time"),
    canopy.handlers.override_hyperparameters(dropout_probability=0),
    canopy.handlers.batch_pad(BATCH_SIZE, keys=["x", "y"]),
    canopy.handlers.chunk_variables(batch_size=BATCH_SIZE,
                                    variables=["x", "y"])

{"pred": tn.IdentityNode("pred_id"), "target": tn.InputNode("y", shape=(None,), dtype="int32")}, cost_function=treeano.utils.categorical_crossentropy_i32), tn.InputElementwiseSumNode("total_cost")]), num_units=32, cost_reference="total_cost", dropout_probability=0.5, inits=[treeano.inits.XavierNormalInit()], ) with_updates = tn.HyperparameterNode( "with_updates", tn.AdamNode( "adam", {"subtree": model, "cost": tn.ReferenceNode("cost_ref", reference="total_cost")}), ) network = with_updates.network() network.build() # build eagerly to share weights BATCH_SIZE = 500 valid_fn = canopy.handled_fn( network, [canopy.handlers.time_call(key="valid_time"), canopy.handlers.override_hyperparameters(dropout_probability=0), canopy.handlers.chunk_variables(batch_size=BATCH_SIZE, variables=["x", "y"])], {"x": "x", "y": "y"}, {"total_cost": "total_cost", "pred": "pred"})
def load_network(update_scale_factor):
    localization_network = tn.HyperparameterNode(
        "loc",
        tn.SequentialNode(
            "loc_seq",
            [tn.DnnMaxPoolNode("loc_pool1"),
             tn.DnnConv2DWithBiasNode("loc_conv1"),
             tn.DnnMaxPoolNode("loc_pool2"),
             bn.NoScaleBatchNormalizationNode("loc_bn1"),
             tn.ReLUNode("loc_relu1"),
             tn.DnnConv2DWithBiasNode("loc_conv2"),
             bn.NoScaleBatchNormalizationNode("loc_bn2"),
             tn.ReLUNode("loc_relu2"),
             tn.DenseNode("loc_fc1", num_units=50),
             bn.NoScaleBatchNormalizationNode("loc_bn3"),
             tn.ReLUNode("loc_relu3"),
             tn.DenseNode("loc_fc2",
                          num_units=6,
                          inits=[treeano.inits.NormalWeightInit(std=0.001)])]),
        num_filters=20,
        filter_size=(5, 5),
        pool_size=(2, 2),
    )

    st_node = st.AffineSpatialTransformerNode(
        "st",
        localization_network,
        output_shape=(20, 20))

    model = tn.HyperparameterNode(
        "model",
        tn.SequentialNode(
            "seq",
            [tn.InputNode("x", shape=(None, 1, 60, 60)),
             # scaling down the updates of the spatial transformer
             # seems to be very helpful, allowing the classification
             # net to learn what to look for before the transformer
             # prematurely starts looking
             tn.UpdateScaleNode(
                 "st_update_scale",
                 st_node,
                 update_scale_factor=update_scale_factor),
             tn.Conv2DWithBiasNode("conv1"),
             tn.MaxPool2DNode("mp1"),
             bn.NoScaleBatchNormalizationNode("bn1"),
             tn.ReLUNode("relu1"),
             tn.Conv2DWithBiasNode("conv2"),
             tn.MaxPool2DNode("mp2"),
             bn.NoScaleBatchNormalizationNode("bn2"),
             tn.ReLUNode("relu2"),
             tn.GaussianDropoutNode("do1"),
             tn.DenseNode("fc1"),
             bn.NoScaleBatchNormalizationNode("bn3"),
             tn.ReLUNode("relu3"),
             tn.DenseNode("fc2", num_units=10),
             tn.SoftmaxNode("pred"),
             ]),
        num_filters=32,
        filter_size=(3, 3),
        pool_size=(2, 2),
        num_units=256,
        dropout_probability=0.5,
        inits=[treeano.inits.HeUniformInit()],
        bn_update_moving_stats=True,
    )

    with_updates = tn.HyperparameterNode(
        "with_updates",
        tn.AdamNode(
            "adam",
            {"subtree": model,
             "cost": tn.TotalCostNode("cost", {
                 "pred": tn.ReferenceNode("pred_ref", reference="model"),
                 "target": tn.InputNode("y", shape=(None,), dtype="int32")})}),
        cost_function=treeano.utils.categorical_crossentropy_i32,
        learning_rate=2e-3,
    )
    network = with_updates.network()
    network.build()  # build eagerly to share weights
    return network

        [tn.SequentialNode(
            "y_vars",
            [tn.DenseNode("fc_y", num_units=10),
             tn.SoftmaxNode("y_pred"),
             tn.AuxiliaryCostNode(
                 "classification_cost",
                 {"target": tn.InputNode("y",
                                         shape=(None,),
                                         dtype="int32")},
                 cost_function=treeano.utils.categorical_crossentropy_i32)]),
         tn.SequentialNode(
             "z_vars",
             [tn.DenseNode("fc_z", num_units=LATENT_SIZE),
              tn.AuxiliaryCostNode(
                  "xcov_cost",
                  {"target": tn.ReferenceNode("y_ref",
                                              reference="y_pred")},
                  cost_function=cross_covariance)])],
        axis=1),
    tn.DenseNode("fc3"),
    tn.ReLUNode("relu3"),
    tn.DenseNode("fc4"),
    tn.ReLUNode("relu4"),
    tn.DenseNode("reconstruction", num_units=28 * 28),
    tn.TotalCostNode(
        "cost",
        {"pred": tn.IdentityNode("recon_id"),
         "target": tn.ReferenceNode("in_ref", reference="x")},
        cost_function=treeano.utils.squared_error),
    tn.MultiplyConstantNode("mul_reconstruction_error", value=0.1),
    tn.InputElementwiseSumNode("total_cost")]),
num_units=512,

def test_hyperparameter_node_serialization():
    tn.check_serialization(tn.HyperparameterNode("a", tn.ReferenceNode("b")))

     },
     {"from": "sigma", "to": "REINFORCE", "to_key": "sigma"},
     {"from": "reward", "to": "REINFORCE", "to_key": "reward"},
     {"from": "sampled", "to": "REINFORCE", "to_key": "sampled"},
     {"from": "REINFORCE"}]])

network = tn.AdamNode("adam",
                      {"subtree": graph,
                       "cost": tn.ReferenceNode("cost",
                                                reference="REINFORCE")},
                      learning_rate=0.1).network()

fn = network.function([], ["graph", "mu"], include_updates=True)

mus = []
for i in range(1000):
    _, mu = fn()
    print("Iter:", i, "Predicted constant:", mu)
    mus.append(mu)
print("MSE from optimal constant:", np.mean((np.array(mus) - 3.5) ** 2))

def test_reference_node_serialization():
    tn.check_serialization(tn.ReferenceNode("a"))
    tn.check_serialization(tn.ReferenceNode("a", reference="bar"))