def test_undefined_grad_opt():
    """Check that no ``UndefinedGrad`` op is left in the optimized graph."""
    stream_seed = np.random.randint(1, 2147462579)
    streams = RandomStreams(stream_seed)

    shared_vals = theano.shared(
        np.random.rand(10, 20).astype(theano.config.floatX))
    # Normalise, then block gradient flow through the probabilities.
    probs = gradient.zero_grad(shared_vals / shared_vals.sum(axis=1))

    draws = streams.multinomial(pvals=probs, n=1)
    draws = gradient.zero_grad(theano.tensor.cast(draws, probs.dtype))

    total = theano.tensor.sum(draws + probs)
    compiled = theano.function([], theano.tensor.grad(total, draws))
    theano.printing.debugprint(compiled)

    assert not any(
        isinstance(apply_node.op, gradient.UndefinedGrad)
        for apply_node in compiled.maker.fgraph.apply_nodes
    )
def test_op_removed(self):
    """The zero_grad op must not appear in the compiled function's graph."""
    inp = theano.tensor.matrix("x")
    compiled = theano.function([inp], inp * gradient.zero_grad(inp))

    # Compare against ``gradient.zero_grad_``: ``gradient.zero_grad`` is only
    # a wrapper function, so it would never match a node's op.
    remaining_ops = [apply_node.op
                     for apply_node in compiled.maker.fgraph.toposort()]
    assert gradient.zero_grad_ not in remaining_ops
def test_op_removed(self):
    """Verify that compiling ``x * zero_grad(x)`` drops the zero_grad op."""
    matrix_in = theano.tensor.matrix('x')
    product = matrix_in * gradient.zero_grad(matrix_in)
    fn = theano.function([matrix_in], product)

    # ``gradient.zero_grad`` is a wrapper function; the object to look for
    # among the graph's ops is ``gradient.zero_grad_``.
    ops_after_opt = [node.op for node in fn.maker.fgraph.toposort()]
    assert gradient.zero_grad_ not in ops_after_opt
def test_grad(self):
    """Compare gradients taken through ``zero_grad`` with expected ones.

    Each case pairs an expression with the gradient of its sum w.r.t. ``x``
    when the ``zero_grad`` argument is treated as a constant.
    """
    T = theano.tensor
    sample = np.asarray(self.rng.randn(5, 5), dtype=config.floatX)
    x = T.matrix("x")

    cases = [
        (x * gradient.zero_grad(x), x),
        (x * gradient.zero_grad(T.exp(x)), T.exp(x)),
        (gradient.zero_grad(x), T.constant(0.0)),
        (x**2 * gradient.zero_grad(x), 2 * x**2),
    ]

    for expression, expected in cases:
        symbolic_grad = gradient.grad(expression.sum(), x)
        # Gradient as computed by theano.
        actual_fn = theano.function(
            [x], symbolic_grad, on_unused_input="ignore")
        # Gradient we expect.
        expected_fn = theano.function(
            [x], expected, on_unused_input="ignore")
        assert np.allclose(actual_fn(sample), expected_fn(sample))
def test_grad(self):
    """Gradients through ``zero_grad`` must match the reference expressions."""
    T = theano.tensor
    values = np.asarray(self.rng.randn(5, 5), dtype=config.floatX)
    x = T.matrix("x")

    # (expression, gradient of expression.sum() w.r.t. x with the
    # zero_grad argument held constant)
    expr_and_reference = (
        (x * gradient.zero_grad(x), x),
        (x * gradient.zero_grad(T.exp(x)), T.exp(x)),
        (gradient.zero_grad(x), T.constant(0.0)),
        (x ** 2 * gradient.zero_grad(x), 2 * x ** 2),
    )

    for expr, reference in expr_and_reference:
        theano_grad = gradient.grad(expr.sum(), x)
        compute_grad = theano.function(
            [x], theano_grad, on_unused_input="ignore")
        compute_reference = theano.function(
            [x], reference, on_unused_input="ignore")
        assert np.allclose(compute_grad(values), compute_reference(values))
def target_decode_step(self, target_embedding, h1, h2, o, a_c1, a_c2, mask):
    """Run one decoder step: attention followed by two stacked GRU layers.

    :param target_embedding: embedding of the current target input token.
    :param h1: previous hidden state of decoding GRU layer 1.
    :param h2: previous hidden state of decoding GRU layer 2.
    :param o: previous decoder output, used as the attention query.
    :param a_c1: attention candidates that are summed into the content
        vector fed to the RNN.
    :param a_c2: attention candidates used to compute the attention score.
    :param mask: multiplied with the scores reshaped to ``(l, n, 1)``
        (presumably the source padding mask -- confirm against the caller).
    :return: ``(h1, h2, o, attention_content)`` for this step.
    """
    # Calculate attention score
    s = T.dot(o, self.attention_s)
    n, d = s.shape
    s = s.reshape((1, n, d))
    attention_score = T.tanh(s + a_c2)
    l, n, d = attention_score.shape
    attention_score = attention_score.reshape((l * n, d))
    attention_score = T.dot(attention_score, self.attetion_v)
    attention_score = attention_score.reshape((l, n))
    # Subtract the per-column max before exponentiating for numerical
    # stability; zero_grad keeps the shift out of the gradient.
    max_clip = zero_grad(T.max(attention_score, axis=0))
    attention_score = T.exp(attention_score - max_clip.reshape((1, n)))
    attention_score = attention_score.reshape((l, n, 1)) * mask
    denorm = T.sum(attention_score, axis=0)
    attention_score = attention_score / denorm.reshape((1, n, 1))

    # Calculate attention content
    attention_content = T.sum(attention_score * a_c1, axis=0)

    # Decoding GRU layer 1
    h_in = T.concatenate([h1, attention_content, target_embedding], axis=1)
    gate = get_output(self.gru_de_gate_1, h_in)
    u1 = gate[:, :self.hid_size]
    r1 = gate[:, self.hid_size:]
    reset_h1 = h1 * r1
    c_in = T.concatenate([reset_h1, attention_content, target_embedding],
                         axis=1)
    c1 = get_output(self.gru_de_candidate_1, c_in)
    h1 = (1.0 - u1) * h1 + u1 * c1

    # Decoding GRU layer 2
    h_in = T.concatenate([h1, h2, attention_content, target_embedding],
                         axis=1)
    gate = get_output(self.gru_de_gate_2, h_in)
    u2 = gate[:, :self.hid_size]
    r2 = gate[:, self.hid_size:]
    reset_h2 = h2 * r2
    c_in = T.concatenate(
        [h1, reset_h2, attention_content, target_embedding], axis=1)
    c2 = get_output(self.gru_de_candidate_2, c_in)
    # Layer 2 must interpolate with its own update gate (u2). Using u1 here
    # breaks the convex combination between the old state and the candidate.
    h2 = (1.0 - u2) * h2 + u2 * c2

    o = T.concatenate([h1, h2], axis=-1)
    o = get_output(self.decode_out_mlp, o)
    return h1, h2, o, attention_content
def test_rop(self):
    """The R-operator applied through ``zero_grad`` must be all zeros."""
    T = theano.tensor
    point = T.vector()
    direction = T.vector()

    r_op = T.Rop(gradient.zero_grad(point), point, direction)
    compiled = theano.function(
        [point, direction], r_op, on_unused_input="ignore")

    point_val = np.asarray(self.rng.randn(5), dtype=config.floatX)
    direction_val = np.asarray(self.rng.randn(5), dtype=config.floatX)

    assert np.count_nonzero(compiled(point_val, direction_val)) == 0
def test_rop(self):
    """``Rop`` of ``zero_grad(x)`` w.r.t. ``x`` should contain no non-zeros."""
    T = theano.tensor
    x_sym = T.vector()
    v_sym = T.vector()
    blocked = gradient.zero_grad(x_sym)

    rop_fn = theano.function(
        [x_sym, v_sym],
        T.Rop(blocked, x_sym, v_sym),
        on_unused_input='ignore',
    )

    x_val = np.asarray(self.rng.randn(5), dtype=config.floatX)
    v_val = np.asarray(self.rng.randn(5), dtype=config.floatX)
    nonzero_entries = np.count_nonzero(rop_fn(x_val, v_val))
    assert nonzero_entries == 0
def decoding_step(self, embedding, h1, h2, o, s_embedding, a_c1, a_c2, mask):
    """Run one decoder step: attention, two GRU layers, then scoring.

    :return: ``(h1, h2, o, s, sample_score, attention_score)`` where ``s`` is
        the output of ``self.score`` and ``sample_score`` is ``s`` dotted
        with ``s_embedding``.
    """
    # --- Attention weights over the candidates -------------------------
    query = T.dot(o, self.attention_s)
    batch, dim = query.shape
    query = query.reshape((batch, 1, dim))
    energy = T.tanh(query + a_c2)
    batch, length, dim = energy.shape
    flat_energy = energy.reshape((length * batch, dim))
    logits = T.dot(flat_energy, self.attetion_v).reshape((batch, length))
    # Shift by the row max (kept out of the gradient) before exponentiating.
    row_max = zero_grad(T.max(logits, axis=-1))
    weights = T.exp(logits - row_max.reshape((batch, 1)))
    weights = weights * mask
    normaliser = T.sum(weights, axis=-1)
    weights = weights / normaliser.reshape((batch, 1))
    context = T.sum(weights.reshape((batch, length, 1)) * a_c1, axis=1)

    # --- Decoding GRU layer 1 ------------------------------------------
    layer1_in = T.concatenate([embedding, h1, context], axis=-1)
    gates_1 = get_output(self.gru_de_gate_1, layer1_in)
    update_1 = gates_1[:, :self.hid_size]
    reset_1 = gates_1[:, self.hid_size:]
    gated_h1 = h1 * reset_1
    candidate_in_1 = T.concatenate([embedding, gated_h1, context], axis=-1)
    candidate_1 = get_output(self.gru_de_candidate_1, candidate_in_1)
    new_h1 = (1.0 - update_1) * h1 + update_1 * candidate_1

    # --- Decoding GRU layer 2 ------------------------------------------
    layer2_in = T.concatenate([embedding, new_h1, h2, context], axis=-1)
    gates_2 = get_output(self.gru_de_gate_2, layer2_in)
    update_2 = gates_2[:, :self.hid_size]
    reset_2 = gates_2[:, self.hid_size:]
    gated_h2 = h2 * reset_2
    candidate_in_2 = T.concatenate(
        [embedding, new_h1, gated_h2, context], axis=1)
    candidate_2 = get_output(self.gru_de_candidate_2, candidate_in_2)
    new_h2 = (1.0 - update_2) * h2 + update_2 * candidate_2

    # --- Output and scores ---------------------------------------------
    new_o = get_output(self.decode_out_mlp,
                       T.concatenate([new_h1, new_h2], axis=-1))
    scorer_in = T.concatenate([embedding, new_o, context], axis=-1)
    projected = get_output(self.score, scorer_in)
    vocab_score = T.dot(projected, s_embedding)
    return new_h1, new_h2, new_o, projected, vocab_score, weights
def symbolic_elbo(self, source, target):
    """
    Build the symbolic training loss for one minibatch.

    The source is embedded and encoded, ``self.encoding_step`` is scanned
    over a reduced number of time steps, and ``self.decoding_step`` is then
    scanned over the shifted target embeddings. The loss is the negative
    mean over the batch of the summed, masked log-probabilities of the true
    target tokens.

    :param source: source token ids; entries ``<= -1`` are masked out
        (presumably padding, shape (batch, length) -- confirm with caller).
    :param target: target token ids, masked the same way. ``target[:, :-1]``
        feeds the decoder and ``target[:, 1:]`` is the prediction target.

    :return: ``(loss, addresses)`` -- the symbolic loss and the ``addresses``
        output of the encoding scan.
    """
    n = source.shape[0]
    l = source.shape[1]
    # Get input embedding
    source_embedding = get_output(self.input_embedding, source)
    source_embedding = get_output(
        self.encoder, source_embedding.reshape(
            (n * l, self.embedding_dim)))
    source_embedding = source_embedding.reshape((n, l, self.hid_size))
    # Create input mask
    encode_mask = T.cast(T.gt(source, -1), "float32")
    # Create decoding mask
    d_m = T.cast(T.gt(target, -1), "float32")
    # Drop the first position: predictions start at the second target token.
    decode_mask = d_m[:, 1:]
    # Init decoding states
    h_init = T.zeros((n, self.hid_size))
    # Zero out the embeddings of masked source positions.
    source_embedding = source_embedding * encode_mask.reshape((n, l, 1))
    read_attention_weight = self.attention_weight
    read_attention_bias = self.attention_bias
    read_attention_bias = read_attention_bias.reshape((1, 2))
    sample_embed = self.target_output_embedding.W
    decode_in_embedding = get_output(self.target_input_embedding, target)
    decode_in_embedding = decode_in_embedding[:, :-1]
    # Put the time axis first so scan iterates over it.
    decode_in_embedding = decode_in_embedding.dimshuffle((1, 0, 2))
    # create the time step
    # The encoding scan runs for ceil(f * source_length) steps.
    f = 0.3
    max_t = T.ceil(l * f)
    time_steps = T.cast(T.arange(max_t), "float32")
    key_init = T.tile(self.key_init.reshape((1, self.key_dim)), (n, 1))
    # 1-based source positions divided by (unmasked length + 1), per sentence.
    read_pos = T.arange(l, dtype="float32") + 1.0
    read_pos = read_pos.reshape(
        (1, l)) / (T.sum(encode_mask, axis=-1).reshape((n, 1)) + 1.0)
    ([h1, h2, keys, c, addresses], update) = theano.scan(
        self.encoding_step,
        outputs_info=[h_init, h_init, key_init, None, None],
        non_sequences=[
            source_embedding, read_pos, read_attention_weight,
            read_attention_bias, encode_mask
        ],
        sequences=[time_steps])
    # Create Attention mask
    # NOTE(review): `t` is not used again in this function.
    t = addresses.shape[0]
    # A time step is attended to only if it is <= ceil(f * true length).
    true_times = T.ceil(T.sum(encode_mask, axis=-1) * f)
    true_times = T.cast(true_times.reshape((n, 1)), "float32")
    attention_mask = T.cast(
        T.le(time_steps.reshape((1,
                                 time_steps.shape[0])), true_times), "float32")
    # Decoding RNN
    l, n, d = c.shape
    attention_c1 = c.reshape((n * l, d))
    attention_c2 = T.dot(attention_c1, self.attention_h_2)
    attention_c1 = attention_c1.reshape((l, n, self.hid_size))
    attention_c1 = attention_c1.dimshuffle((1, 0, 2))
    attention_c2 = attention_c2.reshape((l, n, self.output_score_dim))
    attention_c2 = attention_c2.dimshuffle((1, 0, 2))
    # The decoder is initialised from the final encoder states of both layers.
    decode_init = get_output(self.decode_init_mlp,
                             T.concatenate([h1[-1], h2[-1]], axis=-1))
    o_init = get_output(self.decode_out_mlp, decode_init)
    ([h1, h2, o, s, sample_score, att_score],
     update) = theano.scan(self.decoding_step,
                           outputs_info=[
                               decode_init[:, :self.hid_size],
                               decode_init[:, self.hid_size:], o_init, None,
                               None, None
                           ],
                           sequences=[decode_in_embedding],
                           non_sequences=[
                               sample_embed.T, attention_c1, attention_c2,
                               attention_mask
                           ])
    # Get sample embedding
    l = sample_score.shape[0]
    n = sample_score.shape[1]
    # Softmax denominator, shifted by the max score for numerical stability;
    # zero_grad keeps the shift out of the gradient.
    max_clip = T.max(sample_score, axis=-1)
    score_clip = zero_grad(max_clip)
    sample_score = T.exp(sample_score - score_clip.reshape((l, n, 1)))
    sample_score = T.sum(sample_score, axis=-1)
    # Get true embedding
    true_embed = get_output(self.target_output_embedding, target[:, 1:])
    true_embed = true_embed.dimshuffle((1, 0, 2))
    true_embed = true_embed.reshape((n * l, self.output_score_dim))
    d = s.shape[-1]
    s = s.reshape((n * l, d))
    # Softmax numerator for the true token, using the same shift.
    score = T.exp(
        T.sum(s * true_embed, axis=-1).reshape((l, n)) - score_clip)
    score = score.reshape((l, n))
    prob = score / sample_score
    prob = prob.dimshuffle((1, 0))
    # Loss per sentence
    # The 1e-5 keeps the log finite when the probability is zero.
    loss = decode_mask * T.log(prob + 1e-5)
    loss = -T.mean(T.sum(loss, axis=1))
    return loss, addresses
def softsign(h, epsilon=1e-3):
    """Return ``h / (|h| + eps)`` with ``eps = epsilon * mean(|h|)``.

    The offset is wrapped in ``zero_grad`` so it is treated as a constant
    when differentiating.
    """
    mean_magnitude = abs(h).mean()
    offset = zero_grad(np.float32(epsilon) * mean_magnitude)
    return h / (abs(h) + offset)
def logsumexp(X):
    """Row-wise log-sum-exp of an NxD tensor, keeping the reduced axis.

    The row maximum is subtracted before exponentiating and added back
    afterwards; it is wrapped in ``zero_grad`` so the shift itself carries
    no gradient.
    """
    row_max = zero_grad(T.max(X, axis=1, keepdims=True))
    shifted_sum = T.sum(T.exp(X - row_max), axis=1, keepdims=True)
    return row_max + T.log(shifted_sum)
def symbolic_elbo(self, source, target):
    """
    Build the symbolic training loss for one minibatch.

    The source embeddings are scanned with ``self.source_encode_step``, the
    shifted target embeddings with ``self.target_decode_step``, and the
    decoder outputs are scored with ``self.score_eval_step``. The loss is the
    negative mean over the batch of the summed, masked log-probabilities of
    the true target tokens.

    :param source: source token ids; entries ``<= -1`` are masked out
        (presumably padding, shape (batch, length) -- confirm with caller).
    :param target: target token ids, masked the same way. ``target[:, :-1]``
        feeds the decoder and ``target[:, 1:]`` is the prediction target.

    :return loss: The symbolic variable representing the loss.
    """
    # NOTE(review): this value is overwritten by the unpacking of
    # encode_mask.shape below before it is read.
    n = target.shape[0]
    # Encoding mask
    encode_mask = T.cast(T.gt(source, -1), "float32")
    source_input_embedding = get_output(self.input_embedding, source)
    n, l = encode_mask.shape
    encode_mask = encode_mask.reshape((n, l, 1))
    # Put the time axis first so scan iterates over it.
    encode_mask = encode_mask.dimshuffle((1, 0, 2))
    source_input_embedding = source_input_embedding.dimshuffle((1, 0, 2))
    # Encoding RNN
    h_init = T.zeros((n, self.hid_size))
    ([h_e_1, h_e_2, e_o],
     update) = theano.scan(self.source_encode_step,
                           outputs_info=[h_init, h_init, None],
                           sequences=[source_input_embedding, encode_mask])
    # Drop the first position: predictions start at the second target token.
    decode_mask = T.cast(T.gt(target, -1), "float32")[:, 1:]
    # Decoding RNN
    attention_candidate = e_o
    l, n, d = attention_candidate.shape
    attention_c1 = attention_candidate.reshape((n * l, d))
    attention_c2 = T.dot(attention_c1, self.attention_h_2)
    attention_c1 = attention_c1.reshape((l, n, self.output_score_dim))
    attention_c2 = attention_c2.reshape((l, n, self.output_score_dim))
    target_input = target[:, :-1]
    n, l = target_input.shape
    target_input = target_input.reshape((n * l, ))
    target_input_embedding = get_output(self.target_input_embedding,
                                        target_input)
    target_input_embedding = target_input_embedding.reshape(
        (n, l, self.embedding_dim))
    target_input_embedding = target_input_embedding.dimshuffle((1, 0, 2))
    # The decoder is initialised from the final encoder states of both layers.
    decode_init = get_output(
        self.decode_init_mlp, T.concatenate([h_e_1[-1], h_e_2[-1]], axis=-1))
    o_init = get_output(self.decode_out_mlp, decode_init)
    ([h_d_1, h_d_2, d_o, attention_content], update) = theano.scan(
        self.target_decode_step,
        outputs_info=[
            decode_init[:, :self.hid_size], decode_init[:, self.hid_size:],
            o_init, None
        ],
        sequences=[target_input_embedding],
        non_sequences=[attention_c1, attention_c2, encode_mask])
    score_eva_in = T.concatenate(
        [d_o, attention_content,
         target_input_embedding], axis=-1)
    ([h, score],
     update) = theano.scan(self.score_eval_step,
                           sequences=[score_eva_in],
                           non_sequences=[self.target_output_embedding.W],
                           outputs_info=[None, None])
    # Swap the first two axes of the scan outputs.
    h = h.dimshuffle((1, 0, 2))
    score = score.dimshuffle((1, 0, 2))
    # Softmax denominator, shifted by the max score for numerical stability;
    # zero_grad keeps the shift out of the gradient.
    max_clip = T.max(score, axis=-1)
    max_clip = zero_grad(max_clip)
    score = T.exp(score - max_clip.reshape((n, l, 1)))
    denominator = T.sum(score, axis=-1)
    # Get true embedding
    target_out = target[:, 1:]
    n, l = target_out.shape
    target_out = target_out.reshape((n * l, ))
    true_embed = get_output(self.target_output_embedding, target_out)
    true_embed = true_embed.reshape((n * l, self.output_score_dim))
    h = h.reshape((n * l, self.output_score_dim))
    # Softmax numerator for the true token, using the same shift.
    true_score = T.exp(
        T.sum(h * true_embed, axis=-1) - max_clip.reshape((l * n, )))
    true_score = true_score.reshape((n, l))
    prob = true_score / denominator
    # Loss per sentence
    # The 1e-5 keeps the log finite when the probability is zero.
    loss = decode_mask * T.log(prob + 1e-5)
    loss = -T.mean(T.sum(loss, axis=1))
    return loss
def deepfool(model, inputs, labels=None, num_classes=None, norm='l2',
             max_iter=25, clip_dist=None, over_shoot=0.02):
    """Theano implementation of DeepFool
    https://arxiv.org/abs/1511.04599

    Iteratively accumulates a perturbation with ``theano.scan`` until every
    input is misclassified or ``max_iter`` steps have run.

    :param model: callable mapping a batch of inputs to logits.
    :param inputs: symbolic input batch. The ``dimshuffle(0, 'x', 'x', 'x')``
        below adds three broadcast axes, so this looks like it expects 4-D
        inputs -- TODO confirm.
    :param labels: class indices; when None the model's own argmax
        prediction on ``inputs`` is used.
    :param num_classes: number of classes; required.
    :param norm: ``'l1'`` or ``'l2'``; selects how distances and the
        perturbation update are computed.
    :param max_iter: maximum number of scan steps.
    :param clip_dist: if not None, upper bound used to clip the step
        distance.
    :param over_shoot: the perturbation is scaled by ``1 + over_shoot`` when
        it is applied.
    :return: ``inputs + (1 + over_shoot) * perturbation`` wrapped in
        ``disconnected_grad``.
    :raises RuntimeError: if ``num_classes`` is None.
    """
    assert norm in ['l1', 'l2']
    if num_classes is None:
        raise RuntimeError("Number of classes need to be provided for Theano")
    if labels is None:
        labels = gradient.zero_grad(T.argmax(model(inputs), axis=1))
    batch_size = inputs.shape[0]
    batch_indices = T.arange(batch_size)

    def find_perturb(perturbation):
        # One scan step: update the perturbation of every input that is
        # still classified as its label.
        logits_os = model(inputs + (1 + over_shoot) * perturbation)
        y_pred = T.argmax(logits_os, axis=1)
        is_mistake = T.neq(y_pred, labels)
        current_ind = batch_indices[(1 - is_mistake).nonzero()]
        should_stop = T.all(is_mistake)
        # continue generating perturbation only for correctly classified
        inputs_subset = inputs[current_ind]
        perturbation_subset = perturbation[current_ind]
        labels_subset = labels[current_ind]
        batch_subset = T.arange(inputs_subset.shape[0])
        x_adv = inputs_subset + perturbation_subset
        logits = model(x_adv)
        corrects = logits[batch_subset, labels_subset]
        jac = jacobian(logits, x_adv, num_classes)
        # deepfool
        # Logit and Jacobian differences relative to the labelled class.
        f = logits - T.shape_padright(corrects)
        w = jac - T.shape_padaxis(jac[batch_subset, labels_subset], axis=1)
        reduce_ind = range(2, inputs.ndim + 1)
        if norm == 'l2':
            dist = T.abs_(f) / w.norm(2, axis=reduce_ind)
        else:
            dist = T.abs_(f) / T.sum(T.abs_(w), axis=reduce_ind)
        # remove correct targets
        dist = T.set_subtensor(dist[batch_subset, labels_subset],
                               T.constant(np.inf))
        # Index of the class with the smallest distance.
        l = T.argmin(dist, axis=1)
        dist_l = dist[batch_subset, l].dimshuffle(0, 'x', 'x', 'x')
        # avoid numerical instability and clip max value
        if clip_dist is not None:
            dist_l = T.clip(dist_l, 0, clip_dist)
        w_l = w[batch_subset, l]
        if norm == 'l2':
            reduce_ind = range(1, inputs.ndim)
            perturbation_upd = dist_l * w_l / w_l.norm(
                2, reduce_ind, keepdims=True)
        else:
            perturbation_upd = dist_l * T.sgn(w_l)
        # Once every input is misclassified, leave the perturbation as is.
        perturbation = ifelse(
            should_stop, perturbation,
            T.inc_subtensor(perturbation[current_ind], perturbation_upd))
        return perturbation, scan_module.until(should_stop)

    initial_perturbation = T.zeros_like(inputs)
    results, _ = theano.scan(find_perturb,
                             outputs_info=[initial_perturbation],
                             n_steps=max_iter)
    # Keep only the perturbation from the last executed step.
    perturbation = results[-1]
    return gradient.disconnected_grad(inputs +
                                      (1 + over_shoot) * perturbation)
def __step(img, prev_bbox, prev_att, state, prev_conf, prev_sugg, prev_W,
           prev_b, prev_pos, prev_neg, timestep):
    """Run one tracking step: GRU update, box prediction and classifier refit.

    Reads several names from the enclosing scope (``img_row``, ``img_col``,
    ``batch_size``, ``NUM_N``, the ``conv1_*`` values and the GRU / fully
    connected weights).

    :param img: current frame batch; reshaped crops use
        ``(batch_size, 1, img_row, img_col)``.
    :param prev_bbox: previous box, columns indexed 0..3.
    :param prev_att: previous attention parameters, columns indexed 0..2.
    :param state: previous GRU hidden state.
    :param prev_conf: previous confidence, reshaped to ``(batch_size, 1)``.
    :param prev_sugg: previous suggested box, fed to the GRU input.
    :param prev_W: classifier weights carried between steps.
    :param prev_b: classifier bias carried between steps.
    :param prev_pos: buffer of positive sample features.
    :param prev_neg: buffer of negative sample features.
    :param timestep: step counter; selects the buffer slice to write.
    :return: ``(bbox, att, gru_h, conf, sugg, new_W, new_b, pos, neg,
        timestep + 1)``.
    """
    cx = (prev_bbox[:, 2] + prev_bbox[:, 0]) / 2.
    cy = (prev_bbox[:, 3] + prev_bbox[:, 1]) / 2.
    sigma = TT.exp(prev_att[:, 0]) * (max(img_col, img_row) / 2)
    fract = TT.exp(prev_att[:, 1])
    amplifier = TT.exp(prev_att[:, 2])

    eps = 1e-8

    # Map the [-1, 1] centre coordinates to pixel coordinates.
    abs_cx = (cx + 1) / 2. * (img_col - 1)
    abs_cy = (cy + 1) / 2. * (img_row - 1)
    abs_stride = (fract * (max(img_col, img_row) - 1)) * (
        (1. / (NUM_N - 1.)) if NUM_N > 1 else 0)

    FX, FY = __filterbank(abs_cx, abs_cy, abs_stride, sigma)
    unnormalized_mask = (FX.dimshuffle(0, 'x', 1, 'x', 2) *
                         FY.dimshuffle(0, 1, 'x', 2, 'x')
                         ).sum(axis=2).sum(axis=1)
    mask = unnormalized_mask  # / (unnormalized_mask.sum(axis=2).sum(axis=1) + eps).dimshuffle(0, 'x', 'x')
    # NOTE(review): the attention mask above is not applied; the unmasked
    # image is what goes through the convolution.
    masked_img = img

    conv1 = conv2d(masked_img, conv1_filters,
                   subsample=(conv1_stride, conv1_stride))
    act1 = TT.tanh(conv1)
    flat1 = TT.reshape(act1, (-1, conv1_output_dim))

    # GRU update over image features and the previous predictions.
    gru_in = TT.concatenate(
        [flat1, prev_bbox, prev_conf.reshape((batch_size, 1)), prev_sugg],
        axis=1)
    gru_z = NN.sigmoid(TT.dot(gru_in, Wz) + TT.dot(state, Uz) + bz)
    gru_r = NN.sigmoid(TT.dot(gru_in, Wr) + TT.dot(state, Ur) + br)
    gru_h_ = TT.tanh(TT.dot(gru_in, Wg) + TT.dot(gru_r * state, Ug) + bg)
    gru_h = (1 - gru_z) * state + gru_z * gru_h_

    bbox = TT.tanh(TT.dot(gru_h, W_fc2) + b_fc2)
    att = TT.dot(gru_h, W_fc3) + b_fc3

    def batch_dot(a, b):
        # Per-batch-item matrix product via broadcasting and a sum over the
        # shared axis.
        return (a.dimshuffle(0, 1, 2, 'x') *
                b.dimshuffle(0, 'x', 1, 2)).sum(axis=2)

    def bounding(bbox):
        # Bound the four columns with -1 (maximum) and 1 (minimum).
        return TT.stack([TT.maximum(bbox[:, 0], -1),
                         TT.minimum(bbox[:, 1], 1),
                         TT.maximum(bbox[:, 2], -1),
                         TT.minimum(bbox[:, 3], 1)], axis=1)

    def sample_positives(bbox):
        # The box itself plus eight variants with one edge moved by a
        # quarter of the box extent (inwards or outwards).
        x0 = bbox[:, 0]
        y0 = bbox[:, 1]
        x1 = bbox[:, 2]
        y1 = bbox[:, 3]
        return TT.stack([
            bounding(TT.as_tensor([x0, y0, x1, y1]).T),
            bounding(TT.as_tensor([x0 * 0.75 + x1 * 0.25, y0, x1, y1]).T),
            bounding(TT.as_tensor([x0, y0 * 0.75 + y1 * 0.25, x1, y1]).T),
            bounding(TT.as_tensor([x0, y0, x1 * 0.75 + x0 * 0.25, y1]).T),
            bounding(TT.as_tensor([x0, y0, x1, y1 * 0.75 + y0 * 0.25]).T),
            bounding(TT.as_tensor([x0 * 1.25 - x1 * 0.25, y0, x1, y1]).T),
            bounding(TT.as_tensor([x0, y0 * 1.25 - y1 * 0.25, x1, y1]).T),
            bounding(TT.as_tensor([x0, y0, x1 * 1.25 - x0 * 0.25, y1]).T),
            bounding(TT.as_tensor([x0, y0, x1, y1 * 1.25 - y0 * 0.25]).T),
        ], axis=1)

    def sample_negatives(bbox):
        # Eight variants with edges moved by half of the box extent.
        x0 = bbox[:, 0]
        y0 = bbox[:, 1]
        x1 = bbox[:, 2]
        y1 = bbox[:, 3]
        return TT.stack([
            bounding(TT.as_tensor([x0 * 0.5 + x1 * 0.5, y0, x1, y1]).T),
            bounding(TT.as_tensor([x0, y0 * 0.5 + y1 * 0.5, x1, y1]).T),
            bounding(TT.as_tensor([x0, y0, x1 * 0.5 + x0 * 0.5, y1]).T),
            bounding(TT.as_tensor([x0, y0, x1, y1 * 0.5 + y0 * 0.5]).T),
            bounding(TT.as_tensor(
                [x0 * 1.5 - x1 * 0.5, y0, x1 * 0.5 + x0 * 0.5, y1]).T),
            bounding(TT.as_tensor(
                [x0, y0 * 1.5 - y1 * 0.5, x1, y1 * 0.5 + y0 * 0.5]).T),
            bounding(TT.as_tensor(
                [x0 * 0.5 + x1 * 0.5, y0, x1 * 1.5 - x0 * 0.5, y1]).T),
            bounding(TT.as_tensor(
                [x0, y0 * 0.5 + y1 * 0.5, x1, y1 * 1.5 - y0 * 0.5]).T),
        ], axis=1)

    def sample_around(bbox):
        # 9 positive + 8 negative candidates = 17 boxes per batch item.
        return TT.concatenate(
            [sample_positives(bbox), sample_negatives(bbox)], axis=1)

    # Confidence of the predicted box under the current classifier.
    crop = batch_multicrop(bbox.dimshuffle(0, 'x', 1), img)
    feat = conv2d(crop.reshape((batch_size, 1, img_row, img_col)),
                  conv1_filters,
                  subsample=(conv1_stride, conv1_stride)
                  ).reshape((batch_size, 1, -1))
    conf = NN.sigmoid(batch_dot(feat, prev_W) + TT.addbroadcast(prev_b, 1))

    # Suggested box: average of the candidate boxes with a positive score.
    nr_samples = 17
    sugg_bbox = sample_around(bbox)  # (batch_size, nr_samples, 4)
    sugg_crop = batch_multicrop(sugg_bbox, img)
    sugg_feat = conv2d(
        sugg_crop.reshape((batch_size * nr_samples, 1, img_row, img_col)),
        conv1_filters,
        subsample=(conv1_stride, conv1_stride)
    ).reshape((batch_size, nr_samples, -1))
    sugg_conf = batch_dot(sugg_feat, prev_W) + TT.addbroadcast(prev_b, 1)
    # Single-argument print(...) calls print the same text on Python 2 and
    # are valid syntax on Python 3, unlike the former print statements.
    print(sugg_conf.dtype)
    sugg_pos = TT.cast(sugg_conf > 0, T.config.floatX)
    print(sugg_pos.dtype)
    sugg = TG.disconnected_grad(
        (sugg_bbox *
         TT.patternbroadcast(sugg_pos, [False, False, True])).sum(axis=1) /
        TT.patternbroadcast(sugg_pos.sum(axis=1), [False, True]))

    def classify(x, W, b):
        # x: (batch_size, samples_per_batch, feature_per_sample)
        return NN.sigmoid(batch_dot(x, W) + TT.addbroadcast(b, 1))

    def update_step(W, b, x, y, alpha=1):
        # One gradient-descent step on the mean squared error; the scan
        # stops once the loss drops below 0.01.
        y_hat = classify(x, W, b)
        loss = ((y_hat - y) ** 2).mean()
        g = T.grad(loss, [W, b])
        return ((W - alpha * g[0], b - alpha * g[1], loss),
                T.scan_module.until(loss < 0.01))

    # Write this step's positive sample features into the buffer.
    nr_samples = 9
    pos_bbox = sample_positives(bbox)
    pos_crop = batch_multicrop(pos_bbox, img)
    pos_feat = conv2d(
        pos_crop.reshape((batch_size * nr_samples, 1, img_row, img_col)),
        conv1_filters,
        subsample=(conv1_stride, conv1_stride)
    ).reshape((batch_size, nr_samples, -1))
    pos = TG.disconnected_grad(TT.set_subtensor(
        prev_pos[:, (nr_samples*timestep):(nr_samples*(timestep+1))],
        pos_feat))

    # Write this step's negative sample features into the buffer.
    nr_samples = 8
    neg_bbox = sample_negatives(bbox)
    neg_crop = batch_multicrop(neg_bbox, img)
    neg_feat = conv2d(
        neg_crop.reshape((batch_size * nr_samples, 1, img_row, img_col)),
        conv1_filters,
        subsample=(conv1_stride, conv1_stride)
    ).reshape((batch_size, nr_samples, -1))
    neg = TG.disconnected_grad(TT.set_subtensor(
        prev_neg[:, (nr_samples*timestep):(nr_samples*(timestep+1))],
        neg_feat))

    # Refit the classifier on the buffer slices up to 9*timestep positives
    # and 8*timestep negatives, for at most 1000 steps.
    update_scan, _ = T.scan(
        fn=update_step,
        outputs_info=[prev_W, prev_b, None],
        non_sequences=[
            TT.concatenate([pos[:, :9*timestep], neg[:, :8*timestep]],
                           axis=1),
            TT.concatenate([TT.ones((batch_size, 9*timestep, 1)),
                            -TT.ones((batch_size, 8*timestep, 1))], axis=1),
        ],
        n_steps=1000)
    # NOTE(review): new_W uses disconnected_grad while new_b uses zero_grad;
    # kept as in the original -- confirm whether the asymmetry is intended.
    new_W, new_b = (TG.disconnected_grad(update_scan[0][-1]),
                    TG.zero_grad(update_scan[1][-1]))

    return (bbox, att, gru_h, TT.unbroadcast(conf, 1), sugg, new_W,
            TT.unbroadcast(new_b, 1), pos, neg, timestep + 1)