def t_noise3d(v, perm, grad3):
    x = v[0]
    y = v[1]
    z = v[2]
    skew_factor = (x + y + z) * 1.0 / 3.0
    i = T.floor(x + skew_factor)
    j = T.floor(y + skew_factor)
    k = T.floor(z + skew_factor)
    unskew_factor = (i + j + k) * 1.0 / 6.0
    x0 = x - (i - unskew_factor)
    y0 = y - (j - unskew_factor)
    z0 = z - (k - unskew_factor)
    vertices = T.switch(
        T.ge(x0, y0),
        T.switch(
            T.ge(y0, z0), vertices_options[0],
            T.switch(T.ge(x0, z0), vertices_options[1], vertices_options[2])),
        T.switch(
            T.lt(y0, z0), vertices_options[3],
            T.switch(T.lt(x0, z0), vertices_options[4], vertices_options[5])))
    x1 = x0 - vertices[0][0] + 1.0 / 6.0
    y1 = y0 - vertices[0][1] + 1.0 / 6.0
    z1 = z0 - vertices[0][2] + 1.0 / 6.0
    x2 = x0 - vertices[1][0] + 1.0 / 3.0
    y2 = y0 - vertices[1][1] + 1.0 / 3.0
    z2 = z0 - vertices[1][2] + 1.0 / 3.0
    x3 = x0 - 0.5
    y3 = y0 - 0.5
    z3 = z0 - 0.5
    ii = T.bitwise_and(i.astype('int32'), 255)
    jj = T.bitwise_and(j.astype('int32'), 255)
    kk = T.bitwise_and(k.astype('int32'), 255)
    gi0 = perm[ii + perm[jj + perm[kk].astype('int32')].astype('int32')] % 12
    gi1 = perm[ii + vertices[0][0] + perm[jj + vertices[0][1] + perm[
        kk + vertices[0][2]].astype('int32')].astype('int32')] % 12
    gi2 = perm[ii + vertices[1][0] + perm[jj + vertices[1][1] + perm[
        kk + vertices[1][2]].astype('int32')].astype('int32')] % 12
    gi3 = perm[ii + 1 + perm[jj + 1 + perm[kk + 1].astype('int32')].astype('int32')] % 12
    t0 = 0.5 - x0**2 - y0**2 - z0**2
    n0 = T.switch(T.lt(t0, 0), 0.0,
                  t0**4 * T.dot(grad3[gi0.astype('int32')], [x0, y0, z0]))
    t1 = 0.5 - x1**2 - y1**2 - z1**2
    n1 = T.switch(T.lt(t1, 0), 0.0,
                  t1**4 * T.dot(grad3[gi1.astype('int32')], [x1, y1, z1]))
    t2 = 0.5 - x2**2 - y2**2 - z2**2
    n2 = T.switch(T.lt(t2, 0), 0.0,
                  t2**4 * T.dot(grad3[gi2.astype('int32')], [x2, y2, z2]))
    t3 = 0.5 - x3**2 - y3**2 - z3**2
    n3 = T.switch(T.lt(t3, 0), 0.0,
                  t3**4 * T.dot(grad3[gi3.astype('int32')], [x3, y3, z3]))
    return 23.0 * (n0 + n1 + n2 + n3)
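# Usage sketch (not part of the original source): t_noise3d reads
# `vertices_options` as a module-level global, so this assumes the
# standard simplex-noise tables for `perm`, `grad3` and the six simplex
# traversal orders.
import numpy as np
import theano
import theano.tensor as T

_p = np.random.RandomState(0).permutation(256)
perm = theano.shared(np.concatenate([_p, _p]).astype('int32'), name='perm')
grad3 = theano.shared(np.asarray(
    [[1, 1, 0], [-1, 1, 0], [1, -1, 0], [-1, -1, 0],
     [1, 0, 1], [-1, 0, 1], [1, 0, -1], [-1, 0, -1],
     [0, 1, 1], [0, -1, 1], [0, 1, -1], [0, -1, -1]],
    dtype=theano.config.floatX), name='grad3')
# one (i1,j1,k1),(i2,j2,k2) pair per branch of the T.switch cascade above
vertices_options = theano.shared(np.asarray(
    [[[1, 0, 0], [1, 1, 0]],   # x0 >= y0 >= z0
     [[1, 0, 0], [1, 0, 1]],   # x0 >= z0 > y0
     [[0, 0, 1], [1, 0, 1]],   # z0 > x0 >= y0
     [[0, 0, 1], [0, 1, 1]],   # z0 > y0 > x0
     [[0, 1, 0], [0, 1, 1]],   # y0 >= z0 > x0
     [[0, 1, 0], [1, 1, 0]]],  # y0 > x0 >= z0
    dtype='int32'), name='vertices_options')

v = T.vector('v')
noise_fn = theano.function([v], t_noise3d(v, perm, grad3))
print(noise_fn(np.asarray([0.5, 1.5, 2.5], dtype=theano.config.floatX)))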
def matrix_noise3d(input_vectors, perm, grad3, vertex_table):
    skew_factors = (input_vectors[:, 0] + input_vectors[:, 1] +
                    input_vectors[:, 2]) * 1.0 / 3.0
    skewed_vectors = T.floor(input_vectors + skew_factors[:, np.newaxis])
    unskew_factors = (skewed_vectors[:, 0] + skewed_vectors[:, 1] +
                      skewed_vectors[:, 2]) * 1.0 / 6.0
    offsets_0 = input_vectors - (skewed_vectors - unskew_factors[:, np.newaxis])
    vertex_table_x_index = T.ge(offsets_0[:, 0], offsets_0[:, 1])
    vertex_table_y_index = T.ge(offsets_0[:, 1], offsets_0[:, 2])
    vertex_table_z_index = T.ge(offsets_0[:, 0], offsets_0[:, 2])
    simplex_vertices = vertex_table[
        vertex_table_x_index,
        vertex_table_y_index,
        vertex_table_z_index].reshape((input_vectors.shape[0], 2, 3))
    offsets_1 = offsets_0 - simplex_vertices[:, 0] + 1.0 / 6.0
    offsets_2 = offsets_0 - simplex_vertices[:, 1] + 1.0 / 3.0
    offsets_3 = offsets_0 - 0.5
    masked_skewed_vectors = T.bitwise_and(skewed_vectors.astype('int32'), 255)
    gi0s = perm[masked_skewed_vectors[:, 0] + perm[
        masked_skewed_vectors[:, 1] + perm[
            masked_skewed_vectors[:, 2]].astype('int32')].astype('int32')] % 12
    gi1s = perm[masked_skewed_vectors[:, 0] + simplex_vertices[:, 0, 0] + perm[
        masked_skewed_vectors[:, 1] + simplex_vertices[:, 0, 1] + perm[
            masked_skewed_vectors[:, 2] + simplex_vertices[:, 0, 2]].astype(
                'int32')].astype('int32')] % 12
    gi2s = perm[masked_skewed_vectors[:, 0] + simplex_vertices[:, 1, 0] + perm[
        masked_skewed_vectors[:, 1] + simplex_vertices[:, 1, 1] + perm[
            masked_skewed_vectors[:, 2] + simplex_vertices[:, 1, 2]].astype(
                'int32')].astype('int32')] % 12
    gi3s = perm[masked_skewed_vectors[:, 0] + 1 + perm[
        masked_skewed_vectors[:, 1] + 1 + perm[
            masked_skewed_vectors[:, 2] + 1].astype('int32')].astype('int32')] % 12
    n0s = calculate_gradient_contribution(offsets_0, gi0s, grad3)
    n1s = calculate_gradient_contribution(offsets_1, gi1s, grad3)
    n2s = calculate_gradient_contribution(offsets_2, gi2s, grad3)
    n3s = calculate_gradient_contribution(offsets_3, gi3s, grad3)
    return 23.0 * (n0s + n1s + n2s + n3s)
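# A sketch of the lookup table consumed above (an assumption, not taken
# from the original source): vertex_table[x_ge_y, y_ge_z, x_ge_z] holds
# the second and third simplex corner offsets for each ordering of the
# cell offsets; wrap it with theano.shared before passing it in.
import numpy as np

vertex_table = np.zeros((2, 2, 2, 2, 3), dtype='int32')
vertex_table[1, 1, 1] = [[1, 0, 0], [1, 1, 0]]  # x0 >= y0 >= z0
vertex_table[1, 0, 1] = [[1, 0, 0], [1, 0, 1]]  # x0 >= z0 > y0
vertex_table[1, 0, 0] = [[0, 0, 1], [1, 0, 1]]  # z0 > x0 >= y0
vertex_table[0, 0, 0] = [[0, 0, 1], [0, 1, 1]]  # z0 > y0 > x0
vertex_table[0, 1, 0] = [[0, 1, 0], [0, 1, 1]]  # y0 >= z0 > x0
vertex_table[0, 1, 1] = [[0, 1, 0], [1, 1, 0]]  # y0 > x0 >= z0
# (1,1,0) and (0,0,1) are unreachable index combinations and stay zero.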
def dtw_inner_step(x2_index, d_slice_slice, insert_cost, x1_length, x2_length,
                   x1_index, previous_cost_row):
    assert x2_index.ndim == 0
    assert 0 <= d_slice_slice.ndim <= 1
    assert insert_cost.ndim == d_slice_slice.ndim
    assert x1_length.ndim == d_slice_slice.ndim
    assert x2_length.ndim == d_slice_slice.ndim
    assert x1_index.ndim == 0
    assert previous_cost_row.ndim == d_slice_slice.ndim + 1
    x2_index = _debug(x2_index, 'dtw_inner_step.x2_index', debug_level)
    d_slice_slice = _debug(d_slice_slice, 'dtw_inner_step.d_slice_slice', debug_level)
    insert_cost = _debug(insert_cost, 'dtw_inner_step.insert_cost', debug_level)
    delete_cost = _debug(previous_cost_row[x2_index], 'dtw_inner_step.delete_cost', debug_level)
    match_cost = _debug(previous_cost_row[x2_index - 1], 'dtw_inner_step.match_cost', debug_level)
    assert delete_cost.ndim == d_slice_slice.ndim
    assert match_cost.ndim == d_slice_slice.ndim
    min_cost = _debug(
        tt.min(tt.stack(insert_cost, delete_cost, match_cost), axis=0),
        'dtw_inner_step.min_cost', debug_level)
    assert min_cost.ndim == d_slice_slice.ndim
    in_first_row = _debug(tt.eq(x1_index, 0), 'dtw_inner_step.in_first_row', debug_level)
    in_first_column = _debug(tt.eq(x2_index, 0), 'dtw_inner_step.in_first_column', debug_level)
    assert in_first_row.ndim == 0
    assert in_first_column.ndim == 0
    cost = _debug(
        d_slice_slice + tt.switch(in_first_row, insert_cost,
                                  tt.switch(in_first_column, delete_cost, min_cost)),
        'dtw_inner_step.cost', debug_level)
    assert cost.ndim == d_slice_slice.ndim
    length_filtered_cost = _debug(
        tt.switch(
            tt.bitwise_and(tt.lt(x1_index, x1_length), tt.lt(x2_index, x2_length)),
            cost, 0.),
        'dtw_inner_step.length_filtered_cost', debug_level)
    assert length_filtered_cost.ndim == d_slice_slice.ndim
    return length_filtered_cost
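# `_debug` and `debug_level` are module-level helpers the function above
# assumes; a minimal sketch that passes values through and optionally
# attaches a Print op when debugging is enabled:
from theano.printing import Print

debug_level = 0

def _debug(variable, name, level):
    if level > 0:
        return Print(name)(variable)
    return variable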
def while_search(alpha0, alpha1, phi_a0, phi_a1, derphi_a0, i_t,
                 alpha_star, phi_star, derphi_star):
    derphi_a1 = derphi(alpha1)
    cond1 = TT.bitwise_or(phi_a1 > phi0 + c1 * alpha1 * derphi0,
                          TT.bitwise_and(phi_a1 >= phi_a0, i_t > zero))
    cond2 = abs(derphi_a1) <= -c2 * derphi0
    cond3 = derphi_a1 >= zero
    alpha_star_c1, phi_star_c1, derphi_star_c1 = \
        _zoom(alpha0, alpha1, phi_a0, phi_a1, derphi_a0,
              phi, derphi, phi0, derphi0, c1, c2, profile=profile)
    alpha_star_c3, phi_star_c3, derphi_star_c3 = \
        _zoom(alpha1, alpha0, phi_a1, phi_a0, derphi_a1,
              phi, derphi, phi0, derphi0, c1, c2, profile=profile)
    nw_alpha1 = alpha1 * numpy.asarray(2, dtype=theano.config.floatX)
    nw_phi = phi(nw_alpha1)
    alpha_star, phi_star, derphi_star = \
        ifelse(cond1,
               (alpha_star_c1, phi_star_c1, derphi_star_c1),
               ifelse(cond2,
                      (alpha1, phi_a1, derphi_a1),
                      ifelse(cond3,
                             (alpha_star_c3, phi_star_c3, derphi_star_c3),
                             (nw_alpha1, nw_phi, nan),
                             name='alphastar_c3'),
                      name='alphastar_c2'),
               name='alphastar_c1')
    return ([alpha1,
             nw_alpha1,
             phi_a1,
             ifelse(lazy_or('allconds', cond1, cond2, cond3),
                    phi_a1, nw_phi, name='nwphi1'),
             ifelse(cond1, derphi_a0, derphi_a1, name='derphi'),
             i_t + one,
             alpha_star,
             phi_star,
             derphi_star],
            theano.scan_module.scan_utils.until(
                lazy_or('until_cond_', TT.eq(nw_alpha1, zero),
                        cond1, cond2, cond3)))
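# `lazy_or`/`lazy_and` are not Theano builtins; a sketch of the
# short-circuiting helpers this line search assumes, built on `ifelse`
# so later conditions are only evaluated when needed:
from theano.ifelse import ifelse
import theano.tensor as TT

true = TT.constant(1, dtype='int8')
false = TT.constant(0, dtype='int8')

def lazy_or(name, *args):
    # evaluate scalar conditions left to right, stop at the first true one
    if len(args) == 1:
        return args[0]
    return ifelse(args[0], true, lazy_or(name, *args[1:]), name=name)

def lazy_and(name, *args):
    # evaluate scalar conditions left to right, stop at the first false one
    if len(args) == 1:
        return args[0]
    return ifelse(args[0], lazy_and(name, *args[1:]), false, name=name)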
def mask_loss_mse(grid_idx, image):
    indices = T.bitwise_and(T.neq(grid_idx, MASK["IGNORE"]),
                            T.neq(grid_idx, MASK["BACKGROUND_RING"]))
    bw = binary_mask(grid_idx, ignore=0.0)
    diff = bw - image
    loss = (diff[indices.nonzero()]**2).mean()
    visual_diff = T.zeros_like(diff)
    visual_diff = T.set_subtensor(visual_diff[indices.nonzero()],
                                  diff[indices.nonzero()]**2)
    return DotMap({
        'loss': loss,
        'visual': {
            'diff': visual_diff,
            'bw_grid': bw,
        }
    })
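# Hypothetical usage (MASK, binary_mask and DotMap come from the
# surrounding module): compile the loss for batches of NCHW grids.
grid_idx = T.tensor4('grid_idx')
image = T.tensor4('image')
out = mask_loss_mse(grid_idx, image)
mse_fn = theano.function([grid_idx, image], [out.loss, out.visual.diff])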
def mask_loss_adaptive_mse(grid_idx, image, impl='auto'):
    black_mean, white_mean, _ = segment_means(grid_idx, image, impl)
    # keep the means apart: white at least 0.40 and at least 0.20 above black
    white_mean = T.maximum(white_mean, 0.40)
    white_mean = T.maximum(white_mean, black_mean + 0.20)
    black_mean = T.minimum(white_mean - 0.20, black_mean)
    dim_pattern = (0, 'x', 'x', 'x')
    bw = adaptive_mask(grid_idx, ignore=0.0,
                       black=black_mean.dimshuffle(*dim_pattern),
                       white=white_mean.dimshuffle(*dim_pattern))
    # bw = gaussian_filter_2d(bw, sigma=2.)
    diff = T.zeros_like(bw)
    idx = T.bitwise_and(T.neq(grid_idx, MASK["IGNORE"]),
                        T.neq(grid_idx, MASK["BACKGROUND_RING"]))
    diff = T.set_subtensor(diff[idx.nonzero()], abs(bw - image)[idx.nonzero()])
    loss = (T.maximum(diff, 0.15)[idx.nonzero()]**2).mean()
    return DotMap({'loss': loss, 'visual': {'diff': diff, 'bw_grid': bw}})
def __init__(self, inputs, labels, y_mask, n_dim, cutoff, project_factor=4):
    '''
    Args:
        inputs: flattened logits with shape of [n_step*n_batch, n_dim]
        labels: flattened labels with shape of [n_step*n_batch]
        y_mask: mask for the null space of sentences, shape [n_step*n_batch]
        cutoff: frequency binning, e.g. [2000, vocab_size]
        project_factor: projection factor for low-frequency words
    '''
    self.input_dim = n_dim
    self.sample_num = inputs.shape[0]
    self.cluster_num = len(cutoff) - 1
    self.head_dim = cutoff[0] + self.cluster_num
    self.params = []
    self.y_mask = y_mask

    init_head_w = np.asarray(np.random.uniform(
        low=-np.sqrt(1. / self.input_dim),
        high=np.sqrt(1. / self.input_dim),
        size=(self.input_dim, self.head_dim)), dtype=theano.config.floatX)
    self.head_w = theano.shared(value=init_head_w, name='head_w')
    self.params.append(self.head_w)

    tail_project_factor = project_factor
    tail_w_list = []
    for i in range(self.cluster_num):
        project_dim = max(1, self.input_dim // tail_project_factor)
        tail_dim = cutoff[i + 1] - cutoff[i]
        _tail_proj_w = np.asarray(np.random.uniform(
            low=-np.sqrt(1. / self.input_dim),
            high=np.sqrt(1. / self.input_dim),
            size=(self.input_dim, project_dim)), dtype=theano.config.floatX)
        _tail_w = np.asarray(np.random.uniform(
            low=-np.sqrt(1. / project_dim),
            high=np.sqrt(1. / project_dim),
            size=(project_dim, tail_dim)), dtype=theano.config.floatX)
        tail_proj_w = theano.shared(
            value=_tail_proj_w, name="adaptive_softmax_tail{}_proj_w".format(i + 1))
        tail_w = theano.shared(
            value=_tail_w, name="adaptive_softmax_tail{}_w".format(i + 1))
        tail_w_list.append([tail_proj_w, tail_w])
        tail_project_factor *= project_factor
        self.params.append(tail_proj_w)
        self.params.append(tail_w)

    # delete null indexes by y_mask
    # y_mask = y_mask.flatten()
    # inputs = inputs[y_mask.nonzero()]
    # labels = labels[y_mask.nonzero()]

    # Get tail masks and update head labels
    training_losses = []
    loss = 0.
    head_labels = labels
    for i in range(self.cluster_num):
        # mask out words that are not in this cluster (frequency range)
        mask = T.bitwise_and(T.ge(labels, cutoff[i]), T.lt(labels, cutoff[i + 1]))
        # update head labels: low-frequency words in cluster i collapse to
        # the single label cutoff[0] + i appended to the head
        head_labels = T.switch(mask,
                               T.constant([cutoff[0] + i]).repeat(self.sample_num),
                               head_labels)
        # compute tail loss on the words of this cluster only
        tail_inputs = inputs[mask.nonzero()]
        tail_logits = T.dot(T.dot(tail_inputs, tail_w_list[i][0]), tail_w_list[i][1])
        # relabel relative to the cluster start
        tail_labels = (labels - cutoff[i])[mask.nonzero()]
        # mask that eases the effect of null space at the end of sentences
        tail_y_mask = self.y_mask[mask.nonzero()]
        tail_logits = tail_logits[T.eq(tail_y_mask, 1).nonzero()]
        tail_labels = tail_labels[T.eq(tail_y_mask, 1).nonzero()]
        # clip to avoid NaNs
        tail_logits = T.clip(tail_logits, 1.0e-8, 1.0 - 1.0e-8)
        tail_loss = T.mean(T.nnet.categorical_crossentropy(tail_logits, tail_labels))
        training_losses.append(tail_loss)
        loss += tail_loss
        self.tail_logits = tail_logits
        self.tail_labels = tail_labels
        self.tail_loss = tail_loss

    # compute head loss
    head_logits = T.dot(inputs, self.head_w)
    head_logits = head_logits[T.eq(self.y_mask, 1).nonzero()]
    head_logits = T.clip(head_logits, 1.0e-8, 1.0 - 1.0e-8)
    head_labels = head_labels[T.eq(self.y_mask, 1).nonzero()]
    head_loss = T.mean(T.nnet.categorical_crossentropy(head_logits, head_labels))
    loss += head_loss
    training_losses.append(head_loss)
    self.loss = loss
    self.training_losses = training_losses
    self.head_loss = head_loss
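# Hypothetical usage; the corpus only shows __init__, so a throwaway
# class is built around it here and given the assumed name AdaptiveSoftmax:
AdaptiveSoftmax = type('AdaptiveSoftmax', (object,), {'__init__': __init__})

inputs = T.matrix('inputs')    # [n_step * n_batch, n_dim]
labels = T.ivector('labels')   # [n_step * n_batch]
y_mask = T.ivector('y_mask')   # 1 for real tokens, 0 for padding
softmax = AdaptiveSoftmax(inputs, labels, y_mask, n_dim=256,
                          cutoff=[2000, 10000], project_factor=4)
loss_fn = theano.function([inputs, labels, y_mask], softmax.loss)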
def _get_cost3(self, output, truth, rescore=True):
    if not hasattr(self, '_lambda_obj'):
        lambda_obj, lambda_noobj = T.scalar('lambda_obj'), T.scalar('lambda_noobj')
        self._lambda_obj, self._lambda_noobj = lambda_obj, lambda_noobj
    else:
        lambda_obj, lambda_noobj, thresh = self._lambda_obj, self._lambda_noobj, self._thresh

    cost = 0.

    # penalize everything, this will be undone if box matches ground truth
    # cost += lambda_noobj_coord * T.mean(output[:,:,:4]**2)
    cost += lambda_noobj * T.mean(output[:, :, 4]**2)

    # get index for each truth
    row_idx = T.cast(T.floor((truth[:, :, 0] + 0.5 * truth[:, :, 2]) * self.output_shape[1]), 'int32')
    col_idx = T.cast(T.floor((truth[:, :, 1] + 0.5 * truth[:, :, 3]) * self.output_shape[0]), 'int32')

    # image index
    img_idx = T.repeat(T.arange(truth.shape[0]).dimshuffle(0, 'x'), truth.shape[1], axis=1)
    # index for each object in an image
    obj_idx = T.repeat(T.arange(truth.shape[1]), truth.shape[0], axis=0)

    # reshape to flat
    row_idx = row_idx.reshape((-1,))
    col_idx = col_idx.reshape((-1,))
    img_idx = img_idx.reshape((-1,))
    obj_idx = obj_idx.reshape((-1,))

    # use only valid indices (i.e. greater or equal to zero)
    valid_idx = T.bitwise_and(row_idx >= 0, col_idx >= 0).reshape((-1,))
    row_idx = row_idx[valid_idx.nonzero()]
    col_idx = col_idx[valid_idx.nonzero()]
    img_idx = img_idx[valid_idx.nonzero()]
    obj_idx = obj_idx[valid_idx.nonzero()]

    # reshape output and truth
    output = output.dimshuffle(0, 'x', 1, 2, 3, 4)
    truth = truth.dimshuffle(0, 1, 'x', 2, 'x', 'x')
    output = T.repeat(output, truth.shape[1], axis=1)
    truth = T.repeat(truth, self.boxes.__len__(), axis=2)
    truth = T.repeat(T.repeat(truth, self.output_shape[0], axis=4),
                     self.output_shape[1], axis=5)

    # Reformat ground truth labels so that they are relative to offsets,
    # and the width/height are log scale relative to the box height.
    # add offset to the x,y coordinates
    x_diff, y_diff = 1. / self.output_shape[0], 1. / self.output_shape[1]
    y, x = meshgrid(T.arange(0 + x_diff / 2, 1, x_diff),
                    T.arange(0 + y_diff / 2, 1, y_diff))
    x, y = x.dimshuffle('x', 'x', 0, 1), y.dimshuffle('x', 'x', 0, 1)

    # scaling from each anchor box
    x_scale = theano.shared(np.asarray([b[0] for b in self.boxes]),
                            name='x_scale', borrow=True).dimshuffle('x', 0, 'x', 'x')
    y_scale = theano.shared(np.asarray([b[1] for b in self.boxes]),
                            name='y_scale', borrow=True).dimshuffle('x', 0, 'x', 'x')

    # change predicted output to proper scale
    pred = T.set_subtensor(output[:, :, :, 0], output[:, :, :, 0] + x)
    pred = T.set_subtensor(pred[:, :, :, 1], pred[:, :, :, 1] + y)
    pred = T.set_subtensor(pred[:, :, :, 2], x_scale * T.exp(pred[:, :, :, 2]))
    pred = T.set_subtensor(pred[:, :, :, 3], y_scale * T.exp(pred[:, :, :, 3]))

    # determine iou of chosen boxes
    xi = T.maximum(pred[img_idx, obj_idx, :, 0, row_idx, col_idx],
                   truth[img_idx, obj_idx, :, 0, row_idx, col_idx])
    yi = T.maximum(pred[img_idx, obj_idx, :, 1, row_idx, col_idx],
                   truth[img_idx, obj_idx, :, 1, row_idx, col_idx])
    xf = T.minimum(
        pred[img_idx, obj_idx, :, 0, row_idx, col_idx] +
        pred[img_idx, obj_idx, :, 2, row_idx, col_idx],
        truth[img_idx, obj_idx, :, 0, row_idx, col_idx] +
        truth[img_idx, obj_idx, :, 2, row_idx, col_idx])
    yf = T.minimum(
        pred[img_idx, obj_idx, :, 1, row_idx, col_idx] +
        pred[img_idx, obj_idx, :, 3, row_idx, col_idx],
        truth[img_idx, obj_idx, :, 1, row_idx, col_idx] +
        truth[img_idx, obj_idx, :, 3, row_idx, col_idx])
    w, h = T.maximum(xf - xi, 0.), T.maximum(yf - yi, 0.)

    isec = w * h
    iou = isec / (pred[img_idx, obj_idx, :, 2, row_idx, col_idx] *
                  pred[img_idx, obj_idx, :, 3, row_idx, col_idx] +
                  truth[img_idx, obj_idx, :, 2, row_idx, col_idx] *
                  truth[img_idx, obj_idx, :, 3, row_idx, col_idx] - isec)

    # get index for matched boxes
    match_idx = T.argmax(iou, axis=1)

    # change truth to proper scale for error
    truth = T.set_subtensor(truth[:, :, :, 0, :, :], truth[:, :, :, 0, :, :] - x)
    truth = T.set_subtensor(truth[:, :, :, 1, :, :], truth[:, :, :, 1, :, :] - y)
    truth = T.set_subtensor(truth[:, :, :, 2, :, :], T.log(truth[:, :, :, 2, :, :] / x_scale))
    truth = T.set_subtensor(truth[:, :, :, 3, :, :], T.log(truth[:, :, :, 3, :, :] / y_scale))

    # add to cost boxes which have been matched; correct for matched boxes
    # cost -= lambda_noobj_coord * T.mean(output[img_idx, obj_idx, :, :4, row_idx, col_idx][:, match_idx]**2)
    cost -= lambda_noobj * T.mean(output[img_idx, obj_idx, :, 4, row_idx, col_idx][:, match_idx]**2)

    # coordinate errors
    cost += lambda_obj * T.mean(
        (output[img_idx, obj_idx, :, 0, row_idx, col_idx][:, match_idx] -
         truth[img_idx, obj_idx, :, 0, row_idx, col_idx][:, match_idx])**2)
    cost += lambda_obj * T.mean(
        (output[img_idx, obj_idx, :, 1, row_idx, col_idx][:, match_idx] -
         truth[img_idx, obj_idx, :, 1, row_idx, col_idx][:, match_idx])**2)
    cost += lambda_obj * T.mean(
        (output[img_idx, obj_idx, :, 2, row_idx, col_idx][:, match_idx] -
         truth[img_idx, obj_idx, :, 2, row_idx, col_idx][:, match_idx])**2)
    cost += lambda_obj * T.mean(
        (output[img_idx, obj_idx, :, 3, row_idx, col_idx][:, match_idx] -
         truth[img_idx, obj_idx, :, 3, row_idx, col_idx][:, match_idx])**2)

    # objectness error
    if rescore:
        cost += lambda_obj * T.mean(
            (output[img_idx, obj_idx, :, 4, row_idx, col_idx][:, match_idx] -
             iou[:, match_idx])**2)
    else:
        cost += lambda_obj * T.mean(
            (output[img_idx, obj_idx, :, 4, row_idx, col_idx][:, match_idx] - 1)**2)

    # class error
    cost += lambda_obj * T.mean(
        -truth[img_idx, obj_idx, :, -self.num_classes:, row_idx, col_idx][:, match_idx] *
        T.log(output[img_idx, obj_idx, :, -self.num_classes:, row_idx, col_idx][:, match_idx]))

    return cost, [iou]
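# `meshgrid` is not a Theano builtin; a sketch of the N-dimensional
# helper `_get_cost3` assumes, analogous to numpy.meshgrid with
# indexing='ij':
def meshgrid(*xs):
    nd = len(xs)
    grids = []
    for i, x in enumerate(xs):
        pattern = ['x'] * nd
        pattern[i] = 0
        g = x.dimshuffle(*pattern)
        for j, other in enumerate(xs):
            if j != i:
                g = T.repeat(g, other.shape[0], axis=j)
        grids.append(g)
    return grids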
def _zoom(a_lo, a_hi, phi_lo, phi_hi, derphi_lo, phi, derphi,
          phi0, derphi0, c1, c2, n_iters=10, profile=False):
    """
    WRITEME

    Part of the optimization algorithm in `scalar_search_wolfe2`.

    Parameters
    ----------
    a_lo : float
        Step size
    a_hi : float
        Step size
    phi_lo : float
        Value of f at a_lo
    phi_hi : float
        Value of f at a_hi
    derphi_lo : float
        Value of derivative at a_lo
    phi : callable
        Generates computational graph
    derphi : callable
        Generates computational graph
    phi0 : float
        Value of f at 0
    derphi0 : float
        Value of the derivative at 0
    c1 : float
        Wolfe parameter
    c2 : float
        Wolfe parameter
    profile : bool
        True if you want printouts of profiling information
    """
    # Function representing the computations of one step of the while loop
    def while_zoom(phi_rec, a_rec, a_lo, a_hi, phi_hi, phi_lo, derphi_lo,
                   a_star, val_star, valprime):
        # interpolate to find a trial step length between a_lo and a_hi.
        # Need to choose interpolation here. Use cubic interpolation, and
        # then if the result is within delta * dalpha or outside of the
        # interval bounded by a_lo or a_hi then use quadratic
        # interpolation; if the result is still too close, use bisection
        dalpha = a_hi - a_lo
        a = TT.switch(dalpha < zero, a_hi, a_lo)
        b = TT.switch(dalpha < zero, a_lo, a_hi)

        # minimizer of cubic interpolant
        # (uses phi_lo, derphi_lo, phi_hi, and the most recent value of phi)
        #
        # if the result is too close to the end points (or out of the
        # interval) then use quadratic interpolation with phi_lo,
        # derphi_lo and phi_hi; if the result is still too close to the
        # end points (or out of the interval) then use bisection

        # cubic interpolation
        cchk = delta1 * dalpha
        a_j_cubic = _cubicmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi,
                              a_rec, phi_rec)
        # quadratic interpolation
        qchk = delta2 * dalpha
        a_j_quad = _quadmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi)
        cond_q = lazy_or('condq', TT.isnan(a_j_quad),
                         a_j_quad > b - qchk,
                         a_j_quad < a + qchk)
        a_j_quad = TT.switch(
            cond_q,
            a_lo + numpy.asarray(0.5, dtype=theano.config.floatX) * dalpha,
            a_j_quad)

        # pick between the two ..
        cond_c = lazy_or('condc', TT.isnan(a_j_cubic),
                         TT.bitwise_or(a_j_cubic > b - cchk,
                                       a_j_cubic < a + cchk))
        # this lazy if actually decides if we need to run the quadratic
        # interpolation
        a_j = TT.switch(cond_c, a_j_quad, a_j_cubic)
        # a_j = ifelse(cond_c, a_j_quad, a_j_cubic)

        # Check new value of a_j
        phi_aj = phi(a_j)
        derphi_aj = derphi(a_j)

        stop = lazy_and('stop',
                        TT.bitwise_and(phi_aj <= phi0 + c1 * a_j * derphi0,
                                       phi_aj < phi_lo),
                        abs(derphi_aj) <= -c2 * derphi0)

        cond1 = TT.bitwise_or(phi_aj > phi0 + c1 * a_j * derphi0,
                              phi_aj >= phi_lo)
        cond2 = derphi_aj * (a_hi - a_lo) >= zero

        # Switches just make more sense here because they have a C
        # implementation and they get composed
        phi_rec = ifelse(cond1, phi_hi, TT.switch(cond2, phi_hi, phi_lo),
                         name='phi_rec')
        a_rec = ifelse(cond1, a_hi, TT.switch(cond2, a_hi, a_lo),
                       name='a_rec')
        a_hi = ifelse(cond1, a_j, TT.switch(cond2, a_lo, a_hi), name='a_hi')
        phi_hi = ifelse(cond1, phi_aj, TT.switch(cond2, phi_lo, phi_hi),
                        name='phi_hi')

        a_lo = TT.switch(cond1, a_lo, a_j)
        phi_lo = TT.switch(cond1, phi_lo, phi_aj)
        derphi_lo = ifelse(cond1, derphi_lo, derphi_aj, name='derphi_lo')

        a_star = a_j
        val_star = phi_aj
        valprime = ifelse(cond1, nan, TT.switch(cond2, derphi_aj, nan),
                          name='valprime')

        return ([phi_rec, a_rec, a_lo, a_hi, phi_hi, phi_lo, derphi_lo,
                 a_star, val_star, valprime],
                theano.scan_module.scan_utils.until(stop))

    maxiter = n_iters
    # cubic interpolant check
    delta1 = TT.constant(numpy.asarray(0.2, dtype=theano.config.floatX))
    # quadratic interpolant check
    delta2 = TT.constant(numpy.asarray(0.1, dtype=theano.config.floatX))
    phi_rec = phi0
    a_rec = zero

    # Initial iteration
    dalpha = a_hi - a_lo
    a = TT.switch(dalpha < zero, a_hi, a_lo)
    b = TT.switch(dalpha < zero, a_lo, a_hi)
    # a = ifelse(dalpha < 0, a_hi, a_lo)
    # b = ifelse(dalpha < 0, a_lo, a_hi)

    # minimizer of cubic interpolant
    # (uses phi_lo, derphi_lo, phi_hi, and the most recent value of phi)
    #
    # if the result is too close to the end points (or out of the
    # interval) then use quadratic interpolation with phi_lo,
    # derphi_lo and phi_hi; if the result is still too close to the
    # end points (or out of the interval) then use bisection

    # quadratic interpolation
    qchk = delta2 * dalpha
    a_j = _quadmin(a_lo, phi_lo, derphi_lo, a_hi, phi_hi)
    cond_q = lazy_or('mcond_q', TT.isnan(a_j),
                     TT.bitwise_or(a_j > b - qchk, a_j < a + qchk))
    a_j = TT.switch(
        cond_q,
        a_lo + numpy.asarray(0.5, dtype=theano.config.floatX) * dalpha,
        a_j)

    # Check new value of a_j
    phi_aj = phi(a_j)
    derphi_aj = derphi(a_j)

    cond1 = TT.bitwise_or(phi_aj > phi0 + c1 * a_j * derphi0,
                          phi_aj >= phi_lo)
    cond2 = derphi_aj * (a_hi - a_lo) >= zero

    # Switches just make more sense here because they have a C
    # implementation and they get composed
    phi_rec = ifelse(cond1, phi_hi, TT.switch(cond2, phi_hi, phi_lo),
                     name='mphirec')
    a_rec = ifelse(cond1, a_hi, TT.switch(cond2, a_hi, a_lo), name='marec')
    a_hi = ifelse(cond1, a_j, TT.switch(cond2, a_lo, a_hi), name='mahi')
    phi_hi = ifelse(cond1, phi_aj, TT.switch(cond2, phi_lo, phi_hi),
                    name='mphihi')

    onlyif = lazy_and('only_if',
                      TT.bitwise_and(phi_aj <= phi0 + c1 * a_j * derphi0,
                                     phi_aj < phi_lo),
                      abs(derphi_aj) <= -c2 * derphi0)

    a_lo = TT.switch(cond1, a_lo, a_j)
    phi_lo = TT.switch(cond1, phi_lo, phi_aj)
    derphi_lo = ifelse(cond1, derphi_lo, derphi_aj, name='derphi_lo_main')

    phi_rec.name = 'phi_rec'
    a_rec.name = 'a_rec'
    a_lo.name = 'a_lo'
    a_hi.name = 'a_hi'
    phi_hi.name = 'phi_hi'
    phi_lo.name = 'phi_lo'
    derphi_lo.name = 'derphi_lo'

    vderphi_aj = ifelse(cond1, nan, TT.switch(cond2, derphi_aj, nan),
                        name='vderphi_aj')

    states = []
    states += [TT.unbroadcast(TT.shape_padleft(phi_rec), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(a_rec), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(a_lo), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(a_hi), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(phi_hi), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(phi_lo), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(derphi_lo), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(zero), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(zero), 0)]
    states += [TT.unbroadcast(TT.shape_padleft(zero), 0)]
    outs, updates = scan(while_zoom,
                         states=states,
                         n_steps=maxiter,
                         name='while_zoom',
                         mode=theano.Mode(linker='cvm_nogc'),
                         profile=profile)
    a_star = ifelse(onlyif, a_j, outs[7][0], name='astar')
    val_star = ifelse(onlyif, phi_aj, outs[8][0], name='valstar')
    valprime = ifelse(onlyif, vderphi_aj, outs[9][0], name='valprime')

    ## WARNING !! I ignore updates given by scan which I should not do !!!
    return a_star, val_star, valprime
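# Module-level constants assumed throughout this line-search code
# (a sketch; the originals live next to _zoom and while_search):
zero = TT.constant(numpy.asarray(0, dtype=theano.config.floatX))
one = TT.constant(numpy.asarray(1, dtype=theano.config.floatX))
nan = TT.constant(numpy.asarray(numpy.nan, dtype=theano.config.floatX))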
def compute(self, x_gold, x_pred, x_label_gold, x_label_pred):
    # a head is only correct when it matches gold and the gold head is
    # not padding (x_gold >= 0)
    correct_head = T.ge(x_gold, 0) * T.eq(x_gold, x_pred)
    correct_label = T.eq(x_label_gold, x_label_pred)
    # count tokens where both the head and its label are correct
    return T.sum(T.bitwise_and(correct_head, correct_label))
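# Hypothetical usage on flattened parse batches; `compute` never touches
# `self`, so it can be called unbound here for the sketch:
x_gold, x_pred = T.ivector('x_gold'), T.ivector('x_pred')
x_label_gold, x_label_pred = T.ivector('x_label_gold'), T.ivector('x_label_pred')
count_correct = theano.function(
    [x_gold, x_pred, x_label_gold, x_label_pred],
    compute(None, x_gold, x_pred, x_label_gold, x_label_pred))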
def clip_around_zero(x, threshold=0.2):
    # zero out every entry of x inside (-threshold, threshold)
    indices = T.bitwise_and(x < threshold, x > -threshold)
    return T.set_subtensor(x[indices.nonzero()], 0)
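# Usage sketch:
x = T.vector('x')
clip_fn = theano.function([x], clip_around_zero(x))
print(clip_fn(np.asarray([-0.5, -0.1, 0.05, 0.3], dtype=theano.config.floatX)))
# -> [-0.5  0.   0.   0.3]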
def _get_cost(self, input, truth, alpha=1., min_iou=0.5):
    cost = 0.

    # create ground truth for non-object class
    neg_example = theano.shared(
        np.zeros(self.num_classes + 1, dtype=theano.config.floatX))
    neg_example = T.set_subtensor(neg_example[-1], 1.)
    neg_example = neg_example.dimshuffle('x', 'x', 0, 'x', 'x')

    cost_coord, cost_class, cost_noobj = 0., 0., 0.
    for i in range(self._predictive_maps.__len__()):
        dmap = self._default_maps[i]
        fmap = self._predictive_maps[i]
        shape = layers.get_output_shape(self.network['detection'][i])[2:]

        # get iou between default maps and ground truth
        iou_default = self._get_iou(
            dmap.dimshuffle('x', 'x', 0, 1, 2, 3),
            truth.dimshuffle(0, 1, 'x', 2, 'x', 'x'))

        # get which object for which cell
        idx_match = T.argmax(iou_default, axis=1)

        # extend truth to cover all cell/box/examples
        truth_extended = T.repeat(
            T.repeat(
                T.repeat(truth.dimshuffle(0, 1, 'x', 2, 'x', 'x'),
                         self.ratios.__len__(), axis=2),
                shape[0], axis=4),
            shape[1], axis=5)

        idx1, idx2, idx3, idx4 = meshgrid(T.arange(truth.shape[0]),
                                          T.arange(self.ratios.__len__()),
                                          T.arange(shape[0]),
                                          T.arange(shape[1]))

        # copy truth for every cell/box.
        truth_extended = truth_extended[
            idx1, idx_match, idx2, :, idx3, idx4].dimshuffle(0, 1, 4, 2, 3)

        iou_default = iou_default.max(axis=1)
        iou_gt_min = iou_default >= min_iou

        dmap_extended = dmap.dimshuffle('x', 0, 1, 2, 3)

        # penalize coordinates
        cost_coord_fmap = 0.
        cost_coord_fmap += ((
            (fmap[:, :, 0] - (truth_extended[:, :, 0] - dmap_extended[:, :, 0]) /
             dmap_extended[:, :, 2])[iou_gt_min.nonzero()])**2).sum()
        cost_coord_fmap += ((
            (fmap[:, :, 1] - (truth_extended[:, :, 1] - dmap_extended[:, :, 1]) /
             dmap_extended[:, :, 3])[iou_gt_min.nonzero()])**2).sum()
        cost_coord_fmap += ((
            (fmap[:, :, 2] -
             T.log(truth_extended[:, :, 2] / dmap_extended[:, :, 2]))[iou_gt_min.nonzero()])**2).sum()
        cost_coord_fmap += ((
            (fmap[:, :, 3] -
             T.log(truth_extended[:, :, 3] / dmap_extended[:, :, 3]))[iou_gt_min.nonzero()])**2).sum()

        cost_class_fmap = -(
            truth_extended[:, :, -(self.num_classes + 1):] *
            T.log(fmap[:, :, -(self.num_classes + 1):])).sum(axis=2)
        cost_class_fmap = cost_class_fmap[iou_gt_min.nonzero()].sum()

        # find negative examples
        iou_default = iou_default.reshape((-1,))
        iou_st_min = T.bitwise_and(iou_default >= 0.1, iou_default < min_iou)

        # Choose index for top boxes whose overlap is smaller than the min overlap.
        pos_size = iou_gt_min[iou_gt_min.nonzero()].size
        neg_size = pos_size * 3  # ratio of 3 to 1
        idx_neg = T.arange(iou_default.shape[0])[iou_st_min.nonzero()]
        replace = T.le(idx_neg.shape[0], neg_size)
        idx_neg = theano.ifelse.ifelse(
            idx_neg.shape[0] > 0,
            self._random_stream.choice((neg_size,), a=idx_neg, replace=replace),
            T.arange(0))

        neg_size, pos_size = T.maximum(1., neg_size), T.maximum(1., pos_size)

        # Add the negative examples to the costs.
        cost_noobj_fmap = -(neg_example * T.log(
            fmap[:, :, -(self.num_classes + 1):])).sum(axis=2).reshape((-1,))
        cost_noobj_fmap = cost_noobj_fmap[idx_neg].sum()

        cost_coord += cost_coord_fmap / pos_size
        cost_class += alpha * cost_class_fmap / pos_size
        cost_noobj += alpha * cost_noobj_fmap / neg_size

    cost = cost_coord + cost_class + cost_noobj
    return cost, [cost_coord, cost_class, cost_noobj]
def _get_cost2(self, output, truth, rescore=True):
    if not hasattr(self, '_lambda_obj'):
        lambda_obj, lambda_noobj, thresh = \
            T.scalar('lambda_obj'), T.scalar('lambda_noobj'), T.scalar('thresh')
        self._lambda_obj, self._lambda_noobj, self._thresh = \
            lambda_obj, lambda_noobj, thresh
    else:
        lambda_obj, lambda_noobj, thresh = \
            self._lambda_obj, self._lambda_noobj, self._thresh

    cost = 0.

    # create grid for cells
    w_cell, h_cell = 1. / self.output_shape[1], 1. / self.output_shape[0]
    x, y = T.arange(w_cell / 2, 1., w_cell), T.arange(h_cell / 2, 1., h_cell)
    y, x = meshgrid(x, y)

    # reshape truth to match with cell
    truth_cell = truth.dimshuffle(0, 1, 2, 'x', 'x')
    x, y = x.dimshuffle('x', 'x', 0, 1), y.dimshuffle('x', 'x', 0, 1)

    # calculate overlap between cell and ground truth boxes
    xi = T.maximum(truth_cell[:, :, 0], x - w_cell / 2)
    yi = T.maximum(truth_cell[:, :, 1], y - h_cell / 2)
    xf = T.minimum(truth_cell[:, :, [0, 2]].sum(axis=2), x + w_cell / 2)
    yf = T.minimum(truth_cell[:, :, [1, 3]].sum(axis=2), y + h_cell / 2)
    w, h = T.maximum(xf - xi, 0), T.maximum(yf - yi, 0)

    # overlap between cell and ground truth box
    overlap = (w * h) / (w_cell * h_cell)

    # repeat truth boxes
    truth_boxes = truth.dimshuffle(0, 1, 'x', 2, 'x', 'x')

    # create grid for anchor boxes
    anchors = T.concatenate(
        (x.dimshuffle(0, 1, 'x', 'x', 2, 3) - w_cell / 2,
         y.dimshuffle(0, 1, 'x', 'x', 2, 3) - h_cell / 2), axis=3)
    anchors = T.concatenate((anchors, T.ones_like(anchors)), axis=3)
    anchors = T.repeat(anchors, self.boxes.__len__(), axis=2)

    w_acr = theano.shared(np.asarray([b[0] for b in self.boxes]),
                          name='w_acr', borrow=True).dimshuffle('x', 'x', 0, 'x', 'x')
    h_acr = theano.shared(np.asarray([b[1] for b in self.boxes]),
                          name='h_acr', borrow=True).dimshuffle('x', 'x', 0, 'x', 'x')
    anchors = T.set_subtensor(anchors[:, :, :, 2], anchors[:, :, :, 2] * w_acr)
    anchors = T.set_subtensor(anchors[:, :, :, 3], anchors[:, :, :, 3] * h_acr)

    # find iou between anchors and ground truths
    xi = T.maximum(truth_boxes[:, :, :, 0], anchors[:, :, :, 0])
    yi = T.maximum(truth_boxes[:, :, :, 1], anchors[:, :, :, 1])
    xf = T.minimum(truth_boxes[:, :, :, [0, 2]].sum(axis=3),
                   anchors[:, :, :, [0, 2]].sum(axis=3))
    yf = T.minimum(truth_boxes[:, :, :, [1, 3]].sum(axis=3),
                   anchors[:, :, :, [1, 3]].sum(axis=3))
    w, h = T.maximum(xf - xi, 0), T.maximum(yf - yi, 0)

    isec = w * h
    iou = isec / (T.prod(truth_boxes[:, :, :, [2, 3]], axis=3) +
                  T.prod(anchors[:, :, :, [2, 3]], axis=3) - isec)

    overlap = overlap.dimshuffle(0, 1, 'x', 2, 3)
    best_iou_obj_idx = T.argmax(iou, axis=1).dimshuffle(0, 'x', 1, 2, 3)
    best_iou_box_idx = T.argmax(iou, axis=2).dimshuffle(0, 1, 'x', 2, 3)

    _, obj_idx, box_idx, _, _ = meshgrid(
        T.arange(truth.shape[0]),
        T.arange(truth.shape[1]),
        T.arange(self.boxes.__len__()),
        T.arange(self.output_shape[0]),
        T.arange(self.output_shape[1]))

    # define logical matrix assigning object to correct anchor box and cell.
    best_iou_idx = T.bitwise_and(
        T.bitwise_and(T.eq(best_iou_box_idx, box_idx),
                      T.eq(best_iou_obj_idx, obj_idx)),
        overlap >= thresh)

    constants = []
    if rescore:
        # scale predictions correctly
        pred = output.dimshuffle(0, 'x', 1, 2, 3, 4)
        pred = T.set_subtensor(pred[:, :, :, 0], pred[:, :, :, 0] + x.dimshuffle(0, 1, 'x', 2, 3))
        pred = T.set_subtensor(pred[:, :, :, 1], pred[:, :, :, 1] + y.dimshuffle(0, 1, 'x', 2, 3))
        pred = T.set_subtensor(pred[:, :, :, 2], w_acr * T.exp(pred[:, :, :, 2]))
        pred = T.set_subtensor(pred[:, :, :, 3], h_acr * T.exp(pred[:, :, :, 3]))

        xi = T.maximum(pred[:, :, :, 0], truth_boxes[:, :, :, 0])
        yi = T.maximum(pred[:, :, :, 1], truth_boxes[:, :, :, 1])
        xf = T.minimum(pred[:, :, :, [0, 2]].sum(axis=3),
                       truth_boxes[:, :, :, [0, 2]].sum(axis=3))
        yf = T.minimum(pred[:, :, :, [1, 3]].sum(axis=3),
                       truth_boxes[:, :, :, [1, 3]].sum(axis=3))
        w, h = T.maximum(xf - xi, 0.), T.maximum(yf - yi, 0.)

        isec = w * h
        iou = isec / (pred[:, :, :, [2, 3]].prod(axis=3) +
                      truth_boxes[:, :, :, [2, 3]].prod(axis=3) - isec)

        # make sure iou is considered constant when taking gradient
        constants.append(iou)

    # format ground truths correctly
    truth_boxes = T.repeat(
        T.repeat(
            T.repeat(truth_boxes, self.boxes.__len__(), axis=2),
            self.output_shape[0], axis=4),
        self.output_shape[1], axis=5)
    truth_boxes = T.set_subtensor(truth_boxes[:, :, :, 0], truth_boxes[:, :, :, 0] - anchors[:, :, :, 0])
    truth_boxes = T.set_subtensor(truth_boxes[:, :, :, 1], truth_boxes[:, :, :, 1] - anchors[:, :, :, 1])
    truth_boxes = T.set_subtensor(truth_boxes[:, :, :, 2], T.log(truth_boxes[:, :, :, 2] / anchors[:, :, :, 2]))
    truth_boxes = T.set_subtensor(truth_boxes[:, :, :, 3], T.log(truth_boxes[:, :, :, 3] / anchors[:, :, :, 3]))

    # add dimension for objects per image
    pred = T.repeat(output.dimshuffle(0, 'x', 1, 2, 3, 4), truth.shape[1], axis=1)

    # penalize coordinates
    cost += lambda_obj * T.mean(
        ((pred[:, :, :, :4] - truth_boxes[:, :, :, :4])**2).sum(axis=3)[best_iou_idx.nonzero()])

    # penalize class scores
    cost += lambda_obj * T.mean(
        (-truth_boxes[:, :, :, -self.num_classes:] *
         T.log(pred[:, :, :, -self.num_classes:])).sum(axis=3)[best_iou_idx.nonzero()])

    # penalize objectness score
    if rescore:
        cost += lambda_obj * T.mean(((pred[:, :, :, 4] - iou)**2)[best_iou_idx.nonzero()])
    else:
        cost += lambda_obj * T.mean(((pred[:, :, :, 4] - 1.)**2)[best_iou_idx.nonzero()])

    # flip all matched and penalize all un-matched objectness scores
    not_matched_idx = best_iou_idx.sum(axis=1) > 0
    not_matched_idx = bitwise_not(not_matched_idx)

    # penalize objectness score for non-matched boxes
    cost += lambda_noobj * T.mean((pred[:, 0, :, 4]**2)[not_matched_idx.nonzero()])

    return cost, constants
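# `bitwise_not` above is a module-level helper, not a Theano builtin;
# a sketch for 0/1 masks (T.invert would give two's-complement results
# such as ~1 == -2, which breaks .nonzero()-style masking):
def bitwise_not(x):
    return T.eq(x, 0)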
def logic_and(x, y):
    return T.bitwise_and(x, y)
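# T.bitwise_and works here because Theano comparisons return int8
# tensors; a usage sketch combining two conditions elementwise:
a = T.vector('a')
in_unit_interval = logic_and(T.gt(a, 0.), T.lt(a, 1.))
mask_fn = theano.function([a], in_unit_interval)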
def _get_cost(self, output, truth, S, B, C,
              rescore=False, lmbda_coord=5., lmbda_noobj=0.5, lmbda_obj=1.,
              min_overlap=1e-5, use_overlap=False):
    '''
    Calculates cost for multiple objects in a scene without for loops or
    scan (so it reduces the number of variables created in the theano
    computation graph). A cell is associated with a certain object if the
    iou of that cell and the object is higher than for any other ground
    truth object, and the rest of the objectness scores are pushed
    towards zero.

    Returns the cost and a list of variables that I don't want to
    backpropagate through.

    Params
    ------
    use_overlap : Yolo, as described in the original paper, assigns a
        ground truth label if the ground truth box overlaps at all with
        the cell. I've found that on images with many smaller objects,
        several objects may overlap a single cell, which causes a sort of
        averaged bounding box that looks pretty bad. So by using overlap,
        you don't assign a cell to a ground truth label unless it
        overlaps by some semi-significant amount.
    '''
    # calculate height/width of individual cell
    block_height, block_width = 1. / S[0], 1. / S[1]

    # get the offset of each cell
    offset_x, offset_y = meshgrid2D(T.arange(0, 1, block_width),
                                    T.arange(0, 1, block_height))

    # get indices for x,y,w,h,object-ness for easy access
    x_idx, y_idx = T.arange(0, 5 * B, 5), T.arange(1, 5 * B, 5)
    w_idx, h_idx = T.arange(2, 5 * B, 5), T.arange(3, 5 * B, 5)
    conf_idx = T.arange(4, 5 * B, 5)

    # Get position predictions with offsets.
    pred_x = (output[:, x_idx] + offset_x.dimshuffle('x', 'x', 0, 1)).dimshuffle(0, 'x', 1, 2, 3)
    pred_y = (output[:, y_idx] + offset_y.dimshuffle('x', 'x', 0, 1)).dimshuffle(0, 'x', 1, 2, 3)
    pred_w = output[:, w_idx].dimshuffle(0, 'x', 1, 2, 3)
    pred_h = output[:, h_idx].dimshuffle(0, 'x', 1, 2, 3)
    # pred_w, pred_h = T.exp(pred_w), T.exp(pred_h)
    pred_conf = output[:, conf_idx].dimshuffle(0, 'x', 1, 2, 3)
    pred_class = output[:, -C:].dimshuffle(0, 'x', 1, 2, 3)
    # pred_w, pred_h = T.maximum(pred_w, 0.), T.maximum(pred_h, 0.)

    x_idx, y_idx = T.arange(0, truth.shape[1], 4 + C), T.arange(1, truth.shape[1], 4 + C)
    w_idx, h_idx = T.arange(2, truth.shape[1], 4 + C), T.arange(3, truth.shape[1], 4 + C)
    class_idx, _ = theano.scan(
        lambda x: T.arange(x, x + C, 1),
        sequences=T.arange(4, truth.shape[1], 4 + C))
    truth_x, truth_y = truth[:, x_idx], truth[:, y_idx]
    truth_w, truth_h = truth[:, w_idx], truth[:, h_idx]
    truth_class = truth[:, class_idx]

    # Get intersection region bounding box coordinates
    xi = T.maximum(pred_x, truth_x.dimshuffle(0, 1, 'x', 'x', 'x'))
    xf = T.minimum(pred_x + pred_w, (truth_x + truth_w).dimshuffle(0, 1, 'x', 'x', 'x'))
    yi = T.maximum(pred_y, truth_y.dimshuffle(0, 1, 'x', 'x', 'x'))
    yf = T.minimum(pred_y + pred_h, (truth_y + truth_h).dimshuffle(0, 1, 'x', 'x', 'x'))
    w, h = T.maximum(xf - xi, 0.), T.maximum(yf - yi, 0.)

    # Calculate iou score for predicted boxes and truth
    isec = w * h
    union = (pred_w * pred_h) + (truth_w * truth_h).dimshuffle(0, 1, 'x', 'x', 'x') - isec
    iou = T.maximum(isec / union, 0.)

    # Calculate squared error for boxes which have 0 iou score
    squared_error = (pred_x - truth_x.dimshuffle(0, 1, 'x', 'x', 'x'))**2 + \
                    (pred_y - truth_y.dimshuffle(0, 1, 'x', 'x', 'x'))**2 + \
                    (pred_w - truth_w.dimshuffle(0, 1, 'x', 'x', 'x'))**2 + \
                    (pred_h - truth_h.dimshuffle(0, 1, 'x', 'x', 'x'))**2

    # Get index matrix representing max along the 1st dimension for the
    # iou score (reps 'responsible' box).
    maxval_idx, _ = meshgrid2D(T.arange(B), T.arange(truth.shape[0]))
    maxval_idx = maxval_idx.dimshuffle(0, 'x', 1, 'x', 'x')
    maxval_idx = T.repeat(T.repeat(maxval_idx, S[0], 3), S[1], 4)

    # determine which box is responsible: the box with the highest iou
    # score (if iou > 0), otherwise the one with the smallest squared error.
    greater_iou = T.eq(maxval_idx, iou.argmax(axis=2).dimshuffle(0, 1, 'x', 2, 3))
    smaller_se = T.eq(maxval_idx, squared_error.argmin(axis=2).dimshuffle(0, 1, 'x', 2, 3))
    box_is_resp = T.switch(iou.max(axis=2, keepdims=True) > 0, greater_iou, smaller_se)

    # Get matrix for the width/height of each cell
    width, height = T.ones(S) / S[1], T.ones(S) / S[0]
    width, height = width.dimshuffle('x', 'x', 0, 1), height.dimshuffle('x', 'x', 0, 1)
    offset_x, offset_y = offset_x.dimshuffle('x', 'x', 0, 1), offset_y.dimshuffle('x', 'x', 0, 1)

    # Get bounding box for intersection between CELL and ground truth box.
    xi = T.maximum(offset_x, truth_x.dimshuffle(0, 1, 'x', 'x'))
    xf = T.minimum(offset_x + width, (truth_x + truth_w).dimshuffle(0, 1, 'x', 'x'))
    yi = T.maximum(offset_y, truth_y.dimshuffle(0, 1, 'x', 'x'))
    yf = T.minimum(offset_y + height, (truth_y + truth_h).dimshuffle(0, 1, 'x', 'x'))
    w, h = T.maximum(xf - xi, 0.), T.maximum(yf - yi, 0.)

    # Calculate iou score for the cell.
    isec = w * h
    if not use_overlap:
        union = (width * height) + (truth_w * truth_h).dimshuffle(0, 1, 'x', 'x') - isec
        # normalize the iou to make more sense
        iou_cell = T.maximum(isec / union, 0.).dimshuffle(0, 1, 'x', 2, 3)
    else:
        iou_cell = T.maximum(isec / (width * height), 0.).dimshuffle(0, 1, 'x', 2, 3)

    maxval_idx, _ = meshgrid2D(T.arange(iou_cell.shape[1]), T.arange(iou_cell.shape[0]))
    maxval_idx = maxval_idx.dimshuffle(0, 1, 'x', 'x', 'x')
    maxval_idx = T.repeat(T.repeat(T.repeat(maxval_idx, B, 2), S[0], 3), S[1], 4)
    obj_for_cell = T.eq(maxval_idx, iou_cell.argmax(axis=1).dimshuffle(0, 'x', 1, 2, 3))

    # Get logical matrix representing minimum iou score for cell to be
    # considered overlapping ground truth.
    cell_intersects = (iou_cell > min_overlap)

    obj_in_cell_and_resp = T.bitwise_and(
        T.bitwise_and(cell_intersects, box_is_resp), obj_for_cell)
    conf_is_zero = T.bitwise_and(
        bitwise_not(T.bitwise_and(cell_intersects, box_is_resp)),
        obj_for_cell)
    conf_is_zero = conf_is_zero.sum(axis=1, keepdims=True)

    # repeat "cell overlaps" logical matrix for the number of classes.
    pred_class = T.repeat(pred_class, truth.shape[1] // (4 + C), axis=1)

    # repeat the ground truth for class probabilities for each cell.
    truth_class_rep = T.repeat(
        T.repeat(truth_class.dimshuffle(0, 1, 2, 'x', 'x'), S[0], axis=3),
        S[1], axis=4)
    cell_intersects = T.repeat(cell_intersects, C, axis=2)

    if not rescore:
        iou = T.ones_like(iou)

    cost = T.sum((pred_conf - iou)[obj_in_cell_and_resp.nonzero()]**2) + \
        lmbda_noobj * T.sum((pred_conf[conf_is_zero.nonzero()])**2) + \
        lmbda_coord * T.sum((pred_x - truth_x.dimshuffle(0, 1, 'x', 'x', 'x'))[obj_in_cell_and_resp.nonzero()]**2) + \
        lmbda_coord * T.sum((pred_y - truth_y.dimshuffle(0, 1, 'x', 'x', 'x'))[obj_in_cell_and_resp.nonzero()]**2) + \
        lmbda_coord * T.sum((safe_sqrt(pred_w) - safe_sqrt(truth_w.dimshuffle(0, 1, 'x', 'x', 'x')))[obj_in_cell_and_resp.nonzero()]**2) + \
        lmbda_coord * T.sum((safe_sqrt(pred_h) - safe_sqrt(truth_h.dimshuffle(0, 1, 'x', 'x', 'x')))[obj_in_cell_and_resp.nonzero()]**2) + \
        lmbda_obj * T.sum(((pred_class - truth_class_rep)[cell_intersects.nonzero()])**2)

    cost /= T.maximum(1., truth.shape[0])
    return cost, [iou]
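# Sketches of the helpers `_get_cost` assumes (assumptions, not the
# original definitions): a 2-D meshgrid analogous to numpy.meshgrid,
# and a sqrt clipped away from zero so its gradient stays finite.
def meshgrid2D(x, y):
    # both outputs have shape (y.shape[0], x.shape[0])
    xx = T.repeat(x.dimshuffle('x', 0), y.shape[0], axis=0)
    yy = T.repeat(y.dimshuffle(0, 'x'), x.shape[0], axis=1)
    return xx, yy

def safe_sqrt(x, eps=1e-8):
    return T.sqrt(T.maximum(x, eps))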