def fztloss(f, pVecs, nVecs):
    """Tensorized cost function from the Fast Zero-Shot Learning paper.

    A score is the dot product between a tag embedding and the network
    output ``f``.  Every (negative tag, positive tag) pair contributes
    ``log(1 + exp(negative_score - positive_score))`` and all contributions
    are summed into one scalar.

    Args:
        f: The output from the network, a tensor of shape
            (# images, word embedding size).
        pVecs: The vector embeddings of the ground truth tags, a tensor of
            shape (# images, # positive tags, word embedding size).
        nVecs: The vector embeddings of negatively sampled tags, a tensor of
            shape (# images, # negative samples, word embedding size).

    Returns:
        Scalar tensor representing the batch cost.

    NOTE(review): ``f`` is broadcast against ``pVecs``/``nVecs`` without an
    ``expand_dims`` and the reduced scores are transposed with ``[1, 0]``
    afterwards, which only lines up if the tag axis comes first, i.e.
    (# tags, # images, embedding size) -- confirm the layout with the caller.
    """
    # Dot products between every tag embedding and the network output.
    pos_scores = tf.reduce_sum(tf.mul(pVecs, f), reduction_indices=2)
    neg_scores = tf.reduce_sum(tf.mul(nVecs, f), reduction_indices=2)
    pos_scores = tf.transpose(pos_scores, [1, 0])
    neg_scores = tf.transpose(neg_scores, [1, 0])

    # Tile both score matrices to a common 3-D shape so that entry
    # [a, n, p] pairs negative score n with positive score p.
    neg_tiled = tf.tile(
        tf.expand_dims(neg_scores, -1),
        [1, 1, tf.shape(pos_scores)[1]])
    pos_tiled = tf.tile(
        tf.transpose(tf.expand_dims(pos_scores, -1), [0, 2, 1]),
        [1, tf.shape(neg_scores)[1], 1])
    differences = tf.sub(neg_tiled, pos_tiled)

    # softplus(x) == log(1 + exp(x)) but does not overflow to inf when a
    # negative tag outscores a positive one by a large margin.
    return tf.reduce_sum(tf.nn.softplus(differences))
def make_padded_blocked_matrix(matrix, block_size):
    """Splits a matrix into equally wide column blocks, zero-padding the last.

    Example: a [64,127] matrix with block_size=16 becomes an [8,64,16]
    tensor.  Concatenating the 8 sub-matrices left to right gives back the
    original matrix plus one trailing padding column in the 8th block.

    Args:
        matrix: The matrix to convert.
        block_size: The number of columns per block.

    Returns:
        Padded column-blocked matrix.
    """
    dims = tf.shape(matrix)
    num_rows, num_columns = dims[0], dims[1]

    # Columns to append so the width becomes a multiple of block_size
    # (zero when it already is one).
    remainder = num_columns % block_size
    padding_size = (block_size - remainder) % block_size
    num_blocks = (num_columns + padding_size) // block_size

    # tf.split()/tf.stack() would need a statically-known number of splits,
    # so the blocks are carved out with transpose + reshape instead.
    # Shapes below follow the docstring example.
    padded = tf.pad(matrix, [[0, 0], [0, padding_size]])  # [64,128]
    column_major = tf.transpose(padded)  # [128,64]
    blocked = tf.reshape(
        column_major, [num_blocks, block_size, num_rows])  # [8,16,64]
    return tf.transpose(blocked, [0, 2, 1])  # [8,64,16]
def update_centers(self, img_dataset):
    """Re-estimates the centers ``self.C`` by regularized least squares.

    With ``h = self.img_b_all`` (codes) and ``U = self.img_output_all``
    (outputs) the update that is evaluated is

        C = (h^T h + 0.001 * I)^{-1} (h^T U)

    Rows of the new solution whose squared norm is below 1e-8 are replaced
    by the corresponding rows of the previous ``self.C`` before the final
    value is assigned back.

    Args:
        img_dataset: object exposing ``output`` and ``codes``, fed into the
            ``img_output_all`` / ``img_b_all`` placeholders.
    """
    previous_centers = self.sess.run(self.C)

    codes = self.img_b_all
    outputs = self.img_output_all
    n_centers = self.subcenter_num * self.subspace_num

    # Small diagonal term added to h^T h before it is inverted.
    ridge = tf.constant(np.eye(n_centers, dtype=np.float32) * 0.001)
    cross = tf.matmul(tf.transpose(codes), outputs)
    gram = tf.add(tf.matmul(tf.transpose(codes), codes), ridge)
    solution = tf.matmul(tf.matrix_inverse(gram), cross)
    assign_solution = self.C.assign(solution)

    feed = {
        self.img_output_all: img_dataset.output,
        self.img_b_all: img_dataset.codes,
    }
    new_centers = self.sess.run(assign_solution, feed_dict=feed)

    # Keep the old value for centers that collapsed to (almost) zero.
    squared_norms = np.sum(np.square(new_centers), axis=1)
    collapsed = np.where(squared_norms < 1e-8)
    new_centers[collapsed, :] = previous_centers[collapsed, :]
    self.sess.run(self.C.assign(new_centers))
def __init__(
        self,
        layer=None,
        act=tf.identity,
        epsilon=1e-5,
        scale_init=tf.constant_initializer(1.0),
        offset_init=tf.constant_initializer(0.0),
        G=32,
        name='group_norm',
):
    """Builds a group-normalization layer on top of ``layer``.

    The input is transposed from [N, H, W, C] to [N, C, H, W], split into
    ``G`` channel groups, normalized with the per-group mean/variance and
    then scaled and shifted by the learned ``scale`` / ``offset`` variables.

    Args:
        layer: previous layer; its ``outputs`` is the tensor to normalize.
        act: activation applied to the normalized output.
        epsilon: constant added to the variance before the square root.
        scale_init: initializer of the per-channel ``scale`` variable.
        offset_init: initializer of the per-channel ``offset`` variable.
        G: requested number of groups; capped at the number of channels.
        name: layer name, also used as the variable scope.
    """
    Layer.__init__(self, name=name)
    self.inputs = layer.outputs
    print(" [TL] GroupNormLayer %s: epsilon:%f act:%s" % (self.name, epsilon, act.__name__))

    inputs_shape = get_shape(layer.outputs)
    height, width, channels = inputs_shape[1], inputs_shape[2], inputs_shape[-1]
    # Use Python's min (not tf.minimum) so the group count stays a plain
    # int and the reshapes below keep their static shape information.
    G = min(G, channels)

    # [N, H, W, C] to [N, C, H, W]
    temp_input = tf.transpose(self.inputs, [0, 3, 1, 2])
    # -1 lets TensorFlow infer the batch size, so a placeholder with an
    # unknown (None) batch dimension is accepted as well.
    temp_input = tf.reshape(
        temp_input, [-1, G, channels // G, height, width],
        name='group_reshape1')

    with tf.variable_scope(name) as vs:
        mean, var = tf.nn.moments(temp_input, [2, 3, 4], keep_dims=True)
        scale = tf.get_variable(
            'scale', shape=[1, channels, 1, 1],
            initializer=scale_init, dtype=D_TYPE)
        offset = tf.get_variable(
            'offset', shape=[1, channels, 1, 1],
            initializer=offset_init, dtype=D_TYPE)

        temp_input = (temp_input - mean) / tf.sqrt(var + epsilon)
        temp_input = tf.reshape(
            temp_input, shape=[-1, channels, height, width],
            name='group_reshape2')

        self.outputs = scale * temp_input + offset
        # Back to [N, H, W, C].
        self.outputs = tf.transpose(self.outputs, [0, 2, 3, 1])
        self.outputs = act(self.outputs)

        variables = tf.get_collection(TF_GRAPHKEYS_VARIABLES, scope=vs.name)

    self.all_layers = list(layer.all_layers)
    self.all_params = list(layer.all_params)
    self.all_drop = dict(layer.all_drop)
    self.all_layers.extend([self.outputs])
    self.all_params.extend(variables)
def build_predict(self, Xnew, full_cov=False):
    """Builds the predictive mean and variance at the points ``Xnew``.

    Computes p(F* | Y), where F* are points on the GP at ``Xnew`` and Y are
    noisy observations at X.

    Args:
        Xnew: data matrix of points at which to predict.
        full_cov: when true the full covariance between the new points is
            built, otherwise only the marginal variances.

    Returns:
        Tuple ``(fmean, fvar)``; ``fvar`` is tiled once per column of
        ``self.Y``.
    """
    cross_cov = self.kern.K(self.X, Xnew)
    noisy_cov = self.kern.K(self.X) + eye(self.num_data) * self.likelihood.variance
    chol = tf.cholesky(noisy_cov)

    # Triangular solves against the Cholesky factor of the noisy kernel.
    whitened_cross = tf.matrix_triangular_solve(chol, cross_cov, lower=True)
    whitened_resid = tf.matrix_triangular_solve(
        chol, self.Y - self.mean_function(self.X))
    fmean = (tf.matmul(tf.transpose(whitened_cross), whitened_resid)
             + self.mean_function(Xnew))

    if not full_cov:
        marginal = (self.kern.Kdiag(Xnew)
                    - tf.reduce_sum(tf.square(whitened_cross), 0))
        fvar = tf.tile(tf.reshape(marginal, (-1, 1)), [1, self.Y.shape[1]])
        return fmean, fvar

    covariance = (self.kern.K(Xnew)
                  - tf.matmul(tf.transpose(whitened_cross), whitened_cross))
    multiples = tf.pack([1, 1, tf.shape(self.Y)[1]])
    fvar = tf.tile(tf.expand_dims(covariance, 2), multiples)
    return fmean, fvar
def sample_with_temperature(logits, temperature):
    """Picks one pitch per position, by argmax or by random sampling.

    Args:
        logits: a Tensor of shape (batch, time, pitch, instrument).
        temperature: a float; 0.0 selects the argmax, any other value
            samples from the logits divided by the temperature.

    Returns:
        a Tensor of the same shape, with one_hots on the pitch dimension.
    """
    # Move the pitch axis last: (batch, time, instrument, pitch).
    logits = tf.transpose(logits, [0, 1, 3, 2])
    pitch_range = tf.shape(logits)[-1]

    def _greedy():
        return tf.argmax(tf.nn.softmax(logits), -1)

    def _sampled():
        # Sampling divides by the temperature, so it must be positive.
        positive = tf.assert_greater(temperature, 0.0)
        with tf.control_dependencies([positive]):
            guarded = tf.identity(logits)
        flat = tf.reshape(guarded, [-1, tf.shape(guarded)[-1]]) / temperature
        draws = tf.multinomial(flat, 1)
        leading_dims = tf.shape(guarded)[:guarded.get_shape().ndims - 1]
        return tf.reshape(draws, leading_dims)

    choices = tf.cond(tf.equal(temperature, 0.0), _greedy, _sampled)
    samples_onehot = tf.one_hot(choices, pitch_range)
    # Restore the original (batch, time, pitch, instrument) layout.
    return tf.transpose(samples_onehot, [0, 1, 3, 2])
def _define_distance_to_clusters(self, data):
    """Defines the Mahalanobis distance to the assigned Gaussian.

    For every shard the distance to each class is built and collected in
    ``self._all_scores`` (one column per class); ``self._scores`` then picks,
    for each example, the column of its assigned class.

    Args:
        data: iterable of input shards.
    """
    # TODO(xavigonzalvo): reuse (input - mean) * cov^-1 * (input -
    # mean) from log probability function.
    self._all_scores = []
    for shard in data:
        expanded = tf.expand_dims(shard, 0)
        class_scores = []
        for c in xrange(self._num_classes):
            if self._covariance_type == FULL_COVARIANCE:
                cov = self._covs[c, :, :]
            elif self._covariance_type == DIAG_COVARIANCE:
                cov = tf.diag(self._covs[c, :])
            precision = tf.matrix_inverse(cov + self._min_var)
            # One copy of the inverse covariance per example so that the
            # products below can run as batched matmuls.
            tiled_precision = tf.tile(
                tf.expand_dims(precision, 0),
                tf.pack([self._num_examples, 1, 1]))
            centered = tf.transpose(
                expanded - self._means[c, :, :], perm=[1, 0, 2])
            left = tf.batch_matmul(centered, tiled_precision)
            squared = tf.batch_matmul(
                left, tf.transpose(centered, perm=[0, 2, 1]))
            class_scores.append(tf.sqrt(squared))
        shard_scores = tf.reshape(
            tf.concat(1, class_scores),
            tf.pack([self._num_examples, self._num_classes]))
        self._all_scores.append(shard_scores)

    # Distance to the associated class.
    self._all_scores = tf.concat(0, self._all_scores)
    assignments = tf.concat(0, self.assignments())
    rows = tf.to_int64(tf.range(0, self._num_examples))
    row_class_pairs = tf.concat(
        1, [tf.expand_dims(rows, 1), tf.expand_dims(assignments, 1)])
    self._scores = tf.gather_nd(self._all_scores, row_class_pairs)
def tf_tensordot(tf1, tf2, axes):
    """Contracts two tensors over the given axes via a single matmul.

    Both tensors are transposed so the contracted axes are adjacent, flattened
    to matrices, multiplied, and the product is reshaped to the free axes of
    ``tf1`` followed by the free axes of ``tf2``.

    Args:
        tf1: first tensor; all dimensions must be statically known.
        tf2: second tensor; all dimensions must be statically known.
        axes: pair ``(axes_of_tf1, axes_of_tf2)`` of equally long sequences
            (lists or tuples) of non-negative axis indices to contract.

    Returns:
        Tensor of shape ``free_shape(tf1) + free_shape(tf2)``.
    """
    shp1 = tf1.get_shape()
    shp2 = tf2.get_shape()

    # Contracted ("inner") axes, coerced to lists so tuples are accepted.
    inner1, inner2 = (list(group) for group in axes)
    # Free ("external") axes in ascending order; a comprehension keeps the
    # order deterministic instead of relying on set iteration order.
    free1 = [axis for axis in range(len(shp1)) if axis not in inner1]
    free2 = [axis for axis in range(len(shp2)) if axis not in inner2]

    # get_shape() returns Dimension objects; .value is the plain int.
    free_shape1 = [shp1[axis].value for axis in free1]
    inner_shape1 = [shp1[axis].value for axis in inner1]
    free_shape2 = [shp2[axis].value for axis in free2]
    inner_shape2 = [shp2[axis].value for axis in inner2]

    # numpy.prod([]) is the float 1.0, which is not a valid reshape size, so
    # force an integer product and convert to a Python int.
    free_size1 = int(numpy.prod(free_shape1, dtype=numpy.int64))
    inner_size1 = int(numpy.prod(inner_shape1, dtype=numpy.int64))
    free_size2 = int(numpy.prod(free_shape2, dtype=numpy.int64))
    inner_size2 = int(numpy.prod(inner_shape2, dtype=numpy.int64))

    mat1 = tf.reshape(
        tf.transpose(tf1, perm=free1 + inner1), [free_size1, inner_size1])
    mat2 = tf.reshape(
        tf.transpose(tf2, perm=inner2 + free2), [inner_size2, free_size2])
    return tf.reshape(tf.matmul(mat1, mat2), free_shape1 + free_shape2)
def body(self, features):
    """Runs ``resnet_v2`` on ``features["inputs"]``.

    When ``hparams.use_nchw`` is set the input is transposed to NCHW before
    the network and the output is transposed back to NHWC afterwards.

    Args:
        features: dict holding the input tensor under the key "inputs".

    Returns:
        The network output.
    """
    hp = self.hparams
    block_fns = {
        "residual": residual_block,
        "bottleneck": bottleneck_block,
    }
    assert hp.block_fn in block_fns

    net_input = features["inputs"]
    data_format = "channels_last"
    if hp.use_nchw:
        # Convert from channels_last (NHWC) to channels_first (NCHW). This
        # provides a large performance boost on GPU.
        net_input = tf.transpose(net_input, [0, 3, 1, 2])
        data_format = "channels_first"

    training = hp.mode == tf.estimator.ModeKeys.TRAIN
    out = resnet_v2(
        net_input,
        block_fns[hp.block_fn],
        hp.layer_sizes,
        hp.filter_sizes,
        data_format,
        is_training=training)

    if hp.use_nchw:
        # Back to NHWC.
        out = tf.transpose(out, [0, 2, 3, 1])
    return out
def chebyshev2(self, x, L, Fout, K):
    """Filtering with Chebyshev interpolation, basis computed in numpy.

    The Chebyshev basis is evaluated by ``graph.chebyshev`` through
    ``tf.py_func``; the result is then mixed by a learned weight matrix.

    Data: x of size N x M x F
        N: number of signals
        M: number of vertices
        F: number of features per signal per vertex

    Args:
        x: input tensor with fully defined static shape N x M x Fin.
        L: graph Laplacian; it is copied, so the caller's matrix is untouched.
        Fout: number of output features.
        K: polynomial order.

    Returns:
        Tensor of shape N x M x Fout.
    """
    N, M, Fin = (int(dim) for dim in x.get_shape())

    # Rescale Laplacian. Copy to not modify the shared L.
    laplacian = graph.rescale_L(scipy.sparse.csr_matrix(L), lmax=2)

    def _basis(signal):
        return graph.chebyshev(laplacian, signal, K)

    # Transform to Chebyshev basis.
    signal = tf.transpose(x, perm=[1, 2, 0])  # M x Fin x N
    signal = tf.reshape(signal, [M, Fin * N])  # M x Fin*N
    basis = tf.py_func(_basis, [signal], [tf.float32])[0]  # K x M x Fin*N
    basis = tf.reshape(basis, [K, M, Fin, N])  # K x M x Fin x N
    basis = tf.transpose(basis, perm=[3, 1, 2, 0])  # N x M x Fin x K
    basis = tf.reshape(basis, [N * M, Fin * K])  # N*M x Fin*K

    # Filter: Fin*Fout filters of order K, i.e. one filterbank per feature.
    W = self._weight_variable([Fin * K, Fout], regularization=False)
    filtered = tf.matmul(basis, W)  # N*M x Fout
    return tf.reshape(filtered, [N, M, Fout])  # N x M x Fout
def chebyshev5(self, x, L, Fout, K, regularization=False):
    """Chebyshev filtering with the recurrence evaluated by TF sparse ops.

    The basis is built as ``T_0 = x``, ``T_1 = L x`` and
    ``T_k = 2 L T_{k-1} - T_{k-2}``, then mixed by a learned weight matrix.

    Args:
        x: input tensor with fully defined static shape N x M x Fin.
        L: graph Laplacian; it is copied, so the caller's matrix is untouched.
        Fout: number of output features.
        K: polynomial order (number of basis terms).
        regularization: forwarded to ``self._weight_variable``.

    Returns:
        Tensor of shape N x M x Fout.
    """
    N, M, Fin = (int(dim) for dim in x.get_shape())

    # Rescale Laplacian and store as a TF sparse tensor. Copy to not modify
    # the shared L.
    L = scipy.sparse.csr_matrix(L)
    L = graph.rescale_L(L, lmax=2)
    L = L.tocoo()
    indices = np.column_stack((L.row, L.col))
    L = tf.SparseTensor(indices, L.data, L.shape)
    L = tf.sparse_reorder(L)

    def _append_term(stack, term):
        term = tf.expand_dims(term, 0)  # 1 x M x Fin*N
        return tf.concat(0, [stack, term])  # K x M x Fin*N

    # Transform to Chebyshev basis.
    x0 = tf.transpose(x, perm=[1, 2, 0])  # M x Fin x N
    x0 = tf.reshape(x0, [M, Fin * N])  # M x Fin*N
    x = tf.expand_dims(x0, 0)  # 1 x M x Fin*N
    if K > 1:
        x1 = tf.sparse_tensor_dense_matmul(L, x0)
        x = _append_term(x, x1)
    for _ in range(2, K):
        x2 = 2 * tf.sparse_tensor_dense_matmul(L, x1) - x0  # M x Fin*N
        x = _append_term(x, x2)
        x0, x1 = x1, x2

    x = tf.reshape(x, [K, M, Fin, N])  # K x M x Fin x N
    x = tf.transpose(x, perm=[3, 1, 2, 0])  # N x M x Fin x K
    x = tf.reshape(x, [N * M, Fin * K])  # N*M x Fin*K

    # Filter: Fin*Fout filters of order K, i.e. one filterbank per feature
    # pair.
    W = self._weight_variable([Fin * K, Fout], regularization=regularization)
    x = tf.matmul(x, W)  # N*M x Fout
    return tf.reshape(x, [N, M, Fout])  # N x M x Fout
def bboxes_intersection(bbox_ref, bboxes, name=None):
    """Relative intersection of boxes with one or more reference boxes.

    The score is the intersection area divided by the area of the box taken
    from ``bboxes``.  Coordinates are read in the order
    (ymin, xmin, ymax, xmax).

    Args:
        bbox_ref: (N, 4) or (4,) Tensor with reference bounding box(es).
        bboxes: (N, 4) Tensor, collection of bounding boxes.
        name: optional name scope.

    Return:
        (N,) Tensor with relative intersection.
    """
    with tf.name_scope(name, 'bboxes_intersection'):
        # Transposed, each coordinate becomes one row.
        bboxes = tf.transpose(bboxes)
        bbox_ref = tf.transpose(bbox_ref)
        ymin, xmin, ymax, xmax = bboxes[0], bboxes[1], bboxes[2], bboxes[3]

        # Corners of the intersection rectangle.
        top = tf.maximum(ymin, bbox_ref[0])
        left = tf.maximum(xmin, bbox_ref[1])
        bottom = tf.minimum(ymax, bbox_ref[2])
        right = tf.minimum(xmax, bbox_ref[3])

        # Negative extents mean no overlap; clamp them to zero.
        inter_height = tf.maximum(bottom - top, 0.)
        inter_width = tf.maximum(right - left, 0.)

        inter_vol = inter_height * inter_width
        bboxes_vol = (ymax - ymin) * (xmax - xmin)
        return tfe_math.safe_divide(inter_vol, bboxes_vol, 'intersection')
def soft_alignment(U_AP, raw_question_rep, raw_answer_rep, tokens_question_non_zero, tokens_answer_non_zero):
    """Calculate the AP soft-alignment matrix (in a batch-friendly fashion).

    Computes ``G = tanh(Q U A^T)`` per batch item and then pushes every entry
    that belongs to a padded question or answer position below -1.

    :param U_AP: The AP similarity matrix (to be learned)
    :param raw_question_rep: question representations, indexed as a 3-D
        tensor
    :param raw_answer_rep: answer representations, indexed as a 3-D tensor
    :param tokens_question_non_zero: mask for the question tokens
        (presumably 1 for real tokens and 0 for padding -- confirm)
    :param tokens_answer_non_zero: mask for the answer tokens (same
        assumption)
    :return: G with -2 added once per padded question/answer position
    """
    question_dims = tf.shape(raw_question_rep)
    question_len, question_width = question_dims[1], question_dims[2]

    # A 3-D tensor cannot be multiplied with a 2-D matrix directly, so the
    # question is flattened to 2-D for the product with U_AP and restored
    # afterwards (solution 2 on
    # http://stackoverflow.com/questions/38235555/tensorflow-matmul-of-input-matrix-with-batch-data).
    question_flat = tf.reshape(raw_question_rep, [-1, question_width])
    QU = tf.reshape(
        tf.matmul(question_flat, U_AP), [-1, question_len, question_width])
    QUA = tf.batch_matmul(QU, tf.transpose(raw_answer_rep, [0, 2, 1]))
    G = tf.nn.tanh(QUA)

    # (mask - 1) * 2 is -2 where the mask is 0 and 0 where it is 1; tanh is
    # bounded below by -1, so the shifted entries end up below the minimum
    # of G.  The question penalty is laid out as a column, the answer
    # penalty as a row, so both broadcast over G.
    question_penalty = tf.reshape(
        (tokens_question_non_zero - 1) * 2,
        [-1, tf.shape(tokens_question_non_zero)[1], 1])
    answer_penalty = tf.reshape(
        (tokens_answer_non_zero - 1) * 2,
        [-1, 1, tf.shape(tokens_answer_non_zero)[1]])

    # Values below -1 mark fields related to zero-padded token positions.
    return G + question_penalty + answer_penalty
def bboxes_clip(bbox_ref, bboxes, scope=None):
    """Clip bounding boxes to a reference box.

    Batch-compatible if the first dimension of `bbox_ref` and `bboxes`
    can be broadcasted.  A dictionary input is clipped value by value.

    Args:
        bbox_ref: Reference bounding box. Nx4 or 4 shaped-Tensor;
        bboxes: Bounding boxes to clip. Nx4 or 4 shaped-Tensor or dictionary.
        scope: optional name scope.

    Return:
        Clipped bboxes (a dictionary with the same keys for dictionary input).
    """
    # Dictionary input: recurse into every value.
    if isinstance(bboxes, dict):
        with tf.name_scope(scope, 'bboxes_clip_dict'):
            return {
                key: bboxes_clip(bbox_ref, value)
                for key, value in bboxes.items()
            }

    # Tensor input.
    with tf.name_scope(scope, 'bboxes_clip'):
        # Transposed boxes expose one coordinate per row, which makes the
        # broadcasting against the reference box straightforward.
        bbox_ref = tf.transpose(bbox_ref)
        bboxes = tf.transpose(bboxes)

        # Intersection of the boxes with the reference box.
        ymin = tf.maximum(bboxes[0], bbox_ref[0])
        xmin = tf.maximum(bboxes[1], bbox_ref[1])
        ymax = tf.minimum(bboxes[2], bbox_ref[2])
        xmax = tf.minimum(bboxes[3], bbox_ref[3])

        # Without an intersection min would exceed max; collapse such boxes
        # to empty ones.
        ymin = tf.minimum(ymin, ymax)
        xmin = tf.minimum(xmin, xmax)

        clipped = tf.stack([ymin, xmin, ymax, xmax], axis=0)
        return tf.transpose(clipped)
def bboxes_jaccard(bbox_ref, bboxes, name=None):
    """Jaccard score (intersection over union) against reference box(es).

    Coordinates are read in the order (ymin, xmin, ymax, xmax).

    Args:
        bbox_ref: (N, 4) or (4,) Tensor with reference bounding box(es).
        bboxes: (N, 4) Tensor, collection of bounding boxes.
        name: optional name scope.

    Return:
        (N,) Tensor with Jaccard scores.
    """
    with tf.name_scope(name, 'bboxes_jaccard'):
        # Transposed, each coordinate becomes one row.
        bboxes = tf.transpose(bboxes)
        bbox_ref = tf.transpose(bbox_ref)
        ymin, xmin, ymax, xmax = bboxes[0], bboxes[1], bboxes[2], bboxes[3]
        ref_ymin, ref_xmin = bbox_ref[0], bbox_ref[1]
        ref_ymax, ref_xmax = bbox_ref[2], bbox_ref[3]

        # Intersection rectangle; negative extents are clamped to zero.
        inter_height = tf.maximum(
            tf.minimum(ymax, ref_ymax) - tf.maximum(ymin, ref_ymin), 0.)
        inter_width = tf.maximum(
            tf.minimum(xmax, ref_xmax) - tf.maximum(xmin, ref_xmin), 0.)

        # Union = both areas minus the part counted twice.
        inter_vol = inter_height * inter_width
        union_vol = -inter_vol \
            + (ymax - ymin) * (xmax - xmin) \
            + (ref_ymax - ref_ymin) * (ref_xmax - ref_xmin)
        return tfe_math.safe_divide(inter_vol, union_vol, 'jaccard')
def opt_energy_env_1site(iso_012, h_op_1site, h_mpo_2site, state_1site):
    """Builds the 1-site energy environment for the isometry.

    The Hamiltonian terms are ascended with the isometry and with its copy
    whose last two indices are swapped, the two results are brought to a
    common index order and summed, and the sum is contracted with
    ``state_1site``.

    Args:
        iso_012: isometry tensor with three indices.
        h_op_1site: 1-site Hamiltonian operator.
        h_mpo_2site: 2-site Hamiltonian in MPO form.
        state_1site: 1-site state contracted with the ascended terms.

    Returns:
        The environment tensor.
    """
    swap_last_two = (0, 2, 1)
    iso_021 = tf.transpose(iso_012, swap_last_two)
    terms_012, terms_021 = _ascend_op_to_1site_partial(
        h_op_1site, h_mpo_2site, iso_012, iso_021)
    # Undo the index swap on the second set of terms before adding.
    combined = terms_012 + tf.transpose(terms_021, swap_last_two)
    return tensornetwork.ncon(
        [state_1site, combined], [(0, -1), (0, -2, -3)])
def runFiniteDifferences(self, shapes, dtypes=(tf.float32, tf.float64), scalarTest=False):
    """Checks the Cholesky gradient against finite differences.

    For every shape, dtype and batched/unbatched variant a matrix of the
    form ``A A^T / rows`` is built from random data, decomposed, and the
    gradient error is asserted to stay below 1e-5 (float64) or 3e-3
    (other dtypes).

    Args:
        shapes: iterable of ``(rows, cols)`` shapes of the random factor.
        dtypes: dtypes to test.
        scalarTest: when true the differentiated input is a single random
            scalar that scales a fixed random matrix and the output is
            reduced to its mean.
    """
    with self.test_session(use_gpu=False):
        for shape in shapes:
            rows, cols = shape[0], shape[1]
            for batch in (False, True):
                for dtype in dtypes:
                    if scalarTest:
                        # This is designed to be a faster test for larger
                        # matrices.
                        x = tf.constant(np.random.randn(), dtype)
                        base = tf.constant(np.random.randn(rows, cols), dtype)
                        factor = tf.mul(base, x)
                    else:
                        x = tf.constant(np.random.randn(rows, cols), dtype)
                        factor = x
                    tensor = tf.matmul(factor, tf.transpose(factor)) / rows

                    # Inner-most matrices in tensor are positive definite.
                    if batch:
                        tensor = tf.tile(tf.expand_dims(tensor, 0), [4, 1, 1])
                        op = tf.batch_cholesky
                    else:
                        op = tf.cholesky

                    y = tf.reduce_mean(op(tensor)) if scalarTest else op(tensor)
                    error = tf.test.compute_gradient_error(
                        x, x._shape_as_list(), y, y._shape_as_list())
                    tf.logging.info("error = %f", error)

                    tolerance = 1e-5 if dtype == tf.float64 else 3e-3
                    self.assertLess(error, tolerance)
def _from_dilation(self, inputs, crop):
    '''Remove paddings and reshape to 1d signal.

    Used after 1D dilation convolution.

    Args:
        inputs: tensor laid out as
            [batch_size * dilation_rate, width_pad / dilation_rate, channels]
            when ``self.paddings`` is set, otherwise a channels-last tensor.
        crop: number of leading positions dropped from the width axis
            (only applied when ``self.paddings`` is set).

    Returns:
        Channels-first tensor, [batch_size, channels, width].
    '''
    if self.paddings is None:
        # Simple convolution: only switch back to channels first.
        return tf.transpose(inputs, [0, 2, 1])

    # Dilated convolution.
    assert isinstance(self.paddings, int)
    # inputs: [batch_size * dilation_rate, width_pad / dilation_rate, channels]
    inputs_shape = tf.shape(inputs)
    width_pad = inputs_shape[1] * self.dilation_rate
    channels = inputs_shape[-1]
    # -1 refers to batch_size, which the reshape infers on its own; no
    # explicit batch-size op is needed.
    new_shape = (width_pad, -1, channels)

    # [width_pad / dilation_rate, batch_size * dilation_rate, channels]
    inputs_transposed = tf.transpose(inputs, [1, 0, 2])
    # [width_pad, batch_size, channels]
    inputs_reshaped = tf.reshape(inputs_transposed, new_shape)
    # [batch_size, channels, width_pad]
    outputs = tf.transpose(inputs_reshaped, [1, 2, 0])
    # [batch_size, channels, width]
    return tf.slice(outputs, [0, 0, crop], [-1, -1, -1])
def SubpixelConv2D(*args, **kwargs):
    """Convolution followed by a 2x depth-to-space upsampling.

    The requested ``output_dim`` (a required keyword argument) is multiplied
    by four for the convolution; ``tf.depth_to_space`` with block size 2 then
    folds those channels back into space.  The convolution output is
    transposed from NCHW to NHWC around the depth-to-space op and back.

    Args:
        *args: forwarded to ``lib.ops.conv2d.Conv2D``.
        **kwargs: forwarded to ``lib.ops.conv2d.Conv2D``; must contain
            ``output_dim``.

    Returns:
        The upsampled tensor in NCHW layout.
    """
    kwargs['output_dim'] = 4 * kwargs['output_dim']
    conv_nchw = lib.ops.conv2d.Conv2D(*args, **kwargs)
    conv_nhwc = tf.transpose(conv_nchw, [0, 2, 3, 1])
    upsampled_nhwc = tf.depth_to_space(conv_nhwc, 2)
    return tf.transpose(upsampled_nhwc, [0, 3, 1, 2])
def inference(self, x):
    """Unrolls the recurrent model for ``self.n_steps`` steps.

    At every step the input is read at the current location (``rho``),
    encoded (``f_g``), fed to the recurrent update (``f_h``) and a new mean
    location is emitted (``f_l``).  The next location is drawn from a normal
    distribution around that mean with standard deviation ``self.sigma``:

        loc_t = loc_mean_t + normal(0, sigma)

    Args:
        x: model input.

    Returns:
        Tuple ``(loc_mean_ts, loc_ts, h_ts, prob, pred, loc_t)``: the
        per-step mean locations, sampled locations and hidden states (each
        stacked and transposed so the step axis is second), the classifier
        output for the last hidden state, its argmax, and the last sampled
        location.
    """
    location = self.loc_init
    hidden = self.h_init
    mean_history, location_history, hidden_history = [], [], []

    for _ in xrange(self.n_steps):
        glimpse = self.rho(location, x)
        encoded = self.f_g(glimpse, location)
        hidden = self.f_h(hidden, encoded)
        location_mean = self.f_l(hidden)
        location = tf.random_normal(location_mean.get_shape(),
                                    mean = location_mean,
                                    stddev = self.sigma)
        mean_history.append(location_mean)
        location_history.append(location)
        hidden_history.append(hidden)

    # Classify from the final hidden state.
    prob = tf.nn.bias_add(
        tf.matmul(hidden, self.w_classifier), self.b_classifier)
    pred = tf.argmax(prob, 1)

    # Stack along a new leading step axis, then swap it with the next axis.
    loc_mean_ts = tf.transpose(tf.pack(mean_history), perm = [1, 0, 2])
    loc_ts = tf.transpose(tf.pack(location_history), perm = [1, 0, 2])
    h_ts = tf.transpose(tf.pack(hidden_history), perm = [1, 0, 2])
    return loc_mean_ts, loc_ts, h_ts, prob, pred, location
def _to_dilation(self, inputs):
    '''Pad and reshape inputs by dilation rate.

    Used to perform 1D dilation convolution.

    Args:
        inputs: channels-first tensor, [batch_size, channels, width].

    Returns:
        [batch_size * dilation_rate, width_pad / dilation_rate, channels]
        when ``self.paddings`` is set, otherwise the channels-last
        transpose of ``inputs``.
    '''
    if self.paddings is None:
        # Simple channels first convolution.
        return tf.transpose(inputs, [0, 2, 1])

    # Dilated convolution.
    assert isinstance(self.paddings, int)
    # Inputs are channels first; pad the front of the width axis.
    padded = tf.pad(
        inputs, [[0, 0], [0, 0], [self.paddings, 0]], "CONSTANT")
    padded_dims = tf.shape(padded)
    channels, width_pad = padded_dims[1], padded_dims[-1]
    # -1 refers to batch_size * dilation_rate.
    target_shape = (width_pad // self.dilation_rate, -1, channels)

    # [width_pad, batch_size, channels]
    width_major = tf.transpose(padded, [2, 0, 1])
    # [width_pad / dilation_rate, batch_size * dilation_rate, channels]
    dilated = tf.reshape(width_major, target_shape)
    # [batch_size * dilation_rate, width_pad / dilation_rate, channels]
    return tf.transpose(dilated, [1, 0, 2])
def cross_entropy(u, label_u, alpha=0.5, normed=False):
    """Class-balanced pairwise cross-entropy between all rows of ``u``.

    Two rows count as similar (``s = 1``) when the inner product of their
    label vectors, clipped to [0, 1], is 1.  With ``ip`` the pairwise inner
    product of ``u`` the loss is the mean of

        (log(1 + exp(alpha * ip)) - s * alpha * ip) * balance_param

    Args:
        u: 2-D tensor with one row per sample.
        label_u: 2-D tensor of label vectors, one row per sample.
        alpha: scale applied to the inner products.
        normed: when true the inner products are divided by the product of
            the row norms; otherwise they are clipped to [-15, 15].

    Returns:
        Scalar loss tensor.
    """
    label_ip = tf.cast(
        tf.matmul(label_u, tf.transpose(label_u)), tf.float32)
    s = tf.clip_by_value(label_ip, 0.0, 1.0)

    # compute balance param
    # s_t \in {-1, 1}
    s_t = tf.multiply(tf.add(s, tf.constant(-0.5)), tf.constant(2.0))
    sum_1 = tf.reduce_sum(s)
    sum_all = tf.reduce_sum(tf.abs(s_t))
    # Dissimilar pairs get weight 1, similar pairs sum_all / sum_1.
    dissimilar_weight = tf.abs(tf.add(s, tf.constant(-1.0)))
    similar_weight = tf.multiply(tf.div(sum_all, sum_1), s)
    balance_param = tf.add(dissimilar_weight, similar_weight)

    if normed:
        raw_ip = tf.matmul(u, tf.transpose(u))

        def _row_sums_as_column(t):
            return tf.reshape(tf.reduce_sum(t, 1), [tf.shape(t)[0], 1])

        squared_norms = _row_sums_as_column(tf.square(u))
        # Entry [i, j] is |u_i| * |u_j|.
        norm_products = tf.sqrt(tf.matmul(
            squared_norms, _row_sums_as_column(tf.square(u)),
            transpose_b=True))
        ip = tf.div(raw_ip, norm_products)
    else:
        ip = tf.clip_by_value(tf.matmul(u, tf.transpose(u)), -1.5e1, 1.5e1)

    ones = tf.ones([tf.shape(u)[0], tf.shape(u)[0]])
    pairwise_loss = tf.log(ones + tf.exp(alpha * ip)) - s * alpha * ip
    return tf.reduce_mean(tf.multiply(pairwise_loss, balance_param))
def compute_pairwise_distances(x, y):
    """Computes the squared pairwise Euclidean distances between x and y.

    Args:
        x: a tensor of shape [num_x_samples, num_features]
        y: a tensor of shape [num_y_samples, num_features]

    Returns:
        a matrix holding the squared distance for every pair of rows.

        NOTE(review): the reduction over the feature axis already yields
        [num_x_samples, num_y_samples]; the final transpose therefore
        returns [num_y_samples, num_x_samples] -- confirm which orientation
        the callers expect.

    Raises:
        ValueError: if the inputs do not match the specified dimensions.
    """
    x_rank = len(x.get_shape())
    y_rank = len(y.get_shape())
    if x_rank != 2 or y_rank != 2:
        raise ValueError('Both inputs should be matrices.')

    if x.get_shape().as_list()[1] != y.get_shape().as_list()[1]:
        raise ValueError('The number of features should be the same.')

    # Broadcasting subtracts every pair of rows:
    # x becomes num_x_samples x num_features x 1 and the transposed y is
    # num_features x num_y_samples, giving a
    # num_x_samples x num_features x num_y_samples difference tensor.
    differences = tf.expand_dims(x, 2) - tf.transpose(y)
    # Sum the squared differences over the feature axis.
    squared_distances = tf.reduce_sum(tf.square(differences), 1)
    return tf.transpose(squared_distances)
def _parser(serialized_example):
    """Parses a single tf.Example into image and label tensors.

    Reads ``dtype``, ``data_aug``, ``data_format`` and ``div255`` from the
    enclosing scope.

    Args:
        serialized_example: serialized tf.Example with a raw uint8 "image"
            string and an int64 "label".

    Returns:
        Tuple ``(image, label)``; the image is HWC, or CHW when
        ``data_format`` is "channels_first".
    """
    feature_spec = {
        "image": tf.FixedLenFeature([], tf.string),
        "label": tf.FixedLenFeature([], tf.int64),
    }
    parsed = tf.parse_single_example(serialized_example, features=feature_spec)

    raw_pixels = tf.decode_raw(parsed["image"], tf.uint8)
    # Initially reshaping to [H, W, C] does not work, so the bytes are
    # reshaped as [C, H, W] first.
    image = tf.reshape(raw_pixels, [NUM_CHANNEL, IMAGE_HEIGHT, IMAGE_WIDTH])
    # [H, W, C] is needed for `tf.image.resize_image_with_crop_or_pad`.
    image = tf.transpose(image, [1, 2, 0])
    image = tf.cast(image, dtype)
    label = tf.cast(parsed["label"], tf.int32)

    if data_aug:
        # Pad by 4 pixels in total per axis, take a random crop of the
        # original size, then flip randomly.
        image = tf.image.resize_image_with_crop_or_pad(
            image, IMAGE_HEIGHT + 4, IMAGE_WIDTH + 4)
        image = tf.random_crop(
            image, [IMAGE_HEIGHT, IMAGE_WIDTH, NUM_CHANNEL])
        image = tf.image.random_flip_left_right(image)

    if data_format == "channels_first":
        image = tf.transpose(image, [2, 0, 1])

    if div255:
        image /= 255.

    return image, label
def lid_term(logits, batch_size=100, k=20):
    """Calculate LID loss term for a minibatch of logits.

    For every sample the ``k`` nearest other samples of the batch are found
    and the estimate ``-k / sum(log(d_i / d_k))`` is returned, where ``d_i``
    is the distance to the i-th neighbour and ``d_k`` the largest of them.

    :param logits: 2-D tensor with one row per sample
    :param batch_size: number of rows in ``logits``
    :param k: number of nearest neighbours used for the estimate; the
        default of 20 reproduces the previously hard-coded behaviour
    :return: 1-D tensor with one LID value per sample
    """
    # y_pred = tf.nn.softmax(logits)
    y_pred = logits

    # calculate pairwise distance
    r = tf.reduce_sum(y_pred * y_pred, 1)
    # turn r into column vector
    r1 = tf.reshape(r, [-1, 1])
    # |a|^2 - 2ab + |b|^2, shifted by a matrix of ones (presumably to keep
    # the square root away from zero -- confirm).
    D = r1 - 2 * tf.matmul(y_pred, tf.transpose(y_pred)) + tf.transpose(r1) + \
        tf.ones([batch_size, batch_size])

    # find the k nearest neighbor: top_k on the negated distances returns
    # the smallest distances first; one extra column is requested because
    # the first hit is the sample itself and is dropped below.
    D1 = -tf.sqrt(D)
    D2, _ = tf.nn.top_k(D1, k=k + 1, sorted=True)
    D3 = -D2[:, 1:]

    # Divide every row by its largest (k-th) neighbour distance.
    m = tf.transpose(tf.multiply(tf.transpose(D3), 1.0 / D3[:, -1]))
    v_log = tf.reduce_sum(tf.log(m + 1e-9), axis=1)  # to avoid nan
    lids = -k / v_log

    ## batch normalize lids
    # lids = tf.nn.l2_normalize(lids, dim=0, epsilon=1e-12)
    return lids
def _multichannel_image_summary(name, images, perm=(0, 3, 1, 2), max_summary_images=16):
    """Writes every channel of ``images`` as a separate grayscale summary.

    The values are min-max normalized to [0, 255], the tensor is permuted
    with ``perm`` (by default NHWC -> NCHW) and the two leading axes are
    merged, so each channel of each image becomes one single-channel image.

    Args:
        name: name of the image summary.
        images: 4-D image tensor.
        perm: permutation that moves the channel axis to position 1.
        max_summary_images: ``max_images`` passed to ``tf.image_summary``.

    NOTE(review): a constant input (max == min) still divides by zero, as
    before.
    """
    _min = tf.reduce_min(images)
    _max = tf.reduce_max(images)
    # Min-max normalization: (x - min) / (max - min).  The minimum has to be
    # subtracted; adding it does not map the data onto [0, 1].
    scaled = tf.mul(
        tf.div(tf.sub(images, _min), tf.sub(_max, _min)), 255.0)

    # Apply the permutation exactly once; applying it a second time would
    # scramble the axes that the reshape below relies on.
    permuted = tf.transpose(scaled, perm=list(perm))
    shape = permuted.get_shape().as_list()

    # -1 merges the two leading axes and works with an unknown batch size.
    per_channel = tf.reshape(permuted, [-1, shape[2], shape[3], 1])
    tf.image_summary(name, per_channel, max_images=max_summary_images)
def skew(inputs, scope="skew"):
    """Shifts row ``i`` of every feature map ``i`` positions to the right.

    Each row is zero-padded with ``idx`` columns on the left and
    ``height - 1 - idx`` columns on the right, so all rows end up with the
    common width ``width + height - 1``.

    Args:
        inputs: tensor of shape [batch, height, width, channel].
        scope: name scope for the created ops.

    Returns:
        Tensor of shape [batch, height, width + height - 1, channel].
    """
    with tf.name_scope(scope):
        batch, height, width, channel = get_shape(inputs)
        new_width = width + height - 1
        rows = tf.split(1, height, inputs)  # each [batch, 1, width, channel]

        new_rows = []
        for idx, row in enumerate(rows):
            # [batch, channel, width]
            channel_major = tf.transpose(tf.squeeze(row, [1]), [0, 2, 1])
            # [batch*channel, width]
            flat = tf.reshape(channel_major, [-1, width])
            # [batch*channel, new_width]
            padded = tf.pad(flat, ((0, 0), (idx, height - 1 - idx)))
            # [batch, channel, new_width]
            unflat = tf.reshape(padded, [-1, channel, new_width])
            # [batch, new_width, channel]
            skewed_row = tf.transpose(unflat, [0, 2, 1])

            assert get_shape(skewed_row) == [batch, new_width, channel], "wrong shape of skewed row"
            new_rows.append(skewed_row)

        outputs = tf.pack(new_rows, axis=1, name="output")
        assert get_shape(outputs) == [None, height, new_width, channel], "wrong shape of skewed output"

    logger.debug('[skew] %s : %s %s -> %s %s' \
        % (scope, inputs.name, inputs.get_shape(), outputs.name, outputs.get_shape()))
    return outputs
def inputs(path):
    """Builds the feature and one-hot label tensors from a CSV file.

    The first ``FLAGS.max_sentence_len`` columns returned by ``read_csv``
    are the features; the column right after them is the label, which is
    one-hot encoded with depth 2.

    Args:
        path: location handed to ``read_csv``.

    Returns:
        Tuple ``(features, label)``.
    """
    columns = read_csv(FLAGS.batch_size, path)
    sentence_len = FLAGS.max_sentence_len

    # Stacking the columns puts the column index first; the transpose moves
    # it last.
    feature_columns = tf.pack(columns[0:sentence_len])
    features = tf.transpose(feature_columns)

    label_column = tf.transpose(tf.pack(columns[sentence_len]))
    label = tf.one_hot(label_column, depth=2)
    return features, label
def channel_wise_fc_layer(bottom, name, bias=True):
    """Channel-wise fully connected layer.

    Every feature map is flattened to ``width*height`` values and multiplied
    by its own ``[width*height, width*height]`` weight matrix; the result is
    reshaped back to the input layout.

    Args:
        bottom: 4-D input tensor whose last axis is the feature-map axis.
        name: variable scope holding the weights "W" (and the bias "b").
        bias: when equal to ``True`` a bias is added.

    Returns:
        Tensor with the same shape as ``bottom``.
    """
    _, width, height, n_feat_map = bottom.get_shape().as_list()
    spatial_size = width * height

    flat = tf.reshape(bottom, [-1, spatial_size, n_feat_map])  # order='C'
    per_channel = tf.transpose(flat, [2, 0, 1])  # n_feat_map * batch * d

    with tf.variable_scope(name):
        W = tf.get_variable(
            "W",
            shape=[n_feat_map, spatial_size, spatial_size],  # n_feat_map * d * d_filter
            initializer=tf.truncated_normal_initializer(0., 0.005))
        # One matmul per feature map: n_feat_map * batch * d_filter
        output = tf.batch_matmul(per_channel, W)

        if bias == True:
            b = tf.get_variable(
                "b",
                shape=spatial_size,
                initializer=tf.constant_initializer(0.))
            output = tf.nn.bias_add(output, b)

    batch_major = tf.transpose(output, [1, 2, 0])  # batch * d_filter * n_feat_map
    return tf.reshape(batch_major, [-1, width, height, n_feat_map])
def __call__(self, inputs, seq_len, keep_prob=1.0, is_train=None, concat_layers=True):
    """Runs the stacked bidirectional GRU over ``inputs``.

    The input is switched to time-major layout, passed through
    ``self.num_layers`` forward/backward GRU pairs (the backward pass works
    on the length-aware reversed sequence) and switched back to batch-major.

    Args:
        inputs: batch-major input tensor, [batch, time, features].
        seq_len: sequence lengths used for the sequence reversal.
        keep_prob: not read in this method.
        is_train: not read in this method.
        concat_layers: when true the outputs of all layers are concatenated
            on the feature axis, otherwise only the last layer is returned.

    Returns:
        Batch-major output tensor.
    """
    # Index 0 holds the time-major input; layer outputs are appended.
    layer_outputs = [tf.transpose(inputs, [1, 0, 2])]

    for layer in range(self.num_layers):
        gru_fw, gru_bw = self.grus[layer]
        init_fw, init_bw = self.inits[layer]
        mask_fw, mask_bw = self.dropout_mask[layer]
        previous = layer_outputs[-1]

        with tf.variable_scope('fw_{}'.format(layer), reuse=tf.AUTO_REUSE):
            with tf.variable_scope('cudnn_gru', reuse=tf.AUTO_REUSE):
                out_fw, _ = tf.nn.dynamic_rnn(
                    cell=gru_fw,
                    inputs=previous * mask_fw,
                    time_major=True,
                    initial_state=tuple(tf.unstack(init_fw, axis=0)))

        with tf.variable_scope('bw_{}'.format(layer), reuse=tf.AUTO_REUSE):
            with tf.variable_scope('cudnn_gru', reuse=tf.AUTO_REUSE):
                # Reverse each sequence up to its length, run the GRU, then
                # reverse the result back.
                reversed_inputs = tf.reverse_sequence(
                    previous * mask_bw, seq_lengths=seq_len,
                    seq_dim=0, batch_dim=1)
                out_bw, _ = tf.nn.dynamic_rnn(
                    cell=gru_bw,
                    inputs=reversed_inputs,
                    time_major=True,
                    initial_state=tuple(tf.unstack(init_bw, axis=0)))
                out_bw = tf.reverse_sequence(
                    out_bw, seq_lengths=seq_len, seq_dim=0, batch_dim=1)

        layer_outputs.append(tf.concat([out_fw, out_bw], axis=2))

    if concat_layers:
        res = tf.concat(layer_outputs[1:], axis=2)
    else:
        res = layer_outputs[-1]
    # Back to batch-major.
    return tf.transpose(res, [1, 0, 2])
def __init__(self, exp_dir, corpus_reader, num_layers=3, hidden_size=250, beam_width=100, decoding_merge_repeated=True):
    """Builds the graph of a stacked bidirectional LSTM trained with CTC.

    Args:
        exp_dir: experiment directory; created when it does not exist.
        corpus_reader: reader whose ``corpus`` provides ``vocab_size`` and
            ``num_feats``.
        num_layers: number of bidirectional LSTM layers.
        hidden_size: hidden units per direction.
        beam_width: beam width of the CTC beam search decoder.
        decoding_merge_repeated: ``merge_repeated`` flag of the decoder.
    """
    super().__init__(exp_dir, corpus_reader)

    if not os.path.isdir(exp_dir):
        os.makedirs(exp_dir)

    corpus = corpus_reader.corpus
    # Increase vocab size by 2 since we need an extra for CTC blank labels
    # and another extra for dynamic padding with zeros.
    vocab_size = corpus.vocab_size + 2

    # Reset the graph.
    tf.reset_default_graph()

    self.num_layers = num_layers
    self.hidden_size = hidden_size
    self.beam_width = beam_width
    self.vocab_size = vocab_size

    # Initialize placeholders for feeding data to model.
    self.batch_x = tf.placeholder(
        tf.float32, [None, None, corpus_reader.corpus.num_feats])
    self.batch_x_lens = tf.placeholder(tf.int32, [None])
    self.batch_y = tf.sparse_placeholder(tf.int32)

    batch_size = tf.shape(self.batch_x)[0]

    layer_input = self.batch_x
    for layer_index in range(num_layers):
        with tf.variable_scope("layer_%d" % layer_index):
            forward_cell = lstm_cell(self.hidden_size)
            backward_cell = lstm_cell(self.hidden_size)

            (self.out_fw, self.out_bw), _ = tf.nn.bidirectional_dynamic_rnn(
                forward_cell, backward_cell, layer_input, self.batch_x_lens,
                dtype=tf.float32, time_major=False)

            # The concatenation is [batch_num, time, hidden_size*2] and
            # feeds the next layer.
            self.outputs_concat = tf.concat((self.out_fw, self.out_bw), 2)
            layer_input = self.outputs_concat

    # Flatten batch and time so the projection is one matmul.
    self.outputs = tf.reshape(
        self.outputs_concat, [-1, self.hidden_size * 2])

    # Single-variable names are appropriate for weights and biases.
    # pylint: disable=invalid-name
    init_stddev = np.sqrt(2.0 / (2 * hidden_size))
    W = tf.Variable(
        tf.truncated_normal([hidden_size * 2, vocab_size], stddev=init_stddev))
    b = tf.Variable(tf.zeros([vocab_size]))
    self.logits = tf.matmul(self.outputs, W) + b
    self.logits = tf.reshape(self.logits, [batch_size, -1, vocab_size])
    # igormq made it time major, because of an optimization in ctc_loss.
    self.logits = tf.transpose(self.logits, (1, 0, 2))

    # For lattice construction
    self.log_softmax = tf.nn.log_softmax(self.logits)

    self.decoded, self.log_prob = tf.nn.ctc_beam_search_decoder(
        self.logits, self.batch_x_lens, beam_width=beam_width,
        merge_repeated=decoding_merge_repeated)

    # If we want to do manual PER decoding. decoded[0] is the best
    # hypothesis (0th) in an n-best list.
    best_hypothesis = self.decoded[0]
    self.dense_decoded = tf.sparse_tensor_to_dense(best_hypothesis)
    self.dense_ref = tf.sparse_tensor_to_dense(self.batch_y)

    self.loss = tf.nn.ctc_loss(
        self.batch_y, self.logits, self.batch_x_lens,
        preprocess_collapse_repeated=False, ctc_merge_repeated=True)
    self.cost = tf.reduce_mean(self.loss)
    self.optimizer = tf.train.AdamOptimizer().minimize(self.cost)

    # Label error rate: mean edit distance between hypothesis and reference.
    self.ler = tf.reduce_mean(
        tf.edit_distance(tf.cast(self.decoded[0], tf.int32), self.batch_y))

    self.write_desc()
def _transpose_mul(self, a, b):
    """Element-wise product of ``a`` transposed with ``b``, transposed back.

    Shortcut for multiplication with a transposed matrix: computes
    ``transpose(transpose(a) * b)``.

    Args:
        a: tensor that is transposed before the multiplication.
        b: tensor multiplied element-wise with the transposed ``a``.

    Returns:
        The product in the original orientation of ``a``.
    """
    a_transposed = tf.transpose(a)
    product = tf.mul(a_transposed, b)
    return tf.transpose(product)
def dmnrun(fulldata, queask):
    """Build a dynamic-memory-network graph and run it once on a context/question.

    The function encodes ``fulldata`` and ``queask``, constructs the input,
    question, episodic-memory and answer modules, evaluates the graph on a
    batch drawn from the single encoded sample, prints the selected and the
    expected context token, and finally maps the selected token index back to
    an entry of ``fulldata``.

    Args:
        fulldata: Iterable of context entries. Each entry is sliced with
            ``[:-1]`` and split on spaces, so it is presumably a
            newline-terminated string -- TODO confirm against the caller.
        queask: The question, as a string.

    Returns:
        The first entry of ``fulldata`` at which the running total of
        ``len(entry)`` reaches the index picked by the network, or ``""`` when
        no entry reaches it.
    """
    # Loading saved meta graph
    # NOTE(review): this session (and the restored weights) is never used
    # again: the default graph is reset right below, and further down `sess`
    # is rebound to a new session whose variables come from `init`. The
    # session created here is also never closed. Confirm whether inference is
    # meant to run on the restored checkpoint.
    sess = tf.Session()
    saver = tf.train.import_meta_graph("C:/Users/Mark/PycharmProjects/DMNTrain/weights/model.meta")
    saver.restore(sess, tf.train.latest_checkpoint('C:/Users/Mark/PycharmProjects/DMNTrain/weights'))
    tf.reset_default_graph()

    def wideArray(x, weight):
        """Copy the ragged rows of ``x`` into a zero-padded ``[len(x), weight]`` array."""
        wide = np.zeros([len(x), weight])
        for i in range(0, len(x)):
            for j in range(0, len(x[i])):
                wide[i][j] = x[i][j]
        return wide

    def octalConv(x):
        """Split ``x`` on spaces and return ``(vectors, words)``.

        ``vectors`` is a ``[number_of_words, 50]`` array and ``words`` is the
        list of space-separated tokens.
        """
        ans = []
        rows = []
        words = []
        for line in x.split(' '):
            for word in line:
                # Character code -> octal string -> the octal digits read as
                # an int (the leading "0o" is stripped).
                number = ord(word)
                convNum = oct(number)
                convNum = int(convNum[2:])
            # NOTE(review): `convNum` is never appended to `ans`, so every row
            # is empty and the resulting vectors are all zeros -- confirm
            # whether an `ans.append(convNum)` is missing.
            rows.append(ans)
            ans = []
            words.append(line)
        ans = wideArray(rows, 50)
        return ans, words

    def contextualize(data, quest):
        """
        Read in the input and question and build a context set.
        Output is a list of data points, each of which is a 7-element tuple containing:
            The sentences in the context in vectorized form.
            The sentences in the context as a list of string tokens.
            The question in vectorized form.
            The question as a list of string tokens.
            The answer in vectorized form.
            The answer as a list of string tokens.
            A list of numbers for supporting statements, which is currently unused.
        Here exactly one data point is produced; its answer is the
        placeholder string 'Nothing' and its support entry is 0.
        """
        output = []
        context = []
        for entry in data:
            # Turn input into a word vector
            # TODO: Change to Octal Decimal encoding
            context.append(octalConv(entry[:-1]))
        # Wrap up object so DMN can use it
        comp_context = tuple(zip(*context))
        output.append(comp_context + octalConv(quest) + octalConv('Nothing') + (0,))
        return output

    test_data = contextualize(fulldata, queask)
    # Unused in this function.
    final_train_data = []

    def finalize(data):
        """
        Prepares data generated by contextualize() for use in the network.
        """
        final_data = []
        for cqas in data:
            contextvs, contextws, qvs, qws, avs, aws, spt = cqas
            lspt = [spt]
            # Running total of sentence lengths.
            lengths = itertools.accumulate(len(cvec) for cvec in contextvs)
            context_vec = np.concatenate(contextvs)
            context_words = sum(contextws, [])
            # Cumulative sentence lengths; prep_batch subtracts 1 from each to
            # obtain the index of the last token of every sentence.
            sentence_ends = np.array(list(lengths))
            final_data.append((context_vec, sentence_ends, qvs, lspt, context_words, cqas, avs, aws))
        return np.array(final_data)

    final_test_data = finalize(test_data)
    tf.reset_default_graph()

    # Hyperparameters
    # The number of dimensions used to store data passed between recurrent layers in the network.
    recurrent_cell_size = 128
    # The number of dimensions in our word vectorizations.
    D = 50
    # How quickly the network learns. Too high, and we may run into numeric instability
    # or other issues.
    learning_rate = 0.005
    # Dropout probabilities. For a description of dropout and what these probabilities are,
    # see Entailment with TensorFlow.
    # NOTE(review): these keep-probabilities are wired into the graph that is
    # evaluated below, so dropout appears to be active during inference --
    # confirm this is intended.
    input_p, output_p = 0.5, 0.5
    # How many questions we train on at a time.
    batch_size = 128
    # Number of passes in episodic memory. We'll get to this later.
    passes = 4
    # Feed Forward layer sizes: the number of dimensions used to store data passed from feed-forward layers.
    # (Not referenced again in this function.)
    ff_hidden_size = 256
    # The strength of our regularization. Increase to encourage sparsity in episodic memory,
    # but makes training slower. Don't make this larger than learning_rate.
    weight_decay = 0.00000001
    # How many questions the network trains on each time it is trained.
    # Some questions are counted multiple times.
    # (Not referenced again in this function.)
    training_iterations_count = 400000
    # How many iterations of training occur before each validation check.
    # (Not referenced again in this function.)
    display_step = 1

    # Input Module
    # Context: A [batch_size, maximum_context_length, word_vectorization_dimensions] tensor
    # that contains all the context information.
    context = tf.placeholder(tf.float32, [None, None, D], "context")
    context_placeholder = context  # I use context as a variable name later on
    # input_sentence_endings: A [batch_size, maximum_sentence_count, 2] tensor that
    # contains the locations of the ends of sentences.
    input_sentence_endings = tf.placeholder(tf.int32, [None, None, 2], "sentence")
    # recurrent_cell_size: the number of hidden units in recurrent layers.
    input_gru = tf.contrib.rnn.GRUCell(recurrent_cell_size)
    # input_p: The probability of maintaining a specific hidden input unit.
    # Likewise, output_p is the probability of maintaining a specific hidden output unit.
    gru_drop = tf.contrib.rnn.DropoutWrapper(input_gru, input_p, output_p)
    # dynamic_rnn also returns the final internal state. We don't need that, and can
    # ignore the corresponding output (_).
    input_module_outputs, _ = tf.nn.dynamic_rnn(gru_drop, context, dtype=tf.float32, scope="input_module")
    # cs: the facts gathered from the context.
    cs = tf.gather_nd(input_module_outputs, input_sentence_endings)
    # to use every word as a fact, useful for tasks with one-sentence contexts
    # (`s` is not referenced again in this function.)
    s = input_module_outputs

    # Question Module
    # query: A [batch_size, maximum_question_length, word_vectorization_dimensions] tensor
    # that contains all of the questions.
    query = tf.placeholder(tf.float32, [None, None, D], "query")
    # input_query_lengths: A [batch_size, 2] tensor that contains question length information.
    # input_query_lengths[:,1] has the actual lengths; input_query_lengths[:,0] is a simple range()
    # so that it plays nice with gather_nd.
    input_query_lengths = tf.placeholder(tf.int32, [None, 2], "query_lengths")
    # The question is run through the same dropout-wrapped GRU, in a reusing
    # variable scope named "input_module".
    question_module_outputs, _ = tf.nn.dynamic_rnn(gru_drop, query, dtype=tf.float32,
                                                   scope=tf.VariableScope(True, "input_module"))
    # q: the question states. A [batch_size, recurrent_cell_size] tensor.
    q = tf.gather_nd(question_module_outputs, input_query_lengths)

    # Episodic Memory
    # make sure the current memory (i.e. the question vector) is broadcasted along the facts dimension
    size = tf.stack([tf.constant(1), tf.shape(cs)[1], tf.constant(1)])
    re_q = tf.tile(tf.reshape(q, [-1, 1, recurrent_cell_size]), size)
    # Final output for attention, needs to be 1 in order to create a mask
    output_size = 1
    # Weights and biases
    attend_init = tf.random_normal_initializer(stddev=0.1)
    w_1 = tf.get_variable("attend_w1", [1, recurrent_cell_size * 7, recurrent_cell_size],
                          tf.float32, initializer=attend_init)
    w_2 = tf.get_variable("attend_w2", [1, recurrent_cell_size, output_size],
                          tf.float32, initializer=attend_init)
    b_1 = tf.get_variable("attend_b1", [1, recurrent_cell_size],
                          tf.float32, initializer=attend_init)
    b_2 = tf.get_variable("attend_b2", [1, output_size],
                          tf.float32, initializer=attend_init)
    # Regulate all the weights and biases
    tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, tf.nn.l2_loss(w_1))
    tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, tf.nn.l2_loss(b_1))
    tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, tf.nn.l2_loss(w_2))
    tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, tf.nn.l2_loss(b_2))

    def attention(c, mem, existing_facts):
        """
        Custom attention mechanism.
        c: A [batch_size, maximum_sentence_count, recurrent_cell_size] tensor
            that contains all the facts from the contexts.
        mem: A [batch_size, maximum_sentence_count, recurrent_cell_size] tensor that
            contains the current memory. It should be the same memory for all facts for accurate results.
        existing_facts: A [batch_size, maximum_sentence_count, 1] tensor that
            acts as a binary mask for which facts exist and which do not.
        Returns the softmax attention weights with a trailing axis of size 1.
        """
        with tf.variable_scope("attending") as scope:
            # attending: The metrics by which we decide what to attend to.
            attending = tf.concat([c, mem, re_q, c * re_q, c * mem, (c - re_q) ** 2, (c - mem) ** 2], 2)
            # m1: First layer of multiplied weights for the feed-forward network.
            # We tile the weights in order to manually broadcast, since tf.matmul does not
            # automatically broadcast batch matrix multiplication as of TensorFlow 1.2.
            m1 = tf.matmul(attending * existing_facts,
                           tf.tile(w_1, tf.stack([tf.shape(attending)[0], 1, 1]))) * existing_facts
            # bias_1: A masked version of the first feed-forward layer's bias
            # over only existing facts.
            bias_1 = b_1 * existing_facts
            # tnhan: First nonlinearity. In the original paper, this is a tanh nonlinearity;
            # choosing relu was a design choice intended to avoid issues with
            # low gradient magnitude when the tanh returned values close to 1 or -1.
            tnhan = tf.nn.relu(m1 + bias_1)
            # m2: Second layer of multiplied weights for the feed-forward network.
            # Still tiling weights for the same reason described in m1's comments.
            m2 = tf.matmul(tnhan, tf.tile(w_2, tf.stack([tf.shape(attending)[0], 1, 1])))
            # bias_2: A masked version of the second feed-forward layer's bias.
            bias_2 = b_2 * existing_facts
            # norm_m2: A normalized version of the second layer of weights, which is used
            # to help make sure the softmax nonlinearity doesn't saturate.
            norm_m2 = tf.nn.l2_normalize(m2 + bias_2, -1)
            # softmaxable: A hack in order to use sparse_softmax on an otherwise dense tensor.
            # We make norm_m2 a sparse tensor, then make it dense again after the operation.
            softmax_idx = tf.where(tf.not_equal(norm_m2, 0))[:, :-1]
            softmax_gather = tf.gather_nd(norm_m2[..., 0], softmax_idx)
            softmax_shape = tf.shape(norm_m2, out_type=tf.int64)[:-1]
            softmaxable = tf.SparseTensor(softmax_idx, softmax_gather, softmax_shape)
            return tf.expand_dims(tf.sparse_tensor_to_dense(tf.sparse_softmax(softmaxable)), -1)

    # facts_0s: a [batch_size, max_facts_length, 1] tensor
    # whose values are 1 if the corresponding fact exists and 0 if not.
    facts_0s = tf.cast(tf.count_nonzero(input_sentence_endings[:, :, -1:], -1, keepdims=True), tf.float32)

    with tf.variable_scope("Episodes") as scope:
        attention_gru = tf.contrib.rnn.GRUCell(recurrent_cell_size)
        # memory: A list of all tensors that are the (current or past) memory state
        # of the attention mechanism.
        memory = [q]
        # attends: A list of all tensors that represent what the network attends to.
        attends = []
        for a in range(passes):
            # attention mask
            attend_to = attention(cs, tf.tile(tf.reshape(memory[-1], [-1, 1, recurrent_cell_size]), size),
                                  facts_0s)
            # Inverse attention mask, for what's retained in the state.
            retain = 1 - attend_to
            # GRU pass over the facts, according to the attention mask.
            while_valid_index = (lambda state, index: index < tf.shape(cs)[1])
            update_state = (lambda state, index: (attend_to[:, index, :] *
                                                  attention_gru(cs[:, index, :], state)[0] +
                                                  retain[:, index, :] * state))
            # start loop with most recent memory and at the first index
            memory.append(tuple(tf.while_loop(while_valid_index,
                                              (lambda state, index: (update_state(state, index), index + 1)),
                                              loop_vars=[memory[-1], 0]))[0])
            attends.append(attend_to)
            # Reuse variables so the GRU pass uses the same variables every pass.
            scope.reuse_variables()

    # Answer Module
    # a0: Final memory state. (Input to answer module)
    a0 = tf.concat([memory[-1], q], -1)
    # fc_init: Initializer for the final fully connected layer's weights.
    fc_init = tf.random_normal_initializer(stddev=0.1)
    with tf.variable_scope("answer"):
        # w_answer: The final fully connected layer's weights.
        w_answer = tf.get_variable("weight", [recurrent_cell_size * 2, D],
                                   tf.float32, initializer=fc_init)
        # Regulate the fully connected layer's weights
        tf.add_to_collection(tf.GraphKeys.REGULARIZATION_LOSSES, tf.nn.l2_loss(w_answer))
        # The regressed word. This isn't an actual word yet;
        # we still have to find the closest match.
        logit = tf.expand_dims(tf.matmul(a0, w_answer), 1)
        # Make a mask over which words exist.
        with tf.variable_scope("ending"):
            all_ends = tf.reshape(input_sentence_endings, [-1, 2])
            range_ends = tf.range(tf.shape(all_ends)[0])
            ends_indices = tf.stack([all_ends[:, 0], range_ends], axis=1)
            ind = tf.reduce_max(tf.scatter_nd(ends_indices, all_ends[:, 1],
                                              [tf.shape(q)[0], tf.shape(all_ends)[0]]), axis=-1)
            range_ind = tf.range(tf.shape(ind)[0])
            mask_ends = tf.cast(tf.scatter_nd(tf.stack([ind, range_ind], axis=1),
                                              tf.ones_like(range_ind),
                                              [tf.reduce_max(ind) + 1, tf.shape(ind)[0]]), bool)
            # A bit of a trick. With the locations of the ends of the mask (the last periods in
            # each of the contexts) as 1 and the rest as 0, we can scan with exclusive or
            # (starting from all 1). For each context in the batch, this will result in 1s
            # up until the marker (the location of that last period) and 0s afterwards.
            mask = tf.scan(tf.logical_xor, mask_ends, tf.ones_like(range_ind, dtype=bool))
        # We score each possible word inversely with their Euclidean distance to the regressed word.
        # The highest score (lowest distance) will correspond to the selected word.
        logits = -tf.reduce_sum(tf.square(context * tf.transpose(tf.expand_dims(
            tf.cast(mask, tf.float32), -1), [1, 0, 2]) - logit), axis=-1, name='logits')

    # Training
    # gold_standard: The real answers.
    gold_standard = tf.placeholder(tf.float32, [None, 1, D], "answer")
    with tf.variable_scope('accuracy'):
        eq = tf.equal(context, gold_standard)
        # corrbool: True at context positions whose vector equals the answer vector.
        corrbool = tf.reduce_all(eq, -1, name='corrbool')
        logloc = tf.reduce_max(logits, -1, keepdims=True)
        # locs: A boolean tensor that indicates where the score
        # matches the maximum score (computed with reduce_max above). This
        # happens on multiple dimensions, so in the off chance there's one or
        # two indexes that match we make sure it matches in all indexes.
        locs = tf.equal(logits, logloc)
        # correctsbool: A boolean tensor that indicates for which
        # words in the context the score always matches the best score.
        correctsbool = tf.reduce_any(tf.logical_and(locs, corrbool), -1)
        # corrects: A tensor that is simply correctsbool cast to floats.
        corrects = tf.where(correctsbool, tf.ones_like(correctsbool, dtype=tf.float32),
                            tf.zeros_like(correctsbool, dtype=tf.float32))
        # corr: corrects, but for the right answer instead of our selected answer.
        corr = tf.where(corrbool, tf.ones_like(corrbool, dtype=tf.float32),
                        tf.zeros_like(corrbool, dtype=tf.float32))
    with tf.variable_scope("loss"):
        # Use sigmoid cross entropy as the base loss,
        # with our distances as the relative probabilities. There are
        # multiple correct labels, for each location of the answer word within the context.
        loss = tf.nn.sigmoid_cross_entropy_with_logits(logits=tf.nn.l2_normalize(logits, -1), labels=corr)
        # Add regularization losses, weighted by weight_decay.
        total_loss = tf.reduce_mean(loss) + weight_decay * tf.add_n(
            tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES))
    # TensorFlow's default implementation of the Adam optimizer works. We can adjust more than
    # just the learning rate, but it's not necessary to find a very good optimum.
    optimizer = tf.train.AdamOptimizer(learning_rate)
    # Once we have an optimizer, we ask it to minimize the loss
    # in order to work towards the proper training.
    # (`opt_op` is built but never run in this function.)
    opt_op = optimizer.minimize(total_loss)
    # Initialize variables
    init = tf.global_variables_initializer()
    # Launch the TensorFlow session
    # NOTE(review): this rebinds `sess`; the variables evaluated below are the
    # ones produced by `init`, not the checkpoint restored at the top.
    sess = tf.Session()
    sess.run(init)

    def prep_batch(batch_data, more_data=False):
        """
        Prepare all the preprocessing that needs to be done on a batch-by-batch basis.

        Pads contexts and questions with zeros to the longest item in the
        batch and builds the (row, index) pairs consumed by ``gather_nd``.
        Returns the feed dict, or ``(feed_dict, context_words, cqas)`` when
        ``more_data`` is true.
        """
        context_vec, sentence_ends, questionvs, spt, context_words, cqas, answervs, _ = zip(*batch_data)
        ends = list(sentence_ends)
        maxend = max(map(len, ends))
        # Zero-padded matrix of last-token indices (cumulative length - 1).
        aends = np.zeros((len(ends), maxend))
        for index, i in enumerate(ends):
            for indexj, x in enumerate(i):
                aends[index, indexj] = x - 1
        # Pair every end index with its batch row.
        new_ends = np.zeros(aends.shape + (2,))
        for index, x in np.ndenumerate(aends):
            new_ends[index + (0,)] = index[0]
            new_ends[index + (1,)] = x
        contexts = list(context_vec)
        max_context_length = max([len(x) for x in contexts])
        contextsize = list(np.array(contexts[0]).shape)
        contextsize[0] = max_context_length
        final_contexts = np.zeros([len(contexts)] + contextsize)
        contexts = [np.array(x) for x in contexts]
        # `context` is local to prep_batch here, which is why the placeholder
        # is referenced through `context_placeholder` below.
        for i, context in enumerate(contexts):
            final_contexts[i, 0:len(context), :] = context
        max_query_length = max(len(x) for x in questionvs)
        querysize = list(np.array(questionvs[0]).shape)
        querysize[:1] = [len(questionvs), max_query_length]
        queries = np.zeros(querysize)
        # (row, index of the last question token) pairs.
        querylengths = np.array(list(zip(range(len(questionvs)), [len(q) - 1 for q in questionvs])))
        questions = [np.array(q) for q in questionvs]
        for i, question in enumerate(questions):
            queries[i, 0:len(question), :] = question
        data = {context_placeholder: final_contexts, input_sentence_endings: new_ends, query: queries,
                input_query_lengths: querylengths, gold_standard: answervs}
        return (data, context_words, cqas) if more_data else data

    # Use TQDM if installed
    # (Flag is set but not read in this function.)
    tqdm_installed = False
    # Prepare validation set: batch_size * 10 indices drawn with replacement
    # from the encoded samples.
    batch = np.random.randint(final_test_data.shape[0], size=batch_size * 10)
    batch_data = final_test_data[batch]
    validation_set, val_context_words, val_cqas = prep_batch(batch_data, True)
    # Same fetch list as the sess.run call below; `holder` itself is unused.
    holder = [corrbool, locs, total_loss, logits, facts_0s, w_1] + attends + [query, cs, question_module_outputs]
    print('Starting session')
    start_time = time.time()
    ancr = sess.run([corrbool, locs, total_loss, logits, facts_0s, w_1] + attends +
                    [query, cs, question_module_outputs], feed_dict=validation_set)
    elapsed_time = time.time() - start_time
    print(elapsed_time)
    a = ancr[0]  # corrbool
    n = ancr[1]  # locs
    cr = ancr[2]  # total_loss (unused)
    attenders = np.array(ancr[6:-3])  # attention masks of every pass (unused)
    faq = np.sum(ancr[4], axis=(-1, -2))  # Number of facts in each context
    # Only the first sample of the batch is reported.
    limit = 1
    # Locations of responses within contexts
    indices = np.argmax(n, axis=1)
    # Locations of actual answers within contexts
    indicesc = np.argmax(a, axis=1)
    # For safety, return this if nothing is found
    response = ""
    ans = 0
    inp = ''
    for i, e, cw, cqa in list(zip(indices, indicesc, val_context_words, val_cqas))[:limit]:
        ccc = " ".join(cw)
        print("TEXT: ", ccc)
        inp = ccc
        print("QUESTION: ", " ".join(cqa[3]))
        print("RESPONSE: ", cw[i], ["Correct", "Incorrect"][i != e])
        # Remember the selected token index for the lookup below.
        ans = i
        print("EXPECTED: ", cw[e])
        print()
    sess.close()
    print('--')
    # Walk the original entries until the running length reaches `ans`.
    # NOTE(review): `ans` indexes space-separated tokens while `len(line)`
    # counts the items of each entry (characters, if entries are strings) --
    # confirm the two are meant to be compared.
    tot_index = 0
    for line in fulldata:
        tot_index = tot_index + len(line)
        if tot_index >= ans:
            return line
    return response
def _gram_matrix(F, N, M):
    """Return the N x N Gram matrix of ``F`` viewed as an ``(M, N)`` matrix.

    ``F`` is reshaped to ``(M, N)`` and the result is the product of that
    matrix's transpose with the matrix itself.
    """
    flattened = tf.reshape(F, (M, N))
    flattened_t = tf.transpose(flattened)
    return tf.matmul(flattened_t, flattened)
# handling tensors second example
import matplotlib.image as mp_image
import matplotlib.pyplot as plt
import tensorflow as tf

# STEP 1 --- PREPARE THE DATA
filename = "packt.jpeg"
input_image = mp_image.imread(filename)

# Report the rank and the shape of the array that was read.
print('input dim = {}'.format(input_image.ndim))
print('input shape = {}'.format(input_image.shape))

# Unpacking only succeeds when the array has exactly three axes.
height, width, depth = input_image.shape

# Show the image as loaded.
plt.imshow(input_image)
plt.show()

# Wrap the pixels in a variable and create its initializer op
# (`initialize_all_variables` is the legacy name of the initializer).
x = tf.Variable(input_image, name='x')
model = tf.initialize_all_variables()

with tf.Session() as session:
    # Initialize the variable first, then build and evaluate the op that
    # swaps the first two axes of the image.
    session.run(model)
    x = tf.transpose(x, perm=[1, 0, 2])
    result = session.run(x)

# Show the transposed image.
plt.imshow(result)
plt.show()
def __init__(self, config, name):
    """Build the graph for one of the 'training', 'validation' or 'test' models.

    The constructor creates the mini-batch, the network logits, the weighted
    cross-entropy loss, the training op (training only), the per-recording and
    ensemble statistics (validation/test only) and the TensorBoard summaries.
    The ops a caller needs are collected in ``self.op_dict``.

    Args:
        config: Configuration object. This method reads ``learning_rate``,
            ``batches_per_epoch``, ``batch_size`` and ``file_names`` from it.
        name: One of 'validation', 'training' or 'test'.
    """
    assert name in ('validation', 'training', 'test')
    self.name = name
    logging.debug('{} - model - initialize'.format(self.name))
    self.is_training = True if self.name == 'training' else False
    self.config = config

    if not self.is_training:
        self.reinitializable_iter_for_dataset = None
    self.batch = self._gen_batch_fn()  # generate mini-batch

    with tf.name_scope(self.name):
        with tf.variable_scope('full_conv', reuse=tf.AUTO_REUSE):
            logits_stereo = self._nn_model_fn()

        # The logits are split into two equal halves along axis 0 below
        # ("left" and "right"), hence the frame counts are tiled twice.
        logits_stereo_flattened = flatten_maybe_padded_sequences(
            maybe_padded_sequences=logits_stereo,
            lengths=tf.tile(input=self.batch['num_frames'], multiples=[2]))
        logits_left_flattened, logits_right_flattened = tf.split(
            value=logits_stereo_flattened, num_or_size_splits=2, axis=0)
        logits_minor_flattened = tf.minimum(logits_left_flattened, logits_right_flattened)
        logits_larger_flattened = tf.maximum(logits_left_flattened, logits_right_flattened)
        labels_bool_flattened = flatten_maybe_padded_sequences(
            maybe_padded_sequences=self.batch['label'], lengths=self.batch['num_frames'])
        negated_labels_bool_flattened = tf.logical_not(labels_bool_flattened)
        labels_float_flattened = tf.cast(x=labels_bool_flattened, dtype=tf.float32)

        # When label is True, choose the smaller logits. Otherwise, choose the larger logits
        logits_mono_flattened = tf.where(
            tf.equal(labels_bool_flattened, True), logits_minor_flattened, logits_larger_flattened)

        # cross-entropy
        # loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels_float_flattened, logits=logits_mono_flattened)
        # weighted cross-entropy
        # A value `pos_weights > 1` decreases the false negative count, hence increasing the recall.
        # Conversely setting `pos_weights < 1` decreases the false positive count and increases the precision.
        loss = tf.nn.weighted_cross_entropy_with_logits(targets=labels_float_flattened,
                                                        logits=logits_mono_flattened, pos_weight=1.1)
        # focal loss
        # loss = MiscFns.focal_loss(labels=labels_float_flattened, logits=logits_mono_flattened)
        loss = tf.reduce_mean(loss)

        if self.is_training:
            global_step = tf.train.get_or_create_global_step()
            # Staircase decay: multiply the rate by 0.7 every 7 epochs' worth
            # of batches.
            learning_rate = tf.train.exponential_decay(self.config.learning_rate, global_step, \
                                                       self.config.batches_per_epoch * 7, 0.7, staircase=True)
            # Run any ops registered in UPDATE_OPS before each training step.
            _update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
            if _update_ops:
                with tf.control_dependencies(_update_ops):
                    training_op = tf.train.AdamOptimizer(learning_rate).minimize(loss, global_step=global_step)
            else:
                training_op = tf.train.AdamOptimizer(learning_rate).minimize(loss, global_step=global_step)

        # A frame is predicted positive when the sum of both halves' logits
        # is greater than zero.
        pred_labels_flattened = tf.greater(logits_left_flattened+logits_right_flattened, 0)
        negated_pred_labels_flattened = tf.logical_not(pred_labels_flattened)

        # individual and ensemble statistics for test and validation
        if not self.is_training:
            with tf.name_scope('individual_and_ensemble_stats'):
                with tf.variable_scope('{}_local_vars'.format(self.name), reuse=tf.AUTO_REUSE):
                    # One row of [tps, fps, tns, fns] counters per entry of
                    # config.file_names[self.name].
                    individual_tps_fps_tns_fns_var = tf.get_variable(
                        name='individual_tps_fps_tns_fns',
                        shape=[len(self.config.file_names[self.name]), 4],
                        dtype=tf.int32,
                        initializer=tf.zeros_initializer,
                        trainable=False,
                        collections=[tf.GraphKeys.LOCAL_VARIABLES]
                    )
                    # Running sum of the batch losses.
                    acc_loss_var = tf.get_variable(
                        name='acc_loss',
                        shape=[],
                        dtype=tf.float32,
                        initializer=tf.zeros_initializer,
                        trainable=False,
                        collections=[tf.GraphKeys.LOCAL_VARIABLES]
                    )
                    # Number of batches accumulated so far.
                    batch_counter_var = tf.get_variable(
                        name='batch_counter',
                        shape=[],
                        dtype=tf.int32,
                        initializer=tf.zeros_initializer,
                        trainable=False,
                        collections=[tf.GraphKeys.LOCAL_VARIABLES]
                    )

                # State carried through the while loop that visits every
                # sample of the batch.
                loop_var_proto = collections.namedtuple(
                    'loop_var_proto',
                    ['sample_idx', 'batch_size', 'preds', 'negated_preds',
                     'labels', 'negated_labels', 'lengths', 'me_ids'])

                def cond_fn(loop_var):
                    """Continue while there are samples left in the batch."""
                    return tf.less(loop_var.sample_idx, loop_var.batch_size)

                def body_fn(loop_var):
                    """Add the current sample's tp/fp/tn/fn counts to its row of the stats variable."""
                    # The flattened tensors hold the samples back to back, so
                    # the current sample starts after all preceding lengths.
                    start_pos = tf.reduce_sum(loop_var.lengths[:loop_var.sample_idx])
                    end_pos = start_pos + loop_var.lengths[loop_var.sample_idx]
                    cur_preds = loop_var.preds
                    negated_cur_preds = loop_var.negated_preds
                    cur_labels = loop_var.labels
                    negated_cur_labels = loop_var.negated_labels
                    cur_preds, negated_cur_preds, cur_labels, negated_cur_labels = \
                        [value[start_pos:end_pos]
                         for value in [cur_preds, negated_cur_preds, cur_labels, negated_cur_labels]]
                    tps = tf.logical_and(cur_preds, cur_labels)
                    fps = tf.logical_and(cur_preds, negated_cur_labels)
                    tns = tf.logical_and(negated_cur_preds, negated_cur_labels)
                    fns = tf.logical_and(negated_cur_preds, cur_labels)
                    tps, fps, tns, fns = \
                        [tf.reduce_sum(tf.cast(value, tf.int32)) for value in [tps, fps, tns, fns]]
                    me_id = loop_var.me_ids[loop_var.sample_idx]
                    stats_var = individual_tps_fps_tns_fns_var
                    _new_value = stats_var[me_id] + tf.convert_to_tensor([tps, fps, tns, fns])
                    _update_stats = tf.scatter_update(
                        stats_var, me_id, _new_value, use_locking=True)
                    # Tie the index increment to the update so the loop cannot
                    # advance without writing the statistics.
                    with tf.control_dependencies([_update_stats]):
                        sample_idx = loop_var.sample_idx + 1
                    loop_var = loop_var_proto(
                        sample_idx=sample_idx,
                        batch_size=loop_var.batch_size,
                        preds=loop_var.preds,
                        negated_preds=loop_var.negated_preds,
                        labels=loop_var.labels,
                        negated_labels=loop_var.negated_labels,
                        lengths=loop_var.lengths,
                        me_ids=loop_var.me_ids
                    )
                    return [loop_var]

                sample_idx = tf.constant(0, dtype=tf.int32)
                cur_batch_size = tf.shape(self.batch['num_frames'])[0]
                loop_var = loop_var_proto(
                    sample_idx=sample_idx,
                    batch_size=cur_batch_size,
                    preds=pred_labels_flattened,
                    negated_preds=negated_pred_labels_flattened,
                    labels=labels_bool_flattened,
                    negated_labels=negated_labels_bool_flattened,
                    lengths=self.batch['num_frames'],
                    me_ids=self.batch['me_id']
                )
                # Evaluating the final index forces every per-sample update.
                final_sample_idx = tf.while_loop(
                    cond=cond_fn,
                    body=body_fn,
                    loop_vars=[loop_var],
                    parallel_iterations=self.config.batch_size,
                    back_prop=False,
                    return_same_structure=True
                )[0].sample_idx

                # Per-recording precision / recall / F1; the 1e-7 terms keep
                # the divisions defined when a denominator is zero.
                individual_tps_fps_tns_fns_float = tf.cast(individual_tps_fps_tns_fns_var, tf.float32)
                tps, fps, _, fns = tf.unstack(individual_tps_fps_tns_fns_float, axis=1)
                me_wise_precisions = tps / (tps + fps + 1e-7)
                me_wise_recalls = tps / (tps + fns + 1e-7)
                me_wise_f1s = 2. * me_wise_precisions * me_wise_recalls / \
                    (me_wise_precisions + me_wise_recalls + 1e-7)
                me_wise_prfs = tf.stack([me_wise_precisions, me_wise_recalls, me_wise_f1s], axis=1)
                assert me_wise_prfs.shape.as_list() == [len(self.config.file_names[self.name]), 3]
                average_me_wise_prf = tf.reduce_mean(me_wise_prfs, axis=0)
                assert average_me_wise_prf.shape.as_list() == [3]

                # ensemble stats
                ensemble_tps_fps_tns_fns = tf.reduce_sum(individual_tps_fps_tns_fns_var, axis=0)
                tps, fps, _, fns = tf.unstack(tf.cast(ensemble_tps_fps_tns_fns, tf.float32))
                en_precision = tps / (tps + fps + 1e-7)
                en_recall = tps / (tps + fns + 1e-7)
                en_f1 = 2. * en_precision * en_recall / (en_precision + en_recall + 1e-7)
                batch_counter_update_op = tf.assign_add(batch_counter_var, 1)
                acc_loss_update_op = tf.assign_add(acc_loss_var, loss)
                # The last entry is the mean loss over the batches counted so far.
                ensemble_prf_and_loss = tf.convert_to_tensor(
                    [en_precision, en_recall, en_f1, acc_loss_var / tf.cast(batch_counter_var, tf.float32)])

                update_op_after_each_batch = tf.group(
                    final_sample_idx, batch_counter_update_op, acc_loss_update_op,
                    name='grouped update ops to be run after each batch'.replace(' ', '_'))
                stats_after_each_epoch = dict(
                    individual_tps_fps_tns_fns=individual_tps_fps_tns_fns_var,
                    individual_prfs=me_wise_prfs,
                    ensemble_tps_fps_tns_fns=ensemble_tps_fps_tns_fns,
                    ensemble_prf_and_loss=ensemble_prf_and_loss,
                    average_prf=average_me_wise_prf
                )

        # The bare string below is disabled code (training-time ensemble
        # statistics) kept by the original author; it is never executed.
        '''
        # ensemble stats for training
        if self.is_training:
            with tf.name_scope('ensemble_stats'):
                with tf.variable_scope('{}_local_vars'.format(self.name), reuse=tf.AUTO_REUSE):
                    ensemble_tps_fps_tns_fns_var = tf.get_variable(
                        name='ensemble_tps_fps_tns_fns',
                        shape=[4],
                        dtype=tf.int32,
                        initializer=tf.zeros_initializer,
                        trainable=False,
                        collections=[tf.GraphKeys.LOCAL_VARIABLES]
                    )
                    acc_loss_var = tf.get_variable(
                        name='acc_loss',
                        shape=[],
                        dtype=tf.float32,
                        initializer=tf.zeros_initializer,
                        trainable=False,
                        collections=[tf.GraphKeys.LOCAL_VARIABLES]
                    )
                    batch_counter_var = tf.get_variable(
                        name='batch_counter',
                        shape=[],
                        dtype=tf.int32,
                        initializer=tf.zeros_initializer,
                        trainable=False,
                        collections=[tf.GraphKeys.LOCAL_VARIABLES]
                    )

                tps = tf.logical_and(pred_labels_flattened, labels_bool_flattened)
                fps = tf.logical_and(pred_labels_flattened, negated_labels_bool_flattened)
                tns = tf.logical_and(negated_pred_labels_flattened, negated_labels_bool_flattened)
                fns = tf.logical_and(negated_pred_labels_flattened, labels_bool_flattened)
                tps, fps, tns, fns = [tf.reduce_sum(tf.cast(value, tf.int32)) for value in [tps, fps, tns, fns]]
                ensemble_tps_fps_tns_fns_update_op = tf.assign_add(
                    ensemble_tps_fps_tns_fns_var, tf.convert_to_tensor([tps, fps, tns, fns]))
                acc_loss_update_op = tf.assign_add(acc_loss_var, loss)
                batch_counter_update_op = tf.assign_add(batch_counter_var, 1)
                ensemble_tps_fps_tns_fns_float = tf.cast(ensemble_tps_fps_tns_fns_var, tf.float32)
                tps, fps, _, fns = tf.unstack(ensemble_tps_fps_tns_fns_float)
                ensemble_precision = tps / (tps + fps + 1e-7)
                ensemble_recall = tps / (tps + fns + 1e-7)
                ensemble_f1 = 2. * ensemble_precision * ensemble_recall / \
                              (ensemble_precision + ensemble_recall + 1e-7)
                ensemble_loss = acc_loss_var / tf.cast(batch_counter_var, tf.float32)
                ensemble_prf_and_loss = tf.convert_to_tensor(
                    [ensemble_precision, ensemble_recall, ensemble_f1, ensemble_loss])

                update_op_after_each_batch = tf.group(
                    batch_counter_update_op, ensemble_tps_fps_tns_fns_update_op, acc_loss_update_op)
                stats_after_each_epoch = dict(
                    ensemble_tps_fps_tns_fns=ensemble_tps_fps_tns_fns_var,
                    ensemble_prf_and_loss=ensemble_prf_and_loss
                )
        '''

        # define tensorboard summaries
        with tf.name_scope('tensorboard_summary'):
            with tf.name_scope('statistics'):
                # Scalar summaries exist only for validation/test, where the
                # statistics above were built.
                if not self.is_training:
                    list_of_summaries = []
                    with tf.name_scope('ensemble'):
                        p, r, f, lo = tf.unstack(stats_after_each_epoch['ensemble_prf_and_loss'])
                        items_for_summary = dict(precision=p, recall=r, f1=f, average_loss=lo)
                        for item_name, item_value in items_for_summary.items():
                            tmp = tf.summary.scalar(item_name, item_value)
                            list_of_summaries.append(tmp)
                    with tf.name_scope('individual'):
                        p, r, f = tf.unstack(stats_after_each_epoch['average_prf'])
                        items_for_summary = dict(precision=p, recall=r, f1=f)
                        for item_name, item_value in items_for_summary.items():
                            tmp = tf.summary.scalar(item_name, item_value)
                            list_of_summaries.append(tmp)
                    statistical_summary = tf.summary.merge(list_of_summaries)
                # Disabled code (training-time summaries); never executed.
                '''
                else:
                    list_of_summaries = []
                    with tf.name_scope('ensemble'):
                        p, r, f, lo = tf.unstack(stats_after_each_epoch['ensemble_prf_and_loss'])
                        items_for_summary = dict(precision=p, recall=r, f1=f, average_loss=lo)
                        for item_name, item_value in items_for_summary.items():
                            tmp = tf.summary.scalar(item_name, item_value)
                            list_of_summaries.append(tmp)
                    statistical_summary = tf.summary.merge(list_of_summaries)
                '''

            with tf.name_scope('images'):
                # Only the first `image_summary_length` frames are drawn.
                image_summary_length = int(6 * 16000 // 512)
                labels_uint8 = self.batch['label'][:, :image_summary_length, :]
                labels_uint8 = tf.cast(labels_uint8, tf.uint8) * 255
                # assert labels_uint8.dtype == tf.uint8
                labels_uint8 = labels_uint8[..., None]

                # Only the first half of the logits is visualised.
                _logits_left = tf.split(value=logits_stereo, num_or_size_splits=2, axis=0)[0]
                logits_prob_uint8 = tf.sigmoid(_logits_left[:, :image_summary_length, :])
                logits_prob_uint8 = tf.cast(logits_prob_uint8 * 255., tf.uint8)
                logits_prob_uint8 = logits_prob_uint8[..., None]

                # Channel 0 holds the labels, channel 1 the predicted
                # probabilities and channel 2 is all zeros.
                images = tf.concat([labels_uint8, logits_prob_uint8, tf.zeros_like(labels_uint8)], axis=-1)
                images = tf.transpose(images, [0, 2, 1, 3])
                images.set_shape([None, 88, image_summary_length, 3])
                image_summary = tf.summary.image('images', images)

            if self.is_training:
                with tf.name_scope('params'):
                    # One histogram per trainable variable.
                    var_summary_dict = dict()
                    for var in tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES):
                        var_summary_dict[var.op.name] = tf.summary.histogram(var.op.name, var)
                    param_summary = tf.summary.merge(list(var_summary_dict.values()))

        # Ops exposed to the caller. In training mode only the training op is
        # exported; the summaries built above are not included.
        if self.is_training:
            op_dict = dict(
                training_op=training_op,
                #tb_summary=dict(statistics=statistical_summary, image=image_summary, parameter=param_summary),
                #tb_summary=dict(image=image_summary, parameter=param_summary),
                #update_op_after_each_batch=update_op_after_each_batch,
                #statistics_after_each_epoch=stats_after_each_epoch
            )
        else:
            op_dict = dict(
                tb_summary=dict(statistics=statistical_summary, image=image_summary),
                update_op_after_each_batch=update_op_after_each_batch,
                statistics_after_each_epoch=stats_after_each_epoch
            )

        self.op_dict = op_dict
def build_sampler(self, max_len=20):
    """Build the greedy caption-sampling graph for ``self.image``.

    The image is converted to float, preprocessed for evaluation and run
    through the CNN selected by ``self.CNN_MODEL``.  The ``Mixed_7d``
    end point provides the region features that initialise two LSTM cells.
    Decoding then runs for ``self.T`` steps; at every step the arg-max word
    is fed back as the next input.

    Args:
        max_len: Not read by this method; the number of decoding steps is
            taken from ``self.T``.

    Returns:
        An int32 tensor of sampled word ids with one row per batch element
        and one column per decoding step.
    """
    # --- image encoder -----------------------------------------------------
    float_image = tf.image.convert_image_dtype(self.image, dtype=tf.float32)
    eval_image = image_processing.process_image(
        float_image,
        is_training=False,
        height=self.image_size,
        width=self.image_size)
    cnn = nets_factory.get_network_fn(
        self.CNN_MODEL,
        num_classes=1001,
        weight_decay=0.0001,
        is_training=False)
    _, end_points = cnn(tf.expand_dims(eval_image, 0))
    regions = end_points['Mixed_7d']  # (N,8,8,1536)
    pooled = tf.reduce_mean(
        tf.reshape(regions, [-1, self.R, 1536]), axis=1)  # (N,1536)
    batch_size = tf.shape(pooled)[0]

    # --- projections and attribute embedding -------------------------------
    pooled_proj, regions_proj = self._image_features(
        pooled, regions, reuse=False)  # (N,H);(N,R,LSTM)
    init_input = self._init_image_feature(pooled, reuse=False)
    attr_embedding = tf.expand_dims(
        self._word_embedding(self.MIL_list, reuse=False), 0)  # (1,N,616)

    # --- recurrent cells and their zero states ------------------------------
    first_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=self.LSTM)
    (c1, h1) = first_cell.zero_state(batch_size=batch_size, dtype=tf.float32)
    second_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=self.LSTM)
    (c2, h2) = second_cell.zero_state(batch_size=batch_size, dtype=tf.float32)

    # --- attribute (MIL) branch ---------------------------------------------
    attn_map = self._attention_map(regions, reuse=False)
    _, _, attr_prob = self.MIL_classifier(regions, attn_map)
    attr_vector, _ = self._attribute_vector_op(
        batch_size, attr_prob, attr_embedding)

    # Warm-up step: both cells see the image initialisation plus attributes.
    # This is also where the 'lstm' and 'v_lstm2' variables get created.
    with tf.variable_scope('lstm', reuse=False):
        _, (c1, h1) = first_cell(
            inputs=tf.concat((init_input, attr_vector), axis=1),
            state=(c1, h1))
    with tf.variable_scope('v_lstm2', reuse=False):
        _, (c2, h2) = second_cell(
            inputs=tf.concat((init_input, attr_vector), axis=1),
            state=(c2, h2))

    # --- greedy decoding loop -----------------------------------------------
    ids_per_step = []
    for step in range(self.T):
        first_step = (step == 0)
        # Layers first used inside the loop create variables on the first
        # step and reuse them afterwards.
        reuse_in_loop = not first_step

        if first_step:
            prev_ids = tf.fill([batch_size], self._start)
        else:
            prev_ids = next_ids
        prev_embedding = self._word_embedding(inputs=prev_ids, reuse=True)

        first_input = tf.concat([h2, pooled_proj, prev_embedding], 1)
        with tf.variable_scope('lstm', reuse=True):
            _, (c1, h1) = first_cell(inputs=first_input, state=(c1, h1))

        pivot, _ = self._attention_pivot(
            regions_proj, h1, reuse=reuse_in_loop)  # (N,H) (N,R)
        # The stopped copy is never consumed below; the op is kept so the
        # constructed graph stays the same.
        _ = tf.stop_gradient(pivot)

        gated_prob = self._attribute_gate_layer(
            h1, pivot, attr_prob, reuse=reuse_in_loop)
        attr_vector, _ = self._attribute_vector_op(
            batch_size, gated_prob, attr_embedding)

        second_input = tf.concat([pivot, h1, attr_vector], 1)  # (2*LSTM)
        with tf.variable_scope('v_lstm2', reuse=True):
            _, (c2, h2) = second_cell(inputs=second_input, state=(c2, h2))

        step_logits = self._decode_layer(h2, reuse=reuse_in_loop)
        next_ids = tf.argmax(step_logits, 1, output_type=tf.int32)
        ids_per_step.append(next_ids)

    # Stack is (steps, N); transpose to one row per batch element.
    return tf.transpose(tf.stack(ids_per_step), (1, 0))
# Factorization-machine graph.  `n`, `p`, `k`, `X` and `Y` are defined
# earlier in the script.

# bias and weights
w0 = tf.Variable(tf.zeros([1]))
W = tf.Variable(tf.zeros([p]))

# matrix factorization factors, randomly initialized
V = tf.Variable(tf.random_normal([k, p], stddev=0.01))

# NOTE: the former `Y_hat = tf.Variable(tf.zeros([n, 1]))` placeholder
# variable was removed.  It was rebound to the model output below before
# ever being read, so it only added an unused trainable variable to the graph.

# Define the loss function and the optimizer.
# Linear part: w0 + sum_i W_i * x_i, kept as a column via keepdims.
linear_terms = tf.add(
    w0,
    tf.reduce_sum(tf.multiply(W, X), 1, keepdims=True))

# Pairwise part: 0.5 * sum_f ((X V^T)^2 - (X^2)(V^T)^2), again one value per
# row of X.
pair_interactions = tf.multiply(
    0.5,
    tf.reduce_sum(
        tf.subtract(
            tf.pow(tf.matmul(X, tf.transpose(V)), 2),
            tf.matmul(tf.pow(X, 2), tf.pow(tf.transpose(V), 2))),
        1,
        keepdims=True))

# estimation of y
Y_hat = tf.add(linear_terms, pair_interactions)

# L2 penalty on the linear weights and on the factor matrix.
lambda_w = tf.constant(0.001, name='lambda_w')
lambda_v = tf.constant(0.001, name='lambda_v')
l2_norm = (tf.reduce_sum(tf.multiply(lambda_w, tf.pow(W, 2)))
           + tf.reduce_sum(tf.multiply(lambda_v, tf.pow(V, 2))))

# Mean squared error plus the penalty.
error = tf.reduce_mean(tf.square(tf.subtract(Y, Y_hat)))
loss = tf.add(error, l2_norm)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01).minimize(loss)

# Training and testing
epochs = 50
init = tf.global_variables_initializer()
# The LSTM used for looking backwards through the sentences, similar to lstm.
lstm_back = tf.contrib.rnn.BasicLSTMCell(lstm_size)

# A dropout wrapper for lstm_back, like lstm_drop.
# NOTE(review): the bidirectional RNN below is built from `lstm` and
# `lstm_back`, not from the dropout wrappers; left unchanged here -- confirm
# whether dropout was meant to be applied.
lstm_drop_back = tf.contrib.rnn.DropoutWrapper(lstm_back, input_p, output_p)

# Initial values for the fully connected layer's weights.
# The initializer is now actually passed to `fc_weight`; previously it was
# created but never used, so the weights fell back to the default initializer.
fc_initializer = tf.random_normal_initializer(stddev=0.1)
fc_weight = tf.get_variable('fc_weight', [2*hidden_size, 3],
                            initializer=fc_initializer)
fc_bias = tf.get_variable('bias', [3])

# x: the inputs to the bidirectional_rnn
x = tf.concat([hyp, evi], 1)
# Permuting batch_size and n_steps
x = tf.transpose(x, [1, 0, 2])  # (Le+Lh), N, d
# Reshaping to (n_steps*batch_size, n_input)
x = tf.reshape(x, [-1, vector_size])  # (Le+Lh)*N, d
# Split to get a list of 'n_steps' tensors of shape (batch_size, n_input)
x = tf.split(x, l_seq)

# Only the per-step outputs are used; the final forward/backward states are
# discarded.
rnn_outputs, _, _ = tf.contrib.rnn.static_bidirectional_rnn(
    lstm, lstm_back, x, dtype=tf.float32)

# The scores are relative certainties for how likely the output matches
# a certain entailment:
#   0: Positive entailment
#   1: Neutral entailment
#   2: Negative entailment
# They are computed from the output of the last time step.
classification_scores = tf.matmul(rnn_outputs[-1], fc_weight) + fc_bias

# Initialize variables
def FProp(self, theta, input_batch):
    """Embed the source ids and encode them with the transformer stack.

    Args:
      theta: A `.NestedMap` object containing the weight values of this layer
        and of its children layers.
      input_batch: A `.NestedMap` with fields:

        - ids: The inputs tensor, expected to be of shape [batch, time].
        - paddings: The paddings tensor, expected shape [batch, time].

        When `p.packed_input` is set, `segment_ids` and `segment_pos` are
        read from it as well.

    Returns:
      A NestedMap containing:

      - encoded: The encoded features, either a tensor of shape
        [time, batch, depth], or a list of tensors if is_transparent is set
        in transformer_stack.
      - padding: of shape [time, batch]
      - segment_id: [time, batch] if packed inputs are supported by the model
        (and all layers), or None otherwise.
      - embedded_inputs: [time, batch, depth] embedded input tokens without
        positional encodings.
    """
    p = self.params
    with tf.name_scope(p.name):
        seg_ids = None
        seg_pos = None

        # ids and paddings must agree in shape, and ids must be rank 2.
        ids = py_utils.with_dependencies([
            py_utils.assert_shape_match(
                tf.shape(input_batch.ids), tf.shape(input_batch.paddings)),
            py_utils.assert_equal(tf.rank(input_batch.ids), 2)
        ], input_batch.ids)

        truncate = (
            not py_utils.use_tpu() and
            tf.flags.FLAGS.transformer_encoder_truncates_inputs)
        if truncate:
            # Longest unpadded length in the batch.
            longest = tf.cast(
                tf.reduce_max(tf.reduce_sum(1.0 - input_batch.paddings, 1)),
                tf.int32)
            # Everything beyond `longest` has to be padding before it may be
            # cut off.
            pads = py_utils.with_dependencies([
                py_utils.assert_equal(
                    tf.constant(True, tf.bool),
                    tf.reduce_all(input_batch.paddings[:, longest:] > 0.5))
            ], input_batch.paddings)
            ids = ids[:, :longest]
            pads = pads[:, :longest]
            if p.packed_input:
                seg_ids = input_batch.segment_ids[:, :longest]
                seg_pos = input_batch.segment_pos[:, :longest]
        else:
            pads = input_batch.paddings
            if p.packed_input:
                seg_ids = input_batch.segment_ids
                seg_pos = input_batch.segment_pos

        time_steps = tf.shape(ids)[1]
        emb_dim = p.token_emb.embedding_dim

        # Input token embeddings + positional embeddings
        embs = self.token_emb.EmbLookup(theta.token_emb,
                                        tf.reshape(ids, [-1]))
        embs = tf.reshape(embs, [-1, time_steps, emb_dim])
        # [time, batch, dim]; returned without positional encodings.
        token_only_embs = tf.transpose(embs, [1, 0, 2])

        if p.packed_input:
            pos_embs = self.position_emb.FPropWithPosition(
                theta.position_emb, seg_pos)
        else:
            pos_embs = self.position_emb.FProp(theta.position_emb, time_steps)
            pos_embs = tf.reshape(pos_embs, [1, time_steps, emb_dim])
        embs += pos_embs

        # Project only when the model width differs from the embedding width.
        if p.model_dim != emb_dim:
            embs = self.emb_proj.FProp(theta.emb_proj, embs)

        pads = tf.transpose(pads)
        if p.packed_input:
            seg_ids = tf.transpose(seg_ids)
        embs = self.input_dropout.FProp(theta.input_dropout, embs)

        # [time, batch, dim]
        stack_input = tf.transpose(embs, [1, 0, 2])

    encoded, padding, segment_id = self.transformer_stack.FProp(
        theta.transformer_stack, stack_input, pads, seg_ids)
    return py_utils.NestedMap(
        encoded=encoded,
        padding=padding,
        segment_id=segment_id,
        embedded_inputs=token_only_embs)
def __init__(self,
             source_vocab_size,
             target_vocab_size,
             buckets,
             size,
             num_layers,
             max_gradient_norm,
             batch_size,
             learning_rate,
             learning_rate_decay_factor,
             use_lstm=False,
             num_samples=512,
             forward_only=False,
             dtype=tf.float32):
    """Build the bucketed attention seq2seq model.

    Args:
      source_vocab_size: size of the source vocabulary.
      target_vocab_size: size of the target vocabulary.
      buckets: a list of pairs (I, O); I is the maximum input length handled
        by that bucket and O the maximum output length.  Instances longer
        than I or O go to the next bucket and are padded accordingly.  The
        list is assumed to be sorted, e.g. [(2, 4), (8, 16)].
      size: number of units in each layer of the model.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients are clipped to at most this global norm.
      batch_size: batch size used during training; graph construction does
        not depend on it, so it can be changed after initialization (e.g.
        for decoding).
      learning_rate: initial learning rate.
      learning_rate_decay_factor: factor the learning rate is multiplied by
        when the decay op is run.
      use_lstm: if true, use LSTM cells instead of GRU cells.
      num_samples: number of samples for sampled softmax.
      forward_only: if set, the backward pass is not constructed.
      dtype: the data type used to store internal variables.
    """
    self.source_vocab_size = source_vocab_size
    self.target_vocab_size = target_vocab_size
    self.buckets = buckets
    self.batch_size = batch_size
    self.learning_rate = tf.Variable(
        float(learning_rate), trainable=False, dtype=dtype)
    self.learning_rate_decay_op = self.learning_rate.assign(
        self.learning_rate * learning_rate_decay_factor)
    self.global_step = tf.Variable(0, trainable=False)

    # An output projection is only needed with sampled softmax, and sampled
    # softmax only makes sense when sampling fewer classes than the
    # vocabulary holds.
    output_projection = None
    softmax_loss_function = None
    if 0 < num_samples < self.target_vocab_size:
        proj_w_t = tf.get_variable(
            "proj_w", [self.target_vocab_size, size], dtype=dtype)
        proj_w = tf.transpose(proj_w_t)
        proj_b = tf.get_variable(
            "proj_b", [self.target_vocab_size], dtype=dtype)
        output_projection = (proj_w, proj_b)

        def sampled_loss(labels, logits):
            labels = tf.reshape(labels, [-1, 1])
            # The sampled softmax is computed in 32-bit floats to avoid
            # numerical instabilities, then cast back to `dtype`.
            weights_f32 = tf.cast(proj_w_t, tf.float32)
            biases_f32 = tf.cast(proj_b, tf.float32)
            inputs_f32 = tf.cast(logits, tf.float32)
            loss_f32 = tf.nn.sampled_softmax_loss(
                weights=weights_f32,
                biases=biases_f32,
                labels=labels,
                inputs=inputs_f32,
                num_sampled=num_samples,
                num_classes=self.target_vocab_size)
            return tf.cast(loss_f32, dtype)

        softmax_loss_function = sampled_loss

    # Internal (possibly multi-layer) cell for the RNN.
    if use_lstm:
        cell_cls = tf.contrib.rnn.BasicLSTMCell
    else:
        cell_cls = tf.contrib.rnn.GRUCell

    def single_cell():
        return cell_cls(size)

    cell = single_cell()
    if num_layers > 1:
        cell = tf.contrib.rnn.MultiRNNCell(
            [single_cell() for _ in range(num_layers)])

    # The seq2seq function: embedding for the inputs, plus attention.
    def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
        return tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
            encoder_inputs,
            decoder_inputs,
            cell,
            num_encoder_symbols=source_vocab_size,
            num_decoder_symbols=target_vocab_size,
            embedding_size=size,
            output_projection=output_projection,
            feed_previous=do_decode,
            dtype=dtype)

    # Feeds for inputs, sized for the last (largest) bucket.
    max_encoder_len, max_decoder_len = buckets[-1][0], buckets[-1][1]
    self.encoder_inputs = []
    self.decoder_inputs = []
    self.target_weights = []
    for i in xrange(max_encoder_len):
        self.encoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="encoder{0}".format(i)))
    for i in xrange(max_decoder_len + 1):
        self.decoder_inputs.append(
            tf.placeholder(tf.int32, shape=[None],
                           name="decoder{0}".format(i)))
        self.target_weights.append(
            tf.placeholder(dtype, shape=[None],
                           name="weight{0}".format(i)))

    # Targets are the decoder inputs shifted by one.
    targets = self.decoder_inputs[1:]

    # Outputs and losses.  When decoding (forward_only) the decoder is fed
    # its own previous output; during training it is fed the given inputs.
    do_decode = bool(forward_only)
    self.outputs, self.losses = tf.contrib.legacy_seq2seq.model_with_buckets(
        self.encoder_inputs,
        self.decoder_inputs,
        targets,
        self.target_weights,
        buckets,
        lambda x, y: seq2seq_f(x, y, do_decode),
        softmax_loss_function=softmax_loss_function)

    if forward_only:
        # With an output projection the raw outputs must be projected for
        # decoding.
        if output_projection is not None:
            for bucket_id in xrange(len(buckets)):
                self.outputs[bucket_id] = [
                    tf.matmul(output, output_projection[0])
                    + output_projection[1]
                    for output in self.outputs[bucket_id]
                ]
    else:
        # Gradients and SGD update operation, one per bucket.
        params = tf.trainable_variables()
        self.gradient_norms = []
        self.updates = []
        opt = tf.train.GradientDescentOptimizer(self.learning_rate)
        for bucket_id in xrange(len(buckets)):
            gradients = tf.gradients(self.losses[bucket_id], params)
            clipped_gradients, norm = tf.clip_by_global_norm(
                gradients, max_gradient_norm)
            self.gradient_norms.append(norm)
            self.updates.append(
                opt.apply_gradients(
                    zip(clipped_gradients, params),
                    global_step=self.global_step))

    self.saver = tf.train.Saver(tf.global_variables())
def attention_layer(from_tensor,
                    to_tensor,
                    attention_mask=None,
                    num_attention_heads=1,
                    size_per_head=512,
                    query_act=None,
                    key_act=None,
                    value_act=None,
                    attention_probs_dropout_prob=0.0,
                    initializer_range=0.02,
                    do_return_2d_tensor=False,
                    batch_size=None,
                    from_seq_length=None,
                    to_seq_length=None):
    """Multi-headed attention from `from_tensor` to `to_tensor`.

    Implements multi-headed attention as described in "Attention is all you
    Need".  When `from_tensor` and `to_tensor` are the same tensor this is
    self-attention.  Every timestep of `from_tensor` attends over the
    matching sequence in `to_tensor` and yields a fixed-width vector.

    `from_tensor` is projected to a "query" tensor and `to_tensor` to "key"
    and "value" tensors.  Query and key are dot-producted and scaled, the
    scores are softmaxed into attention probabilities, and the values are
    interpolated with those probabilities and concatenated back into one
    tensor.  The heads are realised with reshapes and transposes rather than
    with separate tensors.

    Args:
      from_tensor: float Tensor of shape [batch_size, from_seq_length,
        from_width].
      to_tensor: float Tensor of shape [batch_size, to_seq_length, to_width].
      attention_mask: (optional) int32 Tensor of shape [batch_size,
        from_seq_length, to_seq_length] holding 1 or 0.  Scores at positions
        that are 0 are effectively set to -infinity; positions that are 1
        are left unchanged.
      num_attention_heads: int. Number of attention heads.
      size_per_head: int. Size of each attention head.
      query_act: (optional) Activation function for the query transform.
      key_act: (optional) Activation function for the key transform.
      value_act: (optional) Activation function for the value transform.
      attention_probs_dropout_prob: (optional) float. Dropout probability of
        the attention probabilities.
      initializer_range: float. Range of the weight initializer.
      do_return_2d_tensor: bool. If True the output has shape
        [batch_size * from_seq_length, num_attention_heads * size_per_head],
        otherwise [batch_size, from_seq_length,
        num_attention_heads * size_per_head].
      batch_size: (Optional) int. Batch size of the 3D version of the inputs;
        required when the inputs are 2D.
      from_seq_length: (Optional) Sequence length of the 3D version of
        `from_tensor`; required when the inputs are 2D.
      to_seq_length: (Optional) Sequence length of the 3D version of
        `to_tensor`; required when the inputs are 2D.

    Returns:
      float Tensor of shape [batch_size, from_seq_length,
      num_attention_heads * size_per_head], or the 2D form described above
      when `do_return_2d_tensor` is true.

    Raises:
      ValueError: If the ranks of the two inputs differ, or if the inputs are
        rank 2 and any of `batch_size`, `from_seq_length`, `to_seq_length`
        is missing.
    """
    from_shape = get_shape_list(from_tensor, expected_rank=[2, 3])
    to_shape = get_shape_list(to_tensor, expected_rank=[2, 3])
    rank = len(from_shape)

    if rank != len(to_shape):
        raise ValueError(
            "The rank of `from_tensor` must match the rank of `to_tensor`.")

    if rank == 3:
        # For 3D inputs the dimensions are read off the tensors themselves.
        batch_size = from_shape[0]
        from_seq_length = from_shape[1]
        to_seq_length = to_shape[1]
    elif rank == 2:
        explicit_dims = (batch_size, from_seq_length, to_seq_length)
        if any(dim is None for dim in explicit_dims):
            raise ValueError(
                "When passing in rank 2 tensors to attention_layer, the values "
                "for `batch_size`, `from_seq_length`, and `to_seq_length` "
                "must all be specified.")

    # Scalar dimensions referenced here:
    #   B = batch size (number of sequences)
    #   F = `from_tensor` sequence length
    #   T = `to_tensor` sequence length
    #   N = `num_attention_heads`
    #   H = `size_per_head`

    def split_heads(flat_tensor, seq_length):
        """[B*S, N*H] -> [B, N, S, H]."""
        per_head = tf.reshape(
            flat_tensor,
            [batch_size, seq_length, num_attention_heads, size_per_head])
        return tf.transpose(per_head, [0, 2, 1, 3])

    projection_width = num_attention_heads * size_per_head
    from_2d = reshape_to_matrix(from_tensor)
    to_2d = reshape_to_matrix(to_tensor)

    # `query_2d` = [B*F, N*H]
    query_2d = tf.layers.dense(
        from_2d,
        projection_width,
        activation=query_act,
        name="query",
        kernel_initializer=create_initializer(initializer_range))

    # `key_2d` = [B*T, N*H]
    key_2d = tf.layers.dense(
        to_2d,
        projection_width,
        activation=key_act,
        name="key",
        kernel_initializer=create_initializer(initializer_range))

    # `value_2d` = [B*T, N*H]
    value_2d = tf.layers.dense(
        to_2d,
        projection_width,
        activation=value_act,
        name="value",
        kernel_initializer=create_initializer(initializer_range))

    # `queries` = [B, N, F, H], `keys` = [B, N, T, H]
    queries = split_heads(query_2d, from_seq_length)
    keys = split_heads(key_2d, to_seq_length)

    # Raw attention scores: scaled dot product of queries and keys.
    # `scores` = [B, N, F, T]
    scores = tf.matmul(queries, keys, transpose_b=True)
    scores = tf.multiply(scores, 1.0 / math.sqrt(float(size_per_head)))

    if attention_mask is not None:
        # `attention_mask` = [B, 1, F, T]
        attention_mask = tf.expand_dims(attention_mask, axis=[1])

        # The mask is 1.0 where attention is allowed and 0.0 where it is not,
        # so this is 0.0 for allowed positions and -10000.0 for masked ones.
        # Added before the softmax, it effectively removes masked positions.
        mask_bias = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0
        scores += mask_bias

    # Normalize the scores to probabilities.
    # `probs` = [B, N, F, T]
    probs = tf.nn.softmax(scores)

    # This drops out entire tokens to attend to, which may look unusual but
    # is taken from the original Transformer paper.
    probs = dropout(probs, attention_probs_dropout_prob)

    # `values` = [B, N, T, H]
    values = split_heads(value_2d, to_seq_length)

    # `context` = [B, N, F, H] -> [B, F, N, H]
    context = tf.matmul(probs, values)
    context = tf.transpose(context, [0, 2, 1, 3])

    if do_return_2d_tensor:
        # [B*F, N*H]
        output_shape = [batch_size * from_seq_length, projection_width]
    else:
        # [B, F, N*H]
        output_shape = [batch_size, from_seq_length, projection_width]
    return tf.reshape(context, output_shape)
def get_bilinear_classifier(self, layer, outputs, token_weights, variable_scope=None, reuse=False):
    """Add the labeled bilinear classifier on top of existing arc outputs.

    Builds the hidden layers and a (diagonal) bilinear classifier over
    `layer`, then combines its label scores with the unlabeled results
    already stored in `outputs`.

    Args:
        layer: input tensor fed to the hidden layers.
        outputs: dict produced by the unlabeled classifier.  The keys
            'unlabeled_predictions', 'unlabeled_targets', 'probabilities'
            and 'loss' are read; the dict is updated in place.
        token_weights: (n x m x m) weights; used to mask the label
            probabilities and, multiplied by the unlabeled targets, to
            weight the label loss.
        variable_scope: optional scope name; defaults to `self.field`.
        reuse: when truthy, the hidden keep probability is forced to 1.

    Returns:
        The same `outputs` dict with the label targets, logits, predictions,
        loss, joint probabilities and accuracy counts added, and with 'loss'
        replaced by the interpolation of the old loss and the label loss.
    """
    hidden_keep_prob = 1 if reuse else self.hidden_keep_prob
    add_linear = self.add_linear
    with tf.variable_scope(variable_scope or self.field):
        for i in six.moves.range(0, self.n_layers-1):
            with tf.variable_scope('FC-%d' % i):
                layer = classifiers.hidden(layer, 2*self.hidden_size,
                                           hidden_func=self.hidden_func,
                                           hidden_keep_prob=hidden_keep_prob)
        with tf.variable_scope('FC-top'):
            layers = classifiers.hiddens(layer, 2*[self.hidden_size],
                                         hidden_func=self.hidden_func,
                                         hidden_keep_prob=hidden_keep_prob)
        layer1, layer2 = layers.pop(0), layers.pop(0)

        with tf.variable_scope('Classifier'):
            if self.diagonal:
                logits = classifiers.diagonal_bilinear_classifier(
                    layer1, layer2, len(self),
                    hidden_keep_prob=hidden_keep_prob,
                    add_linear=add_linear)
            else:
                logits = classifiers.bilinear_classifier(
                    layer1, layer2, len(self),
                    hidden_keep_prob=hidden_keep_prob,
                    add_linear=add_linear)

    #-----------------------------------------------------------
    # Process the targets
    # (n x m x m)
    label_targets = self.placeholder
    unlabeled_predictions = outputs['unlabeled_predictions']
    unlabeled_targets = outputs['unlabeled_targets']

    #-----------------------------------------------------------
    # Process the logits
    # (n x m x c x m) -> (n x m x m x c)
    transposed_logits = tf.transpose(logits, [0,1,3,2])

    #-----------------------------------------------------------
    # Compute the probabilities/cross entropy
    # (n x m x m) -> (n x m x m x 1)
    # No gradient flows from the joint probabilities into the arc scores.
    head_probabilities = tf.expand_dims(tf.stop_gradient(outputs['probabilities']), axis=-1)
    # (n x m x m x c) -> (n x m x m x c)
    label_probabilities = tf.nn.softmax(transposed_logits) * tf.to_float(tf.expand_dims(token_weights, axis=-1))
    # (n x m x m), (n x m x m x c), (n x m x m) -> ()
    # Only positions that carry a gold arc contribute to the label loss.
    label_loss = tf.losses.sparse_softmax_cross_entropy(label_targets, transposed_logits, weights=token_weights*unlabeled_targets)

    #-----------------------------------------------------------
    # Compute the predictions/accuracy
    # (n x m x m x c) -> (n x m x m)
    predictions = tf.argmax(transposed_logits, axis=-1, output_type=tf.int64)
    # (n x m x m) (*) (n x m x m) -> (n x m x m)
    labels_match = nn.equal(label_targets, predictions)
    true_positives = labels_match * unlabeled_predictions
    correct_label_tokens = nn.equal(label_targets, predictions) * unlabeled_targets
    # (n x m x m) -> ()
    n_unlabeled_predictions = tf.reduce_sum(unlabeled_predictions)
    n_unlabeled_targets = tf.reduce_sum(unlabeled_targets)
    n_true_positives = tf.reduce_sum(true_positives)
    n_correct_label_tokens = tf.reduce_sum(correct_label_tokens)
    # () - () -> ()
    n_false_positives = n_unlabeled_predictions - n_true_positives
    n_false_negatives = n_unlabeled_targets - n_true_positives
    # (n x m x m) -> (n)
    n_targets_per_sequence = tf.reduce_sum(unlabeled_targets, axis=[1,2])
    n_true_positives_per_sequence = tf.reduce_sum(true_positives, axis=[1,2])
    n_correct_label_tokens_per_sequence = tf.reduce_sum(correct_label_tokens, axis=[1,2])
    # (n) x 2 -> ()
    # A sequence counts as correct when every gold arc in it was recovered.
    n_correct_sequences = tf.reduce_sum(nn.equal(n_true_positives_per_sequence, n_targets_per_sequence))
    n_correct_label_sequences = tf.reduce_sum(nn.equal(n_correct_label_tokens_per_sequence, n_targets_per_sequence))

    #-----------------------------------------------------------
    # Populate the output dictionary
    rho = self.loss_interpolation
    outputs['label_targets'] = label_targets
    outputs['probabilities'] = label_probabilities * head_probabilities
    outputs['label_loss'] = label_loss
    # Interpolate the incoming (unlabeled) loss with the label loss.
    outputs['loss'] = 2*((1-rho) * outputs['loss'] + rho * label_loss)

    outputs['n_true_positives'] = n_true_positives
    outputs['n_false_positives'] = n_false_positives
    outputs['n_false_negatives'] = n_false_negatives
    outputs['n_correct_sequences'] = n_correct_sequences
    outputs['n_correct_label_tokens'] = n_correct_label_tokens
    outputs['n_correct_label_sequences'] = n_correct_label_sequences
    outputs['label_predictions'] = predictions
    outputs['label_logits'] = transposed_logits
    return outputs
s = create_svec(alpha_s,t,tau) # In[11]: #### Tensorflow Graph ###### with tf.name_scope('s'): x = tf.placeholder(tf.float64) with tf.name_scope('f'): f = tf.placeholder(tf.float64) with tf.name_scope('s_matrix'): dim = tf.ones([len(s),n_gaussian],tf.float64) s_matrix_t = tf.math.multiply(dim,x) s_matrix = tf.transpose(s_matrix_t) with tf.name_scope('center'): c = tf.Variable(tf.linspace(tf.dtypes.cast(0,tf.float64,name=None),1.000,n_gaussian,name = 'center')) c = tf.reshape(c,[n_gaussian,1]) with tf.name_scope('c_matrix'): dim_c = tf.ones([n_gaussian,len(s)],tf.float64) c_matrix = tf.math.multiply(dim_c,c) with tf.name_scope('bandwidth'): h = tf.Variable(0.1*tf.ones((n_gaussian,1),dtype = tf.float64)) smc = s_matrix-c_matrix h_smc_pow = tf.math.multiply((-h),tf.pow(smc,2)) with tf.name_scope('psi'): psi = tf.math.exp(h_smc_pow) w = tf.Variable(tf.random_normal([n_gaussian,1],name = "weight",dtype = tf.float64))
def get_bilinear_classifier(self, layer, outputs, token_weights, variable_scope=None, reuse=False):
    """Add the label classifier on top of existing head-attachment outputs.

    Hidden layers and a (diagonal) bilinear classifier are built over
    `layer`.  The label loss is computed from the scores at the gold heads,
    while the reported label predictions use the scores at the predicted
    heads.

    Args:
        layer: input tensor fed to the hidden layers.
        outputs: dict produced by the unlabeled classifier.  The keys
            'unlabeled_predictions', 'unlabeled_targets', 'probabilities',
            'correct_unlabeled_tokens' and 'loss' are read; the dict is
            updated in place.
        token_weights: (n x m) weights used for the label loss and for the
            per-token / per-sequence counts.
        variable_scope: optional scope name; defaults to `self.classname`.
        reuse: when truthy, the hidden keep probability is forced to 1.

    Returns:
        The same `outputs` dict, extended with label targets, predictions,
        loss, joint probabilities and accuracy counts.
    """
    keep_prob = 1 if reuse else self.hidden_keep_prob

    with tf.variable_scope(variable_scope or self.classname):
        for depth in six.moves.range(0, self.n_layers-1):
            with tf.variable_scope('FC-%d' % depth):
                layer = classifiers.hidden(
                    layer, 2*self.hidden_size,
                    hidden_func=self.hidden_func,
                    hidden_keep_prob=keep_prob)
        with tf.variable_scope('FC-top'):
            top_layers = classifiers.hiddens(
                layer, 2*[self.hidden_size],
                hidden_func=self.hidden_func,
                hidden_keep_prob=keep_prob)
        layer1, layer2 = top_layers[0], top_layers[1]

        with tf.variable_scope('Classifier'):
            if self.diagonal:
                build_classifier = classifiers.diagonal_bilinear_classifier
            else:
                build_classifier = classifiers.bilinear_classifier
            logits = build_classifier(
                layer1, layer2, len(self),
                hidden_keep_prob=keep_prob,
                add_linear=self.add_linear)

    bucket_size = tf.shape(layer)[-2]

    def one_hot_heads(head_ids):
        """(n x m) head indices -> (n x m x m x 1) one-hot selector."""
        return tf.expand_dims(tf.one_hot(head_ids, bucket_size), axis=-1)

    #-------------------------------------------------------
    # Targets and head selectors
    # (n x m)
    label_targets = self.placeholder
    predicted_heads = one_hot_heads(outputs['unlabeled_predictions'])
    gold_heads = one_hot_heads(outputs['unlabeled_targets'])

    #-------------------------------------------------------
    # Logits: the gold heads are used for the label score, the predicted
    # heads for the attachment score.
    # (n x m x c x m) -> (n x m x m x c)
    transposed_logits = tf.transpose(logits, [0,1,3,2])
    # (n x m x c x m) * (n x m x m x 1) -> (n x m x c x 1)
    predicted_logits = tf.matmul(logits, predicted_heads)
    oracle_logits = tf.matmul(logits, gold_heads)
    # (n x m x c x 1) -> (n x m x c)
    predicted_logits = tf.squeeze(predicted_logits, axis=-1)
    oracle_logits = tf.squeeze(oracle_logits, axis=-1)

    #-------------------------------------------------------
    # Probabilities and cross entropy
    # (n x m x m) -> (n x m x m x 1)
    head_probabilities = tf.expand_dims(
        tf.stop_gradient(outputs['probabilities']), axis=-1)
    # (n x m x m x c)
    label_probabilities = tf.nn.softmax(transposed_logits)
    # (n x m), (n x m x c), (n x m) -> ()
    label_loss = tf.losses.sparse_softmax_cross_entropy(
        label_targets, oracle_logits, weights=token_weights)

    #-------------------------------------------------------
    # Predictions and accuracy
    # (n x m x c) -> (n x m)
    label_predictions = tf.argmax(
        predicted_logits, axis=-1, output_type=tf.int32)
    oracle_predictions = tf.argmax(
        oracle_logits, axis=-1, output_type=tf.int32)
    # (n x m) (*) (n x m) -> (n x m)
    label_hits = nn.equal(label_targets, oracle_predictions) * token_weights
    full_hits = (nn.equal(label_targets, label_predictions)
                 * outputs['correct_unlabeled_tokens'])
    # (n x m) -> (n)
    seq_lengths = tf.reduce_sum(token_weights, axis=-1)
    label_hits_per_seq = tf.reduce_sum(label_hits, axis=-1)
    full_hits_per_seq = tf.reduce_sum(full_hits, axis=-1)
    # (n), (n) -> (n): a sequence is correct when all its tokens are.
    label_seq_correct = nn.equal(seq_lengths, label_hits_per_seq)
    full_seq_correct = nn.equal(seq_lengths, full_hits_per_seq)

    #-----------------------------------------------------------
    # Populate the output dictionary
    rho = self.loss_interpolation
    outputs['label_targets'] = label_targets
    # This way the head probabilities can be reconstructed by summing along
    # the last axis.
    outputs['probabilities'] = label_probabilities * head_probabilities
    outputs['label_loss'] = label_loss
    outputs['loss'] = 2*((1-rho) * outputs['loss'] + rho * label_loss)
    outputs['label_predictions'] = label_predictions
    outputs['n_correct_label_tokens'] = tf.reduce_sum(label_hits)
    outputs['n_correct_label_sequences'] = tf.reduce_sum(label_seq_correct)
    outputs['n_correct_tokens'] = tf.reduce_sum(full_hits)
    outputs['n_correct_sequences'] = tf.reduce_sum(full_seq_correct)
    return outputs
def get_unfactored_bilinear_classifier(self, layer, token_weights, variable_scope=None, reuse=False):
    """Build a single bilinear classifier over token pairs and labels.

    The input goes through `self.n_layers - 1` hidden layers ('FC-<i>'), then
    a pair of top hidden layers ('FC-top') that feed a bilinear (or diagonal
    bilinear) classifier with `len(self)` output classes. Targets and
    predictions greater than 0 are counted as unlabeled positives.

    Args:
      layer: input tensor; returned untouched as `outputs['recur_layer']`.
      token_weights: weights applied to the probabilities, the loss and the
        predictions (n x m x m according to the shape comments below).
      variable_scope: optional scope name; defaults to `self.field`.
      reuse: if truthy, the hidden keep probability is forced to 1.

    Returns:
      A dict holding the targets, probabilities, loss, predictions and the
      true/false positive/negative and per-sequence correctness counts.
    """

    recur_layer = layer
    hidden_keep_prob = 1 if reuse else self.hidden_keep_prob
    add_linear = self.add_linear
    with tf.variable_scope(variable_scope or self.field):
        for i in six.moves.range(0, self.n_layers-1):
            with tf.variable_scope('FC-%d' % i):
                layer = classifiers.hidden(layer, 2*self.hidden_size,
                                           hidden_func=self.hidden_func,
                                           hidden_keep_prob=hidden_keep_prob)
        # The scope name has no format specifier, so it must not be
        # %-formatted (that raised TypeError, or NameError with no FC layers).
        with tf.variable_scope('FC-top'):
            # `hiddens` takes a list of sizes and returns a list of layers,
            # which is what the two pops below rely on.
            layers = classifiers.hiddens(layer, 2*[self.hidden_size],
                                         hidden_func=self.hidden_func,
                                         hidden_keep_prob=hidden_keep_prob)
        layer1, layer2 = layers.pop(0), layers.pop(0)

        with tf.variable_scope('Classifier'):
            if self.diagonal:
                logits = classifiers.diagonal_bilinear_classifier(
                    layer1, layer2, len(self),
                    hidden_keep_prob=hidden_keep_prob,
                    add_linear=add_linear)
            else:
                logits = classifiers.bilinear_classifier(
                    layer1, layer2, len(self),
                    hidden_keep_prob=hidden_keep_prob,
                    add_linear=add_linear)

    #-----------------------------------------------------------
    # Process the targets
    targets = self.placeholder
    # (n x m x m) -> (n x m x m)
    unlabeled_targets = nn.greater(targets, 0)

    #-----------------------------------------------------------
    # Process the logits
    # (n x m x c x m) -> (n x m x m x c)
    transposed_logits = tf.transpose(logits, [0,1,3,2])

    #-----------------------------------------------------------
    # Compute probabilities/cross entropy
    # (n x m x m x c) -> (n x m x m x c)
    probabilities = tf.nn.softmax(transposed_logits) * tf.to_float(tf.expand_dims(token_weights, axis=-1))
    # (n x m x m), (n x m x m x c), (n x m x m) -> ()
    loss = tf.losses.sparse_softmax_cross_entropy(targets, transposed_logits, weights=token_weights)

    #-----------------------------------------------------------
    # Compute predictions/accuracy
    # (n x m x m x c) -> (n x m x m)
    predictions = tf.argmax(transposed_logits, axis=-1, output_type=tf.int32) * token_weights
    # (n x m x m) -> (n x m x m)
    unlabeled_predictions = nn.greater(predictions, 0)
    # (n x m x m) (*) (n x m x m) -> (n x m x m)
    unlabeled_true_positives = unlabeled_predictions * unlabeled_targets
    true_positives = nn.equal(targets, predictions) * unlabeled_true_positives
    # (n x m x m) -> ()
    n_predictions = tf.reduce_sum(unlabeled_predictions)
    n_targets = tf.reduce_sum(unlabeled_targets)
    n_unlabeled_true_positives = tf.reduce_sum(unlabeled_true_positives)
    n_true_positives = tf.reduce_sum(true_positives)
    # () - () -> ()
    n_unlabeled_false_positives = n_predictions - n_unlabeled_true_positives
    n_unlabeled_false_negatives = n_targets - n_unlabeled_true_positives
    n_false_positives = n_predictions - n_true_positives
    n_false_negatives = n_targets - n_true_positives
    # (n x m x m) -> (n)
    n_targets_per_sequence = tf.reduce_sum(unlabeled_targets, axis=[1,2])
    n_unlabeled_true_positives_per_sequence = tf.reduce_sum(unlabeled_true_positives, axis=[1,2])
    n_true_positives_per_sequence = tf.reduce_sum(true_positives, axis=[1,2])
    # (n) x 2 -> ()
    n_correct_unlabeled_sequences = tf.reduce_sum(nn.equal(n_unlabeled_true_positives_per_sequence, n_targets_per_sequence))
    n_correct_sequences = tf.reduce_sum(nn.equal(n_true_positives_per_sequence, n_targets_per_sequence))

    #-----------------------------------------------------------
    # Populate the output dictionary
    outputs = {}
    outputs['recur_layer'] = recur_layer
    outputs['unlabeled_targets'] = unlabeled_targets
    outputs['label_targets'] = self.placeholder
    outputs['probabilities'] = probabilities
    # No separate unlabeled loss in the unfactored setup.
    outputs['unlabeled_loss'] = tf.constant(0.)
    outputs['loss'] = loss

    outputs['unlabeled_predictions'] = unlabeled_predictions
    outputs['label_predictions'] = predictions
    outputs['n_unlabeled_true_positives'] = n_unlabeled_true_positives
    outputs['n_unlabeled_false_positives'] = n_unlabeled_false_positives
    outputs['n_unlabeled_false_negatives'] = n_unlabeled_false_negatives
    outputs['n_correct_unlabeled_sequences'] = n_correct_unlabeled_sequences
    outputs['n_true_positives'] = n_true_positives
    outputs['n_false_positives'] = n_false_positives
    outputs['n_false_negatives'] = n_false_negatives
    outputs['n_correct_sequences'] = n_correct_sequences
    return outputs
def resnet_model_fn(features, labels, mode, params):
    """The model_fn for ResNet to be used with TPUEstimator.

    Args:
      features: `Tensor` of batched images, or a dict holding it under the
        key 'feature'. If transpose_input is enabled, it is transposed to
        device layout and reshaped to 1D tensor.
      labels: `Tensor` of labels for the data samples
      mode: one of `tf.estimator.ModeKeys.{TRAIN,EVAL,PREDICT}`
      params: `dict` of parameters passed to the model from the TPUEstimator,
        `params['batch_size']` is always provided and should be used as the
        effective batch size. Keys read here: data_format, transpose_input,
        dropblock_groups, dropblock_keep_prob, dropblock_size, train_steps,
        resnet_depth, num_label_classes, precision, batch_size,
        label_smoothing, weight_decay, num_train_images, train_batch_size,
        enable_lars, momentum, use_tpu, skip_host_call, iterations_per_loop.

    Returns:
      A `tf.estimator.EstimatorSpec` in PREDICT mode, otherwise a
      `TPUEstimatorSpec` for the model.

    Raises:
      ValueError: if `params['dropblock_groups']` contains a group outside
        1..4, or `params['precision']` is neither 'bfloat16' nor 'float32'.
    """
    if isinstance(features, dict):
        features = features['feature']

    # In most cases, the default data format NCHW instead of NHWC should be
    # used for a significant performance boost on GPU/TPU. NHWC should be used
    # only if the network needs to be run on CPU since the pooling operations
    # are only supported on NHWC.
    channels_first = params['data_format'] == 'channels_first'
    if channels_first:
        assert not params['transpose_input']  # channels_first only for GPU

    if params['transpose_input'] and mode != tf.estimator.ModeKeys.PREDICT:
        # NOTE(review): tf.sqrt yields a floating-point tensor that is then
        # used as a reshape dimension -- confirm this is accepted by the
        # TensorFlow version in use.
        image_size = tf.sqrt(tf.shape(features)[0] / (3 * tf.shape(labels)[0]))
        features = tf.reshape(features, [image_size, image_size, 3, -1])
        features = tf.transpose(features, [3, 0, 1, 2])  # HWCN to NHWC

    # Normalize the image to zero mean and unit variance. The constants have
    # shape [1, 1, 3] and broadcast over the trailing (channel) axis, so this
    # must run while the tensor is still channels-last.
    features -= tf.constant(MEAN_RGB, shape=[1, 1, 3], dtype=features.dtype)
    features /= tf.constant(STDDEV_RGB, shape=[1, 1, 3], dtype=features.dtype)

    if channels_first:
        # NHWC to NCHW, done after normalization (see the comment above).
        features = tf.transpose(features, [0, 3, 1, 2])

    # DropBlock keep_prob for the 4 block groups of ResNet architecture.
    # None means applying no DropBlock at the corresponding block group.
    dropblock_keep_probs = [None] * 4
    if params['dropblock_groups']:
        # Scheduled keep_prob for DropBlock.
        train_steps = tf.cast(params['train_steps'], tf.float32)
        current_step = tf.cast(tf.train.get_global_step(), tf.float32)
        current_ratio = current_step / train_steps
        dropblock_keep_prob = (1 - current_ratio * (
            1 - params['dropblock_keep_prob']))

        # Computes DropBlock keep_prob for different block groups of ResNet.
        dropblock_groups = [
            int(x) for x in params['dropblock_groups'].split(',')
        ]
        for block_group in dropblock_groups:
            if block_group < 1 or block_group > 4:
                raise ValueError(
                    'dropblock_groups should be a comma separated list of integers '
                    'between 1 and 4 (dropblock_groups: {}).'.format(
                        params['dropblock_groups']))
            dropblock_keep_probs[block_group - 1] = 1 - (
                (1 - dropblock_keep_prob) / 4.0**(4 - block_group))

    # This nested function allows us to avoid duplicating the logic which
    # builds the network, for different values of --precision.
    def build_network():
        network = resnet_model.resnet_v1(
            resnet_depth=params['resnet_depth'],
            num_classes=params['num_label_classes'],
            dropblock_size=params['dropblock_size'],
            dropblock_keep_probs=dropblock_keep_probs,
            data_format=params['data_format'])
        return network(inputs=features,
                       is_training=(mode == tf.estimator.ModeKeys.TRAIN))

    if params['precision'] == 'bfloat16':
        with tf.contrib.tpu.bfloat16_scope():
            logits = build_network()
        logits = tf.cast(logits, tf.float32)
    elif params['precision'] == 'float32':
        logits = build_network()
    else:
        # Fail with a clear message instead of an unbound `logits` below.
        raise ValueError(
            "params['precision'] must be 'bfloat16' or 'float32', got "
            '{!r}.'.format(params['precision']))

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'classes': tf.argmax(logits, axis=1),
            'probabilities': tf.nn.softmax(logits, name='softmax_tensor')
        }
        return tf.estimator.EstimatorSpec(
            mode=mode,
            predictions=predictions,
            export_outputs={
                'classify': tf.estimator.export.PredictOutput(predictions)
            })

    # If necessary, in the model_fn, use params['batch_size'] instead the batch
    # size flags (--train_batch_size or --eval_batch_size).
    batch_size = params['batch_size']  # pylint: disable=unused-variable

    # Calculate loss, which includes softmax cross entropy and L2 regularization.
    one_hot_labels = tf.one_hot(labels, params['num_label_classes'])
    cross_entropy = tf.losses.softmax_cross_entropy(
        logits=logits,
        onehot_labels=one_hot_labels,
        label_smoothing=params['label_smoothing'])

    # Add weight decay to the loss for non-batch-normalization variables.
    loss = cross_entropy + params['weight_decay'] * tf.add_n([
        tf.nn.l2_loss(v)
        for v in tf.trainable_variables()
        if 'batch_normalization' not in v.name
    ])

    host_call = None
    if mode == tf.estimator.ModeKeys.TRAIN:
        # Compute the current epoch and associated learning rate from global_step.
        global_step = tf.train.get_global_step()
        steps_per_epoch = params['num_train_images'] / params[
            'train_batch_size']
        current_epoch = (tf.cast(global_step, tf.float32) / steps_per_epoch)
        # LARS is a large batch optimizer. LARS enables higher accuracy at batch 16K
        # and larger batch sizes.
        if params['enable_lars']:
            # Placeholder value: only used for the summary written by host_call.
            learning_rate = 0.0
            optimizer = lars_util.init_lars_optimizer(current_epoch, params)
        else:
            # Prefer the learning-rate tensor supplied by `lottery`; fall back
            # to the regular schedule when it returns None.
            learning_rate = lottery.get_lr_tensor(params)
            if learning_rate is None:
                learning_rate = learning_rate_schedule(params, current_epoch)
            optimizer = tf.train.MomentumOptimizer(
                learning_rate=learning_rate,
                momentum=params['momentum'],
                use_nesterov=True)
        if params['use_tpu']:
            # When using TPU, wrap the optimizer with CrossShardOptimizer which
            # handles synchronization details between different TPU cores. To the
            # user, this should look like regular synchronous training.
            optimizer = tf.contrib.tpu.CrossShardOptimizer(optimizer)

        # Batch normalization requires UPDATE_OPS to be added as a dependency to
        # the train operation.
        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(update_ops):
            train_op = optimizer.minimize(loss, global_step)

        if not params['skip_host_call']:
            def host_call_fn(gs, loss, lr, ce):
                """Training host call. Creates scalar summaries for training metrics.

                This function is executed on the CPU and should not directly reference
                any Tensors in the rest of the `model_fn`. To pass Tensors from the
                model to the `metric_fn`, provide as part of the `host_call`. See
                https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
                for more information.

                Arguments should match the list of `Tensor` objects passed as the second
                element in the tuple passed to `host_call`.

                Args:
                  gs: `Tensor with shape `[batch]` for the global_step
                  loss: `Tensor` with shape `[batch]` for the training loss.
                  lr: `Tensor` with shape `[batch]` for the learning_rate.
                  ce: `Tensor` with shape `[batch]` for the current_epoch.

                Returns:
                  List of summary ops to run on the CPU host.
                """
                gs = gs[0]
                # Host call fns are executed params['iterations_per_loop'] times after
                # one TPU loop is finished, setting max_queue value to the same as
                # number of iterations will make the summary writer only flush the data
                # to storage once per loop.
                with summary.create_file_writer(
                        FLAGS.model_dir,
                        max_queue=params['iterations_per_loop']).as_default():
                    with summary.always_record_summaries():
                        summary.scalar('loss', loss[0], step=gs)
                        summary.scalar('learning_rate', lr[0], step=gs)
                        summary.scalar('current_epoch', ce[0], step=gs)

                        return summary.all_summary_ops()

            # To log the loss, current learning rate, and epoch for Tensorboard, the
            # summary op needs to be run on the host CPU via host_call. host_call
            # expects [batch_size, ...] Tensors, thus reshape to introduce a batch
            # dimension. These Tensors are implicitly concatenated to
            # [params['batch_size']].
            gs_t = tf.reshape(global_step, [1])
            loss_t = tf.reshape(loss, [1])
            lr_t = tf.reshape(learning_rate, [1])
            ce_t = tf.reshape(current_epoch, [1])

            host_call = (host_call_fn, [gs_t, loss_t, lr_t, ce_t])

    else:
        train_op = None

    eval_metrics = None
    if mode == tf.estimator.ModeKeys.EVAL:
        def metric_fn(labels, logits):
            """Evaluation metric function. Evaluates accuracy.

            This function is executed on the CPU and should not directly reference
            any Tensors in the rest of the `model_fn`. To pass Tensors from the model
            to the `metric_fn`, provide as part of the `eval_metrics`. See
            https://www.tensorflow.org/api_docs/python/tf/contrib/tpu/TPUEstimatorSpec
            for more information.

            Arguments should match the list of `Tensor` objects passed as the second
            element in the tuple passed to `eval_metrics`.

            Args:
              labels: `Tensor` with shape `[batch]`.
              logits: `Tensor` with shape `[batch, num_classes]`.

            Returns:
              A dict of the metrics to return from evaluation.
            """
            predictions = tf.argmax(logits, axis=1)
            top_1_accuracy = tf.metrics.accuracy(labels, predictions)
            in_top_5 = tf.cast(tf.nn.in_top_k(logits, labels, 5), tf.float32)
            top_5_accuracy = tf.metrics.mean(in_top_5)

            return {
                'top_1_accuracy': top_1_accuracy,
                'top_5_accuracy': top_5_accuracy,
            }

        eval_metrics = (metric_fn, [labels, logits])

    return tf.contrib.tpu.TPUEstimatorSpec(
        mode=mode,
        loss=loss,
        train_op=train_op,
        host_call=host_call,
        eval_metrics=eval_metrics)
def get_unfactored_bilinear_classifier(self, layer, unlabeled_targets, token_weights, variable_scope=None, reuse=False):
    """Jointly classify (label, position) with one softmax per token.

    The bilinear logits (n x m x c x m per the shape comments) are flattened
    to (n x m x cm), so each token gets a single distribution over all
    label/position combinations. The combined target index is
    `bucket_size * label + unlabeled_target`.

    Args:
      layer: input tensor; returned untouched as `outputs['recur_layer']`.
      unlabeled_targets: target positions, combined with `self.placeholder`
        into the joint target index.
      token_weights: weights for the loss and the correctness counts.
      variable_scope: optional scope name; defaults to `self.classname`.
      reuse: if truthy, the hidden keep probability is forced to 1.

    Returns:
      A dict with probabilities, loss, predictions and correctness counts.
    """

    recur_layer = layer
    keep_prob = 1 if reuse else self.hidden_keep_prob
    activation = self.hidden_func
    width = self.hidden_size
    use_linear = self.add_linear

    with tf.variable_scope(variable_scope or self.classname):
        for depth in six.moves.range(0, self.n_layers-1):
            with tf.variable_scope('FC-%d' % depth):
                layer = classifiers.hidden(
                    layer, 2*width,
                    hidden_func=activation,
                    hidden_keep_prob=keep_prob)
        with tf.variable_scope('FC-top'):
            top_layers = classifiers.hiddens(
                layer, 2*[width],
                hidden_func=activation,
                hidden_keep_prob=keep_prob)
        layer1, layer2 = top_layers[0], top_layers[1]

        with tf.variable_scope('Classifier'):
            # Pick the classifier constructor first, then call it once.
            if self.diagonal:
                make_logits = classifiers.diagonal_bilinear_classifier
            else:
                make_logits = classifiers.bilinear_classifier
            logits = make_logits(
                layer1, layer2, len(self),
                hidden_keep_prob=keep_prob,
                add_linear=use_linear)
    bucket_size = tf.shape(layer)[-2]

    #-------------------------------------------------------
    # Targets: fold the label and the position into one index
    # c (*) (n x m) + (n x m)
    targets = bucket_size * self.placeholder + unlabeled_targets

    #-------------------------------------------------------
    # Logits: (n x m x c x m) -> (n x m x cm)
    flat_shape = tf.stack([-1, bucket_size, bucket_size * len(self)])
    reshaped_logits = tf.reshape(logits, flat_shape)

    #-------------------------------------------------------
    # Probabilities / cross entropy
    # (n x m x cm) -> (n x m x cm)
    flat_probabilities = tf.nn.softmax(reshaped_logits)
    # (n x m x cm) -> (n x m x c x m)
    unflat_shape = tf.stack([-1, bucket_size, len(self), bucket_size])
    unflat_probabilities = tf.reshape(flat_probabilities, unflat_shape)
    # (n x m x c x m) -> (n x m x m x c)
    probabilities = tf.transpose(unflat_probabilities, [0,1,3,2])
    # (n x m), (n x m x cm), (n x m) -> ()
    loss = tf.losses.sparse_softmax_cross_entropy(targets, reshaped_logits, weights=token_weights)

    #-------------------------------------------------------
    # Predictions / accuracy
    # (n x m x cm) -> (n x m)
    predictions = tf.argmax(reshaped_logits, axis=-1, output_type=tf.int32)
    # (n x m), () -> (n x m); the remainder recovers the position part
    unlabeled_predictions = tf.mod(predictions, bucket_size)
    # (n x m) (*) (n x m) -> (n x m)
    correct_tokens = nn.equal(predictions, targets) * token_weights
    correct_unlabeled_tokens = nn.equal(unlabeled_predictions, unlabeled_targets) * token_weights

    # (n x m) -> (n)
    tokens_per_sequence = tf.reduce_sum(token_weights, axis=-1)
    # (n x m) -> (n)
    correct_tokens_per_sequence = tf.reduce_sum(correct_tokens, axis=-1)
    correct_unlabeled_tokens_per_sequence = tf.reduce_sum(correct_unlabeled_tokens, axis=-1)
    # (n), (n) -> (n); a sequence is correct when every weighted token is
    correct_sequences = nn.equal(tokens_per_sequence, correct_tokens_per_sequence)
    correct_unlabeled_sequences = nn.equal(tokens_per_sequence, correct_unlabeled_tokens_per_sequence)

    #-----------------------------------------------------------
    # Output dictionary
    return {
        'recur_layer': recur_layer,
        'unlabeled_targets': unlabeled_targets,
        'probabilities': probabilities,
        'unlabeled_loss': tf.constant(0.),
        'loss': loss,
        'unlabeled_predictions': unlabeled_predictions,
        'label_predictions': predictions,
        'n_correct_unlabeled_tokens': tf.reduce_sum(correct_unlabeled_tokens),
        'n_correct_unlabeled_sequences': tf.reduce_sum(correct_unlabeled_sequences),
        'n_correct_tokens': tf.reduce_sum(correct_tokens),
        'n_correct_sequences': tf.reduce_sum(correct_sequences),
    }
def __init__(self,is_training):
    """Build the multi-label classification graph on top of the ALBERT encoder.

    One independent 2-way softmax head is created per label column
    (`hp.num_labels` heads in total). When `is_training` is truthy the
    accuracy/loss summaries, the global step and the optimizer are created
    as well.

    Args:
      is_training: forwarded to the encoder; also gates the creation of the
        training-only tensors (accuracy, loss, optimizer, merged summaries).
    """
    # Training or not
    self.is_training = is_training

    # Placeholders
    seq_shape = [None, hp.sequence_length]
    self.input_ids = tf.placeholder(tf.int32, shape=seq_shape, name='input_ids')
    self.input_masks = tf.placeholder(tf.int32, shape=seq_shape, name='input_masks')
    self.segment_ids = tf.placeholder(tf.int32, shape=seq_shape, name='segment_ids')
    self.label_ids = tf.placeholder(tf.int32, shape=[None,hp.num_labels], name='label_ids')

    # Encoder
    self.model = modeling.AlbertModel(
        config=bert_config,
        is_training=self.is_training,
        input_ids=self.input_ids,
        input_mask=self.input_masks,
        token_type_ids=self.segment_ids,
        use_one_hot_embeddings=False)

    # Pooled sentence vector from the encoder
    output_layer = self.model.get_pooled_output()
    print('output_layer',output_layer)  # (?, 384) as noted by the original author
    # Width of the pooled vector
    hidden_size = output_layer.shape[-1].value

    with tf.name_scope("Full-connection"):
        per_label_losses = []
        per_label_logits = []
        for label_index in range(hp.num_labels):
            suffix = str(label_index)
            head_weights = tf.get_variable(
                "output_weights%s" % suffix, [2, hidden_size],
                initializer=tf.truncated_normal_initializer(stddev=0.02))
            head_bias = tf.get_variable(
                "output_bias%s" % suffix, [2],
                initializer=tf.zeros_initializer())
            head_logits = tf.nn.bias_add(
                tf.matmul(output_layer, head_weights, transpose_b=True),
                head_bias)
            per_label_logits.append(head_logits)
            head_targets = tf.one_hot(self.label_ids[:,label_index], depth=2, dtype=tf.int32)
            head_loss = tf.nn.softmax_cross_entropy_with_logits(
                labels=head_targets, logits=head_logits)
            per_label_losses.append(tf.reduce_mean(head_loss))
        # Stack along a new leading axis, then swap it with the batch axis.
        self.logits_num_label = tf.transpose(tf.stack(per_label_logits, 0),[1,0,2])
        self.loss_num_label = tf.stack(per_label_losses, 0)
        self.probabilities = tf.nn.sigmoid(self.logits_num_label)

    with tf.variable_scope("Prediction"):
        # Index of the larger of the two scores for every label
        self.predictions = tf.to_int32(tf.argmax(self.probabilities,2))

    with tf.variable_scope("loss"):
        # Summary for tensorboard
        if self.is_training:
            hits = tf.to_float(tf.equal(self.predictions, self.label_ids))
            self.accuracy = tf.reduce_mean(hits)
            tf.summary.scalar('accuracy', self.accuracy)

        # Either a saved model already exists, or the pretrained weights
        # are mapped in from hp.init_checkpoint.
        ckpt = tf.train.get_checkpoint_state(hp.saved_model_path)
        checkpoint_suffix = ".index"
        has_saved_model = ckpt and tf.gfile.Exists(ckpt.model_checkpoint_path + checkpoint_suffix)
        if not has_saved_model:
            print('='*10,'First time load BERT model!','='*10)
            tvars = tf.trainable_variables()
            if hp.init_checkpoint:
                assignment_map, _ = modeling.get_assignment_map_from_checkpoint(
                    tvars, hp.init_checkpoint)
                tf.train.init_from_checkpoint(hp.init_checkpoint, assignment_map)
        else:
            print('='*10,'Restoring model from checkpoint!','='*10)
            print("%s - Restoring model from checkpoint ~%s" % (time_now_string(),
                                                                ckpt.model_checkpoint_path))

        # Loss and Optimizer
        if self.is_training:
            # Global_step
            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            self.loss = tf.reduce_mean(self.loss_num_label)

            # Step budget derived from the size of the training set
            train_examples = processor.get_train_examples(hp.data_dir)
            num_train_steps = int(
                len(train_examples) / hp.batch_size * hp.num_train_epochs)
            num_warmup_steps = int(num_train_steps * hp.warmup_proportion)
            print('num_train_steps',num_train_steps)
            self.optimizer = optimization.create_optimizer(self.loss,
                                                           hp.learning_rate,
                                                           num_train_steps,
                                                           num_warmup_steps,
                                                           hp.use_tpu,
                                                           Global_step=self.global_step)

            # Summary for tensorboard
            tf.summary.scalar('loss', self.loss)
            self.merged = tf.summary.merge_all()
def optimize(content_targets, style_target, content_weight, style_weight,
             tv_weight, vgg_path, epochs=2, print_iterations=1000,
             batch_size=4, save_path='saver/fns.ckpt', slow=False,
             learning_rate=1e-3, debug=True):
    """Train the style-transfer network and yield periodic progress.

    This is a generator: nothing runs until it is iterated.

    Args:
      content_targets: sequence of items passed to `get_img` to load the
        content images; trimmed to a multiple of `batch_size`.
      style_target: style image array (`.shape` is used to size the
        placeholder).
      content_weight: multiplier of the content loss.
      style_weight: multiplier of the style loss.
      tv_weight: multiplier of the total-variation loss.
      vgg_path: forwarded to `vgg.net`.
      epochs: number of passes over `content_targets`.
      print_iterations: report every this many iterations (every this many
        epochs when `slow` is set).
      batch_size: images per step; forced to 1 when `slow` is set.
      save_path: checkpoint path written on every report (non-slow mode).
      slow: optimize the output image directly (a `tf.Variable`) instead of
        training `transform.net`.
      learning_rate: Adam learning rate.
      debug: print the wall time of every batch.

    Yields:
      `(preds, losses, iterations, epoch)` where `losses` is
      `(style_loss, content_loss, tv_loss, total_loss)`.
    """
    if slow:
        batch_size = 1
    mod = len(content_targets) % batch_size
    if mod > 0:
        print("Train set has been trimmed slightly..")
        content_targets = content_targets[:-mod]

    style_features = {}

    batch_shape = (batch_size, 256, 256, 3)
    style_shape = (1, ) + style_target.shape
    print(style_shape)

    # precompute style features (Gram matrices) in a throwaway graph; the
    # session is entered only to become the default session for `.eval`.
    with tf.Graph().as_default(), tf.device('/cpu:0'), tf.Session():
        style_image = tf.placeholder(tf.float32, shape=style_shape, name='style_image')
        style_image_pre = vgg.preprocess(style_image)
        net = vgg.net(vgg_path, style_image_pre)
        style_pre = np.array([style_target])
        for layer in STYLE_LAYERS:
            features = net[layer].eval(feed_dict={style_image: style_pre})
            features = np.reshape(features, (-1, features.shape[3]))
            gram = np.matmul(features.T, features) / features.size
            style_features[layer] = gram

    with tf.Graph().as_default(), tf.Session() as sess:
        X_content = tf.placeholder(tf.float32, shape=batch_shape, name="X_content")
        X_pre = vgg.preprocess(X_content)

        # precompute content features
        content_features = {}
        content_net = vgg.net(vgg_path, X_pre)
        content_features[CONTENT_LAYER] = content_net[CONTENT_LAYER]

        if slow:
            preds = tf.Variable(
                tf.random_normal(X_content.get_shape()) * 0.256)
            preds_pre = preds
        else:
            preds = transform.net(X_content / 255.0)
            preds_pre = vgg.preprocess(preds)

        net = vgg.net(vgg_path, preds_pre)

        content_size = _tensor_size(
            content_features[CONTENT_LAYER]) * batch_size
        assert _tensor_size(content_features[CONTENT_LAYER]) == _tensor_size(
            net[CONTENT_LAYER])
        content_loss = content_weight * (
            2 * tf.nn.l2_loss(net[CONTENT_LAYER] - content_features[CONTENT_LAYER]) / content_size)

        style_losses = []
        for style_layer in STYLE_LAYERS:
            layer = net[style_layer]
            bs, height, width, filters = map(lambda i: i.value, layer.get_shape())
            size = height * width * filters
            feats = tf.reshape(layer, (bs, height * width, filters))
            feats_T = tf.transpose(feats, perm=[0, 2, 1])
            grams = tf.matmul(feats_T, feats) / size
            style_gram = style_features[style_layer]
            style_losses.append(2 * tf.nn.l2_loss(grams - style_gram) / style_gram.size)

        style_loss = style_weight * functools.reduce(tf.add, style_losses) / batch_size

        # total variation denoising
        tv_y_size = _tensor_size(preds[:, 1:, :, :])
        tv_x_size = _tensor_size(preds[:, :, 1:, :])
        y_tv = tf.nn.l2_loss(preds[:, 1:, :, :] - preds[:, :batch_shape[1] - 1, :, :])
        x_tv = tf.nn.l2_loss(preds[:, :, 1:, :] - preds[:, :, :batch_shape[2] - 1, :])
        tv_loss = tv_weight * 2 * (x_tv / tv_x_size + y_tv / tv_y_size) / batch_size

        loss = content_loss + style_loss + tv_loss

        # overall loss
        train_step = tf.train.AdamOptimizer(learning_rate).minimize(loss)
        sess.run(tf.global_variables_initializer())
        import random
        uid = random.randint(1, 100)
        print("UID: %s" % uid)

        # Created on first use and then reused: constructing a Saver adds
        # save/restore ops to the graph, so building one per checkpoint would
        # grow the graph for the whole run.
        saver = None

        for epoch in range(epochs):
            num_examples = len(content_targets)
            iterations = 0
            while iterations * batch_size < num_examples:
                start_time = time.time()
                curr = iterations * batch_size
                step = curr + batch_size
                X_batch = np.zeros(batch_shape, dtype=np.float32)
                for j, img_p in enumerate(content_targets[curr:step]):
                    X_batch[j] = get_img(img_p, (256, 256, 3)).astype(np.float32)

                iterations += 1
                assert X_batch.shape[0] == batch_size

                feed_dict = {X_content: X_batch}
                print("start training")
                train_step.run(feed_dict=feed_dict)
                end_time = time.time()
                delta_time = end_time - start_time
                if debug:
                    print("UID: %s, batch time: %s" % (uid, delta_time))
                is_print_iter = int(iterations) % print_iterations == 0
                if slow:
                    # In slow mode the reporting cadence is counted in epochs.
                    is_print_iter = epoch % print_iterations == 0
                is_last = epoch == epochs - 1 and iterations * batch_size >= num_examples
                should_print = is_print_iter or is_last
                if should_print:
                    to_get = [style_loss, content_loss, tv_loss, loss, preds]
                    test_feed_dict = {X_content: X_batch}

                    tup = sess.run(to_get, feed_dict=test_feed_dict)
                    _style_loss, _content_loss, _tv_loss, _loss, _preds = tup
                    losses = (_style_loss, _content_loss, _tv_loss, _loss)
                    if slow:
                        _preds = vgg.unprocess(_preds)
                    else:
                        if saver is None:
                            saver = tf.train.Saver()
                        saver.save(sess, save_path)
                    yield (_preds, losses, iterations, epoch)
def build_network(lr=None, n_stack=None, image_size=None, n_actions=None):
    """Build the evaluation and target Q-networks for the RL algorithm.

    Both networks share one architecture (three conv layers, a 512-unit
    dense layer and an `n_actions`-unit output layer) but live in separate
    variable scopes. Only the evaluation network is trained, with RMSProp on
    the mean squared difference between `q_target` and its output.

    Args:
      lr: learning rate; defaults to `Hyperparameters().LEARNING_RATE`.
      n_stack: number of stacked frames per input; defaults to `N_STACK`.
      image_size: side length of the square frames; defaults to `IMAGE_SIZE`.
      n_actions: number of outputs; defaults to `N_ACTIONS`.

    Returns:
      A list of three lists:
      `[[eval_net_input, target_net_input, q_target],
        [e_out, loss, t_out],
        [e_params, t_params, _train_op]]`.
    """
    # init Hp
    hp = Hyperparameters()
    flag = hp.model
    if lr is None:
        lr = hp.LEARNING_RATE
    if n_stack is None:
        n_stack = hp.N_STACK
    if image_size is None:
        image_size = hp.IMAGE_SIZE
    if n_actions is None:
        n_actions = hp.N_ACTIONS

    # Original author's note: this network occupies about 879 MiB of GPU memory.

    # ------------------ all inputs --------------------------
    # input for eval net
    eval_net_input = tf.placeholder(
        tf.float32, shape=[None, n_stack, image_size, image_size],
        name='eval_net_input_' + flag)
    # input for target net
    target_net_input = tf.placeholder(
        tf.float32, shape=[None, n_stack, image_size, image_size],
        name='target_net_input_' + flag)
    # q_target for loss
    q_target = tf.placeholder(tf.float32, shape=[None, n_actions],
                              name='q_target_' + flag)

    # ------------------ build evaluate_net ------------------
    with tf.variable_scope('eval_net_' + flag):
        e_out = _build_q_network(eval_net_input, n_actions)

    with tf.variable_scope('loss_' + flag):
        loss = tf.reduce_mean(
            tf.squared_difference(q_target, e_out, name='TD_error_' + flag))

    with tf.variable_scope('train_' + flag):
        _train_op = tf.train.RMSPropOptimizer(lr, 0.99, 0.0, 1e-6).minimize(loss)

    # ------------------ build target_net --------------------
    with tf.variable_scope('target_net_' + flag):
        t_out = _build_q_network(target_net_input, n_actions)

    t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                 scope='target_net_' + flag)
    e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                 scope='eval_net_' + flag)

    return [[eval_net_input, target_net_input, q_target],
            [e_out, loss, t_out],
            [e_params, t_params, _train_op]]


def _build_q_network(net_input, n_actions):
    """Build one conv-conv-conv-dense-dense Q-network in the current scope.

    The shape comments are the original author's, for 4 stacked 80x80 frames.
    """
    # (?, 4, 80, 80)
    scaled = net_input / 255
    # Channels-first to channels-last: (?, 80, 80, 4)
    nhwc = tf.transpose(scaled, [0, 2, 3, 1])
    # tf.contrib.layers.conv2d(..., activation_fn=tf.nn.relu,...)
    conv1 = tf.contrib.layers.conv2d(inputs=nhwc, num_outputs=32,
                                     kernel_size=8, stride=4)  # (?, 20, 20, 32)
    conv2 = tf.contrib.layers.conv2d(inputs=conv1, num_outputs=64,
                                     kernel_size=4, stride=2)  # (?, 10, 10, 64)
    conv3 = tf.contrib.layers.conv2d(inputs=conv2, num_outputs=64,
                                     kernel_size=3, stride=1)  # (?, 10, 10, 64)
    flat = tf.contrib.layers.flatten(conv3)
    dense = tf.contrib.layers.fully_connected(flat, 512)
    return tf.contrib.layers.fully_connected(dense, n_actions)
# #stacked_rnn = [] #for i in range(3): # stacked_rnn.append(tf.contrib.rnn.LSTMCell(n_hidden)) #mcell = tf.contrib.rnn.MultiRNNCell(stacked_rnn) gru = tf.contrib.rnn.GRUCell(n_hidden * 2) lstm_cell = tf.contrib.rnn.LSTMCell(n_hidden) mcell = tf.contrib.rnn.MultiRNNCell([lstm_cell, gru]) #x1 = tf.unstack(x, n_steps, 1) #outputs, states = tf.contrib.rnn.static_rnn(mcell, x1, dtype=tf.float32) #pred = tf.contrib.layers.fully_connected(outputs[-1],n_classes,activation_fn = None) outputs, states = tf.nn.dynamic_rnn(mcell, x, dtype=tf.float32) #(?, 28, 256) outputs = tf.transpose( outputs, [1, 0, 2]) #(28, ?, 256) 28个时序,取最后一个时序outputs[-1]=(?,256) pred = tf.contrib.layers.fully_connected(outputs[-1], n_classes, activation_fn=None) learning_rate = 0.001 # Define loss and optimizer cost = tf.reduce_mean( tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y)) optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost) # Evaluate model correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1)) accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32)) training_iters = 100000
def decode(self, x, conv_inputs1, features):
    """Run the decoder step by step inside a `tf.while_loop`.

    Each dilated layer gets a `TensorArray` used as a queue: it is seeded
    with the last `dilation` rows of that layer's encoder input and, at every
    step, the layer input is written `dilation` slots ahead so that it is
    read back `dilation` steps later. All weights are fetched from existing
    variable scopes with `reuse=True`.

    Args:
      x: input series; only `tf.shape(x)[0]` (batch size) and the entry at
        index `self.encode_len - 1` of each example are used here.
      conv_inputs1: one tensor per dilated layer, paired with
        `self.dilations`.
      features: decoder features, made time-major and read one step at a
        time (assumes batch-major rank 3 -- TODO confirm with the caller).

    Returns:
      The per-step outputs stacked and transposed with `(1, 0, 2)`, i.e.
      with the step axis second.
    """
    batch_size = tf.shape(x)[0]

    # initialize state tensor arrays
    state_queues = []
    for i, (conv_input, dilation) in enumerate(zip(conv_inputs1, self.dilations)):
        # Row indices: every batch index repeated `dilation` times.
        batch_idx = tf.range(batch_size)
        batch_idx = tf.tile(tf.expand_dims(batch_idx, 1), (1, dilation))
        batch_idx = tf.reshape(batch_idx, [-1])

        # Column indices: `dilation` consecutive time steps starting at
        # encode_len - dilation - 1, per example.
        queue_begin_time = self.encode_len - dilation - 1
        temporal_idx = tf.expand_dims(queue_begin_time, 1) + tf.expand_dims(tf.range(dilation), 0)
        temporal_idx = tf.reshape(temporal_idx, [-1])

        idx = tf.stack([batch_idx, temporal_idx], axis=1)
        slices = tf.reshape(tf.gather_nd(conv_input, idx), (batch_size, dilation, shape(conv_input, 2)))

        # Room for the seeded slots plus one write per decode step.
        layer_ta = tf.TensorArray(dtype=tf.float32, size=dilation + self.decode_series_len)
        layer_ta = layer_ta.unstack(tf.transpose(slices, (1, 0, 2)))
        state_queues.append(layer_ta)

    # initialize feature tensor array
    features_ta = tf.TensorArray(dtype=tf.float32, size=self.decode_series_len)
    features_ta = features_ta.unstack(tf.transpose(features, (1, 0, 2)))

    # initialize output tensor array
    emit_ta = tf.TensorArray(size=self.decode_series_len, dtype=tf.float32)

    # initialize other loop vars
    elements_finished = 0 >= self.decode_len
    time = tf.constant(0, dtype=tf.int32)

    # get initial x input: the entry at index encode_len - 1 of each example
    current_idx = tf.stack([tf.range(tf.shape(self.encode_len)[0]), self.encode_len - 1], axis=1)
    initial_input = tf.gather_nd(x, current_idx)

    def loop_fn(time1, current_input, queues):
        """Compute one decoder step; returns (finished flags, output, queues)."""
        current_features = features_ta.read(time1)
        current_input = tf.concat([current_input, current_features], axis=1)

        with tf.variable_scope('x-proj-decode', reuse=True):
            w_x_proj = tf.get_variable('weights')
            b_x_proj = tf.get_variable('biases')
            x_proj = tf.nn.tanh(tf.matmul(current_input, w_x_proj) + b_x_proj)

        skip_outputs, updated_queues = [], []
        for i, (conv_input, queue, dilation) in enumerate(zip(conv_inputs1, queues, self.dilations)):
            # Value written `dilation` steps earlier (or seeded above).
            state = queue.read(time1)
            with tf.variable_scope('dilated-conv-decode-{}'.format(i), reuse=True):
                # The `.format(i)` calls on these names are no-ops: the
                # strings contain no placeholders.
                w_conv = tf.get_variable('weights'.format(i))
                b_conv = tf.get_variable('biases'.format(i))
                # Two-tap convolution written as two matmuls: tap 0 on the
                # delayed state, tap 1 on the current input.
                dilated_conv = tf.matmul(state, w_conv[0, :, :]) + tf.matmul(x_proj, w_conv[1, :, :]) + b_conv
            # Gated activation: tanh(filter) * sigmoid(gate).
            conv_filter, conv_gate = tf.split(dilated_conv, 2, axis=1)
            dilated_conv = tf.nn.tanh(conv_filter) * tf.nn.sigmoid(conv_gate)

            with tf.variable_scope('dilated-conv-proj-decode-{}'.format(i), reuse=True):
                w_proj = tf.get_variable('weights'.format(i))
                b_proj = tf.get_variable('biases'.format(i))
                concat_outputs = tf.matmul(dilated_conv, w_proj) + b_proj
            skips, residuals = tf.split(concat_outputs, [self.skip_channels, self.residual_channels], axis=1)

            x_proj += residuals
            skip_outputs.append(skips)
            updated_queues.append(queue.write(time1 + dilation, x_proj))

        skip_outputs = tf.nn.relu(tf.concat(skip_outputs, axis=1))
        with tf.variable_scope('dense-decode-1', reuse=True):
            w_h = tf.get_variable('weights')
            b_h = tf.get_variable('biases')
            h = tf.nn.relu(tf.matmul(skip_outputs, w_h) + b_h)

        with tf.variable_scope('dense-decode-2', reuse=True):
            w_y = tf.get_variable('weights')
            b_y = tf.get_variable('biases')
            y_hat2 = tf.matmul(h, w_y) + b_y

        elements_finished2 = (time1 >= self.decode_len)
        finished = tf.reduce_all(elements_finished2)

        # Zeros once every element is finished, otherwise this step's output.
        next_input = tf.cond(
            finished,
            lambda: tf.zeros([batch_size, 1], dtype=tf.float32),
            lambda: y_hat2
        )
        next_elements_finished = (time1 >= self.decode_len -1)

        return next_elements_finished, next_input, updated_queues

    def condition(unused_time, elements_finished1, *_):
        """Keep looping while at least one element is unfinished."""
        return tf.logical_not(tf.reduce_all(elements_finished1))

    def body(time1, elements_finished1, emit_ta1, *state_queues1):
        """One loop iteration: compute the step, record it, advance time."""
        # NOTE(review): `initial_input` is passed on every iteration and the
        # value returned by loop_fn is only emitted, never fed back (there is
        # no input tensor among the loop variables) -- confirm this is
        # intended rather than feeding the previous step's output.
        (next_finished, emit_output, state_queues2) = loop_fn(time1, initial_input, state_queues1)

        # Finished elements emit zeros.
        emit = tf.where(elements_finished1, tf.zeros_like(emit_output), emit_output)
        emit_ta2 = emit_ta1.write(time1, emit)

        #elements_finished2 = tf.logical_or(elements_finished1, next_finished)
        return [time1 + 1, next_finished, emit_ta2] + list(state_queues2)

    returned = tf.while_loop(
        cond=condition,
        body=body,
        loop_vars=[time, elements_finished, emit_ta] + state_queues
    )

    # Index 2 of the loop variables is the emit TensorArray.
    outputs_ta = returned[2]
    y_hat = tf.transpose(outputs_ta.stack(), (1, 0, 2))
    return y_hat
def process_batch_input_for_RNN(batch_input):
    """Exchange the first two axes of a rank-3 batch tensor.

    The previous implementation applied ``tf.transpose(x, perm=[2, 0, 1])``
    followed by a full axis reversal. Composing those two permutations gives
    ``[1, 0, 2]``, so a single transpose produces the same tensor with one
    op instead of two.

    Args:
        batch_input: Rank-3 tensor. Presumably laid out as
            (batch, time, features) -- TODO confirm against the caller.

    Returns:
        Tensor with axes 0 and 1 swapped and axis 2 left in place.
    """
    return tf.transpose(batch_input, perm=[1, 0, 2])
def preprocess_for_train(image, labels, bboxes, out_shape, data_format='NHWC',
                         scope='ssd_preprocessing_train'):
    """Build the training-time augmentation graph for a single image.

    The steps run in this order: conversion to float32, a ``tf.maximum``
    clamp on the boxes, a random bounding-box crop, a bilinear resize to
    ``out_shape``, a random horizontal flip, one of four random colour
    distortions, multiplication by 255, whitening with the per-channel
    means, and an optional transposition to channels-first.

    Args:
        image: Rank-3 image tensor (the static rank is checked).
        labels: Labels forwarded to, and returned by, the bounding-box crop.
        bboxes: Bounding boxes, transformed alongside the image.
        out_shape: Target size handed to ``tf_image.resize_image``.
        data_format: ``'NHWC'`` (default) keeps the axis order; ``'NCHW'``
            transposes the result with ``perm=(2, 0, 1)``.
        scope: Name scope under which the ops are created.

    Returns:
        Tuple ``(image, labels, bboxes, num)``. ``num`` is the fifth value
        returned by ``distorted_bounding_box_crop``; its meaning is defined
        by that helper.

    Raises:
        ValueError: If the static rank of ``image`` is not 3.
    """
    fast_mode = False

    def _distort(x, ordering):
        # Adapter so the selector can call distort_color with fast_mode bound.
        return distort_color(x, ordering, fast_mode)

    with tf.name_scope(scope, 'ssd_preprocessing_train',
                       [image, labels, bboxes]):
        if image.get_shape().ndims != 3:
            raise ValueError('Input must be of size [height, width, C>0]')

        # Work in float32; convert_image_dtype rescales integer images.
        if image.dtype != tf.float32:
            image = tf.image.convert_image_dtype(image, dtype=tf.float32)
        tf_summary_image(image, bboxes, 'image_with_bboxes')

        # DontCare-label filtering is not applied in this pipeline.

        # NOTE(review): this imposes an element-wise lower bound of 1.0 on
        # the box coordinates. If boxes are normalised to [0, 1] this would
        # saturate them -- confirm the coordinate convention.
        bboxes = tf.maximum(bboxes, 1.0)

        # Random crop of the image together with its boxes; the fourth
        # returned value (the crop box) is not used here.
        cropped = distorted_bounding_box_crop(
            image, labels, bboxes,
            aspect_ratio_range=CROP_RATIO_RANGE)
        augmented, labels, bboxes, _unused_distort_bbox, num = cropped

        # Bring the crop to the requested output size.
        augmented = tf_image.resize_image(
            augmented, out_shape,
            method=tf.image.ResizeMethod.BILINEAR,
            align_corners=False)
        tf_summary_image(augmented, bboxes, 'image_shape_distorted')

        # Random horizontal flip (boxes are flipped with the image).
        augmented, bboxes = tf_image.random_flip_left_right(augmented, bboxes)

        # Random colour distortion: one of 4 orderings is selected.
        augmented = apply_with_random_selector(
            augmented, _distort, num_cases=4)
        tf_summary_image(augmented, bboxes, 'image_color_distorted')

        # Back to the 0-255 range, then subtract the channel means.
        whitened = tf_image_whitened(augmented * 255.,
                                     [_R_MEAN, _G_MEAN, _B_MEAN])

        # Optional channels-first layout.
        if data_format == 'NCHW':
            whitened = tf.transpose(whitened, perm=(2, 0, 1))
        return whitened, labels, bboxes, num
#Forward Pass, ab_i is the state vector together with bias ab_0 = tf.concat([a_0, tf.tile(tf.ones([1, 1]), [tf.shape(a_0)[0], 1])], 1) z_1 = tf.matmul(ab_0, w_1) a_1 = sigma(z_1) ab_1 = tf.concat([a_1, tf.tile(tf.ones([1, 1]), [tf.shape(a_1)[0], 1])], 1) z_2 = tf.matmul(ab_1, w_2) a_2 = sigma(z_2) diff = tf.subtract(a_2, y) #Backward Pass reg2 = tf.Variable(0.001) reg1 = tf.Variable(0.001) d_z_2 = tf.multiply(diff, sigmaprime(z_2)) d_w_2 = tf.matmul(tf.transpose(tf.matmul(ab_1, F_1)), d_z_2) inc_w_2 = tf.subtract(w_2, w_old_2) reg_w_2 = tf.multiply(reg2, inc_w_2) d_w_2 = tf.add(d_w_2, reg_w_2) d_ab_1 = tf.matmul(d_z_2, tf.transpose(w_2)) d_a_1 = d_ab_1[:, :-1] d_z_1 = tf.multiply(d_a_1, sigmaprime(z_1)) d_w_1 = tf.matmul(tf.transpose(tf.matmul(ab_0, F_0)), d_z_1) inc_w_1 = tf.subtract(w_1, w_old_1) reg_w_1 = tf.multiply(reg1, inc_w_1) d_w_1 = tf.add(d_w_1, reg_w_1) eta = tf.constant(0.1)
def _calculate_input_gram_matrix_for(self, network, layer):
    """Return the size-normalised Gram matrix of one layer's feature map.

    The feature tensor is flattened over its two middle axes and multiplied
    by its own transpose, per batch element, then divided by the product of
    the three non-batch dimensions.

    Args:
        network: Mapping from layer key to a rank-4 feature tensor whose
            static dimensions are unpacked as
            (batch_size, height, width, number).
        layer: Key selecting the feature tensor in ``network``.

    Returns:
        Tensor of shape (batch, number, number).
    """
    image_feature = network[layer]
    # as_list() yields plain ints (or None) on every TensorFlow version,
    # whereas reading `.value` from each dimension only works with
    # TF1-style Dimension objects.
    batch_size, height, width, number = image_feature.get_shape().as_list()
    size = height * width * number
    # Let reshape infer the batch dimension when it is not statically known;
    # passing None there would make tf.reshape fail.
    batch_dim = -1 if batch_size is None else batch_size
    image_feature = tf.reshape(image_feature,
                               (batch_dim, height * width, number))
    transposed = tf.transpose(image_feature, perm=[0, 2, 1])
    return tf.matmul(transposed, image_feature) / size
def preprocess_for_eval(image, labels, bboxes, out_shape=EVAL_SIZE,
                        data_format='NHWC', difficults=None,
                        resize=Resize.WARP_RESIZE,
                        scope='ssd_preprocessing_train'):
    """Build the evaluation-time preprocessing graph for a single image.

    The image is cast to float and whitened with the per-channel means. A
    full-frame box ``[0, 0, 1, 1]`` is prepended to ``bboxes`` so that it
    goes through the same geometric transform as the real boxes; it is
    split off again afterwards and returned as ``bbox_img``.

    Args:
        image: Rank-3 image tensor (the static rank is checked).
        labels: Labels; filtered by ``difficults`` when that is given.
        bboxes: Bounding boxes, or ``None`` for no boxes.
        out_shape: Output (height, width) used by every resize strategy
            except ``Resize.NONE``.
        data_format: ``'NHWC'`` (default) keeps the axis order; ``'NCHW'``
            transposes the result with ``perm=(2, 0, 1)``.
        difficults: Optional per-box flags; boxes and labels whose flag is
            truthy are removed.
        resize: One of the ``Resize`` strategies.
        scope: Name scope under which the ops are created.

    Returns:
        Tuple ``(image, labels, bboxes, bbox_img)``.

    Raises:
        ValueError: If the static rank of ``image`` is not 3.
    """
    with tf.name_scope(scope):
        if image.get_shape().ndims != 3:
            raise ValueError('Input must be of size [height, width, C>0]')

        image = tf_image_whitened(tf.to_float(image),
                                  [_R_MEAN, _G_MEAN, _B_MEAN])

        # Prepend the full-frame rectangle so it is transformed with the boxes.
        full_frame = tf.constant([[0., 0., 1., 1.]])
        if bboxes is None:
            bboxes = full_frame
        else:
            bboxes = tf.concat([full_frame, bboxes], axis=0)

        if resize == Resize.NONE:
            # Image is left at its original size.
            pass
        elif resize == Resize.CENTRAL_CROP:
            # Crop (or pad) around the centre to the requested size.
            image, bboxes = tf_image.resize_image_bboxes_with_crop_or_pad(
                image, bboxes, out_shape[0], out_shape[1])
        elif resize == Resize.PAD_AND_RESIZE:
            # Shrink (never enlarge) so that the image fits inside out_shape,
            # keeping the aspect ratio, then pad up to out_shape.
            dims = tf.shape(image)
            unity = tf.to_double(1.0)
            height_ratio = tf.to_double(out_shape[0] / dims[0])
            width_ratio = tf.to_double(out_shape[1] / dims[1])
            scale = tf.minimum(unity, tf.minimum(height_ratio, width_ratio))
            scaled_hw = scale * tf.to_double(dims[0:2])
            scaled_hw = tf.cast(tf.floor(scaled_hw), tf.int32)
            image = tf_image.resize_image(
                image, scaled_hw,
                method=tf.image.ResizeMethod.BILINEAR,
                align_corners=False)
            image, bboxes = tf_image.resize_image_bboxes_with_crop_or_pad(
                image, bboxes, out_shape[0], out_shape[1])
        elif resize == Resize.WARP_RESIZE:
            # Stretch directly to out_shape, ignoring the aspect ratio.
            image = tf_image.resize_image(
                image, out_shape,
                method=tf.image.ResizeMethod.BILINEAR,
                align_corners=False)

        # Separate the full-frame rectangle from the real boxes again.
        bbox_img = bboxes[0]
        bboxes = bboxes[1:]

        # Drop the boxes (and their labels) flagged as difficult.
        if difficults is not None:
            keep = tf.logical_not(tf.cast(difficults, tf.bool))
            labels = tf.boolean_mask(labels, keep)
            bboxes = tf.boolean_mask(bboxes, keep)

        # Optional channels-first layout.
        if data_format == 'NCHW':
            image = tf.transpose(image, perm=(2, 0, 1))
        return image, labels, bboxes, bbox_img
def RNN(X):
    """Build a six-layer ReLU MLP followed by an LSTM and a linear read-out.

    The input is flattened to rows of ``feature_number`` values, passed
    through six fully connected ReLU layers (128, 64, 64, 64, 64, 128
    units), regrouped into sequences of ``n_steps`` rows, and fed to a
    ``BasicLSTMCell``. The LSTM output at the last time step is projected
    with the ``'out'`` weights and bias.

    Relies on the module-level names ``n_inputs``, ``feature_number``,
    ``n_steps``, ``n_classes`` and ``batch_size``.

    Args:
        X: Input tensor; reshaped to ``[-1, feature_number]``.

    Returns:
        Tensor with ``n_classes`` columns, one row per sequence.
    """
    # Dense layers in order: dictionary key and number of units.
    layer_keys = ('in', 'hidd2', 'hidd3', 'hidd4', 'hidd5', 'hidd6')
    layer_units = (128, 64, 64, 64, 64, 128)
    n_hidden1_units = layer_units[0]
    n_hidden6_units = layer_units[-1]

    # Each layer's input width is the previous layer's output width.
    # NOTE(review): the first weight matrix has n_inputs rows while X is
    # reshaped to feature_number columns below -- confirm that these two
    # module-level values are equal.
    input_widths = (n_inputs,) + layer_units[:-1]

    # Variables are created weights-first, then biases, each in layer order.
    # All are trainable, which is the default for tf.Variable.
    weights = {}
    for key, fan_in, fan_out in zip(layer_keys, input_widths, layer_units):
        weights[key] = tf.Variable(tf.random_normal([fan_in, fan_out]))
    weights['out'] = tf.Variable(
        tf.random_normal([n_hidden6_units, n_classes]))

    biases = {}
    for key, units in zip(layer_keys, layer_units):
        biases[key] = tf.Variable(tf.constant(0.1, shape=[units]))
    biases['out'] = tf.Variable(tf.constant(0.1, shape=[n_classes]))

    # Dense stack: X -> (rows, feature_number) -> ... -> (rows, 128).
    hidden = tf.reshape(X, [-1, feature_number])
    for key in layer_keys:
        hidden = tf.nn.relu(
            tf.add(tf.matmul(hidden, weights[key]), biases[key]))

    # Regroup rows into sequences: (sequences, n_steps, n_hidden6_units).
    X_in = tf.reshape(hidden, [-1, n_steps, n_hidden6_units])

    # Basic LSTM cell over the sequences, starting from a zero state.
    # NOTE(review): the cell width is n_hidden1_units while weights['out']
    # has n_hidden6_units rows; this works because both are 128.
    lstm_cell = tf.contrib.rnn.BasicLSTMCell(
        n_hidden1_units, forget_bias=1, state_is_tuple=True)
    init_state = lstm_cell.zero_state(batch_size, dtype=tf.float32)
    outputs, _final_state = tf.nn.dynamic_rnn(
        lstm_cell, X_in, initial_state=init_state, time_major=False)

    # Go time-major and split per step so the last step can be selected.
    per_step_outputs = tf.unstack(tf.transpose(outputs, [1, 0, 2]))
    last_output = per_step_outputs[-1]

    # An attention-style variant (scaling the last output by final_state[0])
    # was sketched here previously and is not active.
    return tf.matmul(last_output, weights['out']) + biases['out']
def _split_output_tensor(self, tensor):
    """Split a rank-3 tensor into a list of 2-D slices along its last axis.

    The previous implementation moved axis 2 to the front with
    ``tf.transpose(tensor, [2, 0, 1])`` and then unstacked along axis 0.
    Unstacking directly along axis 2 returns the same slices (element
    ``k`` is ``tensor[:, :, k]``) without the extra transpose op.

    Args:
        tensor: Rank-3 tensor whose last dimension is statically known
            (required by ``tf.unstack``).

    Returns:
        List of tensors, one per index of the last axis, each with the
        shape of the first two axes of ``tensor``.
    """
    return tf.unstack(tensor, axis=2)