def testWrongDimensions(self):
  """matrix_solve_ls must reject operands whose row counts differ."""
  with self.session(use_gpu=True):
    # Two rows on the left, a single row on the right.
    two_row_matrix = constant_op.constant([[1., 0.], [0., 1.]])
    one_row_rhs = constant_op.constant([[1., 0.]])
    self.assertRaises(ValueError, linalg_ops.matrix_solve_ls, two_row_matrix,
                      one_row_rhs)
def benchmarkMatrixSolveLsOp(self):
  """Benchmarks regularized matrix_solve_ls on CPU and, if present, on GPU."""
  gpu_available = test_lib.is_gpu_available(True)
  l2_penalty = 1.0
  for matrix_shape in self.matrix_shapes:
    for num_rhs in (1, 2, matrix_shape[-1]):
      # The GPU run is skipped for batched shapes whose leading dim is >= 513.
      gpu_enabled = gpu_available and (
          len(matrix_shape) < 3 or matrix_shape[0] < 513)
      targets = (
          ("/cpu:0", True,
           "matrix_solve_ls_cpu_shape_{matrix_shape}_num_rhs_{num_rhs}"),
          ("/gpu:0", gpu_enabled,
           "matrix_solve_ls_gpu_shape_{matrix_shape}_num_rhs_{num_rhs}"),
      )
      for device_name, enabled, name_template in targets:
        if not enabled:
          continue
        with ops.Graph().as_default(), \
            session.Session(config=benchmark.benchmark_config()) as sess, \
            ops.device(device_name):
          matrix, rhs = _GenerateTestData(matrix_shape, num_rhs)
          solution = linalg_ops.matrix_solve_ls(matrix, rhs, l2_penalty)
          self.evaluate(variables.global_variables_initializer())
          benchmark_name = name_template.format(
              matrix_shape=matrix_shape, num_rhs=num_rhs)
          self.run_op_benchmark(
              sess,
              control_flow_ops.group(solution),
              min_iters=25,
              store_memory_usage=False,
              name=benchmark_name)
def testWrongDimensions(self):
  """matrix_solve_ls must reject operands whose row counts differ."""
  with self.session():
    # Two rows on the left, a single row on the right.
    two_row_matrix = constant_op.constant([[1., 0.], [0., 1.]])
    one_row_rhs = constant_op.constant([[1., 0.]])
    # Either a ValueError or an InvalidArgumentError is accepted.
    accepted_errors = (ValueError, errors_impl.InvalidArgumentError)
    self.assertRaises(accepted_errors, linalg_ops.matrix_solve_ls,
                      two_row_matrix, one_row_rhs)
def benchmarkMatrixSolveLsOp(self):
  """Benchmarks regularized matrix_solve_ls on CPU and, if present, on GPU."""
  gpu_available = test_lib.is_gpu_available(True)
  l2_penalty = 1.0
  for matrix_shape in self.matrix_shapes:
    for num_rhs in (1, 2, matrix_shape[-1]):
      # The GPU run is skipped for batched shapes whose leading dim is >= 513.
      gpu_enabled = gpu_available and (
          len(matrix_shape) < 3 or matrix_shape[0] < 513)
      targets = (
          ("/cpu:0", True,
           "matrix_solve_ls_cpu_shape_{matrix_shape}_num_rhs_{num_rhs}"),
          ("/gpu:0", gpu_enabled,
           "matrix_solve_ls_gpu_shape_{matrix_shape}_num_rhs_{num_rhs}"),
      )
      for device_name, enabled, name_template in targets:
        if not enabled:
          continue
        with ops.Graph().as_default(), \
            session.Session(config=benchmark.benchmark_config()) as sess, \
            ops.device(device_name):
          matrix, rhs = _GenerateTestData(matrix_shape, num_rhs)
          solution = linalg_ops.matrix_solve_ls(matrix, rhs, l2_penalty)
          variables.global_variables_initializer().run()
          benchmark_name = name_template.format(
              matrix_shape=matrix_shape, num_rhs=num_rhs)
          self.run_op_benchmark(
              sess,
              control_flow_ops.group(solution),
              min_iters=25,
              store_memory_usage=False,
              name=benchmark_name)
def _verifySolve(self,
                 x,
                 y,
                 dtype,
                 use_placeholder,
                 fast,
                 l2_regularizer,
                 batch_shape=()):
  """Checks matrix_solve_ls against a NumPy reference solution.

  Args:
    x: NumPy array holding a single coefficient matrix.
    y: NumPy array holding the matching right-hand sides.
    dtype: NumPy dtype both inputs are cast to before solving.
    use_placeholder: If true, the inputs are fed through placeholders
      instead of being passed to the op directly.
    fast: Forwarded to matrix_solve_ls; also used as `use_gpu` for the
      session.
    l2_regularizer: Forwarded to matrix_solve_ls and to the NumPy reference.
    batch_shape: Tuple of leading batch dimensions. When non-empty, the
      single system and its reference answer are tiled to that batch shape.
  """
  if not fast and l2_regularizer != 0:
    # The slow path does not support regularization.
    return
  maxdim = np.max(x.shape)
  if dtype == np.float32 or dtype == np.complex64:
    tol = maxdim * 5e-4
  else:
    tol = maxdim * 5e-7
  a = x.astype(dtype)
  b = y.astype(dtype)
  if dtype in [np.complex64, np.complex128]:
    a.imag = a.real
    b.imag = b.real
  # numpy.linalg.lstsq does not support batching, so we just solve a single
  # system and replicate the solution and residual norm.
  np_ans = _SolveWithNumpy(x, y, l2_regularizer=l2_regularizer)
  np_r = np.dot(np.conj(a.T), b - np.dot(a, np_ans))
  np_r_norm = np.sqrt(np.sum(np.conj(np_r) * np_r))
  # Compare by value: `is not ()` is an identity test against a literal,
  # which only works by CPython accident and warns on Python 3.8+.
  if batch_shape != ():
    a = np.tile(a, batch_shape + (1, 1))
    b = np.tile(b, batch_shape + (1, 1))
    np_ans = np.tile(np_ans, batch_shape + (1, 1))
    np_r_norm = np.tile(np_r_norm, batch_shape)
  with self.cached_session(use_gpu=fast) as sess:
    if use_placeholder:
      a_ph = array_ops.placeholder(dtypes.as_dtype(dtype))
      b_ph = array_ops.placeholder(dtypes.as_dtype(dtype))
      feed_dict = {a_ph: a, b_ph: b}
      tf_ans = linalg_ops.matrix_solve_ls(
          a_ph, b_ph, fast=fast, l2_regularizer=l2_regularizer)
    else:
      tf_ans = linalg_ops.matrix_solve_ls(
          a, b, fast=fast, l2_regularizer=l2_regularizer)
      feed_dict = {}
      self.assertEqual(np_ans.shape, tf_ans.get_shape())
    if l2_regularizer == 0:
      # The least squares solution should satisfy A^H * (b - A*x) = 0.
      tf_r = b - math_ops.matmul(a, tf_ans)
      tf_r = math_ops.matmul(a, tf_r, adjoint_a=True)
      tf_r_norm = linalg_ops.norm(tf_r, ord="fro", axis=[-2, -1])
      tf_ans_val, tf_r_norm_val = sess.run([tf_ans, tf_r_norm],
                                           feed_dict=feed_dict)
      self.assertAllClose(np_r_norm, tf_r_norm_val, atol=tol, rtol=tol)
    else:
      tf_ans_val = sess.run(tf_ans, feed_dict=feed_dict)

  self.assertEqual(np_ans.shape, tf_ans_val.shape)
  self.assertAllClose(np_ans, tf_ans_val, atol=2 * tol, rtol=2 * tol)
def _verifySolve(self,
                 x,
                 y,
                 dtype,
                 use_placeholder,
                 fast,
                 l2_regularizer,
                 batch_shape=()):
  """Checks matrix_solve_ls against a NumPy reference solution.

  Args:
    x: NumPy array holding a single coefficient matrix.
    y: NumPy array holding the matching right-hand sides.
    dtype: NumPy dtype both inputs are cast to before solving.
    use_placeholder: If true, the inputs are fed through placeholders
      instead of being passed to the op directly.
    fast: Forwarded to matrix_solve_ls; also used as `use_gpu` for the
      session.
    l2_regularizer: Forwarded to matrix_solve_ls and to the NumPy reference.
    batch_shape: Tuple of leading batch dimensions. When non-empty, the
      single system and its reference answer are tiled to that batch shape.
  """
  if not fast and l2_regularizer != 0:
    # The slow path does not support regularization.
    return
  maxdim = np.max(x.shape)
  if dtype == np.float32 or dtype == np.complex64:
    tol = maxdim * 5e-4
  else:
    tol = maxdim * 5e-7
  a = x.astype(dtype)
  b = y.astype(dtype)
  if dtype in [np.complex64, np.complex128]:
    a.imag = a.real
    b.imag = b.real
  # numpy.linalg.lstsq does not support batching, so we just solve a single
  # system and replicate the solution and residual norm.
  np_ans = _SolveWithNumpy(x, y, l2_regularizer=l2_regularizer)
  np_r = np.dot(np.conj(a.T), b - np.dot(a, np_ans))
  np_r_norm = np.sqrt(np.sum(np.conj(np_r) * np_r))
  # Compare by value: `is not ()` is an identity test against a literal,
  # which only works by CPython accident and warns on Python 3.8+.
  if batch_shape != ():
    a = np.tile(a, batch_shape + (1, 1))
    b = np.tile(b, batch_shape + (1, 1))
    np_ans = np.tile(np_ans, batch_shape + (1, 1))
    np_r_norm = np.tile(np_r_norm, batch_shape)
  with self.cached_session(use_gpu=fast) as sess:
    if use_placeholder:
      a_ph = array_ops.placeholder(dtypes.as_dtype(dtype))
      b_ph = array_ops.placeholder(dtypes.as_dtype(dtype))
      feed_dict = {a_ph: a, b_ph: b}
      tf_ans = linalg_ops.matrix_solve_ls(
          a_ph, b_ph, fast=fast, l2_regularizer=l2_regularizer)
    else:
      tf_ans = linalg_ops.matrix_solve_ls(
          a, b, fast=fast, l2_regularizer=l2_regularizer)
      feed_dict = {}
      self.assertEqual(np_ans.shape, tf_ans.get_shape())
    if l2_regularizer == 0:
      # The least squares solution should satisfy A^H * (b - A*x) = 0.
      tf_r = b - math_ops.matmul(a, tf_ans)
      tf_r = math_ops.matmul(a, tf_r, adjoint_a=True)
      tf_r_norm = linalg_ops.norm(tf_r, ord="fro", axis=[-2, -1])
      tf_ans_val, tf_r_norm_val = sess.run(
          [tf_ans, tf_r_norm], feed_dict=feed_dict)
      self.assertAllClose(np_r_norm, tf_r_norm_val, atol=tol, rtol=tol)
    else:
      tf_ans_val = sess.run(tf_ans, feed_dict=feed_dict)

  self.assertEqual(np_ans.shape, tf_ans_val.shape)
  self.assertAllClose(np_ans, tf_ans_val, atol=2 * tol, rtol=2 * tol)
def testEmpty(self):
  """Empty operands must produce empty results of the expected shape."""
  full = np.array([[1., 2.], [3., 4.], [5., 6.]])
  empty0 = np.empty([3, 0])
  empty1 = np.empty([0, 2])
  # (matrix, rhs, expected solution shape)
  cases = (
      (empty0, empty0, (0, 0)),
      (empty0, full, (0, 2)),
      (full, empty0, (2, 0)),
      (empty1, empty1, (2, 2)),
  )
  for fast in (True, False):
    with self.cached_session(use_gpu=True):
      for matrix, rhs, expected_shape in cases:
        solution = linalg_ops.matrix_solve_ls(matrix, rhs, fast=fast).eval()
        self.assertEqual(solution.shape, expected_shape)
def testEmpty(self):
  """Empty operands must produce empty results of the expected shape."""
  full = np.array([[1., 2.], [3., 4.], [5., 6.]])
  empty0 = np.empty([3, 0])
  empty1 = np.empty([0, 2])
  # (matrix, rhs, expected solution shape)
  cases = (
      (empty0, empty0, (0, 0)),
      (empty0, full, (0, 2)),
      (full, empty0, (2, 0)),
      (empty1, empty1, (2, 2)),
  )
  for fast in (True, False):
    with self.test_session(use_gpu=True):
      for matrix, rhs, expected_shape in cases:
        solution = linalg_ops.matrix_solve_ls(matrix, rhs, fast=fast).eval()
        self.assertEqual(solution.shape, expected_shape)
def _verifySolveBatch(self, x, y):
  """Compares batched matrix_solve_ls with per-matrix numpy.linalg.lstsq.

  numpy.linalg.lstsq has no batch mode, so the inputs are tiled to a fixed
  2x3 batch and the reference answer is filled in one system at a time.
  """
  batch_reps = [2, 3, 1, 1]
  for np_type in (np.float32, np.float64):
    a = np.tile(x.astype(np_type), batch_reps)
    b = np.tile(y.astype(np_type), batch_reps)
    np_ans = np.empty([2, 3, a.shape[-1], b.shape[-1]])
    for batch_idx in np.ndindex(2, 3):
      np_ans[batch_idx] = np.linalg.lstsq(a[batch_idx], b[batch_idx])[0]
    for fast in (True, False):
      with self.test_session():
        tf_ans = linalg_ops.matrix_solve_ls(a, b, fast=fast).eval()
      self.assertEqual(np_ans.shape, tf_ans.shape)
      # Check residual norm.
      tf_residual = b - BatchMatMul(a, tf_ans)
      np_residual = b - BatchMatMul(a, np_ans)
      tf_r_norm = np.sum(tf_residual * tf_residual)
      np_r_norm = np.sum(np_residual * np_residual)
      self.assertAllClose(np_r_norm, tf_r_norm)
      # Check solution.
      # The underdetermined case is skipped on the slow path because Eigen
      # does not return a minimum norm solution there.
      # TODO(rmlarsen): Enable this check for all paths if/when we fix
      # Eigen's solver.
      underdetermined = a.shape[-2] < a.shape[-1]
      if fast or not underdetermined:
        self.assertAllClose(np_ans, tf_ans, atol=1e-5, rtol=1e-5)
def _verifySolve(self, x, y):
  """Compares matrix_solve_ls with numpy.linalg.lstsq for four dtypes.

  For the complex dtypes the imaginary part of each input is set equal to
  its real part before solving.
  """
  complex_types = (np.complex64, np.complex128)
  single_precision = (np.float32, np.complex64)
  for np_type in (np.float32, np.float64) + complex_types:
    a = x.astype(np_type)
    b = y.astype(np_type)
    if np_type in complex_types:
      a.imag = a.real
      b.imag = b.real
    np_ans = np.linalg.lstsq(a, b)[0]
    for fast in (True, False):
      with self.test_session():
        tf_ans = linalg_ops.matrix_solve_ls(a, b, fast=fast)
        ans = tf_ans.eval()
      self.assertEqual(np_ans.shape, tf_ans.get_shape())
      self.assertEqual(np_ans.shape, ans.shape)

      # Check residual norm.
      tf_residual = b - BatchMatMul(a, ans)
      np_residual = b - BatchMatMul(a, np_ans)
      tf_r_norm = np.sum(tf_residual * tf_residual)
      np_r_norm = np.sum(np_residual * np_residual)
      self.assertAllClose(np_r_norm, tf_r_norm)

      # Check solution, with a looser tolerance for single precision.
      tol = 5e-5 if np_type in single_precision else 1e-12
      self.assertAllClose(np_ans, ans, atol=tol, rtol=tol)
def testEmpty(self):
  """Empty operands must produce empty results of the expected shape."""
  full = np.array([[1., 2.], [3., 4.], [5., 6.]])
  empty0 = np.empty([3, 0])
  empty1 = np.empty([0, 2])
  # (matrix, rhs, expected solution shape)
  cases = (
      (empty0, empty0, (0, 0)),
      (empty0, full, (0, 2)),
      (full, empty0, (2, 0)),
      (empty1, empty1, (2, 2)),
  )
  for fast in (True, False):
    for matrix, rhs, expected_shape in cases:
      solution = self.evaluate(
          linalg_ops.matrix_solve_ls(matrix, rhs, fast=fast))
      self.assertEqual(solution.shape, expected_shape)
def testBatchResultSize(self):
  """Static output shapes of matrix_solve and matrix_solve_ls on a batch."""
  # 3x3x3 matrices, 3x3x1 right-hand sides.
  matrix = np.tile(np.arange(1., 10.), 3).reshape(3, 3, 3)
  rhs = np.tile([1., 2., 3.], 3).reshape(3, 3, 1)
  expected_shape = [3, 3, 1]
  answer = linalg_ops.matrix_solve(matrix, rhs)
  ls_answer = linalg_ops.matrix_solve_ls(matrix, rhs)
  self.assertEqual(ls_answer.get_shape(), expected_shape)
  self.assertEqual(answer.get_shape(), expected_shape)
def testBatchResultSize(self):
  """Static output shapes of matrix_solve and matrix_solve_ls on a batch."""
  # 3x3x3 matrices (three identity matrices), 3x3x1 right-hand sides.
  matrix = np.tile(np.eye(3), (3, 1, 1))
  rhs = np.tile([1., 2., 3.], 3).reshape(3, 3, 1)  # pylint: disable=too-many-function-args
  expected_shape = [3, 3, 1]
  answer = linalg_ops.matrix_solve(matrix, rhs)
  ls_answer = linalg_ops.matrix_solve_ls(matrix, rhs)
  self.assertEqual(ls_answer.get_shape(), expected_shape)
  self.assertEqual(answer.get_shape(), expected_shape)
def _full(op, grad):
  """Builds gradients for the first two inputs of `op` from `grad`.

  The gradient for the second input is the slow-path least-squares solve of
  the conjugate-transposed first input against `grad`; the gradient for the
  first input is the negated product of that result with the adjoint of the
  op output.

  Returns:
    A `(grad_a, grad_b, None)` triple.
  """
  matrix = op.inputs[0]
  solution = op.outputs[0]
  matrix_adjoint = math_ops.conj(array_ops.matrix_transpose(matrix))
  rhs_grad = linalg_ops.matrix_solve_ls(matrix_adjoint, grad, fast=False)
  matrix_grad = -math_ops.matmul(rhs_grad, solution, adjoint_b=True)
  return matrix_grad, rhs_grad, None
def _verifyRegularized(self, x, y, l2_regularizer):
  """Checks regularized fast-path solves against the NumPy reference.

  Each float dtype is tested twice: once on the single system and once on a
  2x3 batch made by tiling that system.
  """
  # `None` means "use the single matrix as is".
  tilings = (None, [2, 3, 1, 1])
  for np_type in (np.float32, np.float64):
    for reps in tilings:
      a = x.astype(np_type)
      b = y.astype(np_type)
      if reps is not None:
        a = np.tile(a, reps)
        b = np.tile(b, reps)
      np_ans = BatchRegularizedLeastSquares(a, b, l2_regularizer)
      with self.test_session():
        tf_ans = linalg_ops.matrix_solve_ls(
            a, b, l2_regularizer=l2_regularizer, fast=True).eval()
      self.assertAllClose(np_ans, tf_ans, atol=1e-5, rtol=1e-5)
# Alternative shape that consistently produces a valid numerical Jacobian shape = extra + (size + 1, size + 1) name = '%s_%s' % (dtype.__name__, '_'.join(map(str, shape))) _AddTest( MatrixUnaryFunctorGradientTest, 'MatrixSquareRootGradient', name, _GetMatrixUnaryFunctorGradientTest(linalg_ops.matrix_square_root, dtype, shape)) # Tests for gradients of matrix_solve_ls for dtype in np.float32, np.float64: for rows in 2, 5, 10: for cols in 2, 5, 10: for l2_regularization in 1e-6, 0.001, 1.0: shape = (rows, cols) name = '%s_%s_%s' % (dtype.__name__, '_'.join(map(str, shape)), l2_regularization) float32_tol_fudge = 5.1 if l2_regularization == 1e-6 else 4.0 _AddTest( MatrixBinaryFunctorGradientTest, 'MatrixSolveLsGradient', name, # pylint: disable=long-lambda,g-long-lambda _GetMatrixBinaryFunctorGradientTest( (lambda a, b, l=l2_regularization: linalg_ops.matrix_solve_ls(a, b, l)), dtype, shape, float32_tol_fudge)) test_lib.main()
# The numerical Jacobian is consistently invalid for these four shapes # because the matrix square root of the perturbed input doesn't exist if shape in {(2, 5, 5), (3, 5, 5), (3, 10, 10), (3, 2, 5, 5)}: # Alternative shape that consistently produces a valid numerical Jacobian shape = extra + (size + 1, size + 1) name = '%s_%s' % (dtype.__name__, '_'.join(map(str, shape))) _AddTest( MatrixUnaryFunctorGradientTest, 'MatrixSquareRootGradient', name, _GetMatrixUnaryFunctorGradientTest(linalg_ops.matrix_square_root, dtype, shape)) # Tests for gradients of matrix_solve_ls for dtype in np.float32, np.float64: for rows in 2, 5, 10: for cols in 2, 5, 10: for l2_regularization in 1e-6, 0.001, 1.0: shape = (rows, cols) name = '%s_%s_%s' % (dtype.__name__, '_'.join(map( str, shape)), l2_regularization) float32_tol_fudge = 5.1 if l2_regularization == 1e-6 else 4.0 _AddTest( MatrixBinaryFunctorGradientTest, 'MatrixSolveLsGradient', name, # pylint: disable=long-lambda,g-long-lambda _GetMatrixBinaryFunctorGradientTest( (lambda a, b, l=l2_regularization: linalg_ops.matrix_solve_ls( a, b, l)), dtype, shape, float32_tol_fudge)) test_lib.main()
for extra in [(), (2,), (3,)] + [(3, 2)] * (size < 10): shape = extra + (size, size) name = '%s_%s' % (dtype.__name__, '_'.join(map(str, shape))) _AddTest(MatrixUnaryFunctorGradientTest, 'MatrixInverseGradient', name, _GetMatrixUnaryFunctorGradientTest(linalg_ops.matrix_inverse, dtype, shape)) _AddTest( MatrixUnaryFunctorGradientTest, 'MatrixDeterminantGradient', name, _GetMatrixUnaryFunctorGradientTest(linalg_ops.matrix_determinant, dtype, shape)) # Tests for gradients of matrix_solve_ls for dtype in np.float32, np.float64: for rows in 2, 5, 10: for cols in 2, 5, 10: for l2_regularization in 0.0, 0.001, 1.0: shape = (rows, cols) name = '%s_%s_%s' % (dtype.__name__, '_'.join(map(str, shape)), l2_regularization) _AddTest( MatrixBinaryFunctorGradientTest, 'MatrixSolveLsGradient', name, _GetMatrixBinaryFunctorGradientTest( lambda a, b, l=l2_regularization: linalg_ops.matrix_solve_ls(a, b, l), dtype, shape, float32_tol_fudge=4.0)) test_lib.main()
_GetMatrixUnaryFunctorGradientTest(linalg_ops.matrix_determinant, dtype, shape)) _AddTest( MatrixUnaryFunctorGradientTest, 'LogMatrixDeterminantGradient', name, _GetMatrixUnaryFunctorGradientTest( lambda x: linalg_ops.log_matrix_determinant(x)[1], dtype, shape)) # Tests for gradients of matrix_solve_ls for dtype in np.float32, np.float64: for rows in 2, 5, 10: for cols in 2, 5, 10: for l2_regularization in 1e-6, 0.001, 1.0: shape = (rows, cols) name = '%s_%s_%s' % (dtype.__name__, '_'.join(map(str, shape)), l2_regularization) _AddTest( MatrixBinaryFunctorGradientTest, 'MatrixSolveLsGradient', name, # pylint: disable=long-lambda,g-long-lambda _GetMatrixBinaryFunctorGradientTest( (lambda a, b, l=l2_regularization: linalg_ops.matrix_solve_ls(a, b, l)), dtype, shape, float32_tol_fudge=4.0)) test_lib.main()
def apply_gradients(
        self,
        grads_and_vars,
        worker_id,
        global_step=None,
        name=None,
        collect_cdfs=False,
        matrix_to_solve=None,
        num_batches_per_epoch=None):
    """Apply least-squares-weighted gradients to variables.

    This contains most of the synchronization implementation and also wraps
    the apply_gradients() from the real optimizer. Each gradient is scaled
    by this worker's entry of the least-squares solution of
    `matrix_to_solve * w = 1` before being pushed into its accumulator.

    Args:
      grads_and_vars: List of (gradient, variable) pairs as returned by
        compute_gradients(). A gradient may be None.
      worker_id: Python int index of this worker. It selects the token
        queue, the least-squares weight and the worker device.
      global_step: Variable to increment by one after the variables have
        been updated. Required despite the default.
      name: Unused by this implementation.
      collect_cdfs: If true, aggregation waits for
        `self._replicas_to_aggregate` gradients; otherwise for one.
      matrix_to_solve: Coefficient matrix passed to `matrix_solve_ls`.
        Required despite the default.
      num_batches_per_epoch: Number of rows of the all-ones right-hand side.
        Required despite the default.

    Returns:
      train_op: A Print op on the local step that runs after all of this
        worker's gradient-application ops.

    Raises:
      ValueError: If the grads_and_vars is empty.
      ValueError: If global step is not provided, the staleness cannot be
        checked.
      ValueError: If matrix_to_solve or num_batches_per_epoch is missing.
      ValueError: If a gradient is neither a Tensor nor IndexedSlices.
    """
    if not grads_and_vars:
        raise ValueError("Must supply at least one variable")

    if global_step is None:
        raise ValueError("Global step is required to check staleness")

    # Fail early with a clear message instead of a TypeError deep inside
    # graph construction.
    if matrix_to_solve is None or num_batches_per_epoch is None:
        raise ValueError(
            "matrix_to_solve and num_batches_per_epoch are required to "
            "compute the least-squares gradient weights")

    self._global_step = global_step
    train_ops = []
    aggregated_grad = []
    var_list = []
    # Accumulators aligned one-to-one with grads_and_vars (None where the
    # gradient is None). self._accumulator_list only holds real
    # accumulators, so it cannot be indexed by position in grads_and_vars.
    accum_by_index = []

    self._local_step = variables.Variable(
        initial_value=0,
        trainable=False,
        collections=[ops.GraphKeys.LOCAL_VARIABLES],
        dtype=global_step.dtype.base_dtype,
        name="sync_rep_local_step")
    self.local_step_init_op = state_ops.assign(self._local_step,
                                               global_step._ref())
    chief_init_ops = [self.local_step_init_op]
    self.ready_for_local_init_op = variables.report_uninitialized_variables(
        variables.all_variables())

    # The wait op waits for the current worker to dequeue a token from its
    # respective token queue.
    self._wait_op = self._sync_token_queues[worker_id].dequeue()

    # Replicas have to wait until they can get a token from the token queue
    # BEFORE beginning to compute gradients.
    with ops.device(global_step.device):
        queue_size = self._sync_token_queues[worker_id].size()
        update_local_step_op = state_ops.assign(self._local_step,
                                                global_step._ref())

    # NOTE(review): the nesting of the `with` blocks below was reconstructed
    # from a whitespace-collapsed source - confirm against the original.
    with ops.name_scope(None, self._name):
        # Gradient accum creation
        for grad, var in grads_and_vars:
            var_list.append(var)
            tf.logging.info("Grad " + str(grad) + " assigned to " +
                            str(var.device))
            with ops.device(var.device):
                if grad is None:
                    accum_by_index.append(None)
                    continue
                elif isinstance(grad, ops.Tensor):
                    grad_accum = data_flow_ops.ConditionalAccumulator(
                        grad.dtype,
                        shape=var.get_shape(),
                        shared_name=var.name + "/grad_accum")
                else:
                    if not isinstance(grad, ops.IndexedSlices):
                        raise ValueError("Unknown grad type!")
                    grad_accum = data_flow_ops.SparseConditionalAccumulator(
                        grad.dtype,
                        shape=(),
                        shared_name=var.name + "/grad_accum")
                accum_by_index.append(grad_accum)
                self._accumulator_list.append((grad_accum, var))

        # Phase 1 gradient computation
        with ops.control_dependencies([update_local_step_op]):
            for index, (grad, var) in enumerate(grads_and_vars):
                print_start_op = logging_ops.Print(
                    global_step, [global_step],
                    message="Starting to apply grads for variable %d" %
                    index)
                train_ops.append(print_start_op)
                with ops.device(var.device):
                    work_idx_print = logging_ops.Print(
                        worker_id, [worker_id],
                        message="worker id for comp grad")
                    ps_step_printer0 = logging_ops.Print(
                        global_step, [global_step],
                        message="global step printer0 on ps")
                    train_ops.append(work_idx_print)
                    train_ops.append(ps_step_printer0)

                    # Nothing to weight or accumulate for this variable.
                    # This guard must precede tf.scalar_mul, which cannot
                    # take a None gradient.
                    if grad is None:
                        continue

                    # Least-squares solve; this worker's weight is entry
                    # `worker_id` of the flattened solution. Kept inside
                    # the loop so the ops stay on var.device.
                    ones_rhs = tf.ones([int(num_batches_per_epoch), 1],
                                       tf.float32)
                    ls_solution = linalg_ops.matrix_solve_ls(
                        matrix_to_solve, ones_rhs, fast=False)
                    ls_flat = tf.reshape(ls_solution, [-1])
                    weight = tf.slice(ls_flat, [worker_id], [1])
                    weighted_grad = tf.scalar_mul(weight[0], grad)

                    grad_accum = accum_by_index[index]
                    if isinstance(grad, ops.Tensor):
                        with ops.control_dependencies([print_start_op]):
                            with tf.device(
                                    "job:worker/task:%d" % worker_id):
                                apply_grad_op = grad_accum.apply_grad(
                                    weighted_grad,
                                    local_step=self._local_step._ref())
                                with ops.control_dependencies(
                                        [apply_grad_op]):
                                    finished_print_op = logging_ops.Print(
                                        global_step, [global_step],
                                        message=
                                        "Done applying grads for variable %d"
                                        % index)
                                    train_ops.append(finished_print_op)
                    else:
                        if not isinstance(grad, ops.IndexedSlices):
                            raise ValueError("Unknown grad type!")
                        with ops.control_dependencies([print_start_op]):
                            with tf.device(
                                    "job:worker/task:%d" % worker_id):
                                apply_grad_op = (
                                    grad_accum.apply_indexed_slices_grad(
                                        weighted_grad,
                                        local_step=self._local_step._ref()))
                                with ops.control_dependencies(
                                        [apply_grad_op]):
                                    finished_print_op = logging_ops.Print(
                                        global_step, [global_step],
                                        message=
                                        "Done applying grads for variable %d"
                                        % index)
                                    train_ops.append(finished_print_op)

        # Phase 2 gradient applying
        for index, (grad, var) in enumerate(grads_and_vars):
            with ops.device(var.device):
                grad_accum = accum_by_index[index]
                work_idx_print1 = logging_ops.Print(
                    worker_id, [worker_id],
                    message="worker id for aggregate grad")
                ps_step_printer1 = logging_ops.Print(
                    global_step, [global_step],
                    message="global step printer1 on ps")
                num_replica_aggragate = logging_ops.Print(
                    self._replicas_to_aggregate,
                    [self._replicas_to_aggregate],
                    message="num replica aggregate")
                train_ops.append(work_idx_print1)
                train_ops.append(ps_step_printer1)
                train_ops.append(num_replica_aggragate)

                num_required = (
                    self._replicas_to_aggregate if collect_cdfs else 1)
                if grad is None:
                    aggregated_grad.append(None)
                elif isinstance(grad, ops.Tensor):
                    aggregated_grad.append(
                        grad_accum.take_grad(num_required))
                else:
                    # Sparse accumulators must use take_indexed_slices_grad
                    # in both modes: their take_grad returns an
                    # (indices, values, shape) tuple, not IndexedSlices.
                    aggregated_grad.append(
                        grad_accum.take_indexed_slices_grad(num_required))

        aggregated_grads_and_vars = zip(aggregated_grad, var_list)

        # Some debug operations
        self.print_sizes = logging_ops.Print(
            global_step, [
                self._sync_token_queues[i].size()
                for i in range(self._total_num_replicas)
            ],
            message="queue sizes")
        self.print_accum_sizes = logging_ops.Print(
            self._local_step,
            [x[0].num_accumulated() for x in self._accumulator_list] +
            [worker_id],
            message="Accum sizes")
        self.print_local_step = logging_ops.Print(
            self._local_step,
            [self._local_step._ref(), global_step._ref()],
            message="local vs global step")

        # sync_op will be assigned to the same device as the global step.
        with ops.device(global_step.device), ops.name_scope(""):
            with ops.control_dependencies([self.print_accum_sizes]):
                update_op = self._opt.apply_gradients(
                    aggregated_grads_and_vars, global_step)
                self._update_op = update_op
                with ops.control_dependencies([update_op]):
                    # After the update, hand every worker a token carrying
                    # the new global step.
                    sync_op = []
                    for cur_worker_id in range(self._total_num_replicas):
                        sync_op.append(
                            self._sync_token_queues[cur_worker_id].enqueue(
                                global_step))
                    sync_op = control_flow_ops.group(*(sync_op))

            # dummy_queue is passed to the queue runner. Don't use the real
            # queues because the queue runner doesn't automatically reopen
            # it once it closed queues in PS devices.
            dummy_queue = (data_flow_ops.FIFOQueue(
                1, types_pb2.DT_INT32, shapes=(),
                shared_name="dummy_queue"))
            self._chief_queue_runner = queue_runner.QueueRunner(
                dummy_queue, [sync_op])

        with ops.device(global_step.device), ops.name_scope(""):
            with ops.control_dependencies(train_ops):
                # Worker finished applying gradients.
                train_op = logging_ops.Print(
                    self._local_step._ref(), [
                        x[0].num_accumulated()
                        for x in self._accumulator_list
                    ] + [worker_id],
                    message="Finished worker updates",
                    name="FinishedWorkerUpdatesPrint")

        for accum, var in self._accumulator_list:
            with ops.device(var.device):
                chief_init_ops.append(
                    accum.set_global_step(global_step, name="SetGlobalStep"))
        self.chief_init_op = control_flow_ops.group(*(chief_init_ops))
        self._gradients_applied = True
        return train_op