Esempio n. 1
0
	def train(self, dataset_flow):
		with tf.variable_scope(tf.get_variable_scope()):
			#global_step= tf.Variable(initial_value=tf.constant(0), trainable=False, collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES], name='global_step') ## this is ok, I will use the format global_step as follow #
			global_step = tf.train.get_or_create_global_step()
			#print 'tf.global_variables()', tf.global_variables()
			#print 'tf.train.get_global_step()', tf.train.get_global_step()
			in_x, in_y = dataset_flow
			lr_decay   = tf.train.exponential_decay(learning_rate=0.008, global_step=global_step, decay_steps=1000, decay_rate=0.99, staircase = True)
			optimizer  = tf.train.AdamOptimizer(learning_rate = lr_decay)
			x_list = tf.split(in_x, num_or_size_splits=4, axis=0)
			y_list = tf.split(in_y, num_or_size_splits=4, axis=0)
			tower_grads = []
			tower_logits= []
			for i in xrange(4):
				with tf.device('/gpu:' +str(i)):
					with tf.name_scope('name_scope-'+str(i)) as scope:
						logits = self.logits(x_list[i])
						tf.losses.softmax_cross_entropy(onehot_labels=y_list[i], logits=logits)
						update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
						print 'update_ops', update_ops
						with tf.control_dependencies(update_ops): 
							losses = tf.get_collection(tf.GraphKeys.LOSSES, scope)
							print 'gpu:', i, 'losses is :', losses 
							total_loss = tf.add_n(losses, name='total_loss')
							grads  = optimizer.compute_gradients(total_loss)
							tf.summary.scalar('loss', total_loss) ## 这里对每个name_scope下的loss都做了记录
						tower_grads.append(grads)
						tower_logits.append(logits)
			grads = average_gradients(tower_grads)
			train_op = optimizer.apply_gradients(grads, global_step=global_step) ## 每次执行到这里,会对变量global_step自增1 #
			
			merged_summary = tf.summary.merge_all()
			saver  = tf.train.Saver()
			config = tf.ConfigProto(gpu_options = tf.GPUOptions(allow_growth=True), device_count = {'GPU':4}, allow_soft_placement = True)
			with tf.Session(config=config) as sess:
				writer = tf.summary.FileWriter(self.model_dir, sess.graph)
				sess.run(tf.global_variables_initializer())
				sess.run(tf.tables_initializer())
				start_time = time.time()
				while True:
					try:
						_, loss_ = sess.run([train_op, total_loss])
					except tf.errors.OutOfRangeError:
						print 'train end'
						break
					cur_step = tf.train.global_step(sess, global_step)
					if cur_step % 100== 0: start_time = time.time()
					if cur_step % 100== 1:
						accu, _      = sess.run(self.eval(in_x, in_y))
						summary_res  = sess.run(merged_summary)
						writer.add_summary(summary_res, cur_step)
						duration     = time.time() - start_time
						print 'iter:\t', cur_step, '\tloss:\t', loss_, '\taccuracy:\t', accu, '\ttime cost(sec):\t', duration
					if cur_step % 10000==1:
						print 'save model into path:', self.model_dir, cur_step
						saver.save(sess, self.model_dir+'/ckpt', global_step=cur_step)
Esempio n. 2
0
	def train(self, dataset_flow):
		with tf.variable_scope(tf.get_variable_scope()):
			ema = tf.train.ExponentialMovingAverage(decay=0.999)
			in_x, in_y = dataset_flow
			optimizer  = tf.train.AdamOptimizer(learning_rate = 0.008)
			x_list = tf.split(in_x, num_or_size_splits=4, axis=0)
			y_list = tf.split(in_y, num_or_size_splits=4, axis=0)
			tower_grads = []
			tower_logits= []
			for i in xrange(4):
				with tf.device('/gpu:' +str(i)):
					with tf.name_scope('name_scope-'+str(i)) as scope:
						logits = self.logits(x_list[i])
						tf.losses.softmax_cross_entropy(onehot_labels=y_list[i], logits=logits)
						update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS, scope)
						with tf.control_dependencies(update_ops): 
							losses = tf.get_collection(tf.GraphKeys.LOSSES, scope)
							print 'gpu:', i, 'losses is :', losses 
							total_loss = tf.add_n(losses, name='total_loss')
							grads  = optimizer.compute_gradients(total_loss)
							tf.summary.scalar('loss', total_loss) ## 这里对每个name_scope下的loss都做了记录
						tower_grads.append(grads)
						tower_logits.append(logits)
			grads = average_gradients(tower_grads)
			train_op = optimizer.apply_gradients(grads)
			train_avg  = ema.apply(tf.trainable_variables()) ## 对指定的变量做滑动平均:ema.apply([self.w1,self.w2]) ##
			train_op_move = tf.group(train_op, train_avg) ## 将两组操作绑定到一起, 所有动作都是完成的,当整个op完成时 

			merged_summary = tf.summary.merge_all()
			saver  = tf.train.Saver(tf.global_variables()) ## 这里保存要保存全部变量 ##才可以将所有的包括move_avg ##
			#saver = tf.train.Saver(em.variables_to_restore()) ## 这样好像也是可以的 ##
			config = tf.ConfigProto(gpu_options = tf.GPUOptions(allow_growth=True), device_count = {'GPU':4}, allow_soft_placement = True)
			with tf.Session(config=config) as sess:
				writer = tf.summary.FileWriter(self.model_dir, sess.graph)
				sess.run(tf.global_variables_initializer())
				sess.run(tf.tables_initializer())
				iter_num = 0
				while True:
					try:
						_, loss_ = sess.run([train_op_move, total_loss]) ## 这里需要是对gropu的操作优化 ##
					except tf.errors.OutOfRangeError:
						print 'train end'
						break
					if iter_num == 0: start_time = time.time()
					if iter_num % 100== 1:
						accu, _      = sess.run(self.eval(in_x, in_y))
						summary_res  = sess.run(merged_summary)
						writer.add_summary(summary_res, iter_num)
						duration     = time.time() - start_time
						start_time   = time.time()
						print 'iter:\t', iter_num, '\tloss:\t', loss_, '\taccuracy:\t', accu, '\ttime cost(sec):\t', duration
					if iter_num % 10000==1:
						print 'save model into path:', self.model_dir, iter_num
						saver.save(sess, self.model_dir+'/ckpt', global_step=iter_num)
					iter_num += 1
Esempio n. 3
0
def mode_mine(features, labels, mode, params):
    net_used = net_model()
    x = tf.feature_column.input_layer(features=features,
                                      feature_columns=params['columns'])
    #print '-'*10, '#debug in mode_mine as input-x:', x.get_shape()
    #print '-'*10, '#debug in mode_mine as input-y:', labels.get_shape()
    ## predict mode ##
    if mode == tf.estimator.ModeKeys.PREDICT:
        logits = net_used.output(x)
        #prob        = tf.nn.softmax(logits, dim=1)
        prob = tf.nn.sigmoid(logits)
        #prob_class  = tf.argmax(prob, axis=1)
        prob_class = tf.cast(prob > 0.5, tf.int32)
        predictions_op = {'prob': prob, 'prob_class': prob_class}
        return tf.estimator.EstimatorSpec(mode, predictions=predictions_op)
    ## train and eval mode using GPU ##
    #print '#debug current dataset batch_size:', x.get_shape().as_list(), x
    # tf.1.3无法处理最后的batch不等于batch_size的情况,这里tf.split会报错,那怎么办呢?
    # tf.1.3+可以用dataset.apply(tf.contrib.data.batch_and_drop_remainder(batch_size)) 来做丢弃
    x_list = tf.split(x, num_or_size_splits=params['gpu_num'], axis=0)
    y_list = tf.split(labels, num_or_size_splits=params['gpu_num'], axis=0)
    tower_grads = []
    tower_logits = []
    optimizer = tf.train.AdamOptimizer(learning_rate=0.002)
    print '#debug, tf.get_variable_scope():', tf.get_variable_scope()
    print '#debug, tf.variable_scope(tf.get_variable_scope()):', tf.variable_scope(
        tf.get_variable_scope())
    with tf.variable_scope(tf.get_variable_scope()):
        for i in xrange(params['gpu_num']):
            with tf.device('/gpu:' + str(i)):
                ## 指定在不同的GPU上,设置对应的操作 ##
                with tf.name_scope('classification-' + str(i)) as scope:
                    # model and loss
                    # name_scope是用来做什么的呢?
                    #print '#debug, net_used.w1.name: ', net_used.w1.name
                    #print '#debug, tf.get_collection(TRAINABLE_VARIABLES)', tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
                    #print '#debug, tf.get_variable_scope():', tf.get_variable_scope()
                    ## 发现collection里面的可训练变量只有 layer-1/w1:0 这样的模型参数 ##
                    logits = net_used.output(x_list[i])
                    #print '#debug, here scope:', scope, logits
                    loss = tf.reduce_mean(
                        tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.cast(
                            y_list[i], tf.float32),
                                                                logits=logits))
                    tf.add_to_collection(tf.GraphKeys.LOSSES, loss)
                    #tf.losses.softmax_cross_entropy(onehot_labels=y_list[i], logits=logits)
                    ## 疑问,上面这句话为什么要单独执行 ?
                    ## tf.losses.softmax_cross_entrypy will do what ?
                    ## 1) create a cross-entropy loss using tf.nn.softmax_cross_entropy_with_logits;
                    ## 2) notice: loss_collection=tf.GraphKeys.LOSSES 这个参数
                    ##    表示:collection to which the loss will be added
                    ##    将这个GPU上的计算loss结果add到tf.GraphKeys.LOSSES(也即losses) 里.
                    ##    其实如果有其他自定义的loss,也可以通过tf.losses.add_loss添加到collection的损失里.
                    ##    背后都是用ops.add_to_collection(GraphKeys.LOSSES, loss)
                    ## 3) 在来看下tf.Graph.add_to_collection(name, value)是干什么的?
                    ##    store the value in the collection given by name
                    ##    查看代码,最重要的一句: self._collections[name].append(value)
                    ##    于是我们知道了collections是一个map,key=tf.GraphKeys, value是通过add_to_collection追加的.
                    ## 难道是是为了在这个GPU下执行计算一遍loss,方便后面采集loss ##
                    ## tf.losses.softmax_cross_entropy 与 tf.nn.softmax_cross_entropy_with_logits的区别
                    ## 前者处理onehot_labels 后者更适合处理2分类 ##
                    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                                   scope)
                    ## 这句话,做了什么? 从collections的map里,取出了name=Update_ops in scope下的[value]
                    ## 采集 需要update的操作 ## update_ops in combination with reuse varscope
                    ## explain the tf.GraphKeys.UPDATE_OPS
                    ## Custom functions to update some variables can be added to UPDATE_OPS
                    ## and, separately run at each iteration using sess.run(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
                    ## In this case, these variables are set trainable=False to avoid being updated by gradient descent.
                    with tf.control_dependencies(update_ops):  ## 这里是做什么用的 ?##
                        losses = tf.get_collection(tf.GraphKeys.LOSSES, scope)
                        print 'gpu:', i, 'losses', losses
                        total_loss = tf.add_n(losses, name='total_loss')
                        ## tf.control_depencies表示 with段内的操作是在updates_op执行之后,再执行的 ## 控制了图的执行顺序 ##
                        ## notice: 这里get_collection都使用了 scope来过滤,保证 操作和结果 都是本scope-GPU内的.
                        ## 在本GPU上,根据切片输入,计算了logits,并将loss追加到collection里面;然后执行有update var的操作,再获取整个GPU上计算的loss,合起来,作为本GPU本切片的loss.
                        ## 如何将对应的正则loss也提取出来 ? ##
                        ## 如果前面在变量定义时,已经用regularizer设定了,则会自动被收集到colleciton的key=regularizer_loss里面
                    ## 疑惑,tf.GraphKeys.LOSSES里的值会在下次计算的时候更新么?还是继续追加呢,感觉有清理机制 ##
                    ##reuse var
                    tf.get_variable_scope().reuse_variables(
                    )  ## 将当前变量空间,设置为其中变量可以重复使用 ##
                    ## name_space 并不会影响 variable_space ##
                    ## 当我的变量在定义时,都在variable_scope(reuse=tf.AUTO_REUSE)设置下,这里岂不是就可以不用了?是的 ##
                    ## 这里共享的到底是谁?哪些变量?用tf.get_variable_scope来查看,发现是4个GPU下都是一样的variable_scope ##
                    ## 4个GPU在同一个variable_scope下,是因为最开始的with tf.variable_scope(tf.get_variable_scope)
                    ## 第一个GPU使用的trainable_variable,会由于 reuse=True的设置,允许后面GPU在使用tf.get_variable时,同名的变量是一样的/共享的.
                    ## 比如,这个GPU下使用了变量layer-1/w1:0来计算loss,那么后面的GPU在使用变量layer-1/w1:0来计算loss时,是用同一个变量layer-1/w1:0
                    ## 疑惑,为什么要将当前变量空间搞成变量可共享呢?都有什么变量呢? 感觉是将w1,w2,b1,b2共享 ##
                    print '-' * 10, '#debug, tf.get_variable_scope', tf.get_variable_scope(
                    ), tf.get_variable_scope().name
                    # grad compute
                    grads = optimizer.compute_gradients(total_loss)
                    #print '-'*10, '#debug, compute_gradients', grads
                    ## this is the first part of minimize() ##
                    ## optimizer.compute_gradients(loss, var_list=None, gate_gradients=GATE_OP, aggregation_method=None, colocate_gradients_with_ops=False, grad_loss=None)
                    ## what will do this operation ? ##
                    ## compute gradients of loss for the variables in var_list(default=tf.GraphKeys.TRAINABLE_VARIABLES)
                    ## return a list of (gradient, variable) pair
                    tower_grads.append(grads)
                    tower_logits.append(logits)
    # we must calculate the mean of each gradient, notice: this is synchronization across all tower #
    grads = average_gradients(tower_grads)
    ## apply the gradients to adjust the sared variables.
    train_op = optimizer.apply_gradients(
        grads, global_step=tf.train.get_global_step())
    #prob       = tf.nn.softmax(tf.concat(tower_logits, 0), dim=1)
    prob = tf.nn.sigmoid(tf.concat(tower_logits, 0))
    #prob_class = tf.argmax(prob, axis=1)
    prob_class = tf.cast(prob > 0.5, tf.int32)
    accuracy = tf.metrics.accuracy(labels=labels, predictions=prob_class)
    ## train mode and eval mode ##
    if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(
            mode,
            loss=total_loss,
            train_op=train_op,
            eval_metric_ops={'accuracy': accuracy})
Esempio n. 4
0
                print 'gpu:======', i, 'losses is :', losses
                regular_losses = tf.get_collection(
                    tf.GraphKeys.REGULARIZATION_LOSSES
                )  ## 疑问?这里没法在scope获取到权重的正则化值 ##是因为大家都是共享的权重,其当前的正则化应该也是一致的才对 #
                print 'gpu:======', i, 'regular loss is:', regular_losses
                total_loss = tf.add_n(losses + regular_losses,
                                      name='total_loss')
                grads = optimizer.compute_gradients(total_loss)
                #gradients, variables = zip(*optimizer.compute_gradients(total_loss))
                #gradients, _ = tf.clip_by_global_norm(t_list=gradients, clip_norm=100.0) ## clip ##
                #grads   = zip(gradients, variables)
                tf.summary.scalar('loss',
                                  total_loss)  ## 这里对每个name_scope下的loss都做了记录
            tower_grads.append(grads)
            tower_logits.append(logits)
grads = average_gradients(tower_grads)
train_op = optimizer.apply_gradients(
    grads, global_step=global_step)  ## 每次执行到这里,会对变量global_step自增1 #

merged_summary = tf.summary.merge_all()
saver = tf.train.Saver()
config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True),
                        device_count={'GPU': 4},
                        allow_soft_placement=True)
with tf.Session(config=config) as sess:
    writer = tf.summary.FileWriter(model_dir, sess.graph)
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    start_time = time.time()
    sess.graph.finalize()
    while True:
Esempio n. 5
0
def mode_mine(features, labels, mode, params):
    net_used = net_model()
    print 'before tf.feature_column.input_layer time:', time.ctime()
    x = tf.feature_column.input_layer(features=features,
                                      feature_columns=params['columns'])
    print 'after  tf.feature_column.input_layer time:', time.ctime()
    #print '-'*10, '#debug in mode_mine as input-x:', x.get_shape()
    #print '-'*10, '#debug in mode_mine as input-y:', labels.get_shape()
    ## predict mode ##
    if mode == tf.estimator.ModeKeys.PREDICT:
        logits = net_used.output(x)
        prob = tf.nn.softmax(logits, dim=1)
        prob_class = tf.argmax(prob, axis=1)
        predictions_op = {'prob': prob, 'prob_class': prob_class}
        return tf.estimator.EstimatorSpec(mode, predictions=predictions_op)
    ## train and eval mode using GPU ##
    #print '#debug current dataset batch_size:', x.get_shape().as_list(), x
    # tf.1.3无法处理最后的batch不等于batch_size的情况,这里tf.split会报错,那怎么办呢?
    # tf.1.3+可以用dataset.apply(tf.contrib.data.batch_and_drop_remainder(batch_size)) 来做丢弃
    print 'before tf.split time:', time.ctime()
    x_list = tf.split(x, num_or_size_splits=params['gpu_num'], axis=0)
    y_list = tf.split(labels, num_or_size_splits=params['gpu_num'], axis=0)
    print 'after  tf.split time:', time.ctime()
    tower_grads = []
    tower_logits = []
    optimizer = tf.train.AdamOptimizer(learning_rate=0.002)
    print '#debug, before gpu, tf.get_variable_scope():', tf.get_variable_scope(
    )
    print '#debug, before gpu, tf.variable_scope(tf.get_variable_scope()):', tf.variable_scope(
        tf.get_variable_scope())
    ## you will find, here tf.get_variable_scope 与 net_used 时,是同一个variable_scope
    with tf.variable_scope(tf.get_variable_scope()):
        ## what is happening tf.variable_scope, why the same tf.get_variable_scope as input get diff result
        ## tf.variable_scope : A context manager for defining ops that creates variables (layers) ##
        for i in xrange(params['gpu_num']):
            with tf.device('/gpu:' + str(i)):
                ## 指定在不同的GPU上,设置对应的操作 ##
                with tf.name_scope('classification-' + str(i)) as scope:
                    # model and loss
                    # name_scope是用来做什么的呢?
                    #print '#debug, net_used.w1.name: ', net_used.w1.name
                    #print '#debug, tf.get_collection(TRAINABLE_VARIABLES)', tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
                    #print '#debug, tf.get_variable_scope():', tf.get_variable_scope()
                    ## 发现collection里面的可训练变量只有 layer-1/w1:0 这样的模型参数 ##
                    #print 'gpu:', str(i), 'before net_use.output time:', time.ctime()
                    logits = net_used.output(x_list[i])
                    print 'gpu:', str(i), 'logits.name', logits.name
                    #print '#debug, here scope:', scope, logits
                    #print 'gpu:', str(i), 'before tf.losses time:', time.ctime()
                    loss = tf.losses.softmax_cross_entropy(
                        onehot_labels=y_list[i], logits=logits)
                    print 'gpu:', str(i), 'loss.name', loss.name
                    print 'gpu:', str(
                        i), 'trainable_variabels', tf.get_collection(
                            tf.GraphKeys.TRAINABLE_VARIABLES)
                    ## 疑问,上面这句话为什么要单独执行 ? 将当前name_scope的loss-opertion保存起来 ##
                    ## tf.losses.softmax_cross_entrypy will do what ?
                    ## 1) create a cross-entropy loss using tf.nn.softmax_cross_entropy_with_logits;
                    ## 2) notice: loss_collection=tf.GraphKeys.LOSSES 这个参数
                    ##    表示:collection to which the loss will be added
                    ##    将这个GPU上的计算loss结果add到tf.GraphKeys.LOSSES(也即losses) 里.
                    ##    其实如果有其他自定义的loss,也可以通过tf.losses.add_loss添加到collection的损失里.
                    ##    背后都是用ops.add_to_collection(GraphKeys.LOSSES, loss)
                    ## 3) 在来看下tf.Graph.add_to_collection(name, value)是干什么的?
                    ##    store the value in the collection given by name
                    ##    查看代码,最重要的一句: self._collections[name].append(value)
                    ##    于是我们知道了collections是一个map,key=tf.GraphKeys, value是通过add_to_collection追加的.
                    ## 难道是是为了在这个GPU下执行计算一遍loss,方便后面采集loss ##
                    ## tf.losses.softmax_cross_entropy 与 tf.nn.softmax_cross_entropy_with_logits的区别
                    ## 前者处理onehot_labels 后者更适合处理2分类,且没有自动添加到tf.GraphKeys.LOSSES的动作 ## 前者处理多分类 ##
                    ## 也有类似的tf.losses.sigmoid_cross_entropy 处理2分类,且将loss自动添加到tf.GraphKeys.LOSSES里 ##
                    ## 也可以使用 tf.nn.softmax_cross_entropy() 和 tf.add_to_collection(tf.GraphKeys.LOSSES, loss)来完成同样的动作 ##
                    ## 疑问,为什么不直接执行 loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits) ?而要多此一举,将loss添加到tf.GraphKeys.LOSSSE里面
                    ## 是因为,想要控制图的执行顺序,下面的update_ops必须先执行,再执行计算总loss的动作tf.add_n(loss_cur_gpu) ## 这个控制在BN里是非常重要的 ##
                    ## 实际上,这里并不需要控制update_ops在total_loss前执行,所以可以删掉控制流逻辑的diam,total_loss直接用tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits())来计算 ##
                    update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS,
                                                   scope)
                    #print 'gpu:', str(i), 'update_ops.name in scop', update_ops[0].name
                    #print 'gpu:', str(i), 'update_ops scope', update_ops
                    #print 'gpu:', str(i), 'update_ops all',  tf.get_collection(tf.GraphKeys.UPDATE_OPS)
                    ## 这句话,做了什么? 从collections的map里,取出了name=Update_ops in scope下的[value]
                    ## 采集 需要update的操作 ## update_ops in combination with reuse varscope
                    ## explain the tf.GraphKeys.UPDATE_OPS is What.
                    ## Custom functions to update some variables can be added to UPDATE_OPS
                    ## and, separately run at each iteration using sess.run(tf.get_collection(tf.GraphKeys.UPDATE_OPS))
                    ## In this case, these variables are set trainable=False to avoid being updated by gradient descent.
                    ## 注意这里是很重要的,会将trainable=True所涉及到的会change-variable值的操作都添加到tf.GraphKeys.UPDATE_OPS里面,
                    ## 是不是,意味着tf的opt.apply_gradient是会被添加到update_ops里面?然而并不能在执行前后,发现tf.GraphKeys.UPDATE_OPS与内容,都是空的 ##?常用的是BN对参数的更新在这里更新 ##
                    ## on earth, what is tf.GraphKeys.UPDATE_OPS ?
                    with tf.control_dependencies(
                            update_ops
                    ):  ## 这里是做什么用的 ?## 控制流程执行顺序,update_ops先执行完毕,再计算总loss值。
                        losses = tf.get_collection(tf.GraphKeys.LOSSES, scope)
                        total_loss = tf.add_n(
                            losses, name='total_loss')  ## 为什么没有除以数量,求均值 ? ##
                        print '#debug# ---losses in gpu:', str(i), losses
                        print '#debug# ---losses all   :', str(
                            i), tf.get_collection(tf.GraphKeys.LOSSES)
                        print '#debug# ---trainable var:', str(
                            i), tf.get_collection(
                                tf.GraphKeys.TRAINABLE_VARIABLES)
                        print '#total_loss ----- here  :', str(
                            i
                        ), total_loss  ## 这个GPU上的总loss ## 难道这里上面的loss 不是这个GPU上的总loss么,因为那个loss尺寸是[N, 1]=y.shape,并没有加和。
                        #print '#debug# --- update_ops  :', str(i), update_ops
                        ## tf.control_depencies表示 with段内的操作是在updates_op执行之后,再执行的 ## 控制了图的执行顺序 ##
                        ## notice: 这里get_collection都使用了 scope来过滤,保证 操作和结果 都是本scope-GPU内的.
                        ## 在本GPU上,根据切片输入,计算了logits,并将loss追加到collection里面;然后执行有update var的操作,再获取整个GPU上计算的loss,合起来,作为本GPU本切片的loss.
                        ## notice: 这里的total_loss是定义计算loss操作 ##
                        ## 如何将对应的正则loss也提取出来 ? ##
                        ## 如果前面在变量定义时,已经用regularizer设定了,则会自动被收集到colleciton的key=regularizer_loss里面
                    ## 疑惑,tf.GraphKeys.LOSSES里的值会在下次计算的时候更新么?还是继续追加呢,感觉有清理机制 ##
                    ## reuse variable ##
                    tf.get_variable_scope().reuse_variables(
                    )  ## 将当前变量空间,设置为其中变量可以重复使用,必须配合tf.get_variable来使用 ## gpu上的操作都在同一个变量空间内,后面的gpu:1/2/3都可以复用gpu:0时使用的变量
                    ## name_space 并不会影响 variable_space ##
                    ## 当我的变量在定义时,都在variable_scope(reuse=tf.AUTO_REUSE)设置下,这里岂不是就可以不用了?是的 ## 当每个用到了tf.get_variable的variable_scope下,都是共享参数的 ##
                    ## 这里共享的到底是谁?哪些变量?用tf.get_variable_scope来查看,发现是4个GPU下都是一样的variable_scope ##
                    ## 4个GPU在同一个variable_scope下,是因为最开始的with tf.variable_scope(tf.get_variable_scope)
                    ## 第一个GPU使用的trainable_variable,会由于 reuse=True的设置,允许后面GPU在使用tf.get_variable时,同名的变量是一样的/共享的.
                    ## 比如,这个GPU下使用了变量layer-1/w1:0来计算loss,那么后面的GPU在使用变量layer-1/w1:0来计算loss时,是用同一个变量layer-1/w1:0
                    ## 疑惑,为什么要将当前变量空间搞成变量可共享呢?都有什么变量呢? 感觉是将w1,w2,b1,b2共享 ## yes, 就是这个样子的,如果将变量定义的地方设置为variable_scope(reuse=tf.AUTO_REUSE),这里就省了 ##
                    print '-' * 10, 'gpu:', i, '#debug, tf.get_variable_scope', tf.get_variable_scope(
                    )
                    # grad compute
                    print 'gpu:', i, 'before compute_gradients time:', time.ctime(
                    )
                    grads = optimizer.compute_gradients(total_loss)
                    #print '-'*10, '#debug, compute_gradients', grads
                    ## this is the first part of minimize() ##
                    ## optimizer.compute_gradients(loss, var_list=None, gate_gradients=GATE_OP, aggregation_method=None, colocate_gradients_with_ops=False, grad_loss=None)
                    ## what will do this operation ? ##
                    ## compute gradients of loss for the variables in var_list(default=tf.GraphKeys.TRAINABLE_VARIABLES)
                    ## return a list of (gradient, variable) pair
                    tower_grads.append(grads)
                    tower_logits.append(logits)
    # we must calculate the mean of each gradient, notice: this is synchronization across all tower #
    print 'before average_gradient time:', time.ctime()
    grads = average_gradients(tower_grads)
    ## apply the gradients to adjust the sared variables.
    print 'before apply_gradients time:', time.ctime()
    train_op = optimizer.apply_gradients(
        grads, global_step=tf.train.get_global_step())
    print 'before tf.nn.softmax time:', time.ctime()
    prob = tf.nn.softmax(tf.concat(tower_logits, 0), dim=1)
    print 'before tf.argmax time:', time.ctime()
    prob_class = tf.argmax(prob, axis=1)
    print 'before tf.metrics.accuracy time:', time.ctime()
    accuracy = tf.metrics.accuracy(labels=tf.argmax(labels, axis=1),
                                   predictions=prob_class)
    ## train mode and eval mode ##
    if mode == tf.estimator.ModeKeys.TRAIN or mode == tf.estimator.ModeKeys.EVAL:
        print 'before tf.estiamtor.EstimatorSpe time:', time.ctime()
        return tf.estimator.EstimatorSpec(
            mode,
            loss=total_loss,
            train_op=train_op,
            eval_metric_ops={'accuracy': accuracy})