def _device(self, cluster):
    '''Get the device specification.

    Args:
        cluster: a tf cluster.

    Returns:
        - the device specification
        - the chief parameter server device
    '''
    if 'local' in cluster.as_dict():
        device = tf.DeviceSpec(job='local')
        chief_ps = None
    else:
        # Distributed training.
        num_servers = len(cluster.as_dict()['ps'])
        ps_strategy = tf.contrib.training.GreedyLoadBalancingStrategy(
            num_tasks=num_servers,
            load_fn=tf.contrib.training.byte_size_load_fn)
        device = tf.train.replica_device_setter(ps_tasks=num_servers,
                                                ps_strategy=ps_strategy)
        chief_ps = tf.DeviceSpec(job='ps', task=0)
    return device, chief_ps

def _model_fn(features, labels, mode, params):
    model_fn = MODELS[FLAGS.model].model
    global_step = tf.train.get_or_create_global_step()
    if FLAGS.num_gpus > 0 and mode == learn.ModeKeys.TRAIN:
        split_features = {k: tf.split(v, FLAGS.num_gpus)
                          for k, v in features.iteritems()}
        split_labels = {k: tf.split(v, FLAGS.num_gpus)
                        for k, v in labels.iteritems()}
        grads = []
        predictions = collections.defaultdict(list)
        losses = []
        opt = ops.create_optimizer(
            params.optimizer, params.learning_rate, params.decay_steps)
        for i in range(FLAGS.num_gpus):
            with tf.device(tf.DeviceSpec(device_type="GPU", device_index=i)):
                with tf.name_scope("tower_%d" % i):
                    with tf.variable_scope(tf.get_variable_scope(),
                                           reuse=i > 0):
                        device_features = {
                            k: v[i] for k, v in split_features.iteritems()}
                        device_labels = {
                            k: v[i] for k, v in split_labels.iteritems()}
                        device_predictions, device_loss = model_fn(
                            device_features, device_labels, mode, params)
                        for k, v in device_predictions.iteritems():
                            predictions[k].append(v)
                        if device_loss is not None:
                            losses.append(device_loss)
                            device_grads = opt.compute_gradients(device_loss)
                            grads.append(device_grads)
        grads = ops.average_gradients(grads)
        train_op = opt.apply_gradients(grads, global_step=global_step)
        for k, v in predictions.iteritems():
            predictions[k] = tf.concat(v, axis=0)
        loss = tf.add_n(losses) if losses else None
    else:
        with tf.device(tf.DeviceSpec(device_type="GPU", device_index=0)):
            predictions, loss = model_fn(features, labels, mode, params)
            train_op = None
            if mode == learn.ModeKeys.TRAIN:
                opt = ops.create_optimizer(
                    params.optimizer, params.learning_rate,
                    params.decay_steps)
                train_op = opt.minimize(loss, global_step=global_step)
    tf.summary.scalar("loss/loss", loss)
    return tf.contrib.learn.ModelFnOps(
        mode=mode, predictions=predictions, loss=loss, train_op=train_op)

def stop_chief(self, server, sess=None):
    num_ps = len(self.clusterspec_dict[JobType.ps])
    num_workers = len(self.clusterspec_dict[JobType.worker])
    enq_ops = []
    ps_devtasklist = [
        tf.DeviceSpec(job=JobType.ps, task=ii) for ii in range(num_ps)
    ]
    wrk_devtasklist = [
        tf.DeviceSpec(job=JobType.worker, task=ii)
        for ii in range(1, num_workers)
    ]
    devtasklist = ps_devtasklist + wrk_devtasklist
    for q in create_done_queues_chief(devtasklist):
        qop = q.enqueue(1)
        enq_ops.append(qop)
    if sess is None:
        with self.get_session(server) as sess:
            for op in enq_ops:
                sess.run(op)
    else:
        for op in enq_ops:
            sess.run(op)

def _set_train_or_infer(self, hparams, res, loss):
    """Set up training and inference."""
    # Training
    if self.mode == tf.estimator.ModeKeys.TRAIN:
        trainable_vars = tf.trainable_variables()
        total_loss = loss[0]

        # Print trainable variables.
        utils.print_out("# Trainable variables")
        utils.print_out("Format: <name>, <shape>, <(soft) device placement>")
        for param in trainable_vars:
            utils.print_out("  {}, {}, {}".format(
                param.name, str(param.get_shape()), param.op.device))

        # [K by N]. K: num_gpu, N: num_variables per GPU.
        list_vars = [
            list(filter(lambda x: "tower_{:d}".format(gpu_idx) in x.name,
                        trainable_vars))
            for gpu_idx in range(self.num_gpu)
        ]

        with tf.variable_scope("optimization"):
            # Compute gradients per device.
            list_grads = []
            with tf.name_scope("compute_gradients"):
                for gpu_idx in range(self.num_gpu):
                    with tf.device(tf.DeviceSpec(device_type="GPU",
                                                 device_index=gpu_idx)), \
                            tf.name_scope("tower_{}".format(gpu_idx)):
                        tower_loss = total_loss[gpu_idx]
                        list_grads.append(tf.gradients(
                            tower_loss, list_vars[gpu_idx],
                            colocate_gradients_with_ops=hparams.colocate_gradients_with_ops))

            # Apply NCCL all-reduce with averaging on list_grads.
            with tf.name_scope("all_reduce"):
                list_grads = model_helper.allreduce_tensors(list_grads,
                                                            average=True)

            # Gradient clipping (not clipped if max_gradient_norm is None).
            with tf.name_scope("clipping"):
                list_grads, list_norms = model_helper.gradient_clip(
                    list_grads, max_gradient_norm=hparams.max_gradient_norm)
            self.grad_norm = list_norms[0]

            # Apply gradients per device.
            opts = []
            update_ops = []
            with tf.variable_scope("optimizer"):
                for gpu_idx in range(self.num_gpu):
                    with tf.device(tf.DeviceSpec(device_type="GPU",
                                                 device_index=gpu_idx)), \
                            tf.variable_scope("tower_{}".format(gpu_idx)):
                        if hparams.optimizer == "sgd":
                            optimizer = tf.train.GradientDescentOptimizer(
                                self.learning_rate)
                        elif hparams.optimizer == "adam":
                            optimizer = tf.train.AdamOptimizer(
                                self.learning_rate)
                        else:
                            raise ValueError("Unknown optimizer type {}"
                                             .format(hparams.optimizer))
                        opts.append(optimizer)
                        update_ops.append(optimizer.apply_gradients(
                            zip(list_grads[gpu_idx], list_vars[gpu_idx])))

            add_global_step = tf.assign_add(self.global_step, 1)
            with tf.control_dependencies([add_global_step]):
                self.update = tf.group(*update_ops, name='update_op')

        self.train_summary = self._get_train_summary()

def __init__(self, create_fn, embeddings, labels, **kwargs):
    super(ClassifyParallelModel, self).__init__()
    # We need to remove these because we may be calling back to our caller,
    # and we need the condition of calling to be non-parallel.
    gpus = kwargs.pop('gpus', -1)
    # If the gpu ID is set to -1, use CUDA_VISIBLE_DEVICES to figure it out.
    if gpus == -1:
        gpus = len(os.getenv('CUDA_VISIBLE_DEVICES',
                             os.getenv('NV_GPU', '0')).split(','))
    print('Num GPUs', gpus)
    self.labels = labels
    nc = len(labels)
    self.saver = None
    self.replicas = []
    self.mxlen = int(kwargs.get('mxlen', 100))
    self.mxwlen = int(kwargs.get('mxwlen', 40))
    self.pdrop_value = kwargs.get('dropout', 0.5)
    # These placeholders only exist to make exporting easier.
    self.x = kwargs.get('x', tf.placeholder(tf.int32, [None, self.mxlen],
                                            name="x_parallel"))
    self.y = kwargs.get('y', tf.placeholder(tf.int32, [None, nc],
                                            name="y_parallel"))
    self.lengths = kwargs.get('lengths',
                              tf.placeholder(tf.int32, [None],
                                             name="lengths_parallel"))
    self.pkeep = kwargs.get('pkeep',
                            tf.placeholder_with_default(1.0, shape=(),
                                                        name="pkeep"))
    x_splits = tf.split(self.x, gpus)
    y_splits = tf.split(self.y, gpus)
    lengths_splits = tf.split(self.lengths, gpus)
    xch_splits = None
    c2v = embeddings.get('char')
    if c2v is not None:
        self.xch = kwargs.get('xch',
                              tf.placeholder(tf.int32,
                                             [None, self.mxlen, self.mxwlen],
                                             name='xch_parallel'))
        xch_splits = tf.split(self.xch, gpus)

    losses = []
    # The session must outlive __init__ (it is stored on self.sess below),
    # so it is created without a `with` block.
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                            log_device_placement=False))
    with tf.device(tf.DeviceSpec(device_type="CPU")):
        self.inference = create_fn(embeddings, labels, sess=sess, **kwargs)
    for i in range(gpus):
        with tf.device(tf.DeviceSpec(device_type='GPU', device_index=i)):
            replica = create_fn(
                embeddings, labels, sess=sess,
                x=x_splits[i], y=y_splits[i],
                xch=xch_splits[i] if xch_splits is not None else None,
                lengths=lengths_splits[i],
                pkeep=self.pkeep, **kwargs)
            self.replicas.append(replica)
            loss_op = replica.create_loss()
            losses.append(loss_op)
    self.loss = tf.reduce_mean(tf.stack(losses))
    self.sess = sess
    self.best = self.inference.best

def _compute_loss(self, hparams, res):
    """Compute loss."""
    # Regression loss.
    with tf.name_scope("regression_loss"):
        loss_type = hparams.loss
        with tf.name_scope("target_placeholder"):
            target_phs = list_ops.list_placeholder(
                self.num_gpu,
                (None, self.target_length, self.target_dims), tf.float32)
            for ph in target_phs:
                tf.add_to_collection('placeholder', ph)
        with tf.name_scope("{:s}_loss".format(loss_type)):
            if loss_type == "l2":
                loss = list_ops.list_l2(target_phs, res)
            elif loss_type == "weighted_smooth_l1":
                loss = list_ops.list_weighted_smooth_l1(target_phs, res)
            else:
                raise ValueError("Unknown loss type {:s}".format(loss_type))
        with tf.name_scope("reduce_sum"):
            loss = list_ops.list_reduce_sum(loss)
        with tf.name_scope("cast"):
            batch_size_float = list_ops.list_cast(self.batch_size,
                                                  tf.float32)
        with tf.name_scope("division"):
            list_regression_loss = list_ops.list_divide(loss,
                                                        batch_size_float)
        with tf.device(tf.DeviceSpec(device_type="CPU", device_index=0)), \
                tf.name_scope("reduce_mean"):
            self.regression_loss = tf.reduce_mean(list_regression_loss)

    # Weight decay loss.
    with tf.name_scope("weight_decay_loss"):
        all_decay_losses = tf.losses.get_regularization_losses()
        if len(all_decay_losses):
            list_regs = [
                list(filter(
                    lambda x: "tower_{:d}".format(gpu_idx) in x.name,
                    all_decay_losses))
                for gpu_idx in range(self.num_gpu)
            ]
            with tf.name_scope("add_n"):
                list_decay_loss = list_ops.list_add_n(list_regs)
        else:
            list_decay_loss = list_ops.list_zeros_like(
                [np.float32(0.0) for _ in range(self.num_gpu)])
        self.decay_loss = list_decay_loss[0]

    # Total loss.
    with tf.name_scope("total_loss"):
        list_total_loss = [*zip(list_regression_loss, list_decay_loss)]
        with tf.name_scope("add_n"):
            list_total_loss = list_ops.list_add_n(list_total_loss)
        with tf.device(tf.DeviceSpec(device_type="CPU", device_index=0)), \
                tf.name_scope("reduce_mean"):
            self.total_loss = tf.reduce_mean(list_total_loss)

    # Define TensorBoard loss summaries.
    tf.summary.scalar("all_tower_mean", self.total_loss,
                      family='total_loss')
    for gpu_idx in range(self.num_gpu):
        tf.summary.scalar("tower_{:d}".format(gpu_idx),
                          list_total_loss[gpu_idx], family='total_loss')
    tf.summary.scalar("all_tower_mean", self.regression_loss,
                      family='regression_loss')
    for gpu_idx in range(self.num_gpu):
        tf.summary.scalar("tower_{:d}".format(gpu_idx),
                          list_regression_loss[gpu_idx],
                          family='regression_loss')
    tf.summary.scalar("weight_decay_loss", self.decay_loss,
                      family='regularization_loss')
    return list_total_loss, list_regression_loss, list_decay_loss

def stop_chief(self, server, sess=None, stop_workers=True):
    num_workers = self.num_workers
    chief_devtask = tf.DeviceSpec(job=JobType.worker, task=0)
    queue_from_workers = [
        create_done_queue_task(
            chief_devtask,
            shared_name='done_queue_worker_{}'.format(ii))
        for ii in range(1, num_workers)
    ]
    sess = self.get_session(server) if sess is None else sess
    # MAKE SURE ALL THE WORKERS ARE DONE BEFORE STOPPING
    for qfw in queue_from_workers:
        # RECEIVE SIGNAL FROM WORKERS.
        sess.run(qfw.dequeue())
    # SEND SIGNALS TO EVERYONE ELSE TO QUIT
    num_ps = self.num_ps
    num_workers = self.num_workers
    enq_ops = []
    ps_devtasklist = [
        tf.DeviceSpec(job=JobType.ps, task=ii) for ii in range(num_ps)
    ]
    wrk_devtasklist = [
        tf.DeviceSpec(job=JobType.worker, task=ii)
        for ii in range(1, num_workers)
    ]
    # STOP WORKERS FIRST BEFORE PS
    if stop_workers:
        devtasklist = wrk_devtasklist + ps_devtasklist
    else:
        devtasklist = ps_devtasklist
    for q in create_done_queues_chief(devtasklist):
        qop = q.enqueue(1)
        enq_ops.append(qop)
    if sess is None:
        with self.get_session(server) as sess:
            for op in enq_ops:
                sess.run(op)
    else:
        for op in enq_ops:
            sess.run(op)

def get_device_spec(device, next_=True):
    global current_index
    if device in ('cpu', 'CPU'):
        device_spec = tf.DeviceSpec(device_type='CPU', device_index=0)
    else:
        device_spec = tf.DeviceSpec(device_type=device['name'],
                                    device_index=current_index)
        if next_:
            # Advance the round-robin index, wrapping at the device count.
            current_index = (current_index + 1) % device['count']
    LOGGER.debug(device_spec.to_string())
    return device_spec

def get_device_spec(device):
    global GPU_INDEX
    if device in ('cpu', 'CPU'):
        device_spec = tf.DeviceSpec(device_type='CPU', device_index=0)
    else:
        device_spec = tf.DeviceSpec(device_type=device['name'],
                                    device_index=GPU_INDEX)
        # Advance the round-robin index, wrapping at the device count.
        GPU_INDEX += 1
        GPU_INDEX %= device['count']
    LOGGER.debug(device_spec.to_string())
    return device_spec

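# A minimal usage sketch for the round-robin helpers above; this demo is an
# assumption, not part of the original source. `device` is either the string
# 'cpu' or a dict such as {'name': 'GPU', 'count': 2}, and the module-level
# GPU_INDEX counter is assumed to start at 0.
def _demo_round_robin():
    gpu = {'name': 'GPU', 'count': 2}
    with tf.device(get_device_spec(gpu)):  # placed on /device:GPU:0
        a = tf.constant(1.0)
    with tf.device(get_device_spec(gpu)):  # placed on /device:GPU:1
        b = tf.constant(2.0)
    with tf.device(get_device_spec(gpu)):  # index wraps back to /device:GPU:0
        return a + b
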
def maybe_device_gpu(device_index=0):
    if USE_DEVICE == defines.DEVICE_GPU:
        if not tf.test.is_built_with_cuda():
            print('WARNING: TensorFlow was not built with CUDA; '
                  'falling back to CPU mode.')
            # Fall back to a CPU device context so the return type stays
            # consistent across all branches.
            return tf.device(tf.DeviceSpec(
                device_type=defines.DEVICE_CPU, device_index=0))
        if not tf.test.is_gpu_available():
            print('WARNING: There is no GPU available; '
                  'falling back to CPU mode.')
            return tf.device(tf.DeviceSpec(
                device_type=defines.DEVICE_CPU, device_index=0))
        return tf.device(tf.DeviceSpec(
            device_type=defines.DEVICE_GPU, device_index=device_index))
    return tf.device(tf.DeviceSpec(
        device_type=defines.DEVICE_CPU, device_index=0))

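# Hypothetical call site for maybe_device_gpu (the USE_DEVICE flag and
# defines.* constants come from the original module; the layer below is
# illustrative). The helper returns a tf.device context manager, so graph
# construction can be wrapped directly.
def _demo_maybe_device_gpu():
    with maybe_device_gpu(device_index=0):
        return tf.layers.dense(tf.zeros([8, 16]), 10)
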
def make_parallel(model_fn, features, labels, mode, params, num_gpus):
    with tf.device(tf.DeviceSpec(device_type="CPU", device_index=0)):
        split_features = {
            k: tf.split(v, num_gpus) for k, v in features.items()
        }
        split_labels = {k: tf.split(v, num_gpus) for k, v in labels.items()}

    predictions = collections.defaultdict(list)
    losses = []
    tower_grads_and_vars = []
    for i in range(num_gpus):
        with tf.device(tf.DeviceSpec(device_type="GPU", device_index=i)):
            with tf.name_scope("tower_%d" % i) as name_scope:
                with tf.variable_scope(tf.get_variable_scope(), reuse=i > 0):
                    device_features = {
                        k: v[i] for k, v in split_features.items()
                    }
                    device_labels = {k: v[i] for k, v in split_labels.items()}
                    device_predictions, device_loss, device_metrics = model_fn(
                        device_features, device_labels, mode, params)
                    if i == 0:
                        # Collect metrics, update ops, and regularization
                        # losses from the first tower only.
                        eval_metrics = device_metrics
                        update_ops = tf.get_collection(
                            tf.GraphKeys.UPDATE_OPS, name_scope)
                        reg_losses = tf.get_collection(
                            tf.GraphKeys.REGULARIZATION_LOSSES, name_scope)
                    for k, v in device_predictions.items():
                        predictions[k].append(v)
                    if device_loss is not None:
                        losses.append(device_loss)
                    device_all_vars = tf.trainable_variables()
                    device_grads = tf.gradients(device_loss, device_all_vars)
                    device_grads_and_vars = list(
                        zip(device_grads, device_all_vars))
                    tower_grads_and_vars.append(device_grads_and_vars)
    for k, v in predictions.items():
        predictions[k] = tf.concat(v, axis=0)
    return (predictions, losses, reg_losses, update_ops, eval_metrics,
            tower_grads_and_vars)

def parallelize(fn, num_gpus, **kwargs):
    """Parallelizes a tensorflow function.

    Args:
        fn (callable): A function taking keyword arguments.
        num_gpus (int): The number of GPUs to parallelize on.
        kwargs: Keyword arguments for fn. The values should be tensors,
            because they have to be split into num_gpus parts.

    Returns:
        list: A list of tensors containing the concatenated results of the
            parallel function calls.
    """
    parts = {}
    for k, v in iteritems(kwargs):
        parts[k] = tf.split(v, num_gpus)
    output = []
    for g in range(num_gpus):
        with tf.device(tf.DeviceSpec(device_type='GPU', device_index=g)):
            # All GPUs reuse the variables of GPU 0.
            with tf.variable_scope(tf.get_variable_scope(), reuse=g > 0):
                output.append(fn(**{k: v[g] for k, v in iteritems(parts)}))
    output = [outp for outp in zip(*output)]
    concat_output = []
    for outp in output:
        # Can't concat scalars, so use stack instead.
        if isinstance(outp[0], list):
            concat_output.append(outp)
        elif outp[0].get_shape().ndims == 0:
            concat_output.append(tf.stack(outp))
        else:
            concat_output.append(tf.concat(outp, axis=0))
    return concat_output

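# A usage sketch for parallelize, assuming TensorFlow 1.x and two GPUs; the
# tower function, its layer name, and the placeholder shapes below are
# illustrative assumptions, not from the original source.
def _demo_parallelize():
    x = tf.placeholder(tf.float32, [None, 16])
    y = tf.placeholder(tf.float32, [None, 1])

    def tower(x, y):
        pred = tf.layers.dense(x, 1, name='pred')   # variables shared across towers
        loss = tf.reduce_mean(tf.square(pred - y))  # scalar, so it gets stacked
        return pred, loss

    # pred is concatenated along the batch axis; loss is stacked per tower.
    pred, loss = parallelize(tower, 2, x=x, y=y)
    return pred, tf.reduce_mean(loss)
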
def make_parallel(self, fn, num_gpus, **kwargs):
    """Parallelize the given model on multiple GPU devices.

    Adapted from:
    https://github.com/vahidk/EffectiveTensorflow#make_parallel
    """
    in_splits = {}
    for k, v in kwargs.items():
        if k in ('num_classes', 'is_training'):
            # Non-tensor arguments are replicated rather than split.
            in_splits[k] = [v] * num_gpus
        elif type(v) is tf.SparseTensor:
            in_splits[k] = tf.sparse_split(sp_input=v, num_split=num_gpus,
                                           axis=0)
        else:
            in_splits[k] = tf.split(v, num_gpus)

    out_split = []
    for i in range(num_gpus):
        with tf.device(tf.DeviceSpec(device_type="GPU", device_index=i)):
            with tf.variable_scope(tf.get_variable_scope(),
                                   reuse=tf.AUTO_REUSE):
                outputs = fn(**{k: v[i] for k, v in in_splits.items()})
                for o in range(len(outputs)):
                    if o >= len(out_split):
                        out_split.append([])
                    out_split[o].append(outputs[o])
    return [tf.stack(o, axis=0) for o in out_split]

def get_allps_devlist(self):
    num_ps = self.num_ps
    ps_devtasklist = [
        tf.DeviceSpec(job=JobType.ps, task=ii) for ii in range(num_ps)
    ]
    return ps_devtasklist

def __init__(self, output_actions_size, thread_id=0, device='cpu',
             device_index=0, learning_rate=0.0001, beta=0.01):
    self.width, self.height, self.depth = 84, 84, 4
    self.thread_id = thread_id
    self.device_spec = tf.DeviceSpec(device_type=device,
                                     device_index=device_index)
    self.scope = 'net_' + str(thread_id)
    self.learning_rate = learning_rate
    self.beta = beta
    self.output_actions_size = output_actions_size
    with tf.device(self.device_spec), tf.variable_scope(self.scope) as scope:
        self.input_state = tf.placeholder(
            "float", [None, self.height, self.width, self.depth])
        self.advantage = tf.placeholder("float", [None])
        self.targets = tf.placeholder("float", [None])
        self.actions = tf.placeholder("float",
                                      [None, self.output_actions_size])
        self._build_graph()

def build_graph(self, hparams, scope=None):
    with tf.variable_scope("Model"):
        utils.print_out("# Creating {} graph ...".format(self.mode))
        is_training = (self.mode == tf.estimator.ModeKeys.TRAIN)

        # Encoder
        list_encoder_output, list_encoder_state = self._build_encoder(
            hparams, is_training)
        # Stop discriminator
        list_stop_score, list_classifier_result = self._build_stop_discriminator(
            hparams, list_encoder_output, is_training)
        # Decoder
        list_regression, _ = self._build_decoder(
            hparams, list_encoder_output, list_encoder_state, is_training)

        with tf.device(tf.DeviceSpec(device_type="CPU", device_index=0)), \
                tf.name_scope("output"):
            # Concatenate final outputs from all devices.
            self.regression = tf.concat(list_regression, axis=0)
            self.stop = tf.concat(list_classifier_result, axis=0)

        list_losses = None
        if self.mode != tf.estimator.ModeKeys.PREDICT:
            # Calculate loss in the train and eval phases.
            with tf.name_scope("loss"):
                list_losses = self._compute_loss(hparams, list_regression,
                                                 list_stop_score)
    return (list_regression, list_classifier_result), list_losses

def __create_validate(self, depth_multiplier, is_reuse=False):
    # Create the network graph for validation.
    logger.info('creating a mobilenet graph for validation... is_reuse=%d'
                % is_reuse)
    with tf.device(tf.DeviceSpec(device_type="GPU", device_index=0)), \
            tf.variable_scope('tower0'):
        self.output_valid, _ = self.__create_network_for_imagenet(
            self.ph_valid_image,
            is_training=self.is_training,
            is_reuse=is_reuse,
            depth_multiplier=depth_multiplier)
        # Loss
        self.loss_valid = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=self.ph_valid_label, logits=self.output_valid)
        self.acc_valid_top1 = tf.cast(
            tf.nn.in_top_k(self.output_valid, self.ph_valid_label, k=1),
            dtype=tf.float32)
        self.acc_valid_top5 = tf.cast(
            tf.nn.in_top_k(self.output_valid, self.ph_valid_label, k=5),
            dtype=tf.float32)

def call(self, features, training=False):
    device_spec = tf.DeviceSpec(device_type="CPU", device_index=0)
    with tf.device(device_spec):
        # shape: (B, T, E)
        vgids_emb, sequence_len = self.vgids_layer(features)
        vsids_emb, _ = self.vsids_layer(features)
        vcids_emb, _ = self.vcids_layer(features)
        vgprices_emb, _ = self.vgprices_layer(features)
        if training:
            add_mba_reg(self, features, vgids_emb, 'user.visited_goods_ids')
            add_mba_reg(self, features, vsids_emb, 'user.visited_shop_ids')
            add_mba_reg(self, features, vcids_emb, 'user.visited_cate_ids')
            add_mba_reg(self, features, vgprices_emb,
                        'user.visited_goods_prices')
        vgoods_shape = tf.shape(vgids_emb)
        query_emb = self.text_emb(features, self.query_layer,
                                  self.query_conv_layer, vgoods_shape[1])
        # shape: (B, T, E)
        user_behavior_rep = tf.concat(
            [vgids_emb, vsids_emb, vcids_emb, vgprices_emb, query_emb],
            axis=-1)
        # shape: (B, T, 64)
        user_behavior_rep = self.mlp(user_behavior_rep, training=training)
        return [user_behavior_rep, sequence_len]

def call(self, features, training=False):
    device_spec = tf.DeviceSpec(device_type="CPU", device_index=0)
    with tf.device(device_spec):
        # shape: (B, T, E)
        gids_emb, sequence_len = self.gids_layer(features)
        sids_emb, _ = self.sids_layer(features)
        cids_emb, _ = self.cids_layer(features)
        gprices_emb, _ = self.gprices_layer(features)
        rankpos_emb, _ = self.rankpos_layer(features)
        showpos_emb, _ = self.showpos_layer(features)
        if training:
            add_mba_reg(self, features, gids_emb, 'item.goods_ids')
            add_mba_reg(self, features, sids_emb, 'item.shop_ids')
            add_mba_reg(self, features, cids_emb, 'item.cate_ids')
            add_mba_reg(self, features, gprices_emb, 'item.goods_prices')
        title_emb = self.text_emb(features, self.title_layer,
                                  self.title_conv_layer, self.title_len,
                                  self.twe_dim)
        content_emb = self.text_emb(features, self.content_layer,
                                    self.content_conv_layer,
                                    self.content_len, self.cwe_dim)
        # shape: (B, T, E)
        items_rep = tf.concat(
            [gids_emb, sids_emb, cids_emb, gprices_emb, title_emb,
             content_emb],
            axis=-1)
        # Model the rank position.
        items_rep = rankpos_emb + items_rep
        # shape: (B, T, 64)
        items_rep = self.mlp(items_rep, training=training)
        return [items_rep, sequence_len, showpos_emb]

def Multigpu_train(model_fn, num_gpus, rgb_input, flow_input):
    in_splits = {}
    in_splits['rgb'] = (tf.split(rgb_input, num_gpus)
                        if rgb_input is not None else None)
    in_splits['flow'] = (tf.split(flow_input, num_gpus)
                         if flow_input is not None else None)
    out_split = []
    for i in range(num_gpus):
        device_type = 'GPU' if tf.test.is_built_with_cuda() else 'CPU'
        with tf.device(tf.DeviceSpec(device_type=device_type,
                                     device_index=i)):
            with tf.variable_scope(tf.get_variable_scope(),
                                   reuse=tf.AUTO_REUSE):
                if in_splits['flow'] is None:
                    out_split.append(model_fn(in_splits['rgb'][i], None))
                elif in_splits['rgb'] is None:
                    out_split.append(model_fn(None, in_splits['flow'][i]))
                else:
                    out_split.append(model_fn(in_splits['rgb'][i],
                                              in_splits['flow'][i]))
    out = tf.concat(out_split, axis=0)
    return out

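# Example invocation of Multigpu_train; the placeholder shapes and the
# model_fn argument are assumptions for illustration. The batch axis must be
# divisible by num_gpus, since tf.split partitions it evenly.
def _demo_multigpu_train(model_fn):
    rgb = tf.placeholder(tf.float32, [8, 16, 224, 224, 3])
    flow = tf.placeholder(tf.float32, [8, 16, 224, 224, 2])
    return Multigpu_train(model_fn, 2, rgb, flow)
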
def make_parallel(num_gpus, images, questions, answers, phase_train):
    with tf.device('/cpu:0'):
        image = tf.split(images, num_gpus)
        answer = tf.split(answers, num_gpus)
        question = tf.split(tf.reverse(questions, [-1]), num_gpus)

    loss_split = []
    mi_loss_split = []
    accuracy_split = []
    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
        for i in range(num_gpus):
            with tf.name_scope('Tower_%d' % i):
                with tf.device(tf.DeviceSpec(device_type='GPU',
                                             device_index=i)):
                    (cross_entropy, mi_loss, correct_prediction) = get_model(
                        image[i], question[i], answer[i], phase_train)
                    loss_split.append(cross_entropy)
                    mi_loss_split.append(mi_loss)
                    accuracy_split.append(correct_prediction)

    with tf.device('/cpu:0'):
        mean_loss = tf.reduce_mean(loss_split)
        mean_mi_loss = tf.reduce_mean(mi_loss_split)
        mean_accuracy = tf.reduce_mean(accuracy_split)
    return (mean_loss, mean_mi_loss, mean_accuracy)

def join(self, server, sess=None, exit_flag=True):
    task_id = self.mytask_id
    jobtype = self.myjobtype
    if jobtype == JobType.worker:
        self._signal_chief(server, sess)
    mydevtask = tf.DeviceSpec(job=jobtype, task=task_id)
    queue = create_done_queue_task(mydevtask)
    # RECEIVE SIGNAL FROM CHIEF.
    if sess is None:
        with self.get_session(server) as sess:
            sess.run(queue.dequeue())
    else:
        sess.run(queue.dequeue())
    print("{} {} RECEIVED DONE. QUITTING".format(jobtype, task_id),
          file=sys.stderr)
    if exit_flag:
        sys.exit(0)

def __init__(self, feature_config, rate=0.3):
    super(UserBehaviorEmbedding, self).__init__()
    feature_configs = feature_config.get_feature_configs()
    self.query_len = feature_configs['user.query_word_ids']['query_len']
    device_spec = tf.DeviceSpec(device_type="CPU", device_index=0)
    with tf.device(device_spec):
        feature_columns = feature_config.get_feature_columns()
        self.vgids_layer = SequenceFeatures(
            [feature_columns.get('user.visited_goods_ids')])
        self.vsids_layer = SequenceFeatures(
            [feature_columns.get('user.visited_shop_ids')])
        self.vcids_layer = SequenceFeatures(
            [feature_columns.get('user.visited_cate_ids')])
        self.vgprices_layer = SequenceFeatures(
            [feature_columns.get('user.visited_goods_prices')])
        self.query_layer = SequenceFeatures(
            [feature_columns.get('user.query_word_ids')])
        # Query text convolution layer.
        self.query_conv_layer = QueryTextConv(FLAGS.qtxt_filters,
                                              FLAGS.qtxt_kernel_sizes,
                                              self.query_len)
        # Multi-layer projection.
        self.mlp_bn1 = tf.keras.layers.BatchNormalization(epsilon=1e-6)
        self.mlp_drop1 = tf.keras.layers.Dropout(rate=rate)
        self.mlp_dense1 = tf.keras.layers.Dense(FLAGS.be_filter_size,
                                                activation='relu')
        self.mlp_bn2 = tf.keras.layers.BatchNormalization(epsilon=1e-6)
        self.mlp_drop2 = tf.keras.layers.Dropout(rate=rate)
        self.mlp_dense2 = tf.keras.layers.Dense(FLAGS.hidden_size,
                                                activation='relu')

def make_parallel(fn, num_gpus, **kwargs):
    # For each tensor in kwargs, create a split and add it to the dictionary.
    in_splits = {}
    for k, v in kwargs.items():
        in_splits[k] = tf.split(v, num_gpus)

    loss_split = []
    correct_split = []
    pred_split = []
    for i in range(num_gpus):
        with tf.device(tf.DeviceSpec(device_type="GPU", device_index=i)):
            # Allow variable reuse on GPUs beyond index 0.
            with tf.variable_scope(tf.get_variable_scope(), reuse=i > 0):
                # Pass the splits into the function and collect the results.
                loss, correct_prediction, pred = fn(
                    **{k: v[i] for k, v in in_splits.items()})
                loss_split.append(loss)
                correct_split.append(correct_prediction)
                pred_split.append(pred)

    return (tf.concat(loss_split, axis=0),
            tf.concat(correct_split, axis=0),
            tf.concat(pred_split, axis=0))

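# Illustrative tower function for the make_parallel above; the shapes and
# layer name are assumptions. Each of the three returned tensors is
# per-example, so concatenating along axis 0 restores the full batch.
def _demo_make_parallel(images, labels):
    def tower(images, labels):
        logits = tf.layers.dense(tf.layers.flatten(images), 10,
                                 name='logits')
        loss = tf.nn.softmax_cross_entropy_with_logits_v2(labels=labels,
                                                          logits=logits)
        pred = tf.argmax(logits, axis=1)
        correct = tf.equal(pred, tf.argmax(labels, axis=1))
        return loss, correct, pred

    return make_parallel(tower, 2, images=images, labels=labels)
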
def model(input, targets, training, alpha, dropout=0.3, gpu_num=0):
    target15, target14, target13, target12, target11, target10 = targets
    print('input:', input.shape)
    input = tf.split(input, gpu_num)
    target15 = tf.split(target15, gpu_num)
    target14 = tf.split(target14, gpu_num)
    target13 = tf.split(target13, gpu_num)
    target12 = tf.split(target12, gpu_num)
    target11 = tf.split(target11, gpu_num)
    target10 = tf.split(target10, gpu_num)
    losses = []
    Decoded_all = []
    for gpu_id in range(int(gpu_num)):
        reuse = gpu_id > 0
        with tf.device(tf.DeviceSpec(device_type="GPU",
                                     device_index=gpu_id)):
            Decoded = en_decode(input[gpu_id], training, dropout, reuse)
            out15, out14, out13, out12, out11, out10 = Decoded
            loss = 0
            loss += abs_loss(out15, target15[gpu_id]) * pdims(out15) * 10
            loss += abs_loss(out14, target14[gpu_id]) * pdims(out14) * 2
            loss += abs_loss(out13, target13[gpu_id]) * pdims(out13)
            loss += abs_loss(out12, target12[gpu_id]) * pdims(out12)
            loss += abs_loss(out11, target11[gpu_id]) * pdims(out11)
            loss += abs_loss(out10, target10[gpu_id]) * pdims(out10)
            loss /= 100
            losses.append(loss)
            Decoded_all.append(Decoded)
    L2_loss = tf.losses.get_regularization_loss() * 1e-4
    loss = tf.reduce_mean(tf.stack(losses, axis=0))
    loss += L2_loss
    trainables = tf.trainable_variables()
    train_vgg = tf.train.MomentumOptimizer(
        tf.maximum(alpha / 2, 1e-7), 0.9).minimize(
            loss, var_list=[var for var in trainables if 'vgg' in var.name])
    train_others = tf.train.MomentumOptimizer(alpha, 0.9).minimize(
        loss, var_list=[var for var in trainables if 'vgg' not in var.name])
    train = tf.group(train_vgg, train_others)
    D = []
    for i in range(len(Decoded_all[0])):
        # Concatenate output i across all GPU towers.
        outs = [Decoded_all[j][i] for j in range(len(Decoded_all))]
        outs = tf.concat(outs, axis=0)
        D.append(tf.nn.relu(outs))
    m = L2_loss
    return train, loss, D, m

def make_parallel(model, num_gpus, imgs):
    out_split = []
    for i in range(num_gpus):
        with tf.device(tf.DeviceSpec(device_type="GPU", device_index=i)):
            with tf.variable_scope(tf.get_variable_scope(), reuse=i > 0):
                out_split.append(model(images=imgs[i]))
    return out_split

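# Usage sketch (an assumption based on the indexing above): imgs is expected
# to be a list of per-GPU batches, e.g. produced by tf.split along the batch
# axis.
def _demo_make_parallel_model(model, images, num_gpus=2):
    imgs = tf.split(images, num_gpus)
    return make_parallel(model, num_gpus, imgs)
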
def distribution_gpus(num_gpus):
    if num_gpus == 1:
        return tf.contrib.distribute.OneDeviceStrategy(
            device=tf.DeviceSpec(device_type="GPU", device_index=0))
    elif num_gpus > 1:
        return tf.contrib.distribute.MirroredStrategy(num_gpus=num_gpus)
    else:
        return None

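# Sketch of wiring distribution_gpus into an Estimator RunConfig (the call
# site is an assumption; requires a TF 1.x release whose RunConfig accepts
# train_distribute).
def _demo_run_config(num_gpus):
    strategy = distribution_gpus(num_gpus)  # None disables distribution
    return tf.estimator.RunConfig(train_distribute=strategy)
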
def _set_params_initializer(self, hparams, mode, scope):
    """Set various params for self and initialize."""
    self.mode = mode
    self.num_gpu = hparams.num_gpu
    with tf.device(tf.DeviceSpec(device_type="CPU", device_index=0)):
        with tf.variable_scope("training_parameters"):
            # Per-tower batch-size placeholders.
            self.batch_size = [
                tf.placeholder(dtype=tf.int32, shape=[],
                               name="tower_{}_batch_size".format(gpu_idx))
                for gpu_idx in range(self.num_gpu)
            ]
            # Learning rate
            self.learning_rate = tf.get_variable(
                name="learning_rate",
                initializer=hparams.learning_rate,
                dtype=tf.float32,
                trainable=False)
            with tf.name_scope("learning_rate_decay"):
                self.decay_ratio = tf.placeholder(dtype=tf.float32,
                                                  shape=[],
                                                  name="lr_decay_ratio")
            # Global step
            self.global_step = tf.get_variable(
                name="global_step",
                initializer=np.array(0, np.int64),
                dtype=tf.int64,
                trainable=False)
    with tf.device(tf.DeviceSpec(device_type="CPU", device_index=0)), \
            tf.variable_scope("batch_norm_decay"):
        bn_momentum = tf.train.exponential_decay(
            hparams.bn_init_decay,
            self.global_step * hparams.batch_size,
            hparams.bn_decay_step,
            hparams.bn_decay_rate,
            staircase=True)
        self.bn_decay = tf.minimum(hparams.bn_decay_clip, 1 - bn_momentum)
    # Initializer
    self.random_seed = hparams.random_seed

def list_tile(list_input, multiples, new_scope=True):
    assert type(list_input) == list
    list_output = []
    for gpu_idx, inputs in enumerate(list_input):
        # Either open a fresh tower scope or extend the current name scope.
        scope_name = ("tower_{:d}".format(gpu_idx) if new_scope else
                      tf.get_default_graph().get_name_scope() +
                      "/tower_{:d}/".format(gpu_idx))
        with tf.device(tf.DeviceSpec(device_type="GPU",
                                     device_index=gpu_idx)), \
                tf.name_scope(scope_name):
            list_output.append(tf.tile(inputs, multiples))
    return list_output

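# Minimal sketch of list_tile on a two-tower list (shapes are assumptions):
# each per-GPU tensor is tiled on its own device under its own tower scope.
def _demo_list_tile():
    per_gpu = [tf.zeros([1, 4]), tf.zeros([1, 4])]
    return list_tile(per_gpu, multiples=[3, 1])  # two [3, 4] tensors
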
def _get_histogram_summary(self):
    with tf.device(tf.DeviceSpec(device_type="CPU", device_index=0)), \
            tf.name_scope("activation_histogram"):
        rnn_state_stack = tf.stack(self.list_encoder_output, axis=0)
        feature_stack = tf.stack(self.list_global_feature, axis=0)
        merged_state_stack = tf.stack(self.list_merged_state, axis=0)
        return [tf.summary.merge([
            tf.summary.histogram("rnn_state", rnn_state_stack),
            tf.summary.histogram("global_feature", feature_stack),
            tf.summary.histogram("merged_feature", merged_state_stack)
        ])]