def language_model_op(X, M, params, featurizer_state):
    language_model_state = language_model(
        X=X,
        M=M,
        config=params,
        embed_weights=featurizer_state['embed_weights'],
        hidden=featurizer_state['sequence_features'],
    )
    lm_logits = language_model_state["logits"]

    # Mask out token ids beyond the base vocabulary so they can never be sampled.
    lm_logit_mask = np.zeros([1, lm_logits.get_shape().as_list()[-1]], dtype=np.float32)
    lm_logit_mask[:, encoder.vocab_size:] = -np.inf

    # Optionally also mask the special tokens (start / delimiter / classify).
    if "use_extra_toks" in params and not params.use_extra_toks:
        lm_logit_mask[:, encoder.start] = -np.inf
        lm_logit_mask[:, encoder.delimiter] = -np.inf
        lm_logit_mask[:, encoder.clf_token] = -np.inf

    lm_logits += lm_logit_mask
    lm_predict_op = sample_with_temperature(lm_logits, params.lm_temp)
    return lm_predict_op, language_model_state
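
# A minimal sketch of what `sample_with_temperature` (called above but not shown in
# this section) might look like in TF 1.x: scale the masked logits by the sampling
# temperature and draw one token id per position with tf.multinomial. The name
# `sample_with_temperature_sketch` and the exact behaviour are assumptions; the
# library's own helper may differ (e.g. falling back to argmax at temperature 0).
import tensorflow as tf

def sample_with_temperature_sketch(logits, temperature):
    """Sample token ids from `logits` after dividing by `temperature`."""
    original_shape = tf.shape(logits)
    # Flatten to [batch * seq_len, vocab] so tf.multinomial can sample per row.
    flat_logits = tf.reshape(logits, [-1, original_shape[-1]])
    samples = tf.multinomial(flat_logits / temperature, num_samples=1)
    # Restore the leading dimensions, dropping the vocab axis.
    return tf.reshape(samples, original_shape[:-1])
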
def _construct_graph(self, n_updates_total, target_dim=None, train=True):
    gpu_grads = []
    self.summaries = []

    # store whether or not graph was previously compiled with dropout
    self.train = train
    self._define_placeholders(target_dim=target_dim)

    aggregator = defaultdict(list)
    train_loss_tower = 0
    gpus = self.config.visible_gpus
    n_splits = max(len(gpus), 1)

    # multi-GPU setup, using CPU as param server is most efficient unless system has direct GPU connections
    # single GPU, no need to use a different GPU as a parameter server
    params_device = 'cpu' if len(gpus) != 1 else gpus[0]

    # decide on setting for language model loss coefficient
    # if the language model loss does not contribute to overall loss,
    # remove the language model computation from the graph
    lm_loss_coef = self.config.lm_loss_coef
    if target_dim is None:
        lm_loss_coef = 1.0
    compile_lm = (train and lm_loss_coef > 0) or self.require_lm

    for i, (X, M, Y) in enumerate(soft_split(self.X, self.M, self.Y, n_splits=n_splits)):
        do_reuse = True if i > 0 else tf.AUTO_REUSE

        if gpus:
            device = tf.device(assign_to_gpu(gpus[i], params_device=params_device))
        else:
            device = tf.device('cpu')

        scope = tf.variable_scope(tf.get_variable_scope(), reuse=do_reuse)

        with device, scope:
            featurizer_state = featurizer(
                X,
                config=self.config,
                encoder=self.encoder,
                dropout_placeholder=self.do_dropout,
                train=train,
                reuse=do_reuse
            )

            if compile_lm:
                language_model_state = language_model(
                    X=X,
                    M=M,
                    config=self.config,
                    embed_weights=featurizer_state['embed_weights'],
                    hidden=featurizer_state['sequence_features'],
                    reuse=do_reuse
                )
                train_loss = lm_loss_coef * tf.reduce_mean(language_model_state['losses'])
                aggregator['lm_losses'].append(language_model_state['losses'])
                lm_logits = language_model_state["logits"]
                aggregator["lm_model"].append(sample_with_temperature(lm_logits, self.config.lm_temp))
            else:
                train_loss = 0

            aggregator['features'].append(featurizer_state['features'])

            if target_dim is not None:
                with tf.variable_scope('model/target'):
                    target_model_state = self._target_model(
                        featurizer_state=featurizer_state,
                        targets=Y,
                        n_outputs=target_dim,
                        train=train,
                        reuse=do_reuse,
                        max_length=self.config.max_length
                    )
                train_loss += (1 - lm_loss_coef) * tf.reduce_mean(target_model_state['losses'])
                train_loss_tower += train_loss

                aggregator['logits'].append(target_model_state['logits'])
                aggregator['target_losses'].append(target_model_state['losses'])

            params = find_trainable_variables("model")
            grads = tf.gradients(train_loss, params)
            grads = list(zip(grads, params))
            gpu_grads.append(grads)

    with tf.device(params_device):
        self.features = tf.concat(aggregator['features'], axis=0)

        if compile_lm:
            self.lm_predict_op = tf.concat(aggregator["lm_model"], 0)
            self.lm_losses = tf.concat(aggregator['lm_losses'], axis=0)
            self.lm_loss = tf.reduce_mean(self.lm_losses)
            self.summaries.append(tf.summary.scalar('LanguageModelLoss', self.lm_loss))

        if train:
            self._compile_train_op(
                params=params,
                grads=gpu_grads,
                n_updates_total=n_updates_total
            )

        if target_dim is not None:
            self.logits = tf.concat(aggregator['logits'], axis=0)
            self.target_losses = concat_or_stack(aggregator['target_losses'])

            self.predict_op = self._predict_op(
                self.logits, **target_model_state.get("predict_params", {})
            )
            self.predict_proba_op = self._predict_proba_op(
                self.logits, **target_model_state.get("predict_params", {})
            )
            self.target_loss = tf.reduce_mean(self.target_losses)
            self.summaries.append(tf.summary.scalar('TargetModelLoss', self.target_loss))
            self.summaries.append(tf.summary.scalar('TotalLoss', train_loss_tower / n_splits))

    self.summaries = tf.summary.merge(self.summaries) if self.summaries else self.noop
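
# A simplified sketch of `soft_split`, which the loops above use to shard the batch
# into one chunk per GPU tower. This version slices each tensor along the batch axis
# into `n_splits` contiguous chunks of roughly equal size; the name
# `soft_split_sketch` and the exact handling of uneven batches are assumptions, and
# the library's own implementation may differ.
import tensorflow as tf

def soft_split_sketch(*tensors, n_splits=1):
    """Yield `n_splits` tuples of batch-wise slices of the input tensors."""
    batch_size = tf.shape(tensors[0])[0]
    # Ceiling division so a batch that is not evenly divisible still covers all rows.
    chunk = tf.cast(tf.ceil(batch_size / n_splits), tf.int32)
    for i in range(n_splits):
        start = i * chunk
        # Slicing with scalar tensors clamps to the batch size, so the final
        # chunk may be smaller than the others.
        yield tuple(t[start:start + chunk] for t in tensors)
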
def _construct_graph(self, n_updates_total, target_dim=None, train=True, pre_trained_weights=None):
    gpu_grads = []
    self.summaries = []

    # store whether or not graph was previously compiled with dropout
    self.train = train
    self._define_placeholders(target_dim=target_dim)

    aggregator = defaultdict(list)
    train_loss_tower = 0
    gpus = self.config.visible_gpus
    n_splits = max(len(gpus), 1)

    # multi-GPU setup, using CPU as param server is most efficient unless system has direct GPU connections
    # single GPU, no need to use a different GPU as a parameter server
    params_device = 'cpu' if len(gpus) != 1 else gpus[0]

    for i, (X, M, Y) in enumerate(soft_split(self.X, self.M, self.Y, n_splits=n_splits)):
        do_reuse = True if i > 0 else tf.AUTO_REUSE

        if gpus:
            device = tf.device(assign_to_gpu(gpus[i], params_device=params_device))
        else:
            device = tf.device('cpu')

        scope = tf.variable_scope(tf.get_variable_scope(), reuse=do_reuse)

        with device, scope:
            featurizer_state = featurizer(
                X,
                config=self.config,
                encoder=self.encoder,
                dropout_placeholder=self.do_dropout,
                train=train,
                reuse=do_reuse
            )
            language_model_state = language_model(
                X=X,
                M=M,
                config=self.config,
                embed_weights=featurizer_state['embed_weights'],
                hidden=featurizer_state['sequence_features'],
                reuse=do_reuse
            )

            lm_loss_coef = self.config.lm_loss_coef
            if target_dim is None:
                lm_loss_coef = 1.0

            train_loss = lm_loss_coef * tf.reduce_mean(language_model_state['losses'])

            aggregator['features'].append(featurizer_state['features'])
            aggregator['lm_losses'].append(language_model_state['losses'])

            lm_logits = language_model_state["logits"]
            aggregator["lm_model"].append(sample_with_temperature(lm_logits, self.config.lm_temp))

            if target_dim is not None:
                with tf.variable_scope('model/target'):
                    target_model_state = self._target_model(
                        featurizer_state=featurizer_state,
                        targets=Y,
                        n_outputs=target_dim,
                        train=train,
                        reuse=do_reuse,
                        max_length=self.config.max_length
                    )
                train_loss += (1 - lm_loss_coef) * tf.reduce_mean(target_model_state['losses'])
                train_loss_tower += train_loss

                aggregator['logits'].append(target_model_state['logits'])
                aggregator['target_losses'].append(target_model_state['losses'])

            params = find_trainable_variables("model")
            grads = tf.gradients(train_loss, params)
            grads = list(zip(grads, params))
            gpu_grads.append(grads)

    with tf.device(params_device):
        self.lm_predict_op = tf.concat(aggregator["lm_model"], 0)
        self.features = tf.concat(aggregator['features'], axis=0)
        self.lm_losses = tf.concat(aggregator['lm_losses'], axis=0)

        if train:
            self._compile_train_op(
                params=params,
                grads=gpu_grads,
                n_updates_total=n_updates_total,
                initial_params=pre_trained_weights
            )

        if target_dim is not None:
            self.logits = tf.concat(aggregator['logits'], axis=0)
            self.target_losses = concat_or_stack(aggregator['target_losses'])

            self.predict_op = self._predict_op(
                self.logits, **target_model_state.get("predict_params", {})
            )
            self.predict_proba_op = self._predict_proba_op(
                self.logits, **target_model_state.get("predict_params", {})
            )
            self.target_loss = tf.reduce_mean(self.target_losses)
            self.lm_loss = tf.reduce_mean(self.lm_losses)
            self.summaries.append(tf.summary.scalar('TargetModelLoss', self.target_loss))
            self.summaries.append(tf.summary.scalar('LanguageModelLoss', self.lm_loss))
            self.summaries.append(tf.summary.scalar('TotalLoss', train_loss_tower / n_splits))

    self.summaries = tf.summary.merge(self.summaries) if self.summaries else self.noop
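
# A hedged sketch of the kind of device function `assign_to_gpu` returns. Each tower
# above is wrapped in tf.device(assign_to_gpu(gpus[i], params_device=...)). A common
# pattern (used in OpenAI's original transformer finetuning code) pins variable ops
# to the parameter-server device and everything else to the tower's GPU; whether this
# library's helper matches exactly is an assumption.
import tensorflow as tf

def assign_to_gpu_sketch(gpu_device, params_device='cpu'):
    def _assign(op):
        node_def = op if isinstance(op, tf.NodeDef) else op.node_def
        # Keep variables on the parameter server so every tower shares one copy.
        if node_def.op in ('Variable', 'VariableV2', 'VarHandleOp'):
            return params_device
        return gpu_device
    return _assign
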
def _construct_graph(self, n_updates_total, target_dim=None, train=True):
    gpu_grads = []
    self.summaries = []

    # store whether or not graph was previously compiled with dropout
    self.train = train
    self.target_dim = target_dim
    self._define_placeholders()

    aggregator = defaultdict(list)
    train_loss_tower = 0
    gpus = get_available_gpus(self.config)
    n_splits = max(len(gpus), 1)

    for i, (X, M, Y) in enumerate(
            soft_split(self.X, self.M, self.Y, n_splits=n_splits)):
        do_reuse = True if i > 0 else tf.AUTO_REUSE

        if gpus:
            device = tf.device(
                assign_to_gpu(gpus[i], params_device=gpus[0]))
        else:
            device = tf.device('cpu')

        scope = tf.variable_scope(tf.get_variable_scope(), reuse=do_reuse)

        with device, scope:
            featurizer_state = featurizer(
                X,
                config=self.config,
                encoder=self.encoder,
                dropout_placeholder=self.do_dropout,
                train=train,
                reuse=do_reuse)
            language_model_state = language_model(
                X=X,
                M=M,
                config=self.config,
                embed_weights=featurizer_state['embed_weights'],
                hidden=featurizer_state['sequence_features'],
                reuse=do_reuse)

            lm_loss_coef = self.config.lm_loss_coef
            if target_dim is None:
                lm_loss_coef = 1.0

            train_loss = lm_loss_coef * tf.reduce_mean(
                language_model_state['losses'])

            aggregator['features'].append(featurizer_state['features'])
            aggregator['lm_losses'].append(language_model_state['losses'])

            lm_logits = language_model_state["logits"]
            aggregator["lm_model"].append(
                sample_with_temperature(lm_logits, self.config.lm_temp))

            if target_dim is not None:
                target_model_state = self._target_model(
                    featurizer_state=featurizer_state,
                    targets=Y,
                    n_outputs=target_dim,
                    train=train,
                    reuse=do_reuse,
                    max_length=self.config.max_length)
                train_loss += (1 - lm_loss_coef) * tf.reduce_mean(
                    target_model_state['losses'])
                train_loss_tower += train_loss

                params = find_trainable_variables("model")
                grads = tf.gradients(train_loss, params)
                grads = list(zip(grads, params))
                gpu_grads.append(grads)

                aggregator['logits'].append(target_model_state['logits'])
                aggregator['clf_losses'].append(
                    target_model_state['losses'])

    self.lm_predict_op = tf.concat(aggregator["lm_model"], 0)
    self.features = tf.concat(aggregator['features'], axis=0)
    self.lm_losses = tf.concat(aggregator['lm_losses'], axis=0)

    if target_dim is not None:
        self.logits = tf.concat(aggregator['logits'], axis=0)
        self.clf_losses = concat_or_stack(aggregator['clf_losses'])
        self.predict_op, self.predict_proba_op = self._predict_ops(
            self.logits, **target_model_state.get("predict_params", {}))
        self._compile_train_op(
            params=params,
            grads=gpu_grads,
            n_updates_total=n_updates_total)
        self.clf_loss = tf.reduce_mean(self.clf_losses)
        self.lm_loss = tf.reduce_mean(self.lm_losses)
        self.summaries.append(
            tf.summary.scalar('TargetModelLoss', self.clf_loss))
        self.summaries.append(
            tf.summary.scalar('LanguageModelLoss', self.lm_loss))
        self.summaries.append(
            tf.summary.scalar('TotalLoss', train_loss_tower / n_splits))

    self.summaries = tf.summary.merge(self.summaries)
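
# `_compile_train_op` (not shown in this section) receives `gpu_grads`, a list
# holding one [(grad, param), ...] list per tower. A hedged sketch of the usual
# reduction step: average each parameter's gradient across towers before handing
# the result to the optimizer. Dense gradients only; sparse IndexedSlices and the
# library's actual learning-rate schedule and clipping are not covered here.
import tensorflow as tf

def average_tower_grads_sketch(gpu_grads):
    """Return a single [(grad, param), ...] list averaged over all towers."""
    averaged = []
    for grads_and_param in zip(*gpu_grads):
        # Every tower lists the same parameter at the same position.
        param = grads_and_param[0][1]
        grads = [g for g, _ in grads_and_param if g is not None]
        if not grads:
            continue
        averaged.append((tf.reduce_mean(tf.stack(grads), axis=0), param))
    return averaged
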