def __init__(self, args, dat, rng, aux_model=None, log_file=sys.stdout):
    log_file.write('\n# Creating model.\n')
    log_file.flush()

    self.aux_model = aux_model
    if aux_model is not None:
        # Add padding entity for unassigned leaves of auxiliary model.
        padded_range_e = aux_model.padded_range_e
    else:
        padded_range_e = dat.range_e

    self._summaries = []
    initializer = tf.contrib.layers.xavier_initializer()
    self.emb = tf.Variable(initializer(
        (padded_range_e, dat.embedding_dim)), name='emb')
    if aux_model is None:
        self.bias = tf.Variable(initializer(
            (padded_range_e,)), name='bias')
    else:
        # Initialize the biases to the negative average log-likelihoods of the
        # auxiliary model, padded with one entry for the padding entity.
        self.bias = tf.Variable(
            tf.pad(-aux_model.avg_lls, [[0, 1]]), name='bias')
    self.log_normalizer = tf.Variable(
        tf.zeros((), dtype=tf.float32), name='log_normalizer')
    self._summaries.append(tf.summary.scalar(
        'log_normalizer', self.log_normalizer))

    with tf.variable_scope('minibatch'):
        if aux_model is None:
            self.minibatch_htr = tf.placeholder(
                tf.int32, shape=(None,), name='minibatch')
        else:
            self.minibatch_htr = aux_model.minibatch_htr
        minibatch_size = tf.shape(self.minibatch_htr)[0]
        minibatch_size_float = tf.cast(minibatch_size, tf.float32)

        self.feed_train_features = tf.placeholder(
            tf.float32, shape=dat.features['train'].shape)
        all_train_features = tf.Variable(
            self.feed_train_features, dtype=tf.float32, trainable=False)
        features_minibatch = tf.gather(  # (B, d)
            all_train_features, self.minibatch_htr)
        labels_pos = tf.gather(  # (B,)
            tf.constant(dat.labels['train'], dtype=tf.int32),
            self.minibatch_htr)

    with tf.variable_scope('evaluation'):
        valid_features = tf.gather(  # (B, d)
            dat.features['valid'], self.minibatch_htr)
        valid_labels = tf.gather(  # (B,)
            dat.labels['valid'], self.minibatch_htr)
        emb_eval = self.emb
        bias_eval = self.bias
        if padded_range_e != dat.range_e:
            # Drop the padding entity for evaluation.
            emb_eval = emb_eval[:-1, :]
            bias_eval = bias_eval[:-1]
        valid_scores_main = (  # shape (batch, categories)
            tf.matmul(valid_features, emb_eval, transpose_b=True,
                      name='valid_scores')
            + bias_eval)
        if aux_model is not None:
            valid_scores_aux = aux_model.unnormalized_score(None, args)
            valid_scores = valid_scores_main + valid_scores_aux
        else:
            # Auxiliary scores exist only when an auxiliary model is given.
            valid_scores = valid_scores_main
        self.valid_likelihood = -tf.reduce_sum(
            tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=valid_labels, logits=valid_scores))

    with tf.variable_scope('log_likelihood'):
        emb_pos = tf.gather(  # (B, d)
            self.emb, labels_pos, name='emb_pos')
        bias_pos = tf.gather(  # (B,)
            self.bias, labels_pos, name='bias_pos')

        if aux_model is None:
            self.scores = (  # shape (batch, categories)
                tf.matmul(
                    features_minibatch, self.emb, transpose_b=True,
                    name='scores')
                + tf.expand_dims(self.bias, 0))
            # The documentation for `tf.nn.sparse_softmax_cross_entropy_with_logits` is
            # unclear about signs and normalization. It turns out that the function does
            # the following, assuming that `labels.shape = (m,)` and
            # `logits.shape = (m, n)`:
            #   tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits)
            #   = - [logits[i, labels[i]] for i in range(m)]
            #     + log(sum(exp(logits), axis=1))
            # (A small numpy check of this behaviour follows this constructor.)
            neg_log_likelihood = tf.reduce_sum(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=labels_pos, logits=self.scores))
        else:
            labels_neg, lls_neg = aux_model.create_sampler(
                None, args.neg_samples)
            # `labels_neg` has shape (minibatch_size, neg_samples)
            emb_neg = tf.gather(  # (B, n, d)
                self.emb, labels_neg, name='emb_neg')
            bias_neg = tf.gather(  # (B, n)
                self.bias, labels_neg, name='bias_neg')
            scores_pos = (  # (B, 1, 1)
                tf.matmul(
                    tf.expand_dims(features_minibatch, 1),
                    tf.expand_dims(emb_pos, 1),
                    transpose_b=True, name='scores_pos')
                + tf.expand_dims(
                    tf.expand_dims(bias_pos + self.log_normalizer, 1), 2))
            scores_neg = (  # (B, 1, n)
                tf.matmul(
                    tf.expand_dims(features_minibatch, 1),
                    emb_neg,
                    transpose_b=True, name='scores_neg')
                + tf.expand_dims(bias_neg + self.log_normalizer, 1))
            self._summaries += [
                tf.summary.histogram(
                    'acceptance_pos_t', tf.nn.sigmoid(scores_pos)),
                tf.summary.histogram(
                    'acceptance_neg_t', tf.nn.sigmoid(scores_neg))]
            neg_log_likelihood = (
                (1.0 / args.neg_samples) * tf.reduce_sum(
                    tf.nn.softplus(scores_neg))
                + tf.reduce_sum(tf.nn.softplus(-scores_pos)))

    with tf.variable_scope('regularizer'):
        regularizer = args.initial_reg_strength * tf.reduce_sum(emb_pos**2)
        if aux_model is not None:
            lls_pos = aux_model.training_samples_ll(labels_pos)
            # TODO: maybe we want to use `aux_model.avg_lls` instead of `lls_neg`
            reg_bias_neg = tf.reduce_sum((bias_neg + lls_neg)**2)
            # or, regularize complete score_neg/pos towards -lls_neg/pos
            reg_bias_pos = tf.reduce_sum((bias_pos + lls_pos)**2)
            regularizer += (
                (args.initial_reg_strength / args.neg_samples)
                * tf.reduce_sum(emb_neg**2)
                + (args.bias_reg / args.neg_samples) * reg_bias_neg
                + args.bias_reg * reg_bias_pos)

    self.loss = tf.add_n([neg_log_likelihood, regularizer], name='loss')

    with tf.variable_scope('loss_parts'):
        normalizer_per_embedding = (
            len(dat.labels['train'])
            / (dat.embedding_dim * padded_range_e)
            * minibatch_size_float)
        normalizer_per_datapoint = 1.0 / minibatch_size_float
        self._summaries.append(tf.summary.scalar(
            'regularizer_per_embedding_and_dimension',
            normalizer_per_embedding * regularizer))
        self._summaries.append(tf.summary.scalar(
            'neg_log_likelihood_per_datapoint',
            normalizer_per_datapoint * neg_log_likelihood))
        self._summaries.append(tf.summary.scalar(
            'loss_per_datapoint', normalizer_per_datapoint * self.loss))

    global_step, lr, lr_summary = optimizer.define_learning_rate(args)
    self._summaries.append(lr_summary)
    opt = optimizer.define_optimizer(args, lr)
    with tf.variable_scope('opt'):
        self._e_step = opt.minimize(
            self.loss,
            var_list=[self.emb, self.bias, self.log_normalizer],
            global_step=global_step)

    self._summary_op = tf.summary.merge(self._summaries)
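# A minimal numpy sketch (an illustration, not part of the model code) of the behaviour
# described in the comment inside the constructor above: for `labels.shape = (m,)` and
# `logits.shape = (m, n)`, `tf.nn.sparse_softmax_cross_entropy_with_logits` returns, per
# row i, `-logits[i, labels[i]] + log(sum(exp(logits[i, :])))`.
import numpy as np


def sparse_softmax_xent(labels, logits):
    """Reference implementation of the per-row cross entropy."""
    rows = np.arange(len(labels))
    row_max = logits.max(axis=1, keepdims=True)  # subtract max for numerical stability
    log_sum_exp = row_max[:, 0] + np.log(np.exp(logits - row_max).sum(axis=1))
    return -logits[rows, labels] + log_sum_exp


logits = np.array([[2.0, 0.5, -1.0],
                   [0.0, 1.0, 3.0]])
labels = np.array([0, 2])
print(sparse_softmax_xent(labels, logits))  # approx. [0.241, 0.170]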
def __init__(self, args, dat, rng, aux_model=None, log_file=sys.stdout):
    log_file.write('\n# Creating model.\n')
    log_file.flush()

    self.aux_model = aux_model
    if aux_model is not None:
        # Add padding entity for unassigned leaves of auxiliary model.
        padded_range_e = aux_model.padded_range_e
    else:
        padded_range_e = dat.range_e

    self._summaries = []
    initializer = tf.contrib.layers.xavier_initializer()
    self.emb = tf.Variable(initializer(
        (padded_range_e, dat.embedding_dim)), name='emb')
    if aux_model is None or not args.initialize_to_inverse_aux:
        self.bias = tf.Variable(initializer(
            (padded_range_e,)), name='bias')
    else:
        self.bias = tf.Variable(
            -aux_model.avg_lls, name='bias')

    if aux_model is not None:
        if args.use_log_norm_weight:
            self.log_normalizer_weight = tf.Variable(initializer(
                (dat.embedding_dim,)), name='log_normalizer_weight')
        self.log_normalizer_bias_var = tf.Variable(
            tf.zeros((), dtype=tf.float32), name='log_normalizer_bias')
        # Multiplying by zero pins the log-normalizer bias to zero while keeping the
        # variable in the graph.
        self.log_normalizer_bias = 0 * self.log_normalizer_bias_var
        self._summaries.append(tf.summary.scalar(
            'log_normalizer_bias', self.log_normalizer_bias))

    with tf.variable_scope('minibatch'):
        if aux_model is None:
            self.minibatch_htr = tf.placeholder(
                tf.int32, shape=(None,), name='minibatch')
        else:
            self.minibatch_htr = aux_model.minibatch_htr
        minibatch_size = tf.shape(self.minibatch_htr)[0]
        minibatch_size_float = tf.cast(minibatch_size, tf.float32)

        self.feed_train_features = tf.placeholder(
            tf.float32, shape=dat.features['train'].shape)
        all_train_features = tf.Variable(
            self.feed_train_features, dtype=tf.float32, trainable=False)
        features_minibatch = tf.gather(  # (B, d)
            all_train_features, self.minibatch_htr)
        labels_pos = tf.gather(  # (B,)
            tf.constant(dat.labels['train'], dtype=tf.int32),
            self.minibatch_htr)

    with tf.variable_scope('evaluation'):
        evaluation_features = {
            subset: tf.gather(  # (B, d)
                dat.features[subset], self.minibatch_htr)
            for subset in ['valid', 'test']}
        self.evaluation_labels = {
            subset: tf.gather(  # (B,)
                dat.labels[subset], self.minibatch_htr)
            for subset in ['valid', 'test']}

        emb_eval = self.emb
        bias_eval = self.bias
        if padded_range_e != dat.range_e:
            assert padded_range_e == dat.range_e + 1
            emb_eval = emb_eval[:-1, :]
            bias_eval = bias_eval[:-1]

        evaluation_scores_main = {
            subset: bias_eval + tf.matmul(
                evaluation_features[subset], emb_eval, transpose_b=True,
                name='%s_scores' % subset)  # (B, dat.range_e)
            for subset in ['valid', 'test']}
        if aux_model is not None:
            evaluation_scores_aux = {
                subset: aux_model.unnormalized_score(None, args, subset)
                for subset in ['valid', 'test']}
            evaluation_scores = {
                subset: (evaluation_scores_main[subset]
                         + evaluation_scores_aux[subset])
                for subset in ['valid', 'test']}
        else:
            # Auxiliary scores exist only when an auxiliary model is given.
            evaluation_scores = evaluation_scores_main

        # Index of each row's target label inside the flattened score matrix.
        target_indices = {
            subset: (self.evaluation_labels[subset]
                     + dat.range_e * tf.range(tf.shape(self.minibatch_htr)[0]))
            for subset in ['valid', 'test']}
        target_scores = {
            subset: tf.expand_dims(
                tf.gather(tf.reshape(evaluation_scores[subset], (-1,)),
                          target_indices[subset]),
                1)
            for subset in ['valid', 'test']}
        target_scores_main = {
            subset: tf.expand_dims(
                tf.gather(tf.reshape(evaluation_scores_main[subset], (-1,)),
                          target_indices[subset]),
                1)
            for subset in ['valid', 'test']}

        # Make sure to get the corner cases right:
        # * Count scores that are worse than the target score and subtract them from
        #   `dat.range_e` to ensure that NaN values are always punished.
        # * Use `dat.range_e` (the size of the category dimension of
        #   `evaluation_scores`), not `padded_range_e`.
        # * Use strict comparison `<` and not `<=` to punish models that set all scores
        #   to the same value (e.g., zero).
        # (A small numpy sketch of these corner cases follows this constructor.)
        self.evaluation_ranks = {
            subset: dat.range_e - tf.reduce_sum(
                tf.cast(evaluation_scores[subset] < target_scores[subset],
                        tf.int32),
                axis=1)
            for subset in ['valid', 'test']}
        self.evaluation_ranks_main = {
            subset: dat.range_e - tf.reduce_sum(
                tf.cast(evaluation_scores_main[subset] < target_scores_main[subset],
                        tf.int32),
                axis=1)
            for subset in ['valid', 'test']}
        self.evaluation_log_likelihood = {
            subset: -tf.reduce_sum(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=self.evaluation_labels[subset],
                    logits=evaluation_scores[subset]))
            for subset in ['valid', 'test']}
        self.evaluation_log_likelihood_main = {
            subset: -tf.reduce_sum(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=self.evaluation_labels[subset],
                    logits=evaluation_scores_main[subset]))
            for subset in ['valid', 'test']}

    with tf.variable_scope('log_likelihood'):
        emb_pos = tf.gather(  # (B, d)
            self.emb, labels_pos, name='emb_pos')
        bias_pos = tf.gather(  # (B,)
            self.bias, labels_pos, name='bias_pos')
        scores_pos = (  # (B, 1, 1)
            tf.matmul(
                tf.expand_dims(features_minibatch, 1),
                tf.expand_dims(emb_pos, 1),
                transpose_b=True, name='scores_pos')
            + tf.expand_dims(tf.expand_dims(bias_pos, 1), 2))

        if aux_model is None:
            self.scores = (  # shape (batch, categories)
                tf.matmul(
                    features_minibatch, self.emb, transpose_b=True,
                    name='scores')
                + tf.expand_dims(self.bias, 0))
            # The documentation for `tf.nn.sparse_softmax_cross_entropy_with_logits` is
            # unclear about signs and normalization. It turns out that the function does
            # the following, assuming that `labels.shape = (m,)` and
            # `logits.shape = (m, n)`:
            #   tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits)
            #   = - [logits[i, labels[i]] for i in range(m)]
            #     + log(sum(exp(logits), axis=1))
            neg_log_likelihood = tf.reduce_sum(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=labels_pos, logits=self.scores))
        else:
            labels_neg, lls_neg = aux_model.create_sampler(
                None, args.neg_samples)
            # `labels_neg` has shape (minibatch_size, neg_samples)
            emb_neg = tf.gather(  # (B, n, d)
                self.emb, labels_neg, name='emb_neg')
            bias_neg = tf.gather(  # (B, n)
                self.bias, labels_neg, name='bias_neg')
            lls_pos = tf.reshape(
                aux_model.training_samples_ll(labels_pos), (-1, 1, 1))

            scores_neg = (  # (B, 1, n)
                tf.matmul(
                    tf.expand_dims(features_minibatch, 1),
                    emb_neg,
                    transpose_b=True, name='scores_neg')
                + tf.expand_dims(bias_neg, 1))

            # self._summaries += [
            #     tf.summary.histogram('eta_pos', scores_pos),
            #     tf.summary.histogram('exp_eta_neg', tf.exp(scores_neg))]
            # neg_log_likelihood = (
            #     (1.0 / args.neg_samples) * tf.reduce_sum(tf.exp(scores_neg))
            #     - tf.reduce_sum(scores_pos))
            self._summaries += [
                tf.summary.histogram(
                    'acceptance_pos_t', tf.nn.sigmoid(scores_pos)),
                tf.summary.histogram(
                    'acceptance_neg_t', tf.nn.sigmoid(scores_neg))]

            if args.model == 'supervised':
                neg_log_likelihood = (
                    (1.0 / args.neg_samples) * tf.reduce_sum(
                        tf.nn.softplus(scores_neg))
                    + tf.reduce_sum(tf.nn.softplus(-scores_pos)))
            elif args.model == 'supervised_nce':
                # Align the (B, n) noise log-likelihoods with the (B, 1, n) scores.
                neg_log_likelihood = (
                    (1.0 / args.neg_samples) * tf.reduce_sum(
                        tf.nn.softplus(scores_neg - tf.expand_dims(lls_neg, 1)))
                    + tf.reduce_sum(tf.nn.softplus(lls_pos - scores_pos)))

    with tf.variable_scope('regularizer'):
        if args.reg_separate:
            to_regularize_pos = tf.expand_dims(
                tf.expand_dims(bias_pos, 1), 2)
            to_regularize_neg = tf.expand_dims(bias_neg, 1)
            regularizer = (
                args.initial_reg_strength * tf.reduce_sum(emb_pos**2)
                + (args.initial_reg_strength / args.neg_samples)
                * tf.reduce_sum(emb_neg**2))
        else:
            to_regularize_pos = scores_pos
            to_regularize_neg = scores_neg
            regularizer = 0

        if aux_model is None:
            regularizer += (
                args.initial_reg_strength
                * tf.reduce_sum(to_regularize_pos**2))
        else:
            if args.model == 'supervised':
                log_normalizer = self.log_normalizer_bias
                if args.use_log_norm_weight:
                    log_normalizer += tf.reshape(
                        tf.matmul(
                            features_minibatch,
                            tf.expand_dims(self.log_normalizer_weight, 0),
                            transpose_b=True, name='log_normalizer'),
                        (-1, 1, 1))
                regularizer += (
                    args.initial_reg_strength * tf.reduce_sum(
                        (to_regularize_pos + lls_pos - log_normalizer)**2)
                    + (args.initial_reg_strength / args.neg_samples)
                    * tf.reduce_sum(
                        (to_regularize_neg + tf.expand_dims(lls_neg, 1)
                         - log_normalizer)**2))
            elif args.model == 'supervised_nce':
                regularizer += (
                    args.initial_reg_strength * tf.reduce_sum(
                        to_regularize_pos**2)
                    + (args.initial_reg_strength / args.neg_samples)
                    * tf.reduce_sum(to_regularize_neg**2))

    self.loss = tf.add_n([neg_log_likelihood, regularizer], name='loss')

    with tf.variable_scope('loss_parts'):
        normalizer_per_embedding = (
            len(dat.labels['train'])
            / (dat.embedding_dim * padded_range_e)
            * minibatch_size_float)
        normalizer_per_datapoint = 1.0 / minibatch_size_float
        self._summaries.append(tf.summary.scalar(
            'regularizer_per_embedding_and_dimension',
            normalizer_per_embedding * regularizer))
        self._summaries.append(tf.summary.scalar(
            'neg_log_likelihood_per_datapoint',
            normalizer_per_datapoint * neg_log_likelihood))
        self._summaries.append(tf.summary.scalar(
            'loss_per_datapoint', normalizer_per_datapoint * self.loss))

    global_step, lr, lr_summary = optimizer.define_learning_rate(args)
    self._summaries.append(lr_summary)
    opt = optimizer.define_optimizer(args, lr)
    with tf.variable_scope('opt'):
        var_list = [self.emb, self.bias]
        if aux_model is not None:
            var_list.append(self.log_normalizer_bias_var)
            if args.use_log_norm_weight:
                var_list.append(self.log_normalizer_weight)
        self._e_step = opt.minimize(
            self.loss, var_list=var_list, global_step=global_step)

    self._summary_op = tf.summary.merge(self._summaries)
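# A minimal numpy sketch (an illustration, not part of the model code) of the rank
# computation and its corner cases from the `evaluation` scope above: the rank is
# `range_e` minus the number of scores that are strictly worse than the target score,
# so NaN scores and ties are never counted as "worse" and therefore push the rank
# towards the worst value `range_e`.
import numpy as np


def evaluation_rank(scores, target_idx):
    range_e = scores.shape[-1]
    target = scores[target_idx]
    return range_e - np.sum(scores < target)  # strict comparison; False for NaN


scores = np.array([0.1, 0.7, np.nan, 0.7, -2.0])
print(evaluation_rank(scores, 1))       # NaN and the tie are not "worse" -> rank 3
print(evaluation_rank(np.zeros(5), 1))  # all-equal scores -> worst rank 5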
def __init__(self, args, dat, rng, aux_model=None, log_file=sys.stdout):
    log_file.write('\n# Creating model.\n')
    log_file.flush()

    self.aux_model = aux_model
    if aux_model is not None:
        # Add padding entity for unassigned leaves of auxiliary model.
        padded_range_e = aux_model.padded_range_e
    else:
        padded_range_e = dat.range_e

    self._summaries = []

    if aux_model is not None:
        # Assumption: mirrors the supervised model above, which defines a scalar
        # log-normalizer bias variable and pins its contribution to zero; the original
        # excerpt uses `self.log_normalizer_bias` below without defining it.
        self.log_normalizer_bias_var = tf.Variable(
            tf.zeros((), dtype=tf.float32), name='log_normalizer_bias')
        self.log_normalizer_bias = 0 * self.log_normalizer_bias_var

    with tf.device('/cpu:0'):
        with tf.variable_scope('means'):
            self.means_e, self.means_r = self.define_emb(
                args, padded_range_e, dat.range_r)
        with tf.variable_scope('samples_e'):
            self.samples_e, self.expanded_means_e, self.log_std_e = self.create_all_samplers(
                self.means_e, args)
        with tf.variable_scope('samples_r'):
            self.samples_r, self.expanded_means_r, self.log_std_r = self.create_all_samplers(
                self.means_r, args)

    with tf.variable_scope('minibatch'):
        if aux_model is None:
            self.minibatch_htr = tf.placeholder(
                tf.int32, shape=(None, 3), name='minibatch_htr')
            self.idx_h = self.minibatch_htr[:, 0]
            self.idx_t = self.minibatch_htr[:, 1]
            idx_r_predict_t = self.minibatch_htr[:, 2]
        else:
            self.minibatch_htr = aux_model.minibatch_htr
            self.idx_h = aux_model.idx_h
            self.idx_t = aux_model.idx_t
            idx_r_predict_t = aux_model.idx_r
        idx_r_predict_h = idx_r_predict_t + dat.range_r

        minibatch_size = tf.shape(self.minibatch_htr)[0]
        minibatch_size_float = tf.cast(minibatch_size, tf.float32)

        emb_h = {
            label: tf.gather(samples, self.idx_h, name='gather_mb_h')
            for label, samples in self.samples_e.items()}
        emb_t = {
            label: tf.gather(samples, self.idx_t, name='gather_mb_t')
            for label, samples in self.samples_e.items()}
        emb_r_predict_t = {
            label: tf.gather(samples, idx_r_predict_t,
                             name='gather_mb_r_predict_t')
            for label, samples in self.samples_r.items()}
        emb_r_predict_h = {
            label: tf.gather(samples, idx_r_predict_h,
                             name='gather_mb_r_predict_h')
            for label, samples in self.samples_r.items()}

        self.minibatch_mean_h = {
            label: tf.gather(means, self.idx_h)
            for label, means in self.expanded_means_e.items()}
        self.minibatch_mean_t = {
            label: tf.gather(means, self.idx_t)
            for label, means in self.expanded_means_e.items()}
        self.minibatch_mean_r_predict_t = {
            label: tf.gather(means, idx_r_predict_t)
            for label, means in self.expanded_means_r.items()}
        self.minibatch_mean_r_predict_h = {
            label: tf.gather(means, idx_r_predict_h)
            for label, means in self.expanded_means_r.items()}

    # Prefactor for normalization per training data point.
    # normalizer = 1.0 / tf.cast(args.num_samples, tf.float32)

    with tf.variable_scope('log_likelihood'):
        # TODO: factor out duplication of code for head / tail prediction
        if aux_model is None:
            with tf.variable_scope('tail_prediction'):
                self.scores_predict_t = self.unnormalized_score(
                    emb_h, emb_r_predict_t, self.samples_e, args)
                ll_predict_t = self._log_likelihood(
                    self.scores_predict_t, self.idx_t, args)
            with tf.variable_scope('head_prediction'):
                self.scores_predict_h = self.unnormalized_score(
                    emb_t, emb_r_predict_h, self.samples_e, args)
                ll_predict_h = self._log_likelihood(
                    self.scores_predict_h, self.idx_h, args)
        else:
            with tf.variable_scope('tail_prediction'):
                idx_neg, lls_neg_t = aux_model.create_sampler(
                    't', args.neg_samples)
                # `idx_neg` has shape (minibatch_size, neg_samples)
                emb_neg = {
                    label: tf.squeeze(tf.gather(samples, idx_neg), axis=2)
                    for label, samples in self.samples_e.items()}
                scores_pos_t, scores_neg_t = self.batch_unnormalized_scores(
                    emb_h, emb_r_predict_t, (emb_t, emb_neg), args)
                # `scores_pos_t` has shape (minibatch_size, 1)
                # `scores_neg_t` has shape (minibatch_size, neg_samples)
                self._summaries += [
                    tf.summary.histogram('acceptance_pos_t',
                                         tf.nn.sigmoid(scores_pos_t)),
                    tf.summary.histogram('acceptance_neg_t',
                                         tf.nn.sigmoid(scores_neg_t))]
                ll_predict_t = (
                    (-1.0 / args.neg_samples)
                    * tf.reduce_sum(tf.nn.softplus(scores_neg_t))
                    - tf.reduce_sum(tf.nn.softplus(-scores_pos_t)))
            with tf.variable_scope('head_prediction'):
                idx_neg, lls_neg_h = aux_model.create_sampler(
                    'h', args.neg_samples)
                # `idx_neg` has shape (minibatch_size, neg_samples)
                emb_neg = {
                    label: tf.squeeze(tf.gather(samples, idx_neg), axis=2)
                    for label, samples in self.samples_e.items()}
                scores_pos_h, scores_neg_h = self.batch_unnormalized_scores(
                    emb_t, emb_r_predict_h, (emb_h, emb_neg), args)
                # `scores_pos_h` has shape (minibatch_size, 1)
                # `scores_neg_h` has shape (minibatch_size, neg_samples)
                self._summaries += [
                    tf.summary.histogram('acceptance_pos_h',
                                         tf.nn.sigmoid(scores_pos_h)),
                    tf.summary.histogram('acceptance_neg_h',
                                         tf.nn.sigmoid(scores_neg_h))]
                ll_predict_h = (
                    (-1.0 / args.neg_samples)
                    * tf.reduce_sum(tf.nn.softplus(scores_neg_h))
                    - tf.reduce_sum(tf.nn.softplus(-scores_pos_h)))

        # log_likelihood = normalizer * (ll_predict_t + ll_predict_h)
        log_likelihood = ll_predict_t + ll_predict_h
        # (A small numpy sketch of this negative-sampling objective follows this
        # constructor.)

    # with tf.variable_scope('hyperparameters'):
    #     frequencies_e, counts_e, sort_indices_e = self._get_frequencies(
    #         dat.dat['train'][:, :2].flatten(), padded_range_e, 'e')
    #     frequencies_r, counts_r, sort_indices_r = self._get_frequencies(
    #         dat.dat['train'][:, 2], dat.range_r, 'r')
    #     self.log_lambda_e = self._define_log_lambda(args, counts_e, 'e')
    #     self.log_lambda_r = self._define_log_lambda(args, counts_r, 'r')
    #     inverse_counts_e = (1.0 / counts_e).astype(np.float32)
    #     inverse_counts_r = (1.0 / counts_r).astype(np.float32)
    #     self._lambda_sigma_summary(
    #         self.log_lambda_e, self.log_std_e, inverse_counts_e, sort_indices_e, 'e')
    #     self._lambda_sigma_summary(
    #         self.log_lambda_r, self.log_std_r, inverse_counts_r, sort_indices_r, 'r')

    # with tf.variable_scope('log_prior'):
    #     # r-counts are the same for head and tail prediction, so gather them only once.
    #     minibatch_inverse_counts_r = tf.gather(
    #         inverse_counts_r, idx_r_predict_t)
    #     log_prior = normalizer * (
    #         tf.reduce_sum(
    #             tf.gather(inverse_counts_e, self.idx_h) * self.single_log_prior(
    #                 tf.gather(self.log_lambda_e, self.idx_h), emb_h))
    #         + tf.reduce_sum(
    #             tf.gather(inverse_counts_e, self.idx_t) * self.single_log_prior(
    #                 tf.gather(self.log_lambda_e, self.idx_t), emb_t))
    #         + tf.reduce_sum(
    #             minibatch_inverse_counts_r * self.single_log_prior(
    #                 tf.gather(self.log_lambda_r, idx_r_predict_t), emb_r_predict_t))
    #         + tf.reduce_sum(
    #             minibatch_inverse_counts_r * self.single_log_prior(
    #                 tf.gather(self.log_lambda_r, idx_r_predict_h), emb_r_predict_h)))

    with tf.variable_scope('regularizer'):
        if args.reg_separate:
            raise NotImplementedError(
                'reg_separate is not implemented for this model')
            # to_regularize_pos = tf.expand_dims(
            #     tf.expand_dims(bias_pos, 1), 2)
            # to_regularize_neg = tf.expand_dims(bias_neg, 1)
            # regularizer = (
            #     args.initial_reg_strength * tf.reduce_sum(emb_pos**2)
            #     + (args.initial_reg_strength / args.neg_samples) * tf.reduce_sum(emb_neg**2))
        else:
            to_regularize_pos_t = scores_pos_t
            to_regularize_neg_t = scores_neg_t
            to_regularize_pos_h = scores_pos_h
            to_regularize_neg_h = scores_neg_h
            regularizer = 0

        if aux_model is None:
            regularizer += args.initial_reg_strength * (
                tf.reduce_sum(to_regularize_pos_t**2)
                + tf.reduce_sum(to_regularize_pos_h**2))
        else:
            lls_pos_t = tf.reshape(
                aux_model.training_samples_ll('t'), (-1, 1, 1))
            lls_pos_h = tf.reshape(
                aux_model.training_samples_ll('h'), (-1, 1, 1))
            log_normalizer_t = self.log_normalizer_bias
            log_normalizer_h = self.log_normalizer_bias
            if args.use_log_norm_weight:
                raise NotImplementedError(
                    'use_log_norm_weight is not implemented for this model')
                # log_normalizer += tf.reshape(
                #     tf.matmul(
                #         features_minibatch,
                #         tf.expand_dims(self.log_normalizer_weight, 0),
                #         transpose_b=True, name='log_normalizer'),
                #     (-1, 1, 1))
            regularizer += (
                args.initial_reg_strength * (
                    tf.reduce_sum(
                        (to_regularize_pos_t + lls_pos_t - log_normalizer_t)**2)
                    + tf.reduce_sum(
                        (to_regularize_pos_h + lls_pos_h - log_normalizer_h)**2))
                + (args.initial_reg_strength / args.neg_samples) * (
                    tf.reduce_sum(
                        (to_regularize_neg_t + tf.expand_dims(lls_neg_t, 1)
                         - log_normalizer_t)**2)
                    + tf.reduce_sum(
                        (to_regularize_neg_h + tf.expand_dims(lls_neg_h, 1)
                         - log_normalizer_h)**2)))

    if args.em:
        raise NotImplementedError('EM is not implemented for this model')
        # # Calculate entropy of entire variational distribution (independent of minibatch).
        # # Normalize per training data point.
        # with tf.variable_scope('entropy'):
        #     entropy = (minibatch_size_float / len(dat.dat['train'])) * tf.add_n(
        #         [tf.reduce_sum(i) for i in
        #          list(self.log_std_e.values()) + list(self.log_std_r.values())],
        #         name='entropy')
        # self.loss = -tf.add_n([log_prior, log_likelihood, entropy],
        #                       name='elbo')
    else:
        self.loss = tf.add_n([regularizer, -log_likelihood],
                             name='log_joint')

    # with tf.variable_scope('loss_parts'):
    #     normalizer_per_embedding = (
    #         len(dat.dat['train']) /
    #         (args.embedding_dim * (padded_range_e + 2 * dat.range_r) * minibatch_size_float))
    #     normalizer_per_datapoint = 0.5 / minibatch_size_float
    #     if args.em:
    #         self._summaries.append(tf.summary.scalar('entropy_per_embedding_and_dimension',
    #                                                  normalizer_per_embedding * entropy))
    #         self._summaries.append(tf.summary.scalar('log_prior_per_embedding_and_dimension',
    #                                                  normalizer_per_embedding * log_prior))
    #     self._summaries.append(tf.summary.scalar('log_likelihood_per_datapoint',
    #                                              normalizer_per_datapoint * log_likelihood))
    #     self._summaries.append(tf.summary.scalar('loss_per_datapoint',
    #                                              normalizer_per_datapoint * self.loss))

    global_step, lr, lr_summary = optimizer.define_learning_rate(args)
    self._summaries.append(lr_summary)
    opt = optimizer.define_optimizer(args, lr)
    with tf.variable_scope('e_step'):
        var_list = (tf.trainable_variables('means/')
                    + tf.trainable_variables('samples_e/')
                    + tf.trainable_variables('samples_r/'))
        if aux_model is not None:
            var_list.append(self.log_normalizer_bias_var)
            if args.use_log_norm_weight:
                var_list.append(self.log_normalizer_weight)
        log_file.write('# %d variational parameters\n' % len(var_list))
        gvs_e = opt.compute_gradients(self.loss, var_list=var_list)
        self._e_step = opt.apply_gradients(gvs_e, global_step)

    if args.em:
        with tf.variable_scope('m_step'):
            hyperparameters = tf.trainable_variables('hyperparameters/')
            log_file.write('# %d hyperparameters\n' % len(hyperparameters))
            gvs_m = opt.compute_gradients(self.loss, var_list=hyperparameters)
            m_step = opt.apply_gradients(gvs_m)
        self._em_step = tf.group(self._e_step, m_step, name='em_step')
    else:
        self._em_step = None

    self._summary_op = tf.summary.merge(self._summaries)
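# A minimal numpy sketch (an illustration, not part of the model code) of the
# negative-sampling objective used for `ll_predict_t` / `ll_predict_h` above:
#     -softplus(-s_pos) - (1/K) * sum(softplus(s_neg))
# is identical to
#     log(sigmoid(s_pos)) + (1/K) * sum(log(sigmoid(-s_neg))),
# i.e. a binary logistic log-likelihood that asks the positive triple to be accepted and
# each of the K sampled negatives to be rejected; dividing by K averages over the noise
# samples so the term does not grow with `args.neg_samples`.
import numpy as np


def softplus(x):
    return np.logaddexp(0.0, x)


def log_sigmoid(x):
    return -softplus(-x)


s_pos = np.array([1.3])
s_neg = np.array([-0.2, 0.8, -1.5])
K = len(s_neg)

ll_as_in_code = -softplus(-s_pos).sum() - softplus(s_neg).sum() / K
ll_as_logistic = log_sigmoid(s_pos).sum() + log_sigmoid(-s_neg).sum() / K
print(np.isclose(ll_as_in_code, ll_as_logistic))  # True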
def __init__(self, args, dat, rng, log_file=sys.stdout):
    log_file.write('\n# Creating model.\n')
    log_file.flush()

    self._summaries = []

    with tf.variable_scope('means'):
        self.means_e, self.means_r = self.define_emb(args, dat)
    with tf.variable_scope('samples_e'):
        self.samples_e, self.expanded_means_e, self.log_std_e = self.create_all_samplers(
            self.means_e, args)
    with tf.variable_scope('samples_r'):
        self.samples_r, self.expanded_means_r, self.log_std_r = self.create_all_samplers(
            self.means_r, args)

    with tf.variable_scope('minibatch'):
        self.minibatch_htr = tf.placeholder(
            tf.int32, shape=(None, 3), name='minibatch_htr')
        minibatch_size = tf.shape(self.minibatch_htr)[0]
        minibatch_size_float = tf.cast(minibatch_size, tf.float32)

        self.idx_h = self.minibatch_htr[:, 0]
        self.idx_t = self.minibatch_htr[:, 1]
        idx_r_predict_t = self.minibatch_htr[:, 2]
        idx_r_predict_h = idx_r_predict_t + dat.range_r

        emb_h = {label: tf.gather(samples, self.idx_h)
                 for label, samples in self.samples_e.items()}
        emb_t = {label: tf.gather(samples, self.idx_t)
                 for label, samples in self.samples_e.items()}
        emb_r_predict_t = {label: tf.gather(samples, idx_r_predict_t)
                           for label, samples in self.samples_r.items()}
        emb_r_predict_h = {label: tf.gather(samples, idx_r_predict_h)
                           for label, samples in self.samples_r.items()}

        self.minibatch_mean_h = {
            label: tf.gather(means, self.idx_h)
            for label, means in self.expanded_means_e.items()}
        self.minibatch_mean_t = {
            label: tf.gather(means, self.idx_t)
            for label, means in self.expanded_means_e.items()}
        self.minibatch_mean_r_predict_t = {
            label: tf.gather(means, idx_r_predict_t)
            for label, means in self.expanded_means_r.items()}
        self.minibatch_mean_r_predict_h = {
            label: tf.gather(means, idx_r_predict_h)
            for label, means in self.expanded_means_r.items()}

    # Prefactor for normalization per training data point.
    normalizer = 1.0 / tf.cast(args.num_samples, tf.float32)

    with tf.variable_scope('log_likelihood'):
        with tf.variable_scope('tail_prediction'):
            self.scores_predict_t = self.unnormalized_score(
                emb_h, emb_r_predict_t, self.samples_e, args)
            ll_predict_t = normalizer * self._log_likelihood(
                self.scores_predict_t, self.idx_t, args)
        with tf.variable_scope('head_prediction'):
            self.scores_predict_h = self.unnormalized_score(
                emb_t, emb_r_predict_h, self.samples_e, args)
            ll_predict_h = normalizer * self._log_likelihood(
                self.scores_predict_h, self.idx_h, args)
        log_likelihood = ll_predict_t + ll_predict_h

    with tf.variable_scope('hyperparameters'):
        counts_e, sort_indices_e = self._get_counts(
            dat.dat['train'][:, :2].flatten(), dat.range_e, 'e')
        counts_r, sort_indices_r = self._get_counts(
            dat.dat['train'][:, 2], dat.range_r, 'r')
        self.inverse_lambda_e = self._define_inverse_lambda(
            args, counts_e, 'e')
        self.inverse_lambda_r = self._define_inverse_lambda(
            args, counts_r, 'r')
        inverse_counts_e = (1.0 / counts_e).astype(np.float32)
        inverse_counts_r = (1.0 / counts_r).astype(np.float32)
        self._lambda_sigma_summary(
            self.inverse_lambda_e, self.log_std_e, inverse_counts_e,
            sort_indices_e, 'e')
        self._lambda_sigma_summary(
            self.inverse_lambda_r, self.log_std_r, inverse_counts_r,
            sort_indices_r, 'r')

    with tf.variable_scope('log_prior'):
        # r-counts are the same for head and tail prediction, so gather them only once.
        minibatch_inverse_counts_r = tf.gather(
            inverse_counts_r, idx_r_predict_t)
        log_prior = normalizer * (
            tf.reduce_sum(
                tf.gather(inverse_counts_e, self.idx_h) * self.single_log_prior(
                    tf.gather(self.inverse_lambda_e, self.idx_h), emb_h))
            + tf.reduce_sum(
                tf.gather(inverse_counts_e, self.idx_t) * self.single_log_prior(
                    tf.gather(self.inverse_lambda_e, self.idx_t), emb_t))
            + tf.reduce_sum(
                minibatch_inverse_counts_r * self.single_log_prior(
                    tf.gather(self.inverse_lambda_r, idx_r_predict_t),
                    emb_r_predict_t))
            + tf.reduce_sum(
                minibatch_inverse_counts_r * self.single_log_prior(
                    tf.gather(self.inverse_lambda_r, idx_r_predict_h),
                    emb_r_predict_h)))

    if args.em:
        # Calculate entropy of entire variational distribution (independent of minibatch).
        # Normalize per training data point.
        with tf.variable_scope('entropy'):
            entropy = (minibatch_size_float / len(dat.dat['train'])) * tf.add_n(
                [tf.reduce_sum(i) for i in
                 list(self.log_std_e.values()) + list(self.log_std_r.values())],
                name='entropy')
        self.loss = -tf.add_n([log_prior, log_likelihood, entropy],
                              name='elbo')
    else:
        self.loss = -tf.add_n([log_prior, log_likelihood],
                              name='log_joint')

    with tf.variable_scope('loss_parts'):
        normalizer_per_embedding = (
            len(dat.dat['train']) /
            (args.embedding_dim * (dat.range_e + 2 * dat.range_r)
             * minibatch_size_float))
        normalizer_per_datapoint = 0.5 / minibatch_size_float
        if args.em:
            self._summaries.append(tf.summary.scalar(
                'entropy_per_embedding_and_dimension',
                normalizer_per_embedding * entropy))
        self._summaries.append(tf.summary.scalar(
            'log_prior_per_embedding_and_dimension',
            normalizer_per_embedding * log_prior))
        self._summaries.append(tf.summary.scalar(
            'log_likelihood_per_datapoint',
            normalizer_per_datapoint * log_likelihood))
        self._summaries.append(tf.summary.scalar(
            'loss_per_datapoint', normalizer_per_datapoint * self.loss))

    global_step, lr_base, lr_summary = optimizer.define_base_learning_rate(
        args)
    self._summaries.append(lr_summary)

    with tf.variable_scope('e_step'):
        opt_mean = optimizer.define_optimizer(args, args.lr0_mu * lr_base)
        variational_parameters_mean = tf.trainable_variables('means/')
        update_means = opt_mean.minimize(
            self.loss, global_step=global_step,
            var_list=variational_parameters_mean)
        log_file.write('# %d variational parameters for means\n'
                       % len(variational_parameters_mean))

        if args.em:
            opt_sigma = optimizer.define_optimizer(
                args, args.lr0_sigma * lr_base)
            variational_parameters_sigma = (
                tf.trainable_variables('samples_e/')
                + tf.trainable_variables('samples_r/'))
            log_file.write('# %d variational parameters for standard deviations\n'
                           % len(variational_parameters_sigma))
            update_sigmas = opt_sigma.minimize(
                self.loss, var_list=variational_parameters_sigma)
            self._e_step = tf.group(
                update_means, update_sigmas, name='e_step')
        else:
            self._e_step = update_means

    if args.em:
        with tf.variable_scope('m_step'):
            lr_lambda = args.lr0_lambda * lr_base
            update_lambda_e = tf.assign(
                self.inverse_lambda_e,
                (1.0 - lr_lambda) * self.inverse_lambda_e
                + lr_lambda * self.estimate_inverse_lambda(self.samples_e))
            update_lambda_r = tf.assign(
                self.inverse_lambda_r,
                (1.0 - lr_lambda) * self.inverse_lambda_r
                + lr_lambda * self.estimate_inverse_lambda(self.samples_r))
            m_step = tf.group(
                update_lambda_e, update_lambda_r, name='m_step')
            log_file.write('# 2 hyperparameters\n')
        self._em_step = tf.group(self._e_step, m_step, name='em_step')
    else:
        self._em_step = None

    self._summary_op = tf.summary.merge(self._summaries)
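# A hypothetical driver loop (an illustration only, assuming TF 1.x sessions) showing how
# a graph like the one built above is typically fed and trained. `Model`, `args.batch_size`,
# and `args.num_steps` are assumed names, and `rng` is assumed to be a
# `numpy.random.RandomState`; the actual training script may wrap `_e_step` / `_em_step`
# differently.
import tensorflow as tf

model = Model(args, dat, rng)  # the constructor defined above
train_op = model._em_step if args.em else model._e_step
num_triples = len(dat.dat['train'])

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(args.num_steps):
        idx = rng.choice(num_triples, size=args.batch_size, replace=False)
        batch = dat.dat['train'][idx]  # integer triples of shape (batch_size, 3)
        sess.run(train_op, feed_dict={model.minibatch_htr: batch})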