def __init__(self, args, dat, rng, aux_model=None, log_file=sys.stdout):
        log_file.write('\n# Creating model.\n')
        log_file.flush()

        self.aux_model = aux_model
        if aux_model is not None:
            # Add padding entity for unassigned leaves of auxiliary model.
            padded_range_e = aux_model.padded_range_e
        else:
            padded_range_e = dat.range_e

        self._summaries = []
        initializer = tf.contrib.layers.xavier_initializer()

        self.emb = tf.Variable(initializer(
            (padded_range_e, dat.embedding_dim)), name='emb')

        if aux_model is None:
            self.bias = tf.Variable(initializer(
                (padded_range_e,)), name='bias')
        else:
            self.bias = tf.Variable(
                tf.pad(-aux_model.avg_lls, [[0, 1]]), name='bias')

        self.log_normalizer = tf.Variable(
            tf.zeros((), dtype=tf.float32), name='log_normalizer')
        self._summaries.append(tf.summary.scalar(
            'log_normalizer', self.log_normalizer))

        with tf.variable_scope('minibatch'):
            if aux_model is None:
                self.minibatch_htr = tf.placeholder(
                    tf.int32, shape=(None,), name='minibatch')
            else:
                self.minibatch_htr = aux_model.minibatch_htr

            minibatch_size = tf.shape(self.minibatch_htr)[0]
            minibatch_size_float = tf.cast(minibatch_size, tf.float32)

            self.feed_train_features = tf.placeholder(
                tf.float32, shape=dat.features['train'].shape)
            all_train_features = tf.Variable(
                self.feed_train_features, dtype=tf.float32, trainable=False)

            features_minibatch = tf.gather(  # (B, d)
                all_train_features, self.minibatch_htr)
            labels_pos = tf.gather(  # (B,)
                tf.constant(dat.labels['train'], dtype=tf.int32), self.minibatch_htr)

        with tf.variable_scope('evaluation'):
            valid_features = tf.gather(  # (B, d)
                dat.features['valid'], self.minibatch_htr)
            valid_labels = tf.gather(  # (B,)
                dat.labels['valid'], self.minibatch_htr)

            emb_eval = self.emb
            bias_eval = self.bias
            if padded_range_e != dat.range_e:
                emb_eval = emb_eval[:-1, :]
                bias_eval = bias_eval[:-1]
            valid_scores = (  # shape (batch, categories)
                tf.matmul(valid_features, emb_eval,
                          transpose_b=True, name='valid_scores')
                + bias_eval)
            if aux_model is not None:
                valid_scores += aux_model.unnormalized_score(None, args)
            self.valid_likelihood = -tf.reduce_sum(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=valid_labels, logits=valid_scores))

        with tf.variable_scope('log_likelihood'):
            emb_pos = tf.gather(  # (B, d)
                self.emb, labels_pos, name='emb_pos')
            bias_pos = tf.gather(  # (B,)
                self.bias, labels_pos, name='bias_pos')

            if aux_model is None:
                self.scores = (  # shape (batch, categories)
                    tf.matmul(
                        features_minibatch, self.emb, transpose_b=True, name='scores')
                    + tf.expand_dims(self.bias, 0))
                # The documentation for `tf.nn.sparse_softmax_cross_entropy_with_logits` is unclear
                # about signs and normalization. It turns out that the function does the following,
                # assuming that `labels.shape = (m,)` and `logits.shape = (m, n)`:
                #   tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits)
                #   = - [logits[i, labels[i]] for i in range(m)]
                #     + log(sum(exp(logits), axis=1))
                neg_log_likelihood = tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=labels_pos, logits=self.scores))
            else:
                labels_neg, lls_neg = aux_model.create_sampler(
                    None, args.neg_samples)
                # `labels_neg` has shape (minibatch_size, neg_samples)
                emb_neg = tf.gather(  # (B, n, d)
                    self.emb, labels_neg, name='emb_neg')
                bias_neg = tf.gather(  # (B, n)
                    self.bias, labels_neg, name='bias_neg')

                scores_pos = (  # (B, 1, 1)
                    tf.matmul(
                        tf.expand_dims(features_minibatch, 1),
                        tf.expand_dims(emb_pos, 1),
                        transpose_b=True, name='scores_pos')
                    + tf.expand_dims(tf.expand_dims(bias_pos + self.log_normalizer, 1), 2))
                scores_neg = (  # (B, 1, n)
                    tf.matmul(
                        tf.expand_dims(features_minibatch, 1),
                        emb_neg,
                        transpose_b=True, name='scores_neg')
                    + tf.expand_dims(bias_neg + self.log_normalizer, 1))

                self._summaries += [
                    tf.summary.histogram(
                        'acceptance_pos_t', tf.nn.sigmoid(scores_pos)),
                    tf.summary.histogram('acceptance_neg_t', tf.nn.sigmoid(scores_neg))]
                neg_log_likelihood = (
                    (1.0 / args.neg_samples) * tf.reduce_sum(
                        tf.nn.softplus(scores_neg))
                    + tf.reduce_sum(tf.nn.softplus(-scores_pos)))

        with tf.variable_scope('regularizer'):
            regularizer = args.initial_reg_strength * tf.reduce_sum(emb_pos**2)
            if aux_model is not None:
                lls_pos = aux_model.training_samples_ll(labels_pos)
                # TODO: maybe we want to use `aux_model.avg_lls` instead of `lls_neg`
                reg_bias_neg = tf.reduce_sum((bias_neg + lls_neg)**2)
                # or, regularize complete score_neg/pos towards -lls_neg/pos
                reg_bias_pos = tf.reduce_sum((bias_pos + lls_pos)**2)
                regularizer += (
                    (args.initial_reg_strength / args.neg_samples) *
                    tf.reduce_sum(emb_neg**2)
                    + (args.bias_reg / args.neg_samples) * reg_bias_neg
                    + args.bias_reg * reg_bias_pos)

        self.loss = tf.add_n([neg_log_likelihood, regularizer], name='loss')

        with tf.variable_scope('loss_parts'):
            normalizer_per_embedding = (
                len(dat.labels['train']) /
                (dat.embedding_dim * padded_range_e * minibatch_size_float))
            normalizer_per_datapoint = 1.0 / minibatch_size_float

            self._summaries.append(tf.summary.scalar(
                'regularizer_per_embedding_and_dimension', normalizer_per_embedding * regularizer))
            self._summaries.append(tf.summary.scalar(
                'neg_log_likelihood_per_datapoint', normalizer_per_datapoint * neg_log_likelihood))
            self._summaries.append(tf.summary.scalar(
                'loss_per_datapoint', normalizer_per_datapoint * self.loss))

        global_step, lr, lr_summary = optimizer.define_learning_rate(args)
        self._summaries.append(lr_summary)
        opt = optimizer.define_optimizer(args, lr)

        with tf.variable_scope('opt'):
            self._e_step = opt.minimize(
                self.loss, var_list=[self.emb, self.bias, self.log_normalizer], global_step=global_step)

        self._summary_op = tf.summary.merge(self._summaries)
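# Standalone sketch (not part of any class above): a numerical check of the
# identity described in the `log_likelihood` comment for
# `tf.nn.sparse_softmax_cross_entropy_with_logits`. Assumes TensorFlow 1.x,
# as used throughout these examples; the array values are made up.

import numpy as np
import tensorflow as tf

logits = np.random.randn(4, 7).astype(np.float32)  # shape (m, n)
labels = np.array([0, 3, 6, 2], dtype=np.int32)    # shape (m,)

xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=labels, logits=logits)
manual = (np.log(np.sum(np.exp(logits), axis=1))
          - logits[np.arange(4), labels])

with tf.Session() as sess:
    np.testing.assert_allclose(sess.run(xent), manual, rtol=1e-5, atol=1e-5)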
    def __init__(self, args, dat, rng, aux_model=None, log_file=sys.stdout):
        log_file.write('\n# Creating model.\n')
        log_file.flush()

        self.aux_model = aux_model
        if aux_model is not None:
            # Add padding entity for unassigned leaves of auxiliary model.
            padded_range_e = aux_model.padded_range_e
        else:
            padded_range_e = dat.range_e

        self._summaries = []
        initializer = tf.contrib.layers.xavier_initializer()

        self.emb = tf.Variable(initializer(
            (padded_range_e, dat.embedding_dim)), name='emb')

        if aux_model is None or not args.initialize_to_inverse_aux:
            self.bias = tf.Variable(initializer(
                (padded_range_e,)), name='bias')
        else:
            self.bias = tf.Variable(
                -aux_model.avg_lls, name='bias')

        if aux_model is not None:
            if args.use_log_norm_weight:
                self.log_normalizer_weight = tf.Variable(initializer(
                    (dat.embedding_dim,)), name='log_normalizer_weight')
            self.log_normalizer_bias_var = tf.Variable(
                tf.zeros((), dtype=tf.float32), name='log_normalizer_bias')
            # Note: the factor of 0 fixes the effective log-normalizer bias at
            # zero (and zeroes its gradient) while keeping the variable in the
            # graph and in the summaries.
            self.log_normalizer_bias = 0 * self.log_normalizer_bias_var
            self._summaries.append(tf.summary.scalar(
                'log_normalizer_bias', self.log_normalizer_bias))

        with tf.variable_scope('minibatch'):
            if aux_model is None:
                self.minibatch_htr = tf.placeholder(
                    tf.int32, shape=(None,), name='minibatch')
            else:
                self.minibatch_htr = aux_model.minibatch_htr

            minibatch_size = tf.shape(self.minibatch_htr)[0]
            minibatch_size_float = tf.cast(minibatch_size, tf.float32)

            self.feed_train_features = tf.placeholder(
                tf.float32, shape=dat.features['train'].shape)
            all_train_features = tf.Variable(
                self.feed_train_features, dtype=tf.float32, trainable=False)

            features_minibatch = tf.gather(  # (B, d)
                all_train_features, self.minibatch_htr)
            labels_pos = tf.gather(  # (B,)
                tf.constant(dat.labels['train'], dtype=tf.int32), self.minibatch_htr)

        with tf.variable_scope('evaluation'):
            evaluation_features = {
                subset: tf.gather(
                    dat.features[subset], self.minibatch_htr)  # (B, d)
                for subset in ['valid', 'test']}

            self.evaluation_labels = {
                subset: tf.gather(
                    dat.labels[subset], self.minibatch_htr)  # (B,)
                for subset in ['valid', 'test']}

            emb_eval = self.emb
            bias_eval = self.bias
            if padded_range_e != dat.range_e:
                assert padded_range_e == dat.range_e + 1
                emb_eval = emb_eval[:-1, :]
                bias_eval = bias_eval[:-1]

            evaluation_scores_main = {
                subset: bias_eval + tf.matmul(evaluation_features[subset], emb_eval,
                                              transpose_b=True, name='%s_scores' % subset)  # (B, dat.range_e)
                for subset in ['valid', 'test']}

            if aux_model is not None:
                evaluation_scores_aux = {
                    subset: aux_model.unnormalized_score(None, args, subset)
                    for subset in ['valid', 'test']}
                evaluation_scores = {
                    subset: (evaluation_scores_main[subset] +
                             evaluation_scores_aux[subset])
                    for subset in ['valid', 'test']}
            else:
                evaluation_scores = evaluation_scores_main

            target_indices = {
                subset: self.evaluation_labels[subset] + dat.range_e * tf.range(
                    tf.shape(self.minibatch_htr)[0])
                for subset in ['valid', 'test']}

            target_scores = {
                subset: tf.expand_dims(
                    tf.gather(tf.reshape(evaluation_scores[subset], (-1,)), target_indices[subset]), 1)
                for subset in ['valid', 'test']}
            target_scores_main = {
                subset: tf.expand_dims(
                    tf.gather(tf.reshape(evaluation_scores_main[subset], (-1,)), target_indices[subset]), 1)
                for subset in ['valid', 'test']}

            # Make sure to get the corner cases right:
            # * Count scores that are worse than target score and subtract them from `dat.range_e`
            #   to ensure that NaN values are always punished.
            # * Use `dat.range_e`, which is the first dimension of `evaluation_scores`, not
            #   `padded_range_e`.
            # * Use strict comparison `<` and not `<=` to punish models that set all scores to the
            #   same value (e.g., zero).
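            # Worked example (hypothetical numbers): with `dat.range_e == 4`, a
            # score row [1.0, 3.0, nan, 2.0] and a target score of 2.0, we get
            # sum(scores < 2.0) == 1 (the nan comparison is False), so the rank
            # is 4 - 1 == 3. If all scores were equal, no strict comparison
            # would hold and the rank would be the worst possible value, 4.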
            self.evaluation_ranks = {
                subset: dat.range_e - tf.reduce_sum(tf.cast(
                    evaluation_scores[subset] < target_scores[subset], tf.int32),
                    axis=1)
                for subset in ['valid', 'test']}
            self.evaluation_ranks_main = {
                subset: dat.range_e - tf.reduce_sum(tf.cast(
                    evaluation_scores_main[subset] < target_scores_main[subset], tf.int32),
                    axis=1)
                for subset in ['valid', 'test']}

            self.evaluation_log_likelihood = {
                subset: -tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=self.evaluation_labels[subset],
                    logits=evaluation_scores[subset]))
                for subset in ['valid', 'test']}

            self.evaluation_log_likelihood_main = {
                subset: -tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=self.evaluation_labels[subset],
                    logits=evaluation_scores_main[subset]))
                for subset in ['valid', 'test']}

        with tf.variable_scope('log_likelihood'):
            emb_pos = tf.gather(  # (B, d)
                self.emb, labels_pos, name='emb_pos')
            bias_pos = tf.gather(  # (B,)
                self.bias, labels_pos, name='bias_pos')

            scores_pos = (  # (B, 1, 1)
                tf.matmul(
                    tf.expand_dims(features_minibatch, 1),
                    tf.expand_dims(emb_pos, 1),
                    transpose_b=True, name='scores_pos')
                + tf.expand_dims(tf.expand_dims(bias_pos, 1), 2))

            if aux_model is None:
                self.scores = (  # shape (batch, categories)
                    tf.matmul(
                        features_minibatch, self.emb, transpose_b=True, name='scores')
                    + tf.expand_dims(self.bias, 0))
                # The documentation for `tf.nn.sparse_softmax_cross_entropy_with_logits` is unclear
                # about signs and normalization. It turns out that the function does the following,
                # assuming that `labels.shape = (m,)` and `logits.shape = (m, n)`:
                #   tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits)
                #   = - [logits[i, labels[i]] for i in range(m)]
                #     + log(sum(exp(logits), axis=1))
                neg_log_likelihood = tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=labels_pos, logits=self.scores))
            else:
                labels_neg, lls_neg = aux_model.create_sampler(
                    None, args.neg_samples)
                # `labels_neg` has shape (minibatch_size, neg_samples)
                emb_neg = tf.gather(  # (B, n, d)
                    self.emb, labels_neg, name='emb_neg')
                bias_neg = tf.gather(  # (B, n)
                    self.bias, labels_neg, name='bias_neg')
                lls_pos = tf.reshape(
                    aux_model.training_samples_ll(labels_pos), (-1, 1, 1))

                scores_neg = (  # (B, 1, n)
                    tf.matmul(
                        tf.expand_dims(features_minibatch, 1),
                        emb_neg,
                        transpose_b=True, name='scores_neg')
                    + tf.expand_dims(bias_neg, 1))

                # self._summaries += [
                #     tf.summary.histogram('eta_pos', scores_pos),
                #     tf.summary.histogram('exp_eta_neg', tf.exp(scores_neg))]
                # neg_log_likelihood = (
                #     (1.0 / args.neg_samples) * tf.reduce_sum(
                #         tf.exp(scores_neg))
                #     - tf.reduce_sum(scores_pos))
                self._summaries += [
                    tf.summary.histogram(
                        'acceptance_pos_t', tf.nn.sigmoid(scores_pos)),
                    tf.summary.histogram('acceptance_neg_t', tf.nn.sigmoid(scores_neg))]
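                # Both branches below are binary logistic losses on positive
                # vs. sampled labels: softplus(-scores_pos) for the positives
                # plus softplus(scores_neg) for the negatives, with the
                # negative term divided by `args.neg_samples`. The
                # 'supervised_nce' variant additionally shifts the scores by
                # the proposal log-likelihoods (`lls_pos`, `lls_neg`), so the
                # model is trained to estimate the log-density ratio with
                # respect to the auxiliary sampler (noise-contrastive
                # estimation).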
                if args.model == 'supervised':
                    neg_log_likelihood = (
                        (1.0 / args.neg_samples) * tf.reduce_sum(
                            tf.nn.softplus(scores_neg))
                        + tf.reduce_sum(tf.nn.softplus(-scores_pos)))
                elif args.model == 'supervised_nce':
                    neg_log_likelihood = (
                        (1.0 / args.neg_samples) * tf.reduce_sum(
                            tf.nn.softplus(scores_neg - lls_neg))
                        + tf.reduce_sum(tf.nn.softplus(lls_pos - scores_pos)))
                else:
                    raise ValueError('Unknown model: %s' % args.model)

        with tf.variable_scope('regularizer'):
            if args.reg_separate:
                to_regularize_pos = tf.expand_dims(
                    tf.expand_dims(bias_pos, 1), 2)
                to_regularize_neg = tf.expand_dims(bias_neg, 1)
                regularizer = (
                    args.initial_reg_strength * tf.reduce_sum(emb_pos**2)
                    + (args.initial_reg_strength / args.neg_samples) * tf.reduce_sum(emb_neg**2))
            else:
                to_regularize_pos = scores_pos
                to_regularize_neg = scores_neg
                regularizer = 0

            if aux_model is None:
                regularizer += (
                    args.initial_reg_strength * tf.reduce_sum(to_regularize_pos**2))
            else:
                if args.model == 'supervised':
                    log_normalizer = self.log_normalizer_bias
                    if args.use_log_norm_weight:
                        log_normalizer += tf.reshape(
                            tf.matmul(
                                features_minibatch,
                                tf.expand_dims(self.log_normalizer_weight, 0),
                                transpose_b=True, name='log_normalizer'),
                            (-1, 1, 1))
                    regularizer += (
                        args.initial_reg_strength * tf.reduce_sum(
                            (to_regularize_pos + lls_pos - log_normalizer)**2)
                        + (args.initial_reg_strength / args.neg_samples) * tf.reduce_sum(
                            (to_regularize_neg + tf.expand_dims(lls_neg, 1) - log_normalizer)**2))
                elif args.model == 'supervised_nce':
                    regularizer += (
                        args.initial_reg_strength * tf.reduce_sum(
                            to_regularize_pos**2)
                        + (args.initial_reg_strength / args.neg_samples) * tf.reduce_sum(
                            to_regularize_neg**2))

        self.loss = tf.add_n([neg_log_likelihood, regularizer], name='loss')

        with tf.variable_scope('loss_parts'):
            normalizer_per_embedding = (
                len(dat.labels['train']) /
                (dat.embedding_dim * padded_range_e * minibatch_size_float))
            normalizer_per_datapoint = 1.0 / minibatch_size_float

            self._summaries.append(tf.summary.scalar(
                'regularizer_per_embedding_and_dimension', normalizer_per_embedding * regularizer))
            self._summaries.append(tf.summary.scalar(
                'neg_log_likelihood_per_datapoint', normalizer_per_datapoint * neg_log_likelihood))
            self._summaries.append(tf.summary.scalar(
                'loss_per_datapoint', normalizer_per_datapoint * self.loss))

        global_step, lr, lr_summary = optimizer.define_learning_rate(args)
        self._summaries.append(lr_summary)
        opt = optimizer.define_optimizer(args, lr)

        with tf.variable_scope('opt'):
            var_list = [self.emb, self.bias]
            if aux_model is not None:
                var_list.append(self.log_normalizer_bias_var)
                if args.use_log_norm_weight:
                    var_list.append(self.log_normalizer_weight)
            self._e_step = opt.minimize(
                self.loss, var_list=var_list, global_step=global_step)

        self._summary_op = tf.summary.merge(self._summaries)
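# Standalone numpy sketch of the negative-sampling objective used in the
# 'supervised' branch above (names and shapes are illustrative, not taken from
# the classes): each positive label contributes softplus(-score_pos), and each
# of the n sampled negative labels contributes softplus(score_neg) / n.

import numpy as np


def softplus(x):
    """Numerically stable log(1 + exp(x))."""
    return np.logaddexp(0.0, x)


def negative_sampling_loss(scores_pos, scores_neg):
    """scores_pos: shape (B,); scores_neg: shape (B, n). Returns the summed loss."""
    n = scores_neg.shape[1]
    return softplus(-scores_pos).sum() + softplus(scores_neg).sum() / n


# Example: a well-separated positive score and two negative scores:
# negative_sampling_loss(np.array([4.0]), np.array([[-3.0, -2.0]]))
# evaluates to about 0.11.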
    def __init__(self, args, dat, rng, aux_model=None, log_file=sys.stdout):
        log_file.write('\n# Creating model.\n')
        log_file.flush()

        self.aux_model = aux_model
        if aux_model is not None:
            # Add padding entity for unassigned leaves of auxiliary model.
            padded_range_e = aux_model.padded_range_e
        else:
            padded_range_e = dat.range_e

        self._summaries = []
        with tf.device('/cpu:0'):
            with tf.variable_scope('means'):
                self.means_e, self.means_r = self.define_emb(
                    args, padded_range_e, dat.range_r)
            with tf.variable_scope('samples_e'):
                self.samples_e, self.expanded_means_e, self.log_std_e = self.create_all_samplers(
                    self.means_e, args)
            with tf.variable_scope('samples_r'):
                self.samples_r, self.expanded_means_r, self.log_std_r = self.create_all_samplers(
                    self.means_r, args)

        if aux_model is not None:
            # Learnable log-normalizer offset used in the regularizer below;
            # mirrors the supervised model above (`use_log_norm_weight` is not
            # implemented for this model).
            self.log_normalizer_bias_var = tf.Variable(
                tf.zeros((), dtype=tf.float32), name='log_normalizer_bias')
            self.log_normalizer_bias = self.log_normalizer_bias_var
            self._summaries.append(tf.summary.scalar(
                'log_normalizer_bias', self.log_normalizer_bias))

        with tf.variable_scope('minibatch'):
            if aux_model is None:
                self.minibatch_htr = tf.placeholder(tf.int32,
                                                    shape=(None, 3),
                                                    name='minibatch_htr')
                self.idx_h = self.minibatch_htr[:, 0]
                self.idx_t = self.minibatch_htr[:, 1]
                idx_r_predict_t = self.minibatch_htr[:, 2]
            else:
                self.minibatch_htr = aux_model.minibatch_htr
                self.idx_h = aux_model.idx_h
                self.idx_t = aux_model.idx_t
                idx_r_predict_t = aux_model.idx_r

            idx_r_predict_h = idx_r_predict_t + dat.range_r

            minibatch_size = tf.shape(self.minibatch_htr)[0]
            minibatch_size_float = tf.cast(minibatch_size, tf.float32)

            emb_h = {
                label: tf.gather(samples, self.idx_h, name='gather_mb_h')
                for label, samples in self.samples_e.items()
            }
            emb_t = {
                label: tf.gather(samples, self.idx_t, name='gather_mb_t')
                for label, samples in self.samples_e.items()
            }
            emb_r_predict_t = {
                label: tf.gather(samples,
                                 idx_r_predict_t,
                                 name='gather_mb_r_predict_t')
                for label, samples in self.samples_r.items()
            }
            emb_r_predict_h = {
                label: tf.gather(samples,
                                 idx_r_predict_h,
                                 name='gather_mb_r_predict_h')
                for label, samples in self.samples_r.items()
            }

            self.minibatch_mean_h = {
                label: tf.gather(means, self.idx_h)
                for label, means in self.expanded_means_e.items()
            }
            self.minibatch_mean_t = {
                label: tf.gather(means, self.idx_t)
                for label, means in self.expanded_means_e.items()
            }
            self.minibatch_mean_r_predict_t = {
                label: tf.gather(means, idx_r_predict_t)
                for label, means in self.expanded_means_r.items()
            }
            self.minibatch_mean_r_predict_h = {
                label: tf.gather(means, idx_r_predict_h)
                for label, means in self.expanded_means_r.items()
            }

            # Prefactor for normalization per training data point.
            # normalizer = 1.0 / tf.cast(args.num_samples, tf.float32)

        with tf.variable_scope('log_likelihood'):
            # TODO: factor out duplication of code for head / tail prediction
            if aux_model is None:
                with tf.variable_scope('tail_prediction'):
                    self.scores_predict_t = self.unnormalized_score(
                        emb_h, emb_r_predict_t, self.samples_e, args)
                    ll_predict_t = self._log_likelihood(
                        self.scores_predict_t, self.idx_t, args)
                with tf.variable_scope('head_prediction'):
                    self.scores_predict_h = self.unnormalized_score(
                        emb_t, emb_r_predict_h, self.samples_e, args)
                    ll_predict_h = self._log_likelihood(
                        self.scores_predict_h, self.idx_h, args)
            else:
                with tf.variable_scope('tail_prediction'):
                    idx_neg, lls_neg_t = aux_model.create_sampler(
                        't', args.neg_samples)
                    # `idx_neg` has shape (minibatch_size, neg_samples)
                    emb_neg = {
                        label: tf.squeeze(tf.gather(samples, idx_neg), axis=2)
                        for label, samples in self.samples_e.items()
                    }
                    scores_pos_t, scores_neg_t = self.batch_unnormalized_scores(
                        emb_h, emb_r_predict_t, (emb_t, emb_neg), args)
                    # `scores_pos_t` has shape (minibatch_size, 1)
                    # `scores_neg_t` has shape (minibatch_size, neg_samples)
                    self._summaries += [
                        tf.summary.histogram('acceptance_pos_t',
                                             tf.nn.sigmoid(scores_pos_t)),
                        tf.summary.histogram('acceptance_neg_t',
                                             tf.nn.sigmoid(scores_neg_t))
                    ]
                    ll_predict_t = (
                        (-1.0 / args.neg_samples) *
                        tf.reduce_sum(tf.nn.softplus(scores_neg_t)) -
                        tf.reduce_sum(tf.nn.softplus(-scores_pos_t)))
                with tf.variable_scope('head_prediction'):
                    idx_neg, lls_neg_h = aux_model.create_sampler(
                        'h', args.neg_samples)
                    # `idx_neg` has shape (minibatch_size, neg_samples)
                    emb_neg = {
                        label: tf.squeeze(tf.gather(samples, idx_neg), axis=2)
                        for label, samples in self.samples_e.items()
                    }
                    scores_pos_h, scores_neg_h = self.batch_unnormalized_scores(
                        emb_t, emb_r_predict_h, (emb_h, emb_neg), args)
                    # `scores_pos_h` has shape (minibatch_size, 1)
                    # `scores_neg_h` has shape (minibatch_size, neg_samples)
                    self._summaries += [
                        tf.summary.histogram('acceptance_pos_h',
                                             tf.nn.sigmoid(scores_pos_h)),
                        tf.summary.histogram('acceptance_neg_h',
                                             tf.nn.sigmoid(scores_neg_h))
                    ]
                    ll_predict_h = (
                        (-1.0 / args.neg_samples) *
                        tf.reduce_sum(tf.nn.softplus(scores_neg_h)) -
                        tf.reduce_sum(tf.nn.softplus(-scores_pos_h)))

            # log_likelihood = normalizer * (ll_predict_t + ll_predict_h)
            log_likelihood = ll_predict_t + ll_predict_h

        # with tf.variable_scope('hyperparameters'):
        #     frequencies_e, counts_e, sort_indices_e = self._get_frequencies(
        #         dat.dat['train'][:, :2].flatten(), padded_range_e, 'e')
        #     frequencies_r, counts_r, sort_indices_r = self._get_frequencies(
        #         dat.dat['train'][:, 2], dat.range_r, 'r')
        #     self.log_lambda_e = self._define_log_lambda(args, counts_e, 'e')
        #     self.log_lambda_r = self._define_log_lambda(args, counts_r, 'r')

        # inverse_counts_e = (1.0 / counts_e).astype(np.float32)
        # inverse_counts_r = (1.0 / counts_r).astype(np.float32)
        # self._lambda_sigma_summary(
        #     self.log_lambda_e, self.log_std_e, inverse_counts_e, sort_indices_e, 'e')
        # self._lambda_sigma_summary(
        #     self.log_lambda_r, self.log_std_r, inverse_counts_r, sort_indices_r, 'r')

        # with tf.variable_scope('log_prior'):
        #     # r-counts are the same for head and tail prediction, so gather them only once.
        #     minibatch_inverse_counts_r = tf.gather(
        #         inverse_counts_r, idx_r_predict_t)
        #     log_prior = normalizer * (
        #         tf.reduce_sum(
        #             tf.gather(inverse_counts_e, self.idx_h) * self.single_log_prior(
        #                 tf.gather(self.log_lambda_e, self.idx_h), emb_h))
        #         + tf.reduce_sum(
        #             tf.gather(inverse_counts_e, self.idx_t) * self.single_log_prior(
        #                 tf.gather(self.log_lambda_e, self.idx_t), emb_t))
        #         + tf.reduce_sum(
        #             minibatch_inverse_counts_r * self.single_log_prior(
        #                 tf.gather(self.log_lambda_r, idx_r_predict_t), emb_r_predict_t))
        #         + tf.reduce_sum(
        #             minibatch_inverse_counts_r * self.single_log_prior(
        #                 tf.gather(self.log_lambda_r, idx_r_predict_h), emb_r_predict_h)))

        with tf.variable_scope('regularizer'):
            if args.reg_separate:
                raise "unimplemented"
                # to_regularize_pos = tf.expand_dims(
                #     tf.expand_dims(bias_pos, 1), 2)
                # to_regularize_neg = tf.expand_dims(bias_neg, 1)
                # regularizer = (
                #     args.initial_reg_strength * tf.reduce_sum(emb_pos**2)
                #     + (args.initial_reg_strength / args.neg_samples) * tf.reduce_sum(emb_neg**2))
            else:
                to_regularize_pos_t = scores_pos_t
                to_regularize_neg_t = scores_neg_t
                to_regularize_pos_h = scores_pos_h
                to_regularize_neg_h = scores_neg_h
                regularizer = 0

            if aux_model is None:
                regularizer += args.initial_reg_strength * (
                    tf.reduce_sum(to_regularize_pos_t**2) +
                    tf.reduce_sum(to_regularize_pos_h**2))
            else:
                lls_pos_t = tf.reshape(aux_model.training_samples_ll('t'),
                                       (-1, 1, 1))
                lls_pos_h = tf.reshape(aux_model.training_samples_ll('h'),
                                       (-1, 1, 1))
                log_normalizer_t = self.log_normalizer_bias
                log_normalizer_h = self.log_normalizer_bias
                if args.use_log_norm_weight:
                    raise "unimplemented"
                    # log_normalizer += tf.reshape(
                    #     tf.matmul(
                    #         features_minibatch,
                    #         tf.expand_dims(self.log_normalizer_weight, 0),
                    #         transpose_b=True, name='log_normalizer'),
                    #     (-1, 1, 1))
                regularizer += (
                    args.initial_reg_strength * (tf.reduce_sum(
                        (to_regularize_pos_t + lls_pos_t - log_normalizer_t)**
                        2) + tf.reduce_sum((to_regularize_pos_h + lls_pos_h -
                                            log_normalizer_h)**2)) +
                    (args.initial_reg_strength / args.neg_samples) *
                    (tf.reduce_sum(
                        (to_regularize_neg_t + tf.expand_dims(lls_neg_t, 1) -
                         log_normalizer_t)**2) + tf.reduce_sum(
                             (to_regularize_neg_h + tf.expand_dims(
                                 lls_neg_h, 1) - log_normalizer_h)**2)))

        if args.em:
            raise "unimplemented"
            # # Calculate entropy of entire variational distribution (independent of minibatch).
            # # Normalize per training data point.
            # with tf.variable_scope('entropy'):
            #     entropy = (minibatch_size_float / len(dat.dat['train'])) * tf.add_n(
            #         [tf.reduce_sum(i) for i in
            #          list(self.log_std_e.values()) + list(self.log_std_r.values())],
            #         name='entropy')
            # self.loss = -tf.add_n([log_prior, log_likelihood, entropy],
            #                       name='elbo')
        else:
            self.loss = tf.add_n([regularizer, -log_likelihood],
                                 name='log_joint')

        # with tf.variable_scope('loss_parts'):
        #     normalizer_per_embedding = (
        #         len(dat.dat['train']) /
        #         (args.embedding_dim * (padded_range_e + 2 * dat.range_r) * minibatch_size_float))
        #     normalizer_per_datapoint = 0.5 / minibatch_size_float
        #     if args.em:
        #         self._summaries.append(tf.summary.scalar('entropy_per_embedding_and_dimension',
        #                                                  normalizer_per_embedding * entropy))
        #     self._summaries.append(tf.summary.scalar('log_prior_per_embedding_and_dimension',
        #                                              normalizer_per_embedding * log_prior))
        #     self._summaries.append(tf.summary.scalar('log_likelihood_per_datapoint',
        #                                              normalizer_per_datapoint * log_likelihood))
        #     self._summaries.append(tf.summary.scalar('loss_per_datapoint',
        #                                              normalizer_per_datapoint * self.loss))

        global_step, lr, lr_summary = optimizer.define_learning_rate(args)
        self._summaries.append(lr_summary)
        opt = optimizer.define_optimizer(args, lr)

        with tf.variable_scope('e_step'):
            var_list = (tf.trainable_variables('means/') +
                        tf.trainable_variables('samples_e/') +
                        tf.trainable_variables('samples_r/'))
            if aux_model is not None:
                var_list.append(self.log_normalizer_bias_var)
                if args.use_log_norm_weight:
                    var_list.append(self.log_normalizer_weight)
            log_file.write('# %d variational parameters\n' % len(var_list))
            gvs_e = opt.compute_gradients(self.loss, var_list=var_list)
            self._e_step = opt.apply_gradients(gvs_e, global_step)

        if args.em:
            with tf.variable_scope('m_step'):
                hyperparameters = tf.trainable_variables('hyperparameters/')
                log_file.write('# %d hyperparameters\n' % len(hyperparameters))
                gvs_m = opt.compute_gradients(self.loss,
                                              var_list=hyperparameters)
                m_step = opt.apply_gradients(gvs_m)
            self._em_step = tf.group(self._e_step, m_step, name='em_step')
        else:
            self._em_step = None

        self._summary_op = tf.summary.merge(self._summaries)
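# Both knowledge-graph models above keep a single relation table with
# 2 * range_r rows: row r is used when predicting the tail of a triple with
# relation r, and row r + range_r when predicting the head (see
# `idx_r_predict_h = idx_r_predict_t + dat.range_r`). A standalone numpy
# sketch of this indexing convention (illustrative names and values):

import numpy as np

range_r, dim = 3, 4
emb_r = np.random.randn(2 * range_r, dim)    # tail-prediction rows, then head-prediction rows

idx_r_predict_t = np.array([0, 2, 1])        # relation ids of a minibatch of triples
idx_r_predict_h = idx_r_predict_t + range_r  # offsets into the second half of the table

emb_r_predict_t = emb_r[idx_r_predict_t]     # (B, dim) embeddings for tail prediction
emb_r_predict_h = emb_r[idx_r_predict_h]     # (B, dim) embeddings for head prediction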
    def __init__(self, args, dat, rng, log_file=sys.stdout):
        log_file.write('\n# Creating model.\n')
        log_file.flush()

        self._summaries = []
        with tf.variable_scope('means'):
            self.means_e, self.means_r = self.define_emb(args, dat)
        with tf.variable_scope('samples_e'):
            self.samples_e, self.expanded_means_e, self.log_std_e = self.create_all_samplers(
                self.means_e, args)
        with tf.variable_scope('samples_r'):
            self.samples_r, self.expanded_means_r, self.log_std_r = self.create_all_samplers(
                self.means_r, args)

        with tf.variable_scope('minibatch'):
            self.minibatch_htr = tf.placeholder(
                tf.int32, shape=(None, 3), name='minibatch_htr')
            minibatch_size = tf.shape(self.minibatch_htr)[0]
            minibatch_size_float = tf.cast(minibatch_size, tf.float32)

            self.idx_h = self.minibatch_htr[:, 0]
            self.idx_t = self.minibatch_htr[:, 1]
            idx_r_predict_t = self.minibatch_htr[:, 2]
            idx_r_predict_h = idx_r_predict_t + dat.range_r

            emb_h = {label: tf.gather(samples, self.idx_h)
                     for label, samples in self.samples_e.items()}
            emb_t = {label: tf.gather(samples, self.idx_t)
                     for label, samples in self.samples_e.items()}
            emb_r_predict_t = {label: tf.gather(samples, idx_r_predict_t)
                               for label, samples in self.samples_r.items()}
            emb_r_predict_h = {label: tf.gather(samples, idx_r_predict_h)
                               for label, samples in self.samples_r.items()}

            self.minibatch_mean_h = {
                label: tf.gather(means, self.idx_h)
                for label, means in self.expanded_means_e.items()}
            self.minibatch_mean_t = {
                label: tf.gather(means, self.idx_t)
                for label, means in self.expanded_means_e.items()}
            self.minibatch_mean_r_predict_t = {
                label: tf.gather(means, idx_r_predict_t)
                for label, means in self.expanded_means_r.items()}
            self.minibatch_mean_r_predict_h = {
                label: tf.gather(means, idx_r_predict_h)
                for label, means in self.expanded_means_r.items()}

            # Prefactor for normalization per training data point.
            normalizer = 1.0 / tf.cast(args.num_samples, tf.float32)

        with tf.variable_scope('log_likelihood'):
            with tf.variable_scope('tail_prediction'):
                self.scores_predict_t = self.unnormalized_score(
                    emb_h, emb_r_predict_t, self.samples_e, args)
                ll_predict_t = normalizer * self._log_likelihood(
                    self.scores_predict_t, self.idx_t, args)
            with tf.variable_scope('head_prediction'):
                self.scores_predict_h = self.unnormalized_score(
                    emb_t, emb_r_predict_h, self.samples_e, args)
                ll_predict_h = normalizer * self._log_likelihood(
                    self.scores_predict_h, self.idx_h, args)
            log_likelihood = ll_predict_t + ll_predict_h

        with tf.variable_scope('hyperparameters'):
            counts_e, sort_indices_e = self._get_counts(
                dat.dat['train'][:, :2].flatten(), dat.range_e, 'e')
            counts_r, sort_indices_r = self._get_counts(
                dat.dat['train'][:, 2], dat.range_r, 'r')
            self.inverse_lambda_e = self._define_inverse_lambda(
                args, counts_e, 'e')
            self.inverse_lambda_r = self._define_inverse_lambda(
                args, counts_r, 'r')

        inverse_counts_e = (1.0 / counts_e).astype(np.float32)
        inverse_counts_r = (1.0 / counts_r).astype(np.float32)
        self._lambda_sigma_summary(
            self.inverse_lambda_e, self.log_std_e, inverse_counts_e, sort_indices_e, 'e')
        self._lambda_sigma_summary(
            self.inverse_lambda_r, self.log_std_r, inverse_counts_r, sort_indices_r, 'r')

        with tf.variable_scope('log_prior'):
            # r-counts are the same for head and tail prediction, so gather them only once.
            minibatch_inverse_counts_r = tf.gather(
                inverse_counts_r, idx_r_predict_t)
            log_prior = normalizer * (
                tf.reduce_sum(
                    tf.gather(inverse_counts_e, self.idx_h) * self.single_log_prior(
                        tf.gather(self.inverse_lambda_e, self.idx_h), emb_h))
                + tf.reduce_sum(
                    tf.gather(inverse_counts_e, self.idx_t) * self.single_log_prior(
                        tf.gather(self.inverse_lambda_e, self.idx_t), emb_t))
                + tf.reduce_sum(
                    minibatch_inverse_counts_r * self.single_log_prior(
                        tf.gather(self.inverse_lambda_r, idx_r_predict_t), emb_r_predict_t))
                + tf.reduce_sum(
                    minibatch_inverse_counts_r * self.single_log_prior(
                        tf.gather(self.inverse_lambda_r, idx_r_predict_h), emb_r_predict_h)))
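            # Weighting each term by an inverse count means that, summed over
            # one epoch, every embedding's prior is counted approximately once,
            # regardless of how often the corresponding entity or relation
            # occurs in the training data.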

        if args.em:
            # Calculate entropy of entire variational distribution (independent of minibatch).
            # Normalize per training data point.
            with tf.variable_scope('entropy'):
                entropy = (minibatch_size_float / len(dat.dat['train'])) * tf.add_n(
                    [tf.reduce_sum(i) for i in
                     list(self.log_std_e.values()) + list(self.log_std_r.values())],
                    name='entropy')
            self.loss = -tf.add_n([log_prior, log_likelihood, entropy],
                                  name='elbo')
        else:
            self.loss = -tf.add_n([log_prior, log_likelihood],
                                  name='log_joint')

        with tf.variable_scope('loss_parts'):
            normalizer_per_embedding = (
                len(dat.dat['train']) /
                (args.embedding_dim * (dat.range_e + 2 * dat.range_r) * minibatch_size_float))
            normalizer_per_datapoint = 0.5 / minibatch_size_float
            if args.em:
                self._summaries.append(tf.summary.scalar('entropy_per_embedding_and_dimension',
                                                         normalizer_per_embedding * entropy))
            self._summaries.append(tf.summary.scalar('log_prior_per_embedding_and_dimension',
                                                     normalizer_per_embedding * log_prior))
            self._summaries.append(tf.summary.scalar('log_likelihood_per_datapoint',
                                                     normalizer_per_datapoint * log_likelihood))
            self._summaries.append(tf.summary.scalar('loss_per_datapoint',
                                                     normalizer_per_datapoint * self.loss))

        global_step, lr_base, lr_summary = optimizer.define_base_learning_rate(
            args)
        self._summaries.append(lr_summary)

        with tf.variable_scope('e_step'):
            opt_mean = optimizer.define_optimizer(args, args.lr0_mu * lr_base)
            variational_parameters_mean = tf.trainable_variables('means/')
            update_means = opt_mean.minimize(
                self.loss, global_step=global_step, var_list=variational_parameters_mean)
            log_file.write('# %d variational parameters for means\n' %
                           len(variational_parameters_mean))

            if args.em:
                opt_sigma = optimizer.define_optimizer(
                    args, args.lr0_sigma * lr_base)
                variational_parameters_sigma = (tf.trainable_variables('samples_e/') +
                                                tf.trainable_variables('samples_r/'))
                log_file.write('# %d variational parameters for standard deviations\n' %
                               len(variational_parameters_sigma))
                update_sigmas = opt_sigma.minimize(
                    self.loss, var_list=variational_parameters_sigma)
                self._e_step = tf.group(
                    update_means, update_sigmas, name='e_step')
            else:
                self._e_step = update_means

        if args.em:
            with tf.variable_scope('m_step'):
                lr_lambda = args.lr0_lambda * lr_base
                update_lambda_e = tf.assign(
                    self.inverse_lambda_e,
                    (1.0 - lr_lambda) * self.inverse_lambda_e
                    + lr_lambda * self.estimate_inverse_lambda(self.samples_e))
                update_lambda_r = tf.assign(
                    self.inverse_lambda_r,
                    (1.0 - lr_lambda) * self.inverse_lambda_r
                    + lr_lambda * self.estimate_inverse_lambda(self.samples_r))
                m_step = tf.group(
                    update_lambda_e, update_lambda_r, name='m_step')
                log_file.write('# 2 hyperparameters\n')
            self._em_step = tf.group(self._e_step, m_step, name='em_step')
        else:
            self._em_step = None

        self._summary_op = tf.summary.merge(self._summaries)
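# Standalone sketch of the smoothed M-step update used in the final model
# above (illustrative names; the real estimate comes from
# `estimate_inverse_lambda`): the inverse prior strength is moved toward a new
# estimate by an exponential moving average with step size `lr_lambda`.


def m_step_update(inverse_lambda, estimate, lr_lambda):
    """One smoothed M-step: (1 - lr) * old value + lr * new estimate."""
    return (1.0 - lr_lambda) * inverse_lambda + lr_lambda * estimate


# Example: m_step_update(2.0, 3.0, 0.1) == 0.9 * 2.0 + 0.1 * 3.0 == 2.1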