def _ModelFn(features, labels, mode):
      if is_training:
        logits_out = self._BuildGraph(features)
      else:
        graph_def = self._GetGraphDef(use_trt, batch_size, model_dir)
        logits_out = importer.import_graph_def(
            graph_def,
            input_map={INPUT_NODE_NAME: features},
            return_elements=[OUTPUT_NODE_NAME + ':0'],
            name='')[0]

      loss = losses.sparse_softmax_cross_entropy(
          labels=labels, logits=logits_out)
      summary.scalar('loss', loss)

      classes_out = math_ops.argmax(logits_out, axis=1, name='classes_out')
      accuracy = metrics.accuracy(
          labels=labels, predictions=classes_out, name='acc_op')
      summary.scalar('accuracy', accuracy[1])

      if mode == ModeKeys.EVAL:
        return EstimatorSpec(
            mode, loss=loss, eval_metric_ops={'accuracy': accuracy})
      elif mode == ModeKeys.TRAIN:
        optimizer = AdamOptimizer(learning_rate=1e-2)
        train_op = optimizer.minimize(loss, global_step=get_global_step())
        return EstimatorSpec(mode, loss=loss, train_op=train_op)
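
The model_fn above relies on names closed over from the surrounding test (`is_training`, `use_trt`, `batch_size`, `model_dir`, `INPUT_NODE_NAME`, `OUTPUT_NODE_NAME`, and the `self._BuildGraph`/`self._GetGraphDef` helpers), so it is not runnable on its own. A minimal self-contained sketch of the same tf.estimator pattern (TF 1.x; all names below are illustrative, not from the original test):

import numpy as np
import tensorflow as tf  # TF 1.x

def model_fn(features, labels, mode):
  logits = tf.layers.dense(features, 10)
  loss = tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=logits)
  if mode == tf.estimator.ModeKeys.EVAL:
    accuracy = tf.metrics.accuracy(
        labels=labels, predictions=tf.argmax(logits, axis=1))
    return tf.estimator.EstimatorSpec(
        mode, loss=loss, eval_metric_ops={'accuracy': accuracy})
  train_op = tf.train.AdamOptimizer(1e-2).minimize(
      loss, global_step=tf.train.get_global_step())
  return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

def input_fn():
  features = tf.constant(np.random.rand(8, 4), dtype=tf.float32)
  labels = tf.constant(np.random.randint(0, 10, size=8), dtype=tf.int64)
  return features, labels

estimator = tf.estimator.Estimator(model_fn=model_fn)
estimator.train(input_fn=input_fn, steps=5)
print(estimator.evaluate(input_fn=input_fn, steps=1))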
Example #2
def add_optimizer(loss):
    global_step = tf.Variable(0, trainable=False)
    optimizer = AdamOptimizer()
    grads_and_vars = optimizer.compute_gradients(loss)
    for grad, var in grads_and_vars:
        if grad is not None:
            tf.summary.histogram(var.op.name + '/gradients', grad)
    return optimizer.apply_gradients(grads_and_vars, global_step)
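
A hedged usage sketch for add_optimizer, assuming TF 1.x graph mode and that `AdamOptimizer` refers to `tf.train.AdamOptimizer` (as the snippet implies); the toy regression below is illustrative only:

import tensorflow as tf  # TF 1.x graph mode

AdamOptimizer = tf.train.AdamOptimizer  # binding assumed by add_optimizer above

x = tf.constant([[1.0], [2.0], [3.0]])
y = tf.constant([[2.0], [4.0], [6.0]])
w = tf.Variable([[0.0]])
loss = tf.reduce_mean(tf.square(tf.matmul(x, w) - y))

train_op = add_optimizer(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for _ in range(2000):
        sess.run(train_op)
    print(sess.run(w))  # approaches the least-squares solution [[2.0]]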
Example #3
 def test_optimizer_garbage_collection(self):
   graph = ops.Graph()
   with graph.as_default():
     optimizer = keras.optimizers.TFOptimizer(AdamOptimizer(0.01))
     keras.backend.track_tf_optimizer(optimizer)
     optimizer_weak = weakref.ref(optimizer)
   graph_weak = weakref.ref(graph)
   del graph, optimizer
   gc.collect()
   # Check that the weak references are dead now.
   self.assertIs(graph_weak(), None)
   self.assertIs(optimizer_weak(), None)
Example #4
    def __init__(self, inputs, network, check_point="dqn.ckpt"):
        self.saver = tf.train.Saver()
        self.summary_writer = tf.summary.FileWriter("/tmp/dqn")
        self.inputs = inputs
        self.network = network
        # Assumes self.output_shape is defined elsewhere (e.g., derived from `network`).
        self.targets = tf.placeholder(tf.float32, shape=(None, self.output_shape[1]))
        summary_names = ["actions", "loss", "exploration_rate", "fruits_eaten", "timesteps_survived"]

        self.summary_placeholders = {name: tf.placeholder(dtype=tf.float32) for name in summary_names}

        # self.summary_placeholders = [tf.placeholder(dtype=summary_variables[i].dtype)
        #                              for i in range(len(summary_names))]

        # summary_ops = [tf.assign(summary_variables[i],self.summary_placeholders[i])
        #                for i in range(len(summary_names))

        summary = [tf.summary.histogram(summary_names[i], self.summary_placeholders[summary_names[i]]) for i in
                   range(1)]
        summary += [tf.summary.scalar(summary_names[i], self.summary_placeholders[summary_names[i]]) for i in
                    range(1, len(summary_names))]

        self.summary_ops = tf.summary.merge_all()

        self.loss = tf.losses.mean_squared_error(self.network, self.targets)
        optimizer = AdamOptimizer()
        self.train_step = optimizer.minimize(loss=self.loss)
        #
        # with tf.colocate_with(global_step):
        #     self.update_op = tf.assign_add(global_step, 1)

        self.sess = tf.Session()

        self.summary_writer.add_graph(tf.get_default_graph())

        with self.sess.as_default():
            tf.global_variables_initializer().run()

        if os.path.exists(check_point):
            self.saver.restore(self.sess, check_point)
Example #5
    def get(self, name=None, lr_decay=None, global_step=None):
        params = {} if self.params is None else self.params.copy()
        with tf.variable_scope('opt'):
            lr_tensor = tf.get_variable('lr',
                                        dtype=tf.float32,
                                        initializer=tf.constant(
                                            params['learning_rate']),
                                        trainable=False)
            if lr_decay is not None:
                params['learning_rate'] = lr_decay(
                    learning_rate=params['learning_rate'],
                    global_step=global_step,
                    name='lr_decay')

            self.lr_op = lr_tensor if lr_decay is None else lr_tensor.assign(
                params['learning_rate'])
            params['learning_rate'] = self.lr_op
        if self.opt_name == "Adam":
            if name is None:
                return AdamOptimizer(**params)
            else:
                return AdamOptimizer(name=name, **params)
        elif self.opt_name == "Adadelta":
            if name is None:
                return AdadeltaOptimizer(**params)
            else:
                return AdadeltaOptimizer(name=name, **params)
        elif self.opt_name == "RMSprop":
            if name is None:
                return RMSPropOptimizer(**params)
            else:
                return RMSPropOptimizer(name=name, **params)
        elif self.opt_name == "Momentum":
            if name is None:
                return MomentumOptimizer(**params)
            else:
                return MomentumOptimizer(name=name, **params)
        else:
            raise NotImplementedError("Unsupported optimizer: {}".format(self.opt_name))
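
A hedged usage sketch for the factory's `get` method. The enclosing class (not shown) must already carry `opt_name` and `params`; the `lr_decay` callable is invoked with only `learning_rate`, `global_step`, and `name` keywords, so a schedule such as `tf.train.exponential_decay` needs its remaining arguments bound first. `opt_factory` and `some_loss` below are assumed to exist:

import functools
import tensorflow as tf  # TF 1.x

global_step = tf.train.get_or_create_global_step()
decay = functools.partial(tf.train.exponential_decay,
                          decay_steps=1000, decay_rate=0.96, staircase=True)

# opt_factory: an instance configured with opt_name="Adam" and
# params={"learning_rate": 1e-3}; some_loss: any scalar loss tensor.
optimizer = opt_factory.get(name="adam_opt", lr_decay=decay,
                            global_step=global_step)
train_op = optimizer.minimize(some_loss, global_step=global_step)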
Example #6
    def __init__(self, options, data_train, session=None):
        self.statistics = DBQAStatistics.from_data(data_train)
        self.options = options

        self.optimizer = AdamOptimizer()
        self.global_step = tf.train.get_or_create_global_step()

        self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)

        self.question_2d_pl = tf.placeholder(tf.int32, (None, None))
        self.question_bigram_2d_pl = tf.placeholder(tf.int32, (None, None))
        self.answer_2d_pl = tf.placeholder(tf.int32, (None, None))
        self.answer_bigram_2d_pl = tf.placeholder(tf.int32, (None, None))
        self.wrong_answer_2d_pl = tf.placeholder(tf.int32, (None, None))
        self.wrong_answer_bigram_2d_pl = tf.placeholder(tf.int32, (None, None))

        self.network = PairwiseSimilarity(options, self.statistics)
        self.loss, self.accuracy = self.network.get_loss(
            self.question_2d_pl,
            self.question_bigram_2d_pl,
            self.answer_2d_pl,
            self.answer_bigram_2d_pl,
            self.wrong_answer_2d_pl,
            self.wrong_answer_bigram_2d_pl,
        )

        self.similarity = self.network.get_similarity(
            self.question_2d_pl, self.question_bigram_2d_pl, self.answer_2d_pl,
            self.answer_bigram_2d_pl)

        self.optimize_op = self.optimizer.minimize(
            self.loss, global_step=self.global_step)

        if session is None:
            self.session = self.create_session()
            self.session.run(tf.global_variables_initializer())
        else:
            self.session = session
        self.random = Random(42)
Example #7
 def test_optimizer_garbage_collection(self):
     if context.executing_eagerly():
         self.skipTest('v1 optimizer does not run in eager mode')
     graph = ops.Graph()
     with graph.as_default():
         optimizer = optimizer_v1.TFOptimizer(AdamOptimizer(0.01))
         keras.backend.track_tf_optimizer(optimizer)
         optimizer_weak = weakref.ref(optimizer)
     graph_weak = weakref.ref(graph)
     del graph, optimizer
     gc.collect()
     # Check that the weak references are dead now.
     self.assertIs(graph_weak(), None)
     self.assertIs(optimizer_weak(), None)
Example #8
    def _add_optimizer(self):
        self.optimizer = AdamOptimizer()

        self.final_train_loss = self.main_train_loss

        with tf.variable_scope('l2_regularization'):
            # Find variables to regularize by iterating over all trainable variables and
            # checking membership in this set; there is no direct lookup by full variable name here.
            l2_regularized_names = {
                'encoder/bidirectional_rnn/fw/gru_cell/gates/weights:0'
                # If used, add additional complete variables names
            }
            l2_regularized = [
                variable for variable in tf.trainable_variables()
                if variable.name in l2_regularized_names
            ]

            # Note: in this snippet l2_loss is built but never added to final_train_loss.
            l2_loss = 0.001 * tf.add_n(
                [tf.nn.l2_loss(variable) for variable in l2_regularized])

        gradients = self.optimizer.compute_gradients(self.final_train_loss)

        with tf.variable_scope('gradient_clipping'):

            def clip_gradient(gradient, variable):
                # Only clip normal tensors, IndexedSlices gives warning otherwise
                if isinstance(gradient, tf.Tensor):
                    gradient = tf.clip_by_norm(gradient, 10)
                return gradient, variable

            gradients = [
                clip_gradient(gradient, variable)
                for gradient, variable in gradients
            ]
        self.minimize_operation = self.optimizer.apply_gradients(
            gradients, global_step=self.global_step)
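
The compute_gradients / clip / apply_gradients pattern used above, in a self-contained sketch (TF 1.x; the toy variable and loss are illustrative):

import tensorflow as tf  # TF 1.x graph mode

w = tf.Variable([3.0, -4.0])
loss = tf.reduce_sum(tf.square(w))

global_step = tf.train.get_or_create_global_step()
optimizer = tf.train.AdamOptimizer()

grads_and_vars = optimizer.compute_gradients(loss)
# Clip only dense gradients; IndexedSlices (e.g. from embedding lookups) pass through.
clipped = [(tf.clip_by_norm(g, 10) if isinstance(g, tf.Tensor) else g, v)
           for g, v in grads_and_vars]
train_op = optimizer.apply_gradients(clipped, global_step=global_step)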
Example #9
 def test_mixed_precision_loss_scale_optimizer(self):
     if context.executing_eagerly():
         self.skipTest('v1 optimizer does not run in eager mode')
     optimizer = MixedPrecisionLossScaleOptimizer(AdamOptimizer(),
                                                  'dynamic')
     model = keras.models.Sequential()
     model.add(
         keras.layers.Dense(2,
                            input_shape=(3, ),
                            kernel_constraint=keras.constraints.MaxNorm(1)))
     model.compile(loss='mean_squared_error',
                   optimizer=optimizer,
                   run_eagerly=testing_utils.should_run_eagerly())
     model.fit(np.random.random((5, 3)),
               np.random.random((5, 2)),
               epochs=1,
               batch_size=5,
               verbose=0)
Example #10
 def test_tfoptimizer(self):
   optimizer = keras.optimizers.TFOptimizer(AdamOptimizer(0.01))
   model = keras.models.Sequential()
   model.add(keras.layers.Dense(
       2, input_shape=(3,), kernel_constraint=keras.constraints.MaxNorm(1)))
   # This is possible
   model.compile(loss='mean_squared_error', optimizer=optimizer)
   keras.backend.track_tf_optimizer(optimizer)
   model.fit(np.random.random((5, 3)),
             np.random.random((5, 2)),
             epochs=1,
             batch_size=5,
             verbose=0)
   # not supported
   with self.assertRaises(NotImplementedError):
     _ = optimizer.weights
   with self.assertRaises(NotImplementedError):
     optimizer.get_config()
   with self.assertRaises(NotImplementedError):
     optimizer.from_config(None)
Example #11
def get_conv_classifier():
    n_classes = 5
    feature_columns = [layers.real_valued_column("", dimension=3)]

    # learning_rate = 1.0
    # optimizer = AdagradOptimizer(learning_rate)
    #
    # learning_rate = 1.0
    # optimizer = AdadeltaOptimizer(learning_rate=learning_rate)

    # ~ 62.55%
    learning_rate = 0.01
    optimizer = AdamOptimizer(learning_rate, epsilon=0.1)

    # learning_rate = 0.05
    # optimizer = GradientDescentOptimizer(learning_rate)

    # learning_rate = 0.1
    # optimizer = RMSPropOptimizer(learning_rate, momentum=0.1)

    # learning_rate = 0.1
    # optimizer = FtrlOptimizer(learning_rate)

    return SKCompat(
        Estimator(
            model_fn=get_conv_model,
            params={
                'head':
                head_lib._multi_class_head(  # pylint: disable=protected-access
                    n_classes,
                    enable_centered_bias=False),
                'feature_columns':
                feature_columns,
                'activation_fn':
                tf.nn.relu,
                'learning_rate':
                learning_rate,
                'optimizer':
                optimizer
            },
            model_dir='saved_model'))
Example #12
  def test_tf_optimizer_iterations(self):
    if testing_utils.should_run_tf_function() or context.executing_eagerly():
      self.skipTest(
          'v1 optimizer does not run in experimental_run_tf_function mode or '
          'eager mode')
    with self.cached_session():
      optimizer = keras.optimizers.TFOptimizer(AdamOptimizer(0.01))
      model = keras.models.Sequential()
      model.add(keras.layers.Dense(
          2, input_shape=(3,), kernel_constraint=keras.constraints.MaxNorm(1)))
      model.compile(
          loss='mean_squared_error',
          optimizer=optimizer,
          run_eagerly=testing_utils.should_run_eagerly(),
          experimental_run_tf_function=testing_utils.should_run_tf_function())
      keras.backend.track_tf_optimizer(optimizer)
      self.assertEqual(keras.backend.get_value(model.optimizer.iterations), 0)

      model.fit(np.random.random((55, 3)),
                np.random.random((55, 2)),
                epochs=1,
                batch_size=5,
                verbose=0)
      self.assertEqual(keras.backend.get_value(model.optimizer.iterations), 11)
Example #13
                      user_num,
                      item_num,
                      cum_table,
                      batch_size=batch_size,
                      max_len=max_len,
                      n_workers=3)

model, emb = build_model(max_len=max_len,
                         input_dim=item_num + 1,
                         embedding_dim=50,
                         feed_forward_units=50,
                         head_num=1,
                         block_num=2,
                         dropout_rate=0.2)

optimizer = AdamOptimizer(0.001)
tbcb = TensorBoard(log_dir='/logs',
                   histogram_freq=1,
                   write_graph=True,
                   write_grads=True,
                   write_images=True,
                   embeddings_freq=1)

loss_history = []
cos_loss_history = []

T = 0.0
t0 = time.time()

tbcb.set_model(model)
tbcb.on_train_begin()
Example #14
            def __init__(self, word_vector_size):
                tf.reset_default_graph()
                self.vector_size = word_vector_size

                self.vectors = tf.placeholder(tf.float32, shape=(None, None, word_vector_size))
                self.user_terms = tf.placeholder(tf.float32, shape=(None, None))
                self.ut2 = tf.placeholder(tf.float32, shape=(None, None))
                self.group_by = tf.placeholder(tf.float32, shape=(None, None, None))
                self.padding = tf.placeholder(tf.float32, shape=(None, None))
                self.output = tf.placeholder(tf.float32, shape=(None, 1))
                self.dropout_rate = tf.placeholder(tf.float32)

                xavier = tf.contrib.layers.xavier_initializer()

                # 50 tri-gram, 50 4-gram and 50 5-gram (note: the filter widths used below are 2, 3 and 5)
                filter_tri = tf.Variable(xavier((1, 2, word_vector_size, 50)), name="weight")  #
                bias_tri = tf.Variable(tf.zeros((1, 50)), name="bias")  #
                self.f3 = filter_tri
                self.b3 = bias_tri

                filter_4 = tf.Variable(xavier((1, 3, word_vector_size, 50)), name="weight")  #
                bias_4 = tf.Variable(tf.zeros((1, 50)), name="bias")
                self.f4 = filter_4
                self.b4 = bias_4

                filter_5 = tf.Variable(xavier((1, 5, word_vector_size, 50)), name="weight")  #
                bias_5 = tf.Variable(tf.zeros((1, 50)), name="bias")
                self.f5 = filter_5
                self.b5 = bias_5

                with tf.name_scope("relevance"):
                    hidden = 150
                    self.relevance_weight = tf.Variable(0.01 * xavier((hidden, 2)))
                    self.relevance_bias = tf.Variable(0.0 * xavier((1, 2)))
                    self.relevance_attention_weight = tf.Variable(0.01 * xavier((100, 2)))
                    self.relevance_attention_bias = tf.Variable(0.0 * xavier((1, 2)))

                rel, pre_max_true_dropped, pre_max_sum = self.forward(self.vectors)
                self.relevance = rel[:, 1]

                ut = tf.expand_dims(self.ut2, 2)  # NWC
                rel_masked, pre_max_true_masked_dropped, _ = self.forward(self.vectors * ut)
                self.rel_masked = rel_masked

                self.pre_max = pre_max_sum
                self.get_attention()

                # true_attention_error = 0.0
                att_reg = 0.0

                prediction_error = -tf.reduce_sum((self.output * tf.log(rel[:, 1] + 10 ** -5, name="log2rel") + (
                        1 - self.output) * tf.log(rel[:, 0] + 10 ** -5, name="log3rel")))

                # N, num_unique, text_length ; N,text_length
                pos_attention = tf.squeeze(tf.matmul(self.group_by, tf.expand_dims(self.pos_attention, -1)),
                                       squeeze_dims=-1)
                neg_attention = tf.squeeze(tf.matmul(self.group_by, tf.expand_dims(self.neg_attention, -1)),
                                       squeeze_dims=-1)
                self.pos_att_grouped = pos_attention
                self.neg_att_grouped = neg_attention

                pos_heads = tf.reduce_sum(tf.multiply(pos_attention, self.user_terms), axis=1)
                neg_heads = tf.reduce_sum(tf.multiply(neg_attention, self.user_terms), axis=1)
                self.pos_heads = pos_heads

                attention_error = 0.0
                occlusion_error = 0.0
                if use_attention:
                    attention_error += tf.reduce_sum(self.output*(pos_heads - 0.5) ** 2)
                    att_reg = tf.reduce_sum(self.output * tf.nn.relu(self.pos_attention - att_max_value)
                                                     + (1-self.output) * tf.nn.relu(self.neg_attention-att_max_value))
                    occlusion_error =  -tf.reduce_sum((self.output * tf.log(rel_masked[:, 1] + 10 ** -5, name="log2rel2") + (
                        1 - self.output) * tf.log(rel_masked[:, 0] + 10 ** -5, name="log3rel2")))


                self.att = attention_error

                self.error = (   prediction_error
                              + tf.sign(tf.reduce_sum(self.user_terms)) * attention_error
                              +  tf.sign(tf.reduce_sum(self.user_terms)) * occlusion_error
                              + tf.sign(tf.reduce_sum(self.user_terms)) * att_reg)

                self.a = tf.check_numerics(attention_error, message="att") + tf.check_numerics(pos_heads,
                                                                                               message="pos-heads") + tf.check_numerics(
                    neg_heads, message="neg-heads")
                self.opt = AdamOptimizer()
                self.optimizer = self.opt.minimize(self.error)
                self.uncertainty = 1

                self.sess = tf.Session()
                self.sess.run(tf.global_variables_initializer())
                self.n_trained = 0
                self.training = False
Example #15
        class CNN_prior:
            def get_attention(self):
                self.pos_attention = tf.reduce_sum(tf.gradients(self.pre_max[:, 1], self.vectors)[0] * self.vectors, axis=2)
                self.pos_attention = softmax_padding(self.pos_attention, self.padding, axis=1)

                self.neg_attention = tf.reduce_sum(tf.gradients(self.pre_max[:, 0], self.vectors)[0] * self.vectors, axis=2)
                self.neg_attention = softmax_padding(self.neg_attention, self.padding, axis=1)

            def forward(self, v):

                vectors2d = tf.expand_dims(v, 1)  # None x 1 x 200 x 300 ... NHWC

                conv1 = tf.nn.conv2d(
                    input=vectors2d,
                    filter=self.f3,
                    strides=[1, 1, 1, 1],
                    padding="VALID"
                )  # None x 1 x words x 50
                A1 = tf.nn.leaky_relu(conv1 + self.b3)

                self.a1 = A1
                conv2 = tf.nn.conv2d(
                    input=vectors2d,
                    filter=self.f4,
                    strides=[1, 1, 1, 1],
                    padding="VALID"
                )  # None x 1 x words x 50

                A2 = tf.nn.leaky_relu(conv2 + self.b4)
                self.a2 = A2

                conv3 = tf.nn.conv2d(
                    input=vectors2d,
                    filter=self.f5,
                    strides=[1, 1, 1, 1],
                    padding="VALID"
                )  # None x 1 x words x 50

                A3 = tf.nn.leaky_relu(conv3 + self.b5)

                max_A1_train = tf.reshape(tf.squeeze(tf.reduce_max(A1, 2)), [-1, 50])  # None x 50
                max_A2_train = tf.reshape(tf.squeeze(tf.reduce_max(A2, 2)), [-1, 50])  # None x 50
                max_A3_train = tf.reshape(tf.squeeze(tf.reduce_max(A3, 2)), [-1, 50])  # None x 50

                concat = tf.concat([max_A1_train, max_A2_train, max_A3_train], axis=1)
                concat_drop = tf.nn.dropout(concat,keep_prob=self.dropout_rate)
                pre_max_true_drop = tf.matmul(concat_drop, self.relevance_weight) + self.relevance_bias
                rel = tf.nn.softmax(pre_max_true_drop, axis=1)

                sum_A1_train = tf.reshape(tf.squeeze(tf.reduce_sum(A1, 2)), [-1, 50])  # None x 50
                sum_A2_train = tf.reshape(tf.squeeze(tf.reduce_sum(A2, 2)), [-1, 50])  # None x 50
                sum_A3_train = tf.reshape(tf.squeeze(tf.reduce_sum(A3, 2)), [-1, 50])  # None x 50

                concat_sums = tf.concat([sum_A1_train, sum_A2_train, sum_A3_train], axis=1)
                pre_max_sum = tf.matmul(concat_sums, self.relevance_weight) + self.relevance_bias
                return rel, pre_max_true_drop, pre_max_sum

            def groupby(self,att):
                return ndmatmul(self.group_by,att)

            def __init__(self, word_vector_size):
                tf.reset_default_graph()
                self.vector_size = word_vector_size

                self.vectors = tf.placeholder(tf.float32, shape=(None, None, word_vector_size))
                self.user_terms = tf.placeholder(tf.float32, shape=(None, None))
                self.ut2 = tf.placeholder(tf.float32, shape=(None, None))
                self.group_by = tf.placeholder(tf.float32, shape=(None, None, None))
                self.padding = tf.placeholder(tf.float32, shape=(None, None))
                self.output = tf.placeholder(tf.float32, shape=(None, 1))
                self.dropout_rate = tf.placeholder(tf.float32)

                xavier = tf.contrib.layers.xavier_initializer()

                # 50 tri-gram, 50 4-gram and 50 5-gram (note: the filter widths used below are 2, 3 and 5)
                filter_tri = tf.Variable(xavier((1, 2, word_vector_size, 50)), name="weight")  #
                bias_tri = tf.Variable(tf.zeros((1, 50)), name="bias")  #
                self.f3 = filter_tri
                self.b3 = bias_tri

                filter_4 = tf.Variable(xavier((1, 3, word_vector_size, 50)), name="weight")  #
                bias_4 = tf.Variable(tf.zeros((1, 50)), name="bias")
                self.f4 = filter_4
                self.b4 = bias_4

                filter_5 = tf.Variable(xavier((1, 5, word_vector_size, 50)), name="weight")  #
                bias_5 = tf.Variable(tf.zeros((1, 50)), name="bias")
                self.f5 = filter_5
                self.b5 = bias_5

                with tf.name_scope("relevance"):
                    hidden = 150
                    self.relevance_weight = tf.Variable(0.01 * xavier((hidden, 2)))
                    self.relevance_bias = tf.Variable(0.0 * xavier((1, 2)))
                    self.relevance_attention_weight = tf.Variable(0.01 * xavier((100, 2)))
                    self.relevance_attention_bias = tf.Variable(0.0 * xavier((1, 2)))

                rel, pre_max_true_dropped, pre_max_sum = self.forward(self.vectors)
                self.relevance = rel[:, 1]

                ut = tf.expand_dims(self.ut2, 2)  # NWC
                rel_masked, pre_max_true_masked_dropped, _ = self.forward(self.vectors * ut)
                self.rel_masked = rel_masked

                self.pre_max = pre_max_sum
                self.get_attention()

                # true_attention_error = 0.0
                att_reg = 0.0

                prediction_error = -tf.reduce_sum((self.output * tf.log(rel[:, 1] + 10 ** -5, name="log2rel") + (
                        1 - self.output) * tf.log(rel[:, 0] + 10 ** -5, name="log3rel")))

                # N, num_unique, text_length ; N,text_length
                pos_attention = tf.squeeze(tf.matmul(self.group_by, tf.expand_dims(self.pos_attention, -1)),
                                       squeeze_dims=-1)
                neg_attention = tf.squeeze(tf.matmul(self.group_by, tf.expand_dims(self.neg_attention, -1)),
                                       squeeze_dims=-1)
                self.pos_att_grouped = pos_attention
                self.neg_att_grouped = neg_attention

                pos_heads = tf.reduce_sum(tf.multiply(pos_attention, self.user_terms), axis=1)
                neg_heads = tf.reduce_sum(tf.multiply(neg_attention, self.user_terms), axis=1)
                self.pos_heads = pos_heads

                attention_error = 0.0
                occlusion_error = 0.0
                if use_attention:
                    attention_error += tf.reduce_sum(self.output*(pos_heads - 0.5) ** 2)
                    att_reg = tf.reduce_sum(self.output * tf.nn.relu(self.pos_attention - att_max_value)
                                                     + (1-self.output) * tf.nn.relu(self.neg_attention-att_max_value))
                    occlusion_error =  -tf.reduce_sum((self.output * tf.log(rel_masked[:, 1] + 10 ** -5, name="log2rel2") + (
                        1 - self.output) * tf.log(rel_masked[:, 0] + 10 ** -5, name="log3rel2")))


                self.att = attention_error

                self.error = (   prediction_error
                              + tf.sign(tf.reduce_sum(self.user_terms)) * attention_error
                              +  tf.sign(tf.reduce_sum(self.user_terms)) * occlusion_error
                              + tf.sign(tf.reduce_sum(self.user_terms)) * att_reg)

                self.a = tf.check_numerics(attention_error, message="att") + tf.check_numerics(pos_heads,
                                                                                               message="pos-heads") + tf.check_numerics(
                    neg_heads, message="neg-heads")
                self.opt = AdamOptimizer()
                self.optimizer = self.opt.minimize(self.error)
                self.uncertainty = 1

                self.sess = tf.Session()
                self.sess.run(tf.global_variables_initializer())
                self.n_trained = 0
                self.training = False

            def get_feed_dict(self, doc):
                return {self.vectors: np.array(doc.vectors, dtype=np.float32).reshape([1, -1, self.vector_size]),
                        self.output: [[doc.class_ * 1]],
                        self.user_terms: np.array(doc.user_terms, dtype=np.float32).reshape([1, -1]),
                        self.padding: np.array([1 for i in doc.words]).reshape([1, -1])}

            def blow_up(self, mat, num_rows, num_cols):
                # Pad every row to num_cols, then append all-zero rows up to num_rows.
                blowed_mat = [row + [0] * (num_cols - len(row)) for row in mat]
                blowed_mat += [[0] * num_cols for _ in range(num_rows - len(blowed_mat))]
                return blowed_mat

            def get_feed_dict_multiple(self, docs):
                dp = 0.7 if self.training else 1
                maximum = max([len(doc.vectors) for doc in docs])
                maximum = max([maximum,7])
                max_terms = max([len(doc.user_terms) for doc in docs])
                return {self.vectors: np.array(
                    [doc.vectors[:maximum] + [[0] * (self.vector_size)] * (maximum - len(doc.vectors[:maximum])) for doc
                     in
                     docs]).reshape([-1, maximum, self.vector_size]),
                        self.group_by:np.array([self.blow_up(doc.gb,max_terms,maximum) for doc in docs]),
                        self.ut2: np.array(
                            [doc.ut2[:maximum] + [0] * (maximum - len(doc.ut2[:maximum])) for doc in
                             docs]).reshape([-1, maximum]),
                        self.output: [[doc.class_ * 1] for doc in docs],
                        self.user_terms: np.array(
                            [doc.user_terms[:max_terms] + [0] * (max_terms - len(doc.user_terms[:max_terms])) for doc in
                             docs]).reshape([-1, max_terms]),
                        self.padding: np.array(
                            [[1] * len(doc.vectors[:maximum]) + [0] * (maximum - len(doc.vectors[:maximum])) for doc in
                             docs]).reshape([-1, maximum]),
                        self.dropout_rate:dp}

            def load(self, filename):
                saver = tf.train.Saver()
                saver.restore(self.sess, filename)
                pass

            def train(self, docs, train_full=False):
                self.training = True
                self.sess.run(tf.global_variables_initializer())
                sess = self.sess
                print("====23")
                n = len(docs)
                epochs = 200
                if train_full:
                    epochs = 10
                self.n_trained = n
                import random
                random.shuffle(docs)
                last_10 = [100] * 10
                prev_error = None
                for epoch in range(epochs):
                    total_error = 0
                    for doc_s in [docs[i:i + 1] for i in range(0, len(docs), 1)]:
                        fd = self.get_feed_dict_multiple(doc_s)
                        try:
                            sess.run(self.a, feed_dict=fd)
                        except Exception as e:
                            print("check")
                        _, error = sess.run([self.optimizer, self.error], feed_dict=fd)
                        # print(x,y)
                        # if epoch>50 and x>=0.5:
                        #     print("ch")
                        # print(error,error-x,x)
                        total_error += error
                    total_error = total_error / len(docs)
                    # print(total_error)
                    if train_full:
                        saver = tf.train.Saver()
                        saver.save(sess, "./{}.pkl".format(epoch))
                    # print(total_error)
                    if epoch>10 and total_error > 4:
                        self.train(docs)
                        return
                    last_10.pop(0)
                    last_10.append(total_error)
                    if max(last_10) < 0.05:
                        print("breaking")
                        break
                print(total_error)
                self.training = False

            def run(self, docs):
                sess = self.sess
                for doc_s in [docs[i:i + 1] for i in range(0, len(docs), 1)]:
                    fd = self.get_feed_dict_multiple(doc_s)

                    try:
                        l1 = sess.run([self.relevance, self.pos_att_grouped, self.neg_att_grouped,self.pos_heads],
                                  feed_dict=fd)
                    except Exception as e:
                        print("here")
                    for ind, doc in enumerate(doc_s):
                        d = {
                            "rel": l1[0][ind],
                            "pos_att": l1[1][ind],
                            "neg_att": l1[2][ind],
                            "pos_heads": l1[3][ind]
                        }
                        doc.pred_class = 0 if d["rel"] < 0.5 else 1
                        doc.parameters = d
Example #16
            def __init__(self, word_vector_size):
                tf.reset_default_graph()
                self.vector_size = word_vector_size

                self.vectors = tf.placeholder(tf.float32,
                                              shape=(None, None,
                                                     word_vector_size))
                self.user_terms = tf.placeholder(tf.float32,
                                                 shape=(None, None))
                self.padding = tf.placeholder(tf.float32, shape=(None, None))
                self.output = tf.placeholder(tf.float32, shape=(None, 1))
                self.dropout_rate = tf.placeholder(tf.float32)

                xavier = tf.contrib.layers.xavier_initializer()

                # 50 tri-gram, 50 4-gram and 50 5-gram
                filter_tri = tf.Variable(xavier((1, 3, word_vector_size, 50)),
                                         name="weight")  #
                bias_tri = tf.Variable(tf.zeros((1, 50)), name="bias")  #
                self.f3 = filter_tri
                self.b3 = bias_tri

                filter_4 = tf.Variable(xavier((1, 4, word_vector_size, 50)),
                                       name="weight")  #
                bias_4 = tf.Variable(tf.zeros((1, 50)), name="bias")
                self.f4 = filter_4
                self.b4 = bias_4

                filter_5 = tf.Variable(xavier((1, 5, word_vector_size, 50)),
                                       name="weight")  #
                bias_5 = tf.Variable(tf.zeros((1, 50)), name="bias")
                self.f5 = filter_5
                self.b5 = bias_5

                with tf.name_scope("relevance"):
                    hidden = 150
                    self.relevance_weight = tf.Variable(0.01 * xavier(
                        (hidden, 2)))
                    self.relevance_bias = tf.Variable(0.0 * xavier((1, 2)))
                    self.relevance_attention_weight = tf.Variable(
                        0.01 * xavier((100, 2)))
                    self.relevance_attention_bias = tf.Variable(0.0 * xavier(
                        (1, 2)))

                rel, pre_max_true_dropped, pre_max_sum = self.forward(
                    self.vectors)
                self.relevance = rel[:, 1]

                ut = tf.expand_dims(self.user_terms, 2)  # NWC
                rel_masked, pre_max_true_masked_dropped, _ = self.forward(
                    self.vectors * ut)
                self.rel_masked = rel_masked

                self.pre_max_sum = pre_max_sum
                self.get_attribution()

                prediction_error = -tf.reduce_sum(
                    (self.output * tf.log(rel[:, 1] + 10**-5, name="log2rel") +
                     (1 - self.output) *
                     tf.log(rel[:, 0] + 10**-5, name="log3rel")))

                pos_heads = tf.reduce_sum(tf.multiply(self.pos_attribution,
                                                      self.user_terms),
                                          axis=1)
                neg_heads = tf.reduce_sum(tf.multiply(self.neg_attribution,
                                                      self.user_terms),
                                          axis=1)

                misattribution_error = 0.0
                corrective_error = 0.0
                att_reg = 0.0

                if use_attribution:
                    misattribution_error += tf.reduce_sum(
                        self.output * (pos_heads - 0.9)**2 +
                        (1 - self.output) * (neg_heads - 0.9)**2)
                    att_reg = tf.reduce_sum(
                        self.output *
                        tf.nn.relu(self.pos_attribution - att_max_value) +
                        (1 - self.output) *
                        tf.nn.relu(self.neg_attribution - att_max_value))

                    corrective_error = -tf.reduce_sum(
                        (self.output *
                         tf.log(rel_masked[:, 1] + 10**-5, name="log2rel2") +
                         (1 - self.output) *
                         tf.log(rel_masked[:, 0] + 10**-5, name="log3rel2")))

                self.error = (
                    prediction_error +
                    tf.sign(tf.reduce_sum(self.user_terms)) *
                    (misattribution_error + corrective_error + att_reg))

                self.opt = AdamOptimizer()
                self.optimizer = self.opt.minimize(self.error)

                self.sess = tf.Session()
                self.sess.run(tf.global_variables_initializer())
                self.training = False
Example #17
        class CNN_prior:
            def get_attribution(self):
                self.pos_attribution = tf.reduce_sum(
                    tf.gradients(self.pre_max_sum[:, 1], self.vectors)[0] *
                    self.vectors,
                    axis=2)
                self.pos_attribution = softmax_padding(self.pos_attribution,
                                                       self.padding,
                                                       axis=1)

                self.neg_attribution = tf.reduce_sum(
                    tf.gradients(self.pre_max_sum[:, 0], self.vectors)[0] *
                    self.vectors,
                    axis=2)
                self.neg_attribution = softmax_padding(self.neg_attribution,
                                                       self.padding,
                                                       axis=1)

            def forward(self, v):

                vectors2d = tf.expand_dims(v,
                                           1)  # None x 1 x 200 x 300 ... NHWC

                conv1 = tf.nn.conv2d(input=vectors2d,
                                     filter=self.f3,
                                     strides=[1, 1, 1, 1],
                                     padding="VALID")  # None x 1 x words x 50
                A1 = tf.nn.leaky_relu(conv1 + self.b3)

                self.a1 = A1
                conv2 = tf.nn.conv2d(input=vectors2d,
                                     filter=self.f4,
                                     strides=[1, 1, 1, 1],
                                     padding="VALID")  # None x 1 x words x 50

                A2 = tf.nn.leaky_relu(conv2 + self.b4)
                self.a2 = A2

                conv3 = tf.nn.conv2d(input=vectors2d,
                                     filter=self.f5,
                                     strides=[1, 1, 1, 1],
                                     padding="VALID")  # None x 1 x words x 5

                A3 = tf.nn.leaky_relu(conv3 + self.b5)

                max_A1_train = tf.reshape(tf.squeeze(tf.reduce_max(A1, 2)),
                                          [-1, 50])  # None x 50
                max_A2_train = tf.reshape(tf.squeeze(tf.reduce_max(A2, 2)),
                                          [-1, 50])  # None x 50
                max_A3_train = tf.reshape(tf.squeeze(tf.reduce_max(A3, 2)),
                                          [-1, 50])  # None x 50

                concat = tf.concat([max_A1_train, max_A2_train, max_A3_train],
                                   axis=1)
                concat_drop = tf.nn.dropout(concat,
                                            keep_prob=self.dropout_rate)
                pre_max_true_drop = tf.matmul(
                    concat_drop, self.relevance_weight) + self.relevance_bias
                rel = tf.nn.softmax(pre_max_true_drop, axis=1)

                sum_A1_train = tf.reshape(tf.squeeze(tf.reduce_sum(A1, 2)),
                                          [-1, 50])  # None x 50
                sum_A2_train = tf.reshape(tf.squeeze(tf.reduce_sum(A2, 2)),
                                          [-1, 50])  # None x 50
                sum_A3_train = tf.reshape(tf.squeeze(tf.reduce_sum(A3, 2)),
                                          [-1, 50])  # None x 50

                concat_sums = tf.concat(
                    [sum_A1_train, sum_A2_train, sum_A3_train], axis=1)
                pre_max_sum = tf.matmul(
                    concat_sums, self.relevance_weight) + self.relevance_bias
                return rel, pre_max_true_drop, pre_max_sum

            def __init__(self, word_vector_size):
                tf.reset_default_graph()
                self.vector_size = word_vector_size

                self.vectors = tf.placeholder(tf.float32,
                                              shape=(None, None,
                                                     word_vector_size))
                self.user_terms = tf.placeholder(tf.float32,
                                                 shape=(None, None))
                self.padding = tf.placeholder(tf.float32, shape=(None, None))
                self.output = tf.placeholder(tf.float32, shape=(None, 1))
                self.dropout_rate = tf.placeholder(tf.float32)

                xavier = tf.contrib.layers.xavier_initializer()

                # 50 tri-gram, 50 4-gram and 50 5-gram
                filter_tri = tf.Variable(xavier((1, 3, word_vector_size, 50)),
                                         name="weight")  #
                bias_tri = tf.Variable(tf.zeros((1, 50)), name="bias")  #
                self.f3 = filter_tri
                self.b3 = bias_tri

                filter_4 = tf.Variable(xavier((1, 4, word_vector_size, 50)),
                                       name="weight")  #
                bias_4 = tf.Variable(tf.zeros((1, 50)), name="bias")
                self.f4 = filter_4
                self.b4 = bias_4

                filter_5 = tf.Variable(xavier((1, 5, word_vector_size, 50)),
                                       name="weight")  #
                bias_5 = tf.Variable(tf.zeros((1, 50)), name="bias")
                self.f5 = filter_5
                self.b5 = bias_5

                with tf.name_scope("relevance"):
                    hidden = 150
                    self.relevance_weight = tf.Variable(0.01 * xavier(
                        (hidden, 2)))
                    self.relevance_bias = tf.Variable(0.0 * xavier((1, 2)))
                    self.relevance_attention_weight = tf.Variable(
                        0.01 * xavier((100, 2)))
                    self.relevance_attention_bias = tf.Variable(0.0 * xavier(
                        (1, 2)))

                rel, pre_max_true_dropped, pre_max_sum = self.forward(
                    self.vectors)
                self.relevance = rel[:, 1]

                ut = tf.expand_dims(self.user_terms, 2)  # NWC
                rel_masked, pre_max_true_masked_dropped, _ = self.forward(
                    self.vectors * ut)
                self.rel_masked = rel_masked

                self.pre_max_sum = pre_max_sum
                self.get_attribution()

                prediction_error = -tf.reduce_sum(
                    (self.output * tf.log(rel[:, 1] + 10**-5, name="log2rel") +
                     (1 - self.output) *
                     tf.log(rel[:, 0] + 10**-5, name="log3rel")))

                pos_heads = tf.reduce_sum(tf.multiply(self.pos_attribution,
                                                      self.user_terms),
                                          axis=1)
                neg_heads = tf.reduce_sum(tf.multiply(self.neg_attribution,
                                                      self.user_terms),
                                          axis=1)

                misattribution_error = 0.0
                corrective_error = 0.0
                att_reg = 0.0

                if use_attribution:
                    misattribution_error += tf.reduce_sum(
                        self.output * (pos_heads - 0.9)**2 +
                        (1 - self.output) * (neg_heads - 0.9)**2)
                    att_reg = tf.reduce_sum(
                        self.output *
                        tf.nn.relu(self.pos_attribution - att_max_value) +
                        (1 - self.output) *
                        tf.nn.relu(self.neg_attribution - att_max_value))

                    corrective_error = -tf.reduce_sum(
                        (self.output *
                         tf.log(rel_masked[:, 1] + 10**-5, name="log2rel2") +
                         (1 - self.output) *
                         tf.log(rel_masked[:, 0] + 10**-5, name="log3rel2")))

                self.error = (
                    prediction_error +
                    tf.sign(tf.reduce_sum(self.user_terms)) *
                    (misattribution_error + corrective_error + att_reg))

                self.opt = AdamOptimizer()
                self.optimizer = self.opt.minimize(self.error)

                self.sess = tf.Session()
                self.sess.run(tf.global_variables_initializer())
                self.training = False

            def get_feed_dict_multiple(self, docs):
                dp = 0.7 if self.training else 1
                maximum = max([len(doc.vectors) for doc in docs] + [5])
                return {
                    self.vectors:
                    np.array([
                        doc.vectors[:maximum] + [[0] * (self.vector_size)] *
                        (maximum - len(doc.vectors[:maximum])) for doc in docs
                    ]).reshape([-1, maximum, self.vector_size]),
                    self.output: [[doc.class_ * 1] for doc in docs],
                    self.user_terms:
                    np.array([
                        doc.user_terms[:maximum] + [0] *
                        (maximum - len(doc.user_terms[:maximum]))
                        for doc in docs
                    ]).reshape([-1, maximum]),
                    self.padding:
                    np.array([[1] * len(doc.vectors[:maximum]) + [0] *
                              (maximum - len(doc.vectors[:maximum]))
                              for doc in docs]).reshape([-1, maximum]),
                    self.dropout_rate:
                    dp
                }

            def train(self, docs):
                self.training = True

                # Re-initialize the machine during every training round
                self.sess.run(tf.global_variables_initializer())
                sess = self.sess
                print("====")

                epochs = 200  # maximum training epochs
                random.shuffle(docs)

                last_10 = [100] * 10

                for epoch in range(epochs):
                    total_error = 0
                    # Stochastic Gradient Descent (mini-batch size = 1) works best.
                    for doc_s in [
                            docs[i:i + 1] for i in range(0, len(docs), 1)
                    ]:
                        fd = self.get_feed_dict_multiple(doc_s)
                        _, error = sess.run([self.optimizer, self.error],
                                            feed_dict=fd)
                        total_error += error
                    total_error = total_error / len(docs)

                    if epoch > 10 and total_error > 4:
                        self.train(docs)
                        return
                    last_10.pop(0)
                    last_10.append(total_error)
                    if max(last_10) < 0.05:
                        print("breaking")
                        break
                print(total_error)
                self.training = False

            def run(self, docs):
                random.shuffle(docs)
                sess = self.sess
                num_correct = 0
                num_seen = 0
                for doc_s in [docs[i:i + 1] for i in range(0, len(docs), 1)]:
                    fd = self.get_feed_dict_multiple(doc_s)
                    l1 = sess.run([
                        self.relevance, self.pos_attribution,
                        self.neg_attribution
                    ],
                                  feed_dict=fd)
                    for ind, doc in enumerate(doc_s):
                        d = {
                            "rel": l1[0][ind],
                            "pos_att": l1[1][ind],
                            "neg_att": l1[2][ind]
                        }
                        doc.pred_class = 0 if d["rel"] < 0.5 else 1
                        doc.parameters = d
                        num_correct += 1 * (doc.pred_class == doc.class_)
                        num_seen += 1
                    if num_seen % 1000 == 0:
                        print(num_correct / num_seen * 100)
Example #18
    # Build Model
    model = Sequential()
    model.add(Embedding(len(vocab), args.embedding_size, input_length=max_answer_len))
    model.add(Dropout(args.dropout))
    if args.flatten:
        model.add(Flatten())
        model.add(Reshape((1, args.embedding_size * max_answer_len)))
    if args.lstm_dim_2:
        model.add(LSTM(args.lstm_dim_1, return_sequences=True))
        model.add(LSTM(args.lstm_dim_2, return_sequences=False))
    else:
        model.add(LSTM(args.lstm_dim_1, return_sequences=False))
    model.add(Dropout(args.dropout))
    model.add(Dense(1, activation="linear"))
    optimizer = AdamOptimizer()
    model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['acc'])


    # Train the model
    model.fit(train_x, train_y, epochs=args.epochs, verbose=0)
    
    # Validate
    test_y = test_data.iloc[:, 0]
    test_x = test_data.iloc[:, 1:]
    score = model.evaluate(test_x, test_y, verbose=0)
    print(f"Validation_loss:{score[0]};Validation_accuracy:{score[1]};")

    ## --- End of your code  --- ##

    # Save the trained model
Example #19
    def __init__(self, args):
        self.inputs = tf.placeholder(
            tf.int32, shape=[args.batch_size, args.sequence_length])
        self.targets = tf.placeholder(
            tf.int32, shape=[args.batch_size, args.sequence_length])
        with tf.name_scope("embedding"):
            embedding_size = int(sqrt(args.vocab_source_size) + 1)
            embedding = tf.get_variable(
                'embedding',
                shape=[args.vocab_source_size,
                       embedding_size],  #embed them in a small space
                initializer=tf.contrib.layers.xavier_initializer())
            embedded = tf.nn.embedding_lookup(embedding, self.inputs)
            #tensor of shape [batch_size*sequence_length*embedding_size]
            embedded_inputs = tf.unpack(embedded, axis=0)
            #assert embedded_inputs[0].get_shape() == (args.batch_size,args.sequence_length,embedding_size)

            #reshape it to a list of timesteps
            embedded_inputs_by_timestamp = [
                tf.reshape(i, (args.batch_size, embedding_size))
                for i in tf.split(1, args.sequence_length, embedded)
            ]
            assert len(embedded_inputs_by_timestamp) == args.sequence_length
            for timestep in embedded_inputs_by_timestamp:
                assert timestep.get_shape() == (args.batch_size,
                                                embedding_size)

        with tf.variable_scope("bidi_rnn") as bidi_scope:
            cell = LSTM_factory(args.hidden_size,
                                args.num_layers,
                                dropout=args.dropout)
            outputs, fwd_state, bwd_state = tf.nn.bidirectional_rnn(
                cell_fw=cell,
                cell_bw=cell,
                inputs=embedded_inputs_by_timestamp,
                dtype=tf.float32)

        with tf.variable_scope("decoder_rnn"):
            decoder_cell = LSTM_factory(args.hidden_size,
                                        args.num_layers * 2,
                                        dropout=args.dropout)
            decoder_cell = AttentionCellWrapper(cell=decoder_cell,
                                                attn_length=args.hidden_size,
                                                state_is_tuple=True)
            final_outputs, state = tf.nn.rnn(cell=decoder_cell,
                                             inputs=outputs,
                                             dtype=tf.float32)

        with tf.variable_scope("logits") as logits_scope:
            # Reshaping to apply the same weights over the timesteps
            outputs = tf.pack(final_outputs)
            outputs = tf.transpose(outputs, [1, 0, 2])

            logits = tf.contrib.layers.fully_connected(
                inputs=outputs,
                num_outputs=args.vocab_target_size,
                activation_fn=None,
                weights_initializer=tf.contrib.layers.xavier_initializer(),
                scope=logits_scope)

            self.logits = logits

        with tf.variable_scope("loss"):
            #flat_targets = tf.reshape(self.targets, [-1])
            #flat_logits = tf.reshape(logits, [-1, args.vocab_target_size])
            assert logits.get_shape()[:-1] == self.targets.get_shape(
            ), 'l = {0} t = {1}'.format(logits.get_shape(),
                                        self.targets.get_shape())
            losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits, self.targets)

            batch_loss = tf.reduce_sum(losses, name="batch_loss")
            tf.contrib.losses.add_loss(batch_loss)
            total_loss = tf.contrib.losses.get_total_loss()

            # Add summaries.
            tf.scalar_summary("batch_loss", batch_loss)
            tf.scalar_summary("total_loss", total_loss)

            self.total_loss = total_loss
            self.batch_loss = batch_loss
            self.target_cross_entropy_losses = losses  # Used in evaluation.

        with tf.name_scope("optimization"):
            opt = AdamOptimizer(learning_rate=args.learning_rate)
            gvs = opt.compute_gradients(self.batch_loss)
            capped_gvs = [(tf.clip_by_value(grad, -1., 1.), var)
                          for grad, var in gvs]
            train_op = opt.apply_gradients(capped_gvs)

        for var in tf.trainable_variables():
            tf.histogram_summary(var.op.name, var)

        for grad, var in gvs:

            if grad is not None:
                print(capped_gvs)
                tf.histogram_summary(
                    var.op.name + '/gradients',
                    grad,
                )

        with tf.name_scope("tensors"):
            self.train_op = train_op
            self.logits = logits
            self.total_loss = total_loss
            self.summaries = tf.merge_all_summaries()
Example #20
class QuerySumModel:
    '''
    The QuerySum model itself.
    '''
    def __init__(self,
                 mode,
                 word_dict,
                 word_embedding_dim,
                 vocabulary,
                 initial_vocabulary_embeddings,
                 target_vocabulary_size,
                 cell='gru'):
        '''
        Args:
          self: QuerySumModel.
          mode: str, one of 'train', 'validate', or 'decode'.
          word_dict: dict, map from words to their embeddings.
          word_embedding_dim: int, the dimension of a single embedding.
          vocabulary: Vocabulary.
          initial_vocabulary_embeddings: np.ndarray.
          target_vocabulary_size: int.
          cell: 'gru' or 'lstm', the type of RNN unit to use.
        '''

        self.word_dict = word_dict
        self.word_embedding_dim = word_embedding_dim
        self.summary_vocabulary = vocabulary
        self.target_vocabulary_size = min(len(vocabulary.words),
                                          target_vocabulary_size)
        self.embeddings = tf.Variable(initial_vocabulary_embeddings,
                                      name='embeddings')

        self.documents_placeholder = tf.placeholder(tf.int32,
                                                    shape=[None, None])
        self.document_lengths_placeholder = tf.placeholder(tf.int32,
                                                           shape=[None])
        self.queries_placeholder = tf.placeholder(tf.int32, shape=[None, None])
        self.query_lengths_placeholder = tf.placeholder(tf.int32, shape=[None])
        self.references_placeholder = tf.placeholder(tf.int32,
                                                     shape=[None, None])
        self.reference_lengths_placeholder = tf.placeholder(tf.int32,
                                                            shape=[None])
        self.pointer_reference_placeholder = tf.placeholder(tf.int32,
                                                            shape=[None, None])
        self.pointer_switch_placeholder = tf.placeholder(tf.int32,
                                                         shape=[None, None])

        self.epoch = tf.Variable(0, name='epoch', trainable=False)
        self.global_step = tf.Variable(0, name='global_step', trainable=False)
        self.best_validation_loss = tf.Variable(np.inf,
                                                name='best_validation_loss',
                                                trainable=False)
        self.new_best_validation = tf.placeholder(tf.float32, shape=[])
        self.best_validation_assign = self.best_validation_loss.assign(
            self.new_best_validation)

        self.increment_epoch_op = tf.assign(self.epoch, self.epoch + 1)

        self.batch_size = tf.shape(self.documents_placeholder)[0]

        self.dropout_enabled = False

        self.encoder_cell_state_size = 256
        self.encoder_output_size = 2 * self.encoder_cell_state_size
        self.decoder_cell_state_size = self.encoder_output_size

        self.decoder_vocab_hidden_size = 256

        self.attention_hidden_output_size = 256
        # Size is that of decoder state + encoder hidden state + query reader state
        self.attention_hidden_input_size = (self.decoder_cell_state_size +
                                            self.encoder_output_size +
                                            self.encoder_cell_state_size)

        self.beam_width_placeholder = tf.placeholder(tf.int32, shape=[])
        self.decode_last_output_placeholder = tf.placeholder(tf.int32,
                                                             shape=[None])

        self.initial_decoder_state_placeholder = tf.placeholder(
            tf.float32, shape=[None, self.decoder_cell_state_size])

        self.pre_computed_encoder_states_placeholder = tf.placeholder(
            tf.float32, shape=[None, None, self.encoder_output_size])

        self.pre_computed_query_state_placeholder = tf.placeholder(
            tf.float32, shape=[None, self.encoder_cell_state_size])

        self.query_attention_partial_score_placeholder = tf.placeholder(
            tf.float32, shape=[None, self.attention_hidden_output_size])

        self.encoder_state_attention_partial_scores_placeholder = tf.placeholder(
            tf.float32, shape=[None, None, self.attention_hidden_output_size])

        self.mode = mode

        if cell == 'gru':
            self.cell = GRUCell
        elif cell == 'lstm':
            self.cell = lambda *args, **kwargs: LSTMCell(
                *args, **kwargs, state_is_tuple=False)
        else:
            raise Exception('{} is not a valid RNN cell'.format(cell))

        self.output_keep_prob = 0.8  # DropoutWrapper keep probability

        self._build_graph(mode=mode)

    def _build_graph(self, mode):
        '''
        A simple wrapper for the other graph-building methods.

        Args:
          self: QuerySumModel.
          mode: str.
        '''

        self._add_encoders()
        self._add_decoder(mode)
        if mode == 'train':
            self._add_optimizer()

    def _add_encoders(self):
        '''
        Build the model's encoder and add it to the graph.

        Args:
          self: QuerySumModel.
        '''

        with tf.variable_scope('query_encoder'):
            query_encoder_cell = self.cell(self.encoder_cell_state_size)
            if self.dropout_enabled and self.mode != 'decode':
                query_encoder_cell = DropoutWrapper(
                    cell=query_encoder_cell,
                    output_keep_prob=self.output_keep_prob)

            query_embeddings = tf.nn.embedding_lookup(self.embeddings,
                                                      self.queries_placeholder)

            query_encoder_outputs, _ = rnn.dynamic_rnn(
                query_encoder_cell,
                query_embeddings,
                sequence_length=self.query_lengths_placeholder,
                swap_memory=True,
                dtype=tf.float32)

            # Because the query is short, almost all of its information can be
            # captured in a single context vector, so we extract and save the
            # final query encoder output.
            self.query_last = query_encoder_outputs[:, -1, :]
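            # Side note (a sketch, not part of the original model): dynamic_rnn
            # zero-pads outputs past each sequence's length, so an alternative is
            # to gather the output at the last *valid* step per query, e.g.:
            #   last_idx = tf.stack([tf.range(self.batch_size),
            #                        self.query_lengths_placeholder - 1], axis=1)
            #   self.query_last = tf.gather_nd(query_encoder_outputs, last_idx)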

        with tf.variable_scope('encoder'):
            fw_cell = self.cell(self.encoder_cell_state_size)
            bw_cell = self.cell(self.encoder_cell_state_size)

            if self.dropout_enabled and self.mode != 'decode':
                fw_cell = DropoutWrapper(
                    cell=fw_cell, output_keep_prob=self.output_keep_prob)
                bw_cell = DropoutWrapper(
                    cell=bw_cell, output_keep_prob=self.output_keep_prob)

            embeddings = tf.nn.embedding_lookup(self.embeddings,
                                                self.documents_placeholder)

            (encoder_outputs_fw,
             encoder_outputs_bw), _ = rnn.bidirectional_dynamic_rnn(
                 fw_cell,
                 bw_cell,
                 embeddings,
                 sequence_length=self.document_lengths_placeholder,
                 swap_memory=True,
                 dtype=tf.float32)

            # Unlike the query, the document can be very complex, making it
            # difficult to encode all of its information into a single context
            # vector. Instead, we use attention, so we need to track all the
            # cell outputs. In addition, we need to save the final encoder
            # state so we can initialize the decoder's state to it.
            self.encoder_outputs = tf.concat(
                [encoder_outputs_fw, encoder_outputs_bw], 2)
            self.final_encoder_state = self.encoder_outputs[:, -1, :]
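            # Resulting shapes: encoder_outputs is
            # [batch, document_length, 2 * encoder_cell_state_size] and
            # final_encoder_state is [batch, 2 * encoder_cell_state_size],
            # which matches decoder_cell_state_size by construction.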

    def _add_decoder(self, mode):
        '''
        Args:
          self: QuerySumModel.
          mode: str.
        '''

        with tf.variable_scope('decoder') as scope:
            decoder_cell = self.cell(self.decoder_cell_state_size)
            if self.dropout_enabled and self.mode != 'decode':
                decoder_cell = DropoutWrapper(
                    cell=decoder_cell, output_keep_prob=self.output_keep_prob)

            # W^{(1)}_{gen}
            self.vocabulary_project_w_1 = tf.get_variable(
                name='vocabulary_project_w_1',
                shape=[
                    decoder_cell.output_size + self.encoder_output_size,
                    self.decoder_vocab_hidden_size
                ])

            self.vocabulary_project_w_2 = tf.get_variable(
                name='vocabulary_project_w_2',
                shape=[
                    self.decoder_vocab_hidden_size, self.target_vocabulary_size
                ])

            self.vocabulary_project_b_1 = tf.get_variable(
                name='vocabulary_project_b_1',
                initializer=tf.zeros_initializer(),
                shape=[self.decoder_vocab_hidden_size])

            self.vocabulary_project_b_2 = tf.get_variable(
                name='vocabulary_project_b_2',
                initializer=tf.zeros_initializer(),
                shape=[self.target_vocabulary_size])

            self.pointer_probability_project_w = tf.get_variable(
                name='pointer_probability_project_w',
                shape=[
                    self.encoder_output_size + self.decoder_cell_state_size +
                    self.word_embedding_dim, 1
                ])

            self.pointer_probability_project_b = tf.get_variable(
                name='pointer_probability_project_b',
                initializer=tf.zeros_initializer(),
                shape=[1])

            self.attention_w = tf.get_variable(
                name='attention_w',
                shape=[
                    self.decoder_cell_state_size,
                    self.attention_hidden_output_size
                ],
                dtype=tf.float32)

            self.attention_w_e = tf.get_variable(
                name='attention_w_e',
                shape=[
                    self.word_embedding_dim, self.attention_hidden_output_size
                ],
                dtype=tf.float32)

            self.attention_w_q = tf.get_variable(
                name='attention_w_q',
                shape=[
                    self.encoder_cell_state_size,
                    self.attention_hidden_output_size
                ],
                dtype=tf.float32)

            self.attention_w_d = tf.get_variable(
                name='attention_w_d',
                shape=[
                    self.encoder_output_size, self.attention_hidden_output_size
                ],
                dtype=tf.float32)

            self.attention_v = tf.get_variable(
                name='attention_v',
                shape=[self.attention_hidden_output_size],
                dtype=tf.float32)

            self.attention_b = tf.get_variable(
                name='attention_b',
                initializer=tf.zeros_initializer(),
                shape=[self.attention_hidden_output_size],
                dtype=tf.float32)

            self._precompute_partial_attention_scores()

            if mode == 'decode':
                embedding = tf.nn.embedding_lookup(
                    self.embeddings, self.decode_last_output_placeholder)
                (decoder_outputs, self.one_step_decoder_state, context_vectors,
                 attention_logits,
                 pointer_probabilities) = self._rnn_one_step_attention_decoder(
                     decoder_cell, embedding,
                     self.initial_decoder_state_placeholder)
            else:
                if mode == 'train':
                    train_decoder_outputs, train_context_vectors, train_attention_logits, train_pointer_probabilities = \
                            self._rnn_attention_decoder(decoder_cell, training_wheels=True)
                    scope.reuse_variables()

                    self.train_attention_argmax = tf.cast(tf.argmax(
                        train_attention_logits, 1),
                                                          dtype=tf.int32)
                    self.train_pointer_enabled = tf.cast(
                        tf.round(train_pointer_probabilities), tf.int32)

                decoder_outputs, context_vectors, attention_logits, pointer_probabilities = \
                        self._rnn_attention_decoder(decoder_cell, training_wheels=False)

        self.attention_argmax = tf.cast(tf.argmax(attention_logits, 1),
                                        dtype=tf.int32)
        self.attention_softmax = tf.nn.softmax(attention_logits)
        self.pointer_enabled = tf.cast(tf.round(pointer_probabilities),
                                       tf.int32)

        if mode == 'decode':
            self.top_k_vocabulary_argmax, self.top_k_probabilities = self._extract_top_k_argmax(
                self.beam_width_placeholder, decoder_outputs, context_vectors)
        else:
            if mode == 'train':
                self.train_vocabulary_argmax, self.main_train_loss = self._compute_argmax_and_loss(
                    train_decoder_outputs, train_context_vectors,
                    train_attention_logits, train_pointer_probabilities)
            self.vocabulary_argmax, self.main_loss = self._compute_argmax_and_loss(
                decoder_outputs, context_vectors, attention_logits,
                pointer_probabilities)

    def _rnn_attention_decoder(self, decoder_cell, training_wheels):
        '''
        Args:
          self: QuerySumModel,
          decoder_cell: RNNCell or GRUCell, the RNN cell used by the decoder.
          training_wheels: bool, if True the decoder is fed the ground-truth
            reference tokens at each step (teacher forcing); otherwise it feeds
            back its own previous output.

        Returns:
          A tuple (decoder_outputs, context_vectors, attention_logits,
          pointer_probabilities).
        '''
        loop_fn = self._custom_rnn_loop_fn(decoder_cell.output_size,
                                           training_wheels=training_wheels)

        decoder_outputs, _, (context_vectors_array, attention_logits_array, pointer_probability_array) = \
            tf.nn.raw_rnn(decoder_cell, loop_fn, swap_memory=True)

        decoder_outputs = decoder_outputs.stack()
        decoder_outputs = tf.transpose(decoder_outputs, [1, 0, 2])

        attention_logits = attention_logits_array.gather(
            tf.range(0,
                     attention_logits_array.size() - 1))
        attention_logits = tf.transpose(attention_logits, [1, 0, 2])

        context_vectors = context_vectors_array.gather(
            tf.range(0,
                     context_vectors_array.size() - 1))
        context_vectors = tf.transpose(context_vectors, [1, 0, 2])

        pointer_probabilities = pointer_probability_array.gather(
            tf.range(0,
                     pointer_probability_array.size() - 1))
        pointer_probabilities = tf.transpose(pointer_probabilities, [1, 0])

        return decoder_outputs, context_vectors, attention_logits, pointer_probabilities

    def _custom_rnn_loop_fn(self, cell_size, training_wheels):
        def loop_fn(time, cell_output, cell_state, loop_state):
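            # raw_rnn loop_fn contract: called once before the first step with
            # cell_output=None (to build the initial state and input) and then
            # once per step; it must return (elements_finished, next_input,
            # next_cell_state, emit_output, next_loop_state).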
            if cell_output is None:  # time == 0
                context_vectors_array = tf.TensorArray(
                    tf.float32,
                    size=tf.shape(self.references_placeholder)[1] + 1)
                attention_logits_array = tf.TensorArray(
                    tf.float32,
                    size=tf.shape(self.references_placeholder)[1] + 1)
                pointer_probability_array = tf.TensorArray(
                    tf.float32,
                    size=tf.shape(self.references_placeholder)[1] + 1)
                next_cell_state = self.final_encoder_state
                go_id = self.summary_vocabulary.word_to_id('<GO>')
                last_output_embedding = tf.nn.embedding_lookup(
                    self.embeddings, tf.tile([go_id], [self.batch_size]))
            else:
                context_vectors_array, attention_logits_array, pointer_probability_array = loop_state
                next_cell_state = cell_state

                if training_wheels:
                    voc_indices = self.references_placeholder[:, time - 1]
                    pointer_indices = self.pointer_reference_placeholder[:,
                                                                         time -
                                                                         1]
                    pointer_switch = tf.cast(
                        self.pointer_switch_placeholder[:, time - 1], tf.bool)

                    batch_range = tf.range(self.batch_size)
                    pointer_indexer = tf.stack([batch_range, pointer_indices],
                                               axis=1)
                    attention_vocabulary_indices = tf.gather_nd(
                        self.documents_placeholder, pointer_indexer)

                    mixed_indices = tf.where(pointer_switch,
                                             attention_vocabulary_indices,
                                             voc_indices)
                    last_output_embedding = tf.nn.embedding_lookup(
                        self.embeddings, mixed_indices)
                else:
                    last_output_embedding = self._extract_argmax_and_embed(
                        cell_output, cell_size,
                        tf.shape(self.documents_placeholder)[0])

            context_vector, attention_logits = self._attention(
                next_cell_state, last_output_embedding)
            pointer_probabilities = self._pointer_probabilities(
                context_vector, next_cell_state, last_output_embedding)

            context_vectors_array = context_vectors_array.write(
                time, context_vector)
            attention_logits_array = attention_logits_array.write(
                time, attention_logits)
            pointer_probability_array = pointer_probability_array.write(
                time, pointer_probabilities)

            next_input = tf.concat(
                [last_output_embedding, context_vector, self.query_last],
                axis=1)
            elements_finished = (time >= self.reference_lengths_placeholder)

            emit_output = cell_output
            next_loop_state = (context_vectors_array, attention_logits_array,
                               pointer_probability_array)
            return elements_finished, next_input, next_cell_state, emit_output, next_loop_state

        return loop_fn

    def _precompute_partial_attention_scores(self):
        encoder_outputs_flat = tf.reshape(self.encoder_outputs,
                                          shape=[-1, self.encoder_output_size])
        self.encoder_state_attention_partial_scores = tf.matmul(
            encoder_outputs_flat, self.attention_w_d)
        self.encoder_state_attention_partial_scores = tf.reshape(
            self.encoder_state_attention_partial_scores,
            shape=[self.batch_size, -1, self.attention_hidden_output_size])
        self.encoder_state_attention_partial_scores = tf.transpose(
            self.encoder_state_attention_partial_scores, [1, 0, 2])

        self.query_attention_partial_score = tf.matmul(self.query_last,
                                                       self.attention_w_q)

    def _score(self, prev_decoder_state, prev_embedding):
        # Returns scores in a tensor of shape [batch_size, input_sequence_length]
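        # This is additive (Bahdanau-style) attention:
        #   e_ij = v^T tanh(W s_{i-1} + W_e y_{i-1} + W_q q + W_d h_j + b)
        # with s_{i-1} the previous decoder state, y_{i-1} the previous output
        # embedding, q the final query encoder output, and h_j the j-th document
        # encoder output (the W_q q and W_d h_j terms are precomputed).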

        if self.mode == 'decode':
            query_part = self.query_attention_partial_score_placeholder
            encoder_part = self.encoder_state_attention_partial_scores_placeholder
        else:
            query_part = self.query_attention_partial_score
            encoder_part = self.encoder_state_attention_partial_scores

        embedding_part = tf.matmul(prev_embedding, self.attention_w_e)

        # XXX: this is where the shape mismatch is
        output = tf.matmul(
            prev_decoder_state, self.attention_w
        ) + embedding_part + query_part + encoder_part + self.attention_b

        output = tf.tanh(output)
        output = tf.reduce_sum(self.attention_v * output, axis=2)
        output = tf.transpose(output, [1, 0])

        # Handle input document padding by giving a large penalty, eliminating it from the weighted average
        padding_penalty = -1e20 * tf.to_float(
            1 - tf.sign(self.documents_placeholder))
        masked = output + padding_penalty

        return masked

    def _attention(self, prev_decoder_state, prev_embedding):
        with tf.variable_scope('attention') as scope:
            # e = score of shape [batch_size, output_seq_length, input_seq_length], e_{ij} = score(s_{i-1}, h_j)
            # e_i = score of shape [batch_size, input_seq_length], e_ij = score(prev_decoder_state, h_j)
            e_i = self._score(prev_decoder_state, prev_embedding)

            # alpha_i = softmax(e_i) of shape [batch_size, input_seq_length]
            alpha_i = tf.nn.softmax(e_i)

            resized_alpha_i = tf.reshape(
                tf.tile(alpha_i, [1, self.encoder_output_size]),
                [self.batch_size, -1, self.encoder_output_size])

            if self.mode == 'decode':
                c_i = tf.reduce_sum(tf.multiply(
                    resized_alpha_i,
                    self.pre_computed_encoder_states_placeholder),
                                    axis=1)
            else:
                c_i = tf.reduce_sum(tf.multiply(resized_alpha_i,
                                                self.encoder_outputs),
                                    axis=1)
            return c_i, e_i

    def _pointer_probabilities(self, attention, cell_state,
                               last_output_embedding):
        combined_input = tf.concat(
            [attention, cell_state, last_output_embedding], axis=1)
        result = tf.sigmoid(
            tf.matmul(combined_input, self.pointer_probability_project_w) +
            self.pointer_probability_project_b)
        # Remove extra dimension of size 1
        result = tf.reshape(result, shape=[self.batch_size])
        return result

    def _compute_argmax_and_loss(self, decoder_outputs, context_vectors,
                                 attention_logits, pointer_probabilities):
        # Projection onto vocabulary is based on
        # http://www.wildml.com/2016/08/rnns-in-tensorflow-a-practical-guide-and-undocumented-features/

        vocabulary_project_input = tf.concat(
            [decoder_outputs, context_vectors], axis=2)

        # Flatten output over batch dimension
        vocabulary_project_input_flat = tf.reshape(
            vocabulary_project_input,
            [-1, self.decoder_cell_state_size + self.encoder_output_size])
        vocabulary_hidden_flat = tf.matmul(
            vocabulary_project_input_flat,
            self.vocabulary_project_w_1) + self.vocabulary_project_b_1

        logits_flat = tf.matmul(
            vocabulary_hidden_flat,
            self.vocabulary_project_w_2) + self.vocabulary_project_b_2

        max_decoder_length = tf.shape(decoder_outputs)[1]

        # Reshape back to [batch_size, max_decoder_length, vocabulary_size]
        logits = tf.reshape(
            logits_flat, [-1, max_decoder_length, self.target_vocabulary_size])

        vocabulary_argmax = tf.argmax(logits, 2)

        references_placeholder_flat = tf.reshape(self.references_placeholder,
                                                 [-1, 1])

        # Calculate the losses
        losses_flat = tf.nn.sampled_softmax_loss(
            weights=tf.transpose(self.vocabulary_project_w_2),
            biases=self.vocabulary_project_b_2,
            labels=references_placeholder_flat,
            inputs=vocabulary_hidden_flat,
            num_sampled=512,
            num_classes=self.target_vocabulary_size)
        vocabulary_loss = tf.reshape(losses_flat, [-1, max_decoder_length])

        # Previous loss function for full softmax
        # vocabulary_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,
        #                                                                 labels=self.references_placeholder)

        pointer_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=attention_logits, labels=self.pointer_reference_placeholder)

        float_pointer_switch_reference = tf.to_float(
            self.pointer_switch_placeholder)

        pointer_probability_loss = (float_pointer_switch_reference *
                                    -tf.log(pointer_probabilities + 1e-9) +
                                    (1. - float_pointer_switch_reference) *
                                    -tf.log(1. - pointer_probabilities + 1e-9))

        # Mask out padding from loss computation
        length_mask = tf.sign(tf.to_float(self.references_placeholder))

        masked_losses = length_mask * (
            pointer_probability_loss +
            (1. - float_pointer_switch_reference) * vocabulary_loss +
            float_pointer_switch_reference * pointer_loss)
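        # Per-token loss: the pointer-switch cross-entropy, plus the vocabulary
        # loss when the reference token is generated (switch = 0) and the
        # attention (pointer) loss when it is copied from the document
        # (switch = 1); length_mask zeroes out padded positions.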

        float_lengths = tf.to_float(self.reference_lengths_placeholder)

        # Calculate mean loss
        mean_loss_by_example = tf.reduce_sum(masked_losses,
                                             axis=1) / float_lengths

        mean_loss = tf.reduce_mean(mean_loss_by_example)

        return vocabulary_argmax, mean_loss

    def _extract_argmax_and_embed(self, cell_output, cell_size, batch_size):
        # Flatten output over batch dimension
        rnn_outputs_flat = tf.reshape(cell_output, [-1, cell_size])

        # Running without training wheels is currently not supported
        # TODO: Fix or remove
        logits_flat = tf.zeros([batch_size, self.target_vocabulary_size])
        # logits_flat = tf.matmul(rnn_outputs_flat, self.vocabulary_project_w) + self.vocabulary_project_b

        # Reshape back to [batch_size, vocabulary_size]
        logits = tf.reshape(logits_flat, [-1, self.target_vocabulary_size])
        vocabulary_argmax = tf.argmax(logits, 1)

        return tf.nn.embedding_lookup(self.embeddings, vocabulary_argmax)

    def _add_optimizer(self):
        self.optimizer = AdamOptimizer()

        self.final_train_loss = self.main_train_loss

        with tf.variable_scope('l2_regularization'):
            # Find the variables to regularize by iterating over all trainable
            # variables and checking membership in the set below; there is no
            # direct way to look up a variable by its full name here.
            l2_regularized_names = {
                'encoder/bidirectional_rnn/fw/gru_cell/gates/weights:0'
                # If used, add additional complete variables names
            }
            l2_regularized = [
                variable for variable in tf.trainable_variables()
                if variable.name in l2_regularized_names
            ]

            l2_loss = 0.001 * tf.add_n(
                [tf.nn.l2_loss(variable) for variable in l2_regularized])

        gradients = self.optimizer.compute_gradients(self.final_train_loss)

        with tf.variable_scope('gradient_clipping'):

            def clip_gradient(gradient, variable):
                # Only clip dense tensors; clipping IndexedSlices would trigger a warning.
                if isinstance(gradient, tf.Tensor):
                    gradient = tf.clip_by_norm(gradient, 10)
                return gradient, variable

            gradients = [
                clip_gradient(gradient, variable)
                for gradient, variable in gradients
            ]
        self.minimize_operation = self.optimizer.apply_gradients(
            gradients, global_step=self.global_step)
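        # Note: clip_gradient clips each dense gradient to norm 10 individually;
        # a common alternative (not used here) is tf.clip_by_global_norm over the
        # full gradient list, which preserves the relative scale across variables.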

    def _rnn_one_step_attention_decoder(self, decoder_cell,
                                        initial_input_word_embedding,
                                        initial_cell_state):
        loop_fn = self._custom_one_step_rnn_loop_fn(
            initial_input_word_embedding, initial_cell_state)
        decoder_outputs, final_state, (context_vector, attention_logits,
                                       pointer_probabilities) = tf.nn.raw_rnn(
                                           decoder_cell, loop_fn)
        decoder_outputs = decoder_outputs.stack()
        decoder_outputs = tf.transpose(decoder_outputs, [1, 0, 2])
        return decoder_outputs, final_state, context_vector, attention_logits, pointer_probabilities

    def _custom_one_step_rnn_loop_fn(self, initial_input_word_embedding,
                                     initial_cell_state):
        def loop_fn(time, cell_output, cell_state, loop_state):
            if cell_output is None:  # time == 0
                next_cell_state = initial_cell_state
                context_vector, attention_logits = self._attention(
                    next_cell_state, initial_input_word_embedding)
                pointer_probabilities = self._pointer_probabilities(
                    context_vector, next_cell_state,
                    initial_input_word_embedding)
                next_input = tf.concat([
                    initial_input_word_embedding, context_vector,
                    self.pre_computed_query_state_placeholder
                ],
                                       axis=1)
                next_loop_state = (context_vector, attention_logits,
                                   pointer_probabilities)
            else:
                next_cell_state = cell_state
                next_input = tf.zeros(shape=[
                    self.batch_size, self.word_embedding_dim +
                    self.encoder_output_size + self.encoder_cell_state_size
                ])
                next_loop_state = loop_state

            elements_finished = cell_output is not None

            emit_output = cell_output
            return elements_finished, next_input, next_cell_state, emit_output, next_loop_state

        return loop_fn

    def _extract_top_k_argmax(self, k, cell_output, context_vectors):
        cell_output_flat = tf.reshape(cell_output,
                                      [-1, self.decoder_cell_state_size])
        vocabulary_project_input = tf.concat(
            [cell_output_flat, context_vectors], axis=1)

        vocabulary_hidden = tf.matmul(
            vocabulary_project_input,
            self.vocabulary_project_w_1) + self.vocabulary_project_b_1

        logits = tf.matmul(
            vocabulary_hidden,
            self.vocabulary_project_w_2) + self.vocabulary_project_b_2

        top_k_probabilities, vocabulary_argmax = tf.nn.top_k(
            tf.nn.softmax(logits), k)

        return vocabulary_argmax, top_k_probabilities
Ejemplo n.º 21
0
    def __init__(self, **optimizer_kwargs):
        self._model = optimizer_kwargs["model"]

        self._individual_learning_rate = optimizer_kwargs[
            "individual_learning_rate"]

        self._learning_rate = optimizer_kwargs["learning_rate"]
        self._rescale_learning_rate = optimizer_kwargs["rescale_learning_rate"]
        self._d_p = None
        self._n_reg = None

        post_optimizer = optimizer_kwargs.get("post_optimizer")
        if post_optimizer is None:
            self._post_optimizer = super()

        elif post_optimizer == "Momentum":
            self._post_optimizer = MomentumOptimizer(
                learning_rate=optimizer_kwargs["learning_rate"],
                momentum=0.95,
                use_locking=False,
                name="MomentumOptimizer")

        elif post_optimizer == "RMSProp":
            self._post_optimizer = RMSPropOptimizer(
                learning_rate=optimizer_kwargs["learning_rate"],
                decay=0.9,
                epsilon=1e-5,
                use_locking=False,
                name="RMSPropOptimizer")

        elif post_optimizer == "Adam":
            self._post_optimizer = AdamOptimizer(
                learning_rate=optimizer_kwargs["learning_rate"],
                beta1=0.9,
                beta2=0.999,
                epsilon=1e-8,
                use_locking=False,
                name="AdamOptimizer")
        elif post_optimizer == "Nadam":
            self._post_optimizer = NadamOptimizer(
                learning_rate=optimizer_kwargs["learning_rate"],
                beta1=0.9,
                beta2=0.999,
                epsilon=1e-8,
                use_locking=False,
                name="NadamOptimizer")

        elif post_optimizer == "Nesterov":
            self._post_optimizer = MomentumOptimizer(
                learning_rate=optimizer_kwargs["learning_rate"],
                momentum=0.95,
                use_locking=False,
                use_nesterov=True,
                name="NesterovMomentumOptimizer")
        elif post_optimizer == "NesterovConst":
            self._post_optimizer = NesterovConst(
                model=self._model,
                learning_rate=optimizer_kwargs["learning_rate"],
                use_locking=False,
                name="NesterovConstOptimizer")

        else:
            raise Exception(
                "There is no such post optimizer defined. Must be one of: "
                "None, Momentum, RMSProp, Adam, Nadam, Nesterov, NesterovConst")

        super().__init__(self._learning_rate)
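A hedged construction sketch for the wrapper above (the class name and the model object are placeholders; 'post_optimizer' picks which tf.train optimizer applies the final update):

# Hypothetical usage (all names assumed):
# opt = WrappedOptimizer(model=model,
#                        learning_rate=1e-3,
#                        individual_learning_rate=1.0,
#                        rescale_learning_rate=1.0,
#                        post_optimizer="Adam")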
from tensorflow.python.keras.optimizers import SGD
from tensorflow.python.training.adam import AdamOptimizer

from audio.adapter import get_audio_adapter
from dataset import DatasetBuilder
from model import model_fn
from model.KerasUnet import getUnetModel
from utils.configuration import load_configuration
import tensorflow as tf
import csv

audio_path = '../musdb_dataset/'
config_path = "../config/musdb_config.json"
INIT_LR = 1e-3
opt = AdamOptimizer(INIT_LR)
# Note: the SGD assignment below overrides the AdamOptimizer created above.
opt = SGD(lr=INIT_LR, momentum=0.9)
_instruments = ['vocals_spectrogram']
model_dict = {}
model_trainable_variables = {}

val_loss_results = []
val_metrics_results = []

export_dir = '../spleeter_saved_model_dir/'
metrics_csv = './csv_metrics/metrics_loss.csv'


def get_training_dataset(audio_params, audio_adapter, audio_path):
    """ Builds training dataset.
                                          dtype=tf.int32),
                         batch_sz,
                         name='accuracy')
    tf.summary.scalar('accuracy', accuracy)

    from tflearn.objectives import categorical_crossentropy

    loss = categorical_crossentropy(softmax_class_op, selected_gesture)
    tf.summary.scalar('classification_loss', loss)

with tf.variable_scope('optimize'):
    lr_op = tf.Variable(5e-4, False, dtype=tf.float32)
    decay_lr_op = tf.assign(lr_op, lr_op * (1 - 1e-4))
    tf.summary.scalar('learning_rate', lr_op)
    with tf.control_dependencies([decay_lr_op]):
        train_step = AdamOptimizer(learning_rate=lr_op).minimize(loss)
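    # Because of the control dependency, every session run of train_step also
    # executes decay_lr_op, multiplying the learning rate by (1 - 1e-4) per step.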

display_q = queue.Queue(10)


def display():
    while True:
        softmax_class, display_states = display_q.get()
        print("Prediction: ", np.max(softmax_class, axis=1))
        for states in np.transpose(display_states, axes=[1, 0, 2]):
            env.step(states)
            env.render()
            sleep(.2 / (display_q.qsize() + 1))
        env.reset()

d.load_embeddings(args.emb_type, args.word2vec_file, args.glove_file,
                  args.fasttext_file, args.custom_file, logger)
d.batch = d.batch_generator(args.mb)
m = bayesian_emb_model(d, d.K, sess, dir_name)
sigmas_list = list()


# TRAINING
n_iters, n_batches = get_n_iters(args.n_epochs, args.mb, len(d.word_target))
logger.debug('init training number of iters ' + str(n_iters) +
             ' and batches ' + str(n_batches))
#kl_scaling_weights = get_kl_weights(n_batches)
learning_rates = get_learning_rates(args.clr_type, n_iters, args.clr_cycles, args.base_lr, args.max_lr, args.lr)
m.inference.initialize(n_samples=1, n_iter=n_iters, logdir=m.logdir,
                       scale={m.y_pos: n_batches, m.y_neg: n_batches / args.ns},
                       kl_scaling={m.y_pos: n_batches, m.y_neg: n_batches / args.ns},
                       optimizer=AdamOptimizer(learning_rate=m.learning_rate_placeholder)
                       )
early_stopping = EarlyStopping(patience=args.patience)
init = tf.global_variables_initializer()
sess.run(init)
logger.debug('....starting training')
iteration = 0
for epoch in range(args.n_epochs):
    for batch in range(n_batches):
        info_dict = m.inference.update(feed_dict=d.feed(m.target_placeholder,
                                                        m.context_placeholder,
                                                        m.labels_placeholder,
                                                        m.ones_placeholder,
                                                        m.zeros_placeholder,
                                                        m.learning_rate_placeholder,
                                                        args.mb,
def create_optimizer(step: Tensorflow2ModelStep, context: ExecutionContext):
    return AdamOptimizer(learning_rate=step.hyperparams['learning_rate'])
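# Hedged note: the factory above only reads the step's hyperparameters, so the
# learning rate can be tuned like any other Neuraxle hyperparam before the step
# builds its graph (method names assumed), e.g.:
#   step = step.set_hyperparams({'learning_rate': 1e-3})
#   optimizer = create_optimizer(step, context)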
Ejemplo n.º 26
0
n_iters, n_batches = get_n_iters()
logger.debug('init training number of iters ' + str(n_iters) +
             ' and batches ' + str(n_batches))

m.inference.initialize(n_samples=1,
                       n_iter=n_iters,
                       logdir=m.logdir,
                       scale={
                           m.y_pos: n_batches,
                           m.y_neg: n_batches / args.ns
                       },
                       kl_scaling={
                           m.y_pos: n_batches,
                           m.y_neg: n_batches / args.ns
                       },
                       optimizer=AdamOptimizer(learning_rate=0.001))
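# Passing a tf.train optimizer to initialize() makes the (Edward-style) inference
# object use it for its update steps instead of its built-in default.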
init = tf.global_variables_initializer()
sess.run(init)
logger.debug('....starting training')
for i in range(m.inference.n_iter):
    info_dict = m.inference.update(feed_dict=d.feed(
        args.mb, m.target_placeholder, m.context_placeholder,
        m.labels_placeholder, m.ones_placeholder, m.zeros_placeholder, True))
    m.inference.print_progress(info_dict)
    if i % 10000 == 0:
        m.saver.save(sess, os.path.join(m.logdir, "model.ckpt"), i)
        sigmas = m.sigU.eval()[:, 0]
        sigmas_list.append(sigmas)
        pickle.dump(sigmas_list, open(dir_name + "/sigmas.dat", "wb+"))
        if is_goog_embedding(sigmas):
            break
Ejemplo n.º 27
0
    )

    n_features = 1001
    n_classes = 101
    batch_size = 32
    val_batch_size = 256

    tree = SoftDecisionTree(max_depth=6,
                            n_features=n_features,
                            n_classes=n_classes,
                            max_leafs=None)
    tree.build_tree()

    # optimizer
    optimizer = AdamOptimizer(learning_rate=0.001,
                              beta1=0.9,
                              beta2=0.999,
                              epsilon=1e-08).minimize(tree.loss)
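    # These are exactly TensorFlow's defaults for Adam, so
    # AdamOptimizer(learning_rate=0.001).minimize(tree.loss) would be equivalent.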

    # Saving the model
    # saver = tf.train.Saver()

    # Initialize the variables (i.e. assign their default value)
    init = global_variables_initializer()

    EPOCHS = 1000
    TOTAL_BATCH = 16
    display_step = 100
    with tf.compat.v1.Session() as sess:
        sess.run(init)
        t0 = time.time()
Ejemplo n.º 28
0
            def __init__(self, word_vector_size):
                tf.reset_default_graph()
                self.vector_size = word_vector_size

                self.vectors = tf.placeholder(tf.float32,
                                              shape=(None, None,
                                                     word_vector_size))
                self.user_terms = tf.placeholder(tf.float32,
                                                 shape=(None, None))
                self.padding = tf.placeholder(tf.float32, shape=(None, None))
                self.output = tf.placeholder(tf.float32, shape=(None, 1))
                self.dropout_rate = tf.placeholder(tf.float32)

                xavier = tf.contrib.layers.xavier_initializer()

                # 50 tri-gram, 50 4-gram and 50 5-gram
                filter_tri = tf.Variable(xavier((1, 3, word_vector_size, 50)),
                                         name="weight")  #
                bias_tri = tf.Variable(tf.zeros((1, 50)), name="bias")  #
                self.f3 = filter_tri
                self.b3 = bias_tri

                filter_4 = tf.Variable(xavier((1, 4, word_vector_size, 50)),
                                       name="weight")  #
                bias_4 = tf.Variable(tf.zeros((1, 50)), name="bias")
                self.f4 = filter_4
                self.b4 = bias_4

                filter_5 = tf.Variable(xavier((1, 5, word_vector_size, 50)),
                                       name="weight")  #
                bias_5 = tf.Variable(tf.zeros((1, 50)), name="bias")
                self.f5 = filter_5
                self.b5 = bias_5

                with tf.name_scope("relevance"):
                    hidden = 150
                    self.relevance_weight = tf.Variable(0.01 * xavier(
                        (hidden, num_classes)))
                    self.relevance_bias = tf.Variable(0.0 * xavier(
                        (1, num_classes)))

                rel, pre_max_true_dropped, pre_max_sum = self.forward(
                    self.vectors)
                self.relevance = rel

                ut = tf.expand_dims(self.user_terms, 2)  # NWC
                rel_masked, pre_max_true_masked_dropped, _ = self.forward(
                    self.vectors * ut)
                self.rel_masked = rel_masked

                self.pre_max_sum = pre_max_sum
                self.get_attribution()

                prediction_error = -tf.reduce_sum(
                    tf.one_hot(tf.cast(self.output, tf.int32), num_classes) *
                    tf.log(rel + 10**-5, name="log2rel"))

                heads = []
                for att in self.attributions:
                    heads.append(
                        tf.reduce_sum(tf.multiply(att, self.user_terms),
                                      axis=1))
                heads_all = tf.stack(heads)
                self.h = heads_all
                self.a = tf.stack(self.attributions)
                # pos_heads =
                # neg_heads = tf.reduce_sum(tf.multiply(self.neg_attribution, self.user_terms), axis=1)

                misattribution_error = 0.0
                corrective_error = 0.0
                att_reg = 0.0

                if use_attribution:
                    misattribution_error += (
                        self.h[tf.cast(self.output[0][0], tf.int32)][0] -
                        0.9)**2
                    att_reg = 0
                    for att in self.attributions:
                        att_reg += tf.reduce_sum(
                            tf.nn.relu(att - att_max_value))

                    corrective_error = -tf.reduce_sum(
                        tf.one_hot(tf.cast(self.output, tf.int32), num_classes)
                        * tf.log(rel_masked + 10**-5, name="log2rel"))

                self.error = (
                    prediction_error +
                    tf.sign(tf.reduce_sum(self.user_terms)) *
                    (misattribution_error + corrective_error + att_reg))

                self.opt = AdamOptimizer()
                self.optimizer = self.opt.minimize(self.error)

                self.sess = tf.Session()
                self.sess.run(tf.global_variables_initializer())
                self.training = False
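A hedged single-step usage sketch for the model above (batch names and shapes are assumptions; whatever placeholders forward() actually reads must be fed):

# Hypothetical training step (assumed names / shapes):
# feed = {model.vectors: batch_vectors,        # [batch, seq_len, word_vector_size]
#         model.user_terms: batch_user_terms,  # [batch, seq_len]
#         model.padding: batch_padding,        # [batch, seq_len]
#         model.output: batch_labels,          # [batch, 1]
#         model.dropout_rate: 0.5}
# _, err = model.sess.run([model.optimizer, model.error], feed_dict=feed)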
Ejemplo n.º 29
0
    def fit(self, dataset):
        self.w = self.w_hat

        if self.train_type == 'Center':
            self.torque = self.iteration
            self.w = 0

        x = tf.placeholder(tf.float32, [None, 784])
        # dynamically reshape the input
        x_shaped = tf.reshape(x, [-1, 28, 28, 1])
        # now declare the output data placeholder - 10 digits
        y = tf.placeholder(tf.float32, [None, 10])
        # create some convolutional layers
        layer1 = create_new_conv_layer(x_shaped,
                                       self.w[:2],
                                       self.layer1_size[0],
                                       self.layer1_size[1],
                                       self.layer1_size[2],
                                       self.layer1_size[3],
                                       name='layer1')

        layer2 = create_new_conv_layer(layer1,
                                       self.w[2:4],
                                       self.layer2_size[0],
                                       self.layer2_size[1],
                                       self.layer2_size[2],
                                       self.layer2_size[3],
                                       name='layer2')

        flattened_parameter_size = self.flattend_size(
        )**2 * self.layer2_size[1]
        flattened = tf.reshape(layer2, [-1, flattened_parameter_size])

        # setup some weights and bias values for this layer, then activate with ReLU

        wd1 = tf.Variable(self.w[4].reshape(flattened_parameter_size,
                                            self.flatten1_size),
                          name='wd1')
        bd1 = tf.Variable(self.w[5], name='bd1')
        dense_layer1 = tf.matmul(flattened, wd1) + bd1
        dense_layer1 = tf.nn.relu(dense_layer1)

        # another layer with softmax activations

        wd2 = tf.Variable(self.w[6].reshape(self.flatten1_size,
                                            self.flatten2_size),
                          name='wd2')
        bd2 = tf.Variable(self.w[7], name='bd2')
        dense_layer2 = tf.matmul(dense_layer1, wd2) + bd2

        y_ = tf.nn.softmax(dense_layer2)
        # Loss is the softmax cross-entropy between logits and labels
        cross_entropy = tf.reduce_mean(
            tf.nn.softmax_cross_entropy_with_logits(logits=dense_layer2,
                                                    labels=y))

        #metrics
        correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
        #Optimizer initialization
        optimizer_gradient = AdamOptimizer_Bing(
            learning_rate=self.learning_rate).minimize(cross_entropy)
        optimizer = AdamOptimizer(
            learning_rate=self.learning_rate).minimize(cross_entropy)

        # setup the initialisation operator
        init_op = tf.global_variables_initializer()
        grad = []
        with tf.Session() as sess:
            # initialise the variables
            sess.run(init_op)
            total_batch = int(len(dataset.train.labels) / self.batch_size)
            count = 0
            for epoch in range(self.torque):
                avg_cost = 0
                self.t += 1
                count += 1

                if count < self.torque:
                    for i in range(total_batch):
                        batch_x, batch_y = dataset.train.next_batch(
                            batch_size=self.batch_size)
                        _, c = sess.run([optimizer, cross_entropy],
                                        feed_dict={
                                            x: batch_x,
                                            y: batch_y
                                        })
                        avg_cost += c / total_batch

                elif count == self.torque:
                    '''
                    self.grad is saved for the beta computation: it is the
                    gradient of the local loss with respect to the local
                    parameters at update time t.
                    '''

                    for i in range(total_batch):
                        batch_x, batch_y = dataset.train.next_batch(
                            batch_size=self.batch_size)
                        g, c = sess.run([optimizer_gradient, cross_entropy],
                                        feed_dict={
                                            x: batch_x,
                                            y: batch_y
                                        })
                        #g[1] is grad_var list
                        gradient_temp = batch_gradient_collector(g[1])
                        grad.append(gradient_temp)
                        avg_cost += c / total_batch

                    self.w = batch_parameter_collector(g[1])
                    #Sum up gradients from each batch
                    self.grad = np.array(grad).sum(axis=0)

                test_acc = sess.run(accuracy,
                                    feed_dict={
                                        x: dataset.test.images,
                                        y: dataset.test.labels
                                    })
                self.history.append([avg_cost, test_acc, str(self.t)])

            return self
Ejemplo n.º 30
0
class DBQA(DependencyParserBase):
    available_data_formats = {
        "word-based": NLPCC16DBQA,
        "character-based": NLPCC16DBQACharacterBased
    }
    default_data_format_name = "word-based"

    @classmethod
    def add_parser_arguments(cls, arg_parser):
        super(DBQA, cls).add_parser_arguments(arg_parser)
        group = arg_parser.add_argument_group(DBQA.__name__)
        group.add_argument("--external-embedding")
        group.add_argument("--batch-size", type=int, default=4096)
        group.add_argument("--embed-size", type=int, default=100)
        group.add_argument("--lstm-size", type=int, default=256)
        group.add_argument("--n-recur", type=int, default=2)
        group.add_argument("--use-bigram", type=int, default=1)
        group.add_argument("--input-keep-prob", type=int, default=1)
        group.add_argument("--recurrent-keep-prob", type=int, default=1)
        group.add_argument("--seed", type=int, default=42)
        group.add_argument("--steps", type=int, default=50000)
        group.add_argument("--merger-type",
                           choices=["rnn", "cnn"],
                           default="rnn")

    def __init__(self, options, data_train, session=None):
        self.statistics = DBQAStatistics.from_data(data_train)
        self.options = options

        self.optimizer = AdamOptimizer()
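        # AdamOptimizer() with no arguments uses TensorFlow's defaults
        # (learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-08).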
        self.global_step = tf.train.get_or_create_global_step()

        self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)

        self.question_2d_pl = tf.placeholder(tf.int32, (None, None))
        self.question_bigram_2d_pl = tf.placeholder(tf.int32, (None, None))
        self.answer_2d_pl = tf.placeholder(tf.int32, (None, None))
        self.answer_bigram_2d_pl = tf.placeholder(tf.int32, (None, None))
        self.wrong_answer_2d_pl = tf.placeholder(tf.int32, (None, None))
        self.wrong_answer_bigram_2d_pl = tf.placeholder(tf.int32, (None, None))

        self.network = PairwiseSimilarity(options, self.statistics)
        self.loss, self.accuracy = self.network.get_loss(
            self.question_2d_pl,
            self.question_bigram_2d_pl,
            self.answer_2d_pl,
            self.answer_bigram_2d_pl,
            self.wrong_answer_2d_pl,
            self.wrong_answer_bigram_2d_pl,
        )

        self.similarity = self.network.get_similarity(
            self.question_2d_pl, self.question_bigram_2d_pl, self.answer_2d_pl,
            self.answer_bigram_2d_pl)

        self.optimize_op = self.optimizer.minimize(
            self.loss, global_step=self.global_step)

        if session is None:
            self.session = self.create_session()
            self.session.run(tf.global_variables_initializer())
        else:
            self.session = session
        self.random = Random(42)

    def create_session(self):
        config_proto = tf.ConfigProto()
        # config_proto.gpu_options.per_process_gpu_memory_fraction = self.options.per_process_gpu_memory_fraction
        return tf.Session(config=config_proto)

    def train(self, data_train):
        for questions_np, questions_bigram_np, \
            corrects_np, corrects_bigram_np, \
            wrongs_np, wrongs_bigram_np in generate_train_batches(
            data_train, self.options.batch_size, self.random
        ):
            step, loss, accuracy, _ = self.session.run(
                [self.global_step, self.loss, self.accuracy, self.optimize_op],
                {
                    self.question_2d_pl: questions_np,
                    self.question_bigram_2d_pl: questions_bigram_np,
                    self.answer_2d_pl: corrects_np,
                    self.answer_bigram_2d_pl: corrects_bigram_np,
                    self.wrong_answer_2d_pl: wrongs_np,
                    self.wrong_answer_bigram_2d_pl: wrongs_bigram_np
                })
            logger.info("Train: Step {}, loss {}, accuracy {}".format(
                step, loss, accuracy))

    @classmethod
    def repeat_train_and_validate(cls, data_train, data_devs, data_test,
                                  options):
        tf.set_random_seed(options.seed)
        parser = cls(options, data_train)
        for question in data_train:
            question.fill_ids(parser.statistics)
        for file_name, data_dev in data_devs.items():
            for question in data_dev:
                question.fill_ids(parser.statistics)
        while True:
            step = parser.session.run(parser.global_step)
            if step > options.steps:
                break
            parser.random.shuffle(data_train)
            parser.train(data_train)
            for file_name, data_dev in data_devs.items():
                try:
                    prefix, suffix = os.path.basename(file_name).rsplit(".", 1)
                except ValueError:
                    prefix = os.path.basename(file_name)
                    suffix = ""
                dev_output = os.path.join(
                    options.output,
                    '{}_step_{}.{}'.format(prefix, step, suffix))
                scores = list(parser.predict(data_dev))
                with open(dev_output, "w") as f_output:
                    for score in scores:
                        f_output.write("{}\n".format(score))

    @classmethod
    def load(cls, prefix, new_options=None):
        pass

    def predict(self, data_dev):
        for questions_np, questions_bigram_np,\
            answer_np, answer_bigram_np in generate_predict_batches(
                data_dev, self.options.batch_size
        ):
            similarities = self.session.run(
                self.similarity, {
                    self.question_2d_pl: questions_np,
                    self.question_bigram_2d_pl: questions_bigram_np,
                    self.answer_2d_pl: answer_np,
                    self.answer_bigram_2d_pl: answer_bigram_np
                })
            for similarity in similarities:
                yield similarity

    def save(self, prefix):
        pass
Ejemplo n.º 31
0
def training_embedding(reverse_dictionary, with_dp=False):
    """
    # training with DP
    :param with_dp:
    :return:
    """
    batch_size = 128
    embedding_size = 300  # Dimension of the embedding vector.
    skip_window = 1  # How many words to consider left and right.
    num_skips = 2  # How many times to reuse an input to generate a label.
    # We pick a random validation set to sample nearest neighbors. here we limit the
    # validation samples to the words that have a low numeric ID, which by
    # construction are also the most frequent.
    valid_size = 16  # Random set of words to evaluate similarity on.
    valid_window = 100  # Only pick dev samples in the head of the distribution.
    valid_examples = np.array(random.sample(range(valid_window), valid_size))
    num_sampled = 64  # Number of negative examples to sample.

    learning_rate = 1

    # DP parameters
    clip_bound = 0.01  # 'the clip bound of the gradients'
    # num_steps = 160000  # 'number of steps T = E * N / L = E / q'
    sigma = 5  # 'sigma'
    delta = 1e-5  # 'delta'

    sess = tf.InteractiveSession()

    graph = tf.Graph()
    avg_loss_arr = []
    loss_arr = []
    # with graph.as_default(), tf.device('/cpu:0'):
    # Input data.
    with tf.device('/gpu:0'):
        train_dataset = tf.placeholder(tf.int32, shape=[batch_size])
        train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
        valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

        # Variables.
        embeddings = tf.Variable(
            tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))

    # Model.
    # Look up embeddings for inputs.
    embed = tf.nn.embedding_lookup(embeddings, train_dataset)

    if FLAGS.with_nce_loss:
        nce_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size],
                                stddev=1.0 / math.sqrt(embedding_size)))
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

        cross_entropy = tf.reduce_mean(
            tf.nn.nce_loss(weights=nce_weights,
                           biases=nce_biases,
                           labels=train_labels,
                           inputs=embed,
                           num_sampled=num_sampled,
                           num_classes=vocabulary_size))
    else:
        with tf.device('/gpu:0'):
            softmax_weights = tf.Variable(
                tf.truncated_normal([vocabulary_size, embedding_size],
                                    stddev=1.0 / math.sqrt(embedding_size)))
            softmax_biases = tf.Variable(tf.zeros([vocabulary_size]))
            # Compute the softmax loss, using a sample of the negative labels each time.
            # Read more: https://stackoverflow.com/questions/37671974/tensorflow-negative-sampling
            # When we want to compute the softmax probability for your true label,
            # we compute: logits[true_label] / sum(logits[negative_sampled_labels]
            # Other candidate sampling: https://www.tensorflow.org/extras/candidate_sampling.pdf
        cross_entropy = tf.reduce_mean(
            tf.nn.sampled_softmax_loss(weights=softmax_weights,
                                       biases=softmax_biases,
                                       inputs=embed,
                                       labels=train_labels,
                                       num_sampled=num_sampled,
                                       num_classes=vocabulary_size))

    priv_accountant = accountant.GaussianMomentsAccountant(vocabulary_size)
    privacy_accum_op = priv_accountant.accumulate_privacy_spending(
        [None, None], sigma, batch_size)
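    # The accumulate op is meant to be run once per training step so the moments
    # accountant can later report the cumulative (epsilon, delta) privacy spent.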

    # Optimizer.
    # Note: The optimizer will optimize the softmax_weights AND the embeddings.
    # This is because the embeddings are defined as a variable quantity and the
    # optimizer's `minimize` method will by default modify all variable quantities
    # that contribute to the tensor it is passed.
    # See docs on `tf.train.Optimizer.minimize()` for more details.
    # optimizer = tf.train.AdagradOptimizer(learning_rate).minimize(loss)
    # optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cross_entropy)

    optimizer = GradientDescentOptimizer(learning_rate)
    if FLAGS.optimizer == "adam":
        # cannot use adam so far. Tested and the model couldn't converge.
        optimizer = AdamOptimizer(learning_rate)
        print("##INFO: Using adam optimizer")
    if FLAGS.optimizer == "adagrad":
        optimizer = AdagradOptimizer(learning_rate)
        print("##INFO: Using adagrad optimizer")

    log_dir = os.path.join(FLAGS.trained_models, "logs")

    # compute gradient
    if FLAGS.with_nce_loss:
        gw_Embeddings = tf.gradients(cross_entropy,
                                     embeddings)[0]  # gradient of embeddings
        gw_softmax_weights = tf.gradients(
            cross_entropy, nce_weights)[0]  # gradient of nce_weights
        gb_softmax_biases = tf.gradients(
            cross_entropy, nce_biases)[0]  # gradient of nce_biases
    else:
        with tf.device('/gpu:0'):
            gw_Embeddings = tf.gradients(
                cross_entropy, embeddings)[0]  # gradient of embeddings
            gw_softmax_weights = tf.gradients(
                cross_entropy,
                softmax_weights)[0]  # gradient of softmax_weights
            gb_softmax_biases = tf.gradients(
                cross_entropy, softmax_biases)[0]  # gradient of softmax_biases
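    # Gradients are computed explicitly (instead of calling optimizer.minimize)
    # so they can be clipped and noised below before optimizer.apply_gradients
    # performs the actual variable updates.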

    # clip gradient
    if FLAGS.clip_by_norm:
        # faster but takes more epochs to train
        with tf.device('/gpu:0'):
            gw_Embeddings = tf.clip_by_norm(gw_Embeddings, clip_bound)
            gw_softmax_weights = tf.clip_by_norm(gw_softmax_weights,
                                                 clip_bound)
            gb_softmax_biases = tf.clip_by_norm(gb_softmax_biases, clip_bound)
    else:
        # dp-sgd: slow and require more memory but converge faster, take less epochs.
        gw_Embeddings = utils.BatchClipByL2norm(gw_Embeddings, clip_bound)
        gw_softmax_weights = utils.BatchClipByL2norm(gw_softmax_weights,
                                                     clip_bound)
        gb_softmax_biases = utils.BatchClipByL2norm(gb_softmax_biases,
                                                    clip_bound)
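    # Sketch of what the L2-norm clip above does (illustration only):
    #   scale = tf.minimum(1.0, clip_bound / (tf.norm(g) + 1e-12))
    #   g_clipped = g * scale
    # Gradients with norm <= clip_bound pass through unchanged; larger ones are
    # rescaled so their norm equals clip_bound.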

    sensitivity = clip_bound  # L2 sensitivity: adjacent datasets differ in one tuple

    # Add noise
    if FLAGS.with_dp:
        gw_Embeddings += tf.random_normal(shape=tf.shape(gw_Embeddings),
                                          mean=0.0,
                                          stddev=sigma * (sensitivity**2),
                                          dtype=tf.float32)
        gw_softmax_weights += tf.random_normal(
            shape=tf.shape(gw_softmax_weights),
            mean=0.0,
            stddev=sigma * (sensitivity**2),
            dtype=tf.float32)
        gb_softmax_biases += tf.random_normal(
            shape=tf.shape(gb_softmax_biases),
            mean=0.0,
            stddev=sigma * (sensitivity**2),
            dtype=tf.float32)
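        # Gaussian-mechanism step: each clipped gradient g is replaced by
        # g + N(0, stddev^2 * I); this script chooses stddev = sigma * sensitivity**2.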

    if FLAGS.with_nce_loss:
        train_step = optimizer.apply_gradients([
            (gw_Embeddings, embeddings), (gw_softmax_weights, nce_weights),
            (gb_softmax_biases, nce_biases)
        ])
    else:
        train_step = optimizer.apply_gradients([
            (gw_Embeddings, embeddings), (gw_softmax_weights, softmax_weights),
            (gb_softmax_biases, softmax_biases)
        ])
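    # apply_gradients takes explicit (gradient, variable) pairs, so the update
    # uses the clipped/noised gradients built above rather than recomputing them.
    # No global_step is passed; the step counter is maintained manually below.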

    # Compute the similarity between minibatch examples and all embeddings.
    # We use the cosine distance:
    with tf.device('/gpu:0'):
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
                                              valid_dataset)
    similarity = tf.matmul(valid_embeddings,
                           tf.transpose(normalized_embeddings))
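    # Every row of normalized_embeddings has unit L2 norm, so this
    # [valid_size, emb] x [emb, vocabulary_size] matmul yields the cosine
    # similarity between each validation word and every vocabulary word.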

    min_loss = 10**4
    per_dec_count = 0

    print('Initialized')
    average_loss = 0

    running = True
    step = 0
    average_loss_arr = []
    saving_pointer_idx = 0

    # put it here because Adam has its own variables.
    sess.run(tf.global_variables_initializer())
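    # AdamOptimizer/AdagradOptimizer create their slot variables (moment/accumulator
    # tensors) when apply_gradients is built above, so the initializer has to be
    # created and run after train_step exists or those slots stay uninitialized.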

    # The Saver must be created after all variables (including optimizer slot variables) exist.
    saver = tf.train.Saver()

    # Save the variables to disk.
    save_path = os.path.join(FLAGS.trained_models, "initialized_model.ckpt")
    # Sonvx: we need to make sure initialized variables are all the same for different tests.
    print("Checking on path: ", save_path)
    if not os.path.isfile(save_path + ".index"):
        saved_info = saver.save(sess, save_path)
        print("Global initialized model saved in file: %s" % saved_info)
    else:
        saver.restore(sess, save_path)
        print("Restored the global initialized model.")
    if FLAGS.DEBUG:
        input(
            "Double check whether or not the initialized model got restored then <Press enter>"
        )
    print('###INFO: Initialized in run(graph)')

    if FLAGS.RESTORE_LAST_CHECK_POINT:
        checkpoint_path = os.path.join(log_dir, "model.ckpt")
        if os.path.isfile(checkpoint_path + ".index"):
            saver.restore(sess, checkpoint_path)
            print("Restored the latest checkpoint at %s." % (checkpoint_path))

    while running:
        # for step in range(num_steps):
        batch_data, batch_labels = generate_batch(batch_size, num_skips,
                                                  skip_window)
        print("Global data_index = ", data_index)
        # feed_dict = {train_dataset: batch_data, train_labels: batch_labels}

        # old: sess.run([optimizer, cross_entropy], feed_dict=feed_dict)
        # template: train_step.run(feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5});
        train_step.run(feed_dict={
            train_dataset: batch_data,
            train_labels: batch_labels
        })
        loss = cross_entropy.eval(feed_dict={
            train_dataset: batch_data,
            train_labels: batch_labels
        })

        # loss_arr.append(l)
        # average_loss += l
        # current_avg_loss = average_loss/step
        # avg_loss_arr.append(current_avg_loss)

        sess.run([privacy_accum_op])
        # print(step, spent_eps_deltas)

        average_loss += loss

        # Running average of the loss; use a small constant divisor on the first
        # step to avoid dividing by zero.
        step_dev = step if step > 0 else 0.5
        current_avg_loss = average_loss / step_dev
        average_loss_arr.append(current_avg_loss)
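        # average_loss accumulates the per-batch losses, so dividing by the step
        # count gives a running mean; this running mean also drives the
        # early-stopping comparison further below.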

        if step % 200 == 0:
            # if step > 0:
            # average_loss = average_loss / 2000
            # The average loss is an estimate of the loss over the last 2000 batches.
            print('Average loss at step %d: %f' % (step, current_avg_loss))
            # TODO: turn this back on if unsure how average_loss influences the training process
            print("Embedding: ")
            em_val = tf.reduce_mean(tf.abs(embeddings))
            print(sess.run(em_val))
            # average_loss = 0
        # note that this is expensive (~20% slowdown if computed every 500 steps)
        check_step = int(FLAGS.NUM_STEPS * 0.2)
        if step % check_step == 0:
            # gw_emb = tf.reduce_mean(tf.abs(gw_Embeddings))
            # print("Embedding gradients: ")
            # print(sess.run(gw_emb))

            sim = similarity.eval()
            for i in range(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8  # number of nearest neighbors
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
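                # (-sim[i, :]).argsort() orders word indices by descending cosine
                # similarity; index 0 is the validation word itself, so the slice
                # [1:top_k + 1] keeps its top_k nearest neighbors.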
                log = 'Nearest to %s:' % valid_word
                for k in range(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log = '%s %s,' % (log, close_word)
                print(log)

        current_saving_dir = os.path.join(
            FLAGS.trained_models,
            "_%sepoch" % (saving_pointers[saving_pointer_idx]))
        # EARLY STOPPING
        if min_loss >= current_avg_loss:
            min_loss = current_avg_loss
            per_dec_count = 0

            if FLAGS.save_best_model_alltime:
                best_of_saving_point_dir = os.path.join(
                    current_saving_dir, "_best_one")
                if not os.path.exists(best_of_saving_point_dir):
                    os.makedirs(best_of_saving_point_dir)

                temp_embeddings = normalized_embeddings.eval()
                spent_eps_deltas = priv_accountant.get_privacy_spent(
                    sess, target_eps=target_eps)
                saving_state(best_of_saving_point_dir, spent_eps_deltas,
                             temp_embeddings, saver, sess)
            msg = ("Got best model so far at step %s , avg loss = %s" %
                   (step, current_avg_loss))
            logging.info(msg)
            print(msg)
        else:
            per_dec_count += 1

        step += 1

        if per_dec_count == max_early_stopping or step == num_steps:
            running = False

        if (step + 1) in saving_pointers:
            spent_eps_deltas = priv_accountant.get_privacy_spent(
                sess, target_eps=target_eps)
            folder_path = os.path.join(FLAGS.trained_models,
                                       "_%sepoch" % (step + 1))
            temp_embeddings = normalized_embeddings.eval()
            saving_state(folder_path, spent_eps_deltas, temp_embeddings, saver,
                         sess)
            # Make sure saving_pointer_idx does not exceed the number of saving pointers we set.
            if saving_pointer_idx < len(saving_pointers) - 1:
                saving_pointer_idx += 1
            msg = "##INFO: STEP %s: avg_loss history: avg_loss_arr = %s" % (
                step, average_loss_arr)
            logging.info(msg)

        if step % (num_steps - 1) == 0:
            spent_eps_deltas = priv_accountant.get_privacy_spent(
                sess, target_eps=target_eps)
            print("Final privacy spent: ", step, spent_eps_deltas)

    print("Stopped at %s, \nFinal avg_loss = %s" % (step, avg_loss_arr))
    print("loss = %s" % (loss_arr))

    # final_embeddings = normalized_embeddings.eval()
    sess.close()