Example 1
def test_measurement():
  opt = YFOptimizer(zero_debias=False)
  w = tf.Variable(np.ones([n_dim, ] ), dtype=tf.float32, name="w", trainable=True)
  b = tf.Variable(np.ones([1, ], dtype=np.float32), dtype=tf.float32, name="b", trainable=True)
  x = tf.constant(np.ones([n_dim, ], dtype=np.float32), dtype=tf.float32)
  loss = tf.multiply(w, x) + b
  tvars = tf.trainable_variables()

  w_grad_val = tf.placeholder(tf.float32, shape=(n_dim, ) )
  b_grad_val = tf.placeholder(tf.float32, shape=(1, ) )
  apply_op = opt.apply_gradients(zip([w_grad_val, b_grad_val], tvars) )

  init_op = tf.global_variables_initializer()
  with tf.Session() as sess:
    sess.run(init_op)
    target_h_max = 0.0
    target_h_min = 0.0
    g_norm_squared_avg = 0.0
    g_norm_avg = 0.0
    g_avg = 0.0
    target_dist = 0.0
    for i in range(n_iter):
      feed_dict = {w_grad_val: (i + 1) * np.ones( [n_dim, ], dtype=np.float32),
             b_grad_val: (i + 1) * np.ones( [1, ], dtype=np.float32) }
      res = sess.run( [opt._curv_win, opt._h_max, opt._h_min, opt._grad_var, opt._dist_to_opt_avg, apply_op], feed_dict=feed_dict)

      g_norm_squared_avg = 0.999 * g_norm_squared_avg  \
        + 0.001 * np.sum(( (i + 1)*np.ones( [n_dim + 1, ] ) )**2)
      g_norm_avg = 0.999 * g_norm_avg  \
        + 0.001 * np.linalg.norm( (i + 1)*np.ones( [n_dim + 1, ] ) )
      g_avg = 0.999 * g_avg + 0.001 * (i + 1)
 
      target_h_max = 0.999 * target_h_max + 0.001 * (i + 1)**2*(n_dim + 1)
      target_h_min = 0.999 * target_h_min + 0.001 * max(1, i + 2 - 20)**2*(n_dim + 1)
      target_var = g_norm_squared_avg - g_avg**2 * (n_dim + 1)
      target_dist = 0.999 * target_dist + 0.001 * g_norm_avg / g_norm_squared_avg

      # print "iter ", i, " h max ", res[1], target_h_max, " h min ", res[2], target_h_min, \
      #   " var ", res[3], target_var, " dist ", res[4], target_dist
      assert np.abs(target_h_max - res[1] ) < np.abs(target_h_max) * 1e-3
      assert np.abs(target_h_min - res[2] ) < np.abs(target_h_min) * 1e-3
      assert np.abs(target_var - res[3] ) < np.abs(res[3] ) * 1e-3
      assert np.abs(target_dist - res[4] ) < np.abs(res[4] ) * 1e-3
  print "sync measurement test passed!"
Example 2
    def __init__(self, is_training, config):
        self.batch_size = batch_size = config.batch_size
        self.num_steps = num_steps = config.num_steps
        size = config.hidden_size
        vocab_size = config.vocab_size

        self._input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
        self._targets = tf.placeholder(tf.int32, [batch_size, num_steps])

        # lstm_cell = tf.contrib.rnn.BasicLSTMCell(size, forget_bias=1.0,
        #                                          state_is_tuple=True)
        # if is_training and config.keep_prob < 1:
        #   lstm_cell = tf.contrib.rnn.DropoutWrapper(
        #       lstm_cell, output_keep_prob=config.keep_prob)
        # cell = tf.contrib.rnn.MultiRNNCell([lstm_cell] * config.num_layers,
        #                                    state_is_tuple=True)

        # Slightly better results can be obtained with forget gate biases
        # initialized to 1 but the hyperparameters of the model would need to be
        # different than reported in the paper.
        def lstm_cell():
            # With the latest TensorFlow source code (as of Mar 27, 2017),
            # the BasicLSTMCell will need a reuse parameter which is unfortunately not
            # defined in TensorFlow 1.0. To maintain backwards compatibility, we add
            # an argument check here:
            if 'reuse' in inspect.getargspec(
                    tf.contrib.rnn.BasicLSTMCell.__init__).args:
                return tf.contrib.rnn.BasicLSTMCell(
                    size,
                    forget_bias=1.0,
                    state_is_tuple=True,
                    reuse=tf.get_variable_scope().reuse)
            else:
                return tf.contrib.rnn.BasicLSTMCell(size,
                                                    forget_bias=1.0,
                                                    state_is_tuple=True)

        attn_cell = lstm_cell
        if is_training and config.keep_prob < 1:

            def attn_cell():
                return tf.contrib.rnn.DropoutWrapper(
                    lstm_cell(), output_keep_prob=config.keep_prob)

        cell = tf.contrib.rnn.MultiRNNCell(
            [attn_cell() for _ in range(config.num_layers)],
            state_is_tuple=True)

        self._initial_state = cell.zero_state(batch_size, tf.float32)

        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding", [vocab_size, size])
            inputs = tf.nn.embedding_lookup(embedding, self._input_data)

        if is_training and config.keep_prob < 1:
            inputs = tf.nn.dropout(inputs, config.keep_prob)

        # inputs = [tf.squeeze(input_, [1])
        #           for input_ in tf.split(inputs, num_steps, 1)]
        # outputs, state = tf.contrib.rnn.static_rnn(cell, inputs, initial_state=self._initial_state)

        outputs = []
        state = self._initial_state
        with tf.variable_scope("RNN"):
            for time_step in range(num_steps):
                if time_step > 0: tf.get_variable_scope().reuse_variables()
                (cell_output, state) = cell(inputs[:, time_step, :], state)
                outputs.append(cell_output)

        output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, size])
        softmax_w = tf.get_variable("softmax_w", [size, vocab_size])
        softmax_b = tf.get_variable("softmax_b", [vocab_size])
        logits = tf.matmul(output, softmax_w) + softmax_b
        loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
            [logits], [tf.reshape(self._targets, [-1])],
            [tf.ones([batch_size * num_steps])])
        cost = tf.reduce_sum(loss) / batch_size
        self._norm_loss = cost / num_steps
        self._cost = loss
        self._final_state = state

        if not is_training:
            return

        self._lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self._norm_loss, tvars),
                                          config.max_grad_norm)

        if config.opt_method == "Adam":
            print("using Adam")
            optimizer = tf.train.AdamOptimizer(self.lr)
        elif config.opt_method == "YF":
            print("using YF")
            self.optimizer = optimizer = YFOptimizer()
        elif config.opt_method == "momSGD":
            print("uisng mom SGD")
            optimizer = tf.train.MomentumOptimizer(self.lr, 0.9)
        elif config.opt_method == "SGD":
            print("uisng SGD")
            optimizer = tf.train.GradientDescentOptimizer(self.lr)
        elif config.opt_method == "Adagrad":
            print("using adagrad")
            optimizer = tf.train.AdagradOptimizer(self.lr)
        else:
            raise ValueError("Optimizer is not supported")

        self._train_op = optimizer.apply_gradients(zip(grads, tvars))

        self.train_loss_summary = tf.summary.scalar('train_loss',
                                                    self._norm_loss)

        self.writer = tf.summary.FileWriter(
            os.path.join(config.log_dir, time.strftime("%Y-%m-%d-%H-%M-%S")))
Example 3
    def _init_graph(self):
        self.graph = tf.Graph()
        with self.graph.as_default():

            tf.set_random_seed(self.random_seed)
            # placeholder
            self.feat_index = tf.placeholder(tf.int32,
                                             shape=[None, self.field_size],
                                             name="feat_index")  # None * F
            self.feat_value = tf.placeholder(tf.float32,
                                             shape=[None, self.field_size],
                                             name="feat_value")  # None * F
            logger.info(self.feat_index.shape)
            logger.info(self.feat_value.shape)
            self.label = tf.placeholder(tf.float32,
                                        shape=[None, 1],
                                        name="label")  # None * 1
            self.dropout_keep_fm = tf.placeholder(tf.float32,
                                                  shape=[None],
                                                  name="dropout_keep_fm")
            self.dropout_keep_deep = tf.placeholder(tf.float32,
                                                    shape=[None],
                                                    name="dropout_keep_deep")
            self.train_phase = tf.placeholder(tf.bool, name="train_phase")
            # initialize the model parameters
            self.weights = self._initialize_weights()
            pprint(self.weights)

            # model
            self.embeddings = tf.nn.embedding_lookup(
                self.weights["feature_embeddings"], self.feat_index
            )  # None * F(39) * K # feature_embeddings= 259 * k
            feat_value = tf.reshape(self.feat_value,
                                    shape=[-1, self.field_size,
                                           1])  # None * 39 * 1
            self.embeddings = tf.multiply(self.embeddings,
                                          feat_value)  # multiply in the continuous feature values
            logger.info(self.embeddings)  # None * 39 * K(8)

            # ---------- first order term ----------
            self.y_first_order = tf.nn.embedding_lookup(
                self.weights["feature_bias"],
                self.feat_index)  # None * F * 1  # feature_bias 259 * 1
            self.y_first_order = tf.reduce_sum(
                tf.multiply(self.y_first_order, feat_value),
                2)  # None * F(39)  # linear-combination part; no constant term?
            self.y_first_order = tf.nn.dropout(
                self.y_first_order, self.dropout_keep_fm[0])  # None * F

            # ---------- second order term ---------------
            # sum_square part: square of the element-wise sum
            self.summed_features_emb = tf.reduce_sum(self.embeddings,
                                                     1)  # None * K
            self.summed_features_emb_square = tf.square(
                self.summed_features_emb)  # None * K

            # square_sum part: sum of the squares
            self.squared_features_emb = tf.square(self.embeddings)
            self.squared_sum_features_emb = tf.reduce_sum(
                self.squared_features_emb, 1)  # None * K

            # second order
            self.y_second_order = 0.5 * tf.subtract(
                self.summed_features_emb_square,
                self.squared_sum_features_emb)  # None * K
            self.y_second_order = tf.nn.dropout(
                self.y_second_order, self.dropout_keep_fm[1])  # None * K
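            # The sum-square / square-sum trick above is the standard FM identity,
            # applied per embedding dimension k:
            #   sum_{i<j} v_ik * v_jk * x_i * x_j
            #     = 0.5 * ((sum_i v_ik * x_i)^2 - sum_i (v_ik * x_i)^2)
            # which is why the pairwise-interaction term collapses to a None * K tensor.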

            # ---------- Deep component ----------
            self.y_deep = tf.reshape(self.embeddings,
                                     shape=[
                                         -1,
                                         self.field_size * self.embedding_size
                                     ])  # None * (F*K)  # the FM and deep components share this input
            self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[0])
            for i in range(0, len(self.deep_layers)):
                self.y_deep = tf.add(
                    tf.matmul(self.y_deep, self.weights["layer_%d" % i]),
                    self.weights["bias_%d" % i])  # None * layer[i] * 1
                if self.batch_norm:
                    self.y_deep = self.batch_norm_layer(
                        self.y_deep,
                        train_phase=self.train_phase,
                        scope_bn="bn_%d" % i)  # None * layer[i] * 1
                self.y_deep = self.deep_layers_activation(self.y_deep)
                self.y_deep = tf.nn.dropout(
                    self.y_deep,
                    self.dropout_keep_deep[1 +
                                           i])  # dropout at each Deep layer
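            # Each pass through the loop above computes, in effect,
            #   y_deep <- dropout(activation(batch_norm(y_deep @ W_i + b_i)))
            # i.e. the deep component is a plain MLP over the flattened embeddings.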

            # ---------- DeepFM ----------
            if self.use_fm and self.use_deep:
                concat_input = tf.concat(
                    [self.y_first_order, self.y_second_order, self.y_deep],
                    axis=1)  # None *(F + K + deeplayers[-1] nodes)
            elif self.use_fm:
                concat_input = tf.concat(
                    [self.y_first_order, self.y_second_order], axis=1)  #
            elif self.use_deep:
                concat_input = self.y_deep
            logger.info(concat_input)
            self.out = tf.add(
                tf.matmul(concat_input, self.weights["concat_projection"]),
                self.weights["concat_bias"])

            # loss
            if self.loss_type == "logloss":
                self.out = tf.nn.sigmoid(self.out)
                self.loss = tf.losses.log_loss(self.label, self.out)
            elif self.loss_type == "mse":
                self.loss = tf.nn.l2_loss(tf.subtract(self.label, self.out))
            # l2 regularization on weights
            if self.l2_reg > 0:
                self.loss += tf.contrib.layers.l2_regularizer(self.l2_reg)(
                    self.weights["concat_projection"])
                if self.use_deep:
                    for i in range(len(self.deep_layers)):
                        self.loss += tf.contrib.layers.l2_regularizer(
                            self.l2_reg)(self.weights["layer_%d" % i])

            # optimizer
            if self.optimizer_type == "adam":
                self.optimizer = tf.train.AdamOptimizer(
                    learning_rate=self.learning_rate,
                    beta1=0.9,
                    beta2=0.999,
                    epsilon=1e-8).minimize(self.loss)
            elif self.optimizer_type == "adagrad":
                self.optimizer = tf.train.AdagradOptimizer(
                    learning_rate=self.learning_rate,
                    initial_accumulator_value=1e-8).minimize(self.loss)
            elif self.optimizer_type == "gd":
                self.optimizer = tf.train.GradientDescentOptimizer(
                    learning_rate=self.learning_rate).minimize(self.loss)
            elif self.optimizer_type == "momentum":
                self.optimizer = tf.train.MomentumOptimizer(
                    learning_rate=self.learning_rate,
                    momentum=0.95).minimize(self.loss)
            elif self.optimizer_type == "yellowfin":
                self.optimizer = YFOptimizer(learning_rate=self.learning_rate,
                                             momentum=0.0).minimize(self.loss)

            # init
            self.saver = tf.train.Saver()
            init = tf.global_variables_initializer()
            self.sess = self._init_session()
            self.sess.run(init)
            # save_path = self.saver.save(self.sess, save_path=os.path.join(SUB_DIR, "model"), global_step=0)
            # logger.info("模型初始化完成,保存路径为:{}".format(save_path))
            # writer = tf.summary.FileWriter("./logs", self.sess.graph)

            # number of params
            total_parameters = 0
            for variable in self.weights.values():
                shape = variable.get_shape()
                variable_parameters = 1
                for dim in shape:
                    variable_parameters *= dim.value
                total_parameters += variable_parameters
            if self.verbose > 0:
                print("#params: %d" % total_parameters)
Example 4
    cudnn.benchmark = True

criterion = nn.CrossEntropyLoss()
if args.opt_method == "SGD":
    logging.info("using SGD")
    optimizer = optim.SGD(net.parameters(),
                          lr=args.lr,
                          momentum=0.9,
                          weight_decay=5e-4)
elif args.opt_method == "Adam":
    logging.info("using Adam")
    optimizer = optim.Adam(net.parameters(), lr=args.lr, weight_decay=5e-4)
elif args.opt_method == "YF":
    logging.info("using YF")
    optimizer = YFOptimizer(net.parameters(),
                            lr=args.lr,
                            mu=args.mu,
                            weight_decay=5e-4)
else:
    raise Exception("Optimizer not supported")


# Training
def train(epoch, opt,
    loss_list,\
    local_curv_list,\
    max_curv_list,\
    min_curv_list,\
    lr_list,\
    lr_t_list,\
    mu_t_list,\
    dr_list,\
Example 5
def test_measurement():
    opt = YFOptimizer(zero_debias=False)
    w = tf.Variable(np.ones([
        n_dim,
    ]),
                    dtype=tf.float32,
                    name="w",
                    trainable=True)
    b = tf.Variable(np.ones([
        1,
    ], dtype=np.float32),
                    dtype=tf.float32,
                    name="b",
                    trainable=True)
    x = tf.constant(np.ones([
        n_dim,
    ], dtype=np.float32), dtype=tf.float32)
    loss = tf.multiply(w, x) + b
    tvars = tf.trainable_variables()

    w_grad_val = tf.placeholder(tf.float32, shape=(n_dim, ))
    b_grad_val = tf.placeholder(tf.float32, shape=(1, ))
    apply_op = opt.apply_gradients(zip([w_grad_val, b_grad_val], tvars))

    init_op = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init_op)
        target_h_max = 0.0
        target_h_min = 0.0
        g_norm_squared_avg = 0.0
        g_norm_avg = 0.0
        g_avg = 0.0
        target_dist = 0.0
        for i in range(n_iter):
            feed_dict = {
                w_grad_val: (i + 1) * np.ones([
                    n_dim,
                ], dtype=np.float32),
                b_grad_val: (i + 1) * np.ones([
                    1,
                ], dtype=np.float32)
            }
            res = sess.run([
                opt._curv_win, opt._h_max, opt._h_min, opt._grad_var,
                opt._dist_to_opt_avg, apply_op
            ],
                           feed_dict=feed_dict)

            g_norm_squared_avg = 0.999 * g_norm_squared_avg  \
              + 0.001 * np.sum(( (i + 1)*np.ones( [n_dim + 1, ] ) )**2)
            g_norm_avg = 0.999 * g_norm_avg  \
              + 0.001 * np.linalg.norm( (i + 1)*np.ones( [n_dim + 1, ] ) )
            g_avg = 0.999 * g_avg + 0.001 * (i + 1)

            target_h_max = 0.999 * target_h_max + 0.001 * (i + 1)**2 * (n_dim +
                                                                        1)
            target_h_min = 0.999 * target_h_min + 0.001 * max(
                1, i + 2 - 20)**2 * (n_dim + 1)
            target_var = g_norm_squared_avg - g_avg**2 * (n_dim + 1)
            target_dist = 0.999 * target_dist + 0.001 * g_norm_avg / g_norm_squared_avg

            # print "iter ", i, " h max ", res[1], target_h_max, " h min ", res[2], target_h_min, \
            #   " var ", res[3], target_var, " dist ", res[4], target_dist
            assert np.abs(target_h_max - res[1]) < np.abs(target_h_max) * 1e-3
            assert np.abs(target_h_min - res[2]) < np.abs(target_h_min) * 1e-3
            assert np.abs(target_var - res[3]) < np.abs(res[3]) * 1e-3
            assert np.abs(target_dist - res[4]) < np.abs(res[4]) * 1e-3
    print "sync measurement test passed!"
Example 6
    def __init__(self, is_training, config, input_, opt_method='sgd'):
        self._input = input_

        batch_size = input_.batch_size
        num_steps = input_.num_steps
        size = config.hidden_size
        vocab_size = config.vocab_size

        # Slightly better results can be obtained with forget gate biases
        # initialized to 1 but the hyperparameters of the model would need to be
        # different than reported in the paper.
        def lstm_cell():
            # With the latest TensorFlow source code (as of Mar 27, 2017),
            # the BasicLSTMCell will need a reuse parameter which is unfortunately not
            # defined in TensorFlow 1.0. To maintain backwards compatibility, we add
            # an argument check here:
            if 'reuse' in inspect.getargspec(
                    tf.contrib.rnn.BasicLSTMCell.__init__).args:
                return tf.contrib.rnn.BasicLSTMCell(
                    size,
                    forget_bias=0.0,
                    state_is_tuple=True,
                    reuse=tf.get_variable_scope().reuse)
            else:
                return tf.contrib.rnn.BasicLSTMCell(size,
                                                    forget_bias=0.0,
                                                    state_is_tuple=True)

        attn_cell = lstm_cell
        if is_training and config.keep_prob < 1:

            def attn_cell():
                return tf.contrib.rnn.DropoutWrapper(
                    lstm_cell(), output_keep_prob=config.keep_prob)

        cell = tf.contrib.rnn.MultiRNNCell(
            [attn_cell() for _ in range(config.num_layers)],
            state_is_tuple=True)

        self._initial_state = cell.zero_state(batch_size, data_type())

        with tf.device("cpu:0"):
            embedding = tf.get_variable("embedding", [vocab_size, size],
                                        dtype=data_type())
            inputs = tf.nn.embedding_lookup(embedding, input_.input_data)

        if is_training and config.keep_prob < 1:
            inputs = tf.nn.dropout(inputs, config.keep_prob)

        # Simplified version of tensorflow.models.rnn.rnn.py's rnn().
        # This builds an unrolled LSTM for tutorial purposes only.
        # In general, use the rnn() or state_saving_rnn() from rnn.py.
        #
        # The alternative version of the code below is:
        #
        # inputs = tf.unstack(inputs, num=num_steps, axis=1)
        # outputs, state = tf.nn.rnn(cell, inputs, initial_state=self._initial_state)
        outputs = []
        state = self._initial_state
        with tf.variable_scope("RNN"):
            for time_step in range(num_steps):
                if time_step > 0: tf.get_variable_scope().reuse_variables()
                (cell_output, state) = cell(inputs[:, time_step, :], state)
                outputs.append(cell_output)

        output = tf.reshape(tf.stack(axis=1, values=outputs), [-1, size])
        softmax_w = tf.get_variable("softmax_w", [size, vocab_size],
                                    dtype=data_type())
        softmax_b = tf.get_variable("softmax_b", [vocab_size],
                                    dtype=data_type())
        logits = tf.matmul(output, softmax_w) + softmax_b
        loss = tf.contrib.legacy_seq2seq.sequence_loss_by_example(
            [logits], [tf.reshape(input_.targets, [-1])],
            [tf.ones([batch_size * num_steps], dtype=data_type())])
        # self._cost = cost = tf.reduce_sum(loss) / batch_size
        self._cost = cost = tf.reduce_sum(loss) / (batch_size * num_steps)
        self._final_state = state

        if not is_training:
            return

        self._lr = tf.Variable(0.0, trainable=False)
        self._mu = tf.Variable(0.0, trainable=False)
        self._grad_norm_thresh = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        self.tvars = tvars

        self.grads = tf.gradients(cost, tvars)

        grads_clip, self.grad_norm = tf.clip_by_global_norm(
            self.grads, self._grad_norm_thresh)
        if opt_method == 'sgd':
            optimizer = tf.train.GradientDescentOptimizer(self._lr)
            self._train_op = optimizer.apply_gradients(
                zip(grads_clip, tvars),
                global_step=tf.contrib.framework.get_or_create_global_step())

        elif opt_method == 'mom':
            print("using sgd mom")
            optimizer = tf.train.MomentumOptimizer(self._lr, self._mu)
            self._train_op = optimizer.apply_gradients(
                zip(grads_clip, tvars),
                global_step=tf.contrib.framework.get_or_create_global_step())
        elif opt_method == 'adam':
            optimizer = tf.train.AdamOptimizer(self._lr)
            self._train_op = optimizer.apply_gradients(
                zip(grads_clip, tvars),
                global_step=tf.contrib.framework.get_or_create_global_step())
        elif opt_method == 'YF':
            optimizer = YFOptimizer(lr=1.0, mu=0.0)
            self._train_op = optimizer.apply_gradients(zip(self.grads, tvars))
        else:
            raise Exception("optimizer not supported")

        self._new_lr = tf.placeholder(tf.float32,
                                      shape=[],
                                      name="new_learning_rate")
        self._lr_update = tf.assign(self._lr, self._new_lr)

        self._new_mu = tf.placeholder(tf.float32,
                                      shape=[],
                                      name="new_momentum")
        self._mu_update = tf.assign(self._mu, self._new_mu)

        self._new_grad_norm_thresh = tf.placeholder(
            tf.float32, shape=[], name="new_grad_norm_thresh")
        self._grad_norm_thresh_update = tf.assign(self._grad_norm_thresh,
                                                  self._new_grad_norm_thresh)
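
    # A minimal sketch (method names are assumptions, mirroring the standard PTB
    # tutorial) of how the update ops above would be driven from the training loop:
    def assign_lr(self, session, lr_value):
        session.run(self._lr_update, feed_dict={self._new_lr: lr_value})

    def assign_mu(self, session, mu_value):
        session.run(self._mu_update, feed_dict={self._new_mu: mu_value})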
Example 7
def learn(dataset,
          rank=2,
          scale=1.,
          learning_rate=1e-1,
          tol=1e-8,
          epochs=100,
          use_yellowfin=False,
          use_adagrad=False,
          print_freq=1,
          model_save_file=None,
          model_load_file=None,
          batch_size=16,
          num_workers=None,
          lazy_generation=False,
          log_name=None,
          warm_start=None,
          learn_scale=False,
          checkpoint_freq=1000,
          sample=1.,
          subsample=None,
          exponential_rescale=None,
          extra_steps=1,
          use_svrg=False,
          T=10,
          use_hmds=False):
    # Log configuration
    formatter = logging.Formatter('%(asctime)s %(message)s')
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s %(message)s',
        datefmt='%FT%T',
    )
    if log_name is not None:
        logging.info(f"Logging to {log_name}")
        log = logging.getLogger()
        fh = logging.FileHandler(log_name)
        fh.setFormatter(formatter)
        log.addHandler(fh)

    logging.info(f"Commandline {sys.argv}")
    if model_save_file is None: logging.warn("No Model Save selected!")
    G = load_graph.load_graph(dataset)
    GM = nx.to_scipy_sparse_matrix(G)

    # grab scale if warm starting:
    if warm_start:
        scale = pandas.read_csv(warm_start, index_col=0).as_matrix()[0, -1]

    n = G.order()
    logging.info(f"Loaded Graph {dataset} with {n} nodes scale={scale}")

    Z = None

    def collate(ls):
        x, y = zip(*ls)
        return torch.cat(x), torch.cat(y)

    if lazy_generation:
        if subsample is not None:
            z = DataLoader(GraphRowSubSampler(G, scale, subsample),
                           batch_size,
                           shuffle=True,
                           collate_fn=collate)
        else:
            z = DataLoader(GraphRowSampler(G, scale),
                           batch_size,
                           shuffle=True,
                           collate_fn=collate)
        logging.info("Built Data Sampler")
    else:
        Z = gh.build_distance(G,
                              scale,
                              num_workers=int(num_workers) if num_workers
                              is not None else 16)  # load the whole matrix
        logging.info(f"Built distance matrix with {scale} factor")

        if subsample is not None:
            z = DataLoader(GraphRowSubSampler(G, scale, subsample, Z=Z),
                           batch_size,
                           shuffle=True,
                           collate_fn=collate)
        else:
            idx = torch.LongTensor([(i, j) for i in range(n)
                                    for j in range(i + 1, n)])
            Z_sampled = gh.dist_sample_rebuild_pos_neg(
                Z, sample) if sample < 1 else Z
            vals = torch.DoubleTensor(
                [Z_sampled[i, j] for i in range(n) for j in range(i + 1, n)])
            z = DataLoader(TensorDataset(idx, vals),
                           batch_size=batch_size,
                           shuffle=True,
                           pin_memory=torch.cuda.is_available())
        logging.info("Built data loader")

    if model_load_file is not None:
        logging.info(f"Loading {model_load_file}...")
        m = cudaify(torch.load(model_load_file))
        logging.info(
            f"Loaded scale {m.scale.data[0]} {torch.sum(m.w.data)} {m.epoch}")
    else:
        logging.info(f"Creating a fresh model warm_start?={warm_start}")

        m_init = None
        if warm_start:
            # load from DataFrame; assume that the julia combinatorial embedding has been saved
            ws_data = pandas.read_csv(warm_start, index_col=0).as_matrix()
            scale = ws_data[0, ws_data.shape[1] - 1]
            m_init = torch.DoubleTensor(ws_data[:,
                                                range(ws_data.shape[1] - 1)])
        elif use_hmds:
            # m_init = torch.DoubleTensor(mds_warmstart.get_normalized_hyperbolic(mds_warmstart.get_model(dataset,rank,scale)[1]))
            m_init = torch.DoubleTensor(
                mds_warmstart.get_model(dataset, rank, scale)[1])

        logging.info(
            f"\t Warmstarting? {warm_start} {m_init.size() if warm_start else None} {G.order()}"
        )
        m = cudaify(
            Hyperbolic_Emb(G.order(),
                           rank,
                           initialize=m_init,
                           learn_scale=learn_scale,
                           exponential_rescale=exponential_rescale))
        m.normalize()
        m.epoch = 0
    logging.info(
        f"Constructed model with rank={rank} and epochs={m.epoch} isnan={np.any(np.isnan(m.w.cpu().data.numpy()))}"
    )

    #
    # Build the Optimizer
    #
    # TODO: Redo this in a sensible way!!
    #
    opt = torch.optim.SGD(m.parameters(), lr=learning_rate)
    if use_yellowfin:
        from yellowfin import YFOptimizer
        opt = YFOptimizer(m.parameters())

    if use_adagrad:
        opt = torch.optim.Adagrad(m.parameters())

    if use_svrg:
        from svrg import SVRG
        base_opt = torch.optim.Adagrad if use_adagrad else torch.optim.SGD
        opt = SVRG(m.parameters(),
                   lr=learning_rate,
                   T=T,
                   data_loader=z,
                   opt=base_opt)

    logging.info(opt)

    # Log stats from import: when warmstarting, check that it matches Julia's stats
    logging.info(f"*** Initial Checkpoint. Computing Stats")
    major_stats(GM, 1 + m.scale.data[0], n, m, lazy_generation, Z, z)
    logging.info("*** End Initial Checkpoint\n")

    for i in range(m.epoch, m.epoch + epochs):
        l = 0.0
        m.train(True)
        if use_svrg:
            for data in z:

                def closure(data=data, target=None):
                    _data = data if target is None else (data, target)
                    c = m.loss(cu_var(_data))
                    c.backward()
                    return c.data[0]

                l += opt.step(closure)

                # Projection
                m.normalize()

        else:
            opt.zero_grad()  # This is handled by the SVRG.
            for the_step in range(extra_steps):
                # Accumulate the gradient
                for u in z:
                    _loss = m.loss(cu_var(u, requires_grad=False))
                    _loss.backward()
                    l += _loss.data[0]
                Hyperbolic_Parameter.correct_metric(
                    m.parameters())  # NB: THIS IS THE NEW CALL
                # print("Scale before step: ", m.scale.data)
                opt.step()
                # print("Scale after step: ", m.scale.data)
                # Projection
                m.normalize()

                #l += step(m, opt, u).data[0]

        # Logging code
        if l < tol:
            logging.info("Found a {l} solution. Done at iteration {i}!")
            break
        if i % print_freq == 0:
            logging.info(f"{i} loss={l}")
        if i % checkpoint_freq == 0:
            logging.info(f"\n*** Major Checkpoint. Computing Stats and Saving")
            major_stats(GM, 1 + m.scale.data[0], n, m, True, Z, z)
            if model_save_file is not None:
                fname = f"{model_save_file}.{m.epoch}"
                logging.info(
                    f"Saving model into {fname} {torch.sum(m.w.data)} ")
                torch.save(m, fname)
            logging.info("*** End Major Checkpoint\n")
        m.epoch += 1

    logging.info(f"final loss={l}")

    if model_save_file is not None:
        fname = f"{model_save_file}.final"
        logging.info(
            f"Saving model into {fname}-final {torch.sum(m.w.data)} {m.scale.data[0]}"
        )
        torch.save(m, fname)

    major_stats(GM, 1 + m.scale.data[0], n, m, lazy_generation, Z, z)
Example 8
    def _init_graph(self):

        # use a freshly created graph as the default graph for the whole TensorFlow run
        self.graph = tf.Graph()
        with self.graph.as_default():

            tf.set_random_seed(self.random_seed)

            # shape says this is a 2-D matrix; is the first dimension the number of samples? seems so
            # batch_size * 39
            self.feat_index = tf.placeholder(tf.int32,
                                             shape=[None, None],
                                             name="feat_index")  # None * F

            # shape says this is a 2-D matrix: batch_size * 39
            self.feat_value = tf.placeholder(tf.float32,
                                             shape=[None, None],
                                             name="feat_value")  # None * F
            self.label = tf.placeholder(tf.float32,
                                        shape=[None, 1],
                                        name="label")  # None * 1
            self.dropout_keep_fm = tf.placeholder(tf.float32,
                                                  shape=[None],
                                                  name="dropout_keep_fm")
            self.dropout_keep_deep = tf.placeholder(tf.float32,
                                                    shape=[None],
                                                    name="dropout_keep_deep")
            self.train_phase = tf.placeholder(tf.bool, name="train_phase")

            # randomly initialize the weights
            self.weights = self._initialize_weights()

            # print(self.feat_index.shape): (1024, 39)
            # batch_size * 39 * 8 (the dimensions of vi, vj? that seems off; this is the shape of V, and vi, vj should be batch_size * 39 * 1)
            self.embeddings = tf.nn.embedding_lookup(
                self.weights["feature_embeddings"], self.feat_index
            )  # afterwards self.embeddings has shape None * F * K (the embedded dimension)
            # becomes a column vector
            # originally batch_size * 39 ==> batch_size * 39 * 1
            # reshape so the shape matches self.embeddings for the multiply below
            feat_value = tf.reshape(self.feat_value,
                                    shape=[-1, self.field_size, 1])
            print(self.embeddings.shape, feat_value.shape)

            # batch_size * 39 * 8 <== batch_size * 39 * 8, batch_size * 39 * 1
            # the feature values are broadcast
            # from here on this no longer has anything to do with the embedding lookup itself
            self.embeddings = tf.multiply(self.embeddings, feat_value)

            # ----------wide fm: first order term ----------
            # None * 39 * 1, e.g. (1024, 39, 1)
            # look up 39 rows (self.feat_index) from the 259 * 1 matrix self.weights["feature_bias"]
            self.y_first_order = tf.nn.embedding_lookup(
                self.weights["feature_bias"], self.feat_index)  # None * F * 1
            # self.y_first_order_weights = self.y_first_order

            # added temporarily; delete later
            self.y_first_order_tmp = self.y_first_order

            # self.y_first_order and feat_value are both batch_size * 39 * 1
            # after the reduce_sum this is batch_size * 39
            self.y_first_order = tf.reduce_sum(
                tf.multiply(self.y_first_order, feat_value), 2)  # None * F

            # This is a vector that gets concatenated with the second-order term and the high-order DNN output.
            # By the FM formulation it should really output a scalar, but since an inner product is taken after
            # the concatenation, this computation is redundant as a first-order term; the real first-order term
            # comes from that later projection. Still, computing the linear part twice is equivalent to once.
            self.y_first_order = tf.nn.dropout(
                self.y_first_order, self.dropout_keep_fm[0])  # None * F

            # ----------wide fm: second order term ---------------
            # sum_square part
            # batch_size * 8
            self.summed_features_emb = tf.reduce_sum(self.embeddings,
                                                     1)  # None * K
            # batch_size * 8
            self.summed_features_emb_square = tf.square(
                self.summed_features_emb)  # None * K

            # square_sum part
            # batch_size * 8
            self.squared_features_emb = tf.square(self.embeddings)
            # batch_size * 8
            self.squared_sum_features_emb = tf.reduce_sum(
                self.squared_features_emb, 1)  # None * K

            # second order
            # batch_size * 8; each sample is now a vector
            self.y_second_order = 0.5 * tf.subtract(
                self.summed_features_emb_square,
                self.squared_sum_features_emb)  # None * K
            # batch_size * 8
            self.y_second_order = tf.nn.dropout(
                self.y_second_order, self.dropout_keep_fm[1])  # None * K

            # ---------- Deep component ----------
            self.y_deep = tf.reshape(
                self.embeddings,
                shape=[-1,
                       self.field_size * self.embedding_size])  # None * (F*K)
            self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[0])
            for i in range(0, len(self.deep_layers)):
                self.y_deep = tf.add(
                    tf.matmul(self.y_deep, self.weights["layer_%d" % i]),
                    self.weights["bias_%d" % i])  # None * layer[i] * 1
                if self.batch_norm:
                    self.y_deep = self.batch_norm_layer(
                        self.y_deep,
                        train_phase=self.train_phase,
                        scope_bn="bn_%d" % i)  # None * layer[i] * 1
                self.y_deep = self.deep_layers_activation(self.y_deep)
                self.y_deep = tf.nn.dropout(
                    self.y_deep,
                    self.dropout_keep_deep[1 +
                                           i])  # dropout at each Deep layer

            # ---------- DeepFM ----------
            if self.use_fm and self.use_deep:
                # concatenate the first-order and second-order parts (and the deep part)
                # (1024, 39) + (1024, 8) + (1024, 32) ==> (1024, 79)
                concat_input = tf.concat(
                    [self.y_first_order, self.y_second_order, self.y_deep],
                    axis=1)
            elif self.use_fm:
                concat_input = tf.concat(
                    [self.y_first_order, self.y_second_order], axis=1)
            elif self.use_deep:
                concat_input = self.y_deep

            self.concat_input = concat_input

            # this is a scalar (per sample)
            neiji = tf.matmul(concat_input, self.weights["concat_projection"])

            # we obtain a scalar output
            self.out = tf.add(neiji, self.weights["concat_bias"])

            # loss
            if self.loss_type == "logloss":
                self.out = tf.nn.sigmoid(self.out)
                self.loss = tf.losses.log_loss(self.label, self.out)
            elif self.loss_type == "mse":
                self.loss = tf.nn.l2_loss(tf.subtract(self.label, self.out))
            # l2 regularization on weights
            if self.l2_reg > 0:
                self.loss += tf.contrib.layers.l2_regularizer(self.l2_reg)(
                    self.weights["concat_projection"])
                if self.use_deep:
                    for i in range(len(self.deep_layers)):
                        self.loss += tf.contrib.layers.l2_regularizer(
                            self.l2_reg)(self.weights["layer_%d" % i])

            # optimizer
            if self.optimizer_type == "adam":
                self.optimizer = tf.train.AdamOptimizer(
                    learning_rate=self.learning_rate,
                    beta1=0.9,
                    beta2=0.999,
                    epsilon=1e-8).minimize(self.loss)
            elif self.optimizer_type == "adagrad":
                self.optimizer = tf.train.AdagradOptimizer(
                    learning_rate=self.learning_rate,
                    initial_accumulator_value=1e-8).minimize(self.loss)
            elif self.optimizer_type == "gd":
                self.optimizer = tf.train.GradientDescentOptimizer(
                    learning_rate=self.learning_rate).minimize(self.loss)
            elif self.optimizer_type == "momentum":
                self.optimizer = tf.train.MomentumOptimizer(
                    learning_rate=self.learning_rate,
                    momentum=0.95).minimize(self.loss)
            elif self.optimizer_type == "yellowfin":
                self.optimizer = YFOptimizer(learning_rate=self.learning_rate,
                                             momentum=0.0).minimize(self.loss)

            # init
            self.saver = tf.train.Saver()
            init = tf.global_variables_initializer()
            self.sess = self._init_session()
            writer = tf.summary.FileWriter("logs/",
                                           self.sess.graph)  # the first argument is the directory for the generated event files

            self.sess.run(init)
            # print(self.embeddings.shape, feat_value.shape)

            # count the number of weights (number of params)
            total_parameters = 0
            for variable in self.weights.values():
                shape = variable.get_shape()
                variable_parameters = 1
                for dim in shape:
                    variable_parameters *= dim.value
                total_parameters += variable_parameters
            if self.verbose > 0:
                print("#params: %d" % total_parameters)
Example 9
    def _init_graph(self):
        self.graph = tf.Graph()
        with self.graph.as_default():

            tf.set_random_seed(self.random_seed)

            self.feat_index = tf.placeholder(tf.int32,
                                             shape=[None, None],
                                             name="feat_index")  # None * F
            self.feat_value = tf.placeholder(tf.float32,
                                             shape=[None, None],
                                             name="feat_value")  # None * F
            self.label = tf.placeholder(tf.float32,
                                        shape=[None, 1],
                                        name="label")  # None * 1
            self.dropout_keep_fm = tf.placeholder(tf.float32,
                                                  shape=[None],
                                                  name="dropout_keep_fm")
            self.dropout_keep_deep = tf.placeholder(tf.float32,
                                                    shape=[None],
                                                    name="dropout_keep_deep")
            self.train_phase = tf.placeholder(tf.bool, name="train_phase")

            self.weights = self._initialize_weights()

            # model
            self.embeddings = tf.nn.embedding_lookup(
                self.weights["feature_embeddings"],
                self.feat_index)  # None * F * K
            feat_value = tf.reshape(self.feat_value,
                                    shape=[-1, self.field_size, 1])
            self.embeddings = tf.multiply(self.embeddings, feat_value)

            # ---------- first order term ----------
            self.y_first_order = tf.nn.embedding_lookup(
                self.weights["feature_bias"], self.feat_index)  # None * F * 1
            self.y_first_order = tf.reduce_sum(
                tf.multiply(self.y_first_order, feat_value), 2)  # None * F
            self.y_first_order = tf.nn.dropout(
                self.y_first_order, self.dropout_keep_fm[0])  # None * F

            # ---------- second order term ---------------
            # sum_square part
            self.summed_features_emb = tf.reduce_sum(self.embeddings,
                                                     1)  # None * K
            self.summed_features_emb_square = tf.square(
                self.summed_features_emb)  # None * K

            # square_sum part
            self.squared_features_emb = tf.square(self.embeddings)
            self.squared_sum_features_emb = tf.reduce_sum(
                self.squared_features_emb, 1)  # None * K

            # second order
            self.y_second_order = 0.5 * tf.subtract(
                self.summed_features_emb_square,
                self.squared_sum_features_emb)  # None * K
            self.y_second_order = tf.nn.dropout(
                self.y_second_order, self.dropout_keep_fm[1])  # None * K

            # high order
            if self.use_fm and self.use_deep:
                z = tf.layers.Dense(
                    self.embedding_size,
                    kernel_initializer=tf.glorot_uniform_initializer(
                        seed=2017),
                    dtype=tf.float32,
                    bias_initializer=tf.zeros_initializer())(
                        self.y_second_order)
                z = tf.nn.relu(z)
                y_second_order = tf.nn.dropout(z, 0.5)

            if self.use_xfm:
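                # This block appears to follow xDeepFM's Compressed Interaction
                # Network (CIN): each layer forms, per embedding dimension, the
                # outer products between the original field embeddings and the
                # previous layer's feature maps, compresses them with the 1x1
                # conv filters f_<idx>, and the per-layer outputs are later
                # summed over the embedding dimension and concatenated.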
                field_nums = [self.field_size]
                final_len = 0
                # self.embeddings = None * F * K
                hidden_nn_layers = [self.embeddings]
                final_result = []
                split_tensor0 = tf.split(hidden_nn_layers[-1],
                                         self.embedding_size * [1], 2)

                for idx, layer_size in enumerate([self.field_size] * 3):
                    split_tensor = tf.split(hidden_nn_layers[-1],
                                            self.embedding_size * [1], 2)
                    dot_result_m = tf.matmul(split_tensor0,
                                             split_tensor,
                                             transpose_b=True)
                    dot_result_o = tf.reshape(
                        dot_result_m,
                        shape=[
                            self.embedding_size, -1,
                            field_nums[0] * field_nums[-1]
                        ])
                    dot_result = tf.transpose(dot_result_o, perm=[1, 0, 2])

                    filters = tf.get_variable(
                        name="f_" + str(idx),
                        shape=[1, field_nums[-1] * field_nums[0], layer_size],
                        dtype=tf.float32)
                    curr_out = tf.nn.conv1d(dot_result,
                                            filters=filters,
                                            stride=1,
                                            padding='VALID')

                    # if bias:
                    b = tf.get_variable(name="f_b" + str(idx),
                                        shape=[layer_size],
                                        dtype=tf.float32,
                                        initializer=tf.zeros_initializer())
                    curr_out = tf.nn.bias_add(curr_out, b)

                    curr_out = tf.nn.relu(curr_out)
                    curr_out = tf.transpose(curr_out, perm=[0, 2, 1])

                    direct_connect = curr_out
                    next_hidden = curr_out
                    final_len += layer_size
                    field_nums.append(int(layer_size))

                    final_result.append(direct_connect)
                    hidden_nn_layers.append(next_hidden)

                result = tf.concat(final_result, axis=1)
                result = tf.reduce_sum(result, -1)

                # res net
                w_nn_output1 = tf.get_variable(name='w_nn_output1',
                                               shape=[final_len, 128],
                                               dtype=tf.float32)
                b_nn_output1 = tf.get_variable(
                    name='b_nn_output1',
                    shape=[128],
                    dtype=tf.float32,
                    initializer=tf.zeros_initializer())

                exFM_out0 = tf.nn.xw_plus_b(result, w_nn_output1, b_nn_output1)
                exFM_out1 = tf.nn.dropout(exFM_out0, 0.3)
                exFM_out1 = tf.nn.relu(exFM_out1)
                w_nn_output2 = tf.get_variable(
                    name='w_nn_output2',
                    shape=[128 + final_len, self.embedding_size],
                    dtype=tf.float32)
                b_nn_output2 = tf.get_variable(
                    name='b_nn_output2',
                    shape=[self.embedding_size],
                    dtype=tf.float32,
                    initializer=tf.zeros_initializer())

                exFM_in = tf.concat([exFM_out1, result],
                                    axis=1,
                                    name="user_emb")
                self.exFM_out = tf.nn.xw_plus_b(exFM_in, w_nn_output2,
                                                b_nn_output2)

            # ---------- Deep component ----------
            self.y_deep = tf.reshape(
                self.embeddings,
                shape=[-1,
                       self.field_size * self.embedding_size])  # None * (F*K)
            self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[0])
            for i in range(0, len(self.deep_layers)):
                self.y_deep = tf.add(
                    tf.matmul(self.y_deep, self.weights["layer_%d" % i]),
                    self.weights["bias_%d" % i])  # None * layer[i] * 1
                if self.batch_norm:
                    self.y_deep = self.batch_norm_layer(
                        self.y_deep,
                        train_phase=self.train_phase,
                        scope_bn="bn_%d" % i)  # None * layer[i] * 1
                self.y_deep = self.deep_layers_activation(self.y_deep)
                self.y_deep = tf.nn.dropout(
                    self.y_deep,
                    self.dropout_keep_deep[1 +
                                           i])  # dropout at each Deep layer

            # ---------- DeepFM ----------
            if self.use_fm and self.use_deep:
                concat_input = tf.concat(
                    [self.y_first_order, self.y_second_order, self.y_deep],
                    axis=1)
            elif self.use_xfm and self.use_deep:
                concat_input = tf.concat(
                    [self.y_first_order, self.exFM_out, self.y_deep], axis=1)
            elif self.use_fm:
                concat_input = tf.concat(
                    [self.y_first_order, self.y_second_order], axis=1)
            elif self.use_deep:
                concat_input = self.y_deep

            self.out = tf.add(
                tf.matmul(concat_input, self.weights["concat_projection"]),
                self.weights["concat_bias"])

            # loss
            if self.loss_type == "logloss":
                self.out = tf.nn.sigmoid(self.out)
                self.loss = tf.losses.log_loss(self.label, self.out)
            elif self.loss_type == "mse":
                self.loss = tf.nn.l2_loss(tf.subtract(self.label, self.out))
            # l2 regularization on weights
            if self.l2_reg > 0:
                self.loss += tf.contrib.layers.l2_regularizer(self.l2_reg)(
                    self.weights["concat_projection"])
                if self.use_deep:
                    for i in range(len(self.deep_layers)):
                        self.loss += tf.contrib.layers.l2_regularizer(
                            self.l2_reg)(self.weights["layer_%d" % i])

            # optimizer
            if self.optimizer_type == "adam":
                self.optimizer = tf.train.AdamOptimizer(
                    learning_rate=self.learning_rate,
                    beta1=0.9,
                    beta2=0.999,
                    epsilon=1e-8).minimize(self.loss)
            elif self.optimizer_type == "adagrad":
                self.optimizer = tf.train.AdagradOptimizer(
                    learning_rate=self.learning_rate,
                    initial_accumulator_value=1e-8).minimize(self.loss)
            elif self.optimizer_type == "gd":
                self.optimizer = tf.train.GradientDescentOptimizer(
                    learning_rate=self.learning_rate).minimize(self.loss)
            elif self.optimizer_type == "momentum":
                self.optimizer = tf.train.MomentumOptimizer(
                    learning_rate=self.learning_rate,
                    momentum=0.95).minimize(self.loss)
            elif self.optimizer_type == "yellowfin":
                self.optimizer = YFOptimizer(learning_rate=self.learning_rate,
                                             momentum=0.0).minimize(self.loss)

            # init
            self.model_path = './model'
            self.saver = tf.train.Saver()
            init = tf.global_variables_initializer()
            self.sess = self._init_session()
            self.sess.run(init)

            # number of params
            total_parameters = 0
            for variable in self.weights.values():
                shape = variable.get_shape()
                variable_parameters = 1
                for dim in shape:
                    variable_parameters *= dim.value
                total_parameters += variable_parameters
            if self.verbose > 0:
                print("#params: %d" % total_parameters)
Example 10
    def _init_graph(self):
        self.graph = tf.Graph()
        with self.graph.as_default():
            tf.set_random_seed(self.rand_seed)

            self.feat_index = tf.placeholder(tf.int32,
                                             shape=[None, None],
                                             name="feat_index")
            self.feat_value = tf.placeholder(tf.float32,
                                             shape=[None, None],
                                             name="feat_value")
            self.label = tf.placeholder(tf.float32,
                                        shape=[None, 1],
                                        name="label")
            self.dropout_keep_fm = tf.placeholder(tf.float32,
                                                  shape=[None],
                                                  name="dropout_keep_fm")
            self.dropout_keep_deep = tf.placeholder(tf.float32,
                                                    shape=[None],
                                                    name="dropout_keep_deep")
            self.train_phase = tf.placeholder(tf.bool, name="train_phase")

            self.weight = self._init_weight()

            # build the DeepFM model graph
            self.embeddings = tf.nn.embedding_lookup(
                self.weight['feat_embeddings'], self.feat_index)
            feat_value = tf.reshape(self.feat_value,
                                    shape=[-1, self.field_size, 1])
            self.embeddings = tf.multiply(self.embeddings,
                                          feat_value)  # None*F*K

            # first-order term
            self.y_first_order = tf.nn.embedding_lookup(
                self.weight["feat_bias"], self.feat_index)
            self.y_first_order = tf.reduce_sum(
                tf.multiply(self.y_first_order, feat_value), 2)
            self.y_first_order = tf.nn.dropout(self.y_first_order,
                                               self.dropout_keep_fm[0])

            # second-order term
            # sum first, then square
            sum_feat_emb = tf.reduce_sum(self.embeddings, 1)
            self.sum_square_feat_emb = tf.square(sum_feat_emb)

            # square first, then sum
            sqrt_feat_emb = tf.square(self.embeddings)
            self.sqrt_sum_feat_emb = tf.reduce_sum(sqrt_feat_emb, 1)

            # second-order term
            self.y_second_order = 0.5 * tf.subtract(self.sum_square_feat_emb,
                                                    self.sqrt_sum_feat_emb)
            self.y_second_order = tf.nn.dropout(self.y_second_order,
                                                self.dropout_keep_fm[1])

            # deep component
            self.y_deep = tf.reshape(
                self.embeddings, [-1, self.field_size * self.embedding_size])
            self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[0])
            for i in range(0, len(self.deep_layer)):
                self.y_deep = tf.add(
                    tf.matmul(self.y_deep, self.weight["layer_%d" % i]),
                    self.weight["bias_%d" % i])
                if self.batch_norm:
                    self.y_deep = self.batch_norm_layer(self.y_deep,
                                                        self.train_phase,
                                                        scope_bn="bn_%d" % i)
                self.y_deep = self.deep_layer_activation(self.y_deep)

            # concatenate the three output vectors
            if self.use_deep and self.use_fm:
                concat_input = tf.concat(
                    [self.y_first_order, self.y_second_order, self.y_deep],
                    axis=1)
            elif self.use_fm:
                concat_input = tf.concat(
                    [self.y_first_order, self.y_second_order], axis=1)
            elif self.use_deep:
                concat_input = self.y_deep
            self.out = tf.add(
                tf.matmul(concat_input, self.weight['concat_projection']),
                self.weight['concat_bias'])

            # loss
            if self.loss_type == 'logloss':
                self.out = tf.sigmoid(self.out)
                self.loss = tf.losses.log_loss(self.label, self.out)
            elif self.loss_type == 'mse':
                self.loss = tf.nn.l2_loss(tf.subtract(self.label, self.out))

            # optional L2 regularization
            if self.l2_reg > 0:
                self.loss += tf.contrib.layers.l2_regularizer(self.l2_reg)(
                    self.weight["concat_projection"])
                if self.use_deep:
                    for i in range(len(self.deep_layer)):
                        self.loss += tf.contrib.layers.l2_regularizer(
                            self.l2_reg)(self.weight["layer_%d" % i])

            # optimizer
            if self.opt_type == 'adam':
                self.opt = tf.train.AdamOptimizer(self.learning_rate).minimize(
                    self.loss)
            elif self.opt_type == 'adagrade':
                self.opt = tf.train.AdagradOptimizer(
                    self.learning_rate).minimize(self.loss)
            elif self.opt_type == 'gd':
                self.opt = tf.train.GradientDescentOptimizer(
                    self.learning_rate).minimize(self.loss)
            elif self.opt_type == 'momentum':
                # MomentumOptimizer requires a momentum value; 0.95 is assumed here,
                # matching the momentum used in the later DeepFM example.
                self.opt = tf.train.MomentumOptimizer(
                    self.learning_rate, momentum=0.95).minimize(self.loss)
            elif self.opt_type == 'yellowfin':
                self.opt = YFOptimizer(
                    learning_rate=self.learning_rate).minimize(self.loss)

            # init
            self.saver = tf.train.Saver()
            init = tf.global_variables_initializer()
            self.sess = self._init_session()
            self.sess.run(init)

            # print the number of parameters
            total_parameters = 0
            for variable in self.weight.values():
                shape = variable.get_shape()
                variable_parameters = 1
                for dim in shape:
                    variable_parameters *= dim.value
                total_parameters += variable_parameters
            if self.verbose > 0:
                print("# params: %d" % total_parameters)
Esempio n. 11
0
            for p in m2.parameters():
                p.requires_grad = args.learn_bn
    if args.learn_inhibition:
        for p in model.module.parameters():
            p.requires_grad=True
    
    params = trainableParams(model)
    print('    Total params: %.2fM' % (sum(p.numel() for p in model.parameters() if p.requires_grad)/1000000.0))
    if opt_ == 'sgd':
        print('optimizer.... - sgd')
        optimizer = optim.SGD(params , lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
    elif opt_ == 'adam':
        optimizer = optim.Adam(params)
    elif opt_ == 'yf':
        print('USING YF OPTIMIZER')
        optimizer = YFOptimizer(
            params, lr=args.lr, mu=0.0, weight_decay=args.weight_decay, clip_thresh=2.0, curv_win_width=20)
        optimizer._sparsity_debias = False
    else:
        raise Exception('unsupported optimizer type',opt_)

    nParamsPath = os.path.join(args.checkpoint, 'n_params.txt')
    with open(nParamsPath, 'w') as f:
        s1 = 'active_params {} \n'.format(sum(p.numel() for p in model.parameters() if p.requires_grad))
        f.write(s1)
        s2 = 'total_params {} \n'.format(sum(p.numel() for p in model.parameters()))
        f.write(s2)
    if args.print_params_and_exit:
        exit()

    # Resume
    title = 'cifar-10-' + args.arch
Esempio n. 12
0
def train_eval_model(graph_hyper_params):
    def construct_train_data(pos_train_data, neg_train_data, graph_hyper_params):
        # global pos_train_data, neg_train_data, start_neg
        pos_len, neg_len = len(pos_train_data), len(neg_train_data)
        # print start_neg, pos_len, neg_len
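        # Take a window of negatives of length neg_size * pos_len starting at offset
        # neg_start * pos_len; when the window runs past the end of neg_train_data,
        # wrap around to the front of the frame (the branch printing 'final' below).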
        if graph_hyper_params['neg_start'] * pos_len + graph_hyper_params['neg_size'] * pos_len < neg_len:
            this_neg_train_data = neg_train_data[graph_hyper_params['neg_start'] * pos_len: \
                                                 graph_hyper_params['neg_start'] * pos_len + graph_hyper_params[
                                                     'neg_size'] * pos_len]
        else:
            print 'final ! final ! final ! final !'
            this_neg_train_data = pd.concat([neg_train_data[graph_hyper_params['neg_start'] * pos_len:], neg_train_data[: pos_len - max(0,neg_len - graph_hyper_params['neg_start'] * pos_len)]])
        train_data = pd.concat([pos_train_data, this_neg_train_data])
        return shuffle(train_data)

    print graph_hyper_params

    print 'read data start !'
    pos_train_data, neg_train_data, predict_data1, predict_data2, user_data, ad_data, feature_conf_dict, uid_map, aid_map = get_prod_dataset(graph_hyper_params['formal'])
    print 'read data done !'

    # re-split into train / dev
    o_dev_size = graph_hyper_params['o_dev_size']
    dev_data = pd.concat([pos_train_data[:o_dev_size], neg_train_data[:o_dev_size]])
    pos_train_data, neg_train_data = pos_train_data[o_dev_size:], neg_train_data[o_dev_size:]
    print 'dev_size:', len(dev_data)
    print 'pos-neg-len:', len(pos_train_data), len(neg_train_data)

    train_data = construct_train_data(pos_train_data, neg_train_data, graph_hyper_params)
    if graph_hyper_params['only_train']:
        if graph_hyper_params['formal']:
            formal_set = set(list(train_data['uid']) + list(dev_data['uid']))
        else:
            formal_set = set(list(train_data['uid']) + list(dev_data['uid']) + [1, 2, 3, 4])
        user_data = user_data[user_data['uid'].isin(formal_set)]
    import gc
    gc.collect()

    print 'map row start'
    uid_map_row, aid_map_row = dict(zip(user_data['uid'].values, np.arange(len(user_data)))), dict(zip(ad_data['aid'].values, np.arange(len(ad_data))))
    print 'map row end'
    print feature_conf_dict


    graph = tf.Graph()
    with graph.as_default():
        # handle creativeSize, the only continuous feature
        if graph_hyper_params['creativeSize_pro'] == 'min_max':
            print 'min-max norm creativeSize', ad_data['creativeSize'].max(), ad_data['creativeSize'].min()
            norm_cs = (ad_data['creativeSize'] * 1.0 - ad_data['creativeSize'].min()) / (
                    ad_data['creativeSize'].max() - ad_data['creativeSize'].min())
            ad_data = ad_data.drop(['creativeSize'], axis=1)
            ad_data['creativeSize'] = norm_cs
            creativesize_p = tf.placeholder(tf.float32, [None, 1], name="creativeSize")
        elif graph_hyper_params['creativeSize_pro'] == 'li_san':
            print 'discretizing creativeSize'
            sh = ShrinkSep()
            ad_data['creativeSize'] = ad_data['creativeSize'].apply(sh)
            feature_conf_dict['creativeSize'] = len(sh.d) + 1
            creativesize_p = tf.placeholder(tf.int32, [None, 1], name="creativeSize")
        else:
            print 'no process creativeSize'
        # ****************************************************************** place holder start
        uid_p = tf.placeholder(tf.int32, [None, 1], name="uid")
        lbs_p = tf.placeholder(tf.int32, [None, 1], name="LBS")
        age_p = tf.placeholder(tf.int32, [None, 1], name="age")

        carrier_p = tf.placeholder(tf.int32, [None, 1], name="carrier")
        consumptionability_p = tf.placeholder(tf.int32, [None, 1], name="consumptionAbility")
        education_p = tf.placeholder(tf.int32, [None, 1], name="education")
        gender_p = tf.placeholder(tf.int32, [None, 1], name="gender")
        house_p = tf.placeholder(tf.int32, [None, 1], name="house")
        os_p = tf.placeholder(tf.int32, [None, 1], name="os")
        ct_p = tf.placeholder(tf.int32, [None, 1], name="ct")
        # marriagestatus_p = tf.placeholder(tf.int32, [None, 1], name="marriageStatus")


        appidaction_index_p = tf.placeholder(tf.int32, [None, feature_conf_dict['appIdAction'][1]], name="appidaction_index")
        appidaction_val_p = tf.placeholder(tf.float32, [None, 1, feature_conf_dict['appIdAction'][1]], name="appidaction_val")
        appIdInstall_index_p = tf.placeholder(tf.int32, [None, feature_conf_dict['appIdInstall'][1]], name="appIdInstall_index")
        appIdInstall_val_p = tf.placeholder(tf.float32, [None, 1, feature_conf_dict['appIdInstall'][1]], name="appIdInstall_val")


        marriagestatus_index_p = tf.placeholder(tf.int32, [None, feature_conf_dict['marriageStatus'][0]], name="marriageStatus_index")
        marriagestatus_val_p = tf.placeholder(tf.float32, [None, 1, feature_conf_dict['marriageStatus'][0]], name="marriageStatus_val")
        interest1_index_p = tf.placeholder(tf.int32, [None, feature_conf_dict['interest1'][0]], name="interest1_index")
        interest1_val_p = tf.placeholder(tf.float32, [None, 1, feature_conf_dict['interest1'][0]], name="interest1_val")
        interest2_index_p = tf.placeholder(tf.int32, [None, feature_conf_dict['interest2'][0]], name="interest2_index")
        interest2_val_p = tf.placeholder(tf.float32, [None, 1, feature_conf_dict['interest2'][0]], name="interest2_val")
        interest3_index_p = tf.placeholder(tf.int32, [None, feature_conf_dict['interest3'][0]], name="interest3_index")
        interest3_val_p = tf.placeholder(tf.float32, [None, 1, feature_conf_dict['interest3'][0]], name="interest3_val")
        interest4_index_p = tf.placeholder(tf.int32, [None, feature_conf_dict['interest4'][0]], name="interest4_index")
        interest4_val_p = tf.placeholder(tf.float32, [None, 1, feature_conf_dict['interest4'][0]], name="interest4_val")
        interest5_index_p = tf.placeholder(tf.int32, [None, feature_conf_dict['interest5'][0]], name="interest5_index")
        interest5_val_p = tf.placeholder(tf.float32, [None, 1, feature_conf_dict['interest5'][0]], name="interest5_val")

        # kmeans type
        # clu_200_p = tf.placeholder(tf.int32, [None, 1], name="clu_200_p")
        # clu_400_p = tf.placeholder(tf.int32, [None, 1], name="clu_400_p")


        kw1_index_p = tf.placeholder(tf.int32, [None, feature_conf_dict['kw1'][1]], name="kw1_index")
        kw1_val_p = tf.placeholder(tf.float32, [None, 1, feature_conf_dict['kw1'][1]], name="kw1_val")
        kw2_index_p = tf.placeholder(tf.int32, [None, feature_conf_dict['kw2'][1]], name="kw2_index")
        kw2_val_p = tf.placeholder(tf.float32, [None, 1, feature_conf_dict['kw2'][1]], name="kw2_val")
        kw3_index_p = tf.placeholder(tf.int32, [None, feature_conf_dict['kw3'][1]], name="kw3_index")
        kw3_val_p = tf.placeholder(tf.float32, [None, 1, feature_conf_dict['kw3'][1]], name="kw3_val")

        topic1_index_p = tf.placeholder(tf.int32, [None, feature_conf_dict['topic1'][1]], name="topic1_index")
        topic1_val_p = tf.placeholder(tf.float32, [None, 1, feature_conf_dict['topic1'][1]], name="topic1_val")
        topic2_index_p = tf.placeholder(tf.int32, [None, feature_conf_dict['topic2'][1]], name="topic2_index")
        topic2_val_p = tf.placeholder(tf.float32, [None, 1, feature_conf_dict['topic2'][1]], name="topic2_val")
        topic3_index_p = tf.placeholder(tf.int32, [None, feature_conf_dict['topic3'][1]], name="topic3_index")
        topic3_val_p = tf.placeholder(tf.float32, [None, 1, feature_conf_dict['topic3'][1]], name="topic3_val")

        aid_p = tf.placeholder(tf.int32, [None, 1], name="aid")
        advertiserid_p = tf.placeholder(tf.int32, [None, 1], name="advertiserId")
        campaignid_p = tf.placeholder(tf.int32, [None, 1], name="campaignId")
        creativeid_p = tf.placeholder(tf.int32, [None, 1], name="creativeId")
        adcategoryid_p = tf.placeholder(tf.int32, [None, 1], name="adCategoryId")
        productid_p = tf.placeholder(tf.int32, [None, 1], name="productId")
        producttype_p = tf.placeholder(tf.int32, [None, 1], name="productType")

        true_label = tf.placeholder(tf.float32, [None, 1], name="true_label")
        # ****************************************************************** place holder end

        pred_val, model_loss, network_params = inference(uid_p, lbs_p, age_p, carrier_p, consumptionability_p, education_p,
                                                         gender_p, house_p, os_p, ct_p, marriagestatus_index_p, marriagestatus_val_p, appidaction_index_p, appidaction_val_p, appIdInstall_index_p,
                                                         appIdInstall_val_p, interest1_index_p, interest1_val_p, interest2_index_p, interest2_val_p, interest3_index_p, interest3_val_p, interest4_index_p,
                                                         interest4_val_p, interest5_index_p, interest5_val_p, kw1_index_p, kw1_val_p, kw2_index_p, kw2_val_p,
                                                         kw3_index_p, kw3_val_p, topic1_index_p, topic1_val_p, topic2_index_p, topic2_val_p, topic3_index_p,
                                                         topic3_val_p, aid_p, advertiserid_p, campaignid_p, creativeid_p, adcategoryid_p, productid_p, producttype_p, creativesize_p, true_label, feature_conf_dict,
                                                         graph_hyper_params)


        global_step = tf.Variable(0, name="global_step", trainable=False)
        train_step = None
        learning_rate = tf.Variable(float(graph_hyper_params['learn_rate']), trainable=False, dtype=tf.float32)
        learning_rate_decay_op = learning_rate.assign(learning_rate * 0.5)
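        # Running learning_rate_decay_op halves the learning-rate variable; it is
        # defined here but never actually run in the training loop below.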
        if graph_hyper_params['opt'] == 'adam':
            train_step = tf.train.AdamOptimizer(learning_rate).minimize(model_loss, global_step=global_step)
        elif graph_hyper_params['opt'] == 'adgrad':
            train_step = tf.train.AdagradOptimizer(learning_rate).minimize(model_loss, global_step=global_step)
        elif graph_hyper_params['opt'] == 'adadelta':
            train_step = tf.train.AdadeltaOptimizer(learning_rate).minimize(model_loss, global_step=global_step)
        elif graph_hyper_params['opt'] == 'ftrl':
            train_step = tf.train.FtrlOptimizer(learning_rate).minimize(model_loss, global_step=global_step)
        elif graph_hyper_params['opt'] == 'sgd':
            train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(model_loss, global_step=global_step)
        elif graph_hyper_params['opt'] == "yellowfin":
            train_step = YFOptimizer(learning_rate=learning_rate, momentum=0.0).minimize(model_loss, global_step=global_step)
        else:
            print 'No optimizer !'

        time_now = 'model_' + str(graph_hyper_params['model']) + datetime.now().strftime("_%Y_%m_%d_%H_%M_%S")
        checkpoint_dir = os.path.abspath("./checkpoints/dmf_tencent/" + time_now)
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)


        def get_fed_dict(b_data, split_vector_data, feature_conf_dict, predict=False):
            if graph_hyper_params['formal']:
                aid_list = b_data['aid'].values
                uid_list = b_data['uid'].values
            else:
                if len(b_data) == 4:
                    aid_list, uid_list = [1, 2, 3, 4], [1, 2, 3, 4]
                elif len(b_data) == 3:
                    aid_list, uid_list = [1, 2, 3], [1, 2, 3]
                else:
                    aid_list, uid_list = [1], [1]

            # print 11
            # d1 = datetime.now()
            b_u_d, b_a_d = [], []
            for b_uid in uid_list:
                b_u_d.append(user_data.iloc[uid_map_row[b_uid]])
            for b_aid in aid_list:
                b_a_d.append(ad_data.iloc[aid_map_row[b_aid]])
            b_u_d = pd.concat(b_u_d, axis=1).transpose()
            b_a_d = pd.concat(b_a_d, axis=1).transpose()
            # d3 = datetime.now()

            # print 12
            # pd.concat([data.iloc[1].to_frame(), data.iloc[2].to_frame()], axis=1).transpose()
            fed_dict = {}
            fed_dict[uid_p] = np.expand_dims(b_u_d['uid'], axis=1)
            fed_dict[lbs_p] = np.expand_dims(b_u_d['LBS'], axis=1)
            fed_dict[age_p] = np.expand_dims(b_u_d['age'], axis=1)
            fed_dict[carrier_p] = np.expand_dims(b_u_d['carrier'], axis=1)
            fed_dict[consumptionability_p] = np.expand_dims(b_u_d['consumptionAbility'], axis=1)
            fed_dict[education_p] = np.expand_dims(b_u_d['education'], axis=1)
            fed_dict[gender_p] = np.expand_dims(b_u_d['gender'], axis=1)
            fed_dict[house_p] = np.expand_dims(b_u_d['house'], axis=1)
            fed_dict[os_p] = np.expand_dims(b_u_d['os'], axis=1)
            fed_dict[ct_p] = np.expand_dims(b_u_d['ct'], axis=1)
            # fed_dict[marriagestatus_p] = np.expand_dims(b_u_d['marriageStatus'], axis=1)
            # print 121
            appidaction_li = split_vector_data(b_u_d['appIdAction'])
            # print 1212
            fed_dict[appidaction_index_p], fed_dict[appidaction_val_p] = appidaction_li[0], appidaction_li[1]
            appIdInstall_li = split_vector_data(b_u_d['appIdInstall'])
            fed_dict[appIdInstall_index_p], fed_dict[appIdInstall_val_p] = appIdInstall_li[0], appIdInstall_li[1]
            # print 122
            marriagestatus_li = split_vector_data(b_u_d['marriageStatus'], interest='marriageStatus', feature_config=feature_conf_dict)
            fed_dict[marriagestatus_index_p], fed_dict[marriagestatus_val_p] = marriagestatus_li[0], marriagestatus_li[1]
            interest1_li = split_vector_data(b_u_d['interest1'], interest='interest1', feature_config=feature_conf_dict)
            fed_dict[interest1_index_p], fed_dict[interest1_val_p]  = interest1_li[0], interest1_li[1]
            interest2_li = split_vector_data(b_u_d['interest2'], interest='interest2', feature_config=feature_conf_dict)
            fed_dict[interest2_index_p], fed_dict[interest2_val_p] = interest2_li[0], interest2_li[1]
            interest3_li = split_vector_data(b_u_d['interest3'], interest='interest3', feature_config=feature_conf_dict)
            fed_dict[interest3_index_p], fed_dict[interest3_val_p] = interest3_li[0], interest3_li[1]
            interest4_li = split_vector_data(b_u_d['interest4'], interest='interest4', feature_config=feature_conf_dict)
            fed_dict[interest4_index_p], fed_dict[interest4_val_p] = interest4_li[0], interest4_li[1]
            interest5_li = split_vector_data(b_u_d['interest5'], interest='interest5', feature_config=feature_conf_dict)
            fed_dict[interest5_index_p], fed_dict[interest5_val_p] = interest5_li[0], interest5_li[1]
            # print 123
            kw1_li = split_vector_data(b_u_d['kw1'])
            fed_dict[kw1_index_p], fed_dict[kw1_val_p] = kw1_li[0], kw1_li[1]
            kw2_li = split_vector_data(b_u_d['kw2'])
            fed_dict[kw2_index_p], fed_dict[kw2_val_p] = kw2_li[0], kw2_li[1]
            kw3_li = split_vector_data(b_u_d['kw3'])
            fed_dict[kw3_index_p], fed_dict[kw3_val_p] = kw3_li[0], kw3_li[1]
            # print 124
            topic1_li = split_vector_data(b_u_d['topic1'])
            fed_dict[topic1_index_p], fed_dict[topic1_val_p] = topic1_li[0], topic1_li[1]
            topic2_li = split_vector_data(b_u_d['topic2'])
            fed_dict[topic2_index_p], fed_dict[topic2_val_p] = topic2_li[0], topic2_li[1]
            topic3_li = split_vector_data(b_u_d['topic3'])
            fed_dict[topic3_index_p], fed_dict[topic3_val_p] = topic3_li[0], topic3_li[1]
            # print 125

            # # ad
            fed_dict[aid_p] = np.expand_dims(b_a_d['aid'], axis=1)
            fed_dict[advertiserid_p] = np.expand_dims(b_a_d['advertiserId'], axis=1)
            fed_dict[campaignid_p] = np.expand_dims(b_a_d['campaignId'], axis=1)
            fed_dict[creativeid_p] = np.expand_dims(b_a_d['creativeId'], axis=1)
            fed_dict[adcategoryid_p] = np.expand_dims(b_a_d['adCategoryId'], axis=1)
            fed_dict[productid_p] = np.expand_dims(b_a_d['productId'], axis=1)
            fed_dict[producttype_p] = np.expand_dims(b_a_d['productType'], axis=1)

            # print 13
            # fed_dict[creativesize_p] = np.expand_dims(b_a_d['creativeSize'], axis=1)
            if graph_hyper_params['creativeSize_pro'] == 'min_max':
                fed_dict[creativesize_p] = np.expand_dims(b_a_d['creativeSize'], axis=1).astype(np.float32)
            elif graph_hyper_params['creativeSize_pro'] == 'li_san':
                fed_dict[creativesize_p] = np.expand_dims(b_a_d['creativeSize'], axis=1)
            else:
                print 'wrong feed'

            # label
            # print 14
            if not predict:
                fed_dict[true_label] = np.expand_dims(b_data['label'].values, axis=1).astype(np.float32)
            # print 15
            # d4 = datetime.now()
            # print d2-d1, d3-d2, d4-d3
            # print fed_dict[true_label]
            # print len(fed_dict[true_label]), len(fed_dict[aid_p]), len(fed_dict[uid_p]),
            return fed_dict

        def eval_on_dev(split_vector_data):
            e_b_s = len(dev_data) / graph_hyper_params['batch_size']
            auc_true, auc_pre = [], []
            # auc = []
            for index in tqdm(range(e_b_s)):
                start = index * graph_hyper_params['batch_size']
                end = (index + 1) * graph_hyper_params['batch_size'] if (index + 1) * graph_hyper_params['batch_size'] < len(dev_data) else len(dev_data)
                b_dev_data = dev_data[start:end]
                fed_dict = get_fed_dict(b_dev_data, split_vector_data, feature_conf_dict)
                pred_value, pre_pred_value, final_vec, uu, vv = sess.run([pred_val, network_params[0], network_params[1], network_params[2], network_params[3]], feed_dict=fed_dict)

                pre_real_val = np.array(pred_value).reshape((-1))
                auc_true = auc_true + list(b_dev_data['label'].values)
                auc_pre = auc_pre + pre_real_val.tolist()

                if True in np.isnan(pre_real_val):
                    print 'contain nan: ', np.array(pre_pred_value).reshape((-1))
                    print np.array(final_vec).reshape((-1))
                    print np.array(uu).reshape((-1))
                    print np.array(vv).reshape((-1))

                # auc.append()
            # auc_pre = np.array(auc_pre)
            # auc_pre = np.exp(auc_pre) / np.exp(auc_pre).sum()
            # print auc_true
            # print auc_pre
            fpr, tpr, thresholds = metrics.roc_curve(auc_true, auc_pre, pos_label=1)
            auc_v, gni = metrics.auc(fpr, tpr), gini_norm(auc_true, auc_pre)

            auc_pre_2 = np.array(auc_pre)
            auc_pre_2.sort()
            print('dev_pre_top2=%.4f %.4f min2=%.4f %.4f' %
                  (auc_pre_2.tolist()[-1], auc_pre_2.tolist()[-2], auc_pre_2.tolist()[0], auc_pre_2.tolist()[1]))
            return auc_v, gni


        best_auc = 0.0
        split_vector_data = SplitClass()
        sess = tf.Session()
        sess.run(tf.global_variables_initializer())
        for epoch in range(graph_hyper_params['epoch']):  # train for only 1 epoch
            e_b_s = len(train_data) / graph_hyper_params['batch_size']
            one_epoch_loss, one_epoch_batchnum = 0.0, 0.0
            for index in tqdm(range(e_b_s)):
                # print 0
                start = index * graph_hyper_params['batch_size']
                end = (index + 1) * graph_hyper_params['batch_size'] if (index + 1) * graph_hyper_params['batch_size'] < len(train_data) else len(train_data)
                b_data = train_data[start:end]

                # print 1
                # d1 = datetime.now()
                fed_dict = get_fed_dict(b_data, split_vector_data, feature_conf_dict)
                # d2 = datetime.now()
                # print 2
                _, loss_val, pre_tr_val = sess.run([train_step, model_loss, network_params[0]], feed_dict=fed_dict)
                # print 3
                # d3 = datetime.now()
                # print d2-d1, d3-d2
                one_epoch_loss += loss_val
                one_epoch_batchnum += 1.

                if graph_hyper_params['debug']:
                    print datetime.now(), index, loss_val
                pre_tr_val = np.array(pre_tr_val).reshape((-1))
                if graph_hyper_params['debug'] or True in np.isnan(pre_tr_val):
                    print pre_tr_val

                if index != 0 and index % ((e_b_s - 1) / graph_hyper_params['show_peroid']) == 0:
                    split_vector_data.clean()
                    auc, gn = eval_on_dev(split_vector_data)
                    best_auc = max(auc, best_auc)
                    format_str = '%s epoch=%.2f avg_loss=%.4f auc=%.4f best_auc=%.4f gn=%.4f'
                    print (format_str % (datetime.now().strftime("%Y-%m-%d %H:%M:%S"), (epoch + 1.0 * (index+1) / e_b_s), one_epoch_loss / one_epoch_batchnum, auc, best_auc, gn))
                    one_epoch_loss = one_epoch_batchnum = 0.0


        # pass
        # predict_data = predict_data1
        # if graph_hyper_params['formal']:
        #     graph_hyper_params['batch_size'] = 1024
        # e_b_s = len(predict_data) / graph_hyper_params['batch_size'] if len(predict_data) % graph_hyper_params[
        #     'batch_size'] == 0 else len(predict_data) / graph_hyper_params['batch_size'] + 1
        # # del split_vector_data
        # # gc.collect()
        # # split_vector_data = SplitClass()
        # split_vector_data.clean()
        # pred = []
        # for index in tqdm(range(e_b_s)):
        #     start = index * graph_hyper_params['batch_size']
        #     end = (index + 1) * graph_hyper_params['batch_size'] if (index + 1) * graph_hyper_params['batch_size'] < len(predict_data) else len(predict_data) + 1
        #     b_predict_data = predict_data[start:end]
        #     # print len(b_predict_data), start, end
        #     # fed_dict = get_fed_dict(b_dev_data, split_vector_data, feature_conf_dict)
        #     fed_dict = get_fed_dict(b_predict_data, split_vector_data, feature_conf_dict, predict=True)
        #     fed_dict[train_p] = False
        #     fed_dict[dropout_p] = np.array([1.0])
        #     pred_value = sess.run([pred_val], feed_dict=fed_dict)
        #     # print pred_value
        #     pre_real_val = np.array(pred_value).reshape((-1))
        #     pred = pred + pre_real_val.tolist()
        #
        # predict_data['pred_label'] = pred
        # csv_data = predict_data[['ori_aid', 'ori_uid', 'pred_label']]
        # csv_data.columns = ['aid', 'uid', 'score']
        # csv_path = os.path.join(checkpoint_dir, 'n' + str(graph_hyper_params['neg_start']) + '_submission.csv')
        # csv_data.to_csv(csv_path, index=False)
        # print 'submission_path:', csv_path
    pass
Esempio n. 13
0
    def _init_graph(self):
        self.graph = tf.Graph()
        with self.graph.as_default():

            # todo tf.set_random_seed(): once the graph-level seed is set, ops created later do not need their own seeds, and the same random numbers can be reproduced across sessions.
            tf.set_random_seed(self.random_seed)

            self.feat_index = tf.placeholder(tf.int32,
                                             shape=[None, None],
                                             name="feat_index")  # None * F
            self.feat_value = tf.placeholder(tf.float32,
                                             shape=[None, None],
                                             name="feat_value")  # None * F
            self.label = tf.placeholder(tf.float32,
                                        shape=[None, 1],
                                        name="label")  # None * 1
            self.dropout_keep_fm = tf.placeholder(tf.float32,
                                                  shape=[None],
                                                  name="dropout_keep_fm")
            self.dropout_keep_deep = tf.placeholder(tf.float32,
                                                    shape=[None],
                                                    name="dropout_keep_deep")
            self.train_phase = tf.placeholder(tf.bool, name="train_phase")

            self.weights = self._initialize_weights()

            # model
            # todo The step below turns the raw features into per-feature embeddings via the embedding matrix.
            #  How the embedding is formed: both the FM part and the Deep part share a weight matrix of size (M, K).
            #  A "field" is one feature of the matrix before one-hot encoding; after one-hot encoding each field has
            #  exactly one non-zero entry (equal to 1), and the row of the (M, K) weight matrix corresponding to that
            #  entry is that field's embedding. This applies to categorical features.
            self.embeddings = tf.nn.embedding_lookup(
                self.weights["feature_embeddings"],
                self.feat_index)  # None * F * K
            feat_value = tf.reshape(self.feat_value,
                                    shape=[-1, self.field_size, 1])
            # todo self.embeddings=Tensor("embedding_lookup/Identity:0", shape=(?, ?, 8), dtype=float32)
            # print(f"self.embeddings={self.embeddings}")
            # todo multiply is element-wise multiplication, not matrix multiplication; note the difference from tf.matmul.
            #  Multiplying the embeddings by feat_value matters for numeric features; for categorical features the value is 1, so the multiply changes nothing.
            self.embeddings = tf.multiply(self.embeddings, feat_value)
            # todo self.embeddings=Tensor("Mul:0", shape=(?, 39, 8), dtype=float32),
            #  feat_value=Tensor("Reshape:0", shape=(?, 39, 1), dtype=float32)
            # print(f"self.embeddings={self.embeddings},feat_value={feat_value}")

            # ---------- first order term ----------
            self.y_first_order = tf.nn.embedding_lookup(
                self.weights["feature_bias"], self.feat_index)  # None * F * 1
            # todo why is a sparse matrix fed in here?
            self.y_first_order = tf.reduce_sum(
                tf.multiply(self.y_first_order, feat_value), 2)  # None * F
            self.y_first_order = tf.nn.dropout(
                self.y_first_order, self.dropout_keep_fm[0])  # None * F

            # ---------- second order term ---------------
            # todo The FM input really is the sparse matrix, but doesn't the DeepFM figure show a dense input to FM?
            #  FM takes the sparse input while the deep part takes the dense one (the dense input to deep comes from the FM-style factorization, right?)
            # sum_square part
            self.summed_features_emb = tf.reduce_sum(self.embeddings,
                                                     1)  # None * K
            self.summed_features_emb_square = tf.square(
                self.summed_features_emb)  # None * K

            # square_sum part
            self.squared_features_emb = tf.square(self.embeddings)
            self.squared_sum_features_emb = tf.reduce_sum(
                self.squared_features_emb, 1)  # None * K

            # second order
            self.y_second_order = 0.5 * tf.subtract(
                self.summed_features_emb_square,
                self.squared_sum_features_emb)  # None * K
            self.y_second_order = tf.nn.dropout(
                self.y_second_order, self.dropout_keep_fm[1])  # None * K

            # ---------- Deep component ----------
            # todo are the embeddings updated during backprop? -- they should be: they are certainly updated when only the deep part is used, so there is no reason they would not be updated when FM and deep are used together.
            self.y_deep = tf.reshape(
                self.embeddings,
                shape=[-1,
                       self.field_size * self.embedding_size])  # None * (F*K)
            self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[0])
            for i in range(0, len(self.deep_layers)):
                self.y_deep = tf.add(
                    tf.matmul(self.y_deep, self.weights["layer_%d" % i]),
                    self.weights["bias_%d" % i])  # None * layer[i] * 1
                if self.batch_norm:
                    self.y_deep = self.batch_norm_layer(
                        self.y_deep,
                        train_phase=self.train_phase,
                        scope_bn="bn_%d" % i)  # None * layer[i] * 1
                self.y_deep = self.deep_layers_activation(self.y_deep)
                self.y_deep = tf.nn.dropout(
                    self.y_deep,
                    self.dropout_keep_deep[1 +
                                           i])  # dropout at each Deep layer

            # ---------- DeepFM ----------
            if self.use_fm and self.use_deep:
                concat_input = tf.concat(
                    [self.y_first_order, self.y_second_order, self.y_deep],
                    axis=1)
            elif self.use_fm:
                concat_input = tf.concat(
                    [self.y_first_order, self.y_second_order], axis=1)
            elif self.use_deep:
                concat_input = self.y_deep
            self.out = tf.add(
                tf.matmul(concat_input, self.weights["concat_projection"]),
                self.weights["concat_bias"])

            # loss
            if self.loss_type == "logloss":
                self.out = tf.nn.sigmoid(self.out)
                self.loss = tf.losses.log_loss(self.label, self.out)
            elif self.loss_type == "mse":
                self.loss = tf.nn.l2_loss(tf.subtract(self.label, self.out))
            # l2 regularization on weights
            if self.l2_reg > 0:
                # todo only the fully-connected projection after FM is regularized here; the FM weights themselves are not.
                #  tf.contrib.layers.l2_regularizer(scale=0.1) creates a regularization function (L2 here); scale is the regularization coefficient.
                self.loss += tf.contrib.layers.l2_regularizer(self.l2_reg)(
                    self.weights["concat_projection"])
                if self.use_deep:
                    for i in range(len(self.deep_layers)):
                        self.loss += tf.contrib.layers.l2_regularizer(
                            self.l2_reg)(self.weights["layer_%d" % i])

            # optimizer
            if self.optimizer_type == "adam":
                self.optimizer = tf.train.AdamOptimizer(
                    learning_rate=self.learning_rate,
                    beta1=0.9,
                    beta2=0.999,
                    epsilon=1e-8).minimize(self.loss)
            elif self.optimizer_type == "adagrad":
                self.optimizer = tf.train.AdagradOptimizer(
                    learning_rate=self.learning_rate,
                    initial_accumulator_value=1e-8).minimize(self.loss)
            elif self.optimizer_type == "gd":
                self.optimizer = tf.train.GradientDescentOptimizer(
                    learning_rate=self.learning_rate).minimize(self.loss)
            elif self.optimizer_type == "momentum":
                self.optimizer = tf.train.MomentumOptimizer(
                    learning_rate=self.learning_rate,
                    momentum=0.95).minimize(self.loss)
            elif self.optimizer_type == "yellowfin":
                pass
                self.optimizer = YFOptimizer(learning_rate=self.learning_rate,
                                             momentum=0.0).minimize(self.loss)

            # init
            # the saver built here does not seem to be used
            self.saver = tf.train.Saver()
            init = tf.global_variables_initializer()
            self.sess = self._init_session()
            self.sess.run(init)

            # number of params
            # todo not really helpful for training the model
            total_parameters = 0
            for variable in self.weights.values():
                shape = variable.get_shape()
                variable_parameters = 1
                for dim in shape:
                    variable_parameters *= dim.value
                total_parameters += variable_parameters
            if self.verbose > 0:
                print("#params: %d" % total_parameters)
Esempio n. 14
0
                            verbose=False)
if torch.cuda.is_available():
    newmodel.cuda()
print('newmodel', newmodel)

#######################################
# INSTANTIATE LOSS AND OPTIMIZER CLASS#
#######################################
criterion = nn.CrossEntropyLoss()
params = [p for p in newmodel.parameters() if p.requires_grad]
wnd_size = 40
learning_rate = .5  # TODO: here learning rate is fixed, so need to find out some methods, maybe not fixed?
# optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
optimizer = YFOptimizer(params,
                        lr=learning_rate,
                        mu=0.0,
                        weight_decay=5e-4,
                        clip_thresh=2.0,
                        curv_win_width=wnd_size)
optimizer._sparsity_debias = True

#########################
# TRAINING WITH NEWMODEL#
#########################
iter = 0
for epoch in range(num_epoches):
    for i, (images, labels) in enumerate(train_loader):
        if torch.cuda.is_available():
            images = Variable(images.cuda())
            labels = Variable(labels.cuda())
        else:
            images = Variable(images)
Esempio n. 15
0
    def __init__(self, args, training=True, opt_method="Adam"):
        self.args = args
        if not training:
            args.batch_size = 1
            args.seq_length = 1

        if args.model == 'rnn':
            cell_fn = rnn.BasicRNNCell
        elif args.model == 'gru':
            cell_fn = rnn.GRUCell
        elif args.model == 'lstm':
            cell_fn = rnn.BasicLSTMCell
        elif args.model == 'nas':
            cell_fn = rnn.NASCell
        else:
            raise Exception("model type not supported: {}".format(args.model))

        cells = []
        for _ in range(args.num_layers):
            cell = cell_fn(args.rnn_size)
            if training and (args.output_keep_prob < 1.0 or args.input_keep_prob < 1.0):
                cell = rnn.DropoutWrapper(cell,
                                          input_keep_prob=args.input_keep_prob,
                                          output_keep_prob=args.output_keep_prob)
            cells.append(cell)

        self.cell = cell = rnn.MultiRNNCell(cells, state_is_tuple=True)

        self.input_data = tf.placeholder(
            tf.int32, [args.batch_size, args.seq_length])
        self.targets = tf.placeholder(
            tf.int32, [args.batch_size, args.seq_length])
        self.initial_state = cell.zero_state(args.batch_size, tf.float32)

        with tf.variable_scope('rnnlm'):
            softmax_w = tf.get_variable("softmax_w",
                                        [args.rnn_size, args.vocab_size])
            softmax_b = tf.get_variable("softmax_b", [args.vocab_size])

        embedding = tf.get_variable("embedding", [args.vocab_size, args.rnn_size])
        inputs = tf.nn.embedding_lookup(embedding, self.input_data)

        # dropout beta testing: double check which one should affect next line
        if training and args.output_keep_prob:
            inputs = tf.nn.dropout(inputs, args.output_keep_prob)

        inputs = tf.split(inputs, args.seq_length, 1)
        inputs = [tf.squeeze(input_, [1]) for input_ in inputs]

        def loop(prev, _):
            prev = tf.matmul(prev, softmax_w) + softmax_b
            prev_symbol = tf.stop_gradient(tf.argmax(prev, 1))
            return tf.nn.embedding_lookup(embedding, prev_symbol)
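        # loop() is only used at sampling time (loop_function is None during training):
        # it projects the previous decoder output to vocabulary logits, takes the argmax
        # symbol, and feeds that symbol's embedding back in as the next input.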

        outputs, last_state = legacy_seq2seq.rnn_decoder(inputs, self.initial_state, cell, loop_function=loop if not training else None, scope='rnnlm')
        output = tf.reshape(tf.concat(outputs, 1), [-1, args.rnn_size])


        self.logits = tf.matmul(output, softmax_w) + softmax_b
        self.probs = tf.nn.softmax(self.logits)
        loss = legacy_seq2seq.sequence_loss_by_example(
                [self.logits],
                [tf.reshape(self.targets, [-1])],
                [tf.ones([args.batch_size * args.seq_length])])
        with tf.name_scope('cost'):
            self.cost = tf.reduce_sum(loss) / args.batch_size / args.seq_length
            # for eval visualization in tensorboard
            self.eval_cost = tf.identity(self.cost)

        self.final_state = last_state
        self.lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(self.cost, tvars),
                args.grad_clip)
        with tf.name_scope('optimizer'):
            if opt_method == "Adam":
                print "using Adam"
                self.optimizer = optimizer = tf.train.AdamOptimizer(self.lr)
            elif opt_method == "YF":
                print "using YF"
                self.optimizer = optimizer = YFOptimizer(learning_rate=args.learning_rate, momentum=0.0)
            elif opt_method == "SGD":
		print "using SGD"
                self.optimizer = optimizer = tf.train.MomentumOptimizer(self.lr, 0.9)
            else:
                raise Exception("please use either adam or YF")

        self.train_op = optimizer.apply_gradients(zip(grads, tvars))

        # instrument tensorboard
        self.train_summary = [ \
            tf.summary.histogram('logits', self.logits),
            tf.summary.histogram('loss', loss),
            tf.summary.scalar('train_loss', self.cost) ]
Esempio n. 16
0
model.add(Activation('relu'))
model.add(Conv2D(64, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.25))

model.add(Flatten())
model.add(Dense(512))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

# Modified to use yellowfin or adam optimizer
if yellowfinopt:
    opt = keras.optimizers.TFOptimizer(YFOptimizer())
else:
    opt = keras.optimizers.Adam()
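# keras.optimizers.TFOptimizer is Keras' wrapper around a native TensorFlow-style
# optimizer object, so model.compile()/fit() can drive YFOptimizer through the usual
# compute_gradients/apply_gradients interface it shares with tf.train optimizers.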

# Compile the model
model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['accuracy'])

x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train /= 255
x_test /= 255

# subsamples for quick test
#x_train2 = x_train[0:2000, :,:,:]
Esempio n. 17
0
# mix_softmax = MixtureSoftmax(batch_size=batch_size, word_gru_hidden = 50, feature_dim = 0, n_classes=num_classes)

if use_cuda:
    word_attn.cuda()
    mix_softmax.cuda()

softmax = nn.Softmax()
sigmoid = nn.Sigmoid()

learning_rate = 0.0001
print("lr thresh", args.lr_thresh)
optimizer = YFOptimizer(mix_softmax.parameters(),
                        beta=0.999,
                        lr=learning_rate,
                        mu=0.0,
                        zero_debias=False,
                        clip_thresh=None,
                        auto_clip_fac=None,
                        curv_win_width=20,
                        force_non_inc_step=False,
                        use_disk_checkpoint=True)
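# beta is the smoothing constant of YellowFin's running (moving-average) statistics and
# curv_win_width is the sliding window used for the curvature-range estimate; the
# measurement tests in the later examples assume 0.999 and a window of 20 for these.
# The remaining arguments are knobs specific to this YFOptimizer port.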

# word_optmizer = YFOptimizer(word_attn.parameters(), lr=learning_rate, mu=0.0, auto_clip_fac=2.0)
# mix_optimizer = YFOptimizer(mix_softmax.parameters(), lr=learning_rate, mu=0.0, auto_clip_fac=2.0)

criterion = nn.MultiLabelSoftMarginLoss(size_average=True)

import time
import math


def timeSince(since):
Esempio n. 18
0
def test_lr_mu():
    opt = YFOptimizer(learning_rate=0.5, momentum=0.5, zero_debias=False)
    w = tf.Variable(np.ones([
        n_dim,
    ]),
                    dtype=tf.float32,
                    name="w",
                    trainable=True)
    b = tf.Variable(np.ones([
        1,
    ], dtype=np.float32),
                    dtype=tf.float32,
                    name="b",
                    trainable=True)
    x = tf.constant(np.ones([
        n_dim,
    ], dtype=np.float32), dtype=tf.float32)
    loss = tf.multiply(w, x) + b
    tvars = tf.trainable_variables()

    w_grad_val = tf.Variable(np.zeros([
        n_dim,
    ]),
                             dtype=tf.float32,
                             trainable=False)
    b_grad_val = tf.Variable(np.zeros([
        1,
    ]),
                             dtype=tf.float32,
                             trainable=False)
    apply_op = opt.apply_gradients(zip([w_grad_val, b_grad_val], tvars))

    init_op = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init_op)
        target_h_max = 0.0
        target_h_min = 0.0
        g_norm_squared_avg = 0.0
        g_norm_avg = 0.0
        g_avg = 0.0
        target_dist = 0.0
        target_lr = 0.5
        target_mu = 0.5
        for i in range(n_iter):

            sess.run(
                tf.assign(w_grad_val,
                          (i + 1) * np.ones([
                              n_dim,
                          ], dtype=np.float32)))
            sess.run(
                tf.assign(b_grad_val,
                          (i + 1) * np.ones([
                              1,
                          ], dtype=np.float32)))

            res = sess.run([
                opt._curv_win, opt._h_max, opt._h_min, opt._grad_var,
                opt._dist_to_opt_avg, opt._lr_var, opt._mu_var, apply_op
            ])

            res[5] = opt._lr_var.eval()
            res[6] = opt._mu_var.eval()

            g_norm_squared_avg = 0.999 * g_norm_squared_avg  \
              + 0.001 * np.sum(( (i + 1)*np.ones( [n_dim + 1, ] ) )**2)
            g_norm_avg = 0.999 * g_norm_avg  \
              + 0.001 * np.linalg.norm( (i + 1)*np.ones( [n_dim + 1, ] ) )
            g_avg = 0.999 * g_avg + 0.001 * (i + 1)

            target_h_max = 0.999 * target_h_max + 0.001 * (i + 1)**2 * (n_dim +
                                                                        1)
            target_h_min = 0.999 * target_h_min + 0.001 * max(
                1, i + 2 - 20)**2 * (n_dim + 1)
            target_var = g_norm_squared_avg - g_avg**2 * (n_dim + 1)
            target_dist = 0.999 * target_dist + 0.001 * g_norm_avg / g_norm_squared_avg

            if i > 0:
                lr, mu = tune_everything(target_dist**2, target_var, 1,
                                         target_h_min, target_h_max)
                target_lr = 0.999 * target_lr + 0.001 * lr
                target_mu = 0.999 * target_mu + 0.001 * mu

            # print "iter ", i, " h max ", res[1], target_h_max, " h min ", res[2], target_h_min, \
    #                              " var ", res[3], target_var, " dist ", res[4], target_dist
    # print "iter ", i, " lr ", res[5], target_lr, " mu ", res[6], target_mu

            assert np.abs(target_h_max - res[1]) < np.abs(target_h_max) * 1e-3
            assert np.abs(target_h_min - res[2]) < np.abs(target_h_min) * 1e-3
            assert np.abs(target_var - res[3]) < np.abs(res[3]) * 1e-3
            assert np.abs(target_dist - res[4]) < np.abs(res[4]) * 1e-3
            assert target_lr == 0.0 or np.abs(target_lr -
                                              res[5]) < np.abs(res[5]) * 1e-3
            assert target_mu == 0.0 or np.abs(target_mu -
                                              res[6]) < np.abs(res[6]) * 5e-3
    print "lr and mu computing test passed!"
Esempio n. 19
0
def test_lr_mu(zero_debias=False):
    dtype = torch.FloatTensor
    w = Variable(torch.ones(n_dim, 1).type(dtype), requires_grad=True)
    b = Variable(torch.ones(1).type(dtype), requires_grad=True)
    x = Variable(torch.ones(1, n_dim).type(dtype), requires_grad=False)
    opt = YFOptimizer([w, b], lr=1.0, mu=0.0, zero_debias=zero_debias)

    target_h_max = 0.0
    target_h_min = 0.0
    g_norm_squared_avg = 0.0
    g_norm_avg = 0.0
    g_avg = 0.0
    target_dist = 0.0
    target_lr = 1.0
    target_mu = 0.0
    for i in range(n_iter):
        opt.zero_grad()

        loss = (x.mm(w) + b).sum()
        loss.backward()
        w.grad.data = (i + 1) * torch.ones([
            n_dim,
        ]).type(dtype)
        b.grad.data = (i + 1) * torch.ones([
            1,
        ]).type(dtype)

        opt.step()
        res = [
            opt._h_max, opt._h_min, opt._grad_var, opt._dist_to_opt, opt._lr,
            opt._mu
        ]

        g_norm_squared_avg = 0.999 * g_norm_squared_avg  \
          + 0.001 * np.sum(( (i + 1)*np.ones( [n_dim + 1, ] ) )**2)
        g_norm_avg = 0.999 * g_norm_avg  \
          + 0.001 * np.linalg.norm( (i + 1)*np.ones( [n_dim + 1, ] ) )
        g_avg = 0.999 * g_avg + 0.001 * (i + 1)

        target_h_max = 0.999 * target_h_max + 0.001 * (i + 1)**2 * (n_dim + 1)
        target_h_min = 0.999 * target_h_min + 0.001 * max(
            1, i + 2 - 20)**2 * (n_dim + 1)
        if zero_debias:
            target_var = g_norm_squared_avg/(1-0.999**(i + 1) ) \
               - g_avg**2 * (n_dim + 1) / (1-0.999**(i + 1) )**2
        else:
            target_var = g_norm_squared_avg - g_avg**2 * (n_dim + 1)
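        # (1 - 0.999**t) is the standard zero-debias correction for an exponential
        # moving average initialized at zero (as in Adam); with zero_debias enabled the
        # optimizer's statistics, and hence the targets here, carry that factor.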
        target_dist = 0.999 * target_dist + 0.001 * g_norm_avg / g_norm_squared_avg

        if i == 0:
            continue
        if zero_debias:
            # print "iter ", i, " h max ", res[0], target_h_max/(1-0.999**(i + 1) ), \
            #   " h min ", res[1], target_h_min/(1-0.999**(i + 1) ), \
            #   " var ", res[2], target_var, \
            #   " dist ", res[3], target_dist/(1-0.999**(i + 1) )
            assert np.abs(target_h_max / (1 - 0.999**(i + 1)) -
                          res[0]) < np.abs(res[0]) * 1e-3
            assert np.abs(target_h_min / (1 - 0.999**(i + 1)) -
                          res[1]) < np.abs(res[1]) * 1e-3
            assert np.abs(target_var - res[2]) < np.abs(target_var) * 1e-3
            assert np.abs(target_dist / (1 - 0.999**(i + 1)) -
                          res[3]) < np.abs(res[3]) * 1e-3
        else:
            # print "iter ", i, " h max ", res[0], target_h_max, " h min ", res[1], target_h_min, \
            # " var ", res[2], target_var, " dist ", res[3], target_dist
            assert np.abs(target_h_max - res[0]) < np.abs(target_h_max) * 1e-3
            assert np.abs(target_h_min - res[1]) < np.abs(target_h_min) * 1e-3
            assert np.abs(target_var - res[2]) < np.abs(res[2]) * 1e-3
            assert np.abs(target_dist - res[3]) < np.abs(res[3]) * 1e-3

        if i > 0:
            if zero_debias:
                lr, mu = tune_everything(
                    (target_dist / (1 - 0.999**(i + 1)))**2, target_var, 1,
                    target_h_min / (1 - 0.999**(i + 1)),
                    target_h_max / (1 - 0.999**(i + 1)))
            else:
                lr, mu = tune_everything(target_dist**2, target_var, 1,
                                         target_h_min, target_h_max)
            lr = np.real(lr)
            mu = np.real(mu)
            target_lr = 0.999 * target_lr + 0.001 * lr
            target_mu = 0.999 * target_mu + 0.001 * mu
            # print "lr ", target_lr, res[4], " mu ", target_mu, res[5]
            assert target_lr == 0.0 or np.abs(target_lr -
                                              res[4]) < np.abs(res[4]) * 1e-3
            assert target_mu == 0.0 or np.abs(target_mu -
                                              res[5]) < np.abs(res[5]) * 5e-3
    print "lr and mu computing test passed!"
Esempio n. 20
0
    dist_list,\
    grad_var_list,\
    lr_g_norm_list,\
    lr_g_norm_squared_list,\
    move_lr_g_norm_list,\
    move_lr_g_norm_squared_list,\
    lr_grad_norm_clamp_act_list,\
    fast_view_act_list


if args.opt_method == "SGD":
    optimizer = torch.optim.SGD(model.parameters(), lr, momentum=0.0)
elif args.opt_method == "momSGD":
    optimizer = torch.optim.SGD(model.parameters(), lr, momentum=0.9)
elif args.opt_method == "YF":
    optimizer = YFOptimizer(model.parameters() )
elif args.opt_method == "Adam":
    optimizer = torch.optim.Adam(model.parameters(), lr)


best_val_loss = None
train_loss_list = []
val_loss_list = []
lr_list = []
mu_list = []

loss_list = []
local_curv_list = []
max_curv_list = []
min_curv_list = []
lr_g_norm_list = []
Esempio n. 21
0
def test_measurement(zero_debias=True):
    dtype = torch.FloatTensor
    w = Variable(torch.ones(n_dim, 1).type(dtype), requires_grad=True)
    b = Variable(torch.ones(1).type(dtype), requires_grad=True)
    x = Variable(torch.ones(1, n_dim).type(dtype), requires_grad=False)
    opt = YFOptimizer([w, b], lr=1.0, mu=0.0, zero_debias=zero_debias)

    target_h_max = 0.0
    target_h_min = 0.0
    g_norm_squared_avg = 0.0
    g_norm_avg = 0.0
    g_avg = 0.0
    target_dist = 0.0
    for i in range(n_iter):
        opt.zero_grad()

        loss = (x.mm(w) + b).sum()
        loss.backward()
        w.grad.data = (i + 1) * torch.ones([
            n_dim,
        ]).type(dtype)
        b.grad.data = (i + 1) * torch.ones([
            1,
        ]).type(dtype)

        opt.step()

        res = [opt._h_max, opt._h_min, opt._grad_var, opt._dist_to_opt]

        g_norm_squared_avg = 0.999 * g_norm_squared_avg  \
          + 0.001 * np.sum(( (i + 1)*np.ones( [n_dim + 1, ] ) )**2)
        g_norm_avg = 0.999 * g_norm_avg  \
          + 0.001 * np.linalg.norm( (i + 1)*np.ones( [n_dim + 1, ] ) )
        g_avg = 0.999 * g_avg + 0.001 * (i + 1)

        target_h_max = 0.999 * target_h_max + 0.001 * (i + 1)**2 * (n_dim + 1)
        target_h_min = 0.999 * target_h_min + 0.001 * max(
            1, i + 2 - 20)**2 * (n_dim + 1)
        if zero_debias:
            target_var = g_norm_squared_avg/(1-0.999**(i + 1) ) \
               - g_avg**2 * (n_dim + 1) / (1-0.999**(i + 1) )**2
        else:
            target_var = g_norm_squared_avg - g_avg**2 * (n_dim + 1)
        target_dist = 0.999 * target_dist + 0.001 * g_norm_avg / g_norm_squared_avg

        if i == 0:
            continue
        if zero_debias:
            # print "iter ", i, " h max ", res[0], target_h_max/(1-0.999**(i + 1) ), \
            #   " h min ", res[1], target_h_min/(1-0.999**(i + 1) ), \
            #   " var ", res[2], target_var, \
            #   " dist ", res[3], target_dist/(1-0.999**(i + 1) )
            assert np.abs(target_h_max / (1 - 0.999**(i + 1)) -
                          res[0]) < np.abs(res[0]) * 1e-3
            assert np.abs(target_h_min / (1 - 0.999**(i + 1)) -
                          res[1]) < np.abs(res[1]) * 1e-3
            assert np.abs(target_var - res[2]) < np.abs(target_var) * 1e-3
            assert np.abs(target_dist / (1 - 0.999**(i + 1)) -
                          res[3]) < np.abs(res[3]) * 1e-3
        else:
            # print "iter ", i, " h max ", res[0], target_h_max, " h min ", res[1], target_h_min, \
            # " var ", res[2], target_var, " dist ", res[3], target_dist
            assert np.abs(target_h_max - res[0]) < np.abs(target_h_max) * 1e-3
            assert np.abs(target_h_min - res[1]) < np.abs(target_h_min) * 1e-3
            assert np.abs(target_var - res[2]) < np.abs(res[2]) * 1e-3
            assert np.abs(target_dist - res[3]) < np.abs(res[3]) * 1e-3
    print "sync measurement test passed!"
Esempio n. 22
0
    def _init_graph(self):
        self.graph = tf.Graph()
        with self.graph.as_default():

            tf.set_random_seed(self.random_seed)

            self.feat_index = tf.placeholder(tf.int32, shape=[None, None],
                                                 name="feat_index")  # None * F    batch_size * field_size
            self.feat_value = tf.placeholder(tf.float32, shape=[None, None],
                                                 name="feat_value")  # None * F    batch_size * field_size
            self.label = tf.placeholder(tf.float32, shape=[None, 1], name="label")  # None * 1
            self.dropout_keep_fm = tf.placeholder(tf.float32, shape=[None], name="dropout_keep_fm")
            self.dropout_keep_deep = tf.placeholder(tf.float32, shape=[None], name="dropout_keep_deep")
            self.train_phase = tf.placeholder(tf.bool, name="train_phase")

            # todo initialize the weights
            self.weights = self._initialize_weights()

            # model
            self.embeddings = tf.nn.embedding_lookup(self.weights["feature_embeddings"], #[self.feature_size, self.embedding_size]
                                                             self.feat_index)  # None * F * K; since one-hot is 0/1 encoded, the lookup result has shape batch_size*field_size*embedding_size
            feat_value = tf.reshape(self.feat_value, shape=[-1, self.field_size, 1])
            self.embeddings = tf.multiply(self.embeddings, feat_value)  # todo element-wise product: each feature value multiplied by its own latent vector

            # ---------- todo first order term, i.e. w*x ----------
            # result shape: batch_size*field_size*1
            self.y_first_order = tf.nn.embedding_lookup(self.weights["feature_bias"], self.feat_index) # None * F * 1  [self.feature_size, 1]
            # todo feat_value has shape batch_size*field_size*1; the result has shape batch_size*field_size
            self.y_first_order = tf.reduce_sum(tf.multiply(self.y_first_order, feat_value), 2)  # None * F
            self.y_first_order = tf.nn.dropout(self.y_first_order, self.dropout_keep_fm[0]) # None * F

            # ---------- second order term ---------------
            # sum_square part   (batch_size*field_size*embedding_size)
            self.summed_features_emb = tf.reduce_sum(self.embeddings, 1)  # None * K
            self.summed_features_emb_square = tf.square(self.summed_features_emb)  # None * K

            # square_sum part
            self.squared_features_emb = tf.square(self.embeddings)
            self.squared_sum_features_emb = tf.reduce_sum(self.squared_features_emb, 1)  # None * K

            # second order
            self.y_second_order = 0.5 * tf.subtract(self.summed_features_emb_square, self.squared_sum_features_emb)  # None * K
            self.y_second_order = tf.nn.dropout(self.y_second_order, self.dropout_keep_fm[1])  # None * K

            # ---------- todo Deep component ----------
            # embeddings has shape batch_size * field_size * embedding_size
            self.y_deep = tf.reshape(self.embeddings, shape=[-1, self.field_size * self.embedding_size]) # None * (F*K)
            self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[0])
            for i in range(0, len(self.deep_layers)):
                self.y_deep = tf.add(tf.matmul(self.y_deep, self.weights["layer_%d" %i]), self.weights["bias_%d"%i]) # None * layer[i] * 1
                if self.batch_norm:
                    self.y_deep = self.batch_norm_layer(self.y_deep, train_phase=self.train_phase, scope_bn="bn_%d" %i) # None * layer[i] * 1
                self.y_deep = self.deep_layers_activation(self.y_deep)
                self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[1+i]) # dropout at each Deep layer

            # ---------- DeepFM ----------
            if self.use_fm and self.use_deep:
                concat_input = tf.concat([self.y_first_order, self.y_second_order, self.y_deep], axis=1)
            elif self.use_fm:
                concat_input = tf.concat([self.y_first_order, self.y_second_order], axis=1)
            elif self.use_deep:
                concat_input = self.y_deep
            # project the concatenated result to the output
            self.out = tf.add(tf.matmul(concat_input, self.weights["concat_projection"]), self.weights["concat_bias"])

            # loss
            if self.loss_type == "logloss":
                self.out = tf.nn.sigmoid(self.out)
                self.loss = tf.losses.log_loss(self.label, self.out)
            elif self.loss_type == "mse":
                self.loss = tf.nn.l2_loss(tf.subtract(self.label, self.out))
            # l2 regularization on weights
            if self.l2_reg > 0:
                self.loss += tf.contrib.layers.l2_regularizer(
                    self.l2_reg)(self.weights["concat_projection"])
                if self.use_deep:
                    for i in range(len(self.deep_layers)):
                        self.loss += tf.contrib.layers.l2_regularizer(
                            self.l2_reg)(self.weights["layer_%d"%i])

            # optimizer
            if self.optimizer_type == "adam":
                self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999,
                                                        epsilon=1e-8).minimize(self.loss)
            elif self.optimizer_type == "adagrad":
                self.optimizer = tf.train.AdagradOptimizer(learning_rate=self.learning_rate,
                                                           initial_accumulator_value=1e-8).minimize(self.loss)
            elif self.optimizer_type == "gd":
                self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
            elif self.optimizer_type == "momentum":
                self.optimizer = tf.train.MomentumOptimizer(learning_rate=self.learning_rate, momentum=0.95).minimize(
                    self.loss)
            elif self.optimizer_type == "yellowfin":
                self.optimizer = YFOptimizer(learning_rate=self.learning_rate, momentum=0.0).minimize(
                    self.loss)

            # init
            self.saver = tf.train.Saver()
            init = tf.global_variables_initializer()
            self.sess = self._init_session()
            self.sess.run(init)

            # number of params
            total_parameters = 0
            for variable in self.weights.values():
                shape = variable.get_shape()
                variable_parameters = 1
                for dim in shape:
                    variable_parameters *= dim.value
                total_parameters += variable_parameters
            if self.verbose > 0:
                print("#params: %d" % total_parameters)
Esempio n. 23
0
curParams = [p for p in net.parameters() if p.requires_grad]
if args.optimizer.lower() == 'sgd':
    optimizer = optim.SGD(curParams,
                          lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)
elif args.optimizer.lower() == 'adam':
    #args.lr=1e-3
    optimizer = optim.Adam(params=curParams)
elif args.optimizer.lower() == 'rmsprop':
    #args.lr=1e-2
    optimizer = optim.RMSprop(params=curParams)
elif args.optimizer.lower() in ['yellowfin', 'yf']:
    optimizer = YFOptimizer(curParams,
                            lr=args.lr,
                            mu=0.0,
                            weight_decay=args.weight_decay,
                            clip_thresh=2.0,
                            curv_win_width=20)
else:
    raise Exception('Unsupported optimizer type encountered: ' + args.optimizer)
criterion = MultiBoxLoss(num_classes, 0.5, True, 0, True, 3, 0.5, False,
                         args.cuda)


def train():
    #import cProfile, pstats
    #from io import StringIO
    #pr = cProfile.Profile()
    #pr.enable()

    net.train()
Esempio n. 24
0
    def _init_graph(self):
        self.graph = tf.Graph()
        with self.graph.as_default():

            tf.set_random_seed(self.random_seed)

            with tf.name_scope('input'):
                self.feat_index = tf.placeholder(tf.int32,
                                                 shape=[None, None],
                                                 name="feat_index")  # None * F
                self.feat_value = tf.placeholder(tf.float32,
                                                 shape=[None, None],
                                                 name="feat_value")  # None * F
                self.label = tf.placeholder(tf.float32,
                                            shape=[None, 1],
                                            name="label")  # None * 1
                self.dropout_keep_fm = tf.placeholder(tf.float32,
                                                      shape=[None],
                                                      name="dropout_keep_fm")
                self.dropout_keep_deep = tf.placeholder(
                    tf.float32, shape=[None], name="dropout_keep_deep")
                self.train_phase = tf.placeholder(tf.bool, name="train_phase")

            self.weights = self._initialize_weights()

            # model
            self.embeddings = tf.nn.embedding_lookup(
                self.weights["feature_embeddings"],
                self.feat_index)  # None * F * K
            feat_value = tf.reshape(self.feat_value,
                                    shape=[-1, self.field_size, 1],
                                    name="reshape_feat_value")
            self.embeddings = tf.multiply(self.embeddings, feat_value)

            with tf.name_scope("FM-model"):
                # ---------- first order term ----------
                self.y_first_order = tf.nn.embedding_lookup(
                    self.weights["feature_bias"],
                    self.feat_index)  # None * F * 1
                self.y_first_order = tf.reduce_sum(
                    tf.multiply(self.y_first_order, feat_value), 2)  # None * F
                self.y_first_order = tf.nn.dropout(
                    self.y_first_order, self.dropout_keep_fm[0])  # None * F

                # ---------- second order term ---------------
                # sum_square part
                self.summed_features_emb = tf.reduce_sum(self.embeddings,
                                                         1)  # None * K
                self.summed_features_emb_square = tf.square(
                    self.summed_features_emb)  # None * K

                # square_sum part
                self.squared_features_emb = tf.square(self.embeddings)
                self.squared_sum_features_emb = tf.reduce_sum(
                    self.squared_features_emb, 1)  # None * K

                # second order
                self.y_second_order = 0.5 * tf.subtract(
                    self.summed_features_emb_square,
                    self.squared_sum_features_emb)  # None * K
                self.y_second_order = tf.nn.dropout(
                    self.y_second_order, self.dropout_keep_fm[1])  # None * K

            with tf.name_scope("Deep-model"):
                # ---------- Deep component ----------
                self.y_deep = tf.reshape(
                    self.embeddings,
                    shape=[-1, self.field_size * self.embedding_size
                           ])  # None * (F*K)
                self.y_deep = tf.nn.dropout(self.y_deep,
                                            self.dropout_keep_deep[0])
                for i in range(0, len(self.deep_layers)):
                    self.y_deep = tf.add(
                        tf.matmul(self.y_deep, self.weights["layer_%d" % i]),
                        self.weights["bias_%d" % i])  # None * layer[i] * 1
                    if self.batch_norm:
                        self.y_deep = self.batch_norm_layer(
                            self.y_deep,
                            train_phase=self.train_phase,
                            scope_bn="bn_%d" % i)  # None * layer[i] * 1
                    self.y_deep = self.deep_layers_activation(self.y_deep)
                    self.y_deep = tf.nn.dropout(
                        self.y_deep, self.dropout_keep_deep[
                            1 + i])  # dropout at each Deep layer

            with tf.name_scope("DeepFM-model"):
                # ---------- DeepFM ----------
                if self.use_fm and self.use_deep:
                    concat_input = tf.concat(
                        [self.y_first_order, self.y_second_order, self.y_deep],
                        axis=1)
                elif self.use_fm:
                    concat_input = tf.concat(
                        [self.y_first_order, self.y_second_order], axis=1)
                elif self.use_deep:
                    concat_input = self.y_deep
                self.out = tf.add(
                    tf.matmul(concat_input, self.weights["concat_projection"]),
                    self.weights["concat_bias"])

            # loss
            with tf.name_scope("loss"):
                if self.loss_type == "logloss":
                    self.out = tf.nn.sigmoid(self.out)
                    self.loss = tf.losses.log_loss(self.label, self.out)
                elif self.loss_type == "mse":
                    self.loss = tf.nn.l2_loss(tf.subtract(
                        self.label, self.out))
                # l2 regularization on weights
                if self.l2_reg > 0:
                    self.loss += tf.contrib.layers.l2_regularizer(self.l2_reg)(
                        self.weights["concat_projection"])
                    if self.use_deep:
                        for i in range(len(self.deep_layers)):
                            self.loss += tf.contrib.layers.l2_regularizer(
                                self.l2_reg)(self.weights["layer_%d" % i])

            # optimizer
            with tf.name_scope("train"):
                if self.optimizer_type == "adam":
                    self.optimizer = tf.train.AdamOptimizer(
                        learning_rate=self.learning_rate,
                        beta1=0.9,
                        beta2=0.999,
                        epsilon=1e-8).minimize(self.loss)
                elif self.optimizer_type == "adagrad":
                    self.optimizer = tf.train.AdagradOptimizer(
                        learning_rate=self.learning_rate,
                        initial_accumulator_value=1e-8).minimize(self.loss)
                elif self.optimizer_type == "gd":
                    self.optimizer = tf.train.GradientDescentOptimizer(
                        learning_rate=self.learning_rate).minimize(self.loss)
                elif self.optimizer_type == "momentum":
                    self.optimizer = tf.train.MomentumOptimizer(
                        learning_rate=self.learning_rate,
                        momentum=0.95).minimize(self.loss)
                elif self.optimizer_type == "yellowfin":
                    self.optimizer = YFOptimizer(
                        learning_rate=self.learning_rate,
                        momentum=0.0).minimize(self.loss)

            # init
            init = tf.global_variables_initializer()
            self.sess = self._init_session()
            self.sess.run(init)

            # save model
            self.saver = tf.train.Saver()
            # save summary
            tf.summary.scalar('log_loss', self.loss)
            self.merge_summary = tf.summary.merge_all()  # op that, when evaluated in sess.run, collects one step of summary data
            self.writer = tf.summary.FileWriter("./graphs", self.sess.graph)

            # number of params
            total_parameters = 0
            for variable in self.weights.values():
                shape = variable.get_shape()
                variable_parameters = 1
                for dim in shape:
                    variable_parameters *= dim.value
                total_parameters += variable_parameters
            if self.verbose > 0:
                print("#params: %d" % total_parameters)
Esempio n. 25
0
def learn(dataset,
          dim=2,
          hyp=1,
          edim=1,
          euc=0,
          sdim=1,
          sph=0,
          scale=1.,
          riemann=False,
          learning_rate=1e-1,
          decay_length=1000,
          decay_step=1.0,
          momentum=0.0,
          tol=1e-8,
          epochs=100,
          burn_in=0,
          use_yellowfin=False,
          use_adagrad=False,
          resample_freq=1000,
          print_freq=1,
          model_save_file=None,
          model_load_file=None,
          batch_size=16,
          num_workers=None,
          lazy_generation=False,
          log_name=None,
          log=False,
          warm_start=None,
          learn_scale=False,
          checkpoint_freq=100,
          sample=1.,
          subsample=None,
          logloss=False,
          distloss=False,
          squareloss=False,
          symloss=False,
          exponential_rescale=None,
          extra_steps=1,
          use_svrg=False,
          T=10,
          use_hmds=False,
          visualize=False):
    # Log configuration
    formatter = logging.Formatter('%(asctime)s %(message)s')
    logging.basicConfig(
        level=logging.DEBUG,
        format='%(asctime)s %(message)s',
        datefmt='%FT%T',
    )
    if log_name is None and log:
        log_name = f"{os.path.splitext(dataset)[0]}.H{dim}-{hyp}.E{edim}-{euc}.S{sdim}-{sph}.lr{learning_rate}.log"
    if log_name is not None:
        logging.info(f"Logging to {log_name}")
        log = logging.getLogger()
        fh = logging.FileHandler(log_name)
        fh.setFormatter(formatter)
        log.addHandler(fh)

    logging.info(f"Commandline {sys.argv}")
    if model_save_file is None: logging.warning("No Model Save selected!")
    G = load_graph.load_graph(dataset)
    GM = nx.to_scipy_sparse_matrix(G, nodelist=list(range(G.order())))

    # grab scale if warm starting:
    if warm_start:
        scale = pandas.read_csv(warm_start, index_col=0).as_matrix()[0, -1]

    n = G.order()
    logging.info(f"Loaded Graph {dataset} with {n} nodes scale={scale}")

    if exponential_rescale is not None:
        # torch.exp(exponential_rescale * -d)
        def weight_fn(d):
            if d <= 2.0: return 5.0
            elif d > 4.0: return 0.01
            else: return 1.0
    else:

        def weight_fn(d):
            return 1.0

    Z, z = build_dataset(G, lazy_generation, sample, subsample, scale,
                         batch_size, weight_fn, num_workers)

    if model_load_file is not None:
        logging.info(f"Loading {model_load_file}...")
        m = torch.load(model_load_file).to(device)
        logging.info(
            f"Loaded scale {unwrap(m.scale())} {torch.sum(m.embedding().data)} {m.epoch}"
        )
    else:
        logging.info(f"Creating a fresh model warm_start?={warm_start}")

        m_init = None
        if warm_start:
            # load from DataFrame; assume that the julia combinatorial embedding has been saved
            raise NotImplementedError("Removed from this branch")
        elif use_hmds:
            # m_init = torch.DoubleTensor(mds_warmstart.get_normalized_hyperbolic(mds_warmstart.get_model(dataset,dim,scale)[1]))
            raise NotImplementedError("Removed from this branch")

        logging.info(
            f"\t Warmstarting? {warm_start} {m_init.size() if warm_start else None} {G.order()}"
        )
        initial_scale = z.dataset.max_dist / 3.0
        print("MAX DISTANCE", z.dataset.max_dist)
        print("AVG DISTANCE", torch.mean(z.dataset.val_cache))
        initial_scale = 0.0
        m = ProductEmbedding(G.order(),
                             dim,
                             hyp,
                             edim,
                             euc,
                             sdim,
                             sph,
                             initialize=m_init,
                             learn_scale=learn_scale,
                             initial_scale=initial_scale,
                             logrel_loss=logloss,
                             dist_loss=distloss,
                             square_loss=squareloss,
                             sym_loss=symloss,
                             exponential_rescale=exponential_rescale,
                             riemann=riemann).to(device)
        m.normalize()
        m.epoch = 0
    logging.info(
        f"Constructed model with dim={dim} and epochs={m.epoch} isnan={np.any(np.isnan(m.embedding().cpu().data.numpy()))}"
    )

    if visualize:
        name = 'animations/' + f"{os.path.split(os.path.splitext(dataset)[0])[1]}.H{dim}-{hyp}.E{edim}-{euc}.S{sdim}-{sph}.lr{learning_rate}.ep{epochs}.seed{seed}"
        fig, ax, writer = vis.setup_plot(m=m, name=name, draw_circle=True)
    else:
        fig = None
        ax = None
        writer = None

    #
    # Build the Optimizer
    #
    # TODO: Redo this in a sensible way!!

    # per-parameter learning rates
    exp_params = [p for p in m.embed_params if p.use_exp]
    learn_params = [p for p in m.embed_params if not p.use_exp]
    hyp_params = [p for p in m.hyp_params if not p.use_exp]
    euc_params = [p for p in m.euc_params if not p.use_exp]
    sph_params = [p for p in m.sph_params if not p.use_exp]
    scale_params = m.scale_params
    # model_params = [{'params': m.embed_params}, {'params': m.scale_params, 'lr': 1e-4*learning_rate}]
    # model_params = [{'params': learn_params}, {'params': m.scale_params, 'lr': 1e-4*learning_rate}]
    model_params = [{
        'params': hyp_params
    }, {
        'params': euc_params
    }, {
        'params': sph_params,
        'lr': 0.1 * learning_rate
    }, {
        'params': m.scale_params,
        'lr': 1e-4 * learning_rate
    }]

    # opt = None
    if len(model_params) > 0:
        opt = torch.optim.SGD(model_params,
                              lr=learning_rate / 10,
                              momentum=momentum)
        # opt = torch.optim.SGD(learn_params, lr=learning_rate/10, momentum=momentum)
    # opt = torch.optim.SGD(model_params, lr=learning_rate/10, momentum=momentum)
    # exp = None
    # if len(exp_params) > 0:
    #     exp = torch.optim.SGD(exp_params, lr=1.0) # dummy for zeroing
    if len(scale_params) > 0:
        scale_opt = torch.optim.SGD(scale_params, lr=1e-3 * learning_rate)
        scale_decay = torch.optim.lr_scheduler.StepLR(scale_opt,
                                                      step_size=1,
                                                      gamma=.99)
    else:
        scale_opt = None
        scale_decay = None
    lr_burn_in = torch.optim.lr_scheduler.MultiStepLR(opt,
                                                      milestones=[burn_in],
                                                      gamma=10)
    # lr_decay = torch.optim.lr_scheduler.StepLR(opt, decay_length, decay_step) #TODO reconcile multiple LR schedulers
    if use_yellowfin:
        from yellowfin import YFOptimizer
        opt = YFOptimizer(model_params)

    if use_adagrad:
        opt = torch.optim.Adagrad(model_params)

    if use_svrg:
        from svrg import SVRG
        base_opt = torch.optim.Adagrad if use_adagrad else torch.optim.SGD
        opt = SVRG(m.parameters(),
                   lr=learning_rate,
                   T=T,
                   data_loader=z,
                   opt=base_opt)
        # TODO add ability for SVRG to take parameter groups

    logging.info(opt)

    # Log stats from import: when warmstarting, check that it matches Julia's stats
    logging.info(f"*** Initial Checkpoint. Computing Stats")
    major_stats(GM, n, m, lazy_generation, Z, z, fig, ax, writer, visualize,
                subsample)
    logging.info("*** End Initial Checkpoint\n")

    # track best stats
    best_loss = 1.0e10
    best_dist = 1.0e10
    best_wcdist = 1.0e10
    best_map = 0.0
    for i in range(m.epoch + 1, m.epoch + epochs + 1):
        lr_burn_in.step()
        # lr_decay.step()
        # scale_decay.step()
        # print(scale_opt.param_groups[0]['lr'])
        # for param_group in opt.param_groups:
        #     print(param_group['lr'])
        # print(type(opt.param_groups), opt.param_groups)

        l, n_edges = 0.0, 0.0  # track average loss per edge
        m.train(True)
        if use_svrg:
            for data in z:

                def closure(data=data, target=None):
                    _data = data if target is None else (data, target)
                    c = m.loss(_data.to(device))
                    c.backward()
                    return c.data[0]

                l += opt.step(closure)

                # Projection
                m.normalize()

        else:
            # scale_opt.zero_grad()
            for the_step in range(extra_steps):
                # Accumulate the gradient
                for u in z:
                    # Zero out the gradients
                    # if opt is not None: opt.zero_grad() # This is handled by the SVRG.
                    # if exp is not None: exp.zero_grad()
                    opt.zero_grad()
                    for p in exp_params:
                        if p.grad is not None:
                            p.grad.detach_()
                            p.grad.zero_()
                    # Compute loss
                    _loss = m.loss(cu_var(u))
                    _loss.backward()
                    l += _loss.item() * u[0].size(0)
                    # print(weight)
                    n_edges += u[0].size(0)
                    # modify gradients if necessary
                    RParameter.correct_metric(m.parameters())
                    # step
                    opt.step()
                    for p in exp_params:
                        lr = opt.param_groups[0]['lr']
                        p.exp(lr)
                    # Projection
                    m.normalize()
            # scale_opt.step()

        l /= n_edges

        # m.epoch refers to num of training epochs finished
        m.epoch += 1

        # Logging code
        # if l < tol:
        #         logging.info("Found a {l} solution. Done at iteration {i}!")
        #         break
        if i % print_freq == 0:
            logging.info(f"{i} loss={l}")
        if (i <= burn_in and i %
            (checkpoint_freq / 5) == 0) or i % checkpoint_freq == 0:
            logging.info(f"\n*** Major Checkpoint. Computing Stats and Saving")
            avg_dist, wc_dist, me, mc, mapscore = major_stats(
                GM, n, m, True, Z, z, fig, ax, writer, visualize, subsample)
            best_loss = min(best_loss, l)
            best_dist = min(best_dist, avg_dist)
            best_wcdist = min(best_wcdist, wc_dist)
            best_map = max(best_map, mapscore)
            if model_save_file is not None:
                fname = f"{model_save_file}.{m.epoch}"
                logging.info(
                    f"Saving model into {fname} {torch.sum(m.embedding().data)} "
                )
                torch.save(m, fname)
            logging.info("*** End Major Checkpoint\n")
        if i % resample_freq == 0:
            if sample < 1. or subsample is not None:
                Z, z = build_dataset(G, lazy_generation, sample, subsample,
                                     scale, batch_size, weight_fn, num_workers)

    logging.info(f"final loss={l}")
    logging.info(
        f"best loss={best_loss}, distortion={best_dist}, map={best_map}, wc_dist={best_wcdist}"
    )

    final_dist, final_wc, final_me, final_mc, final_map = major_stats(
        GM, n, m, lazy_generation, Z, z, fig, ax, writer, False, subsample)

    if log_name is not None:
        with open(log_name + '.stat', "w") as f:
            f.write("Best-loss MAP dist wc Final-_MAP dist wc me mc\n")
            f.write(
                f"{best_loss:10.6f} {best_map:8.4f} {best_dist:8.4f} {best_wcdist:8.4f} {l:10.6f} {final_map:8.4f} {final_dist:8.4f} {final_wc:8.4f} {final_me:8.4f} {final_mc:8.4f}"
            )

    if visualize:
        writer.finish()

    if model_save_file is not None:
        fname = f"{model_save_file}.final"
        logging.info(
            f"Saving model into {fname}-final {torch.sum(m.embedding().data)} {unwrap(m.scale())}"
        )
        torch.save(m, fname)
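For reference, a minimal invocation of learn with the YellowFin path enabled might look like the following; the dataset path, save path, and dimensions are placeholders rather than values taken from the snippet:

# Hypothetical call; every keyword below exists in the signature of learn above.
learn("data/example_graph.edges",
      dim=10, hyp=1, edim=1, euc=0, sdim=1, sph=0,
      learning_rate=0.1, epochs=100,
      use_yellowfin=True,        # swaps the SGD optimizer for YFOptimizer
      model_save_file="models/example", log=True)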
Esempio n. 26
0
# Use the nn package to define our model and loss function.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)
loss_fn = torch.nn.MSELoss(size_average=False)

# Use the optim package to define an Optimizer that will update the weights of
# the model for us. Here we use the YellowFin optimizer (YFOptimizer); the optim
# package contains many other optimization algorithms. The first argument to the
# constructor tells the optimizer which Variables it should update.

min_loss_so_far = np.inf

optimizer = YFOptimizer(model.parameters(), lr=0.0001, mu=0.0)
for t in range(6600):
    # Forward pass: compute predicted y by passing x to the model.
    y_pred = model(x)

    # Compute and print loss.
    loss = loss_fn(y_pred, y)

    # Before the backward pass, use the optimizer object to zero all of the
    # gradients for the variables it will update (which are the learnable weights
    # of the model)
    optimizer.zero_grad()

    # Backward pass: compute gradient of the loss with respect to model parameters
    loss.backward()
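The listing cuts this snippet off right after loss.backward(); a typical continuation of the loop, assumed from the standard PyTorch training pattern rather than recovered from the original source, would be:

    # Hypothetical continuation (not part of the scraped snippet):
    optimizer.step()                                      # let YFOptimizer update the weights
    min_loss_so_far = min(min_loss_so_far, loss.item())   # track the best loss seen so far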
Esempio n. 27
0
    def _init_graph(self):
        self.graph = tf.Graph()  # create a new computation graph
        with self.graph.as_default():  # use it as the default graph

            tf.compat.v1.set_random_seed(self.random_seed)  # set the random seed
            # placeholders
            self.feat_index = tf.compat.v1.placeholder(
                tf.int32, shape=[None, None], name="feat_index")  # None *F
            self.feat_value = tf.compat.v1.placeholder(
                tf.float32, shape=[None, None], name="feat_value")  # None *F
            self.label = tf.compat.v1.placeholder(tf.float32,
                                                  shape=[None, 1],
                                                  name="label")  # None*1
            self.dropout_keep_fm = tf.compat.v1.placeholder(
                tf.float32, shape=[None], name="dropout_keep_fm")
            self.dropout_keep_deep = tf.compat.v1.placeholder(
                tf.float32, shape=[None], name="dropout_keep_deep")

            self.train_phase = tf.compat.v1.placeholder(
                tf.bool, name="train_phase")  # shape defaults to None

            self.weights = self._initialize_weights()  # initialize weights

            # model
            self.embeddings = tf.nn.embedding_lookup(
                self.weights["feature_embeddings"],
                self.feat_index)  # None *F *k

            feat_value = tf.reshape(self.feat_value,
                                    shape=[-1, self.field_size, 1])
            self.embeddings = tf.multiply(self.embeddings, feat_value)

            # -------------------first order term------------------------------
            self.y_first_order = tf.nn.embedding_lookup(
                self.weights["feature_bias"], self.feat_index)  # None*F*1
            self.y_first_order = tf.reduce_sum(
                tf.multiply(self.y_first_order, feat_value), 2)
            self.y_first_order = tf.compat.v1.nn.dropout(
                self.y_first_order,
                rate=1 - self.dropout_keep_fm[0])  # None *F

            # --------------------second order term-----------------------------
            # sum_square part
            self.summed_features_emb = tf.reduce_sum(self.embeddings,
                                                     1)  # None *k
            self.summed_features_emb_square = tf.square(
                self.summed_features_emb)  # None *k

            # square_sum part
            self.squared_features_emb = tf.square(self.embeddings)  # None*k
            self.squared_sum_features_emb = tf.reduce_sum(
                self.squared_features_emb, 1)  # None *k

            # second order
            self.y_second_order = 0.5 * tf.subtract(
                self.summed_features_emb_square, self.squared_sum_features_emb)
            # None*k
            self.y_second_order = tf.nn.dropout(
                self.y_second_order, self.dropout_keep_fm[1])  # None *k

            # -------------------------Deep component---------------------------------------
            self.y_deep = tf.reshape(
                self.embeddings,
                shape=[-1,
                       self.field_size * self.embedding_size])  # None *(F*k)
            self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[0])
            for i in range(0, len(self.deep_layers)):
                self.y_deep = tf.add(
                    tf.matmul(self.y_deep, self.weights["layer_%d" % i]),
                    self.weights["bias_%d" % i])
                # None * layer[i] * 1
                if self.batch_norm:
                    self.y_deep = self.batch_norm_layer(
                        self.y_deep,
                        train_phase=self.train_phase,
                        scope_bn="bn_%d" % i)
                    # None *layer[i] *1
                self.y_deep = self.deep_layers_activation(self.y_deep)
                self.y_deep = tf.nn.dropout(
                    self.y_deep,
                    self.dropout_keep_deep[1 +
                                           i])  # dropout at each Deep layer

            # --------------------------------------DeepFM-----------------------------------------
            if self.use_fm and self.use_deep:
                concat_input = tf.concat(
                    [self.y_first_order, self.y_second_order, self.y_deep],
                    axis=1)
            elif self.use_fm:
                concat_input = tf.concat(
                    [self.y_first_order, self.y_second_order], axis=1)
            elif self.use_deep:
                concat_input = self.y_deep
            else:
                raise AttributeError  # neither the FM nor the Deep component was selected

            self.out = tf.add(
                tf.matmul(concat_input, self.weights["concat_projection"]),
                self.weights["concat_bias"])

            # ---------------------------------------loss-----------------------------------------
            if self.loss_type == "logloss":
                self.out = tf.nn.sigmoid(self.out)
                self.loss = tf.compat.v1.losses.log_loss(self.label, self.out)
            elif self.loss_type == "mse":
                self.loss = tf.nn.l2_loss(tf.subtract(self.label, self.out))
            # l2 regularization on weights
            if self.l2_reg > 0:
                self.loss += tf.contrib.layers.l2_regularizer(self.l2_reg)(
                    self.weights["concat_projection"])
                if self.use_deep:
                    for i in range(len(self.deep_layers)):
                        self.loss += tf.contrib.layers.l2_regularizer(
                            self.l2_reg)(self.weights["layer_%d" % i])

            # ----------------------------------- optimizer -----------------------------------
            if self.optimizer_type == "adam":
                self.optimizer = tf.compat.v1.train.AdamOptimizer(
                    learning_rate=self.learning_rate,
                    beta1=0.9,
                    beta2=0.999,
                    epsilon=1e-8).minimize(self.loss)
            elif self.optimizer_type == "adagrad":
                self.optimizer = tf.compat.v1.train.AdagradOptimizer(
                    learning_rate=self.learning_rate,
                    initial_accumulator_value=1e-8).minimize(self.loss)

            elif self.optimizer_type == "gd":
                self.optimizer = tf.compat.v1.train.GradientDescentOptimizer(
                    learning_rate=self.learning_rate).minimize(self.loss)

            elif self.optimizer_type == "momentum":
                self.optimizer = tf.compat.v1.train.MomentumOptimizer(
                    learning_rate=self.learning_rate,
                    momentum=0.95).minimize(self.loss)

            elif self.optimizer_type == "yellowfin":
                self.optimizer = YFOptimizer(learning_rate=self.learning_rate,
                                             momentum=0.0).minimize(self.loss)

            # init
            self.save = tf.compat.v1.train.Saver()  # instantiate the Saver
            init = tf.compat.v1.global_variables_initializer()
            self.sess = self._init_session()
            self.sess.run(init)

            # number of params
            total_parameters = 0
            for variable in self.weights.values():
                shape = variable.get_shape()
                variable_parameters = 1
                for dim in shape:
                    # TensorShape yields Dimension objects in TF1 but plain ints under TF2's compat mode
                    variable_parameters *= getattr(dim, "value", dim)
                total_parameters += variable_parameters
            if self.verbose > 0:
                print("#params: %d" % total_parameters)
Esempio n. 28
0
    if not os.path.isdir(args.logdir):
        os.mkdir(args.logdir)

    train_loss_list = []
    val_loss_list = []
    lr_list = []
    mu_list = []
    if args.opt_method == "SGD":
        print("using SGD")
        optimizer = torch.optim.SGD(model.parameters(), lr, momentum=0.0)
    elif args.opt_method == "momSGD":
        print("using mom SGD")
        optimizer = torch.optim.SGD(model.parameters(), lr, momentum=0.9)
    elif args.opt_method == "YF":
        print("using YF")
        optimizer = YFOptimizer(model.parameters(), lr=1.0, mu=0.0)
    elif args.opt_method == "Adagrad":
        print("using Adagrad")
        optimizer = torch.optim.Adagrad(model.parameters(), lr)
    elif args.opt_method == "Adam":
        print("using Adam")
        optimizer = torch.optim.Adam(model.parameters(), lr)
    for epoch in range(1, args.epochs+1):
        epoch_start_time = time.time()
        train_loss = train()
        train_loss_list += train_loss
        val_loss = evaluate(val_data)
        val_loss_list.append(val_loss)
        print('-' * 89)
        print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f} | '
              'valid ppl {:8.2f}'.format(epoch,
Esempio n. 29
0
    def _init_graph(self):
        self.graph = tf.Graph()
        with self.graph.as_default():

            tf.set_random_seed(self.random_seed)
            # sparse storage: indices range over [0, feature_size), but each sample has only field_size of them, i.e. one value per field; how should multi-valued categorical fields be handled?
            self.feat_index = tf.placeholder(tf.int32, shape=[None, None],
                                                 name="feat_index")  # None * F
            self.feat_value = tf.placeholder(tf.float32, shape=[None, None],
                                                 name="feat_value")  # None * F
            self.label = tf.placeholder(tf.float32, shape=[None, 1], name="label")  # None * 1
            self.dropout_keep_fm = tf.placeholder(tf.float32, shape=[None], name="dropout_keep_fm")
            self.dropout_keep_deep = tf.placeholder(tf.float32, shape=[None], name="dropout_keep_deep")
            self.train_phase = tf.placeholder(tf.bool, name="train_phase")

            self.weights = self._initialize_weights()

            # look up the field_size embedding vectors from the feature_size * embedding_size parameter matrix; together with the embedding * feat_value product this acts like the first layer of an MLP
            self.embeddings = tf.nn.embedding_lookup(self.weights["feature_embeddings"],
                                                             self.feat_index)  # None * F * K
            feat_value = tf.reshape(self.feat_value, shape=[-1, self.field_size, 1])
            self.embeddings = tf.multiply(self.embeddings, feat_value)

            # ---------- first order term ----------
            # weights['feature_bias'] is really w; the true bias term w0 is not included here
            self.y_first_order = tf.nn.embedding_lookup(self.weights["feature_bias"], self.feat_index) # None * F * 1
            self.y_first_order = tf.reduce_sum(tf.multiply(self.y_first_order, feat_value), 2)  # None * F
            self.y_first_order = tf.nn.dropout(self.y_first_order, self.dropout_keep_fm[0]) # None * F

            # ---------- second order term ---------------
            # v_i has already absorbed x_i, so feat_value does not appear explicitly here
            # sum_square part
            self.summed_features_emb = tf.reduce_sum(self.embeddings, 1)  # None * K
            self.summed_features_emb_square = tf.square(self.summed_features_emb)  # None * K

            # square_sum part
            self.squared_features_emb = tf.square(self.embeddings)
            self.squared_sum_features_emb = tf.reduce_sum(self.squared_features_emb, 1)  # None * K

            # second order
            self.y_second_order = 0.5 * tf.subtract(self.summed_features_emb_square, self.squared_sum_features_emb)  # None * K
            self.y_second_order = tf.nn.dropout(self.y_second_order, self.dropout_keep_fm[1])  # None * K

            # ---------- Deep component ----------
            # flatten the [field_size, embedding_size] input
            self.y_deep = tf.reshape(self.embeddings, shape=[-1, self.field_size * self.embedding_size]) # None * (F*K)
            self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[0])
            # what follows is a standard MLP
            for i in range(0, len(self.deep_layers)):
                self.y_deep = tf.add(tf.matmul(self.y_deep, self.weights["layer_%d" %i]), self.weights["bias_%d"%i]) # None * layer[i] * 1
                if self.batch_norm:
                    self.y_deep = self.batch_norm_layer(self.y_deep, train_phase=self.train_phase, scope_bn="bn_%d" %i) # None * layer[i] * 1
                self.y_deep = self.deep_layers_activation(self.y_deep)
                self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[1+i]) # dropout at each Deep layer

            # ---------- DeepFM ----------
            if self.use_fm and self.use_deep:
                concat_input = tf.concat([self.y_first_order, self.y_second_order, self.y_deep], axis=1)
            elif self.use_fm:
                concat_input = tf.concat([self.y_first_order, self.y_second_order], axis=1)
            elif self.use_deep:
                concat_input = self.y_deep
            self.out = tf.add(tf.matmul(concat_input, self.weights["concat_projection"]), self.weights["concat_bias"])

            # loss
            if self.loss_type == "logloss":
                self.out = tf.nn.sigmoid(self.out)
                self.loss = tf.losses.log_loss(self.label, self.out)
            elif self.loss_type == "mse":
                self.loss = tf.nn.l2_loss(tf.subtract(self.label, self.out))
            # l2 regularization on weights
            if self.l2_reg > 0:
                self.loss += tf.contrib.layers.l2_regularizer(
                    self.l2_reg)(self.weights["concat_projection"])
                if self.use_deep:
                    for i in range(len(self.deep_layers)):
                        self.loss += tf.contrib.layers.l2_regularizer(
                            self.l2_reg)(self.weights["layer_%d"%i])

            # optimizer
            if self.optimizer_type == "adam":
                self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate, beta1=0.9, beta2=0.999,
                                                        epsilon=1e-8).minimize(self.loss)
            elif self.optimizer_type == "adagrad":
                self.optimizer = tf.train.AdagradOptimizer(learning_rate=self.learning_rate,
                                                           initial_accumulator_value=1e-8).minimize(self.loss)
            elif self.optimizer_type == "gd":
                self.optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
            elif self.optimizer_type == "momentum":
                self.optimizer = tf.train.MomentumOptimizer(learning_rate=self.learning_rate, momentum=0.95).minimize(
                    self.loss)
            elif self.optimizer_type == "yellowfin":
                self.optimizer = YFOptimizer(learning_rate=self.learning_rate, momentum=0.0).minimize(
                    self.loss)

            # init
            self.saver = tf.train.Saver()
            init = tf.global_variables_initializer()
            self.sess = self._init_session()
            self.sess.run(init)

            # number of params
            total_parameters = 0
            for variable in self.weights.values():
                shape = variable.get_shape()
                variable_parameters = 1
                for dim in shape:
                    variable_parameters *= dim.value
                total_parameters += variable_parameters
            if self.verbose > 0:
                print("#params: %d" % total_parameters)
Esempio n. 30
0
def test_lr_mu():
  opt = YFOptimizer(zero_debias=False)
  w = tf.Variable(np.ones([n_dim, ] ), dtype=tf.float32, name="w", trainable=True)
  b = tf.Variable(np.ones([1, ], dtype=np.float32), dtype=tf.float32, name="b", trainable=True)
  x = tf.constant(np.ones([n_dim, ], dtype=np.float32), dtype=tf.float32)
  loss = tf.multiply(w, x) + b
  tvars = tf.trainable_variables()

  w_grad_val = tf.Variable(np.zeros( [n_dim, ] ), dtype=tf.float32, trainable=False)
  b_grad_val = tf.Variable(np.zeros([1, ] ), dtype=tf.float32, trainable=False)
  apply_op = opt.apply_gradients(zip([w_grad_val, b_grad_val], tvars) )

  init_op = tf.global_variables_initializer()
  with tf.Session() as sess:
    sess.run(init_op)
    target_h_max = 0.0
    target_h_min = 0.0
    g_norm_squared_avg = 0.0
    g_norm_avg = 0.0
    g_avg = 0.0
    target_dist = 0.0
    target_lr = 0.1
    target_mu = 0.0
    for i in range(n_iter):
    
      sess.run(tf.assign(w_grad_val, (i + 1) * np.ones( [n_dim, ], dtype=np.float32) ) )
      sess.run(tf.assign(b_grad_val, (i + 1) * np.ones( [1, ], dtype=np.float32) ) )
  
      res = sess.run( [opt._curv_win, opt._h_max, opt._h_min, opt._grad_var, opt._dist_to_opt_avg, 
        opt._lr_var, opt._mu_var, apply_op] )
    
      res[5] = opt._lr_var.eval()
      res[6] = opt._mu_var.eval()
  
      g_norm_squared_avg = 0.999 * g_norm_squared_avg  \
        + 0.001 * np.sum(( (i + 1)*np.ones( [n_dim + 1, ] ) )**2)
      g_norm_avg = 0.999 * g_norm_avg  \
        + 0.001 * np.linalg.norm( (i + 1)*np.ones( [n_dim + 1, ] ) )
      g_avg = 0.999 * g_avg + 0.001 * (i + 1)
 
      target_h_max = 0.999 * target_h_max + 0.001 * (i + 1)**2*(n_dim + 1)
      target_h_min = 0.999 * target_h_min + 0.001 * max(1, i + 2 - 20)**2*(n_dim + 1)
      target_var = g_norm_squared_avg - g_avg**2 * (n_dim + 1)
      target_dist = 0.999 * target_dist + 0.001 * g_norm_avg / g_norm_squared_avg

      if i > 0:
        lr, mu = tune_everything(target_dist**2, target_var, 1, target_h_min, target_h_max)
        target_lr = 0.999 * target_lr + 0.001 * lr
        target_mu = 0.999 * target_mu + 0.001 * mu

      # print "iter ", i, " h max ", res[1], target_h_max, " h min ", res[2], target_h_min, \
   #                              " var ", res[3], target_var, " dist ", res[4], target_dist
      # print "iter ", i, " lr ", res[5], target_lr, " mu ", res[6], target_mu

      assert np.abs(target_h_max - res[1] ) < np.abs(target_h_max) * 1e-3
      assert np.abs(target_h_min - res[2] ) < np.abs(target_h_min) * 1e-3
      assert np.abs(target_var - res[3] ) < np.abs(res[3] ) * 1e-3
      assert np.abs(target_dist - res[4] ) < np.abs(res[4] ) * 1e-3
      assert target_lr == 0.0 or np.abs(target_lr - res[5] ) < np.abs(res[5] ) * 1e-3
      assert target_mu == 0.0 or np.abs(target_mu - res[6] ) < np.abs(res[6] ) * 5e-3 
  print "lr and mu computing test passed!"
Esempio n. 31
0
                     maxlen=maxlen,
                     minlen=length)
valid = TextIterator(valid_dataset,
                     dictionary,
                     n_words_source=n_words,
                     batch_size=valid_batch_size,
                     maxlen=maxlen,
                     minlen=length)

rnn = RNN_LSTM(input_size, rnn_dim, num_layers, num_classes)
rnn.cuda()

criterion = nn.CrossEntropyLoss()

#opt = torch.optim.Adam(rnn.parameters(), lr=lr)
opt = YFOptimizer(rnn.parameters())


def evaluate_valid(valid):
    valid_loss = []
    acc = 0.0
    N = 0
    for x in valid:
        x = numpy.asarray(x, dtype=numpy.float32)
        x = torch.from_numpy(x)
        x = x.view(x.size()[0], x.size()[1], input_size)
        y = torch.cat((x[:, 1:, :], torch.zeros([x.size()[0], 1, input_size])),
                      1)
        images = Variable(x).cuda()
        labels = Variable(y).long().cuda()
        opt.zero_grad()