def create_graph(self):
        create_placeholders(self)
        create_discriminator(self, size=1)
        if (self.model_params.model_type
                == "SS") or (self.model_params.model_type == "BCE"):
            self.d_loss2 = -discriminator_adversarial_loss(self)
            self.disc_optimizer_adv = LazyAdamOptimizer(
                1.5e-3, beta1=0.8, beta2=0.9, epsilon=1e-5)
            self.adv_grad = tf.gradients(self.d_loss2, self.d_weights)
            self.d_train_adversarial = self.disc_optimizer_adv.minimize(
                self.d_loss2, var_list=self.d_weights)

        lr = 1e-3
        global_step = tf.Variable(0, trainable=False)
        rate = tf.train.exponential_decay(lr, global_step, 3, 0.9999)  # note: computed but never used; the optimizer below keeps the constant lr
        self.disc_optimizer = LazyAdamOptimizer(lr,
                                                beta1=0.8,
                                                beta2=0.9,
                                                epsilon=1e-5)
        self.d_baseline = self.disc_optimizer.minimize(self.d_loss1,
                                                       var_list=self.d_weights,
                                                       global_step=global_step)
        self.d_softmax = self.disc_optimizer.minimize(
            self.softmax_loss, var_list=self.d_weights, global_step=global_step)
        self.d_mle = self.disc_optimizer.minimize(
            -self.mle_lossD, var_list=self.d_weights, global_step=global_step)
        self.softmax_grad = tf.gradients(self.softmax_loss, self.d_weights)
Example #2
def get_optimizer(
    network_config,
    default_optimizer=train.AdadeltaOptimizer(learning_rate=1.0)):
    """
    Return the optimizer given by the input network configuration, or a default optimizer.
    :param network_config: network configuration
    :param default_optimizer: default optimization algorithm
    :return: configured optimizer
    """
    try:
        optimizer = network_config.optimizer
    except KeyError:
        logging.info("Using Adadelta as default optimizer.")
        return default_optimizer
    if isinstance(optimizer.lr, numbers.Number):
        lr = optimizer.lr
    else:
        optimizer.lr.num_train_steps = network_config.max_steps
        optimizer.lr.steps_per_epoch = network_config.steps_per_epoch
        lr = get_learning_rate(optimizer.lr, train.get_global_step())

    name = optimizer.name
    params = optimizer.params
    if "Adadelta" == name:
        opt = train.AdadeltaOptimizer(lr, **params)
    elif "Adam" == name:
        opt = train.AdamOptimizer(lr, **params)
    elif "LazyAdam" == name:
        opt = LazyAdamOptimizer(lr, **params)
    elif "LazyNadam" == name:
        opt = LazyNadamOptimizer(lr, **params)
    elif "SGD" == name:
        opt = train.GradientDescentOptimizer(lr)
    elif "Momentum" == name:
        opt = train.MomentumOptimizer(lr, **params)
    elif "Nadam" == name:
        opt = NadamOptimizerSparse(lr, **params)
    elif "bert" == name:
        opt = AdamWeightDecayOptimizer(
            lr,
            weight_decay_rate=0.01,
            beta_1=0.9,
            beta_2=0.999,
            epsilon=1e-6,
            exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"])
    else:
        raise ValueError("Invalid optimizer name: {}".format(name))
    return opt
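
# Usage sketch (not from the original source): call get_optimizer with a minimal,
# hypothetical config object. types.SimpleNamespace stands in for whatever parsed
# network-configuration type the project actually uses.
from types import SimpleNamespace

example_config = SimpleNamespace(
    optimizer=SimpleNamespace(name="LazyAdam", lr=1e-3,
                              params={"beta1": 0.9, "beta2": 0.999}),
    max_steps=100000,
    steps_per_epoch=1000)
example_opt = get_optimizer(example_config)  # returns a LazyAdamOptimizer with lr=1e-3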
Example #3
def get_opt(name, learning_rate, decay_steps=None):
    if name == 'momentum':
        optimizer = tf.train.MomentumOptimizer(learning_rate, 0.9)
    elif name == 'adam':
        optimizer = tf.train.AdamOptimizer(learning_rate,
                                           beta2=0.98,
                                           epsilon=1e-9)
    elif name == 'sgd':
        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    elif name == 'rms':
        optimizer = tf.train.RMSPropOptimizer(learning_rate)
    elif name == 'adagrad':
        optimizer = tf.train.AdagradOptimizer(learning_rate)
    elif name == 'lazyadam':
        optimizer = LazyAdamOptimizer(learning_rate)
    elif name == 'powersign':
        optimizer = PowerSignOptimizer(learning_rate)
    elif name == 'powersign-ld':
        optimizer = PowerSignOptimizer(
            learning_rate, sign_decay_fn=get_linear_decay_fn(decay_steps))
    elif name == 'powersign-cd':
        optimizer = PowerSignOptimizer(
            learning_rate, sign_decay_fn=get_cosine_decay_fn(decay_steps))
    elif name == 'powersign-rd':
        optimizer = PowerSignOptimizer(
            learning_rate, sign_decay_fn=get_restart_decay_fn(decay_steps))
    elif name == 'addsign':
        optimizer = AddSignOptimizer(learning_rate)
    elif name == 'addsign-ld':
        optimizer = AddSignOptimizer(
            learning_rate, sign_decay_fn=get_linear_decay_fn(decay_steps))
    elif name == 'addsign-cd':
        optimizer = AddSignOptimizer(
            learning_rate, sign_decay_fn=get_cosine_decay_fn(decay_steps))
    elif name == 'addsign-rd':
        optimizer = AddSignOptimizer(
            learning_rate, sign_decay_fn=get_restart_decay_fn(decay_steps))
    else:
        optimizer = None

    return optimizer
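
# Usage sketch (assumed, not part of the original file): decay_steps only matters for
# the sign-decay variants; for 'lazyadam' a plain learning rate is enough. The toy
# variable and loss below exist only to make the sketch self-contained.
_w = tf.Variable(0.0)
_loss = tf.square(_w - 1.0)
_opt = get_opt('lazyadam', learning_rate=1e-3)
_train_op = _opt.minimize(_loss)
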
imgs, filenames = prepare_imgs('test_data_4', 'test_data_4_25')
model_name = 'inception_v3'
beta = 1
learning_rate = 0.01
iteration_num = 50000

checkpoint_iter = 50000

sess, graph, mask_var, sig_mask_op, masked_input = build_masking_graph(
    model_name, 4)

# list_tensors()

cost_op, last_feat_map_op, loss_terms = masking_graph_cost(sig_mask_op)

optimizer = LazyAdamOptimizer(learning_rate)
opt_op = optimizer.minimize(cost_op, var_list=[mask_var])

iter = Iteration(max=iteration_num, log=50, checkpoint=checkpoint_iter)

# tensorboard
# loss_terms_placeholder = tf.placeholder(tf.float32)
# tf.summary.scalar('loss_terms', loss_terms_placeholder)
# writers = tensorboard_writers(experiment_file.save_directory, loss_terms)
# merged_summary = tf.summary.merge_all()
#

AM_LOSS_THRESHOLD = 1
MASK_CONVERGENCE_THRESHOLD = 10
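
# Hypothetical optimization loop (the original snippet stops before the loop itself):
# repeatedly apply the LazyAdam update to the mask variable and log the cost. The
# Iteration helper above presumably drives the real loop; this is only a sketch.
sess.run(tf.variables_initializer(optimizer.variables()))  # mask_var is assumed to be initialised by build_masking_graph
for step in range(iteration_num):
    _, cost = sess.run([opt_op, cost_op])
    if step % 50 == 0:
        print('step {}: cost {:.6f}'.format(step, cost))
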

class Self_Basket_Completion_Model(object):
    def __init__(self, model):
        self.model_params = model
        self.train_data, self.test_data = list(), list()
        self.X_train, self.Y_train, self.X_test, self.Y_test = list(), list(), list(), list()
        self.LSTM_labels_train, self.LSTM_labels_test = list(), list()
        self.index, self.index_words = 0, 0
        self.neg_sampled = model.neg_sampled
        self.neg_sampled_pretraining = 1 if self.neg_sampled < 1 else self.neg_sampled

        self.training_data, self.test_data = model.training_data, model.test_data
        self.num_epochs, self.batch_size, self.vocabulary_size, self.vocabulary_size2 = model.epoch, model.batch_size, model.vocabulary_size, model.vocabulary_size2
        self.seq_length, self.epoch = model.seq_length, model.epoch
        self.embedding_size, self.embedding_matrix, self.use_pretrained_embeddings = model.embedding_size, model.embedding_matrix, model.use_pretrained_embeddings
        self.adv_generator_loss, self.adv_discriminator_loss = model.adv_generator_loss, model.adv_discriminator_loss
        self.negD = model.negD
        self.discriminator_type = model.D_type
        self.one_guy_sample = np.random.choice(self.vocabulary_size - 1)
        self.dataD = [list(), list(), list(), list(), list(), list(), list()]
        self.Gen_loss1, self.Gen_loss2, self.Disc_loss1, self.Disc_loss2, self.pic_number = 0, 0, 0, 0, 0

    def create_graph(self):
        create_placeholders(self)
        create_discriminator(self, size=1)
        if (self.model_params.model_type
                == "SS") or (self.model_params.model_type == "BCE"):
            self.d_loss2 = -discriminator_adversarial_loss(self)
            self.disc_optimizer_adv = LazyAdamOptimizer(
                1.5e-3, beta1=0.8, beta2=0.9, epsilon=1e-5)
            self.adv_grad = tf.gradients(self.d_loss2, self.d_weights)
            self.d_train_adversarial = self.disc_optimizer_adv.minimize(
                self.d_loss2, var_list=self.d_weights)

        lr = 1e-3
        global_step = tf.Variable(0, trainable=False)
        rate = tf.train.exponential_decay(lr, global_step, 3, 0.9999)  # note: computed but never used; the optimizer below keeps the constant lr
        self.disc_optimizer = LazyAdamOptimizer(lr,
                                                beta1=0.8,
                                                beta2=0.9,
                                                epsilon=1e-5)
        self.d_baseline = self.disc_optimizer.minimize(self.d_loss1,
                                                       var_list=self.d_weights,
                                                       global_step=global_step)
        self.d_softmax = self.disc_optimizer.minimize(
            self.softmax_loss, var_list=self.d_weights, global_step=global_step)
        self.d_mle = self.disc_optimizer.minimize(
            -self.mle_lossD, var_list=self.d_weights, global_step=global_step)
        self.softmax_grad = tf.gradients(self.softmax_loss, self.d_weights)

    def train_model_with_tensorflow(self):
        self.create_graph()
        self._sess = tf.Session()
        self._sess.run(tf.global_variables_initializer())
        self.options, self.run_metadata = create_options_and_metadata(self)
        step, cont = 0, True
        disc_loss1, disc_loss2 = 0, 0

        timee = time.time()
        while cont:
            try:
                if (self.model_params.model_type == "baseline"):
                    _, disc_loss1 = training_step(
                        self, [self.d_baseline, self.d_loss1])
                elif (self.model_params.model_type == "softmax"):
                    _, disc_loss1 = training_step(
                        self, [self.d_softmax, self.softmax_loss])
                elif (self.model_params.model_type == "MLE"):
                    _, disc_loss1 = training_step(
                        self, [self.d_mle, -self.mle_lossD])
                else:
                    _, disc_loss1, disc_loss2 = training_step(
                        self,
                        [self.d_train_adversarial, self.d_loss1, self.d_loss2])

                self.Disc_loss1, self.Disc_loss2 = (self.Disc_loss1 +
                                                    disc_loss1,
                                                    self.Disc_loss2 +
                                                    disc_loss2)

                if (math.isnan(disc_loss1)) or (math.isnan(disc_loss2)):
                    cont = False

                if ((step > self.model_params.min_steps) and (early_stopping(self.dataD[0], 4) or early_stopping(self.dataD[2], 4))) or \
                        (step > self.model_params.max_steps):
                    cont = False

                if (step % self.model_params.printing_step == 0):
                    print(time.time() - timee)

                self.save_data(step)
                testing_step(self, step)
                if (step < 10):
                    create_timeline_object(self)

                if (step % self.model_params.printing_step == 0):
                    timee = time.time()
                step += 1

            except KeyboardInterrupt:
                cont = False

        self.save_data(step)
        tf.reset_default_graph()

    def save_data(self, step):
        if (step % self.model_params.saving_step == 0):
            data = np.array(self.dataD)
            np.save(self.model_params.name, data)
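
# Usage sketch (hypothetical): `model_params` stands for the configuration object the
# constructor above expects (epoch, batch_size, vocabulary_size, model_type,
# training_data/test_data, ...); it is not defined in this snippet.
completion_model = Self_Basket_Completion_Model(model_params)
completion_model.train_model_with_tensorflow()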
Example #6
        embedded_a = embedded_attentive_a
        out_embeddings = embedding_tables[out_node_type]["out"]
        #out_embeddings = attentive_out_embeddings#embedding_tables[out_node_type]["out"]

        embedded_b = tf.nn.embedding_lookup(out_embeddings, b_placeholder)
        embedded_neg_b = tf.nn.embedding_lookup(out_embeddings, neg_b_placeholder)
        # embedded_b = embedded_attentive_b
        # embedded_neg_b = embedded_attentive_neg_b

        meta_losses = build_line_losses(embedded_a, embedded_b, embedded_neg_b, drop_rate_placeholder) + attention_func.l2_loss() * l2_coe
        # optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(meta_losses, var_list=attention_func.variables)
        var_list = None if out_node_type == TYPE_LABEL else attention_func.variables
        # optimizer = tf.train.AdamOptimizer(learning_rate=attention_decayed_learning_rate).minimize(meta_losses, var_list=var_list, global_step=global_step)

        optimizer = LazyAdamOptimizer(learning_rate=attention_learning_rate).minimize(
            meta_losses, var_list=var_list, global_step=global_step)
        # optimizer = tf.train.GradientDescentOptimizer(learning_rate=1e-2).minimize(meta_losses,
        #                                                                                            var_list=var_list,
        #                                                                                            global_step=global_step)

        # optimizer = tf.train.AdamOptimizer(learning_rate=5e-3).minimize(meta_losses, var_list=var_list)#, var_list=attention_func.variables + [label_out_embeddings]),
            # tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(meta_losses, var_list=embedding_vars)

        # optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(meta_losses)#, var_list=attention_func.variables)
        # optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(meta_losses)
    else:
        in_node_type, out_node_type = node_types
        in_embeddings = embedding_tables[in_node_type]["in"]
        out_embeddings = embedding_tables[out_node_type]["out"]
        meta_losses = build_line_losses_by_indices(in_embeddings, out_embeddings, a_placeholder, b_placeholder, neg_b_placeholder, drop_rate_placeholder)# + tf.nn.l2_loss(in_embeddings) * l2_coe
Example #7
def estimator_model_fn(features, labels, mode, params):
    """
    Parameters
    ----------
    features : list
        Each entry is a dict sent to one of the towers. Keys are {uid, iid, delta_t, seq_lens, user_ids}.
        Vals are tf.float32/tf.int32 tensors with first dimension of size batch_size_for_one_tower.
    labels : list
        Each entry is a tensor sent to one of the towers. The tf.float32 tensor is of the shape
        batch_size_for_one_tower x timesteps.
    mode : tf.estimator.ModeKeys object
        Passed by Estimator - either TRAIN, EVAL or PREDICT
    params : tf.contrib.training.HParams object
        Contains all parameters for the run - extracted from json by init_basic_argument_parser
    Returns
    -------
    tf.estimator.EstimatorSpec
        Object containing the built model
    """

    # Hacky fix for model_fn accepting lists as features whereas serving_input_receiver_fn requires a dict
    # Assumes predictions are served with only one tower
    if type(features) != list:
        features = [features]
        labels = [labels]

    # Flag whether weights are provided as a part of the inputs
    use_weights = "weights" in features[0].keys()

    # tower_features and labels are lists of dicts. Each item in the list goes to one tower,
    # each entry in a dict is a pair in {uid, iid, delta_t, seq_lens, user_ids} and {y} and its batch
    tower_features = features
    tower_labels = labels
    num_gpus = params.num_gpu if mode != tf.estimator.ModeKeys.PREDICT else 0

    # When not 1 GPU then always all results combined on CPU, if 1 GPU then combined on device according to param
    variable_strategy = "CPU" if (params.variable_strategy_CPU or mode
                                  == tf.estimator.ModeKeys.PREDICT) else "GPU"

    # Outputs of all towers
    tower_losses = []
    tower_gradvars = []
    tower_preds = []

    # Devices on which towers are built are either CPU if no GPUs or GPU if any available
    if num_gpus == 0:
        num_devices = 1
        device_type = "cpu"
    else:
        num_devices = num_gpus
        device_type = "gpu"

    # Build towers
    for i in range(num_devices):
        worker_device = "/{}:{}".format(device_type, i)

        # Strategy of instantiating variables on appropriate devices
        if variable_strategy == "CPU":
            device_setter = estimator_utils.local_device_setter(
                worker_device=worker_device)
        elif variable_strategy == 'GPU':
            device_setter = estimator_utils.local_device_setter(
                ps_device_type='gpu',
                worker_device=worker_device,
                ps_strategy=tf.contrib.training.GreedyLoadBalancingStrategy(
                    num_gpus, tf.contrib.training.byte_size_load_fn))

        # Reuse variables between towers - only init once on the first tower
        with tf.variable_scope("model", reuse=bool(i != 0)):
            with tf.name_scope("tower_%d" % i) as name_scope:
                with tf.device(device_setter):

                    # No labels available for PREDICT
                    tower_labs_or_None = tower_labels[i] if tower_labels else None

                    # Parameters for regularisation
                    regularization = {
                        "user_reg_weight": params.user_reg_weight,
                        "user_related_weights": params.user_related_weights
                    }

                    # Dict of outputs - always tower_predictions, gradvars and loss during training
                    tower_outputs = _tower_fn(
                        features=tower_features[i],
                        labels=tower_labs_or_None,
                        params=params,
                        num_towers=num_devices,
                        variable_strategy=variable_strategy,
                        regularization=regularization,
                        mode=mode)

                    if mode == tf.estimator.ModeKeys.TRAIN:
                        tower_gradvars.append(tower_outputs["gradvars"])
                        tower_losses.append(tower_outputs["tower_loss"])
                    if mode == tf.estimator.ModeKeys.EVAL:
                        tower_losses.append(tower_outputs["tower_loss"])
                        tower_preds.append(tower_outputs["tower_predictions"])
                    if mode == tf.estimator.ModeKeys.PREDICT:
                        tower_preds.append(tower_outputs["tower_predictions"])

    # Combine the outputs on the master node
    consolidation_device = "/gpu:0" if variable_strategy == "GPU" else "/cpu:0"
    with tf.device(consolidation_device):

        if mode != tf.estimator.ModeKeys.TRAIN:
            preds = {
                k: tf.concat([x[k] for x in tower_preds], axis=0)
                for k in tower_preds[0].keys()
            }

        # Combine non-feature inputs from all towers
        with tf.name_scope("merge_tower_inputs"):
            stacked_seq_lens = tf.concat(
                [t["seq_lens"] for t in tower_features], axis=0)
            stacked_batch_user_ids = tf.concat(
                [t["uid"][:, 0] for t in tower_features], axis=0)
            stacked_weights = None
            if use_weights:
                stacked_weights = tf.concat(
                    [t["weights"] for t in tower_features], axis=0)

        if mode == tf.estimator.ModeKeys.PREDICT:

            # If only interested in the last prediction (e.g. real recommendation)
            if params.last_prediction_only:
                # For each sequence slice the last real timestep
                # preds = {k: tf.gather_nd(v, stacked_seq_lens-1) for k, v in preds.items()}
                batch_size = tf.shape(stacked_seq_lens)[0]
                slices = tf.concat([
                    tf.expand_dims(tf.range(batch_size), 1),
                    tf.expand_dims(stacked_seq_lens, 1) - 1
                ], axis=1)
                preds = {k: tf.gather_nd(v, slices) for k, v in preds.items()}

            # If want recommendations to be traceable back to specific users
            if params.prediction_include_uid:
                preds = merge_dicts(preds,
                                    {"user_ids": stacked_batch_user_ids})

            return tf.estimator.EstimatorSpec(
                mode, predictions=preds,
                export_outputs=None)  #TODO Specify my own outputs

        # Counts of individual user interactions per tower -
        # used to offset effects of differing sequence lens on things like metrics
        with tf.name_scope("total_interactions_count"):
            # If using weights: sequence mask's 0 and weight non-1 values have to be accounted for
            if use_weights:
                sequence_mask = tf.sequence_mask(
                    stacked_seq_lens,
                    params.timesteps,
                    dtype=tf.float32,
                    name="total_interactions_seq_mask")
                total_interactions_op = tf.reduce_sum(
                    tf.multiply(sequence_mask, stacked_weights),
                    name="total_interactions_op_weights")
            else:
                total_interactions_op = tf.reduce_sum(
                    stacked_seq_lens, name="total_interactions_op_no_weights")

        # Combine all labels from all towers
        with tf.name_scope("merge_tower_labels"):
            stacked_labels = tf.concat(labels, axis=0)

        # Calculate total batch loss
        with tf.name_scope("merge_tower_losses"):
            loss = reduce_tower_losses(tower_losses, total_interactions_op)

    if mode == tf.estimator.ModeKeys.TRAIN:

        # Calculate total gradients to apply (scaled by number of interactions in each batch)
        with tf.name_scope('average_gradients'):
            gradvars = average_gradients(tower_gradvars, total_interactions_op)

        with tf.device(consolidation_device):
            # Apply gradients
            with tf.name_scope("apply_gradients"):
                optimizer = LazyAdamOptimizer(params.learning_rate)

                # TODO Check if need params.sync
                train_op = optimizer.apply_gradients(
                    gradvars, global_step=tf.train.get_global_step())
            metrics = None

    else:
        with tf.device(consolidation_device):
            # Create a dict of metric_name: (metric_var, metric_update_op)
            with tf.name_scope("build_metrics"):
                metrics = build_metrics(labels=stacked_labels,
                                        predictions=preds["top_k"],
                                        seq_lens=stacked_seq_lens,
                                        batch_user_ids=stacked_batch_user_ids,
                                        params=params,
                                        input_top_k=True,
                                        weights=stacked_weights)
            train_op = None
            # Due to memory constraints loss is not recorded during evaluation, though it needs to be set to a tensor
            if params.zero_loss:
                with tf.name_scope("zero_loss"):
                    loss = tf.constant(0)

    with tf.device(consolidation_device):
        # Count processing speed
        batch_size = params.train_batch_size if mode == tf.estimator.ModeKeys.TRAIN else params.validation_batch_size
        training_hooks = [
            estimator_utils.ExamplesPerSecondHook(batch_size, every_n_steps=10)
        ]
        return tf.estimator.EstimatorSpec(mode=mode,
                                          loss=loss,
                                          train_op=train_op,
                                          eval_metric_ops=metrics,
                                          training_chief_hooks=training_hooks)
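
# Wiring sketch (assumed, not shown in the original file): estimator_model_fn follows the
# standard tf.estimator model_fn signature, so it can be handed to an Estimator together
# with an HParams object carrying the fields referenced above. The values here are
# illustrative placeholders, not the project's real configuration.
example_params = tf.contrib.training.HParams(
    num_gpu=1, learning_rate=1e-3, timesteps=50,
    train_batch_size=128, validation_batch_size=128,
    variable_strategy_CPU=True, last_prediction_only=False,
    prediction_include_uid=False, zero_loss=False,
    user_reg_weight=0.0)  # plus any remaining fields the towers need (e.g. user_related_weights)
estimator = tf.estimator.Estimator(model_fn=estimator_model_fn, params=example_params)
# estimator.train(input_fn=train_input_fn)  # train_input_fn is assumed to be defined elsewhere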