Example #1
    def testFlopRegularizer(self):
        tf.reset_default_graph()
        tf.set_random_seed(7907)
        with slim.arg_scope([slim.layers.conv2d, slim.layers.conv2d_transpose],
                            weights_initializer=tf.random_normal_initializer):
            # Our test model is:
            #
            #         -> conv1 --+
            #        /           |--[concat]
            #  image --> conv2 --+
            #        \
            #         -> convt
            #
            # (the model has two "outputs", convt and concat).
            #
            image = tf.constant(0.0, shape=[1, 17, 19, NUM_CHANNELS])
            conv1 = slim.layers.conv2d(image,
                                       13, [7, 5],
                                       padding='SAME',
                                       scope='conv1')
            conv2 = slim.layers.conv2d(image,
                                       23, [1, 1],
                                       padding='SAME',
                                       scope='conv2')
            self.concat = tf.concat([conv1, conv2], 3)
            self.convt = slim.layers.conv2d_transpose(image,
                                                      29, [7, 5],
                                                      stride=3,
                                                      padding='SAME',
                                                      scope='convt')
            self.name_to_var = {v.op.name: v for v in tf.global_variables()}
        with self.cached_session():
            tf.global_variables_initializer().run()

        threshold = 1.0
        flop_reg = flop_regularizer.GroupLassoFlopsRegularizer(
            [self.concat.op, self.convt.op],
            threshold=threshold,
            l1_fraction=0)

        with self.cached_session() as s:
            evaluated_vars = s.run(self.name_to_var)

        def group_norm(weights, axis=(0, 1, 2)):  # pylint: disable=invalid-name
            return np.sqrt(np.mean(weights**2, axis=axis))

        reg_vectors = {
            'conv1': group_norm(evaluated_vars['conv1/weights'], (0, 1, 2)),
            'conv2': group_norm(evaluated_vars['conv2/weights'], (0, 1, 2)),
            'convt': group_norm(evaluated_vars['convt/weights'], (0, 1, 3))
        }

        num_alive = {k: np.sum(r > threshold) for k, r in reg_vectors.items()}
        total_outputs = (reg_vectors['conv1'].shape[0] +
                         reg_vectors['conv2'].shape[0])
        total_alive_outputs = sum(num_alive.values())
        assert total_alive_outputs > 0, (
            'All outputs are dead - test is trivial. Decrease the threshold.')
        assert total_alive_outputs < total_outputs, (
            'All outputs are alive - test is trivial. Increase the threshold.')

        coeff1 = _coeff(_get_op('conv1/Conv2D'))
        coeff2 = _coeff(_get_op('conv2/Conv2D'))
        coefft = _coeff(_get_op('convt/conv2d_transpose'))

        expected_flop_cost = NUM_CHANNELS * (coeff1 * num_alive['conv1'] +
                                             coeff2 * num_alive['conv2'] +
                                             coefft * num_alive['convt'])
        expected_reg_term = NUM_CHANNELS * (
            coeff1 * np.sum(reg_vectors['conv1']) +
            coeff2 * np.sum(reg_vectors['conv2']) +
            coefft * np.sum(reg_vectors['convt']))
        with self.cached_session():
            self.assertEqual(round(expected_flop_cost),
                             round(flop_reg.get_cost().eval()))
            self.assertNearRelatively(
                expected_reg_term,
                flop_reg.get_regularization_term().eval())
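
The test above relies on module-level helpers `_get_op` and `_coeff` (and a `NUM_CHANNELS` constant) that are not part of the snippet. A minimal sketch of what such helpers might look like, assuming the test module's own `import tensorflow as tf`; the FLOP convention in `_coeff` is an assumption, not the original implementation:

# Hypothetical helpers assumed by the test above (illustrative, not the originals).
def _get_op(name):
    # Look up an operation in the default graph by name, e.g. 'conv1/Conv2D'.
    return tf.get_default_graph().get_operation_by_name(name)

def _coeff(op):
    # Per-(input-channel, output-channel) FLOP coefficient of a conv-like op:
    # 2 * spatial output size * kernel area (an assumed convention).
    out_shape = op.outputs[0].shape.as_list()      # [N, H, W, C_out]
    kernel_shape = op.inputs[1].shape.as_list()    # kernel; [kh, kw, ...] for conv and transpose
    return 2 * out_shape[1] * out_shape[2] * kernel_shape[0] * kernel_shape[1]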
Example #2
    data_stat += s
    print(s)

    train_y[train_y < rmin] = rmin
    valid_y[valid_y < rmin] = rmin
    for i in range(len(test_y)):
        test_y[i][test_y[i] < rmin] = rmin

    data_stat += get_stat('train', train_x, train_y)
    data_stat += get_stat('valid', valid_x, valid_y)
    for i in range(len(test_x)):
        data_stat += get_stat('test', test_x[i], test_y[i])

    model = construct_graph(args)
    init = tf.global_variables_initializer()
    best_saver = tf.train.Saver(tf.global_variables(), max_to_keep=1)

    # GPU memory options and timestamp for the log file name
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=args.memory)
    time_now = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')

    log_path = 'log/%s_%s_%s.log' % (str(args.save_name), time_now, str(args.seed))
    with tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) as sess, \
            open(log_path, 'w') as fp:
        summary_writer = tf.summary.FileWriter(
            'summary/%s_%s' % (str(args.save_name), str(args.seed)),
            graph=tf.get_default_graph())
        fp.write(' '.join(sys.argv)+'\n')
        fp.write(git_log()+'\n')
        fp.write(data_stat+'\n')

        sess.run(init)

        if args.resume_training != "":
            best_saver.restore(sess, args.resume_training)
Example #3
            def build_example(label, param_dict_real, zip_path_label):
                """Build the model with parameter values set in param_dict_real.

        Args:
          label: Label of the model
          param_dict_real: Parameter dictionary (arguments to the factories
            make_graph and make_test_inputs)
          zip_path_label: Filename in the zip

        Returns:
          (tflite_model_binary, report) where tflite_model_binary is the
          serialized flatbuffer as a string and report is a dictionary with
          keys `toco_log` (log of toco conversion), `tf_log` (log of tf
          conversion), `toco` (a string of success status of the conversion),
          `tf` (a string success status of the conversion).
        """

                np.random.seed(RANDOM_SEED)
                report = {
                    "converter": report_lib.NOTRUN,
                    "tf": report_lib.FAILED
                }

                # Build graph
                report["tf_log"] = ""
                report["converter_log"] = ""
                tf.reset_default_graph()

                with tf.Graph().as_default():
                    with tf.device("/cpu:0"):
                        try:
                            inputs, outputs = make_graph(param_dict_real)
                        except (tf.errors.UnimplementedError,
                                tf.errors.InvalidArgumentError, ValueError):
                            report["tf_log"] += traceback.format_exc()
                            return None, report

                    sess = tf.Session()
                    try:
                        baseline_inputs, baseline_outputs = (make_test_inputs(
                            param_dict_real, sess, inputs, outputs))
                    except (tf.errors.UnimplementedError,
                            tf.errors.InvalidArgumentError, ValueError):
                        report["tf_log"] += traceback.format_exc()
                        return None, report
                    report["converter"] = report_lib.FAILED
                    report["tf"] = report_lib.SUCCESS
                    # Convert graph to toco
                    input_tensors = [(input_tensor.name.split(":")[0],
                                      input_tensor.shape, input_tensor.dtype)
                                     for input_tensor in inputs]
                    output_tensors = [
                        _normalize_output_name(out.name) for out in outputs
                    ]
                    # pylint: disable=g-long-ternary
                    graph_def = freeze_graph(
                        sess,
                        tf.global_variables() + inputs +
                        outputs) if use_frozen_graph else sess.graph_def

                if "split_tflite_lstm_inputs" in param_dict_real:
                    extra_toco_options.split_tflite_lstm_inputs = param_dict_real[
                        "split_tflite_lstm_inputs"]
                tflite_model_binary, toco_log = options.tflite_convert_function(
                    options,
                    graph_def,
                    input_tensors,
                    output_tensors,
                    extra_toco_options=extra_toco_options,
                    test_params=param_dict_real)
                report["converter"] = (report_lib.SUCCESS
                                       if tflite_model_binary is not None else
                                       report_lib.FAILED)
                report["converter_log"] = toco_log

                if options.save_graphdefs:
                    zipinfo = zipfile.ZipInfo(zip_path_label + ".pbtxt")
                    archive.writestr(zipinfo,
                                     text_format.MessageToString(graph_def),
                                     zipfile.ZIP_DEFLATED)

                if tflite_model_binary:
                    if options.make_edgetpu_tests:
                        # Set proper min max values according to input dtype.
                        baseline_inputs, baseline_outputs = generate_inputs_outputs(
                            tflite_model_binary, min_value=0, max_value=255)
                    zipinfo = zipfile.ZipInfo(zip_path_label + ".bin")
                    archive.writestr(zipinfo, tflite_model_binary,
                                     zipfile.ZIP_DEFLATED)
                    example = {
                        "inputs": baseline_inputs,
                        "outputs": baseline_outputs
                    }

                    example_fp = StringIO()
                    write_examples(example_fp, [example])
                    zipinfo = zipfile.ZipInfo(zip_path_label + ".inputs")
                    archive.writestr(zipinfo, example_fp.getvalue(),
                                     zipfile.ZIP_DEFLATED)

                    example_fp2 = StringIO()
                    write_test_cases(example_fp2, zip_path_label + ".bin",
                                     [example])
                    zipinfo = zipfile.ZipInfo(zip_path_label + "_tests.txt")
                    archive.writestr(zipinfo, example_fp2.getvalue(),
                                     zipfile.ZIP_DEFLATED)

                    zip_manifest_label = zip_path_label + " " + label
                    if zip_path_label == label:
                        zip_manifest_label = zip_path_label

                    zip_manifest.append(zip_manifest_label + "\n")

                return tflite_model_binary, report
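
Example #3 also calls a `_normalize_output_name` helper that is not included in the snippet. A plausible sketch of it (an assumption about its behavior, not necessarily the original) simply strips a trailing ':0' from tensor names:

def _normalize_output_name(output_name):
    """Strip a trailing ':0' from a tensor name (assumed behavior of the helper)."""
    return output_name.split(":")[0] if output_name.endswith(":0") else output_name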
Example #4
    def __init__(self,
                 predict: Union[Callable, tf.keras.Model, 'keras.Model'],
                 mode: str,
                 shape: tuple,
                 kappa: float = 0.,
                 beta: float = .1,
                 feature_range: tuple = (-1e10, 1e10),
                 gamma: float = 0.,
                 ae_model: Union[tf.keras.Model, 'keras.Model'] = None,
                 learning_rate_init: float = 1e-2,
                 max_iterations: int = 1000,
                 c_init: float = 10.,
                 c_steps: int = 10,
                 eps: tuple = (1e-3, 1e-3),
                 clip: tuple = (-100., 100.),
                 update_num_grad: int = 1,
                 no_info_val: Union[float, np.ndarray] = None,
                 write_dir: str = None,
                 sess: tf.Session = None) -> None:
        """
        Initialize contrastive explanation method.
        Paper: https://arxiv.org/abs/1802.07623

        Parameters
        ----------
        predict
            Keras or TensorFlow model or any other model's prediction function returning class probabilities
        mode
            Find pertinent negatives ('PN') or pertinent positives ('PP')
        shape
            Shape of input data starting with batch size
        kappa
            Confidence parameter for the attack loss term
        beta
            Regularization constant for L1 loss term
        feature_range
            Tuple with min and max ranges to allow for perturbed instances. Min and max ranges can be floats or
            numpy arrays with dimension (1 x nb of features) for feature-wise ranges
        gamma
            Regularization constant for optional auto-encoder loss term
        ae_model
            Optional auto-encoder model used for loss regularization
        learning_rate_init
            Initial learning rate of optimizer
        max_iterations
            Maximum number of iterations for finding a PN or PP
        c_init
            Initial value to scale the attack loss term
        c_steps
            Number of iterations to adjust the constant scaling the attack loss term
        eps
            If numerical gradients are used to compute dL/dx = (dL/dp) * (dp/dx), then eps[0] is used to
            calculate dL/dp and eps[1] is used for dp/dx. eps[0] and eps[1] can be a combination of float values and
            numpy arrays. For eps[0], the array dimension should be (1 x nb of prediction categories) and for
            eps[1] it should be (1 x nb of features)
        clip
            Tuple with min and max clip ranges for both the numerical gradients and the gradients
            obtained from the TensorFlow graph
        update_num_grad
            If numerical gradients are used, they will be updated every update_num_grad iterations
        no_info_val
            Global or feature-wise value considered as containing no information
        write_dir
            Directory to write TensorBoard files to
        sess
            Optional TensorFlow session that will be used if passed instead of creating or inferring one internally
        """
        super().__init__(meta=copy.deepcopy(DEFAULT_META_CEM))
        # get params for storage in meta
        params = locals()
        remove = ['self', 'predict', 'ae_model', 'sess', '__class__']
        for key in remove:
            params.pop(key)
        self.meta['params'].update(params)
        self.predict = predict

        # check whether the model and the auto-encoder are Keras or TF models and get session
        is_model, is_model_keras, model_sess = _check_keras_or_tf(predict)
        is_ae, is_ae_keras, ae_sess = _check_keras_or_tf(ae_model)
        # TODO: check ae and model are compatible
        self.meta['params'].update(is_model=is_model, is_model_keras=is_model_keras, is_ae=is_ae,
                                   is_ae_keras=is_ae_keras)

        # if session provided, use it
        if isinstance(sess, tf.Session):
            self.sess = sess
        else:
            self.sess = model_sess

        if is_model:  # Keras or TF model
            self.model = True
            classes = self.sess.run(self.predict(tf.convert_to_tensor(np.zeros(shape), dtype=tf.float32))).shape[1]
        else:
            self.model = False
            classes = self.predict(np.zeros(shape)).shape[1]

        self.mode = mode
        self.shape = shape
        self.kappa = kappa
        self.beta = beta
        self.gamma = gamma
        self.ae = ae_model
        self.batch_size = shape[0]
        self.max_iterations = max_iterations
        self.c_init = c_init
        self.c_steps = c_steps
        self.update_num_grad = update_num_grad
        self.eps = eps
        self.clip = clip
        self.write_dir = write_dir
        if type(no_info_val) == float:
            self.no_info_val = np.ones(shape) * no_info_val
        else:
            self.no_info_val = no_info_val

        # values regarded as containing no information
        # PNs will deviate away from these values while PPs will gravitate towards them
        self.no_info = tf.Variable(np.zeros(shape), dtype=tf.float32, name='no_info')

        # define tf variables for original and perturbed instances, and target labels
        self.orig = tf.Variable(np.zeros(shape), dtype=tf.float32, name='orig')
        self.adv = tf.Variable(np.zeros(shape), dtype=tf.float32, name='adv')  # delta(k)
        self.adv_s = tf.Variable(np.zeros(shape), dtype=tf.float32, name='adv_s')  # y(k)
        self.target = tf.Variable(np.zeros((self.batch_size, classes)), dtype=tf.float32, name='target')

        # define tf variable for constant used in FISTA optimization
        self.const = tf.Variable(np.zeros(self.batch_size), dtype=tf.float32, name='const')
        self.global_step = tf.Variable(0.0, trainable=False, name='global_step')

        # define placeholders that will be assigned to relevant variables
        self.assign_orig = tf.placeholder(tf.float32, shape, name='assign_orig')
        self.assign_adv = tf.placeholder(tf.float32, shape, name='assign_adv')
        self.assign_adv_s = tf.placeholder(tf.float32, shape, name='assign_adv_s')
        self.assign_target = tf.placeholder(tf.float32, (self.batch_size, classes), name='assign_target')
        self.assign_const = tf.placeholder(tf.float32, [self.batch_size], name='assign_const')
        self.assign_no_info = tf.placeholder(tf.float32, shape, name='assign_no_info')

        # define conditions and values for element-wise shrinkage thresholding (eq.7)
        with tf.name_scope('shrinkage_thresholding') as scope:
            cond = [tf.cast(tf.greater(tf.subtract(self.adv_s, self.orig), self.beta), tf.float32),
                    tf.cast(tf.less_equal(tf.abs(tf.subtract(self.adv_s, self.orig)), self.beta), tf.float32),
                    tf.cast(tf.less(tf.subtract(self.adv_s, self.orig), tf.negative(self.beta)), tf.float32)]
            upper = tf.minimum(tf.subtract(self.adv_s, self.beta), tf.cast(feature_range[1], tf.float32))
            lower = tf.maximum(tf.add(self.adv_s, self.beta), tf.cast(feature_range[0], tf.float32))
            self.assign_adv = (tf.multiply(cond[0], upper) + tf.multiply(cond[1], self.orig)
                               + tf.multiply(cond[2], lower))

        # perturbation update for delta and vector projection on correct set depending on PP or PN (eq.5)
        # delta(k) = adv; delta(k+1) = assign_adv
        with tf.name_scope('perturbation_delta') as scope:
            proj_d = [tf.cast(tf.greater(tf.abs(tf.subtract(self.assign_adv, self.no_info)),
                                         tf.abs(tf.subtract(self.orig, self.no_info))), tf.float32),
                      tf.cast(tf.less_equal(tf.abs(tf.subtract(self.assign_adv, self.no_info)),
                                            tf.abs(tf.subtract(self.orig, self.no_info))), tf.float32)]
            if self.mode == "PP":
                self.assign_adv = tf.multiply(proj_d[1], self.assign_adv) + tf.multiply(proj_d[0], self.orig)
            elif self.mode == "PN":
                self.assign_adv = tf.multiply(proj_d[0], self.assign_adv) + tf.multiply(proj_d[1], self.orig)

        # perturbation update and vector projection on correct set for y: y(k+1) = assign_adv_s (eq.6)
        with tf.name_scope('perturbation_y') as scope:
            self.zt = tf.divide(self.global_step, self.global_step + tf.cast(3, tf.float32))  # k/(k+3) in (eq.6)
            self.assign_adv_s = self.assign_adv + tf.multiply(self.zt, self.assign_adv - self.adv)
            proj_d_s = [tf.cast(tf.greater(tf.abs(tf.subtract(self.assign_adv_s, self.no_info)),
                                           tf.abs(tf.subtract(self.orig, self.no_info))), tf.float32),
                        tf.cast(tf.less_equal(tf.abs(tf.subtract(self.assign_adv_s, self.no_info)),
                                              tf.abs(tf.subtract(self.orig, self.no_info))), tf.float32)]
            if self.mode == "PP":
                self.assign_adv_s = tf.multiply(proj_d_s[1], self.assign_adv_s) + tf.multiply(proj_d_s[0], self.orig)
            elif self.mode == "PN":
                self.assign_adv_s = tf.multiply(proj_d_s[0], self.assign_adv_s) + tf.multiply(proj_d_s[1], self.orig)

        # delta(k) <- delta(k+1);  y(k) <- y(k+1)
        with tf.name_scope('update_adv') as scope:
            self.adv_updater = tf.assign(self.adv, self.assign_adv)
            self.adv_updater_s = tf.assign(self.adv_s, self.assign_adv_s)

        # from perturbed instance, derive deviation delta
        with tf.name_scope('update_delta') as scope:
            self.delta = self.orig - self.adv
            self.delta_s = self.orig - self.adv_s

        # define L1 and L2 loss terms; L1+L2 is later used as an optimization constraint for FISTA
        ax_sum = list(np.arange(1, len(shape)))
        with tf.name_scope('loss_l1_l2') as scope:
            self.l2 = tf.reduce_sum(tf.square(self.delta), axis=ax_sum)
            self.l2_s = tf.reduce_sum(tf.square(self.delta_s), axis=ax_sum)
            self.l1 = tf.reduce_sum(tf.abs(self.delta), axis=ax_sum)
            self.l1_s = tf.reduce_sum(tf.abs(self.delta_s), axis=ax_sum)
            self.l1_l2 = self.l2 + tf.multiply(self.l1, self.beta)
            self.l1_l2_s = self.l2_s + tf.multiply(self.l1_s, self.beta)

            # sum losses
            self.loss_l1 = tf.reduce_sum(self.l1)
            self.loss_l1_s = tf.reduce_sum(self.l1_s)
            self.loss_l2 = tf.reduce_sum(self.l2)
            self.loss_l2_s = tf.reduce_sum(self.l2_s)

        with tf.name_scope('loss_ae') as scope:
            # gamma * AE loss
            if self.mode == "PP" and callable(self.ae):
                self.loss_ae = self.gamma * tf.square(tf.norm(self.ae(self.delta) - self.delta))
                self.loss_ae_s = self.gamma * tf.square(tf.norm(self.ae(self.delta_s) - self.delta_s))
            elif self.mode == "PN" and callable(self.ae):
                self.loss_ae = self.gamma * tf.square(tf.norm(self.ae(self.adv) - self.adv))
                self.loss_ae_s = self.gamma * tf.square(tf.norm(self.ae(self.adv_s) - self.adv_s))
            else:  # no auto-encoder available
                self.loss_ae = tf.constant(0.)
                self.loss_ae_s = tf.constant(0.)

        with tf.name_scope('loss_attack') as scope:
            if not self.model:
                self.loss_attack = tf.placeholder(tf.float32)
            else:
                # make predictions on perturbed instance (PN) or delta (PP)
                if self.mode == "PP":
                    self.pred_proba = self.predict(self.delta)
                    self.pred_proba_s = self.predict(self.delta_s)
                elif self.mode == "PN":
                    self.pred_proba = self.predict(self.adv)
                    self.pred_proba_s = self.predict(self.adv_s)

                # probability of target label prediction
                self.target_proba = tf.reduce_sum(self.target * self.pred_proba, 1)
                target_proba_s = tf.reduce_sum(self.target * self.pred_proba_s, 1)

                # max probability of non target label prediction
                self.nontarget_proba_max = tf.reduce_max((1 - self.target) * self.pred_proba - (self.target * 10000), 1)
                nontarget_proba_max_s = tf.reduce_max((1 - self.target) * self.pred_proba_s - (self.target * 10000), 1)

                # loss term f(x,d) for PP (eq.4) and PN (eq.2)
                if self.mode == "PP":
                    loss_attack = tf.maximum(0.0, self.nontarget_proba_max - self.target_proba + self.kappa)
                    loss_attack_s = tf.maximum(0.0, nontarget_proba_max_s - target_proba_s + self.kappa)
                elif self.mode == "PN":
                    loss_attack = tf.maximum(0.0, -self.nontarget_proba_max + self.target_proba + self.kappa)
                    loss_attack_s = tf.maximum(0.0, -nontarget_proba_max_s + target_proba_s + self.kappa)

                # c * f(x,d)
                self.loss_attack = tf.reduce_sum(self.const * loss_attack)
                self.loss_attack_s = tf.reduce_sum(self.const * loss_attack_s)

        with tf.name_scope('loss_combined') as scope:
            # no need for L1 term in loss to optimize when using FISTA
            if self.model:
                self.loss_opt = self.loss_attack_s + self.loss_l2_s + self.loss_ae_s
            else:  # separate numerical computation of loss attack gradient
                self.loss_opt = self.loss_l2_s + self.loss_ae_s

            # add L1 term to overall loss; this is not the loss that will be directly optimized
            self.loss_total = self.loss_attack + self.loss_l2 + self.loss_ae + tf.multiply(self.beta, self.loss_l1)

        with tf.name_scope('training') as scope:
            self.learning_rate = tf.train.polynomial_decay(learning_rate_init, self.global_step,
                                                           self.max_iterations, 0, power=0.5)
            optimizer = tf.train.GradientDescentOptimizer(self.learning_rate)
            start_vars = set(x.name for x in tf.global_variables())

            # first compute, then apply grads
            self.compute_grads = optimizer.compute_gradients(self.loss_opt, var_list=[self.adv_s])
            self.grad_ph = tf.placeholder(tf.float32, name='grad_adv_s')
            # get the last one in case the explainer is re-initialized and a new graph is created
            var = [tvar for tvar in tf.trainable_variables() if tvar.name.startswith('adv_s')][-1]
            grad_and_var = [(self.grad_ph, var)]
            self.apply_grads = optimizer.apply_gradients(grad_and_var, global_step=self.global_step)
            end_vars = tf.global_variables()
            new_vars = [x for x in end_vars if x.name not in start_vars]

        # variables to initialize
        self.setup = []  # type: list
        self.setup.append(self.orig.assign(self.assign_orig))
        self.setup.append(self.target.assign(self.assign_target))
        self.setup.append(self.const.assign(self.assign_const))
        self.setup.append(self.adv.assign(self.assign_adv))
        self.setup.append(self.adv_s.assign(self.assign_adv_s))
        self.setup.append(self.no_info.assign(self.assign_no_info))

        self.init = tf.variables_initializer(var_list=[self.global_step] + [self.adv_s] + [self.adv] + new_vars)

        if self.write_dir is not None:
            writer = tf.summary.FileWriter(write_dir, tf.get_default_graph())
            writer.add_graph(tf.get_default_graph())
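
One reusable pattern from the training scope above: snapshot `tf.global_variables()` before building the optimizer, diff afterwards, and initialize only the newly created variables. A standalone sketch of that pattern (the variable names and toy loss are illustrative):

# Illustrative: initialize only the variables an optimizer creates internally.
import numpy as np
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()  # graph mode, matching the snippets in this collection

x = tf.Variable(np.zeros((2, 2)), dtype=tf.float32, name='x')
loss = tf.reduce_sum(tf.square(x))

start_vars = set(v.name for v in tf.global_variables())
optimizer = tf.train.AdamOptimizer(1e-2)   # creates slot variables (m, v, beta powers)
train_op = optimizer.minimize(loss)
new_vars = [v for v in tf.global_variables() if v.name not in start_vars]

init = tf.variables_initializer(var_list=[x] + new_vars)
with tf.Session() as sess:
    sess.run(init)
    sess.run(train_op)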
Example #5
  def collect_variables(self):
    """Collect model variables; the model's `call` needs to be run at least once."""
    self.var_list = [
        v for v in tf.global_variables() if "emission_network" in v.name
    ]
    self.init_op = tf.variables_initializer(var_list=self.var_list)
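
A minimal usage sketch for the method above, assuming a `model` object of the surrounding (not shown) class whose graph has already been built:

# Hypothetical usage; `model` is an instance of the surrounding class.
model.collect_variables()
with tf.Session() as sess:
    sess.run(model.init_op)  # initializes only the emission_network variables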
Example #6
    def __init__(self,
                 config,
                 batch_size,
                 checkpoint_dir_or_path=None,
                 var_name_substitutions=None,
                 session_target='',
                 **sample_kwargs):
        if tf.gfile.IsDirectory(checkpoint_dir_or_path):
            checkpoint_path = tf.train.latest_checkpoint(
                checkpoint_dir_or_path)
        else:
            checkpoint_path = checkpoint_dir_or_path
        self._config = copy.deepcopy(config)
        self._config.data_converter.set_mode('infer')
        self._config.hparams.batch_size = batch_size
        with tf.Graph().as_default():
            model = self._config.model
            model.build(self._config.hparams,
                        self._config.data_converter.output_depth,
                        is_training=False)
            # Input placeholders
            self._temperature = tf.placeholder(tf.float32, shape=())

            if self._config.hparams.z_size:
                self._z_input = tf.placeholder(
                    tf.float32,
                    shape=[batch_size, self._config.hparams.z_size])
            else:
                self._z_input = None

            if self._config.data_converter.control_depth > 0:
                self._c_input = tf.placeholder(
                    tf.float32,
                    shape=[None, self._config.data_converter.control_depth])
            else:
                self._c_input = None

            self._inputs = tf.placeholder(
                tf.float32,
                shape=[
                    batch_size, None, self._config.data_converter.input_depth
                ])
            self._controls = tf.placeholder(
                tf.float32,
                shape=[
                    batch_size, None, self._config.data_converter.control_depth
                ])
            self._inputs_length = tf.placeholder(
                tf.int32,
                shape=[batch_size] +
                list(self._config.data_converter.length_shape))
            self._max_length = tf.placeholder(tf.int32, shape=())
            # Outputs
            self._outputs, self._decoder_results = model.sample(
                batch_size,
                max_length=self._max_length,
                z=self._z_input,
                c_input=self._c_input,
                temperature=self._temperature,
                **sample_kwargs)
            if self._config.hparams.z_size:
                q_z = model.encode(self._inputs, self._inputs_length,
                                   self._controls)
                self._mu = q_z.loc
                self._sigma = q_z.scale.diag
                self._z = q_z.sample()

            var_map = None
            if var_name_substitutions is not None:
                var_map = {}
                for v in tf.global_variables():
                    var_name = v.name[:-2]  # Strip ':0' suffix.
                    for pattern, substitution in var_name_substitutions:
                        var_name = re.sub(pattern, substitution, var_name)
                    if var_name != v.name[:-2]:
                        tf.logging.info('Renaming `%s` to `%s`.', v.name[:-2],
                                        var_name)
                    var_map[var_name] = v

            # Restore graph
            self._sess = tf.Session(target=session_target)
            saver = tf.train.Saver(var_map)
            if (os.path.exists(checkpoint_path)
                    and tarfile.is_tarfile(checkpoint_path)):
                tf.logging.info('Unbundling checkpoint.')
                with tempfile.TemporaryDirectory() as temp_dir:
                    tar = tarfile.open(checkpoint_path)
                    tar.extractall(temp_dir)
                    # Assume only a single checkpoint is in the directory.
                    for name in tar.getnames():
                        if name.endswith('.index'):
                            checkpoint_path = os.path.join(
                                temp_dir, name[0:-6])
                            break
                    saver.restore(self._sess, checkpoint_path)
            else:
                saver.restore(self._sess, checkpoint_path)
Example #7
    def __init__(self,
                 source_vocab_size,
                 target_vocab_size,
                 buckets,
                 size,
                 num_layers,
                 max_gradient_norm,
                 batch_size,
                 learning_rate,
                 learning_rate_decay_factor,
                 use_lstm=False,
                 num_samples=512,
                 forward_only=False):
        """Create the model.

    Args:
      source_vocab_size: size of the source vocabulary.
      target_vocab_size: size of the target vocabulary.
      buckets: a list of pairs (I, O), where I specifies maximum input length
        that will be processed in that bucket, and O specifies maximum output
        length. Training instances that have inputs longer than I or outputs
        longer than O will be pushed to the next bucket and padded accordingly.
        We assume that the list is sorted, e.g., [(2, 4), (8, 16)].
      size: number of units in each layer of the model.
      num_layers: number of layers in the model.
      max_gradient_norm: gradients will be clipped to maximally this norm.
      batch_size: the size of the batches used during training;
        the model construction is independent of batch_size, so it can be
        changed after initialization if this is convenient, e.g., for decoding.
      learning_rate: learning rate to start with.
      learning_rate_decay_factor: decay learning rate by this much when needed.
      use_lstm: if true, we use LSTM cells instead of GRU cells.
      num_samples: number of samples for sampled softmax.
      forward_only: if set, we do not construct the backward pass in the model.
    """
        self.source_vocab_size = source_vocab_size
        self.target_vocab_size = target_vocab_size
        self.buckets = buckets
        self.batch_size = batch_size
        import tensorflow.compat.v1 as tf
        tf.disable_v2_behavior()  # graph mode: the placeholders and contrib seq2seq below require it
        self.learning_rate = tf.Variable(float(learning_rate), trainable=False)
        self.learning_rate_decay_op = self.learning_rate.assign(
            self.learning_rate * learning_rate_decay_factor)
        self.global_step = tf.Variable(0, trainable=False)

        # If we use sampled softmax, we need an output projection.
        output_projection = None
        softmax_loss_function = None
        # Sampled softmax only makes sense if we sample less than vocabulary size.
        if num_samples > 0 and num_samples < self.target_vocab_size:
            w = tf.get_variable("proj_w", [size, self.target_vocab_size])
            w_t = tf.transpose(w)
            b = tf.get_variable("proj_b", [self.target_vocab_size])
            output_projection = (w, b)
        '''
    def sampled_loss(inputs, labels):
        labels = tf.reshape(labels, [-1, 1])
        return tf.nn.sampled_softmax_loss(w_t, b, inputs, labels, num_samples,
                self.target_vocab_size)
    '''
        def sampled_loss(labels, logits):
            labels = tf.reshape(labels, [-1, 1])
            return tf.nn.sampled_softmax_loss(tf.transpose(w), b, labels,
                                              logits, num_samples,
                                              self.target_vocab_size)

        softmax_loss_function = sampled_loss

        # Create the internal multi-layer cell for our RNN.
        single_cell = tf.nn.rnn_cell.GRUCell(size)
        if use_lstm:
            single_cell = tf.nn.rnn_cell.BasicLSTMCell(size)
        cell = single_cell
        cell = tf.nn.rnn_cell.DropoutWrapper(cell, output_keep_prob=0.5)
        if num_layers > 1:
            cell = tf.nn.rnn_cell.MultiRNNCell([single_cell] * num_layers)

        # The seq2seq function: we use embedding for the input and attention.
        def seq2seq_f(encoder_inputs, decoder_inputs, do_decode):
            #return tf.nn.seq2seq.embedding_attention_seq2seq(
            return tf.contrib.legacy_seq2seq.embedding_attention_seq2seq(
                encoder_inputs,
                decoder_inputs,
                cell,
                num_encoder_symbols=source_vocab_size,
                num_decoder_symbols=target_vocab_size,
                embedding_size=size,
                output_projection=output_projection,
                feed_previous=do_decode)

        # Feeds for inputs.
        self.encoder_inputs = []
        self.decoder_inputs = []
        self.target_weights = []
        for i in xrange(buckets[-1][0]):  # Last bucket is the biggest one.
            self.encoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="encoder{0}".format(i)))
        for i in xrange(buckets[-1][1] + 1):
            self.decoder_inputs.append(
                tf.placeholder(tf.int32,
                               shape=[None],
                               name="decoder{0}".format(i)))
            self.target_weights.append(
                tf.placeholder(tf.float32,
                               shape=[None],
                               name="weight{0}".format(i)))

        # Our targets are decoder inputs shifted by one.
        targets = [
            self.decoder_inputs[i + 1]
            for i in xrange(len(self.decoder_inputs) - 1)
        ]

        # Training outputs and losses.
        if forward_only:
            self.outputs, self.losses = tf.contrib.legacy_seq2seq.model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                targets,
                self.target_weights,
                buckets,
                lambda x, y: seq2seq_f(x, y, True),
                softmax_loss_function=softmax_loss_function)
            # If we use output projection, we need to project outputs for decoding.
            if output_projection is not None:
                for b in xrange(len(buckets)):
                    self.outputs[b] = [
                        tf.matmul(output, output_projection[0]) +
                        output_projection[1] for output in self.outputs[b]
                    ]
        else:
            self.outputs, self.losses = tf.contrib.legacy_seq2seq.model_with_buckets(
                self.encoder_inputs,
                self.decoder_inputs,
                targets,
                self.target_weights,
                buckets,
                lambda x, y: seq2seq_f(x, y, False),
                softmax_loss_function=softmax_loss_function)

        # Gradients and SGD update operation for training the model.
        params = tf.trainable_variables()
        if not forward_only:
            self.gradient_norms = []
            self.updates = []
            opt = tf.train.AdamOptimizer()
            for b in xrange(len(buckets)):
                gradients = tf.gradients(self.losses[b], params)
                clipped_gradients, norm = tf.clip_by_global_norm(
                    gradients, max_gradient_norm)
                self.gradient_norms.append(norm)
                self.updates.append(
                    opt.apply_gradients(zip(clipped_gradients, params),
                                        global_step=self.global_step))

        self.saver = tf.train.Saver(tf.global_variables())
Example #8
            var_weights = np.transpose(var_weights, (2, 3, 1, 0))
            ptr += num_params
            assign_ops.append(
                tf.assign(var1, var_weights, validate_shape=True))
            i += 1

    # print('ptr:', ptr)
    return assign_ops


if __name__ == '__main__':
    args = parser.parse_args()

    img_size = 512

    tf.disable_eager_execution() # for placeholders

    with tf.name_scope('input'):
        input_data = tf.placeholder(dtype=tf.float32, shape=(None, img_size, img_size, 3),
                                    name='input_data')

    model = YOLOv3(input_data, num_classes=args.num_classes)
    load_ops = load_weights(tf.global_variables(), args.weight_file)

    saver = tf.train.Saver(tf.global_variables())

    with tf.Session() as sess:
        sess.run(load_ops)
        ckpt_path = os.path.join(args.save_path, 'yolov3_pretrained.ckpt')
        save_path = saver.save(sess, save_path=ckpt_path)
        print('Model saved in path: {}'.format(save_path))
Example #9
def model_fn(features, labels, mode, params):
  """The model_fn argument for creating an Estimator."""
  tf.logging.info("features = %s labels = %s mode = %s params=%s" %
                  (features, labels, mode, params))
  global_step = tf.train.get_global_step()
  graph = mtf.Graph()
  mesh = mtf.Mesh(graph, "my_mesh")
  logits, loss = mnist_model(features, labels, mesh)
  mesh_shape = mtf.convert_to_shape(FLAGS.mesh_shape)
  layout_rules = mtf.convert_to_layout_rules(FLAGS.layout)
  mesh_size = mesh_shape.size
  mesh_devices = [""] * mesh_size
  mesh_impl = mtf.placement_mesh_impl.PlacementMeshImpl(
      mesh_shape, layout_rules, mesh_devices)

  if mode == tf.estimator.ModeKeys.TRAIN:
    var_grads = mtf.gradients(
        [loss], [v.outputs[0] for v in graph.trainable_variables])
    optimizer = mtf.optimize.AdafactorOptimizer()
    update_ops = optimizer.apply_grads(var_grads, graph.trainable_variables)

  lowering = mtf.Lowering(graph, {mesh: mesh_impl})
  restore_hook = mtf.MtfRestoreHook(lowering)

  tf_logits = lowering.export_to_tf_tensor(logits)
  if mode != tf.estimator.ModeKeys.PREDICT:
    tf_loss = lowering.export_to_tf_tensor(loss)
    tf.summary.scalar("loss", tf_loss)

  if mode == tf.estimator.ModeKeys.TRAIN:
    tf_update_ops = [lowering.lowered_operation(op) for op in update_ops]
    tf_update_ops.append(tf.assign_add(global_step, 1))
    train_op = tf.group(tf_update_ops)
    saver = tf.train.Saver(
        tf.global_variables(),
        sharded=True,
        max_to_keep=10,
        keep_checkpoint_every_n_hours=2,
        defer_build=False, save_relative_paths=True)
    tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
    saver_listener = mtf.MtfCheckpointSaverListener(lowering)
    saver_hook = tf.train.CheckpointSaverHook(
        FLAGS.model_dir,
        save_steps=1000,
        saver=saver,
        listeners=[saver_listener])

    accuracy = tf.metrics.accuracy(
        labels=labels, predictions=tf.argmax(tf_logits, axis=1))

    # Name tensors to be logged with LoggingTensorHook.
    tf.identity(tf_loss, "cross_entropy")
    tf.identity(accuracy[1], name="train_accuracy")

    # Save accuracy scalar to Tensorboard output.
    tf.summary.scalar("train_accuracy", accuracy[1])

    # restore_hook must come before saver_hook
    return tf.estimator.EstimatorSpec(
        tf.estimator.ModeKeys.TRAIN, loss=tf_loss, train_op=train_op,
        training_chief_hooks=[restore_hook, saver_hook])

  if mode == tf.estimator.ModeKeys.PREDICT:
    predictions = {
        "classes": tf.argmax(tf_logits, axis=1),
        "probabilities": tf.nn.softmax(tf_logits),
    }
    return tf.estimator.EstimatorSpec(
        mode=tf.estimator.ModeKeys.PREDICT,
        predictions=predictions,
        prediction_hooks=[restore_hook],
        export_outputs={
            "classify": tf.estimator.export.PredictOutput(predictions)
        })
  if mode == tf.estimator.ModeKeys.EVAL:
    return tf.estimator.EstimatorSpec(
        mode=tf.estimator.ModeKeys.EVAL,
        loss=tf_loss,
        evaluation_hooks=[restore_hook],
        eval_metric_ops={
            "accuracy":
            tf.metrics.accuracy(
                labels=labels, predictions=tf.argmax(tf_logits, axis=1)),
        })
Example #10
    def _init_graph(self) -> None:
        # Collect inputs.
        self.input_names = []

        for param in inspect.signature(self._build_func).parameters.values():
            if param.kind == param.POSITIONAL_OR_KEYWORD and param.default is param.empty:
                self.input_names.append(param.name)

        self.num_inputs = len(self.input_names)
        assert self.num_inputs >= 1

        # Choose name and scope.
        if self.name is None:
            self.name = self._build_func_name
        assert re.match("^[A-Za-z0-9_.\\-]*$", self.name)
        with tf.name_scope(None):
            self.scope = tf.get_default_graph().unique_name(self.name, mark_as_used=True)

        # Finalize build func kwargs.
        build_kwargs = dict(self.static_kwargs)
        build_kwargs["is_template_graph"] = True
        build_kwargs["components"] = self.components

        # Build template graph.
        with tfutil.absolute_variable_scope(self.scope, reuse=tf.AUTO_REUSE), tfutil.absolute_name_scope(self.scope):  # ignore surrounding scopes
            assert tf.get_variable_scope().name == self.scope
            assert tf.get_default_graph().get_name_scope() == self.scope
            with tf.control_dependencies(None):  # ignore surrounding control dependencies
                self.input_templates = [tf.placeholder(tf.float32, name=name) for name in self.input_names]
                out_expr = self._build_func(*self.input_templates, **build_kwargs)

        # Collect outputs.
        assert tfutil.is_tf_expression(out_expr) or isinstance(out_expr, tuple)
        self.output_templates = [out_expr] if tfutil.is_tf_expression(out_expr) else list(out_expr)
        self.num_outputs = len(self.output_templates)
        assert self.num_outputs >= 1
        assert all(tfutil.is_tf_expression(t) for t in self.output_templates)

        # Perform sanity checks.
        if any(t.shape.ndims is None for t in self.input_templates):
            raise ValueError("Network input shapes not defined. Please call x.set_shape() for each input.")
        if any(t.shape.ndims is None for t in self.output_templates):
            raise ValueError("Network output shapes not defined. Please call x.set_shape() where applicable.")
        if any(not isinstance(comp, Network) for comp in self.components.values()):
            raise ValueError("Components of a Network must be Networks themselves.")
        if len(self.components) != len(set(comp.name for comp in self.components.values())):
            raise ValueError("Components of a Network must have unique names.")

        # List inputs and outputs.
        self.input_shapes = [tfutil.shape_to_list(t.shape) for t in self.input_templates]
        self.output_shapes = [tfutil.shape_to_list(t.shape) for t in self.output_templates]
        self.input_shape = self.input_shapes[0]
        self.output_shape = self.output_shapes[0]
        self.output_names = [t.name.split("/")[-1].split(":")[0] for t in self.output_templates]

        # List variables.
        self.own_vars = OrderedDict((var.name[len(self.scope) + 1:].split(":")[0], var) for var in tf.global_variables(self.scope + "/"))
        self.vars = OrderedDict(self.own_vars)
        self.vars.update((comp.name + "/" + name, var) for comp in self.components.values() for name, var in comp.vars.items())
        self.trainables = OrderedDict((name, var) for name, var in self.vars.items() if var.trainable)
        self.var_global_to_local = OrderedDict((var.name.split(":")[0], name) for name, var in self.vars.items())
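
The variable listing above depends on passing a scope prefix to `tf.global_variables`; the same filtering pattern in isolation (the scope and variable names are illustrative):

# Illustrative: collect only the variables created under a given scope.
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

with tf.variable_scope("G_synthesis"):
    w = tf.get_variable("w", shape=[4, 4], dtype=tf.float32)

scoped_vars = tf.global_variables("G_synthesis/")  # filtered by scope prefix
relative_names = {v.name[len("G_synthesis/"):].split(":")[0]: v for v in scoped_vars}
# relative_names == {'w': <tf.Variable 'G_synthesis/w:0' shape=(4, 4) dtype=float32>}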
Example #11
def infer(output_path):
    """Run inference."""
    tf.logging.info("Run inference")
    checkpoint = FLAGS.checkpoint_dir

    with tf.gfile.GFile(os.path.join(checkpoint, "config.json")) as f:
        config = json.load(f)
    if FLAGS.num_gpus > 1:
        config["num_gpus"] = FLAGS.num_gpus
    tf.logging.info("# infer config %s", config)

    # Load the data
    tf.logging.info("Loading data")
    features = _get_data(split=FLAGS.split, batch_size=FLAGS.batch_size)
    tf.logging.info("Loading model")
    model_class = configurable.Configurable.load(config["model"])
    model = model_class("eval", config=config["model"])
    outputs = model(features)
    tf.logging.info(outputs)

    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                            log_device_placement=False))
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())
    sess.run(tf.tables_initializer())
    tf.logging.info(config["model"]["optimizer"])

    # Get trainable / frozen vars
    trainable_vars, frozen_vars, _ = misc_util.get_trainable_vars(
        all_vars=tf.global_variables(),
        exclude_pattern=config["model"]["optimizer"]["nograd_var"])

    # Make sure to load in the exponential moving average vars
    ema = tf.train.ExponentialMovingAverage(decay=0.9999)
    ema_vars = {}
    for var in trainable_vars:
        ema_vars[ema.average_name(var)] = var

    # Restoring EMA trainable vars
    tf.logging.info("Restoring ema.")
    saver = tf.train.Saver(ema_vars)
    ckpt_path = tf.train.latest_checkpoint(checkpoint)
    saver.restore(sess, ckpt_path)

    # Restoring frozen vars
    tf.logging.info("Restoring frozen.")
    saver_frozen = tf.train.Saver(frozen_vars)
    saver_frozen.restore(sess, ckpt_path)

    # Setup scaffolding and load in the saved model
    coord = tf.train.Coordinator()
    _ = tf.train.start_queue_runners(coord=coord, sess=sess)

    is_tgq = FLAGS.data_format == "tgq"
    result = {}
    try:
        i = 0
        while True:
            predictions = sess.run(outputs)
            for qid, answer, start, end in zip(predictions["id"],
                                               predictions["a"],
                                               predictions["p1"],
                                               predictions["p2"]):
                if FLAGS.include_probabilities or is_tgq:
                    output = {"answer": answer}
                    output["start_prob"] = list([float(x) for x in start])
                    output["end_prob"] = list([float(x) for x in end])
                    if is_tgq:
                        start, end, _ = evaluator_util.get_start_end(
                            output["start_prob"], output["end_prob"])
                        output["start"] = start
                        output["end"] = end
                else:
                    output = answer

                result[qid] = output

            if i % 100 == 0:
                tf.logging.info(i)
            i += 1
    except errors.OutOfRangeError:
        pass

    # Dump results to a file
    with tf.gfile.GFile(output_path, "w") as f:
        if is_tgq:
            for qid in result:
                # NOTE(thangluong): from chrisalberti@'s observation,
                #   we need to subtract 1 to get good results.
                #   To investigate; we could have added bos (found no evidence yet).
                start = result[qid]["start"] - 1
                end = result[qid]["end"] - 1
                f.write("%s\t-1\t%d\t%d\n" % (qid, start, end))
        else:
            json.dump(result, f)
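
Example #11 restores the trainable variables from their exponential-moving-average shadows; the core name-mapping pattern in isolation (a sketch using the same `tf` import; the checkpoint path is illustrative):

# Illustrative: restore EMA shadow values into the live trainable variables.
ema = tf.train.ExponentialMovingAverage(decay=0.9999)
ema_map = {ema.average_name(v): v for v in tf.trainable_variables()}
saver = tf.train.Saver(ema_map)
# saver.restore(sess, "/path/to/ckpt")  # loads the shadow values into the variables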
Example #12
def model_fn(features, labels, mode, params):
    """A model is called by TpuEstimator."""
    del labels
    global_step = tf.train.get_global_step()
    graph = mtf.Graph()
    mesh_shape = mtf.convert_to_shape(FLAGS.mesh_shape)
    layout_rules = mtf.convert_to_layout_rules(FLAGS.layout)
    if FLAGS.use_tpu:
        ctx = params['context']
        num_hosts = ctx.num_hosts
        host_placement_fn = ctx.tpu_host_placement_function
        device_list = [host_placement_fn(host_id=t) for t in range(num_hosts)]
        tf.logging.info('device_list = %s', device_list)
        # TODO(ylc): Better estimation of replica cache size?
        replica_cache_size = 300 * 1000000  # 300M per replica
        # Worker 0 caches all the TPU binaries.
        worker0_mem = replica_cache_size * ctx.num_replicas
        devices_memory_usage = [worker0_mem] + [0] * (num_hosts - 1)
        var_placer = mtf.utils.BalancedVariablePlacer(device_list,
                                                      devices_memory_usage)
        mesh_devices = [''] * mesh_shape.size
        mesh_impl = mtf.simd_mesh_impl.SimdMeshImpl(mesh_shape, layout_rules,
                                                    mesh_devices,
                                                    ctx.device_assignment)
    else:
        var_placer = None
        mesh_devices = [''] * mesh_shape.size
        mesh_impl = mtf.placement_mesh_impl.PlacementMeshImpl(
            mesh_shape, layout_rules, mesh_devices)
    mesh = mtf.Mesh(graph, 'my_mesh', var_placer)

    with mtf.utils.outside_all_rewrites():
        logits, loss = toy_model(features, mesh)

    # TRAIN mode
    if mode == tf.estimator.ModeKeys.TRAIN:
        var_grads = mtf.gradients(
            [loss], [v.outputs[0] for v in graph.trainable_variables])
        if FLAGS.optimizer == 'Adafactor':
            optimizer = mtf.optimize.AdafactorOptimizer()
        else:
            assert FLAGS.optimizer == 'SGD'
            optimizer = mtf.optimize.SgdOptimizer(learning_rate=FLAGS.lr)
        update_ops = optimizer.apply_grads(var_grads,
                                           graph.trainable_variables)
    else:
        # for now, we can only export fully-replicated tensors.
        fully_replicated_logits = mtf.anonymize(logits)

    lowering = mtf.Lowering(graph, {mesh: mesh_impl})

    tf_loss = tf.to_float(lowering.export_to_tf_tensor(loss))

    if mode == tf.estimator.ModeKeys.TRAIN:
        tf_update_ops = [lowering.lowered_operation(op) for op in update_ops]
        tf_update_ops.append(tf.assign_add(global_step, 1))
        tf.logging.info('tf_update_ops: {}'.format(tf_update_ops))
        train_op = tf.group(tf_update_ops)
    else:
        tf_logits = lowering.export_to_tf_tensor(fully_replicated_logits)

    with mtf.utils.outside_all_rewrites():
        # Copy master variables to slices. Must be called first.
        restore_hook = mtf.MtfRestoreHook(lowering)
        if mode == tf.estimator.ModeKeys.TRAIN:
            saver = tf.train.Saver(tf.global_variables(),
                                   sharded=True,
                                   max_to_keep=10,
                                   keep_checkpoint_every_n_hours=2,
                                   defer_build=False,
                                   save_relative_paths=True)
            tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
            saver_listener = mtf.MtfCheckpointSaverListener(lowering)
            saver_hook = tf.train.CheckpointSaverHook(
                FLAGS.model_dir,
                save_steps=1000,
                saver=saver,
                listeners=[saver_listener])

            return tpu_estimator.TPUEstimatorSpec(
                tf.estimator.ModeKeys.TRAIN,
                loss=tf_loss,
                train_op=train_op,
                training_hooks=[restore_hook, saver_hook])
        elif mode == tf.estimator.ModeKeys.EVAL:

            def metric_fn(tf_logits):
                mean_logits = tf.metrics.mean(tf_logits)
                return {'mean_logits': mean_logits}

            eval_metrics = (metric_fn, [tf_logits])

            return tpu_estimator.TPUEstimatorSpec(
                tf.estimator.ModeKeys.EVAL,
                evaluation_hooks=[restore_hook],
                loss=tf_loss,
                eval_metrics=eval_metrics)
Example #13
    def __init__(
            self,
            predict_fn: Union[Callable, tf.keras.Model, 'keras.Model'],
            shape: Tuple[int, ...],
            distance_fn: str = 'l1',
            target_proba: float = 1.0,
            target_class: Union[str, int] = 'other',
            max_iter: int = 1000,
            early_stop: int = 50,
            lam_init: float = 1e-1,
            max_lam_steps: int = 10,
            tol: float = 0.05,
            learning_rate_init=0.1,
            feature_range: Union[Tuple, str] = (-1e10, 1e10),
            eps: Union[float, np.ndarray] = 0.01,  # feature-wise epsilons
            init: str = 'identity',
            decay: bool = True,
            write_dir: str = None,
            debug: bool = False,
            sess: tf.Session = None) -> None:
        """
        Initialize counterfactual explanation method based on Wachter et al. (2017)

        Parameters
        ----------
        predict_fn
            Keras or TensorFlow model or any other model's prediction function returning class probabilities
        shape
            Shape of input data starting with batch size
        distance_fn
            Distance function to use in the loss term
        target_proba
            Target probability for the counterfactual to reach
        target_class
            Target class for the counterfactual to reach, one of 'other', 'same' or an integer denoting
            desired class membership for the counterfactual instance
        max_iter
            Maximum number of iterations to run the gradient descent for (inner loop)
        early_stop
            Number of steps after which to terminate gradient descent if all or none of the found instances are solutions
        lam_init
            Initial regularization constant for the prediction part of the Wachter loss
        max_lam_steps
            Maximum number of times to adjust the regularization constant (outer loop) before terminating the search
        tol
            Tolerance for the counterfactual target probability
        learning_rate_init
            Initial learning rate for each outer loop of lambda
        feature_range
            Tuple with min and max ranges to allow for perturbed instances. Min and max ranges can be floats or
            numpy arrays with dimension (1 x nb of features) for feature-wise ranges
        eps
            Gradient step sizes used in calculating numerical gradients, defaults to a single value for all
            features, but can be passed an array for feature-wise step sizes
        init
            Initialization method for the search of counterfactuals, currently must be 'identity'
        decay
            Flag to decay learning rate to zero for each outer loop over lambda
        write_dir
            Directory to write TensorBoard files to
        debug
            Flag to write TensorBoard summaries for debugging
        sess
            Optional TensorFlow session that will be used if passed instead of creating or inferring one internally
        """
        super().__init__(meta=copy.deepcopy(DEFAULT_META_CF))
        # get params for storage in meta
        params = locals()
        remove = ['self', 'predict_fn', 'sess', '__class__']
        for key in remove:
            params.pop(key)
        self.meta['params'].update(params)

        self.data_shape = shape
        self.batch_size = shape[0]
        self.target_class = target_class

        # options for the optimizer
        self.max_iter = max_iter
        self.lam_init = lam_init
        self.tol = tol
        self.max_lam_steps = max_lam_steps
        self.early_stop = early_stop

        self.eps = eps
        self.init = init
        self.feature_range = feature_range
        self.target_proba_arr = target_proba * np.ones(self.batch_size)

        self.debug = debug

        # check if the passed object is a model and get session
        is_model, is_keras, model_sess = _check_keras_or_tf(predict_fn)
        self.meta['params'].update(is_model=is_model, is_keras=is_keras)

        # if session provided, use it
        if isinstance(sess, tf.Session):
            self.sess = sess
        else:
            self.sess = model_sess

        if is_model:  # Keras or TF model
            self.model = True
            self.predict_fn = predict_fn.predict  # type: ignore # array function
            self.predict_tn = predict_fn  # tensor function

        else:  # black-box model
            self.predict_fn = predict_fn
            self.predict_tn = None
            self.model = False

        self.n_classes = self.predict_fn(np.zeros(shape)).shape[1]

        # flag to keep track if explainer is fit or not
        self.fitted = False

        # set up graph session for optimization (counterfactual search)
        with tf.variable_scope('cf_search', reuse=tf.AUTO_REUSE):

            # define variables for original and candidate counterfactual instances, target labels and lambda
            self.orig = tf.get_variable('original',
                                        shape=shape,
                                        dtype=tf.float32)
            self.cf = tf.get_variable(
                'counterfactual',
                shape=shape,
                dtype=tf.float32,
                constraint=lambda x: tf.clip_by_value(x, feature_range[0],
                                                      feature_range[1]))
            # the following will be a 1-hot encoding of the target class (as predicted by the model)
            self.target = tf.get_variable('target',
                                          shape=(self.batch_size,
                                                 self.n_classes),
                                          dtype=tf.float32)

            # constant target probability and global step variable
            self.target_proba = tf.constant(target_proba *
                                            np.ones(self.batch_size),
                                            dtype=tf.float32,
                                            name='target_proba')
            self.global_step = tf.Variable(0.0,
                                           trainable=False,
                                           name='global_step')

            # lambda hyperparameter - placeholder instead of variable as annealed in first epoch
            self.lam = tf.placeholder(tf.float32,
                                      shape=(self.batch_size),
                                      name='lam')

            # define placeholders that will be assigned to relevant variables
            self.assign_orig = tf.placeholder(tf.float32,
                                              shape,
                                              name='assign_orig')
            self.assign_cf = tf.placeholder(tf.float32,
                                            shape,
                                            name='assign_cf')
            self.assign_target = tf.placeholder(tf.float32,
                                                shape=(self.batch_size,
                                                       self.n_classes),
                                                name='assign_target')

            # L1 distance and MAD constants
            # TODO: MADs?
            ax_sum = list(np.arange(1, len(self.data_shape)))
            if distance_fn == 'l1':
                self.dist = tf.reduce_sum(tf.abs(self.cf - self.orig),
                                          axis=ax_sum,
                                          name='l1')
            else:
                logger.exception('Distance metric %s not supported',
                                 distance_fn)
                raise ValueError

            # distance loss
            self.loss_dist = self.lam * self.dist

            # prediction loss
            if not self.model:
                # will need to calculate gradients numerically
                self.loss_opt = self.loss_dist
            else:
                # autograd gradients throughout
                self.pred_proba = self.predict_tn(self.cf)

                # 3 cases for target_class
                if target_class == 'same':
                    self.pred_proba_class = tf.reduce_max(
                        self.target * self.pred_proba, 1)
                elif target_class == 'other':
                    self.pred_proba_class = tf.reduce_max(
                        (1 - self.target) * self.pred_proba, 1)
                elif target_class in range(self.n_classes):
                    # if class is specified, this is known in advance
                    self.pred_proba_class = tf.reduce_max(
                        tf.one_hot(
                            target_class, self.n_classes, dtype=tf.float32) *
                        self.pred_proba, 1)
                else:
                    logger.exception('Target class %s unknown', target_class)
                    raise ValueError

                self.loss_pred = tf.square(self.pred_proba_class -
                                           self.target_proba)

                self.loss_opt = self.loss_pred + self.loss_dist

            # optimizer
            if decay:
                self.learning_rate = tf.train.polynomial_decay(
                    learning_rate_init,
                    self.global_step,
                    self.max_iter,
                    0.0,
                    power=1.0)
            else:
                self.learning_rate = tf.convert_to_tensor(learning_rate_init)

            # TODO optional argument to change type, learning rate scheduler
            opt = tf.train.AdamOptimizer(self.learning_rate)

            # first compute gradients, then apply them
            self.compute_grads = opt.compute_gradients(self.loss_opt,
                                                       var_list=[self.cf])
            self.grad_ph = tf.placeholder(shape=shape,
                                          dtype=tf.float32,
                                          name='grad_cf')
            grad_and_var = [(self.grad_ph, self.cf)]
            self.apply_grads = opt.apply_gradients(
                grad_and_var, global_step=self.global_step)

        # variables to initialize
        self.setup = []  # type: list
        self.setup.append(self.orig.assign(self.assign_orig))
        self.setup.append(self.cf.assign(self.assign_cf))
        self.setup.append(self.target.assign(self.assign_target))

        self.tf_init = tf.variables_initializer(var_list=tf.global_variables(
            scope='cf_search'))

        # tensorboard
        if write_dir is not None:
            self.writer = tf.summary.FileWriter(write_dir,
                                                tf.get_default_graph())
            self.writer.add_graph(tf.get_default_graph())

        # return templates
        self.instance_dict = dict.fromkeys(
            ['X', 'distance', 'lambda', 'index', 'class', 'proba', 'loss'])
        self.return_dict = copy.deepcopy(DEFAULT_DATA_CF)
        self.return_dict['all'] = {i: [] for i in range(self.max_lam_steps)}
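
A usage sketch for the initializer above, assuming the class is exposed as CounterFactual and that a fitted Keras classifier clf returning class probabilities is available; the explain call and all argument values are illustrative, not taken from the original source:

import numpy as np

shape = (1, 784)                           # batch of one flattened MNIST-like instance (assumption)
cf = CounterFactual(clf,                   # Keras model returning class probabilities (assumption)
                    shape=shape,
                    target_proba=0.9,      # counterfactual should reach p >= 0.9 - tol
                    target_class='other',  # any class other than the original prediction
                    max_iter=500,
                    lam_init=1e-1,
                    max_lam_steps=10)

x = np.random.rand(*shape).astype(np.float32)
explanation = cf.explain(x)                # hypothetical method name; returns the populated return_dict
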
Beispiel #14
    def testFlopRegularizerWithContribFC(self):
        """Test MatMul Flop regularizer with tf.contrib.fully_connected layer.

        The structure of the fully connected network used in this test is the same
        as that used in testFlopRegularizerWithMatMul.
        """
        tf.reset_default_graph()
        tf.set_random_seed(1234)
        # Create test networks with tf.contrib.layers.fully_connected and initialize
        # the variables.
        with slim.arg_scope([contrib_layers.fully_connected],
                            weights_initializer=tf.random_normal_initializer,
                            biases_initializer=tf.random_normal_initializer):
            x = tf.constant(1.0, shape=[2, 6], name='x', dtype=tf.float32)
            net = contrib_layers.fully_connected(x, 4, scope='matmul1')
            net = contrib_layers.fully_connected(net, 1, scope='matmul2')
            name_to_variable = {v.op.name: v for v in tf.global_variables()}
        with self.cached_session():
            tf.global_variables_initializer().run()

        # Create FLOPs network regularizer.
        threshold = 0.9
        flop_reg = flop_regularizer.GroupLassoFlopsRegularizer([net.op],
                                                               threshold, 0)
        with self.cached_session() as session:
            evaluated_vars = session.run(name_to_variable)

        # Compute the regularizer vector for each layer.
        def group_norm(weights, axis=(0, 1, 2)):  # pylint: disable=invalid-name
            return np.sqrt(np.mean(weights**2, axis=axis))

        regularizer_vec = {
            'matmul1': group_norm(evaluated_vars['matmul1/weights'],
                                  axis=(0, )),
            'matmul2': group_norm(evaluated_vars['matmul2/weights'],
                                  axis=(0, ))
        }

        # Sanity check to make sure that not all outputs are alive or dead.
        total_outputs = (regularizer_vec['matmul1'].shape[0] +
                         regularizer_vec['matmul2'].shape[0])
        total_alive = sum(
            [np.sum(val > threshold) for val in regularizer_vec.values()])
        assert total_alive > 0, (
            'All outputs are dead. Decrease the threshold.')
        assert total_alive < total_outputs, (
            'All outputs are alive. Increase the threshold.')

        # Compute the expected flop cost and regularization term. The L2 norm of
        # columns in weight matrix of layer matmul1 is [2.15381098, 2.57671237,
        # 2.12560201, 2.2081387] and that of layer matmul2 is [1.72404861]. With
        # threshold = 2.2, two outputs in the matmul1 layer are alive.
        matmul1_live_input = 6
        matmul1_live_output = sum(regularizer_vec['matmul1'] > threshold)
        expected_flop_cost = (_coeff(_get_op('matmul1/MatMul')) *
                              matmul1_live_input * matmul1_live_output)
        regularizer1 = np.sum(regularizer_vec['matmul1'])
        regularizer2 = np.sum(regularizer_vec['matmul2'])
        expected_reg_term = (_coeff(_get_op('matmul1/MatMul')) *
                             matmul1_live_input * regularizer1 +
                             _coeff(_get_op('matmul2/MatMul')) *
                             matmul1_live_output * regularizer2)
        with self.cached_session() as session:
            self.assertEqual(round(flop_reg.get_cost().eval()),
                             round(expected_flop_cost))
            self.assertNearRelatively(
                flop_reg.get_regularization_term().eval(), expected_reg_term)
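
The core of the regularizer check above is a column-wise L2 (group) norm followed by a threshold; a NumPy-only sketch with made-up weights, just to show how "alive" outputs are counted:

import numpy as np

weights = np.random.randn(6, 4)                   # a matmul1-style weight matrix: 6 inputs, 4 outputs
reg_vec = np.sqrt(np.mean(weights ** 2, axis=0))  # one group norm per output column
threshold = 0.9
alive_outputs = int(np.sum(reg_vec > threshold))  # outputs the group lasso considers alive
print(reg_vec, alive_outputs)
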
Beispiel #15
    def forward(self,
                inputs,
                init_with_pretrain_vgg=False,
                pre_trained_model='./vgg16/vgg_16.ckpt'):
        # feature extraction part, which is also the shared network
        reuse_fnet = len(
            [v
             for v in tf.global_variables() if v.name.startswith('FNet')]) > 0
        with tf.variable_scope('FNet', reuse=reuse_fnet):
            # feature extraction
            self.conv1_1 = self._conv2d(inputs, dim=64, name='conv1_1')
            self.conv1_2 = self._conv2d(self.conv1_1, dim=64, name='conv1_2')
            self.pool1 = self._max_pool2d(self.conv1_2)  # 256 => /2

            self.conv2_1 = self._conv2d(self.pool1, dim=128, name='conv2_1')
            self.conv2_2 = self._conv2d(self.conv2_1, dim=128, name='conv2_2')
            self.pool2 = self._max_pool2d(self.conv2_2)  # 128 => /4

            self.conv3_1 = self._conv2d(self.pool2, dim=256, name='conv3_1')
            self.conv3_2 = self._conv2d(self.conv3_1, dim=256, name='conv3_2')
            self.conv3_3 = self._conv2d(self.conv3_2, dim=256, name='conv3_3')
            self.pool3 = self._max_pool2d(self.conv3_3)  # 64 => /8

            self.conv4_1 = self._conv2d(self.pool3, dim=512, name='conv4_1')
            self.conv4_2 = self._conv2d(self.conv4_1, dim=512, name='conv4_2')
            self.conv4_3 = self._conv2d(self.conv4_2, dim=512, name='conv4_3')
            self.pool4 = self._max_pool2d(self.conv4_3)  # 32 => /16

            self.conv5_1 = self._conv2d(self.pool4, dim=512, name='conv5_1')
            self.conv5_2 = self._conv2d(self.conv5_1, dim=512, name='conv5_2')
            self.conv5_3 = self._conv2d(self.conv5_2, dim=512, name='conv5_3')
            self.pool5 = self._max_pool2d(self.conv5_3)  # 16 => /32

            # init feature extraction part from pre-trained VGG16
            if init_with_pretrain_vgg:
                tf.train.init_from_checkpoint(pre_trained_model,
                                              self.pre_train_restore_map)

            # input size for logits prediction
            [n, h, w, c] = inputs.shape.as_list()

        reuse_cw_net = len(
            [v
             for v in tf.global_variables() if v.name.startswith('CWNet')]) > 0
        with tf.variable_scope('CWNet', reuse=reuse_cw_net):
            # upsample
            up2 = (
                self._upconv2d(self.pool5, dim=256, act='linear',
                               name='up2_1')  # 32 => /16
                + self._conv2d(
                    self.pool4, dim=256, act='linear', name='pool4_s'))
            self.up2_cw = self._conv2d(up2, dim=256, name='up2_3')

            up4 = (
                self._upconv2d(
                    self.up2_cw, dim=128, act='linear',
                    name='up4_1')  # 64 => /8
                + self._conv2d(
                    self.pool3, dim=128, act='linear', name='pool3_s'))
            self.up4_cw = self._conv2d(up4, dim=128, name='up4_3')

            up8 = (
                self._upconv2d(self.up4_cw, dim=64, act='linear',
                               name='up8_1')  # 128 => /4
                +
                self._conv2d(self.pool2, dim=64, act='linear', name='pool2_s'))
            self.up8_cw = self._conv2d(up8, dim=64, name='up8_2')

            up16 = (
                self._upconv2d(
                    self.up8_cw, dim=32, act='linear',
                    name='up16_1')  # 256 => /2
                +
                self._conv2d(self.pool1, dim=32, act='linear', name='pool1_s'))
            self.up16_cw = self._conv2d(up16, dim=32, name='up16_2')

            # predict logits
            logits_cw = self._up_bilinear(self.up16_cw,
                                          dim=3,
                                          shape=(h, w),
                                          name='logits')

        # decode network for room type detection
        reuse_rnet = len(
            [v
             for v in tf.global_variables() if v.name.startswith('RNet')]) > 0
        with tf.variable_scope('RNet', reuse=reuse_rnet):
            # upsample
            up2 = (
                self._upconv2d(self.pool5, dim=256, act='linear',
                               name='up2_1')  # 32 => /16
                + self._conv2d(
                    self.pool4, dim=256, act='linear', name='pool4_s'))
            up2 = self._conv2d(up2, dim=256, name='up2_2')
            up2, _ = self._non_local_context(self.up2_cw,
                                             up2,
                                             name='context_up2')

            up4 = (
                self._upconv2d(up2, dim=128, act='linear',
                               name='up4_1')  # 64 => /8
                + self._conv2d(
                    self.pool3, dim=128, act='linear', name='pool3_s'))
            up4 = self._conv2d(up4, dim=128, name='up4_2')
            up4, _ = self._non_local_context(self.up4_cw,
                                             up4,
                                             name='context_up4')

            up8 = (
                self._upconv2d(up4, dim=64, act='linear',
                               name='up8_1')  # 128 => /4
                +
                self._conv2d(self.pool2, dim=64, act='linear', name='pool2_s'))
            up8 = self._conv2d(up8, dim=64, name='up8_2')
            up8, _ = self._non_local_context(self.up8_cw,
                                             up8,
                                             name='context_up8')

            up16 = (
                self._upconv2d(up8, dim=32, act='linear',
                               name='up16_1')  # 256 => /2
                +
                self._conv2d(self.pool1, dim=32, act='linear', name='pool1_s'))
            up16 = self._conv2d(up16, dim=32, name='up16_2')
            self.up16_r, self.a = self._non_local_context(self.up16_cw,
                                                          up16,
                                                          name='context_up16')

            # predict logits
            logits_r = self._up_bilinear(self.up16_r,
                                         dim=9,
                                         shape=(h, w),
                                         name='logits')

            return logits_r, logits_cw
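
The reuse_fnet / reuse_cw_net / reuse_rnet checks above all use the same idiom: look for existing variables under a scope prefix in tf.global_variables() and set reuse accordingly. A stripped-down TF1-style sketch of that pattern (scope and variable names are placeholders):

import tensorflow as tf  # TF1.x-style API, matching the snippet above

def scope_has_variables(prefix):
    # True once any variable under `prefix` has already been created in the graph
    return len([v for v in tf.global_variables()
                if v.name.startswith(prefix)]) > 0

with tf.variable_scope('FNet', reuse=scope_has_variables('FNet')):
    w = tf.get_variable('w', shape=[3, 3, 3, 64])  # created on the first call, reused on later calls
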
Beispiel #16
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)

  model_dir = os.path.expanduser(FLAGS.model_dir)
  output_dir = os.path.expanduser(FLAGS.output_dir)
  out_base_file = os.path.join(output_dir, "model.ckpt")

  # Copy flags.txt with the original time, so t2t-bleu can report correct
  # relative time.
  tf.gfile.MakeDirs(FLAGS.output_dir)
  if (not os.path.exists(os.path.join(output_dir, "flags.txt")) and
      os.path.exists(os.path.join(model_dir, "flags.txt"))):
    shutil.copy2(os.path.join(model_dir, "flags.txt"),
                 os.path.join(output_dir, "flags.txt"))

  models_processed = 0
  queue = deque()
  for model in bleu_hook.stepfiles_iterator(model_dir, FLAGS.wait_minutes,
                                            FLAGS.min_steps):
    if models_processed == 0:
      var_list = tf.train.list_variables(model.filename)
      avg_values = {}
      for (name, shape) in var_list:
        if not (name.startswith("global_step") or
                name.startswith("train_stats/")):
          avg_values[name] = np.zeros(shape)
    models_processed += 1

    tf.logging.info("Loading [%d]: %s" % (models_processed, model.filename))
    reader = tf.train.load_checkpoint(model.filename)
    for name in avg_values:
      avg_values[name] += reader.get_tensor(name) / FLAGS.n
    queue.append(model)
    if len(queue) < FLAGS.n:
      continue

    out_file = "%s-%d" % (out_base_file, model.steps)
    tf_vars = []
    tf.logging.info("Averaging %s" % (out_file))
    for (name, value) in six.iteritems(avg_values):
      # TODO(martinpopel): dtype=var_dtypes[name]
      tf_vars.append(tf.get_variable(name, shape=value.shape))
    placeholders = [tf.placeholder(v.dtype, shape=v.shape) for v in tf_vars]
    assign_ops = [tf.assign(v, p) for (v, p) in zip(tf_vars, placeholders)]

    global_step = tf.get_variable(
        "global_step",
        initializer=tf.constant(model.steps, dtype=tf.int64),
        trainable=False)
    with tf.variable_scope("train_stats"):
      tf.get_variable("problem_0_steps", initializer=0, trainable=False)
    saver = tf.train.Saver(tf.global_variables())

    tf.logging.info("Running session for %s" % (out_file))
    with tf.Session() as sess:
      sess.run(tf.global_variables_initializer())
      for p, assign_op, (name, value) in zip(
          placeholders, assign_ops, six.iteritems(avg_values)):
        sess.run(assign_op, {p: value})
      tf.logging.info("Storing to %s" % out_file)
      saver.save(sess, out_base_file, global_step=global_step)
    os.utime(out_file + ".index", (model.mtime, model.mtime))

    tf.reset_default_graph()
    first_model = queue.popleft()

    reader = tf.train.load_checkpoint(first_model.filename)
    for name in avg_values:
      avg_values[name] -= reader.get_tensor(name) / FLAGS.n
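
The deque bookkeeping above maintains a sliding mean over the last FLAGS.n checkpoints: every new checkpoint adds value / n, and once the window is full the oldest checkpoint's contribution is subtracted back out. The same invariant in plain NumPy, with illustrative values:

import numpy as np
from collections import deque

n = 3
window, avg = deque(), np.zeros(2)
for value in [np.full(2, v, dtype=float) for v in (1, 2, 3, 4, 5)]:
    avg += value / n               # add the newest checkpoint's contribution
    window.append(value)
    if len(window) < n:
        continue
    print(avg)                     # mean of the last n values: [2. 2.], [3. 3.], [4. 4.]
    avg -= window.popleft() / n    # drop the oldest checkpoint's contribution
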
Beispiel #17
    def __init__(self,
                 policy,
                 ob_space,
                 ac_space,
                 nenvs,
                 nsteps,
                 ent_coef=0.01,
                 v_mix_coef=0.5,
                 max_grad_norm=0.5,
                 lr_alpha=7e-4,
                 lr_beta=7e-4,
                 alpha=0.99,
                 epsilon=1e-5,
                 total_timesteps=int(80e6),
                 lrschedule='linear',
                 r_ex_coef=1.0,
                 r_in_coef=0.0,
                 v_ex_coef=1.0):

        sess = tf_util.make_session()
        nact = ac_space.n
        nbatch = nenvs * nsteps

        A = tf.placeholder(tf.int32, [nbatch], 'A')
        R_EX = tf.placeholder(tf.float32, [nbatch], 'R_EX')
        ADV_EX = tf.placeholder(tf.float32, [nbatch], 'ADV_EX')
        RET_EX = tf.placeholder(tf.float32, [nbatch], 'RET_EX')
        V_MIX = tf.placeholder(tf.float32, [nbatch], 'V_MIX')
        DIS_V_MIX_LAST = tf.placeholder(tf.float32, [nbatch], 'DIS_V_MIX_LAST')
        COEF_MAT = tf.placeholder(tf.float32, [nbatch, nbatch], 'COEF_MAT')
        LR_ALPHA = tf.placeholder(tf.float32, [], 'LR_ALPHA')
        LR_BETA = tf.placeholder(tf.float32, [], 'LR_BETA')

        step_model = policy(sess, ob_space, ac_space, nenvs, 1, reuse=False)
        train_model = policy(sess,
                             ob_space,
                             ac_space,
                             nenvs * nsteps,
                             nsteps,
                             reuse=True)

        r_mix = r_ex_coef * R_EX + r_in_coef * tf.reduce_sum(
            train_model.r_in * tf.one_hot(A, nact), axis=1)
        ret_mix = tf.squeeze(
            tf.matmul(COEF_MAT, tf.reshape(r_mix, [nbatch, 1])),
            [1]) + DIS_V_MIX_LAST
        adv_mix = ret_mix - V_MIX

        neglogpac = train_model.pd.neglogp(A)
        pg_mix_loss = tf.reduce_mean(adv_mix * neglogpac)
        v_mix_loss = tf.reduce_mean(mse(tf.squeeze(train_model.v_mix),
                                        ret_mix))
        entropy = tf.reduce_mean(cat_entropy(train_model.pi))
        policy_loss = pg_mix_loss - ent_coef * entropy + v_mix_coef * v_mix_loss

        policy_params = train_model.policy_params
        policy_grads = tf.gradients(policy_loss, policy_params)
        if max_grad_norm is not None:
            policy_grads, policy_grad_norm = tf.clip_by_global_norm(
                policy_grads, max_grad_norm)
        policy_grads_and_vars = list(zip(policy_grads, policy_params))
        policy_trainer = tf.train.RMSPropOptimizer(learning_rate=LR_ALPHA,
                                                   decay=alpha,
                                                   epsilon=epsilon)
        policy_train = policy_trainer.apply_gradients(policy_grads_and_vars)

        rmss = [policy_trainer.get_slot(var, 'rms') for var in policy_params]
        policy_params_new = {}
        for grad, rms, var in zip(policy_grads, rmss, policy_params):
            ms = rms + (tf.square(grad) - rms) * (1 - alpha)
            policy_params_new[
                var.name] = var - LR_ALPHA * grad / tf.sqrt(ms + epsilon)
        policy_new = train_model.policy_new_fn(policy_params_new, ob_space,
                                               ac_space, nbatch, nsteps)

        neglogpac_new = policy_new.pd.neglogp(A)
        ratio_new = tf.exp(tf.stop_gradient(neglogpac) - neglogpac_new)
        pg_ex_loss = tf.reduce_mean(-ADV_EX * ratio_new)
        v_ex_loss = tf.reduce_mean(mse(tf.squeeze(train_model.v_ex), RET_EX))
        intrinsic_loss = pg_ex_loss + v_ex_coef * v_ex_loss

        intrinsic_params = train_model.intrinsic_params
        intrinsic_grads = tf.gradients(intrinsic_loss, intrinsic_params)
        if max_grad_norm is not None:
            intrinsic_grads, intrinsic_grad_norm = tf.clip_by_global_norm(
                intrinsic_grads, max_grad_norm)
        intrinsic_grads_and_vars = list(zip(intrinsic_grads, intrinsic_params))
        intrinsic_trainer = tf.train.RMSPropOptimizer(learning_rate=LR_BETA,
                                                      decay=alpha,
                                                      epsilon=epsilon)
        intrinsic_train = intrinsic_trainer.apply_gradients(
            intrinsic_grads_and_vars)

        lr_alpha = Scheduler(v=lr_alpha,
                             nvalues=total_timesteps,
                             schedule=lrschedule)
        lr_beta = Scheduler(v=lr_beta,
                            nvalues=total_timesteps,
                            schedule=lrschedule)

        all_params = tf.global_variables()

        def train(obs, policy_states, masks, actions, r_ex, ret_ex, v_ex,
                  v_mix, dis_v_mix_last, coef_mat):
            advs_ex = ret_ex - v_ex
            for step in range(len(obs)):
                cur_lr_alpha = lr_alpha.value()
                cur_lr_beta = lr_beta.value()
            td_map = {
                train_model.X: obs,
                policy_new.X: obs,
                A: actions,
                R_EX: r_ex,
                ADV_EX: advs_ex,
                RET_EX: ret_ex,
                V_MIX: v_mix,
                DIS_V_MIX_LAST: dis_v_mix_last,
                COEF_MAT: coef_mat,
                LR_ALPHA: cur_lr_alpha,
                LR_BETA: cur_lr_beta
            }
            if policy_states is not None:
                td_map[train_model.PS] = policy_states
                td_map[train_model.M] = masks
            return sess.run([entropy, policy_train, intrinsic_train],
                            td_map)[0]

        def save(save_path):
            ps = sess.run(all_params)
            make_path(osp.dirname(save_path))
            joblib.dump(ps, save_path)

        def load(load_path):
            loaded_params = joblib.load(load_path)
            restores = []
            for p, loaded_p in zip(all_params, loaded_params):
                restores.append(p.assign(loaded_p))
            ps = sess.run(restores)

        self.train = train
        self.train_model = train_model
        self.step_model = step_model
        self.step = step_model.step
        self.value = step_model.value
        self.intrinsic_reward = step_model.intrinsic_reward
        self.init_policy_state = step_model.init_policy_state
        self.save = save
        self.load = load
        tf.global_variables_initializer().run(session=sess)
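
The policy_params_new block re-derives a single RMSProp step symbolically so that a differentiable copy of the updated policy is available for the ratio term. The same update written out in NumPy (values are illustrative):

import numpy as np

alpha, lr, epsilon = 0.99, 7e-4, 1e-5
var = np.array([0.5, -0.2])
grad = np.array([0.1, 0.3])
rms = np.zeros_like(var)                     # the optimizer's 'rms' slot for this variable

ms = rms + (grad ** 2 - rms) * (1 - alpha)   # updated second-moment estimate
var_new = var - lr * grad / np.sqrt(ms + epsilon)
print(var_new)                               # parameters after one hand-rolled RMSProp step
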
Beispiel #18
    def _init_graph(self) -> None:
        assert self._var_inits is not None
        assert self._input_templates is None
        assert self._output_templates is None
        assert self._own_vars is None

        # Initialize components.
        if self._components is None:
            self._components = util.EasyDict()

        # Choose build func kwargs.
        build_kwargs = dict(self.static_kwargs)
        build_kwargs["is_template_graph"] = True
        build_kwargs["components"] = self._components

        # Override scope and device, and ignore surrounding control dependencies.
        with tfutil.absolute_variable_scope(self.scope, reuse=False), tfutil.absolute_name_scope(self.scope), tf.device(self.device), tf.control_dependencies(None):
            assert tf.get_variable_scope().name == self.scope
            assert tf.get_default_graph().get_name_scope() == self.scope

            # Create input templates.
            self._input_templates = []
            for param in inspect.signature(self._build_func).parameters.values():
                if param.kind == param.POSITIONAL_OR_KEYWORD and param.default is param.empty:
                    self._input_templates.append(tf.placeholder(tf.float32, name=param.name))

            # Call build func.
            out_expr = self._build_func(*self._input_templates, **build_kwargs)

        # Collect output templates and variables.
        assert tfutil.is_tf_expression(out_expr) or isinstance(out_expr, tuple)
        self._output_templates = [out_expr] if tfutil.is_tf_expression(out_expr) else list(out_expr)
        self._own_vars = OrderedDict((var.name[len(self.scope) + 1:].split(":")[0], var) for var in tf.global_variables(self.scope + "/"))

        # Check for errors.
        if len(self._input_templates) == 0:
            raise ValueError("Network build func did not list any inputs.")
        if len(self._output_templates) == 0:
            raise ValueError("Network build func did not return any outputs.")
        if any(not tfutil.is_tf_expression(t) for t in self._output_templates):
            raise ValueError("Network outputs must be TensorFlow expressions.")
        if any(t.shape.ndims is None for t in self._input_templates):
            raise ValueError("Network input shapes not defined. Please call x.set_shape() for each input.")
        if any(t.shape.ndims is None for t in self._output_templates):
            raise ValueError("Network output shapes not defined. Please call x.set_shape() where applicable.")
        if any(not isinstance(comp, Network) for comp in self._components.values()):
            raise ValueError("Components of a Network must be Networks themselves.")
        if len(self._components) != len(set(comp.name for comp in self._components.values())):
            raise ValueError("Components of a Network must have unique names.")

        # Initialize variables.
        if len(self._var_inits):
            tfutil.set_vars({self._get_vars()[name]: value for name, value in self._var_inits.items() if name in self._get_vars()})
        remaining_inits = [var.initializer for name, var in self._own_vars.items() if name not in self._var_inits]
        if self._all_inits_known:
            assert len(remaining_inits) == 0
        else:
            tfutil.run(remaining_inits)
        self._var_inits = None
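
The _own_vars mapping above keys each variable by its scope-relative name: strip the "scope/" prefix and drop the ":0" suffix. A tiny sketch of that string handling with hypothetical names:

from collections import OrderedDict

scope = 'G_synthesis'  # hypothetical network scope
full_names = ['G_synthesis/Dense/weight:0', 'G_synthesis/Conv0/bias:0']  # stand-ins for var.name

own_keys = OrderedDict(
    (name[len(scope) + 1:].split(':')[0], name)  # e.g. 'Dense/weight' -> full TF name
    for name in full_names)
print(list(own_keys))  # ['Dense/weight', 'Conv0/bias']
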
Beispiel #19
    def rebuild_graph(self, path, model_name, full_assign=False,
                      train_data=None):
        if train_data is None:
            raise ValueError("SVDpp model must provide train_data "
                             "when rebuilding graph")
        sparse_implicit_interaction = sparse_tensor_interaction(
            train_data, recent_num=10)
        self._build_model(sparse_implicit_interaction)
        self._build_train_ops()

        variable_path = os.path.join(path, f"{model_name}_variables.npz")
        variables = np.load(variable_path)
        variables = dict(variables.items())

        (
            user_variables,
            item_variables,
            sparse_variables,
            dense_variables,
            manual_variables
        ) = modify_variable_names(self, trainable=True)

        update_ops = []
        for v in tf.trainable_variables():
            if user_variables is not None and v.name in user_variables:
                # no need to remove oov values
                old_var = variables[v.name]
                user_op = tf.IndexedSlices(old_var, tf.range(len(old_var)))
                update_ops.append(v.scatter_update(user_op))

            if item_variables is not None and v.name in item_variables:
                old_var = variables[v.name]
                item_op = tf.IndexedSlices(old_var, tf.range(len(old_var)))
                update_ops.append(v.scatter_update(item_op))

        if full_assign:
            (
                optimizer_user_variables,
                optimizer_item_variables,
                optimizer_sparse_variables,
                optimizer_dense_variables,
                _
            ) = modify_variable_names(self, trainable=False)

            other_variables = [v for v in tf.global_variables()
                               if v.name not in manual_variables]
            for v in other_variables:
                if (optimizer_user_variables is not None
                        and v.name in optimizer_user_variables):
                    old_var = variables[v.name]
                    user_op = tf.IndexedSlices(old_var, tf.range(len(old_var)))
                    update_ops.append(v.scatter_update(user_op))

                elif (optimizer_item_variables is not None
                          and v.name in optimizer_item_variables):
                    old_var = variables[v.name]
                    item_op = tf.IndexedSlices(old_var, tf.range(len(old_var)))
                    update_ops.append(v.scatter_update(item_op))

                else:
                    old_var = variables[v.name]
                    update_ops.append(v.assign(old_var))

        self.sess.run(update_ops)
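
Each scatter update above wraps the saved array in a tf.IndexedSlices covering exactly its own rows, so any new rows in the rebuilt (possibly larger) variable keep their fresh initialization. A small TF1-style sketch of that pattern, with hypothetical sizes:

import numpy as np
import tensorflow as tf  # TF1.x-style API, matching the snippet above

old_embedding = np.ones((3, 4), dtype=np.float32)         # saved weights for 3 users
user_embed = tf.get_variable('user_embed', shape=(5, 4))  # rebuilt graph now holds 5 users
update = tf.IndexedSlices(old_embedding,
                          tf.range(len(old_embedding)))   # overwrite only the first 3 rows
assign_old_rows = user_embed.scatter_update(update)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(assign_old_rows)                              # rows 3-4 keep their fresh init
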
Beispiel #20
def train():
    # Create training and validation datasets
    train_set = create_dataset(FLAGS.train_files.split(','),
                               batch_size=FLAGS.train_batch_size,
                               cache_path=FLAGS.feature_cache,
                               train_phase=True)

    iterator = tfv1.data.Iterator.from_structure(
        tfv1.data.get_output_types(train_set),
        tfv1.data.get_output_shapes(train_set),
        output_classes=tfv1.data.get_output_classes(train_set))

    # Make initialization ops for switching between the two sets
    train_init_op = iterator.make_initializer(train_set)

    if FLAGS.dev_files:
        dev_csvs = FLAGS.dev_files.split(',')
        dev_sets = [
            create_dataset([csv],
                           batch_size=FLAGS.dev_batch_size,
                           train_phase=False) for csv in dev_csvs
        ]
        dev_init_ops = [
            iterator.make_initializer(dev_set) for dev_set in dev_sets
        ]

    # Dropout
    dropout_rates = [
        tfv1.placeholder(tf.float32, name='dropout_{}'.format(i))
        for i in range(6)
    ]
    dropout_feed_dict = {
        dropout_rates[0]: FLAGS.dropout_rate,
        dropout_rates[1]: FLAGS.dropout_rate2,
        dropout_rates[2]: FLAGS.dropout_rate3,
        dropout_rates[3]: FLAGS.dropout_rate4,
        dropout_rates[4]: FLAGS.dropout_rate5,
        dropout_rates[5]: FLAGS.dropout_rate6,
    }
    no_dropout_feed_dict = {rate: 0. for rate in dropout_rates}

    # Building the graph
    optimizer = create_optimizer()
    gradients, loss, non_finite_files = get_tower_results(
        iterator, optimizer, dropout_rates)

    # Average tower gradients across GPUs
    avg_tower_gradients = average_gradients(gradients)
    log_grads_and_vars(avg_tower_gradients)

    # global_step is automagically incremented by the optimizer
    global_step = tfv1.train.get_or_create_global_step()
    apply_gradient_op = optimizer.apply_gradients(avg_tower_gradients,
                                                  global_step=global_step)

    # Summaries
    step_summaries_op = tfv1.summary.merge_all('step_summaries')
    step_summary_writers = {
        'train':
        tfv1.summary.FileWriter(os.path.join(FLAGS.summary_dir, 'train'),
                                max_queue=120),
        'dev':
        tfv1.summary.FileWriter(os.path.join(FLAGS.summary_dir, 'dev'),
                                max_queue=120)
    }

    # Checkpointing
    checkpoint_saver = tfv1.train.Saver(max_to_keep=FLAGS.max_to_keep)
    checkpoint_path = os.path.join(FLAGS.checkpoint_dir, 'train')
    checkpoint_filename = 'checkpoint'

    best_dev_saver = tfv1.train.Saver(max_to_keep=1)
    best_dev_path = os.path.join(FLAGS.checkpoint_dir, 'best_dev')
    best_dev_filename = 'best_dev_checkpoint'

    # Save flags next to checkpoints
    os.makedirs(FLAGS.checkpoint_dir, exist_ok=True)

    flags_file = os.path.join(FLAGS.checkpoint_dir, 'flags.txt')
    with open(flags_file, 'w') as fout:
        fout.write(FLAGS.flags_into_string())

    initializer = tfv1.global_variables_initializer()

    with tfv1.Session(config=Config.session_config) as session:
        log_debug('Session opened.')

        # Loading or initializing
        loaded = False

        # Initialize training from a CuDNN RNN checkpoint
        if FLAGS.cudnn_checkpoint:
            if FLAGS.use_cudnn_rnn:
                log_error(
                    'Trying to use --cudnn_checkpoint but --use_cudnn_rnn '
                    'was specified. The --cudnn_checkpoint flag is only '
                    'needed when converting a CuDNN RNN checkpoint to '
                    'a CPU-capable graph. If your system is capable of '
                    'using CuDNN RNN, you can just specify the CuDNN RNN '
                    'checkpoint normally with --checkpoint_dir.')
                exit(1)

            log_info('Converting CuDNN RNN checkpoint from {}'.format(
                FLAGS.cudnn_checkpoint))
            ckpt = tfv1.train.load_checkpoint(FLAGS.cudnn_checkpoint)
            missing_variables = []

            # Load compatible variables from checkpoint
            for v in tfv1.global_variables():
                try:
                    v.load(ckpt.get_tensor(v.op.name), session=session)
                except tf.errors.NotFoundError:
                    missing_variables.append(v)

            # Check that the only missing variables are the Adam moment tensors
            if any('Adam' not in v.op.name for v in missing_variables):
                log_error(
                    'Tried to load a CuDNN RNN checkpoint but there were '
                    'more missing variables than just the Adam moment '
                    'tensors.')
                exit(1)

            # Initialize Adam moment tensors from scratch to allow use of CuDNN
            # RNN checkpoints.
            log_info('Initializing missing Adam moment tensors.')
            init_op = tfv1.variables_initializer(missing_variables)
            session.run(init_op)
            loaded = True

        tfv1.get_default_graph().finalize()

        if not loaded and FLAGS.load in ['auto', 'last']:
            loaded = try_loading(session, checkpoint_saver,
                                 checkpoint_filename, 'most recent')
        if not loaded and FLAGS.load in ['auto', 'best']:
            loaded = try_loading(session, best_dev_saver, best_dev_filename,
                                 'best validation')
        if not loaded:
            if FLAGS.load in ['auto', 'init']:
                log_info('Initializing variables...')
                session.run(initializer)
            else:
                log_error(
                    'Unable to load %s model from specified checkpoint dir'
                    ' - consider using load option "auto" or "init".' %
                    FLAGS.load)
                sys.exit(1)

        def run_set(set_name, epoch, init_op, dataset=None):
            is_train = set_name == 'train'
            train_op = apply_gradient_op if is_train else []
            feed_dict = dropout_feed_dict if is_train else no_dropout_feed_dict

            total_loss = 0.0
            step_count = 0

            step_summary_writer = step_summary_writers.get(set_name)
            checkpoint_time = time.time()

            # Setup progress bar
            class LossWidget(progressbar.widgets.FormatLabel):
                def __init__(self):
                    progressbar.widgets.FormatLabel.__init__(
                        self, format='Loss: %(mean_loss)f')

                def __call__(self, progress, data, **kwargs):
                    data['mean_loss'] = (
                        total_loss / step_count if step_count else 0.0)
                    return progressbar.widgets.FormatLabel.__call__(
                        self, progress, data, **kwargs)

            prefix = 'Epoch {} | {:>10}'.format(
                epoch, 'Training' if is_train else 'Validation')
            widgets = [
                ' | ',
                progressbar.widgets.Timer(), ' | Steps: ',
                progressbar.widgets.Counter(), ' | ',
                LossWidget()
            ]
            suffix = ' | Dataset: {}'.format(dataset) if dataset else None
            pbar = create_progressbar(prefix=prefix,
                                      widgets=widgets,
                                      suffix=suffix).start()

            # Initialize iterator to the appropriate dataset
            session.run(init_op)

            # Batch loop
            while True:
                try:
                    _, current_step, batch_loss, problem_files, step_summary = \
                        session.run([train_op, global_step, loss, non_finite_files, step_summaries_op],
                                    feed_dict=feed_dict)
                except tf.errors.OutOfRangeError:
                    break

                if problem_files.size > 0:
                    problem_files = [
                        f.decode('utf8') for f in problem_files[..., 0]
                    ]
                    log_error(
                        'The following files caused an infinite (or NaN) '
                        'loss: {}'.format(','.join(problem_files)))

                total_loss += batch_loss
                step_count += 1

                pbar.update(step_count)

                step_summary_writer.add_summary(step_summary, current_step)

                if is_train and FLAGS.checkpoint_secs > 0 and time.time(
                ) - checkpoint_time > FLAGS.checkpoint_secs:
                    checkpoint_saver.save(session,
                                          checkpoint_path,
                                          global_step=current_step)
                    checkpoint_time = time.time()

            pbar.finish()
            mean_loss = total_loss / step_count if step_count > 0 else 0.0
            return mean_loss, step_count

        log_info('STARTING Optimization')
        train_start_time = datetime.utcnow()
        best_dev_loss = float('inf')
        dev_losses = []
        try:
            for epoch in range(FLAGS.epochs):
                # Training
                log_progress('Training epoch %d...' % epoch)
                train_loss, _ = run_set('train', epoch, train_init_op)
                log_progress('Finished training epoch %d - loss: %f' %
                             (epoch, train_loss))
                checkpoint_saver.save(session,
                                      checkpoint_path,
                                      global_step=global_step)

                if FLAGS.dev_files:
                    # Validation
                    dev_loss = 0.0
                    total_steps = 0
                    for csv, init_op in zip(dev_csvs, dev_init_ops):
                        log_progress('Validating epoch %d on %s...' %
                                     (epoch, csv))
                        set_loss, steps = run_set('dev',
                                                  epoch,
                                                  init_op,
                                                  dataset=csv)
                        dev_loss += set_loss * steps
                        total_steps += steps
                        log_progress(
                            'Finished validating epoch %d on %s - loss: %f' %
                            (epoch, csv, set_loss))
                    dev_loss = dev_loss / total_steps

                    dev_losses.append(dev_loss)

                    if dev_loss < best_dev_loss:
                        best_dev_loss = dev_loss
                        save_path = best_dev_saver.save(
                            session,
                            best_dev_path,
                            global_step=global_step,
                            latest_filename=best_dev_filename)
                        log_info(
                            "Saved new best validating model with loss %f to: %s"
                            % (best_dev_loss, save_path))

                    # Early stopping
                    if FLAGS.early_stop and len(dev_losses) >= FLAGS.es_steps:
                        mean_loss = np.mean(dev_losses[-FLAGS.es_steps:-1])
                        std_loss = np.std(dev_losses[-FLAGS.es_steps:-1])
                        dev_losses = dev_losses[-FLAGS.es_steps:]
                        log_debug(
                            'Checking for early stopping (last %d steps) validation loss: '
                            '%f, with standard deviation: %f and mean: %f' %
                            (FLAGS.es_steps, dev_losses[-1], std_loss,
                             mean_loss))
                        if dev_losses[-1] > np.max(dev_losses[:-1]) or \
                           (abs(dev_losses[-1] - mean_loss) < FLAGS.es_mean_th and std_loss < FLAGS.es_std_th):
                            log_info(
                                'Early stop triggered as (for last %d steps) validation loss:'
                                ' %f with standard deviation: %f and mean: %f'
                                % (FLAGS.es_steps, dev_losses[-1], std_loss,
                                   mean_loss))
                            break
        except KeyboardInterrupt:
            pass
        log_info('FINISHED optimization in {}'.format(datetime.utcnow() -
                                                      train_start_time))
    log_debug('Session closed.')
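
The early-stopping condition above fires either when the newest validation loss exceeds the maximum over the window, or when the window is essentially flat (small std) and the newest loss sits close to its mean. The same check in plain NumPy, with illustrative thresholds standing in for FLAGS.es_mean_th and FLAGS.es_std_th:

import numpy as np

dev_losses = [0.92, 0.90, 0.91, 0.905]  # last es_steps validation losses (illustrative)
es_mean_th, es_std_th = 0.05, 0.05      # stand-ins for FLAGS.es_mean_th / FLAGS.es_std_th

mean_loss = np.mean(dev_losses[:-1])
std_loss = np.std(dev_losses[:-1])
early_stop = (dev_losses[-1] > np.max(dev_losses[:-1]) or
              (abs(dev_losses[-1] - mean_loss) < es_mean_th and std_loss < es_std_th))
print(early_stop)  # True here: the curve has flattened out
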
Beispiel #21
    def my_model_fn(features, labels, mode, params=None, config=None):
        """Estimator model function.
        Args:
          features: dictionary where keys are strings like "inputs" and "targets"
            and the values are the actual values of "inputs". See TPUEstimator's
            docs for more information
          labels: ignored argument
          mode: a tf.estimator.ModeKeys
          params: dictionary containing the key "context"
          config: ignored argument
        Returns:
          a TPUEstimatorSpec
        """
        del labels, config
        global_step = tf.train.get_global_step()
        if use_tpu and "context" in params:
            ctx = params["context"]
            num_hosts = ctx.num_hosts
            host_placement_fn = ctx.tpu_host_placement_function
            device_list = [
                host_placement_fn(host_id=t) for t in range(num_hosts)
            ]
            # TODO(ylc): Better estimation of replica cache size?
            replica_cache_size = 300 * 1000000  # 300M per replica
            # Worker 0 caches all the TPU binaries.
            worker0_mem = replica_cache_size * ctx.num_replicas
            devices_memory_usage = [worker0_mem] + [0] * (num_hosts - 1)
            var_placer = mtf.utils.BalancedVariablePlacer(
                device_list, devices_memory_usage)
            # deprecated mesh_devices = [""] * mesh_shape.size
            physical_shape = list(
                params["context"].device_assignment.topology.mesh_shape)
            logical_to_physical = mtf.simd_mesh_impl.auto_logical_to_physical_tpu(
                mesh_shape.to_integer_list, physical_shape)
            mesh_impl = mtf.simd_mesh_impl.SimdMeshImpl(
                mesh_shape,
                layout_rules,
                mesh_devices,
                ctx.device_assignment,
                logical_to_physical=logical_to_physical)
        else:
            var_placer = None
            # deprecated mesh_devices = [""] * mesh_shape.size
            mesh_impl = mtf.placement_mesh_impl.PlacementMeshImpl(
                mesh_shape, layout_rules, mesh_devices)

        graph = mtf.Graph()
        mesh = mtf.Mesh(graph, "my_mesh", var_placer)

        mtf_features = {}
        for key, x in features.items():
            outer_batch_dim = mtf.Dimension("outer_batch", outer_batch_size)
            batch_dim = mtf.Dimension("batch", batch_size // outer_batch_size)
            # Some auxiliary features may have been generated in packing.
            # The names of these new features are of the form
            #   "<original_feature_name>_<suffix>", e.g. "inputs_segmentation".
            #   We look up the lengths based on the original feature name, without
            #   the "_<suffix>".
            feature_length = sequence_length[key.split("_")[0]]
            length_dim = mtf.Dimension("length", feature_length)
            ensemble_dims = ([mtf.Dimension("ensemble", ensemble_inputs)]
                             if ensemble_inputs else [])
            feature_shape = mtf.Shape(ensemble_dims +
                                      [outer_batch_dim, batch_dim, length_dim])
            x = tf.cast(features[key], tf.int32)
            x = tf.reshape(x, feature_shape.to_integer_list)
            if not use_tpu:
                tf.logging.info("feature %s : %s" % (key, x))
                x = tf.Print(x, [x],
                             "import feature %s" % key,
                             summarize=1000,
                             first_n=10)
            mtf_features[key] = mtf.import_fully_replicated(mesh,
                                                            x,
                                                            feature_shape,
                                                            name=key)
            if key == "targets" or key == "codeprefixedtargets" or key == "controlcode":
                anon_targets = mtf.anonymize(mtf_features[key])

        if mode == tf.estimator.ModeKeys.PREDICT:

            def _feature_shape(key):
                feature_length = sequence_length[key.split("_")[0]]
                return mtf.Shape([
                    mtf.Dimension("batch", batch_size),
                    mtf.Dimension("length", feature_length)
                ])

            mtf_features = {
                k: mtf.reshape(v, _feature_shape(k))
                for k, v in six.iteritems(mtf_features)
            }
            inputs = mtf_features["inputs"]

            if attribute_embedding:
                attributes = mtf_features["attribute"]
            else:
                attributes = None

            if has_partial_sequences:
                controlcodes = mtf_features["controlcode"]
            else:
                controlcodes = None

            if predict_fn:
                mtf_samples = predict_fn(model=transformer_model,
                                         features=mtf_features,
                                         variable_dtype=get_variable_dtype())
            elif isinstance(transformer_model, transformer.Unitransformer):
                # pad so that there is enough room for the targets
                inputs = mtf.pad(inputs, [0, sequence_length["targets"]],
                                 length_dim.name)
                mtf_samples = transformer_model.sample_autoregressive(
                    inputs,
                    variable_dtype=get_variable_dtype(),
                    remove_partial_sequences=True)
            elif isinstance(transformer_model, Bitransformer_ll):
                mtf_samples = transformer_model.decode(
                    inputs,
                    attributes=attributes,
                    controlcodes=controlcodes,
                    has_partial_sequences=has_partial_sequences,
                    remove_partial_sequences=remove_partial_sequences,
                    variable_dtype=get_variable_dtype())  #
            elif isinstance(
                    transformer_model,
                (transformer.Bitransformer, transformer.StudentTeacher)):
                mtf_samples = transformer_model.decode(
                    inputs, variable_dtype=get_variable_dtype())
            else:
                raise ValueError("unrecognized class")
            mtf_samples = mtf.anonymize(mtf_samples)
            inputs = mtf.anonymize(inputs)
            lowering = mtf.Lowering(graph, {mesh: mesh_impl},
                                    autostack=autostack)
            inputs = lowering.export_to_tf_tensor(inputs)
            outputs = lowering.export_to_tf_tensor(mtf_samples)
            predictions = {"inputs": inputs, "outputs": outputs}

            # When exporting a model, we need to communicate to TF-Serving that
            # master variables need to be copied to their slave slice variables.
            # Estimator uses a Scaffold's "local_init_op" for this purpose, so we
            # augment the default "local_init_op" here.
            #
            # The "ready_op" is also constructed here to ensure the variables
            # initialized by "local_init_op" are the same ones checked by "ready_op".
            #
            # WARNING: Any variables created outside of this model_fn()
            # (e.g. tpu_estimator/iterations_per_loop) will NOT be initialized nor
            # checked by these ops.
            def scaffold_fn():
                return tf.train.Scaffold(
                    local_init_op=tf.group(
                        tf.train.Scaffold.default_local_init_op(),
                        lowering.copy_masters_to_slices(),
                        name="mtf_local_init_op"),
                    ready_op=tf.concat([
                        tf.report_uninitialized_variables(),
                        resources.report_uninitialized_resources()
                    ],
                                       axis=0,
                                       name="mtf_ready_op"))

            return tpu_estimator.TPUEstimatorSpec(
                mode=tf.estimator.ModeKeys.PREDICT,
                predictions=predictions,
                scaffold_fn=scaffold_fn,
                prediction_hooks=[mtf.MtfRestoreHook(lowering)])

        assert (mode == tf.estimator.ModeKeys.TRAIN
                or mode == tf.estimator.ModeKeys.EVAL)

        def logits_and_loss(mtf_features):
            """Compute logits and loss.
            Args:
              mtf_features: a dictionary
            Returns:
              logits: a mtf.Tensor
              loss: a mtf.Tensor
            """
            if model_type == "lm":  # TOTRY Adapt that to our case
                if "inputs" in mtf_features:
                    mtf_features = _dynamic_text2self(mtf_features)
                _, _, length_dim = mtf_features["targets"].shape
                inputs = mtf.shift(mtf_features["targets"],
                                   offset=1,
                                   dim=length_dim,
                                   wrap=False)
            else:
                inputs = mtf_features["inputs"]

            if attribute_embedding:
                attributes = mtf_features["attribute"]
            else:
                attributes = None

            if control_codes:
                codeprefixedtargets = mtf_features["codeprefixedtargets"]
            else:
                codeprefixedtargets = None

            if isinstance(transformer_model, transformer.Unitransformer):
                position_kwargs = dict(
                    sequence_id=mtf_features.get("targets_segmentation", None),
                    position=mtf_features.get("targets_position", None),
                )
            elif isinstance(transformer_model, transformer.Bitransformer
                            ) or model_type == "bi_student_teacher":
                if control_codes:
                    position_kwargs = dict(
                        encoder_sequence_id=mtf_features.get(
                            "inputs_segmentation", None),
                        decoder_sequence_id=mtf_features.get(
                            "codeprefixedtargets_segmentation", None),
                        decoder_subsequence_id=mtf_features.get(
                            "codeprefixedtargets_subsegmentation", None),
                        encoder_position=mtf_features.get(
                            "inputs_position", None),
                        decoder_position=mtf_features.get(
                            "codeprefixedtargets_position", None),
                    )
                else:
                    position_kwargs = dict(
                        encoder_sequence_id=mtf_features.get(
                            "inputs_segmentation", None),
                        decoder_sequence_id=mtf_features.get(
                            "targets_segmentation", None),
                        decoder_subsequence_id=mtf_features.get(
                            "targets_subsegmentation", None),
                        encoder_position=mtf_features.get(
                            "inputs_position", None),
                        decoder_position=mtf_features.get(
                            "targets_position", None),
                    )
            else:
                raise ValueError("unrecognized class")

            if isinstance(transformer_model, Bitransformer_ll):
                if cycle_consistency_loss:
                    logits_ae, l_ae = transformer_model.call_simple(
                        inputs=inputs,
                        targets=mtf_features["targets"],
                        compute_loss=True,
                        attributes=attributes,
                        codeprefixedtargets=codeprefixedtargets,
                        mode=mode,
                        variable_dtype=get_variable_dtype(),
                        **position_kwargs)

                    if has_partial_sequences:
                        controlcodes = mtf_features["controlcode"]
                    else:
                        controlcodes = None

                    with gin.config_scope('training'):
                        mtf_samples = transformer_model.decode(
                            inputs,
                            attributes=attributes,
                            controlcodes=controlcodes,
                            has_partial_sequences=has_partial_sequences,
                            remove_partial_sequences=remove_partial_sequences,
                            variable_dtype=get_variable_dtype())
                        # mtf_samples = mtf.anonymize(mtf_samples)
                    outputs = mtf_samples

                    logits_cycle, l_cycle = transformer_model.call_simple(
                        inputs=outputs,
                        targets=mtf_features["targets"],
                        compute_loss=True,
                        attributes=attributes,
                        codeprefixedtargets=codeprefixedtargets,
                        mode=mode,
                        variable_dtype=get_variable_dtype(),
                        **position_kwargs)

                    loss_ae_cycle = lambda_ae * l_ae + lambda_cycle * l_cycle
                    return logits_cycle, loss_ae_cycle
                else:
                    return transformer_model.call_simple(
                        inputs=inputs,
                        targets=mtf_features["targets"],
                        compute_loss=True,
                        attributes=attributes,
                        codeprefixedtargets=codeprefixedtargets,
                        mode=mode,
                        variable_dtype=get_variable_dtype(),
                        **position_kwargs)
            else:
                return transformer_model.call_simple(
                    inputs=inputs,
                    targets=mtf_features["targets"],
                    compute_loss=True,
                    mode=mode,
                    variable_dtype=get_variable_dtype(),
                    num_microbatches=num_microbatches,
                    **position_kwargs)

        if mode == tf.estimator.ModeKeys.TRAIN:
            num_microbatches = serialize_num_microbatches(
                batch_dim, sequence_length, mesh_shape, layout_rules)
            if num_microbatches > 1:

                def serialized_fn(mtf_features):
                    return {
                        "loss":
                        (logits_and_loss(mtf_features)[1] / num_microbatches)
                    }

                var_grads, loss_dict = mtf.serialize_training_step(
                    mtf_features, serialized_fn, batch_dim, num_microbatches)
                loss = loss_dict["loss"]
            else:
                loss = logits_and_loss(mtf_features)[1]
                var_grads = mtf.gradients(
                    [loss], [v.outputs[0] for v in graph.trainable_variables])

            if tpu_summaries:
                mtf.scalar_summary("loss", loss)

            if callable(learning_rate_schedule):
                # the following happens on CPU since TPU can't handle summaries.
                with mtf.utils.outside_all_rewrites():
                    learning_rate = learning_rate_schedule(
                        step=tf.train.get_global_step())
                    tf.summary.scalar("learning_rate", learning_rate)
            else:
                learning_rate = learning_rate_schedule

            if isinstance(variable_filter, str):
                pattern = re.compile(variable_filter)
                variable_filter_fn = lambda v: pattern.search(v.name)
            elif variable_filter is None:
                variable_filter_fn = lambda v: True
            elif callable(variable_filter):
                variable_filter_fn = variable_filter
            else:
                raise ValueError(
                    "variable_filter must be None, a string, or a callable function"
                )
            trainable_vars = [
                v for v in graph.trainable_variables if variable_filter_fn(v)
            ]
            trainable_var_grads = [
                g for g, v in zip(var_grads, graph.trainable_variables)
                if variable_filter_fn(v)
            ]
            if len(trainable_vars) != len(graph.trainable_variables):
                tf.logging.info("Variables being trained:")
                tf.logging.info([v.name for v in trainable_vars])
                tf.logging.info("Variables not being trained:")
                tf.logging.info([
                    v.name for v in graph.trainable_variables
                    if not variable_filter_fn(v)
                ])

            update_ops = optimizer(learning_rate=learning_rate).apply_grads(
                trainable_var_grads, trainable_vars)

            lowering = mtf.Lowering(graph, {mesh: mesh_impl},
                                    autostack=autostack)

            tf_loss = lowering.export_to_tf_tensor(loss)
            tf_loss = tf.cast(tf_loss, tf.float32)
            if not use_tpu:
                tf_loss = tf.Print(
                    tf_loss, [tf_loss, tf.train.get_global_step()],
                    "step, tf_loss")

            tf_update_ops = [
                lowering.lowered_operation(op) for op in update_ops
            ]
            tf_update_ops.append(tf.assign_add(global_step, 1))
            train_op = tf.group(tf_update_ops)

            if hasattr(transformer_model, "initialize"):
                with mtf.utils.outside_all_rewrites():
                    transformer_model.initialize()

            if tpu_summaries:
                # This has to be outside of the
                # `with mtf.utils.outside_all_rewrites()` block.
                host_call = mtf.utils.create_host_call(model_dir)
                mtf.utils.remove_summaries()
            else:
                host_call = None

            with mtf.utils.outside_all_rewrites():

                if init_checkpoint:
                    ckpt_vars = {
                        v
                        for v, _ in tf.train.list_variables(init_checkpoint)
                    }
                    global_vars = {v.op.name for v in tf.global_variables()}
                    restore_vars = ckpt_vars.intersection(global_vars)
                    tf.logging.info("Initializing variables from %s:",
                                    init_checkpoint)
                    tf.logging.debug("\n".join(sorted(restore_vars)))
                    tf.logging.info("Variables in %s but not in graph:",
                                    init_checkpoint)
                    tf.logging.info("\n".join(sorted(ckpt_vars - global_vars)))
                    tf.logging.info("Variables in graph but not in %s:",
                                    init_checkpoint)
                    tf.logging.info("\n".join(sorted(global_vars - ckpt_vars)))
                    tf.train.init_from_checkpoint(init_checkpoint,
                                                  {v: v
                                                   for v in restore_vars})

                # Copy master variables to slices. Must be called first.
                restore_hook = mtf.MtfRestoreHook(lowering)
                saver = tf.train.Saver(tf.global_variables(),
                                       sharded=True,
                                       max_to_keep=keep_checkpoint_max,
                                       keep_checkpoint_every_n_hours=2,
                                       defer_build=False,
                                       save_relative_paths=True)
                tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
                saver_listener = mtf.MtfCheckpointSaverListener(lowering)
                saver_hook = tf.train.CheckpointSaverHook(
                    model_dir,
                    save_steps=save_checkpoints_steps,
                    saver=saver,
                    listeners=[saver_listener])
                gin_config_saver_hook = gin.tf.GinConfigSaverHook(
                    model_dir,
                    summarize_config=True,
                    include_step_in_filename=False)

                if use_tpu:
                    return tpu_estimator.TPUEstimatorSpec(
                        mode=tf.estimator.ModeKeys.TRAIN,
                        loss=tf_loss,
                        train_op=train_op,
                        host_call=host_call,
                        training_hooks=[
                            restore_hook,
                            saver_hook,
                            gin_config_saver_hook,
                        ])
                else:
                    return tf.estimator.EstimatorSpec(
                        tf.estimator.ModeKeys.TRAIN,
                        loss=tf_loss,
                        train_op=train_op,
                        training_chief_hooks=[
                            restore_hook,
                            saver_hook,
                            gin_config_saver_hook,
                        ])
        elif mode == tf.estimator.ModeKeys.EVAL:
            logits, loss = logits_and_loss(mtf_features)
            anon_logits = mtf.anonymize(logits)
            lowering = mtf.Lowering(graph, {mesh: mesh_impl},
                                    autostack=autostack)
            tf_loss = tf.cast(lowering.export_to_tf_tensor(loss), tf.float32)
            tf_logits = tf.cast(lowering.export_to_tf_tensor(anon_logits),
                                tf.float32)

            def simple_metrics(logits, labels):
                """Simple metrics for teacher-forced eval."""
                weights = tf.cast(tf.not_equal(labels, 0), tf.float32)
                xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=labels, logits=logits)
                predictions = tf.cast(tf.argmax(logits, axis=-1), labels.dtype)
                token_correct = tf.cast(tf.equal(predictions, labels),
                                        tf.float32) * weights
                sequence_correct = tf.to_float(
                    tf.equal(tf.reduce_sum(token_correct, -1),
                             tf.reduce_sum(weights, -1)))
                sequence_weights = tf.to_float(
                    tf.not_equal(tf.reduce_sum(weights, -1), 0))
                return {
                    "neg_log_perplexity":
                    tf.metrics.mean(-xent, weights),
                    "token_accuracy":
                    tf.metrics.mean(token_correct, weights),
                    "sequence_accuracy":
                    tf.metrics.mean(sequence_correct, sequence_weights)
                }

            labels = lowering.export_to_tf_tensor(anon_targets)
            eval_metrics = (simple_metrics, [tf_logits, labels])
            with mtf.utils.outside_all_rewrites():
                restore_hook = mtf.MtfRestoreHook(lowering)
            return tpu_estimator.TPUEstimatorSpec(
                tf.estimator.ModeKeys.EVAL,
                evaluation_hooks=[restore_hook],
                loss=tf_loss,
                eval_metrics=eval_metrics)
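
The simple_metrics function above weights every token by whether its label is non-zero (i.e. non-padding) and counts a sequence as correct only when all of its non-padding tokens are predicted correctly; all-padding sequences are excluded via sequence_weights. A minimal NumPy sketch of the same bookkeeping, with made-up logits and labels, illustrates the computation outside the tf.metrics machinery:

import numpy as np

# Hypothetical batch of 2 sequences, length 4, vocab size 5; label 0 is padding.
logits = np.random.randn(2, 4, 5)
labels = np.array([[3, 1, 4, 0],
                   [2, 2, 0, 0]])

weights = (labels != 0).astype(np.float32)            # mask out padding tokens
predictions = logits.argmax(axis=-1)
token_correct = (predictions == labels).astype(np.float32) * weights

token_accuracy = token_correct.sum() / weights.sum()
sequence_correct = (token_correct.sum(-1) == weights.sum(-1)).astype(np.float32)
sequence_accuracy = sequence_correct.mean()           # both sequences have real tokens here
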
    def __init__(self,
                 sess,
                 model,
                 batch_size=1,
                 confidence=CONFIDENCE,
                 targeted=TARGETED,
                 learning_rate=LEARNING_RATE,
                 binary_search_steps=BINARY_SEARCH_STEPS,
                 max_iterations=MAX_ITERATIONS,
                 abort_early=ABORT_EARLY,
                 initial_const=INITIAL_CONST,
                 boxmin=-0.5,
                 boxmax=0.5,
                 x_window=0,
                 y_window=0,
                 window_size=-1):
        """
        The L_2 optimized attack. 

        This attack is the most efficient and should be used as the primary 
        attack to evaluate potential defenses.

        Returns adversarial examples for the supplied model.

        confidence: Confidence of adversarial examples: higher produces examples
          that are farther away, but more strongly classified as adversarial.
        batch_size: Number of attacks to run simultaneously.
        targeted: True if we should perform a targeted attack, False otherwise.
        learning_rate: The learning rate for the attack algorithm. Smaller values
          produce better results but are slower to converge.
        binary_search_steps: The number of times we perform binary search to
          find the optimal tradeoff-constant between distance and confidence. 
        max_iterations: The maximum number of iterations. Larger values are more
          accurate; setting too small will require a large learning rate and will
          produce poor results.
        abort_early: If true, allows early aborts if gradient descent gets stuck.
        initial_const: The initial tradeoff-constant to use to tune the relative
          importance of distance and confidence. If binary_search_steps is large,
          the initial constant is not important.
        boxmin: Minimum pixel value (default -0.5).
        boxmax: Maximum pixel value (default 0.5).
        """
        if window_size == -1:
            window_size = model.image_size

        image_size, num_channels, num_labels = model.image_size, model.num_channels, model.num_labels
        self.sess = sess
        self.TARGETED = targeted
        self.LEARNING_RATE = learning_rate
        self.MAX_ITERATIONS = max_iterations
        self.BINARY_SEARCH_STEPS = binary_search_steps
        self.ABORT_EARLY = abort_early
        self.CONFIDENCE = confidence
        self.initial_const = initial_const
        self.batch_size = batch_size

        self.repeat = binary_search_steps >= 10

        self.I_KNOW_WHAT_I_AM_DOING_AND_WANT_TO_OVERRIDE_THE_PRESOFTMAX_CHECK = False

        shape = (batch_size, window_size, window_size, num_channels)

        # the variable we're going to optimize over
        modifier = tf.Variable(np.zeros(
            shape, dtype=np.float32))  # resize here to operate on a portion of the image

        # these are variables to be more efficient in sending data to tf
        self.timg = tf.Variable(np.zeros(shape), dtype=tf.float32)
        self.tlab = tf.Variable(np.zeros((batch_size, num_labels)),
                                dtype=tf.float32)
        self.const = tf.Variable(np.zeros(batch_size), dtype=tf.float32)

        # and here's what we use to assign them
        self.assign_timg = tf.placeholder(tf.float32, shape)
        self.assign_tlab = tf.placeholder(tf.float32, (batch_size, num_labels))
        self.assign_const = tf.placeholder(tf.float32, [batch_size])

        # the resulting image, tanh'd to keep bounded from boxmin to boxmax
        self.boxmul = (boxmax - boxmin) / 2.
        self.boxplus = (boxmin + boxmax) / 2.

        ###################################################################### editing

        mask = tf.zeros((batch_size, image_size, image_size, num_channels),
                        tf.float32)
        # Get input shapes
        modifier_shape = tf.shape(modifier)
        mask_shape = tf.shape(mask)
        # Make indices grid
        oo, ii, jj, kk = tf.meshgrid(tf.range(modifier_shape[0]),
                                     tf.range(modifier_shape[1]),
                                     tf.range(modifier_shape[2]),
                                     tf.range(modifier_shape[3]),
                                     indexing='ij')
        # Shift indices
        ii += y_window
        jj += x_window
        # Scatter update
        mask_to_apply = tf.tensor_scatter_nd_update(
            mask, tf.stack([oo, ii, jj, kk], axis=-1), modifier)

        self.newimg = tf.tanh(mask_to_apply +
                              self.timg) * self.boxmul + self.boxplus

        ###################################################################### editing

        # prediction BEFORE-SOFTMAX of the model
        self.output = model.predict(self.newimg)

        # distance to the input data
        self.l2dist = tf.reduce_sum(
            tf.square(self.newimg -
                      (tf.tanh(self.timg) * self.boxmul + self.boxplus)),
            [1, 2, 3])

        # compute the probability of the label class versus the maximum other
        real = tf.reduce_sum((self.tlab) * self.output, 1)
        other = tf.reduce_max(
            (1 - self.tlab) * self.output - (self.tlab * 10000), 1)

        if self.TARGETED:
            # if targeted, optimize for making the other class most likely
            loss1 = tf.maximum(0.0, other - real + self.CONFIDENCE)
        else:
            # if untargeted, optimize for making this class least likely.
            loss1 = tf.maximum(0.0, real - other + self.CONFIDENCE)

        # sum up the losses
        self.loss2 = tf.reduce_sum(self.l2dist)
        self.loss1 = tf.reduce_sum(self.const * loss1)
        self.loss = self.loss1 + self.loss2

        # Setup the adam optimizer and keep track of variables we're creating
        start_vars = set(x.name for x in tf.global_variables())
        optimizer = tf.train.AdamOptimizer(self.LEARNING_RATE)
        self.train = optimizer.minimize(self.loss, var_list=[modifier])
        end_vars = tf.global_variables()
        new_vars = [x for x in end_vars if x.name not in start_vars]

        # these are the variables to initialize when we run
        self.setup = []
        self.setup.append(self.timg.assign(self.assign_timg))
        self.setup.append(self.tlab.assign(self.assign_tlab))
        self.setup.append(self.const.assign(self.assign_const))

        self.init = tf.variables_initializer(var_list=[modifier] + new_vars)
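
The change of variables above, tf.tanh(mask_to_apply + self.timg) * self.boxmul + self.boxplus, guarantees that every pixel of the adversarial image stays inside [boxmin, boxmax] regardless of the value the optimizer assigns to modifier. A small NumPy sketch with made-up numbers shows the mapping and its inverse (the inverse being the preprocessing the attack is assumed to apply when feeding images into timg):

import numpy as np

boxmin, boxmax = -0.5, 0.5
boxmul = (boxmax - boxmin) / 2.
boxplus = (boxmin + boxmax) / 2.

# Encode a pixel value into tanh-space (assumed preprocessing for timg).
pixel = 0.3
timg = np.arctanh((pixel - boxplus) / boxmul * 0.999999)  # 0.999999 avoids +/-inf

# Any perturbation, however large, maps back into [boxmin, boxmax].
for modifier in (-100.0, 0.0, 100.0):
    newimg = np.tanh(timg + modifier) * boxmul + boxplus
    assert boxmin <= newimg <= boxmax
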
Beispiel #23
0
            def build_example(label, param_dict_real, zip_path_label):
                """Build the model with parameter values set in param_dict_real.

        Args:
          label: Label of the model
          param_dict_real: Parameter dictionary (arguments to the factories
            make_graph and make_test_inputs)
          zip_path_label: Filename in the zip

        Returns:
          (tflite_model_binary, report) where tflite_model_binary is the
          serialized flatbuffer as a string and report is a dictionary with
          keys `converter_log` (log of the TFLite conversion), `tf_log` (log of
          the TF conversion), `converter` (success status of the conversion),
          and `tf` (success status of building the TF graph).
        """

                np.random.seed(RANDOM_SEED)
                report = {
                    "converter": report_lib.NOTRUN,
                    "tf": report_lib.FAILED
                }

                # Build graph
                report["tf_log"] = ""
                report["converter_log"] = ""
                tf.reset_default_graph()

                with tf.Graph().as_default():
                    with tf.device("/cpu:0"):
                        try:
                            inputs, outputs = make_graph(param_dict_real)
                            inputs = [x for x in inputs if x is not None]
                        except (tf.errors.UnimplementedError,
                                tf.errors.InvalidArgumentError, ValueError):
                            report["tf_log"] += traceback.format_exc()
                            return None, report

                    sess = tf.Session()
                    try:
                        baseline_inputs, baseline_outputs = (make_test_inputs(
                            param_dict_real, sess, inputs, outputs))
                        baseline_inputs = [
                            x for x in baseline_inputs if x is not None
                        ]
                        # Converts baseline inputs/outputs to maps. The signature input and
                        # output names are set to be the same as the tensor names.
                        input_names = [
                            _normalize_input_name(x.name) for x in inputs
                        ]
                        output_names = [
                            _normalize_output_name(x.name) for x in outputs
                        ]
                        baseline_input_map = dict(
                            zip(input_names, baseline_inputs))
                        baseline_output_map = dict(
                            zip(output_names, baseline_outputs))
                    except (tf.errors.UnimplementedError,
                            tf.errors.InvalidArgumentError, ValueError):
                        report["tf_log"] += traceback.format_exc()
                        return None, report
                    report["converter"] = report_lib.FAILED
                    report["tf"] = report_lib.SUCCESS

                    # Sorts the lists to make the order of input/output the same as order
                    # of the signature names.
                    # TODO(b/192473002): Remove sorting after TFLiteDriver can run with
                    # signatures.
                    inputs = sorted(
                        inputs, key=lambda x: _normalize_input_name(x.name))
                    outputs = sorted(
                        outputs, key=lambda x: _normalize_output_name(x.name))

                    # Builds a saved model with the default signature key.
                    input_names, tensor_info_inputs = _get_tensor_info(
                        inputs, "input_", _normalize_input_name)
                    output_tensors, tensor_info_outputs = _get_tensor_info(
                        outputs, "output_", _normalize_output_name)
                    input_tensors = [(name, t.shape, t.dtype)
                                     for name, t in zip(input_names, inputs)]

                    inference_signature = (
                        tf.saved_model.signature_def_utils.build_signature_def(
                            inputs=tensor_info_inputs,
                            outputs=tensor_info_outputs,
                            method_name="op_test"))
                    saved_model_dir = tempfile.mkdtemp("op_test")
                    saved_model_tags = [tf.saved_model.tag_constants.SERVING]
                    signature_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY
                    builder = tf.saved_model.builder.SavedModelBuilder(
                        saved_model_dir)
                    builder.add_meta_graph_and_variables(
                        sess,
                        saved_model_tags,
                        signature_def_map={
                            signature_key: inference_signature,
                        },
                        strip_default_attrs=True)
                    builder.save(as_text=False)
                    # pylint: disable=g-long-ternary
                    graph_def = freeze_graph(
                        sess,
                        tf.global_variables() + inputs +
                        outputs) if use_frozen_graph else sess.graph_def

                if "split_tflite_lstm_inputs" in param_dict_real:
                    extra_toco_options.split_tflite_lstm_inputs = param_dict_real[
                        "split_tflite_lstm_inputs"]
                tflite_model_binary, toco_log = options.tflite_convert_function(
                    options,
                    saved_model_dir,
                    input_tensors,
                    output_tensors,
                    extra_toco_options=extra_toco_options,
                    test_params=param_dict_real)
                report["converter"] = (report_lib.SUCCESS
                                       if tflite_model_binary is not None else
                                       report_lib.FAILED)
                report["converter_log"] = toco_log

                if options.save_graphdefs:
                    zipinfo = zipfile.ZipInfo(zip_path_label + ".pbtxt")
                    archive.writestr(zipinfo,
                                     text_format.MessageToString(graph_def),
                                     zipfile.ZIP_DEFLATED)

                if tflite_model_binary:
                    if options.make_edgetpu_tests:
                        # Set proper min max values according to input dtype.
                        baseline_input_map, baseline_output_map = generate_inputs_outputs(
                            tflite_model_binary, min_value=0, max_value=255)
                    zipinfo = zipfile.ZipInfo(zip_path_label + ".bin")
                    archive.writestr(zipinfo, tflite_model_binary,
                                     zipfile.ZIP_DEFLATED)

                    # TODO(b/192473002): Remove sorting after TFLiteDriver can run with
                    # signatures.
                    baseline_input_map = collections.OrderedDict(
                        sorted(baseline_input_map.items()))
                    baseline_output_map = collections.OrderedDict(
                        sorted(baseline_output_map.items()))
                    example = {
                        "inputs": baseline_input_map,
                        "outputs": baseline_output_map
                    }

                    example_fp = StringIO()
                    write_examples(example_fp, [example])
                    zipinfo = zipfile.ZipInfo(zip_path_label + ".inputs")
                    archive.writestr(zipinfo, example_fp.getvalue(),
                                     zipfile.ZIP_DEFLATED)

                    example_fp2 = StringIO()
                    write_test_cases(example_fp2, zip_path_label + ".bin",
                                     [example])
                    zipinfo = zipfile.ZipInfo(zip_path_label + "_tests.txt")
                    archive.writestr(zipinfo, example_fp2.getvalue(),
                                     zipfile.ZIP_DEFLATED)

                    zip_manifest_label = zip_path_label + " " + label
                    if zip_path_label == label:
                        zip_manifest_label = zip_path_label

                    zip_manifest.append(zip_manifest_label + "\n")

                return tflite_model_binary, report
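
The actual conversion is delegated to options.tflite_convert_function, which is not shown here. In the simplest case, a SavedModel directory like the one written by SavedModelBuilder above can be converted with the stock TF 1.x converter; a minimal sketch (the paths are placeholders):

import tensorflow.compat.v1 as tf

# Assumed to point at a directory like the one produced by SavedModelBuilder above.
saved_model_dir = "/tmp/op_test_saved_model"

converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
tflite_model_binary = converter.convert()

with open("/tmp/op_test.bin", "wb") as f:
    f.write(tflite_model_binary)
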
Beispiel #24
0
def _load_checkpoint(session, checkpoint_path):
    # Load the checkpoint and put all variables into the loading list;
    # we will exclude variables we do not wish to load and
    # initialize them instead.
    ckpt = tfv1.train.load_checkpoint(checkpoint_path)
    vars_in_ckpt = frozenset(ckpt.get_variable_to_shape_map().keys())
    load_vars = set(tfv1.global_variables())
    init_vars = set()

    # We explicitly allow the learning rate variable to be missing for backwards
    # compatibility with older checkpoints.
    lr_var = set(v for v in load_vars if v.op.name == 'learning_rate')
    if lr_var and ('learning_rate' not in vars_in_ckpt or FLAGS.force_initialize_learning_rate):
        assert len(lr_var) <= 1
        load_vars -= lr_var
        init_vars |= lr_var

    if FLAGS.load_cudnn:
        # Initialize training from a CuDNN RNN checkpoint
        # Identify the variables which we cannot load, and set them
        # for initialization
        missing_vars = set()
        for v in load_vars:
            if v.op.name not in vars_in_ckpt:
                log_warn('CUDNN variable not found: %s' % (v.op.name))
                missing_vars.add(v)
                init_vars.add(v)

        load_vars -= init_vars

        # Check that the only missing variables (i.e. those to be initialised)
        # are the Adam moment tensors; if they aren't, we have an issue.
        missing_var_names = [v.op.name for v in missing_vars]
        if any('Adam' not in v for v in missing_var_names):
            log_error('Tried to load a CuDNN RNN checkpoint but there were '
                      'more missing variables than just the Adam moment '
                      'tensors. Missing variables: {}'.format(missing_var_names))
            sys.exit(1)

    if FLAGS.drop_source_layers > 0:
        # This transfer learning approach requires supplying
        # the layers which we exclude from the source model.
        # Say we want to exclude all layers except for the first one,
        # then we are dropping five layers total, so: drop_source_layers=5
        # If we want to use all layers from the source model except
        # the last one, we use this: drop_source_layers=1
        if FLAGS.drop_source_layers >= 6:
            log_warn('The checkpoint only has 6 layers, but you are trying to drop '
                     'all of them or more than all of them. Continuing and '
                     'dropping only 5 layers.')
            FLAGS.drop_source_layers = 5

        dropped_layers = ['2', '3', 'lstm', '5', '6'][-1 * int(FLAGS.drop_source_layers):]
        # Initialize all variables needed for DS, but not loaded from ckpt
        for v in load_vars:
            if any(layer in v.op.name for layer in dropped_layers):
                init_vars.add(v)
        load_vars -= init_vars

    for v in sorted(load_vars, key=lambda v: v.op.name):
        log_info('Loading variable from checkpoint: %s' % (v.op.name))
        v.load(ckpt.get_tensor(v.op.name), session=session)

    for v in sorted(init_vars, key=lambda v: v.op.name):
        log_info('Initializing variable: %s' % (v.op.name))
        session.run(v.initializer)
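
_load_checkpoint partitions the graph's variables into a set that is loaded from the checkpoint and a set that is (re)initialized, based on what the checkpoint actually contains. The same inspection can be done directly with the checkpoint reader; a short sketch (the checkpoint path is a placeholder):

import tensorflow.compat.v1 as tfv1

ckpt = tfv1.train.load_checkpoint("/tmp/model.ckpt-1000")   # placeholder path
vars_in_ckpt = frozenset(ckpt.get_variable_to_shape_map().keys())

graph_vars = tfv1.global_variables()
loadable = [v for v in graph_vars if v.op.name in vars_in_ckpt]
missing = [v for v in graph_vars if v.op.name not in vars_in_ckpt]
print("loadable from checkpoint:", [v.op.name for v in loadable])
print("to be initialized:", [v.op.name for v in missing])
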
Beispiel #25
0
    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """The `model_fn` for TPUEstimator."""

        tf.logging.info("*** Features ***")
        for name in sorted(features.keys()):
            tf.logging.info("  name = %s, shape = %s" %
                            (name, features[name].shape))

        # MTF setup.
        graph = mtf.Graph()
        mesh_shape = mtf.convert_to_shape(FLAGS.mesh_shape)
        layout_rules = mtf.convert_to_layout_rules(FLAGS.layout)

        ctx = params["context"]
        num_hosts = ctx.num_hosts
        host_placement_fn = ctx.tpu_host_placement_function
        device_list = [host_placement_fn(host_id=t) for t in range(num_hosts)]
        tf.logging.info("device_list = %s" % device_list, )
        replica_cache_size = 300 * 1000000  # 300M per replica
        # Worker 0 caches all the TPU binaries.
        worker0_mem = replica_cache_size * ctx.num_replicas
        devices_memory_usage = [worker0_mem] + [0] * (num_hosts - 1)
        var_placer = mtf.utils.BalancedVariablePlacer(device_list,
                                                      devices_memory_usage)
        mesh_devices = [""] * mesh_shape.size
        physical_shape = list(ctx.device_assignment.topology.mesh_shape)
        logical_to_physical = mtf.simd_mesh_impl.auto_logical_to_physical_tpu(
            mesh_shape.to_integer_list, physical_shape)
        mesh_impl = mtf.simd_mesh_impl.SimdMeshImpl(
            mesh_shape,
            layout_rules,
            mesh_devices,
            ctx.device_assignment,
            logical_to_physical=logical_to_physical)
        mesh = mtf.Mesh(graph, "bert_mesh", var_placer)

        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        masked_lm_positions = features["masked_lm_positions"]
        masked_lm_ids = features["masked_lm_ids"]
        masked_lm_weights = features["masked_lm_weights"]
        next_sentence_labels = tf.squeeze(features["next_sentence_labels"], 1)

        batch_size = input_ids.get_shape()[0].value
        batch_dim = mtf.Dimension("batch", batch_size)

        seq_length = input_ids.get_shape()[1].value
        seq_dim = mtf.Dimension("seq", seq_length)
        max_predictions_per_seq = masked_lm_positions.get_shape()[1].value
        max_predictions_per_seq_dim = mtf.Dimension("max_pred_seq",
                                                    max_predictions_per_seq)

        mtf_input_ids = mtf.import_tf_tensor(mesh, input_ids,
                                             [batch_dim, seq_dim])
        mtf_input_mask = mtf.import_tf_tensor(mesh, input_mask,
                                              [batch_dim, seq_dim])
        mtf_segment_ids = mtf.import_tf_tensor(mesh, segment_ids,
                                               [batch_dim, seq_dim])
        mtf_masked_lm_positions = mtf.import_tf_tensor(
            mesh, masked_lm_positions,
            [batch_dim, max_predictions_per_seq_dim])
        mtf_masked_lm_ids = mtf.import_tf_tensor(
            mesh, masked_lm_ids, [batch_dim, max_predictions_per_seq_dim])

        mtf_masked_lm_weights = mtf.import_tf_tensor(
            mesh, masked_lm_weights, [batch_dim, max_predictions_per_seq_dim])
        mtf_next_sentence_labels = mtf.import_tf_tensor(
            mesh, next_sentence_labels, [batch_dim])

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        model = bert_lib.BertModel(config=bert_config,
                                   is_training=is_training,
                                   input_ids=mtf_input_ids,
                                   input_mask=mtf_input_mask,
                                   token_type_ids=mtf_segment_ids,
                                   layout=layout_rules,
                                   mesh_shape=mesh_shape)

        (masked_lm_loss, masked_lm_example_loss,
         masked_lm_logits) = model.get_masked_lm_output(
             mtf_masked_lm_positions, mtf_masked_lm_ids, mtf_masked_lm_weights)

        (next_sentence_loss, next_sentence_example_loss, next_sentence_logits
         ) = model.get_next_sentence_output(mtf_next_sentence_labels)

        extra_loss = model.get_extra_loss()

        total_loss = masked_lm_loss + next_sentence_loss
        total_loss = mtf.anonymize(total_loss)
        masked_lm_example_loss = mtf.anonymize(masked_lm_example_loss)
        masked_lm_logits = mtf.anonymize(masked_lm_logits)
        next_sentence_example_loss = mtf.anonymize(next_sentence_example_loss)
        next_sentence_logits = mtf.anonymize(next_sentence_logits)

        # TRAIN mode
        if mode == tf.estimator.ModeKeys.TRAIN:
            _, update_ops = optimization_lib.create_optimizer(
                total_loss + extra_loss,
                learning_rate,
                num_train_steps,
                num_warmup_steps,
                optimizer=FLAGS.optimizer,
                clip_gradients=FLAGS.clip_gradients)

        lowering = mtf.Lowering(graph, {mesh: mesh_impl})

        tf_loss = tf.to_float(lowering.export_to_tf_tensor(total_loss))

        if mode == tf.estimator.ModeKeys.TRAIN:
            global_step = tf.train.get_global_step()
            tf_update_ops = [
                lowering.lowered_operation(op) for op in update_ops
            ]
            tf_update_ops.append(tf.assign_add(global_step, 1))
            tf.logging.info("tf_update_ops: {}".format(tf_update_ops))
            train_op = tf.group(tf_update_ops)
        elif mode == tf.estimator.ModeKeys.EVAL:

            def metric_fn(masked_lm_example_loss, masked_lm_logits,
                          masked_lm_ids, masked_lm_weights,
                          next_sentence_example_loss, next_sentence_logits,
                          next_sentence_labels):
                """Computes the loss and accuracy of the model."""
                masked_lm_logits = tf.reshape(masked_lm_logits,
                                              [-1, masked_lm_logits.shape[-1]])
                masked_lm_predictions = tf.argmax(masked_lm_logits,
                                                  axis=-1,
                                                  output_type=tf.int32)
                masked_lm_example_loss = tf.reshape(masked_lm_example_loss,
                                                    [-1])
                masked_lm_ids = tf.reshape(masked_lm_ids, [-1])
                masked_lm_weights = tf.reshape(masked_lm_weights, [-1])
                masked_lm_accuracy = tf.metrics.accuracy(
                    labels=masked_lm_ids,
                    predictions=masked_lm_predictions,
                    weights=masked_lm_weights)
                masked_lm_mean_loss = tf.metrics.mean(
                    values=masked_lm_example_loss, weights=masked_lm_weights)

                next_sentence_logits = tf.reshape(
                    next_sentence_logits, [-1, next_sentence_logits.shape[-1]])
                next_sentence_predictions = tf.argmax(next_sentence_logits,
                                                      axis=-1,
                                                      output_type=tf.int32)
                next_sentence_labels = tf.reshape(next_sentence_labels, [-1])
                next_sentence_accuracy = tf.metrics.accuracy(
                    labels=next_sentence_labels,
                    predictions=next_sentence_predictions)
                next_sentence_mean_loss = tf.metrics.mean(
                    values=next_sentence_example_loss)

                return {
                    "masked_lm_accuracy": masked_lm_accuracy,
                    "masked_lm_loss": masked_lm_mean_loss,
                    "next_sentence_accuracy": next_sentence_accuracy,
                    "next_sentence_loss": next_sentence_mean_loss,
                }

            eval_metrics = (metric_fn, [
                lowering.export_to_tf_tensor(masked_lm_example_loss),
                lowering.export_to_tf_tensor(masked_lm_logits), masked_lm_ids,
                masked_lm_weights,
                lowering.export_to_tf_tensor(next_sentence_example_loss),
                lowering.export_to_tf_tensor(next_sentence_logits),
                next_sentence_labels
            ])

        with mtf.utils.outside_all_rewrites():
            # Copy master variables to slices. Must be called first.
            restore_hook = mtf.MtfRestoreHook(lowering)
            if mode == tf.estimator.ModeKeys.TRAIN:
                saver = tf.train.Saver(tf.global_variables(),
                                       sharded=True,
                                       max_to_keep=10,
                                       keep_checkpoint_every_n_hours=2,
                                       defer_build=False,
                                       save_relative_paths=True)
                tf.add_to_collection(tf.GraphKeys.SAVERS, saver)
                saver_listener = mtf.MtfCheckpointSaverListener(lowering)
                saver_hook = tf.train.CheckpointSaverHook(
                    FLAGS.output_dir,
                    save_steps=1000,
                    saver=saver,
                    listeners=[saver_listener])

                return tf.estimator.tpu.TPUEstimatorSpec(
                    tf.estimator.ModeKeys.TRAIN,
                    loss=tf_loss,
                    train_op=train_op,
                    training_hooks=[restore_hook, saver_hook])
            elif mode == tf.estimator.ModeKeys.EVAL:
                return tf.estimator.tpu.TPUEstimatorSpec(
                    tf.estimator.ModeKeys.EVAL,
                    evaluation_hooks=[restore_hook],
                    loss=tf_loss,
                    eval_metrics=eval_metrics)
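
The pattern above is characteristic of Mesh TensorFlow: every TF tensor entering the computation is wrapped with mtf.import_tf_tensor and annotated with named mtf.Dimension objects, which the layout rules later map onto the device mesh. A minimal sketch of that import step for a single feature, using a bare mesh without the variable placer and SIMD mesh implementation used above:

import mesh_tensorflow as mtf
import tensorflow.compat.v1 as tf

graph = mtf.Graph()
mesh = mtf.Mesh(graph, "sketch_mesh")

input_ids = tf.zeros([8, 128], dtype=tf.int32)      # [batch, seq], made-up values
batch_dim = mtf.Dimension("batch", 8)
seq_dim = mtf.Dimension("seq", 128)
mtf_input_ids = mtf.import_tf_tensor(mesh, input_ids, [batch_dim, seq_dim])
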
Beispiel #26
0
def _initialize_all_variables(session):
    init_vars = tfv1.global_variables()
    for v in init_vars:
        session.run(v.initializer)
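
Running each initializer individually, as above, lets the caller interleave logging or skip specific variables. When that is not needed, a one-shot equivalent is simply:

import tensorflow.compat.v1 as tfv1

def _initialize_all_variables(session):
    session.run(tfv1.global_variables_initializer())
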
Beispiel #27
0
def run_training():
    if not os.path.exists(FLAGS.model_dir):
        os.makedirs(FLAGS.model_dir)

    poems_vector, word_to_int, vocabularies = process_poems(FLAGS.file_path)
    batches_inputs, batches_outputs = generate_batch(FLAGS.batch_size,
                                                     poems_vector, word_to_int)

    input_data = tf.placeholder(tf.int32, [FLAGS.batch_size, None])
    output_targets = tf.placeholder(tf.int32, [FLAGS.batch_size, None])

    end_points = rnn_model(model='lstm',
                           input_data=input_data,
                           output_data=output_targets,
                           vocab_size=len(vocabularies),
                           rnn_size=128,
                           num_layers=2,
                           batch_size=64,
                           learning_rate=FLAGS.learning_rate)

    saver = tf.train.Saver(tf.global_variables())
    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())
    with tf.Session() as sess:
        # sess = tf_debug.LocalCLIDebugWrapperSession(sess=sess)
        # sess.add_tensor_filter("has_inf_or_nan", tf_debug.has_inf_or_nan)
        sess.run(init_op)

        start_epoch = 0
        checkpoint = tf.train.latest_checkpoint(FLAGS.model_dir)
        if checkpoint:
            saver.restore(sess, checkpoint)
            print("## restore from the checkpoint {0}".format(checkpoint))
            start_epoch += int(checkpoint.split('-')[-1])
        print('## start training...')
        try:
            n_chunk = len(poems_vector) // FLAGS.batch_size
            for epoch in range(start_epoch, FLAGS.epochs):
                n = 0
                for batch in range(n_chunk):
                    loss, _, _ = sess.run(
                        [
                            end_points['total_loss'], end_points['last_state'],
                            end_points['train_op']
                        ],
                        feed_dict={
                            input_data: batches_inputs[n],
                            output_targets: batches_outputs[n]
                        })
                    n += 1
                    print('Epoch: %d, batch: %d, training loss: %.6f' %
                          (epoch, batch, loss))
                if epoch % 6 == 0:
                    saver.save(sess,
                               os.path.join(FLAGS.model_dir,
                                            FLAGS.model_prefix),
                               global_step=epoch)
        except KeyboardInterrupt:
            print('## Interrupt manually, try saving checkpoint for now...')
            saver.save(sess,
                       os.path.join(FLAGS.model_dir, FLAGS.model_prefix),
                       global_step=epoch)
            print(
                '## Last epoch was saved; next run will start from epoch {}.'
                .format(epoch))
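
Resuming relies on the convention that saver.save(..., global_step=epoch) appends the step number to the checkpoint filename, so the epoch can be recovered by splitting on '-'. A tiny sketch of that bookkeeping with a made-up path:

checkpoint = "/tmp/poems/poems-12"   # as returned by tf.train.latest_checkpoint
start_epoch = 0
if checkpoint:
    start_epoch += int(checkpoint.split('-')[-1])
assert start_epoch == 12
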
def build_graph(bert_config,
                opts,
                iterations_per_step=1,
                is_training=True,
                feed_name=None):
    """Build the graph for training.

    Args:
        bert_config: configuration for the BERT model.
        opts: a dictionary containing all global options.
        iterations_per_step: number of iterations per step
        is_training (bool): if true return a graph with trainable variables.
        feed_name: name of the IPU infeed.

    Returns:
        a GraphOps containing a BERT graph and session prepared for inference or training.
    """
    train_graph = tf.Graph()
    with train_graph.as_default():

        placeholders = dict()
        placeholders['learning_rate'] = tf.placeholder(bert_config.dtype,
                                                       shape=[])
        learning_rate = placeholders['learning_rate']

        train_iterator = ipu.ipu_infeed_queue.IPUInfeedQueue(
            dataset.data(opts, is_training=is_training),
            feed_name=feed_name + "_in",
            replication_factor=opts['replicas'])

        outfeed_queue = ipu.ipu_outfeed_queue.IPUOutfeedQueue(
            feed_name=feed_name + "_out", replication_factor=opts['replicas'])

        with ipu.scopes.ipu_scope('/device:IPU:0'):
            train = training_step_with_infeeds_and_outfeeds(
                bert_config,
                train_iterator,
                outfeed_queue,
                opts,
                learning_rate,
                iterations_per_step,
                is_training=is_training)

        outfeed = outfeed_queue.dequeue()

        bert_logging.print_trainable_variables(opts['logs_path'])

        model_variables = tf.trainable_variables() + tf.get_collection(
            tf.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)
        model_and_optimiser_variables = tf.global_variables()

        restore = tf.train.Saver(
            var_list=model_and_optimiser_variables
            if opts['restore_optimiser_from_ckpt'] else model_variables)

        # We store two savers: one for the standard training and another one for the best checkpoint
        savers = {
            "train_saver":
            tf.train.Saver(var_list=model_variables if opts['ckpt_model_only']
                           else model_and_optimiser_variables,
                           name='latest',
                           max_to_keep=5),
            "best_saver":
            tf.train.Saver(var_list=model_variables if opts['ckpt_model_only']
                           else model_and_optimiser_variables,
                           name='best',
                           max_to_keep=1)
        }

        ipu.utils.move_variable_initialization_to_cpu()
        train_init = tf.global_variables_initializer()
        tvars = tf.trainable_variables()

    # Calculate number of IPUs required for pretraining pipeline.
    num_embedding_ipu = {
        'two_ipus': 2,
        'same_ipu': 1,
        'same_as_hidden_layers': 0
    }[opts['embeddings_placement']]

    num_hidden_layer_stages = len(bert_config.hidden_layers_per_stage)
    num_ipus_required = opts['replicas'] * next_power_of_two(
        num_hidden_layer_stages + num_embedding_ipu)

    # Configure the IPU options.
    ipu_options = get_ipu_config(
        fp_exceptions=opts["fp_exceptions"],
        stochastic_rounding=opts['stochastic_rounding'],
        xla_recompute=opts["xla_recompute"],
        available_memory_proportion=opts['available_memory_proportion'],
        disable_graph_outlining=opts["disable_graph_outlining"],
        num_ipus_required=num_ipus_required,
        max_cross_replica_sum_buffer_size=opts[
            'max_cross_replica_sum_buffer_size'],
        scheduler_selection=opts['scheduler'],
        compile_only=opts['compile_only'],
        partials_type=opts['partials_type'])
    ipu.utils.configure_ipu_system(ipu_options)

    train_sess = tf.Session(graph=train_graph, config=tf.ConfigProto())

    return GraphOps(train_graph, train_sess, train_init, [train], placeholders,
                    train_iterator, outfeed, savers, restore, tvars)
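
next_power_of_two is a helper from the surrounding project that is not shown here; IPU devices are typically acquired in power-of-two counts, so the per-replica stage count (hidden-layer stages plus embedding IPUs) is rounded up before being multiplied by the replication factor. A hypothetical implementation consistent with that use (name and exact behaviour assumed):

def next_power_of_two(x):
    # Smallest power of two that is >= x (assumed behaviour of the helper above).
    result = 1
    while result < x:
        result *= 2
    return result

# e.g. 3 hidden-layer stages + 1 embedding IPU -> 4 IPUs per replica.
assert next_power_of_two(4) == 4
assert next_power_of_two(5) == 8
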
Beispiel #29
0
    def test_simple_model_shapes(self):
        # Shape = [4, 2, 3].
        input_features = tf.constant([[[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]],
                                      [[7.0, 8.0, 9.0], [10.0, 11.0, 12.0]],
                                      [[13.0, 14.0, 15.0], [16.0, 17.0, 18.0]],
                                      [[19.0, 20.0, 21.0], [22.0, 23.0,
                                                            24.0]]])
        output_sizes = {'a': 8, 'b': [4, 3]}
        outputs, activations = models.simple_model(input_features,
                                                   output_sizes,
                                                   sequential_inputs=False,
                                                   is_training=True,
                                                   num_bottleneck_nodes=16)

        expected_global_variable_shapes = {
            'SimpleModel/InputFC/Linear/weight:0': ([3, 1024]),
            'SimpleModel/InputFC/Linear/bias:0': ([1024]),
            'SimpleModel/InputFC/BatchNorm/gamma:0': ([1024]),
            'SimpleModel/InputFC/BatchNorm/beta:0': ([1024]),
            'SimpleModel/InputFC/BatchNorm/moving_mean:0': ([1024]),
            'SimpleModel/InputFC/BatchNorm/moving_variance:0': ([1024]),
            'SimpleModel/FullyConnectedBlock_0/FC_0/Linear/weight:0':
            ([1024, 1024]),
            'SimpleModel/FullyConnectedBlock_0/FC_0/Linear/bias:0': ([1024]),
            'SimpleModel/FullyConnectedBlock_0/FC_0/BatchNorm/gamma:0':
            ([1024]),
            'SimpleModel/FullyConnectedBlock_0/FC_0/BatchNorm/beta:0': ([1024
                                                                         ]),
            'SimpleModel/FullyConnectedBlock_0/FC_0/BatchNorm/moving_mean:0':
            ([1024]),
            'SimpleModel/FullyConnectedBlock_0/FC_0/BatchNorm/moving_variance:0':
            ([1024]),
            'SimpleModel/FullyConnectedBlock_0/FC_1/Linear/weight:0':
            ([1024, 1024]),
            'SimpleModel/FullyConnectedBlock_0/FC_1/Linear/bias:0': ([1024]),
            'SimpleModel/FullyConnectedBlock_0/FC_1/BatchNorm/gamma:0':
            ([1024]),
            'SimpleModel/FullyConnectedBlock_0/FC_1/BatchNorm/beta:0': ([1024
                                                                         ]),
            'SimpleModel/FullyConnectedBlock_0/FC_1/BatchNorm/moving_mean:0':
            ([1024]),
            'SimpleModel/FullyConnectedBlock_0/FC_1/BatchNorm/moving_variance:0':
            ([1024]),
            'SimpleModel/FullyConnectedBlock_1/FC_0/Linear/weight:0':
            ([1024, 1024]),
            'SimpleModel/FullyConnectedBlock_1/FC_0/Linear/bias:0': ([1024]),
            'SimpleModel/FullyConnectedBlock_1/FC_0/BatchNorm/gamma:0':
            ([1024]),
            'SimpleModel/FullyConnectedBlock_1/FC_0/BatchNorm/beta:0': ([1024
                                                                         ]),
            'SimpleModel/FullyConnectedBlock_1/FC_0/BatchNorm/moving_mean:0':
            ([1024]),
            'SimpleModel/FullyConnectedBlock_1/FC_0/BatchNorm/moving_variance:0':
            ([1024]),
            'SimpleModel/FullyConnectedBlock_1/FC_1/Linear/weight:0': ([
                1024, 1024
            ]),
            'SimpleModel/FullyConnectedBlock_1/FC_1/Linear/bias:0': ([1024]),
            'SimpleModel/FullyConnectedBlock_1/FC_1/BatchNorm/gamma:0': ([
                1024
            ]),
            'SimpleModel/FullyConnectedBlock_1/FC_1/BatchNorm/beta:0': ([1024
                                                                         ]),
            'SimpleModel/FullyConnectedBlock_1/FC_1/BatchNorm/moving_mean:0':
            ([1024]),
            'SimpleModel/FullyConnectedBlock_1/FC_1/BatchNorm/moving_variance:0':
            ([1024]),
            'SimpleModel/BottleneckLogits/weight:0': ([1024, 16]),
            'SimpleModel/BottleneckLogits/bias:0': ([16]),
            'SimpleModel/OutputLogits/a/weight:0': ([16, 8]),
            'SimpleModel/OutputLogits/a/bias:0': ([8]),
            'SimpleModel/OutputLogits/b/weight:0': ([16, 12]),
            'SimpleModel/OutputLogits/b/bias:0': ([12]),
        }

        self.assertDictEqual(
            {var.name: var.shape.as_list()
             for var in tf.global_variables()},
            expected_global_variable_shapes)
        self.assertCountEqual(outputs.keys(), ['a', 'b'])
        self.assertAllEqual(outputs['a'].shape.as_list(), [4, 2, 8])
        self.assertAllEqual(outputs['b'].shape.as_list(), [4, 2, 4, 3])
        self.assertCountEqual(activations.keys(),
                              ['base_activations', 'bottleneck_activations'])
        self.assertAllEqual(activations['base_activations'].shape.as_list(),
                            [4, 2, 1024])
        self.assertAllEqual(
            activations['bottleneck_activations'].shape.as_list(), [4, 2, 16])
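
The expected shapes above suggest how simple_model interprets output_sizes: an integer value becomes a plain logits dimension ('a': 8 gives a [16, 8] weight and a [..., 8] output), while a list value is produced as a flat logits layer of prod(sizes) units and then reshaped ('b': [4, 3] gives a [16, 12] weight and a [..., 4, 3] output). A small sketch of that final dense-plus-reshape step in plain TF 1.x (not the library's own code):

import tensorflow.compat.v1 as tf

bottleneck = tf.zeros([4, 2, 16])                   # [batch, time, bottleneck]
flat_logits = tf.layers.dense(bottleneck, 4 * 3)    # -> [4, 2, 12]
output_b = tf.reshape(flat_logits, [4, 2, 4, 3])    # matches outputs['b'] above
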
Beispiel #30
0
    def test_simple_conv3d(self, threshold, expected_alive):
        # TODO(e1) remove when gamma is supported.
        # This test works if reshape is not set to be handled by
        # leaf_op_handler.LeafOpHandler() in op_handlers.py. However, this change
        # breaks other tests for reasons yet to be investigated.
        if SKIP_GAMMA_CONV3D:
            return

        def fused_batch_norm3d(*args, **kwargs):
            if args:
                inputs = args[0]
                args = args[1:]
            else:
                inputs = kwargs.pop('inputs')
            shape = inputs.shape
            # inputs is assumed to be NHWTC (T is for time).
            batch_size = shape[0]
            # space x time cube is reshaped to be 2D with dims:
            # H, W, T --> H, W * T
            # the idea is that batch norm only needs this to collect spatial stats.
            target_shape = [
                batch_size, shape[1], shape[2] * shape[3], shape[4]
            ]
            inputs = tf.reshape(inputs, target_shape, name='Reshape/to2d')
            normalized = slim.batch_norm(inputs, *args, **kwargs)
            return tf.reshape(normalized, shape, name='Reshape/to3d')

        gamma_val = [0.5, 0.3, 0.2]
        num_inputs = 4
        batch_size = 2
        video = tf.zeros([batch_size, 8, 8, 8, num_inputs])
        kernel = [5, 5, 5]
        num_outputs = 3
        net = slim.conv3d(video,
                          num_outputs,
                          kernel,
                          padding='SAME',
                          normalizer_fn=fused_batch_norm3d,
                          normalizer_params={
                              'scale': True,
                              'fused': True
                          },
                          scope='vconv1')
        self.assertLen(net.shape.as_list(), 5)
        shape = net.shape.as_list()
        # The number of applications is the number of elements in the [HWT] tensor.
        num_applications = shape[1] * shape[2] * shape[3]
        application_cost = num_inputs * kernel[0] * kernel[1] * kernel[2]
        name_to_var = {v.op.name: v for v in tf.global_variables()}
        flop_reg = flop_regularizer.GammaFlopsRegularizer(
            [
                net.op,
                tf.get_default_graph().get_operation_by_name('vconv1/Conv3D')
            ],
            threshold,
            force_group=[
                'vconv1/Reshape/to3d|vconv1/Reshape/to2d|vconv1/Conv3D'
            ])
        gamma = name_to_var['vconv1/BatchNorm/gamma']
        with self.session():
            tf.global_variables_initializer().run()
            gamma.assign(gamma_val).eval()

            self.assertAllClose(
                flop_reg.get_cost(),
                2 * expected_alive * num_applications * application_cost)

            raw_cost = 2 * num_outputs * num_applications * application_cost
            self.assertAllClose(flop_reg.get_regularization_term(),
                                raw_cost * np.mean(gamma_val))
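
The cost formula in this test can be checked by hand for the shapes used above: with SAME padding and stride 1 the [H, W, T] output grid is 8 x 8 x 8, so num_applications = 512; each application costs num_inputs * 5 * 5 * 5 = 500 multiply-accumulates; and the factor 2 counts the multiply and the add separately. A quick arithmetic sketch:

num_inputs = 4
kernel = [5, 5, 5]
num_outputs = 3
num_applications = 8 * 8 * 8                                         # H * W * T
application_cost = num_inputs * kernel[0] * kernel[1] * kernel[2]    # 500

raw_cost = 2 * num_outputs * num_applications * application_cost
assert raw_cost == 1536000

# With gamma = [0.5, 0.3, 0.2] the regularization term is raw_cost * mean(gamma) ~= 512000.
gamma = [0.5, 0.3, 0.2]
assert abs(raw_cost * sum(gamma) / len(gamma) - 512000.0) < 1e-3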