Example #1
 def remove(self, filename):
     if os.path.exists(filename):
         try:
             ret_code = subprocess.check_call(["rm", filename])
             logging.debug("delete local file " + filename)
         except subprocess.CalledProcessError as e:
             logging.error(e)
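A note on Example #1: shelling out to `rm` works but is heavier than needed for a local delete. A minimal in-process sketch of the same behavior, using only the standard library (a variant, not the example's own code):

 import logging
 import os

 def remove(filename):
     # os.remove deletes in-process and raises OSError (including
     # FileNotFoundError) on failure, mirroring the error branch above.
     try:
         os.remove(filename)
         logging.debug("delete local file %s", filename)
     except OSError as e:
         logging.error(e)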
Example #2
 def run_eval_step(self, sess, next_batch):
     """Runs one evaluation iteration.
     Returns a dictionary containing summaries, loss, global_step and (optionally) coverage loss.
     """
     articles, abstracts = sess.run(next_batch)
     for i in range(len(articles)):
         article = articles[i]
         abstract = abstracts[i]
         log.debug('eval i={}\n\narticle={}\n\nabstract={}'.format(
             i, repr(article), repr(abstract)))
     log.debug('eval len(articles)={}, len(abstracts)={}'.format(
         len(articles), len(abstracts)))
     batch = batcher.to_batch(articles=articles,
                              abstracts=abstracts,
                              vocab=self._vocab,
                              hps=self._hps,
                              pointer_gen=self._pointer_gen)
     feed_dict = self._make_feed_dict(batch)
     to_return = {
         'summaries': self._summaries,
         'loss': self._loss,
         'global_step': tf.train.get_global_step()
     }
     if self._coverage:
         to_return['coverage_loss'] = self._coverage_loss
     return sess.run(to_return, feed_dict)
Example #3
    def make_batch_predictions(self, video_id_batch_val, video_batch_val):
        """
        Make predictions for a batch of videos.
        Returns:
            Prediction probabilities as a Numpy array.
        """
        topk_video_ids, topk_labels = find_k_nearest_neighbors(
            video_id_batch_val,
            video_batch_val,
            self.train_data_pipeline,
            is_train=False,
            k=self.k)

        logging.debug('topk_video_ids: {}\ntopk_labels: {}'.format(
            topk_video_ids, topk_labels))

        # Per-class count of positive neighbors: shape (batch_size, num_classes).
        deltas = topk_labels.astype(np.int32).sum(axis=1)

        batch_predictions_prob = []
        for delta in deltas:
            positive_prob_numerator = np.multiply(
                self.labels_prior_prob,
                self.pos_prob_positive[delta, self.range_num_classes])
            negative_prob_numerator = np.multiply(
                1.0 - self.labels_prior_prob,
                self.pos_prob_negative[delta, self.range_num_classes])
            # predictions = positive_prob_numerator > negative_prob_numerator

            batch_predictions_prob.append(
                np.true_divide(
                    positive_prob_numerator,
                    positive_prob_numerator + negative_prob_numerator))

        return np.array(batch_predictions_prob, dtype=np.float32)
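For reference, the loop in Example #3 applies Bayes' rule per class: posterior = prior * P(delta | positive) / (prior * P(delta | positive) + (1 - prior) * P(delta | negative)). A tiny numeric sketch with made-up values (none of these numbers come from the example):

import numpy as np

prior = np.array([0.3, 0.1])   # stand-in for labels_prior_prob
pos = np.array([0.6, 0.2])     # stand-in for pos_prob_positive[delta, range_num_classes]
neg = np.array([0.1, 0.4])     # stand-in for pos_prob_negative[delta, range_num_classes]
posterior = (prior * pos) / (prior * pos + (1 - prior) * neg)
print(posterior)               # [0.72       0.05263158]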
Example #4
    def attack(self, imgs, targets):
        """
        Perform the EAD attack on the given instances for the given targets.

        If self.targeted is true, then the targets represent the target labels.
        If self.targeted is false, then targets are the original class labels.
        """

        batch_size = self.batch_size
        r = []
        for i in range(0, len(imgs) // batch_size):
            logging.debug("Running EAD attack on instance %s of %s",
                          i * batch_size, len(imgs))
            r.extend(
                self.attack_batch(imgs[i * batch_size:(i + 1) * batch_size],
                                  targets[i * batch_size:(i + 1) *
                                          batch_size]))
        if len(imgs) % batch_size != 0:
            last_elements = len(imgs) - (len(imgs) % batch_size)
            logging.debug("Running EAD attack on instance %s of %s",
                          last_elements, len(imgs))
            # Pad the final partial batch up to batch_size before attacking.
            temp_imgs = np.zeros((batch_size, ) + imgs.shape[1:])
            temp_targets = np.zeros((batch_size, ) + targets.shape[1:])
            temp_imgs[:(len(imgs) % batch_size)] = imgs[last_elements:]
            temp_targets[:(len(imgs) % batch_size)] = targets[last_elements:]
            temp_data = self.attack_batch(temp_imgs, temp_targets)
            # list.extend takes a single iterable; keep only the real examples.
            r.extend(temp_data[:(len(imgs) % batch_size)])
        return np.array(r)
Example #5
    def input_fn(batch_size):
        debug("input_fn images shape %s" % (images.shape, ))
        debug("input_fn labels shape %s" % (labels.shape, ))
        dataset = tf.data.Dataset.from_tensor_slices((images, labels))

        SHUFFLE_SIZE = 5000
        dataset = dataset.shuffle(SHUFFLE_SIZE).repeat().batch(batch_size)
        dataset = dataset.prefetch(None)

        return dataset
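A hedged usage note for Example #5: tf.estimator expects a zero-argument input_fn, so the batch size is usually bound with a lambda. The estimator and model_fn below are assumptions for illustration, not part of the example:

estimator = tf.estimator.Estimator(model_fn=my_model_fn)  # my_model_fn is hypothetical
estimator.train(input_fn=lambda: input_fn(batch_size=64), steps=1000)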
Example #6
    def close(self):
        self.put_data(None)
        self.put_msg('Done')
        self.__blocked = True

        filename = self.get_data(timeout=2)
        while filename:
            self.remove(filename)
            filename = self.get_data(timeout=2)
        logging.debug("clean data queue && close")
        self.__closed = True
Example #7
  def attack(self, imgs, targets):
    """
    Perform the L_2 attack on the given instances for the given targets.
    If self.targeted is true, then the targets represent the target labels.
    If self.targeted is false, then targets are the original class labels.
    """

    r = []
    for i in range(0, len(imgs), self.batch_size):
      logging.debug("Running CWL2 attack on instance %s of %s", i, len(imgs))
      adv = self.attack_batch(
        imgs[i:i + self.batch_size], targets[i:i + self.batch_size])
      r.extend(adv)
    return np.array(r)
Example #8
 def process(self, element):
     labels = {
         constants.SUBDIR_POSITIVE: constants.POSITIVE_SENTIMENT_LABEL,
         constants.SUBDIR_NEGATIVE: constants.NEGATIVE_SENTIMENT_LABEL
     }
     found_labels = [labels[l] for l in labels if l in element]
     if len(found_labels) > 1:
         raise ValueError('Incompatible path: `{}`.'.format(element))
     if found_labels:
         with gfile.GFile(element, 'r') as single_file:
             for line in single_file:
                 yield {
                     constants.LABELS: found_labels[0],
                     constants.REVIEW: line
                 }
     else:
         logging.debug('Label not found for file: `%s`.', element)
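The yield-based process(element) in Example #8 matches the Apache Beam DoFn interface. A minimal wiring sketch, where ReadSentimentFilesDoFn is a hypothetical name for the enclosing class and the paths are made up:

import apache_beam as beam

with beam.Pipeline() as p:
    reviews = (p
               | beam.Create(['reviews/pos/a.txt', 'reviews/neg/b.txt'])
               | beam.ParDo(ReadSentimentFilesDoFn()))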
Example #9
 def step_fn(step_context):
     articles, abstracts = step_context.session.run(next_batch)
     for i in range(len(articles)):
         article = articles[i]
         abstract = abstracts[i]
         log.debug('train i={}\n\narticle={}\n\nabstract={}'.format(
             i, repr(article), repr(abstract)))
     batch = batcher.to_batch(articles=articles,
                              abstracts=abstracts,
                              vocab=self._vocab,
                              hps=self._hps,
                              pointer_gen=self._pointer_gen)
     feed_dict = self._make_feed_dict(batch)
     to_return = {
         'train_op': self._train_op,
         'summaries': self._summaries,
         'loss': self._loss,
         'global_step': tf.train.get_global_step()
     }
     if self._coverage:
         to_return['coverage_loss'] = self._coverage_loss
     return step_context.run_with_hooks(to_return, feed_dict)
Example #10
 def enqueue(self, sess):
     # first remove used local files
     #if len(self._curr_files) > 1:
     #    for f in self._curr_files:
     #        self._file_queue.remove(f)
     self._curr_files = []
     # enqueue
     if self._finished >= self.num_downloader or not self._downloading:
         return 1
     while len(self._curr_files) < self.min_cache:
         f = self._file_queue.get_data()
         if f is None:
             self._finished += 1
             continue
         logging.debug('Got file {}'.format(f))
         self._curr_files.append(f)
         if self._finished >= self.num_downloader or not self._downloading:
             break
     outputs = sess.run([self._enqueue_op, self._queue_size_op],
                        feed_dict={self._input_files: self._curr_files})
     logging.debug('Output queue size: {}'.format(outputs[1]))
     self._enqueued += self.min_cache
     return 0
Example #11
def __run_training(model, data_dir, coverage, debug, conf, hps):
    """Repeatedly runs training iterations, logging loss to screen and writing summaries"""
    log.debug("starting run_training")
    checkpoint_dir = os.path.join(conf.model_dir, 'train')
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    with model.build_graph().as_default():
        summary_writer = tf.summary.FileWriterCache.get(checkpoint_dir)
        ds = etl.dataset(data_dir, hps.batch_size, shuffle=True, repeat=True)
        iterator = ds.make_one_shot_iterator()
        ds_init_op = iterator.make_initializer(ds)
        next_batch = iterator.get_next()
        with __session(checkpoint_dir=checkpoint_dir,
                       debug=debug,
                       conf=conf,
                       local_init_ops=[ds_init_op]) as sess:
            step = 0
            # repeats until max_step is reached
            while not sess.should_stop() and step <= hps.max_step:
                t0 = time.time()
                results = model.run_train_step(sess, next_batch)
                t1 = time.time()
                loss = results['loss']
                if not np.isfinite(loss):
                    raise Exception("Loss is not finite. Stopping.")
                step = results[
                    'global_step']  # we need this to update our running average loss
                msg = 'train step={}, loss={:.4f}, secs={}'.format(
                    step, loss, int(t1 - t0))
                if coverage:
                    coverage_loss = results['coverage_loss']
                    msg += ", coverage_loss={:.4f}".format(coverage_loss)
                log.info(msg)
                # get the summaries and iteration number so we can write summaries to tensorboard
                summaries = results['summaries']
                summary_writer.add_summary(summaries, step)
    log.info('training done')
Example #12
  def attack_batch(self, imgs, labs):
    """
    Run the attack on a batch of instances and labels.
    """

    def compare(x, y):
      if not isinstance(x, (float, int, np.int64)):
        x = np.copy(x)
        if self.y_target:
          x[y] -= self.confidence
        else:
          x[y] += self.confidence
        x = np.argmax(x)
      if self.y_target:
        return x == y
      else:
        return x != y

    batch_size = self.batch_size

    oimgs = np.clip(imgs, self.clip_min, self.clip_max)

    # re-scale instances to be within range [0, 1]
    imgs = (imgs - self.clip_min) / (self.clip_max - self.clip_min)
    imgs = np.clip(imgs, 0, 1)
    # now convert to [-1, 1]
    imgs = (imgs * 2) - 1
    # convert to tanh-space
    imgs = np.arctanh(imgs * .999999)

    # set the lower and upper bounds accordingly
    lower_bound = np.zeros(batch_size)
    CONST = np.ones(batch_size) * self.initial_const
    upper_bound = np.ones(batch_size) * 1e10

    # placeholders for the best l2, score, and instance attack found so far
    o_bestl2 = [1e10] * batch_size
    o_bestscore = [-1] * batch_size
    o_bestattack = np.copy(oimgs)

    for outer_step in range(self.binary_search_steps):
      # completely reset adam's internal state.
      self.sess.run(self.init)
      batch = imgs[:batch_size]
      batchlab = labs[:batch_size]

      bestl2 = [1e10] * batch_size
      bestscore = [-1] * batch_size
      logging.debug("  Binary search step %s of %s",
                    outer_step, self.binary_search_steps)

      # The last iteration (if we run many steps) repeats the search once.
      if self.repeat and outer_step == self.binary_search_steps - 1:
        CONST = upper_bound

      # set the variables so that we don't have to send them over again
      self.sess.run(
          self.setup, {
              self.assign_timg: batch,
              self.assign_tlab: batchlab,
              self.assign_const: CONST
          })

      prev = 1e6
      for iteration in range(self.max_iterations):
        # perform the attack
        _, l, l2s, scores, nimg = self.sess.run([
            self.train, self.loss, self.l2dist, self.output,
            self.newimg
        ])

        if iteration % ((self.max_iterations // 10) or 1) == 0:
          logging.debug(("    Iteration {} of {}: loss={:.3g} " +
                         "l2={:.3g} f={:.3g}").format(
                             iteration, self.max_iterations, l,
                             np.mean(l2s), np.mean(scores)))

        # abort the search early if we're getting nowhere.
        if self.abort_early and \
           iteration % ((self.max_iterations // 10) or 1) == 0:
          if l > prev * .9999:
            msg = "    Failed to make progress; stop early"
            logging.debug(msg)
            break
          prev = l

        # adjust the best result found so far
        for e, (l2, sc, ii) in enumerate(zip(l2s, scores, nimg)):
          lab = np.argmax(batchlab[e])
          if l2 < bestl2[e] and compare(sc, lab):
            bestl2[e] = l2
            bestscore[e] = np.argmax(sc)
          if l2 < o_bestl2[e] and compare(sc, lab):
            o_bestl2[e] = l2
            o_bestscore[e] = np.argmax(sc)
            o_bestattack[e] = ii

      # adjust the constant as needed
      for e in range(batch_size):
        if compare(bestscore[e], np.argmax(batchlab[e])) and \
           bestscore[e] != -1:
          # success, divide const by two
          upper_bound[e] = min(upper_bound[e], CONST[e])
          if upper_bound[e] < 1e9:
            CONST[e] = (lower_bound[e] + upper_bound[e]) / 2
        else:
          # failure, either multiply by 10 if no solution found yet
          #          or do binary search with the known upper bound
          lower_bound[e] = max(lower_bound[e], CONST[e])
          if upper_bound[e] < 1e9:
            CONST[e] = (lower_bound[e] + upper_bound[e]) / 2
          else:
            CONST[e] *= 10
      logging.debug("  Successfully generated adversarial examples " +
                    "on {} of {} instances.".format(
                        sum(upper_bound < 1e9), batch_size))
      o_bestl2 = np.array(o_bestl2)
      mean = np.mean(np.sqrt(o_bestl2[o_bestl2 < 1e9]))
      logging.debug("   Mean successful distortion: {:.4g}".format(mean))

    # return the best solution found
    logging.info("  Successfully generated adversarial examples " +
                  "on {} of {} instances.".format(
                      sum(upper_bound < 1e9), batch_size))
    o_bestl2 = np.array(o_bestl2)
    mean = np.mean(np.sqrt(o_bestl2[o_bestl2 < 1e9]))
    logging.info("   Mean successful distortion: {:.4g}".format(mean))
    return o_bestattack
Example #13
def compute_prior_prob(data_pipeline, smooth_para=1.0):
    """
    Compute prior probabilities for future use in ml-knn.
    :param data_pipeline:
    :param smooth_para:
    :return: (sum of one-hot labels per class, total number of videos processed, prior probabilities)
    """
    reader = data_pipeline.reader
    num_classes = reader.num_classes

    with tf.Graph().as_default() as g:
        sum_labels_onehot = tf.Variable(tf.zeros([num_classes]))
        total_num_videos = tf.Variable(0, dtype=tf.float32)

        # Generate example queue. Traverse the queue to traverse the data set.
        video_id_batch, video_batch, video_labels_batch, num_frames_batch = get_input_data_tensors(
            data_pipeline, num_epochs=1, name_scope='prior_prob_input')

        sum_labels_onehot_op = sum_labels_onehot.assign_add(
            tf.reduce_sum(tf.cast(video_labels_batch, tf.float32), axis=0))
        accum_num_videos_op = total_num_videos.assign_add(
            tf.cast(tf.shape(video_labels_batch)[0], tf.float32))

        with tf.control_dependencies(
            [sum_labels_onehot_op, accum_num_videos_op]):
            accum_non_op = tf.no_op()

        init_op = tf.group(tf.global_variables_initializer(),
                           tf.local_variables_initializer())

    with tf.Session(graph=g) as sess:
        sess.run(init_op)

        # Start input enqueue threads.
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        try:
            while not coord.should_stop():
                # sum video labels
                sess.run(accum_non_op)

        except tf.errors.OutOfRangeError:
            logging.info('Done with the whole data set.')
        finally:
            # When done, ask the threads to stop.
            coord.request_stop()

        # Wait for threads to finish.
        coord.join(threads)

        sum_labels_val, total_num_videos_val = sess.run(
            [sum_labels_onehot, total_num_videos])
        sess.close()

    labels_prior_prob_val = (smooth_para + sum_labels_val) / (
        smooth_para * 2 + total_num_videos_val)

    logging.debug('sum_labels_val: {}\n accum_num_videos_val: {}'.format(
        sum_labels_val, total_num_videos_val))
    logging.debug('compute_labels_prob: {}'.format(labels_prior_prob_val))

    return sum_labels_val, total_num_videos_val, labels_prior_prob_val
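The prior returned by Example #13 is Laplace-smoothed: (smooth_para + count) / (2 * smooth_para + total). A quick numeric check with assumed counts:

import numpy as np

smooth_para, total_num_videos = 1.0, 100.0
sum_labels = np.array([30.0, 0.0])  # hypothetical per-class counts
prior = (smooth_para + sum_labels) / (smooth_para * 2 + total_num_videos)
print(prior)  # [0.30392157 0.00980392]; unseen classes keep a nonzero prior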
Example #14
    def transform(self, test_data_pipeline, out_file_location, top_k=20):
        test_graph = tf.Graph()
        with test_graph.as_default():
            video_id_batch, video_batch, labels_batch, num_frames_batch = (
                get_input_data_tensors(test_data_pipeline, shuffle=False, num_epochs=1, name_scope='test_input'))

            init_op = tf.group(tf.global_variables_initializer(), tf.local_variables_initializer())

        # Run test graph to get video batch and feed video batch to pre_trained_graph to get predictions.
        test_sess = tf.Session(graph=test_graph)
        with gfile.Open(out_file_location, "w+") as out_file:
            test_sess.run(init_op)

            # Be careful not to get blocked by the queue.
            # Start input enqueue threads.
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=test_sess, coord=coord)

            processing_count, num_examples_processed = 0, 0
            out_file.write("VideoId,LabelConfidencePairs\n")

            try:
                while not coord.should_stop():
                    # Run training steps or whatever.
                    start_time = time.time()
                    video_id_batch_val, video_batch_val = test_sess.run([video_id_batch, video_batch])
                    logging.debug('video_id_batch_val: {}\nvideo_batch_val: {}'.format(
                        video_id_batch_val, video_batch_val))

                    batch_predictions_prob_list = []
                    for sess, video_input_batch, pred_prob, phase_train_pl in zip(
                            self.sess_list, self.video_input_batch_list,
                            self.pred_prob_list, self.phase_train_pl_list):
                        feature_shape = video_input_batch.get_shape()[-1]
                        # logging.info('Feature shape is {}.'.format(feature_shape))
                        if feature_shape == 128:
                            _video_batch = video_batch_val[:, -128:]
                        elif feature_shape == 1024:
                            _video_batch = video_batch_val[:, :1024]
                        else:
                            _video_batch = video_batch_val

                        batch_predictions_prob = sess.run(pred_prob, feed_dict=dict(
                            {video_input_batch: _video_batch}, **phase_train_pl
                        ))
                        batch_predictions_prob_list.append(batch_predictions_prob)

                    batch_predictions_mean_prob = np.mean(np.stack(batch_predictions_prob_list, axis=0), axis=0)
                    # Write batch predictions to files.
                    for line in format_lines(video_id_batch_val, batch_predictions_mean_prob, top_k):
                        out_file.write(line)
                    out_file.flush()

                    now = time.time()
                    processing_count += 1
                    num_examples_processed += video_id_batch_val.shape[0]
                    print('Batch processing step {}, elapsed {} s, processed {} examples in total'.format(
                        processing_count, now - start_time, num_examples_processed))

            except tf.errors.OutOfRangeError:
                logging.info('Done with inference. The predictions were written to {}'.format(out_file_location))
            finally:
                # When done, ask the threads to stop.
                coord.request_stop()

            # Wait for threads to finish.
            coord.join(threads)

            test_sess.close()
            out_file.close()
Example #15
    def fit(self,
            train_data_pipeline,
            start_new_model=False,
            tr_data_fn=None,
            tr_data_paras=None,
            validate_set=None,
            validate_fn=None,
            bootstrap=False,
            init_learning_rate=0.01,
            decay_steps=40000,
            decay_rate=0.95,
            epochs=None,
            l1_reg_rate=None,
            l2_reg_rate=0.01,
            pos_weights=None,
            initial_weights=None,
            initial_biases=None):
        """
        Logistic regression fit function.
        Args:
            train_data_pipeline: A namedtuple consisting of reader, data_pattern, batch_size and num_readers.
            start_new_model: If True, start a new model instead of restoring from existing checkpoints.
            tr_data_fn: a function that transforms input data.
            tr_data_paras: Other parameters should be passed to tr_data_fn. A dictionary.
            validate_set: If not None, check validation loss regularly. Else, ignored.
            validate_fn: The function to check the performance of learned model parameters on validate set.
            bootstrap: If True, sampling training examples with replacement by differential weighting.
            init_learning_rate: Initial learning rate of the decayed learning rate schedule.
            decay_steps: Number of training steps between learning rate decays.
            decay_rate: Multiplicative decay factor for the learning rate.
            epochs: Maximal epochs to use.
            l1_reg_rate: If None, do not impose l1 regularization.
            l2_reg_rate: l2 regularization rate.
            pos_weights: For imbalanced binary classes. Here, num_pos << num_neg, the weights should be > 1.0.
                If None, treated as 1.0 for all binary classifiers.
            initial_weights: If not None, the weights will be initialized with it.
            initial_biases: If not None, the biases will be initialized with it.
        Returns: None.
        """
        reader = train_data_pipeline.reader
        batch_size = train_data_pipeline.batch_size
        num_classes = reader.num_classes
        feature_names = reader.feature_names
        feature_sizes = reader.feature_sizes
        logging.info(
            'Logistic regression uses {} features with dims {}.'.format(
                feature_names, feature_sizes))

        raw_feature_size = sum(feature_sizes)

        self.train_data_pipeline = train_data_pipeline
        self.raw_feature_size = raw_feature_size
        self.feature_size = raw_feature_size
        self.num_classes = num_classes
        self.batch_size = batch_size
        self.tr_data_fn = tr_data_fn
        self.tr_data_paras = tr_data_paras
        self.bootstrap = bootstrap
        self.init_learning_rate = init_learning_rate
        self.decay_steps = decay_steps
        self.decay_rate = decay_rate
        self.epochs = epochs
        self.l1_reg_rate = l1_reg_rate
        self.l2_reg_rate = l2_reg_rate
        self.pos_weights = pos_weights
        self.initial_weights = initial_weights
        self.initial_biases = initial_biases

        # Check extra data transform function arguments.
        # If transform changes the features size, change it.
        if self.tr_data_fn is not None:
            if self.tr_data_paras is None:
                self.tr_data_paras = dict()
            else:
                if ('reshape' in self.tr_data_paras) and (
                        self.tr_data_paras['reshape'] is True):
                    self.feature_size = self.tr_data_paras['size']
                    logging.warn(
                        'Data transform changes the feature size to {}.'.
                        format(self.feature_size))

            logging.debug('Data transform arguments are {}.'.format(
                self.tr_data_paras))
        else:
            self.tr_data_paras = dict()

        start_new_model = start_new_model or (not tf.gfile.Exists(self.logdir))

        # This is NECESSARY to avoid contaminating default graph.
        # Alternatively, we can define a member graph variable. When building a new graph or
        # restoring a graph, wrap the code into a similar contextmanager.
        self.graph = tf.Graph()
        with self.graph.as_default():
            if start_new_model:
                logging.info('Starting a new model...')
                # Start new model, delete existing checkpoints.
                if tf.gfile.Exists(self.logdir):
                    try:
                        tf.gfile.DeleteRecursively(self.logdir)
                    except tf.errors.OpError:
                        logging.error('Failed to delete dir {}.'.format(
                            self.logdir))
                    else:
                        logging.info(
                            'Succeeded to delete train dir {}.'.format(
                                self.logdir))
                else:
                    # Do nothing.
                    pass

                # Build graph, namely building a graph and initialize member variables associated with graph.
                self.saver = self._build_graph()
            else:
                self.saver = self._restore_graph()

            # After either building a graph or restoring a graph, graph is CONSTRUCTED successfully.
            # Get collections to be used in training.
            self.global_step = tf.get_collection('global_step')[0]
            self.init_op = tf.get_collection('init_op')[0]
            self.train_op = tf.get_collection('train_op')[0]
            self.summary_op = tf.get_collection('summary_op')[0]
            self.raw_features_batch = tf.get_collection(
                'raw_features_batch')[0]
            self.labels_batch = tf.get_collection('labels_batch')[0]
            self.loss = tf.get_collection('loss')[0]
            self.pred_prob = tf.get_collection('predictions')[0]

        if self._check_graph_initialized():
            logging.info('Succeeded to initialize logistic regression Graph.')
        else:
            logging.error('Failed to initialize logistic regression Graph.')

        # Start or restore training.
        # To avoid summary causing memory usage peak, manually save summaries.
        sv = tf.train.Supervisor(graph=self.graph,
                                 init_op=self.init_op,
                                 logdir=self.logdir,
                                 global_step=self.global_step,
                                 summary_op=None,
                                 save_model_secs=600,
                                 saver=self.saver)

        with sv.managed_session() as sess:
            logging.info("Entering training loop...")
            for step in range(self.max_train_steps):
                if sv.should_stop():
                    # Save the final model and break.
                    self.saver.save(sess,
                                    save_path='{}_{}'.format(
                                        sv.save_path, 'final'))
                    break

                if step % 500 == 0:
                    if validate_fn is not None:
                        _, summary, train_pred_prob_batch, train_labels_batch, global_step_val = sess.run(
                            [
                                self.train_op, self.summary_op, self.pred_prob,
                                self.labels_batch, self.global_step
                            ])

                        # Evaluate on train data.
                        train_per = validate_fn(
                            predictions=train_pred_prob_batch,
                            labels=train_labels_batch)
                        sv.summary_writer.add_summary(
                            MakeSummary(
                                'train/{}'.format(validate_fn.func_name),
                                train_per), global_step_val)
                        logging.info('Step {}, train {}: {}.'.format(
                            global_step_val, validate_fn.func_name, train_per))
                    else:
                        _, summary, global_step_val = sess.run(
                            [self.train_op, self.summary_op, self.global_step])

                    # Add train summary.
                    sv.summary_computed(sess,
                                        summary,
                                        global_step=global_step_val)

                    # Compute validate loss and performance (validate_fn).
                    if validate_set is not None:
                        validate_data, validate_labels = validate_set

                        # Compute validation loss.
                        num_validate_videos = validate_data.shape[0]
                        split_indices = np.linspace(
                            0,
                            num_validate_videos + 1,
                            num=max(
                                num_validate_videos // (2 * batch_size) + 1,
                                2),
                            dtype=np.int32)

                        validate_loss_vals, predictions = [], []
                        for i in range(len(split_indices) - 1):
                            start_ind = split_indices[i]
                            end_ind = split_indices[i + 1]

                            if validate_fn is not None:
                                ith_validate_loss_val, ith_predictions = sess.run(
                                    [self.loss, self.pred_prob],
                                    feed_dict={
                                        self.raw_features_batch:
                                        validate_data[start_ind:end_ind],
                                        self.labels_batch:
                                        validate_labels[start_ind:end_ind]
                                    })

                                validate_loss_vals.append(
                                    ith_validate_loss_val *
                                    (end_ind - start_ind))
                                predictions.append(ith_predictions)
                            else:
                                ith_validate_loss_val = sess.run(
                                    self.loss,
                                    feed_dict={
                                        self.raw_features_batch:
                                        validate_data[start_ind:end_ind],
                                        self.labels_batch:
                                        validate_labels[start_ind:end_ind]
                                    })

                                validate_loss_vals.append(
                                    ith_validate_loss_val *
                                    (end_ind - start_ind))

                        validate_loss_val = sum(
                            validate_loss_vals) / num_validate_videos
                        # Add validate summary.
                        sv.summary_writer.add_summary(
                            MakeSummary('validate/xentropy',
                                        validate_loss_val), global_step_val)

                        if validate_fn is not None:
                            validate_per = validate_fn(
                                predictions=np.concatenate(predictions,
                                                           axis=0),
                                labels=validate_labels)

                            sv.summary_writer.add_summary(
                                MakeSummary(
                                    'validate/{}'.format(
                                        validate_fn.func_name), validate_per),
                                global_step_val)
                            logging.info('Step {}, validate {}: {}.'.format(
                                global_step_val, validate_fn.func_name,
                                validate_per))

                elif step % 200 == 0:
                    _, summary, global_step_val = sess.run(
                        [self.train_op, self.summary_op, self.global_step])
                    sv.summary_computed(sess,
                                        summary,
                                        global_step=global_step_val)
                else:
                    sess.run(self.train_op)

            logging.info("Exited training loop.")

        # Session will close automatically when with clause exits.
        # sess.close()
        sv.stop()
Example #16
    def fit(self, max_iter=100, tol=0.01):
        """
        This function works as sk-learn estimator fit.
        :param max_iter: 
        :param tol: Percentage not improved one iteration, stop iteration.
        :return: Update current centers and current objective function value (member variables).
        """
        for iter_count in range(max_iter):
            start_time = time.time()
            new_centers, new_mean_dist, new_per_clu_mean_dist = self.kmeans_iter(
            )
            print('The {}-th iteration took {} s.'.format(
                iter_count + 1,
                time.time() - start_time))

            # There are empty centers (clusters) being removed.
            need_rebuild_graph = new_centers.shape[
                0] != self.current_centers.shape[0]

            # Update current centers and mean distance per cluster.
            # Normalize current centers if distance metric is cosine.
            if self.metric == 'cosine':
                self.current_centers = new_centers / np.clip(
                    np.linalg.norm(new_centers, axis=-1, keepdims=True), 1e-6,
                    np.PINF)
            else:
                self.current_centers = new_centers

            self.per_clu_mean_dist = new_per_clu_mean_dist

            # Converged, break!
            if not np.isinf(self.mean_dist) and np.abs(
                    self.mean_dist - new_mean_dist) / self.mean_dist < tol:
                # Update current objective function value.
                self.mean_dist = new_mean_dist
                logging.info(
                    'Done k-means clustering. Final centers have shape {}. Final mean dist is {}.'
                    .format(self.current_centers.shape, self.mean_dist))
                break
            else:
                # Update current objective function value.
                self.mean_dist = new_mean_dist

            if need_rebuild_graph:
                # Re-build graph using updated current centers.
                self.build_iter_graph()
                initialize_success = self.check_graph_initialized()
                if initialize_success:
                    logging.info(
                        'Succeeded re-initializing a Tensorflow graph to perform k-means.'
                    )
                else:
                    raise ValueError(
                        'Failed to re-initialize a Tensorflow Graph to perform k-means.'
                    )

            logging.debug('new_centers: {}'.format(self.current_centers))
            logging.info('new_centers shape: {}'.format(
                self.current_centers.shape))
            logging.info('New mean point-center distance: {}'.format(
                self.mean_dist))
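A hedged usage sketch for Example #16: with tol=0.01, fit stops once |mean_dist - new_mean_dist| / mean_dist < 0.01. The class name below is an assumption standing in for whatever class defines this fit method:

km = TfKMeans(metric='cosine')  # hypothetical constructor
km.fit(max_iter=100, tol=0.01)  # stop at <1% relative improvement
print(km.current_centers.shape, km.mean_dist)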
Example #17
(train_images, train_labels), (test_images, test_labels) = \
    tf.keras.datasets.cifar10.load_data()

img_width, img_height, img_channels = 32, 32, 3
label_dimensions = 10

train_images = np.asarray(train_images, dtype=np.float32) / 255
test_images = np.asarray(test_images, dtype=np.float32) / 255

train_images_mean = np.mean(train_images, axis=0)
train_images -= train_images_mean
test_images -= train_images_mean

train_images = train_images.reshape((-1, img_width, img_height, img_channels))
test_images = test_images.reshape((-1, img_width, img_height, img_channels))

debug("shape train_images %s" % (train_images.shape, ))
debug("shape train_labels %s" % (train_labels.shape, ))
debug("shape test_images %s" % (test_images.shape, ))
debug("shape test_labels %s" % (test_labels.shape, ))

train_labels = tf.keras.utils.to_categorical(train_labels, label_dimensions)
test_labels = tf.keras.utils.to_categorical(test_labels, label_dimensions)

train_labels = train_labels.astype(np.float32)
test_labels = test_labels.astype(np.float32)

train_labels = np.asarray(train_labels).astype('int').reshape(
    (-1, label_dimensions))
test_labels = np.asarray(test_labels).astype('int').reshape(
    (-1, label_dimensions))
Example #18
import tempfile


def fetch_hdfs_data(paths, data_msg_q, retry_times=3, data_dir=None):
    msg = data_msg_q.get_msg()
    if data_dir and not os.path.isdir(data_dir):
        try:
            data_dir = tempfile.mkdtemp(prefix='data_',
                                        suffix='_tmp',
                                        dir='./')
        except Exception as e:
            logging.error(e)
            data_dir = './'

    while True:
        logging.debug("receive msg: " + msg)
        if msg == 'reset':
            for data_path in paths:
                filename = os.path.split(data_path)[1]
                filepath = os.path.join(data_dir, filename)
                if os.path.exists(filepath):
                    logging.debug(filename + " already exists locally")
                    data_msg_q.put_data(filepath)
                    logging.debug("reuse local data " + filepath + " done")
                    continue
                count, ret_code = 0, -1
                command = ["hadoop", "fs", "-get", data_path, data_dir]
                while count < retry_times and ret_code != 0:
                    try:
                        ret_code = subprocess.check_call(command)
                    except subprocess.CalledProcessError as e:
                        logging.error(e)
                        count += 1
Example #19
  def attack(self, imgs):
    """
    Return a tensor that constructs adversarial examples for the given
    input. Generate uses tf.py_func in order to operate over tensors.
    :param x: A tensor with the inputs.
    :param kwargs: See `parse_params`
    """
    imgs = tf.cast(imgs, tf.float32)
    preds = self.fn_logits(imgs)
    preds_max = tf.reduce_max(preds, 1, keepdims=True)
    original_predictions = tf.to_float(tf.equal(preds, preds_max))
    labs = tf.stop_gradient(original_predictions)
    repeat = self.binary_search_steps >= 10
    shape = tf.shape(imgs)

    # # the variable we're going to optimize over
    # modifier = tfe.Variable(tf.zeros(shape, dtype=tf_dtype))

    def compute_newimage(imgs, modifier):
      # the resulting instance, tanh'd to keep bounded from clip_min
      # to clip_max
      newimg = (tf.tanh(modifier + imgs) + 1) / 2
      newimg = newimg * (self.clip_max - self.clip_min) + self.clip_min
      return newimg

    def get_l2dist(imgs, newimg):
      # distance to the input data
      other = (tf.tanh(imgs) + 1) / 2 * (self.clip_max - self.clip_min) + self.clip_min
      sum_axis = list(range(1, len(shape.numpy())))
      l2dist = tf.reduce_sum(tf.square(newimg - other), sum_axis)
      return l2dist

    def loss(timg, tlab, const, modifier):
      newimg = compute_newimage(timg, modifier)
      # prediction BEFORE-SOFTMAX of the model
      if self.sample <= 1:
        output = self.fn_logits(newimg)
      else:
        logging.info(
          "Monte Carlo (MC) on attacks, sample: {}".format(self.sample))
        output = []
        for i in range(self.sample):
          logits = self.fn_logits(newimg)
          if i == 0:
            assert logits.op.type != 'Softmax'
          output.append(logits)
        output = tf.reduce_mean(output, 0)

      # distance to the input data
      l2dist = get_l2dist(timg, newimg)

      # compute the probability of the label class versus the maximum other
      real_target = tf.reduce_sum((tlab) * output, 1)
      other_target = tf.reduce_max((1 - tlab) * output - tlab * 10000, 1)
      zero = tf.constant(0., dtype=tf_dtype)
      if self.y_target:
        # if targeted, optimize for making the other class most likely
        loss1 = tf.maximum(zero, other_target - real_target + self.confidence)
      else:
        # if untargeted, optimize for making this class least likely.
        loss1 = tf.maximum(zero, real_target - other_target + self.confidence)

      # sum up the losses
      loss2 = tf.reduce_sum(l2dist)
      loss1 = tf.reduce_sum(const * loss1)
      loss = loss1 + loss2
      return loss, output


    def grad(imgs, labs, const, modifier):
      with tf.GradientTape(watch_accessed_variables=False) as tape:
        tape.watch(modifier)
        loss_value, logits = loss(imgs, labs, const, modifier)
        with tape.stop_recording():
          gradients = tape.gradient(loss_value, [modifier])
      return gradients, loss_value, logits


    def compare_multi(x, y):
      x_array = tf.unstack(x)
      if self.y_target:
        x_array[y] = x_array[y] - self.confidence
      else:
        x_array[y] = x_array[y] + self.confidence
      x = tf.argmax(tf.stack(x_array))
      if self.y_target:
        return x == y
      else:
        return x != y

    def compare_single(x, y):
      if self.y_target:
        return x == y
      else:
        return x != y


    # batch_size = tf.shape(imgs)[0]
    batch_size = imgs.get_shape().as_list()[0]

    # re-scale instances to be within range [0, 1]
    imgs = (imgs - self.clip_min) / (self.clip_max - self.clip_min)
    imgs = tf.clip_by_value(imgs, 0, 1)
    # now convert to [-1, 1]
    imgs = (imgs * 2) - 1
    # convert to tanh-space
    imgs = tf.atanh(imgs * .999999)

    # set the lower and upper bounds accordingly
    lower_bound = tfe.Variable(tf.zeros(batch_size), trainable=False)
    const = tfe.Variable(tf.ones(batch_size) * self.initial_const, trainable=False)
    upper_bound = tfe.Variable(tf.ones(batch_size) * 1e10, trainable=False)

    # placeholders for the best l2, score, and instance attack found so far
    o_bestl2 = tfe.Variable(tf.constant(1e10, shape=(batch_size, )), trainable=False)
    o_bestscore = tfe.Variable(tf.constant(-1, shape=(batch_size, )), trainable=False)
    o_bestattack = tfe.Variable(tf.identity(imgs), trainable=False)

    for outer_step in range(self.binary_search_steps):

      # completely reset adam's internal state.
      modifier = tfe.Variable(tf.zeros(shape, dtype=tf_dtype))
      optimizer = tf.train.AdamOptimizer(self.learning_rate)

      bestl2 = tfe.Variable(tf.constant(1e10, shape=(batch_size, )), trainable=False)
      bestscore = tfe.Variable(tf.constant(-1, shape=(batch_size, )), trainable=False)
      logging.info("  Binary search step %s of %s",
                    outer_step, self.binary_search_steps)

      # The last iteration (if we run many steps) repeats the search once.
      if repeat and outer_step == self.binary_search_steps - 1:
        const = upper_bound

      prev = 1e6
      for iteration in range(self.max_iterations):

        import resource, gc
        mem = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        logging.info('resource {}'.format(mem))
        gc.collect()

        tf.set_random_seed(np.random.randint(0, 100))

        # perform the attack
        gradients, loss_value, scores = grad(imgs, labs, const, modifier)
        optimizer.apply_gradients(zip(gradients, [modifier]))

        nimg = compute_newimage(imgs, modifier)
        l2s = get_l2dist(imgs, nimg)

        if iteration % ((self.max_iterations // 10) or 1) == 0 and \
           logging.get_verbosity() == logging.DEBUG:
          l2_mean = tf.reduce_mean(l2s).numpy()
          logging.debug(
            "    Iteration {} of {}: loss={:.3g} l2={:.3g}".format(
              iteration, self.max_iterations, loss_value, l2_mean))

        # abort the search early if we're getting nowhere.
        if self.abort_early and \
           iteration % ((self.max_iterations // 10) or 1) == 0:
          if loss_value > prev * .9999:
            logging.debug("    Failed to make progress; stop early" )
            break
          prev = loss_value

        # adjust the best result found so far
        for e, (l2, sc, ii) in enumerate(zip(l2s, scores, nimg)):
          lab = tf.argmax(labs[e])
          comp = compare_multi(sc, lab)
          if l2 < bestl2[e] and comp:
            bestl2[e].assign(l2)
            bestscore[e].assign(tf.argmax(sc, output_type=tf.int32))
          if l2 < o_bestl2[e] and comp:
            o_bestl2[e].assign(l2)
            o_bestscore[e].assign(tf.argmax(sc, output_type=tf.int32))
            o_bestattack[e].assign(ii)

      # adjust the constant as needed
      for e in range(batch_size):
        if compare_single(bestscore[e], tf.argmax(labs[e])) and bestscore[e] != -1:
          # success, divide const by two
          upper_bound[e].assign(tf.minimum(upper_bound[e], const[e]))
          if upper_bound[e] < 1e9:
            const[e].assign((lower_bound[e] + upper_bound[e]) / 2)
        else:
          # failure, either multiply by 10 if no solution found yet
          #          or do binary search with the known upper bound
          lower_bound[e].assign(tf.maximum(lower_bound[e], const[e]))
          if upper_bound[e] < 1e9:
            const[e].assign((lower_bound[e] + upper_bound[e]) / 2)
          else:
            const[e].assign(const[e]*10)

      if logging.get_verbosity() == logging.DEBUG:
        success = tf.cast(tf.less(upper_bound, 1e9), tf.int32)
        logging.debug("  Successfully generated adversarial examples " +
                      "on {} of {} instances.".format(
                          tf.reduce_sum(success), batch_size))

        mask = tf.less(o_bestl2, 1e9)
        mean = tf.reduce_mean(tf.sqrt(tf.boolean_mask(o_bestl2, mask)))
        logging.debug("   Mean successful distortion: {:.4g}".format(mean.numpy()))

    # return the best solution found
    success = tf.cast(tf.less(upper_bound, 1e9), tf.int32)
    logging.info("  Successfully generated adversarial examples " +
                 "on {} of {} instances.".format(
                      tf.reduce_sum(success), batch_size))

    mask = tf.less(o_bestl2, 1e9)
    mean = tf.reduce_mean(tf.sqrt(tf.boolean_mask(o_bestl2, mask)))
    logging.info("   Mean successful distortion: {:.4g}".format(mean.numpy()))
    return o_bestattack.read_value()
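For context on Example #19: optimizing the modifier in tanh space keeps newimg inside [clip_min, clip_max] by construction, so no clipping is needed during optimization. A standalone NumPy round-trip check of that change of variables (mirroring compute_newimage above):

import numpy as np

lo, hi = 0.0, 1.0
x = np.array([0.1, 0.5, 0.9])
w = np.arctanh((2 * (x - lo) / (hi - lo) - 1) * .999999)  # into tanh space
x_back = (np.tanh(w) + 1) / 2 * (hi - lo) + lo            # compute_newimage
print(np.allclose(x, x_back, atol=1e-4))                  # True: bounded and invertible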
Example #20
    def make_predictions(self, test_data_pipeline, output_file_loc, top_k=20):
        """
        Make predictions.
        :param test_data_pipeline
        :param output_file_loc: The file to which predictions should be written to. Supports gcloud file.
        :param top_k: See FLAGS.top_k.
        """
        with tf.Graph().as_default() as g:
            video_id_batch, video_batch, video_labels_batch, num_frames_batch = get_input_data_tensors(
                test_data_pipeline, num_epochs=1, name_scope='test_input')

            init_op = tf.group(tf.global_variables_initializer(),
                               tf.local_variables_initializer())

        with tf.Session(graph=g) as sess, gfile.Open(output_file_loc,
                                                     "w+") as out_file:
            sess.run(init_op)

            # Be careful not to get blocked by the queue.
            # Start input enqueue threads.
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            processing_count, num_examples_processed = 0, 0
            out_file.write("VideoId,LabelConfidencePairs\n")

            try:

                while not coord.should_stop():
                    # Run training steps or whatever.
                    start_time = time.time()
                    video_id_batch_val, video_batch_val = sess.run(
                        [video_id_batch, video_batch])

                    logging.debug(
                        'video_id_batch_val: {}\nvideo_batch_val: {}'.format(
                            video_id_batch_val, video_batch_val))

                    # Pass values instead of tensors.
                    batch_predictions_prob = self.make_batch_predictions(
                        video_id_batch_val, video_batch_val)

                    # Write batch predictions to files.
                    for line in format_lines(video_id_batch_val,
                                             batch_predictions_prob, top_k):
                        out_file.write(line)
                    out_file.flush()

                    now = time.time()
                    processing_count += 1
                    num_examples_processed += video_id_batch_val.shape[0]
                    print(
                        'Batch processing step {}, elapsed {} seconds, processed {} examples in total'
                        .format(processing_count, now - start_time,
                                num_examples_processed))

            except tf.errors.OutOfRangeError:
                logging.info(
                    'Done with inference. The predictions were written to {}'.
                    format(output_file_loc))
            finally:
                # When done, ask the threads to stop.
                coord.request_stop()

            # Wait for threads to finish.
            coord.join(threads)

            sess.close()
            out_file.close()
Example #21
    def prepare_serialized_examples(self, serialized_examples):

        logging.set_verbosity(tf.logging.DEBUG)

        # hardcoded values
        len_features_frames = 1024
        len_features_audio = 128
        name_frames = "mean_rgb"
        name_audio = "mean_audio"

        # set the mapping from the fields to data types in the proto
        num_features = len(self.feature_names)
        assert num_features > 0, "self.feature_names is empty!"
        assert len(self.feature_names) == len(self.feature_sizes), \
            "length of feature_names (={}) != length of feature_sizes (={})".format( \
                len(self.feature_names), len(self.feature_sizes))

        feature_map = {
            "video_id": tf.FixedLenFeature([], tf.string),
            "labels": tf.VarLenFeature(tf.int64)
        }
        logging.debug("self.random_selection is " + str(self.random_selection))

        zeros_float = tf.zeros([tf.shape(serialized_examples)[0]])
        # Crude way of creating a vector of False values; there must be a better way.
        is_negative = tf.not_equal(zeros_float, zeros_float)

        for feature_index in range(num_features):
            feature_map[
                self.feature_names[feature_index]] = tf.FixedLenFeature(
                    [self.feature_sizes[feature_index]], tf.float32)

        features = tf.parse_example(serialized_examples, features=feature_map)
        features_rgb = features[name_frames]
        features_audio = features[name_audio]

        labels_audio = tf.sparse_to_indicator(features["labels"],
                                              self.num_classes)

        batch_size = tf.shape(features[name_frames])[0]

        if self.negative_sampling:

            labels = tf.sparse_to_indicator(features["labels"],
                                            self.num_classes)
            labels.set_shape([None, self.num_classes])

            def return_itself(a, b):
                return a, b

            # 80% of the samples are negative
            number_neg_sample = tf.random_uniform(
                [],
                minval=0.,
                maxval=1.,
                dtype=tf.float32,
                name="random_number_neg_sample")
            constant = tf.constant(self.percentage_negative)
            batch_size = tf.shape(features_rgb)[0]
            logging.info("-----------------")
            logging.info(batch_size)
            is_negative = tf.random_uniform([batch_size, 1],
                                            minval=0,
                                            maxval=1)
            is_negative = tf.less(is_negative, constant)
            features_audio_return, labels_audio = self.sample_negatively(
                features, labels, is_negative)
            concatenated_features = tf.concat(
                [features_rgb, features_audio_return], 1)

        else:
            # Normal case, leave as it was
            # We can use python comparisons because they are checked only when creating the graph
            if self.random_selection == 0 or (self.random_selection == 1
                                              and num_features > 1):
                for feature_index in range(num_features):
                    feature_map[self.feature_names[
                        feature_index]] = tf.FixedLenFeature(
                            [self.feature_sizes[feature_index]], tf.float32)

                features = tf.parse_example(serialized_examples,
                                            features=feature_map)

                labels = tf.sparse_to_indicator(features["labels"],
                                                self.num_classes)
                labels.set_shape([None, self.num_classes])
                labels_audio = labels
                concatenated_features = tf.concat([
                    features[feature_name]
                    for feature_name in self.feature_names
                ], 1)

            # Evaluation with only one of the two features
            elif self.random_selection == 1:
                feature_map[name_frames] = tf.FixedLenFeature(
                    [len_features_frames], tf.float32)
                feature_map[name_audio] = tf.FixedLenFeature(
                    [len_features_audio], tf.float32)

                features = tf.parse_example(serialized_examples,
                                            features=feature_map)

                labels = tf.sparse_to_indicator(features["labels"],
                                                self.num_classes)
                labels.set_shape([None, self.num_classes])

                # At this point there is only 1 feature_name
                # We can use python comparisons because they are checked only when creating the graph
                if self.feature_names[0] == name_frames:
                    concatenated_features = tf.concat([
                        features[name_frames],
                        tf.zeros_like(features[name_audio])
                    ], 1)
                else:
                    concatenated_features = tf.concat([
                        tf.zeros_like(features[name_frames]),
                        features[name_audio]
                    ], 1)

            # Training with thirds
            else:
                feature_map[name_frames] = tf.FixedLenFeature(
                    [len_features_frames], tf.float32)
                feature_map[name_audio] = tf.FixedLenFeature(
                    [len_features_audio], tf.float32)

                features = tf.parse_example(serialized_examples,
                                            features=feature_map)

                labels = tf.sparse_to_indicator(features["labels"],
                                                self.num_classes)
                labels.set_shape([None, self.num_classes])
                number = tf.random_uniform([],
                                           minval=0.,
                                           maxval=3.,
                                           dtype=tf.float32,
                                           name="random_number")

                features_rgb = features[name_frames]
                features_audio = features[name_audio]

                one = tf.constant(1.)
                two = tf.constant(2.)

                features_audio = tf.cond(
                    tf.less(number, one),
                    lambda: tf.clip_by_value(features_audio, 0, 0),
                    lambda: features_audio)
                features_rgb = tf.cond(
                    tf.greater(number, two),
                    lambda: tf.clip_by_value(features_rgb, 0, 0),
                    lambda: features_rgb)

                concatenated_features = tf.concat(
                    [features_rgb, features_audio], 1, name="concat_features")

        return features["video_id"], concatenated_features, labels, tf.ones(
            [tf.shape(serialized_examples)[0]]), is_negative, labels_audio
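
A minimal NumPy sketch of the "thirds" scheme above (a sketch under assumed feature sizes of 1024-d RGB and 128-d audio; the function name and sizes are illustrative, not from the original code): a single uniform draw in [0, 3) decides whether the audio features, the RGB features, or neither get zeroed.

import numpy as np

def thirds_dropout(features_rgb, features_audio, rng=np.random):
    """Zero one modality in roughly two thirds of the calls."""
    number = rng.uniform(0.0, 3.0)
    if number < 1.0:
        # RGB-only third: zero the audio features.
        features_audio = np.zeros_like(features_audio)
    elif number > 2.0:
        # Audio-only third: zero the RGB features.
        features_rgb = np.zeros_like(features_rgb)
    # Middle third: keep both modalities.
    return np.concatenate([features_rgb, features_audio], axis=1)

print(thirds_dropout(np.ones((4, 1024), np.float32),
                     np.ones((4, 128), np.float32)).shape)  # (4, 1152)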
Example #22
def main(unused_argv):
    """
    Training.
    init_learning_rate: Initial learning rate.
    decay_steps: How many training steps to decay learning rate once.
    decay_rate: How much to decay learning rate.
    l2_reg_rate: l2 regularization rate.
    epochs: The maximal epochs to pass all training data.
    """
    logging.set_verbosity(logging.INFO)

    output_dir = FLAGS.output_dir
    start_new_model = FLAGS.start_new_model

    init_learning_rate = FLAGS.init_learning_rate
    decay_steps = FLAGS.decay_steps
    decay_rate = FLAGS.decay_rate
    l2_reg_rate = FLAGS.l2_reg_rate
    train_epochs = FLAGS.train_epochs

    model_type, feature_names, feature_sizes = FLAGS.model_type, FLAGS.feature_names, FLAGS.feature_sizes
    reader = get_reader(model_type, feature_names, feature_sizes)
    train_data_pattern = FLAGS.train_data_pattern
    validate_data_pattern = FLAGS.validate_data_pattern
    batch_size = FLAGS.batch_size
    num_readers = FLAGS.num_readers
    init_with_linear_clf = FLAGS.init_with_linear_clf
    is_bootstrap = FLAGS.is_bootstrap

    # Increase num_readers if input reading becomes a bottleneck.
    validate_data_pipeline = DataPipeline(reader=reader,
                                          data_pattern=validate_data_pattern,
                                          batch_size=batch_size,
                                          num_readers=num_readers)

    if tf.gfile.Exists(path_join(output_dir, 'validate_data.pickle')):
        with open(path_join(output_dir, 'validate_data.pickle'), 'rb') as f:
            validate_data = pickle.load(f)

        with open(path_join(output_dir, 'validate_labels.pickle'), 'rb') as f:
            validate_labels = pickle.load(f)
    else:
        # Sample validate set for line search in linear classifier or logistic regression early stopping.
        _, validate_data, validate_labels, _ = random_sample(
            0.05,
            mask=(False, True, True, False),
            data_pipeline=validate_data_pipeline)
        with open(path_join(output_dir, 'validate_data.pickle'), 'wb') as f:
            pickle.dump(validate_data, f)

        with open(path_join(output_dir, 'validate_labels.pickle'), 'wb') as f:
            pickle.dump(validate_labels, f)

    start_new_model = start_new_model or (not tf.gfile.Exists(output_dir))

    # Set pos_weights for the extremely imbalanced situation in one-vs-all classifiers.
    try:
        # Load sum_labels of the training set (numpy float format) to compute pos_weights.
        train_sum_labels = load_sum_labels()
        # num_neg / num_pos per class, assuming neg_weights == 1.0.
        pos_weights = np.sqrt(
            (float(NUM_TRAIN_EXAMPLES) - train_sum_labels) / train_sum_labels)
        logging.info(
            'Computed pos_weights from sum_labels of the train set.')
    except IOError:
        logging.error('Cannot load train sum_labels. Using default value.')
        pos_weights = None

    # pos_weights is deliberately disabled for now; set it to None unconditionally.
    logging.info('Disable pos_weights.')
    pos_weights = None

    train_data_pipeline = DataPipeline(reader=reader,
                                       data_pattern=train_data_pattern,
                                       batch_size=batch_size,
                                       num_readers=num_readers)
    if start_new_model:
        # Load train data mean and std.
        train_features_mean, train_features_var = load_features_mean_var(
            reader)

        tr_data_fn = standard_scale
        tr_data_paras = {
            'mean': train_features_mean,
            'variance': train_features_var,
            'reshape': False,
            'size': None
        }

        if init_with_linear_clf:
            # ...Start linear classifier...
            # Compute weights and biases of linear classifier using normal equation.
            # Line search helps little.
            linear_clf = LinearClassifier(
                logdir=path_join(output_dir, 'linear_classifier'))
            linear_clf.fit(data_pipeline=train_data_pipeline,
                           tr_data_fn=tr_data_fn,
                           tr_data_paras=tr_data_paras,
                           l2_regs=[
                               0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1,
                               1.0, 10.0, 100.0, 1000.0
                           ],
                           validate_set=(validate_data, validate_labels),
                           line_search=True)
            linear_clf_weights, linear_clf_biases = linear_clf.weights, linear_clf.biases

            logging.info(
                'linear classifier weights and biases with shape {}, {}'.
                format(linear_clf_weights.shape, linear_clf_biases.shape))
            logging.debug(
                'linear classifier weights: {} and biases: {}.'.format(
                    linear_clf_weights, linear_clf_biases))
            # ...Exit linear classifier...
        else:
            linear_clf_weights, linear_clf_biases = None, None
    else:
        linear_clf_weights, linear_clf_biases = None, None
        tr_data_fn = None
        tr_data_paras = None

    # Run logistic regression.
    log_reg = LogisticRegression(logdir=path_join(output_dir, 'log_reg'))
    log_reg.fit(train_data_pipeline,
                start_new_model=start_new_model,
                tr_data_fn=tr_data_fn,
                tr_data_paras=tr_data_paras,
                validate_set=(validate_data, validate_labels),
                validate_fn=gap_fn,
                bootstrap=is_bootstrap,
                init_learning_rate=init_learning_rate,
                decay_steps=decay_steps,
                decay_rate=decay_rate,
                epochs=train_epochs,
                l2_reg_rate=l2_reg_rate,
                pos_weights=pos_weights,
                initial_weights=linear_clf_weights,
                initial_biases=linear_clf_biases)
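
As a worked example of the pos_weights formula above, pos_weights = sqrt((NUM_TRAIN_EXAMPLES - sum_labels) / sum_labels), here is a small sketch with hypothetical per-class counts (the real NUM_TRAIN_EXAMPLES and sum_labels come from the training set):

import numpy as np

NUM_TRAIN_EXAMPLES = 1000  # hypothetical training-set size
# Hypothetical number of positive examples per class.
train_sum_labels = np.array([10.0, 100.0, 500.0])

# sqrt(num_neg / num_pos) per class: the rarer the class, the larger the weight.
pos_weights = np.sqrt(
    (NUM_TRAIN_EXAMPLES - train_sum_labels) / train_sum_labels)
print(pos_weights)  # approx. [9.95, 3.0, 1.0]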
Example #23
def initialize(num_centers_ratio,
               data_pipeline,
               method=None,
               metric='cosine',
               max_iter=100,
               tol=0.005,
               scaling_method=1,
               alpha=1.0,
               p=3):
    """
    This function initializes representative prototypes (RBF centers) c and scaling factors sigma.

    This function will generate one group of centers for all labels as a whole. Be cautious with initialize_per_label.
    Args:
        num_centers_ratio: The number of centers to be decided / total number of examples that belong to label l,
            for l = 0, ..., num_classes - 1.
        data_pipeline: A namedtuple consisting of the following elements.
            reader, video-level features reader or frame-level features reader.
            data_pattern, File Glob of data set.
            batch_size, How many examples to handle at a time.
            num_readers, How many IO threads to prefetch examples.
        method: The method to decide the centers. Possible choices are random selection, kmeans and online (kmeans).
         Default is None, which represents randomly selecting a certain number of examples as centers.
        metric: Distance metric, euclidean distance or cosine distance.
        max_iter: The maximal number of clustering iterations.
        tol: Clustering stops once the reduction of its objective function falls below this value.
        scaling_method: There are five choices.
         1, all centers share the same sigma, based on the p smallest pairwise center distances.
         2, average distance to the p nearest centers.
         3, distance to the nearest center that has a different label (not supported).
         4, mean distance between this center and all of its points.
         5, Equation 31 (not supported).
        alpha: The alpha parameter that should be set heuristically. It works like a learning rate. (mu in Zhang)
        p: Needed when scaling_method is 1 or 2.
    Returns:
        centers (prototypes) and scaling factors (sigmas).
    Raises:
        ValueError if num_centers_ratio is not between 0.0 (open) and 1.0 (closed).
        ValueError if metric is not euclidean or cosine.
        ValueError if method is not one of None, kmeans or online.
        NotImplementedError if scaling_method is 3 or 5.
        ValueError if scaling_method is not in 1 - 5.
    """
    logging.info('Generate a group of centers for all labels. See Schwenker.')
    # Argument checking.
    if (num_centers_ratio <= 0.0) or (num_centers_ratio > 1.0):
        raise ValueError(
            'num_centers_ratio must be larger than 0.0 and no greater than 1.0.'
        )
    logging.info('num_centers_ratio is {}.'.format(num_centers_ratio))

    if ('euclidean' == metric) or ('cosine' == metric):
        logging.info(
            'Using {} distance. The larger, the less similar.'.format(metric))
    else:
        raise ValueError(
            'Only euclidean and cosine distance are supported, {} passed.'.
            format(metric))

    # Sample features only.
    _, centers, _, _ = random_sample(num_centers_ratio,
                                     mask=(False, True, False, False),
                                     data_pipeline=data_pipeline,
                                     name_scope='sample_centers')
    logging.info('Sampled {} centers in total.'.format(centers.shape[0]))
    logging.debug('Randomly selected centers: {}'.format(centers))

    # Used in scaling method 4. Average distance of each point with its cluster center.
    per_clu_mean_dist = None
    # Perform k-means or online k-means.
    if method is None:
        logging.info(
            'Using randomly selected centers as model prototypes (centers).')
    elif 'online' == method:
        raise NotImplementedError('Online k-means is not implemented yet.')
    elif 'kmeans' == method:
        logging.info(
            'Using k-means clustering result as model prototypes (centers).')

        return_mean_clu_dist = (scaling_method == 4)
        kmeans = KMeans(centers,
                        data_pipeline=data_pipeline,
                        metric=metric,
                        return_mean_clu_dist=return_mean_clu_dist)
        kmeans.fit(max_iter=max_iter, tol=tol)
        # Get current centers and update centers.
        centers = kmeans.current_centers
        per_clu_mean_dist = kmeans.per_clu_mean_dist

    else:
        raise ValueError(
            'Only None (randomly select examples), online, kmeans are supported.'
        )

    # Compute scaling factors based on these centers.
    num_centers = centers.shape[0]
    sigmas = None
    if scaling_method == 1:
        # Equation 27.
        pairwise_distances = sci_distance.pdist(centers, metric=metric)
        p = min(p, len(pairwise_distances))
        logging.info('Using {} minimal pairwise distances.'.format(p))
        # np.partition second argument begins with 0.
        sigmas = np.array(
            [alpha * np.mean(np.partition(pairwise_distances, p - 1)[:p])] *
            num_centers,
            dtype=np.float32)
    elif scaling_method == 2:
        # Equation 28.
        p = min(p, num_centers - 1)
        logging.info('Using {} minimal distances per center.'.format(p))

        if 'euclidean' == metric:
            dis_fn = sci_distance.euclidean
        else:
            dis_fn = sci_distance.cosine
        sigmas = []
        for c in centers:
            distances = [dis_fn(c, _c) for _c in centers]
            # The distance between c and itself is zero and is in the left partition.
            sigmas.append(alpha * np.sum(np.partition(distances, p)[:p + 1]) /
                          float(p))

        sigmas = np.array(sigmas, dtype=np.float32)
    elif scaling_method == 3:
        # Equation 29.
        raise NotImplementedError(
            'Not supported when all labels use the same centers.')
    elif scaling_method == 4:
        # Equation 30.
        if per_clu_mean_dist is None:
            kmeans = KMeans(centers,
                            data_pipeline=data_pipeline,
                            metric=metric,
                            return_mean_clu_dist=True)
            kmeans.fit(max_iter=1, tol=tol)

            centers = kmeans.current_centers
            per_clu_mean_dist = kmeans.per_clu_mean_dist

            logging.info(
                'Compute mean distance per cluster using kmeans or online kmeans.'
            )
        else:
            logging.info(
                'Reuse mean distance per cluster computed in kmeans or online kmeans.'
            )

        sigmas = alpha * per_clu_mean_dist
    elif scaling_method == 5:
        # Equation 31.
        raise NotImplementedError(
            'Only scaling methods 1, 2 and 4 are supported. Please read the documentation.')
    else:
        raise ValueError(
            'Only scaling methods 1, 2 and 4 are supported. Please read the documentation.')

    logging.debug('Scaling factor sigmas: {}'.format(sigmas))

    return centers, sigmas
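
A self-contained sketch of scaling method 1 (Equation 27) on toy 2-D centers, mirroring the pdist/np.partition usage above; the centers and the alpha/p values are made up for illustration:

import numpy as np
from scipy.spatial import distance as sci_distance

centers = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 2.0], [3.0, 3.0]])
alpha, p = 1.0, 3

pairwise_distances = sci_distance.pdist(centers, metric='euclidean')
p = min(p, len(pairwise_distances))
# np.partition puts the p smallest distances in the first p slots.
sigma = alpha * np.mean(np.partition(pairwise_distances, p - 1)[:p])
# Method 1 shares one sigma across all centers.
sigmas = np.full(centers.shape[0], sigma, dtype=np.float32)
print(sigmas)  # four identical values, approx. 1.75 each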