Example #1
    def setUp(self):
        logging.set_verbosity(tf.logging.INFO)

        config_string = """
      debug: true
      batch_size: 32
      feature_dimensions: 1536
      max_stmts_per_image: 5
      max_stmt_len: 21
      number_of_regions: 10
      shuffle_buffer_size: 1000
      max_densecap_len: 9
      max_densecaps_per_image: 5
      max_symbols_per_image: 7
      use_single_densecap: true
      image_feature_path: "output/img_features_train.npy"
      region_feature_path: "output/roi_features_train.npy"
      statement_vocab_path: "output/action_reason_vocab_200d.txt"
      statement_annot_path: "data/train/QA_Combined_Action_Reason_train.json"
      densecap_vocab_path: "output/densecap_vocab_200d.txt"
      densecap_annot_path: "output/densecap_train.json"
      symbol_annot_path: "output/symbol_train.json"
      symbol_cluster_path: "data/additional/clustered_symbol_list.json"
    """
        self.default_config = ads_mem_examples_pb2.AdsMemExamples()
        text_format.Merge(config_string, self.default_config)

        self.stmt_vocab = _load_vocab(self.default_config.statement_vocab_path)
        self.densecap_vocab = _load_vocab(
            self.default_config.densecap_vocab_path)
        word_to_id, id_to_symbol = load_symbol_cluster(
            self.default_config.symbol_cluster_path)
        self.symbol_vocab = id_to_symbol
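For reference, here is a minimal sketch of the `_load_vocab` helper this test depends on. It assumes each vocab file stores one "token<TAB>count" entry per line (the format the symbol-vocab exporter in Example #2 below writes); the project's real helper may differ, and the token-to-id mapping returned here is only illustrative.

def _load_vocab(vocab_path):
    """Sketch only: loads a vocab file into a token -> integer id mapping."""
    vocab = {}
    with open(vocab_path) as fp:
        for index, line in enumerate(fp):
            token = line.strip().split('\t')[0]
            if token:
                vocab[token] = index
    return vocab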
Example #2
def main(args):
    """Main."""
    word_to_id, id_to_symbol = load_symbol_cluster(args.symbol_cluster_path)
    print >> sys.stderr, 'Load %i pairs of mapping.' % (len(word_to_id))
    print >> sys.stderr, 'Symbol list: \n%s' % (json.dumps(id_to_symbol,
                                                           indent=2))

    id_to_symbol = sorted(id_to_symbol.iteritems(),
                          lambda x, y: cmp(x[0], y[0]))
    with open(args.output_vocab_path, 'w') as fp:
        for symbol_id, symbol in id_to_symbol:
            if symbol_id != 0:  # Skip id 0, which is reserved for 'unclear'.
                # The frequency column is a constant placeholder (999).
                fp.write('%s\t%i\n' % (symbol, 999))

    print >> sys.stderr, 'Done'
Example #3
def main(args):
  """Main."""
  # Load symbol annotations.
  symbol_annots = load_symbol_raw_annots(args.symbol_raw_annot_path)
  print >> sys.stderr, 'Load symbol annotations for %i images.' % (len(symbol_annots))

  word_to_id, id_to_symbol = load_symbol_cluster(args.symbol_cluster_path)
  print >> sys.stderr, 'Load %i pairs of mapping.' % (len(word_to_id))
  print >> sys.stderr, 'Symbol list: \n%s' % (json.dumps(id_to_symbol, indent=2))

  results = {}
  for image_id, annots in symbol_annots.iteritems():
    symbol_set = set()
    for annot in annots:
      symbols = [s.strip() for s in annot[4].lower().split('/') if len(s.strip()) > 0]
      symbols = [word_to_id[s] for s in symbols if s in word_to_id]
      symbol_set.update(symbols)
    if len(symbol_set):
      results[image_id] = sorted(symbol_set)
    
  with open(args.output_json_path, 'w') as fp:
    fp.write(json.dumps(results))
  print >> sys.stderr, 'Export %i symbols' % (len(results))
  print >> sys.stderr, 'Done'
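All of the examples on this page share the same assumed contract for load_symbol_cluster. The snippet below only illustrates that contract with made-up values; it is not the function's actual implementation, nor the real contents of clustered_symbol_list.json.

word_to_id, id_to_symbol = load_symbol_cluster(
    'data/additional/clustered_symbol_list.json')

# word_to_id maps lower-cased annotation words to a cluster id, e.g.:
#   {'danger': 3, 'risk': 3, 'fun': 7, ...}
# id_to_symbol maps a cluster id back to its canonical symbol name, e.g.:
#   {0: 'unclear', 3: 'danger', 7: 'fun', ...}
# Cluster id 0 is reserved for 'unclear' (see the note in Example #4) and is
# skipped when exporting vocabularies or per-image annotations.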
Example #4
def main(_):
    logging.set_verbosity(tf.logging.INFO)

    model_proto = _load_model_proto(FLAGS.model_proto)
    logging.info('Model proto: \n%s', model_proto)

    # Load vocab.
    word_to_id, id_to_symbol = load_symbol_cluster(FLAGS.symbol_cluster_path)
    logging.info('Number of classes: %i.', len(id_to_symbol))

    # Load image features.
    img_features = np.load(FLAGS.feature_path).item()
    logging.info('Load %i features.', len(img_features))

    # Note that ZERO is reserved for 'unclear'.
    annots = load_raw_annots(FLAGS.symbol_annot_path)
    x, y = _get_data(annots, img_features, len(id_to_symbol))

    number_of_val_examples = FLAGS.number_of_val_examples
    x_train, y_train = x[number_of_val_examples:], y[number_of_val_examples:]
    x_valid, y_valid = x[:number_of_val_examples], y[:number_of_val_examples]

    logging.info('Load %d train examples.', len(x_train))
    logging.info('Load %d valid examples.', len(x_valid))

    # Build graph to train symbol classifier.
    g = tf.Graph()
    with g.as_default():
        # For training
        logits, init_fn = mlp.model(model_proto, x_train, is_training=True)
        loss_op = tf.nn.sigmoid_cross_entropy_with_logits(labels=y_train,
                                                          logits=logits)
        loss_op = tf.reduce_mean(loss_op)

        optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate)
        train_op = slim.learning.create_train_op(loss_op, optimizer=optimizer)
        init_op = tf.global_variables_initializer()

        # For evaluation.
        tf.get_variable_scope().reuse_variables()
        logits_val, init_fn_val = mlp.model(model_proto,
                                            x_valid,
                                            is_training=False)

        saver = tf.train.Saver()

    with tf.Session(graph=g,
                    config=train_utils.default_session_config()) as sess:
        sess.run(init_op)

        max_v, metrics = 0, []
        for i in xrange(FLAGS.max_iters):
            (_, loss, pred_train,
             pred_valid) = sess.run([train_op, loss_op, logits, logits_val])

            mAP_micro = average_precision_score(y_valid[:, 1:],
                                                pred_valid[:, 1:],
                                                average='micro')
            mAP_macro = average_precision_score(y_valid[:, 1:],
                                                pred_valid[:, 1:],
                                                average='macro')
            metric = mAP_macro

            if i % 100 == 0:
                logging.info(
                    'step=%d, loss=%.4lf, mAP_micro=%.4lf, mAP_macro=%.4lf.',
                    i + 1, loss, mAP_micro, mAP_macro)

                if metric >= max_v:
                    saver.save(sess, FLAGS.output_model_path)
                    max_v = metric

                if len(metrics) >= 3:
                    # Stop if the validation metric has decreased for three
                    # consecutive logging steps.
                    if metric < metrics[-1] < metrics[-2] < metrics[-3]:
                        logging.info('Early stopping.')
                        break

                metrics.append(metric)

    logging.info('Done')
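To make the metric computation above easier to follow, here is a hedged illustration (assumed, not taken from _get_data) of the multi-hot label layout the classifier is trained on: one column per symbol cluster id, with column 0 ('unclear') reserved, which is why the average-precision calls slice off the first column.

import numpy as np

# Hypothetical sizes: one column per symbol cluster id, column 0 == 'unclear'.
num_classes = 54
y_row = np.zeros(num_classes, dtype=np.float32)
for symbol_id in [3, 7]:  # Hypothetical cluster ids annotated for one image.
    y_row[symbol_id] = 1.0
# average_precision_score(y[:, 1:], pred[:, 1:]) then ignores the 'unclear' column.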
Example #5
    def build_inference_graph(self, examples, **kwargs):
        """Builds the tensorflow graph for inference.

        Args:
          examples: a python dict containing at least the following fields:
            img_features: a [batch, feature_dimensions] tf.float32 tensor.
            roi_features: a [batch, number_of_regions, feature_dimensions] tf.float32 tensor.
            statement_strings: a [batch, statement_max_sent_len] tf.int64 tensor.
            statement_lengths: a [batch] tf.int64 tensor.

        Returns:
          predictions: a dict mapping from output names to output tensors.

        Raises:
          ValueError: if model_proto is not properly configured.
        """
        model_proto = self._model_proto
        is_training = self._is_training

        # Encode image features.
        (img_encoded,
         img_attention) = self.encode_image(examples['img_features'],
                                            examples['roi_features'])

        # Encode statement features.
        (stmt_encoded, stmt_attention) = self.encode_text(
            text_strings=examples['statement_strings'],
            text_lengths=examples['statement_lengths'],
            encoder=self._stmt_encoder)

        # For optional constraints.
        if model_proto.densecap_loss_weight > 0:
            # For densecap constraint.
            (densecap_encoded, densecap_attention) = self.encode_text(
                text_strings=examples['densecap_strings'],
                text_lengths=examples['densecap_lengths'],
                encoder=self._densecap_encoder)

        if model_proto.symbol_loss_weight > 0:
            # For symbol constraint.
            (symbol_encoded, symbol_attention) = self.encode_text(
                text_strings=examples['symbols'],
                text_lengths=examples['number_of_symbols'],
                encoder=self._symbol_encoder)

        # Encode knowledge if specified.
        if model_proto.use_knowledge_branch:
            # Symbol probability distribution from pre-trained MLP model.
            symbol_logits, symbol_init_fn = mlp.model(
                model_proto.symbol_classifier,
                examples['img_features'],
                is_training=is_training)
            self._init_fn_list.append(symbol_init_fn)
            symbol_proba = tf.sigmoid(symbol_logits)[:, 1:]

            # Assign weight to each symbol classifier.
            with tf.variable_scope('confidence'):
                symbol_classifier_weights = tf.get_variable(
                    name='weights',
                    shape=[symbol_proba.get_shape()[-1].value],
                    initializer=tf.constant_initializer(-3))
            symbol_classifier_weights = 2 * tf.sigmoid(
                symbol_classifier_weights)
            weights = symbol_proba * symbol_classifier_weights

            word_to_id, id_to_symbol = load_symbol_cluster(
                model_proto.symbol_cluster_path)
            for symbol_id, symbol_name in id_to_symbol.iteritems():
                if symbol_id != 0:
                    tf.summary.scalar('confidence/{}'.format(symbol_name),
                                      symbol_classifier_weights[symbol_id - 1])

            # Add encoded symbol prediction as a residual branch.
            symbol_embedding_mat = self._symbol_encoder.embedding_weights[
                1:, :]
            symbol_pred_encoded = tf.matmul(weights, symbol_embedding_mat)
            img_encoded += symbol_pred_encoded

        # Joint embedding and cosine distance computation.
        predictions = {
            'image_id': examples['image_id'],
            'img_encoded': tf.nn.l2_normalize(img_encoded, 1),
            'stmt_encoded': tf.nn.l2_normalize(stmt_encoded, 1),
        }
        if model_proto.densecap_loss_weight > 0:
            predictions.update(
                {'dense_encoded': tf.nn.l2_normalize(densecap_encoded, 1)})
        if model_proto.symbol_loss_weight > 0:
            predictions.update({
                'number_of_symbols':
                examples['number_of_symbols'],
                'symb_encoded':
                tf.nn.l2_normalize(symbol_encoded, 1)
            })

        return predictions
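Because both img_encoded and stmt_encoded are returned l2-normalized, a dot product between them equals cosine similarity. The line below is a minimal sketch of how the predictions could be scored; it is not the project's actual ranking code.

scores = tf.reduce_sum(
    predictions['img_encoded'] * predictions['stmt_encoded'], axis=1)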
Example #6
def _get_data_placeholders(config, split):
    """Returns data placeholders and the data used to feed them.

    Args:
      config: an instance of ads_mem_examples_pb2.AdsMemExamples.
      split: name of the data split, e.g. 'train' or 'valid'.

    Returns:
      data_placeholders: a dict mapping from name to placeholders.
      feed_dict: a dict mapping from name to data.
    """
    # Create placeholders.
    data_placeholders = {
        'image_id':
        tf.placeholder(tf.string, [None]),
        'img_features':
        tf.placeholder(tf.float32, [None, config.feature_dimensions]),
        'roi_features':
        tf.placeholder(
            tf.float32,
            [None, config.number_of_regions, config.feature_dimensions]),
        'number_of_statements':
        tf.placeholder(tf.int32, [None]),
        'statement_strings':
        tf.placeholder(
            tf.int32, [None, config.max_stmts_per_image, config.max_stmt_len]),
        'statement_lengths':
        tf.placeholder(tf.int32, [None, config.max_stmts_per_image]),
        'number_of_symbols':
        tf.placeholder(tf.int32, [None]),
        'symbols':
        tf.placeholder(tf.int32, [None, config.max_symbols_per_image]),
    }
    if not config.use_single_densecap:
        data_placeholders.update({
            'number_of_densecaps':
            tf.placeholder(tf.int32, [None]),
            'densecap_strings':
            tf.placeholder(tf.int32, [
                None, config.max_densecaps_per_image, config.max_densecap_len
            ]),
            'densecap_lengths':
            tf.placeholder(tf.int32, [None, config.max_densecaps_per_image]),
        })
    else:
        data_placeholders.update({
            'number_of_densecaps':
            tf.placeholder(tf.int32, [None]),
            'densecap_strings':
            tf.placeholder(tf.int32, [
                None, 1,
                config.max_densecaps_per_image * config.max_densecap_len
            ]),
            'densecap_lengths':
            tf.placeholder(tf.int32, [None, 1]),
        })

    if split != 'train':
        data_placeholders.update({
            'eval_statement_strings':
            tf.placeholder(tf.int32, [
                None, config.number_of_val_stmts_per_image, config.max_stmt_len
            ]),
            'eval_statement_lengths':
            tf.placeholder(tf.int32,
                           [None, config.number_of_val_stmts_per_image]),
        })

    # Load annotations and image features.
    assert tf.gfile.Exists(config.image_feature_path)
    assert tf.gfile.Exists(config.region_feature_path)
    assert tf.gfile.Exists(config.statement_vocab_path)
    assert tf.gfile.Exists(config.statement_annot_path)
    assert tf.gfile.Exists(config.densecap_vocab_path)
    assert tf.gfile.Exists(config.densecap_annot_path)
    assert tf.gfile.Exists(config.symbol_annot_path)
    assert tf.gfile.Exists(config.symbol_cluster_path)

    # Image features.
    start = time.time()
    image_features = np.load(config.image_feature_path).item()
    region_features = np.load(config.region_feature_path).item()
    logging.info(
        'Image features are loaded, cost=%is, img_len=%i, roi_len=%i.',
        time.time() - start, len(image_features), len(region_features))

    # Action-reason annotations.
    start = time.time()
    stmt_annots = load_action_reason_annots(config.statement_annot_path)
    logging.info('Annotations are loaded, cost=%is, path=%s, len=%i.',
                 time.time() - start, config.statement_annot_path,
                 len(stmt_annots))

    stmt_vocab = load_vocab(config.statement_vocab_path)
    logging.info('Load vocab from %s, vocab_size=%i',
                 config.statement_vocab_path, len(stmt_vocab))

    # Densecap annotations.
    start = time.time()
    dense_annots = load_densecap_annots(config.densecap_annot_path,
                                        config.max_densecaps_per_image)
    logging.info('Dense annotations are loaded, cost=%is, path=%s, len=%i.',
                 time.time() - start, config.densecap_annot_path,
                 len(dense_annots))

    dense_vocab = load_vocab(config.densecap_vocab_path)
    logging.info('Load vocab from %s, vocab_size=%i',
                 config.densecap_vocab_path, len(dense_vocab))

    # Symbol annotations.
    start = time.time()
    symbol_annots = load_raw_annots(config.symbol_annot_path)
    logging.info('Symbol annotations are loaded, cost=%is, path=%s, len=%i.',
                 time.time() - start, config.symbol_annot_path,
                 len(symbol_annots))
    word_to_id, id_to_symbol = load_symbol_cluster(config.symbol_cluster_path)

    # Initialize feed_dict.
    feed_dict = {
        'image_id': [],
        'img_features': [],
        'roi_features': [],
        'number_of_statements': [],
        'statement_strings': [],
        'statement_lengths': [],
        'number_of_densecaps': [],
        'densecap_strings': [],
        'densecap_lengths': [],
        'number_of_symbols': [],
        'symbols': [],
    }
    if split != 'train':
        feed_dict.update({
            'eval_statement_strings': [],
            'eval_statement_lengths': [],
        })

    total_images = total_statements = 0

    # Split training data for validation purpose.
    stmt_annots = stmt_annots.items()
    if split == 'valid':
        stmt_annots = stmt_annots[:config.number_of_val_examples]
    elif split == 'train':
        stmt_annots = stmt_annots[config.number_of_val_examples:]
    logging.info('Processing %i %s records...', len(stmt_annots), split)

    if config.debug:
        logging.warn('DEBUG MODE!!!!!!!')
        stmt_annots = stmt_annots[:100]

    for index, (image_id, annot) in enumerate(stmt_annots):
        # Pad action-reason.
        (number_of_statements, statement_strings,
         statement_lengths) = encode_and_pad_sentences(
             stmt_vocab, annot['pos_examples'], config.max_stmts_per_image,
             config.max_stmt_len)

        # Pad densecap.
        if not config.use_single_densecap:
            (number_of_densecaps, densecap_strings,
             densecap_lengths) = encode_and_pad_sentences(
                 dense_vocab, dense_annots[image_id],
                 config.max_densecaps_per_image, config.max_densecap_len)
        else:  # Concatenate all densecaps to form a single sentence.
            dense_string_concat = ' '.join(dense_annots[image_id])
            (number_of_densecaps, densecap_strings,
             densecap_lengths) = encode_and_pad_sentences(
                 dense_vocab, [dense_string_concat], 1,
                 config.max_densecap_len * config.max_densecaps_per_image)

        # Pad symbols.
        symbols = symbol_annots.get(image_id, [])
        number_of_symbols = len(symbols)
        symbols += [0] * config.max_symbols_per_image
        symbols = symbols[:config.max_symbols_per_image]

        feed_dict['image_id'].append(image_id)
        feed_dict['img_features'].append(image_features[image_id])
        feed_dict['roi_features'].append(region_features[image_id])
        feed_dict['number_of_statements'].append(
            np.array(number_of_statements, dtype=np.int32))
        feed_dict['statement_strings'].append(statement_strings)
        feed_dict['statement_lengths'].append(statement_lengths)
        feed_dict['number_of_densecaps'].append(
            np.array(number_of_densecaps, dtype=np.int32))
        feed_dict['densecap_strings'].append(densecap_strings)
        feed_dict['densecap_lengths'].append(densecap_lengths)
        feed_dict['number_of_symbols'].append(
            np.array(number_of_symbols, dtype=np.int32))
        feed_dict['symbols'].append(np.array(symbols))

        if split != 'train':
            # Pad strings for evaluation purpose.
            (number_of_eval_statements, eval_statement_strings,
             eval_statement_lengths) = encode_and_pad_sentences(
                 stmt_vocab, annot['all_examples'],
                 config.number_of_val_stmts_per_image, config.max_stmt_len)
            assert number_of_eval_statements == config.number_of_val_stmts_per_image
            feed_dict['eval_statement_strings'].append(eval_statement_strings)
            feed_dict['eval_statement_lengths'].append(eval_statement_lengths)

        total_images += 1
        total_statements += number_of_statements

        if index % 1000 == 0:
            logging.info('Load on %i/%i', index, len(stmt_annots))

    logging.info('Load %i images with %i statements.', total_images,
                 total_statements)

    # Legacy: GPU or CPU mode.
    if config.data_provider_mode == ads_mem_examples_pb2.AdsMemExamples.FROM_CPU:
        # items() returns a copy in Python 2, so it is safe to mutate feed_dict
        # while replacing each string key with its placeholder tensor.
        for k, v in feed_dict.items():
            feed_dict[data_placeholders[k]] = np.stack(v)
            del feed_dict[k]
        return data_placeholders, feed_dict


#  elif config.data_provider_mode == ads_mem_examples_pb2.AdsMemExamples.FROM_GPU:
#    data_tensors = {}
#    for k, v in feed_dict.items():
#      data_tensors[k] = tf.constant(np.stack(v))
#    return data_tensors, {}

    raise ValueError('Unknown mode %i.' % config.data_provider_mode)
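A hedged sketch of one way the returned placeholders and feed_dict could be consumed in TF 1.x, reusing the config proto from Example #1: the numpy arrays are fed a single time when the dataset iterator is initialized. This is an assumption about the surrounding input pipeline, not code from the project.

placeholders, feed_dict = _get_data_placeholders(config, split='train')

dataset = tf.data.Dataset.from_tensor_slices(placeholders)
dataset = dataset.shuffle(config.shuffle_buffer_size).batch(config.batch_size)
iterator = dataset.make_initializable_iterator()
examples = iterator.get_next()  # A dict of batched tensors keyed like placeholders.

with tf.Session() as sess:
    sess.run(iterator.initializer, feed_dict=feed_dict)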