Example #1
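    # Fragment of a ShowAndTell predictor class: __init__ builds the inference-time feed placeholders.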
    def __init__(self):
        melt.PredictorBase.__init__(self)
        ShowAndTell.__init__(self, is_training=False, is_predict=True)

        if FLAGS.pre_calc_image_feature:
            self.image_feature_len = FLAGS.image_feature_len or IMAGE_FEATURE_LEN
            #TODO for RL we need to use a feed dict, so predict will require this feed; how best to use placeholder_with_default here?
            #self.image_feature_feed = tf.placeholder(tf.float32, [None, self.image_feature_len], name='image_feature')
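            #placeholder_with_default makes the feed optional: the zero vector below is used when nothing is fed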
            self.image_feature_feed = tf.placeholder_with_default(
                [[0.] * self.image_feature_len],
                [None, self.image_feature_len],
                name='image_feature')
        else:
            #self.image_feature_feed =  tf.placeholder(tf.string, [None,], name='image_feature')
            # TODO HACK for nasnet: needed due to the use of average decay
            if os.path.exists('./test.jpg'):
                test_image = melt.read_image('./test.jpg')
            elif os.path.exists('/tmp/test.jpg'):
                test_image = melt.read_image('/tmp/test.jpg')
            else:
                test_image = None

            if test_image is not None:
                self.image_feature_feed = tf.placeholder_with_default(
                    tf.constant([test_image]), [
                        None,
                    ], name='image_feature')
            else:
                assert not FLAGS.image_model_name.startswith(
                    'nasnet'
                ), 'HACK for nasnet you need one test.jpg in current path or /tmp/ path'
                self.image_feature_feed = tf.placeholder(tf.string, [
                    None,
                ],
                                                         name='image_feature')

        tf.add_to_collection('feed', self.image_feature_feed)
        tf.add_to_collection('lfeed', self.image_feature_feed)

        self.text_feed = tf.placeholder(tf.int64, [None, TEXT_MAX_WORDS],
                                        name='text')
        tf.add_to_collection('rfeed', self.text_feed)

        self.text = None
        self.text_score = None
        self.beam_text = None
        self.beam_text_score = None

        self.image_model = None

        self.logprobs_history = False
        self.alignment_history = False

        self.feed_dict = {}
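
    # Lazily create the image feature placeholder on first use and register it in the feed collections.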
    def get_image_feature_feed(self):
        if self.image_feature_feed is None:
            if FLAGS.pre_calc_image_feature:
                self.image_feature_feed = tf.placeholder(
                    tf.float32, [None, self.image_feature_len], name='image_feature')
            else:
                # for nasnet you always need to feed this. WHY? TODO FIXME
                if os.path.exists('./test.jpg'):
                    test_image = melt.read_image('./test.jpg')
                elif os.path.exists('/tmp/test.jpg'):
                    test_image = melt.read_image('/tmp/test.jpg')
                else:
                    test_image = None

                if test_image is not None:
                    self.image_feature_feed = tf.placeholder_with_default(
                        tf.constant([test_image]), [None,], name='image_feature')
                else:
                    assert not FLAGS.image_model_name.startswith('nasnet'), \
                        'HACK for nasnet you need one test.jpg in current path or /tmp/ path'
                    self.image_feature_feed = tf.placeholder(tf.string, [None,], name='image_feature')

            tf.add_to_collection('feed', self.image_feature_feed)
            tf.add_to_collection('lfeed', self.image_feature_feed)
        return self.image_feature_feed
Example #3
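# Predict captions for a batch of images (paths are read on the fly) and store the top beam result per image in results.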
def translation_predicts(imgs, img_features, predictor, results):
  if isinstance(img_features[0], np.string_):
    img_features = np.array([melt.read_image(pic_path) for pic_path in img_features])

  texts, _ = predictor.predict_text(img_features)
  #only use the top prediction of the beam search
  texts = [x[0] for x in texts]
  for i in range(len(texts)):
    #for eval, each prediction must be a list even if there is only one; also exclude the trailing end id
    if not FLAGS.eval_translation_reseg:
      texts[i] = [' '.join([str(x) for x in texts[i][:list(texts[i]).index(vocab.end_id())]])] 
    else:
      import jieba
      texts[i] = ''.join([vocab.key(int(x)) for x in texts[i][:list(texts[i]).index(vocab.end_id())]])
      texts[i] = [' '.join([x.encode('utf-8') for x in jieba.cut(texts[i])])]
    results[imgs[i]] = texts[i]
Example #4
def convert_to_tfrecord(input_files, output_file):
  """Converts a file to TFRecords."""
  print('Generating %s' % output_file)
  with tf.python_io.TFRecordWriter(output_file) as record_writer:
    for input_file in tqdm(input_files, ascii=True):
      id = os.path.basename(input_file)[:-4]
      #img = cv2.imread(input_file)
      img = melt.read_image(input_file)
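      # melt.read_image is assumed to return the encoded image bytes, stored as-is below (cf. the commented-out img.tobytes())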
      # turn to channel first
      #img = img.transpose(2,0,1)
      if 'test' not in output_file:  
        label = m[id]
      else:
        label = -1
      example = tf.train.Example(features=tf.train.Features(
          feature={
              'id': melt.bytes_feature(id),
              #'image': melt.bytes_feature(img.tobytes()),
              'image': melt.bytes_feature(img),
              'label': melt.int64_feature(label)
          }))
      record_writer.write(example.SerializeToString())
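
A minimal invocation sketch (the glob patterns and output paths are hypothetical; it assumes the global label map m used above is already populated):

import glob

convert_to_tfrecord(glob.glob('./train/*.png'), './train.tfrecord')
convert_to_tfrecord(glob.glob('./test/*.png'), './test.tfrecord')  # 'test' in the name -> label -1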
Example #5
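# Convert a tab-separated file of image/text pairs plus an image directory into TFRecord examples.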
def deal_file_with_imgdir(file):
    out_file = '{}/{}'.format(
        FLAGS.output_directory,
        '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
    print('file:', file, 'out_file:', out_file, file=sys.stderr)
    with melt.tfrecords.Writer(out_file) as writer:
        num = 0
        for line in open(file):
            if num % 1000 == 0:
                print(num, file=sys.stderr)

            l = line.rstrip('\n').split('\t')
            img = l[0]

            texts = l[FLAGS.text_index].split('\x01')

            image_path = os.path.join(FLAGS.image_dir, img.replace('/', '_'))
            encoded_image = melt.read_image(image_path)

            is_top_text = True
            for text in texts:
                text = normalize.norm(text)
                if text.strip() == '':
                    print('empty line', line, file=sys.stderr)
                    continue

                word_ids = _text2ids(text, TEXT_MAX_WORDS)
                word_ids_length = len(word_ids)
                if num % 10000 == 0:
                    print(img,
                          text,
                          word_ids,
                          text2ids.ids2text(word_ids),
                          file=sys.stderr)
                if len(word_ids) == 0:
                    print('empty word ids!', file=sys.stderr)
                    print(img,
                          text,
                          word_ids,
                          text2ids.ids2text(word_ids),
                          file=sys.stderr)
                    continue
                #if is_luanma(words, word_ids):
                #  print('luanma', img, text, word_ids, text2ids.ids2text(word_ids), len(image_feature), file=sys.stderr)
                #  continue

                word_ids = word_ids[:TEXT_MAX_WORDS]
                if FLAGS.pad:
                    word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
                if not FLAGS.write_sequence_example:
                    example = tf.train.Example(features=tf.train.Features(
                        feature={
                            'image_name': melt.bytes_feature(img),
                            'image_data': melt.bytes_feature(encoded_image),
                            'text_str': melt.bytes_feature(text),
                            'text': melt.int64_feature(word_ids),
                        }))
                else:
                    example = tf.train.SequenceExample(
                        context=melt.features({
                            'image_name':
                            melt.bytes_feature(img),
                            'image_data':
                            melt.bytes_feature(encoded_image),
                            'text_str':
                            melt.bytes_feature(text),
                        }),
                        feature_lists=melt.feature_lists(
                            {'text': melt.int64_feature_list(word_ids)}))
                writer.write(example)

                #global counter, max_num_words, sum_words
                with record_counter.get_lock():
                    record_counter.value += 1
                if word_ids_length > max_num_words.value:
                    with max_num_words.get_lock():
                        max_num_words.value = word_ids_length
                with sum_words.get_lock():
                    sum_words.value += word_ids_length

                if FLAGS.np_save:
                    assert FLAGS.threads == 1
                    gtexts.append(word_ids)
                    gtext_strs.append(text)

                    #Deprecated: image_labels is no longer used
                    if img not in image_labels:
                        image_labels[img] = set()
                    image_labels[img].add(text)

                if is_top_text:
                    is_top_text = False
                    with image_counter.get_lock():
                        image_counter.value += 1

                    if FLAGS.np_save:
                        if img not in image_labels:
                            image_labels[img] = set()

                        image_names.append(img)
                        #actually save the pic path instead of the image feature
                        image_features.append(
                            os.path.join(FLAGS.image_dir,
                                         img.replace('/', '_')))

                    if FLAGS.num_max_records > 0:
                        #for a fixed valid set, take only one click per image
                        break

            num += 1
            if num == FLAGS.num_max_records:
                break
Example #6
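# Fragment: prints word-importance scores for a label text, then for the best predicted caption of an image.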
      word_ids = text2ids.text2ids(text)
      seg_text = text2ids.ids2text(word_ids, print_end=False)
      print('label:', text, seg_text)
      words_importance = sim_predictor.words_importance([word_ids])
      words_importance = words_importance[0]
      print('word importance:')
      for i in range(len(word_ids)):
        if word_ids[i] == 0:
          break 
        print(vocab.key(int(word_ids[i])), words_importance[i], end='|')  
      print()
  except Exception:
    print(traceback.format_exc(), file=sys.stderr)

  image = melt.read_image(image_path)
  word_ids, scores = predictor.word_ids([image])
  word_id = word_ids[0]
  score = scores[0]
  print('best predict:', ids2text.translate(word_id[0]), score[0], '/'.join([vocab.key(int(id)) for id in word_id[0] if id != vocab.end_id()]))
  
  l = [id for id in word_id[0] if id != vocab.end_id()]
  l = gezi.pad(l, TEXT_MAX_WORDS)
  words_importance = sim_predictor.words_importance([l])
  words_importance = words_importance[0]

  print('word importance:')
  for i in range(len(word_id[0])):
    if word_id[0][i] == vocab.end_id():
      break
    print(vocab.key(int(word_id[0][i])), words_importance[i], end='|')
Example #7
import melt
p = melt.SimplePredictor('./mount/temp/cifar10/model/resnet.momentum.decay/epoch/model.ckpt-30.00-10530', key='pre_logits')
feature = p.inference([melt.read_image('./mount/data/kaggle/cifar-10/test/10.png')])

print(feature)
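
A minimal batching sketch along the same lines (the extra test image paths are hypothetical; it assumes inference accepts a list of decoded images, as in the single-image call above):

import melt

p = melt.SimplePredictor('./mount/temp/cifar10/model/resnet.momentum.decay/epoch/model.ckpt-30.00-10530', key='pre_logits')
# read and batch several images in one inference call, mirroring the single-image pattern above
paths = ['./mount/data/kaggle/cifar-10/test/%d.png' % i for i in (10, 11, 12)]  # hypothetical paths
features = p.inference([melt.read_image(path) for path in paths])
print(features)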
Example #8
def predict(image_name, num_show=1):
    image_path = os.path.join(image_dir, image_name)

    if not os.path.exists(image_path):
        print('path not exists:%s' % image_path)
        return

    img = melt.read_image(image_path)
    feature = image_model.gen_feature(img) if image_model is not None else img
    timer = gezi.Timer()
    init_states = predictor.inference(
        [
            'beam_search_beam_size', 'beam_search_initial_state',
            'beam_search_initial_ids', 'beam_search_initial_logprobs',
            'beam_search_initial_alignments'
        ],
        feed_dict={tf.get_collection('feed')[0]: feature})

    step_func = lambda input_feed, state_feed: predictor.inference(
        [
            'beam_search_state',
            'beam_search_ids',
            'beam_search_logprobs',
            'beam_search_alignments',
        ],
        feed_dict={
            #TODO: attention still needs the input_text feed, see beam_search_step in rnn_decoder.py,
            #but it does not hurt performance much since the encoder is fast; is it possible to avoid this?
            #anyway, without attention the input_text feed is not needed
            tf.get_collection('feed')[0]:
            feature,
            tf.get_collection('beam_search_input_feed')[0]:
            input_feed,
            tf.get_collection('beam_search_state_feed')[0]:
            state_feed
        })

    beams = melt.seq2seq.beam_search(init_states,
                                     step_func,
                                     end_id=ids2text.end_id(),
                                     max_words=decode_max_words,
                                     length_normalization_factor=1.)
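
    # each beam carries words, a total score, per-step logprobs, and attention alignments used for plotting below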

    for i, beam in enumerate(beams):
        print(i, beam.words, ids2text.ids2text(beam.words),
              math.exp(beam.score))

        # Plot images with attention weights
        words = beam.words
        img = ndimage.imread(image_path)

        num_features = melt.image.get_num_features(image_model_name)
        dim = int(np.sqrt(num_features))
        #print('dim:', dim)

        n_words = len(words)
        w = np.round(np.sqrt(n_words))
        h = np.ceil(np.float32(n_words) / w)

        plt.subplot(w, h, 1)
        plt.imshow(img)
        plt.axis('off')

        #img = scipy.misc.imresize(img, (dim, dim))

        smooth = True  #TODO smooth=True seems not to work for the background pic
        smooth = False
        if i == 0:
            for j in range(len(words)):
                plt.subplot(w, h, j + 2)
                lab = pinyin.Convert(
                    ids2text.vocab.key(words[j]).decode('utf8').encode('gbk'))
                lab += '(%0.2f)' % math.exp(beam.logprobs[j])
                plt.text(0, 1, lab, backgroundcolor='white', fontsize=10)
                plt.text(0, 1, lab, color='black', fontsize=10)
                plt.imshow(img)
                if smooth:
                    alpha_img = skimage.transform.pyramid_expand(
                        beam.alignments_list[j].reshape(dim, dim),
                        upscale=16,
                        sigma=20)
                else:
                    alpha_img = skimage.transform.resize(
                        beam.alignments_list[j].reshape(dim, dim),
                        [img.shape[0], img.shape[1]])
                plt.imshow(alpha_img, alpha=0.8)
                plt.set_cmap(cm.Greys_r)
                plt.axis('off')
            #plt.show()
            plt.savefig('test%d.pdf' % i)
Example #9
import sys, os
from deepiu.util.sim_predictor import SimPredictor
from deepiu.util import vocabulary

import melt

image_dir = '/home/gezi/data2/data/ai_challenger/image_caption/pic/'

image_file = '6275b5349168ac3fab6a493c509301d023cf39d3.jpg'
if len(sys.argv) > 1:
    image_file = sys.argv[1]

image_path = os.path.join(image_dir, image_file)
image_model_checkpoint_path = '/home/gezi/data/image_model_check_point/inception_resnet_v2_2016_08_30.ckpt'
model_dir = '/home/gezi/new/temp/image-caption/ai-challenger/model/bow/'
vocab_path = '/home/gezi/new/temp/image-caption/ai-challenger/tfrecord/seq-basic/vocab.txt'

vocabulary.init(vocab_path)
vocab = vocabulary.vocab

predictor = SimPredictor(model_dir,
                         image_model_checkpoint_path,
                         image_model_name='InceptionResnetV2')
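
# top_words ranks vocabulary words against each input image, returning (scores, word_ids)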

scores, word_ids = predictor.top_words([melt.read_image(image_path)], 50)
scores = scores[0]
word_ids = word_ids[0]

for word_id, score in zip(word_ids, scores):
    print(vocab.key(int(word_id)), score)
Example #10
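# Text-to-image retrieval: score all candidate images for each query text, optionally reranking the top candidates with an exact predictor.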
def predicts_txt2im(text_strs, texts, predictor, rank_metrics, exact_predictor=None):
  timer = gezi.Timer('predictor.predict text2im')
  if exact_predictor is None:
    if assistant_predictor:
      exact_predictor = predictor
      predictor = assistant_predictor

  _, img_features = get_image_names_and_features()
  # TODO gpu out-of-memory when predicting for showandtell
  #---NOTICE this might cost too much memory if images are original encoded binaries rather than image features
  img_features = img_features[:FLAGS.max_images]
  if isinstance(img_features[0], np.string_):
    assert len(img_features) < 2000  #otherwise too much memory
    img_features = np.array([melt.read_image(pic_path) for pic_path in img_features])
  
  step = len(img_features)
  if FLAGS.metric_eval_images_size > 0 and FLAGS.metric_eval_images_size < step:
    step = FLAGS.metric_eval_images_size
  start = 0
  scores = []
  while start < len(img_features):
    end = start + step 
    if end > len(img_features):
      end = len(img_features)
    #print('predicts images start:', start, 'end:', end, file=sys.stderr, end='\r')
    
    #the bow assistant predictor might not accept raw images here; TODO how to add image processing here to generate features first?
    score = predictor.predict(img_features[start: end], texts)
   
    scores.append(score)
    start = end
  #score = predictor.predict(img_features, texts)
  score = np.concatenate(scores, 0)
  score = score.transpose()
  #print('image_feature_shape:', img_features.shape, 'text_feature_shape:', texts.shape, 'score_shape:', score.shape)
  timer.print()

  text2img = get_bidrectional_lable_map_txt2im()
  num_imgs = img_features.shape[0]

  for i, text_str in enumerate(text_strs):
    indexes = (-score[i]).argsort()

    #rerank
    if exact_predictor:
      top_indexes = indexes[:FLAGS.assistant_rerank_num]
      exact_imgs = img_features[top_indexes]
      exact_score = exact_predictor.elementwise_predict(exact_imgs, [texts[i]])
      exact_score = exact_score[0]
      exact_indexes = (-exact_score).argsort()
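      # map the exact-score ordering back onto the original candidate indexes for the reranked top slots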
      new_indexes = [x for x in indexes]
      for j in range(len(exact_indexes)):
        new_indexes[j] = indexes[exact_indexes[j]]
      indexes = new_indexes
    
    hits = text2img[text_str]

    num_positions = min(num_imgs, FLAGS.metric_topn)
    #num_positions = num_imgs
    
    labels = [indexes[j] in hits for j in xrange(num_positions)]

    rank_metrics.add(labels)
Example #11
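# Image-to-text retrieval: score candidate texts for each image, optionally reranking the top candidates with an exact predictor.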
def predicts(imgs, img_features, predictor, rank_metrics, exact_predictor=None, exact_ratio=1.):
  # TODO gpu out-of-memory when predicting for showandtell
  if exact_predictor is None:
    if assistant_predictor is not None:
      exact_predictor = predictor
      predictor = assistant_predictor

  #print(predictor, exact_predictor)

  if isinstance(img_features[0], np.string_):
    assert len(img_features) < 2000  #otherwise too much memory
    img_features = np.array([melt.read_image(pic_path) for pic_path in img_features])  

  img2text = get_bidrectional_lable_map()

  random = True
  need_shuffle = False
  if FLAGS.max_texts > 0 and len(all_distinct_texts) > FLAGS.max_texts:
    assert random
    if not random:
      texts = all_distinct_texts[:FLAGS.max_texts]
    else:
      need_shuffle = True

      all_hits = set()
      for img in (imgs):
        hits = img2text[img]
        for hit in hits:
          all_hits.add(hit)
      
      index = np.random.choice(len(all_distinct_texts), FLAGS.max_texts, replace=False)
      index = [x for x in index if x not in all_hits]
      index = list(all_hits) + index 
      index = index[:FLAGS.max_texts]
      index = np.array(index)
      texts = all_distinct_texts[index]
  else:
    texts = all_distinct_texts
  text_strs = all_distinct_text_strs

  step = len(texts)
  if FLAGS.metric_eval_texts_size > 0 and FLAGS.metric_eval_texts_size < step:
    step = FLAGS.metric_eval_texts_size
  start = 0
  scores = []
  while start < len(texts):
    end = start + step 
    if end > len(texts):
      end = len(texts)
    #print('predicts texts start:', start, 'end:', end, end='\r', file=sys.stderr)
    score = predictor.predict(img_features, texts[start: end])
    scores.append(score)
    start = end
  score = np.concatenate(scores, 1)
  #print('image_feature_shape:', img_features.shape, 'text_feature_shape:', texts.shape, 'score_shape:', score.shape)
  num_texts = texts.shape[0]

  for i, img in enumerate(imgs):
    indexes = (-score[i]).argsort()
    #rerank
    if exact_predictor:
      top_indexes = indexes[:FLAGS.assistant_rerank_num]
      exact_texts = texts[top_indexes]
      exact_score = exact_predictor.elementwise_predict([img_features[i]], exact_texts)
      exact_score = np.squeeze(exact_score)
      if exact_ratio < 1.:
        for j in range(len(top_indexes)):
          exact_score[j] = exact_ratio * exact_score[j] + (1. - exact_ratio) * score[i][top_indexes[j]]

      #print(exact_score)
      exact_indexes = (-exact_score).argsort()

      #print(exact_indexes)
      
      new_indexes = [x for x in indexes]
      for j in range(len(exact_indexes)):
        new_indexes[j] = indexes[exact_indexes[j]]
      indexes = new_indexes

    hits = img2text[img]

    if FLAGS.show_info_interval and i % FLAGS.show_info_interval == 0:
      label_text = '|'.join([text_strs[x] for x in hits])
      img_str = img
      if is_img(img):
        img_str = '{0}<p><a href={1} target=_blank><img src={1} height=200></a></p>'.format(img, get_img_url(img))
      logging.info('<P>obj: {} label: {}</P>'.format(img_str, label_text))
      for j in range(5):
        is_hit = indexes[j] in hits if not need_shuffle else index[indexes[j]] in hits
        logging.info('<P>{} {} {} {}</P>'.format(j, is_hit, ids2text(texts[indexes[j]]), exact_score[exact_indexes[j]] if exact_predictor else score[i][indexes[j]]))

    #notice this only works for recall@ or precision@, not for ndcg@; ndcg@ must use all positions
    num_positions = min(num_texts, FLAGS.metric_topn)
    #num_positions = num_texts

    if not need_shuffle:
      labels = [indexes[j] in hits for j in xrange(num_positions)]
    else:
      labels = [index[indexes[j]] in hits for j in xrange(num_positions)]

    rank_metrics.add(labels)