def __init__(self):
  melt.PredictorBase.__init__(self)
  ShowAndTell.__init__(self, is_training=False, is_predict=True)

  if FLAGS.pre_calc_image_feature:
    self.image_feature_len = FLAGS.image_feature_len or IMAGE_FEATURE_LEN
    # TODO for rl, need to use a feed dict, so predict will introduce ... need to feed, how to use with_default?
    #self.image_feature_feed = tf.placeholder(tf.float32, [None, self.image_feature_len], name='image_feature')
    self.image_feature_feed = tf.placeholder_with_default(
        [[0.] * self.image_feature_len], [None, self.image_feature_len],
        name='image_feature')
  else:
    #self.image_feature_feed = tf.placeholder(tf.string, [None,], name='image_feature')
    # TODO HACK for nasnet... needed because it uses moving average decay
    if os.path.exists('./test.jpg'):
      test_image = melt.read_image('./test.jpg')
    elif os.path.exists('/tmp/test.jpg'):
      test_image = melt.read_image('/tmp/test.jpg')
    else:
      test_image = None
    if test_image is not None:
      self.image_feature_feed = tf.placeholder_with_default(
          tf.constant([test_image]), [None,], name='image_feature')
    else:
      assert not FLAGS.image_model_name.startswith('nasnet'), \
          'HACK for nasnet: you need one test.jpg in the current path or in /tmp/'
      self.image_feature_feed = tf.placeholder(tf.string, [None,], name='image_feature')

  tf.add_to_collection('feed', self.image_feature_feed)
  tf.add_to_collection('lfeed', self.image_feature_feed)

  self.text_feed = tf.placeholder(tf.int64, [None, TEXT_MAX_WORDS], name='text')
  tf.add_to_collection('rfeed', self.text_feed)

  self.text = None
  self.text_score = None
  self.beam_text = None
  self.beam_text_score = None

  self.image_model = None

  self.logprobs_history = False
  self.alignment_history = False

  self.feed_dict = {}
def get_image_feature_feed(self):
  if self.image_feature_feed is None:
    if FLAGS.pre_calc_image_feature:
      self.image_feature_feed = tf.placeholder(
          tf.float32, [None, self.image_feature_len], name='image_feature')
    else:
      # for nasnet you need to always feed this. WHY? TODO FIXME
      if os.path.exists('./test.jpg'):
        test_image = melt.read_image('./test.jpg')
      elif os.path.exists('/tmp/test.jpg'):
        test_image = melt.read_image('/tmp/test.jpg')
      else:
        test_image = None
      if test_image is not None:
        self.image_feature_feed = tf.placeholder_with_default(
            tf.constant([test_image]), [None,], name='image_feature')
      else:
        assert not FLAGS.image_model_name.startswith('nasnet'), \
            'HACK for nasnet: you need one test.jpg in the current path or in /tmp/'
        self.image_feature_feed = tf.placeholder(tf.string, [None,], name='image_feature')
    tf.add_to_collection('feed', self.image_feature_feed)
    tf.add_to_collection('lfeed', self.image_feature_feed)
  return self.image_feature_feed
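# A minimal standalone sketch (TF 1.x; tensor names here are hypothetical, not
# from this codebase) of the placeholder_with_default pattern used above: the
# graph runs with the baked-in default value when nothing is fed, and a
# feed_dict entry overrides it.
import tensorflow as tf

demo_feat = tf.placeholder_with_default(
    [[0.] * 4], [None, 4], name='demo_image_feature')
demo_out = demo_feat * 2.
with tf.Session() as sess:
  print(sess.run(demo_out))                                   # uses the default
  print(sess.run(demo_out, {demo_feat: [[1., 2., 3., 4.]]}))  # feed overrides it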
def translation_predicts(imgs, img_features, predictor, results):
  if isinstance(img_features[0], np.string_):
    img_features = np.array([melt.read_image(pic_path) for pic_path in img_features])
  texts, _ = predictor.predict_text(img_features)
  # only use the top prediction of beam search
  texts = [x[0] for x in texts]
  for i in range(len(texts)):
    # for eval, even a single prediction must be wrapped in a list; also exclude the trailing end id
    if not FLAGS.eval_translation_reseg:
      texts[i] = [' '.join([str(x) for x in texts[i][:list(texts[i]).index(vocab.end_id())]])]
    else:
      import jieba
      texts[i] = ''.join([vocab.key(int(x)) for x in texts[i][:list(texts[i]).index(vocab.end_id())]])
      texts[i] = [' '.join([x.encode('utf-8') for x in jieba.cut(texts[i])])]
    results[imgs[i]] = texts[i]
def convert_to_tfrecord(input_files, output_file):
  """Converts a set of image files to TFRecords."""
  print('Generating %s' % output_file)
  with tf.python_io.TFRecordWriter(output_file) as record_writer:
    for input_file in tqdm(input_files, ascii=True):
      id = os.path.basename(input_file)[:-4]
      #img = cv2.imread(input_file)
      img = melt.read_image(input_file)
      # turn to channel first
      #img = img.transpose(2,0,1)
      if 'test' not in output_file:
        label = m[id]
      else:
        label = -1
      example = tf.train.Example(features=tf.train.Features(
          feature={
              'id': melt.bytes_feature(id),
              #'image': melt.bytes_feature(img.tobytes()),
              'image': melt.bytes_feature(img),
              'label': melt.int64_feature(label)
          }))
      record_writer.write(example.SerializeToString())
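# A hedged read-back sketch for the records written by convert_to_tfrecord,
# assuming the 'image' field holds encoded image bytes as returned by
# melt.read_image; the file name is hypothetical. tf.python_io.tf_record_iterator
# is the plain TF 1.x reader, and the field names mirror the writer above.
import tensorflow as tf

for serialized in tf.python_io.tf_record_iterator('train.tfrecord'):  # hypothetical path
  example = tf.train.Example()
  example.ParseFromString(serialized)
  feat = example.features.feature
  print(feat['id'].bytes_list.value[0],
        feat['label'].int64_list.value[0],
        len(feat['image'].bytes_list.value[0]))
  break  # just inspect the first record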
def deal_file_with_imgdir(file):
  out_file = '{}/{}'.format(
      FLAGS.output_directory,
      '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
  print('file:', file, 'out_file:', out_file, file=sys.stderr)
  with melt.tfrecords.Writer(out_file) as writer:
    num = 0
    for line in open(file):
      if num % 1000 == 0:
        print(num, file=sys.stderr)
      l = line.rstrip('\n').split('\t')
      img = l[0]
      texts = l[FLAGS.text_index].split('\x01')
      image_path = os.path.join(FLAGS.image_dir, img.replace('/', '_'))
      encoded_image = melt.read_image(image_path)
      is_top_text = True
      for text in texts:
        text = normalize.norm(text)
        if text.strip() == '':
          print('empty line', line, file=sys.stderr)
          continue

        word_ids = _text2ids(text, TEXT_MAX_WORDS)
        word_ids_length = len(word_ids)
        if num % 10000 == 0:
          print(img, text, word_ids, text2ids.ids2text(word_ids), file=sys.stderr)
        if len(word_ids) == 0:
          print('empty word ids!', file=sys.stderr)
          print(img, text, word_ids, text2ids.ids2text(word_ids), file=sys.stderr)
          continue
        #if is_luanma(words, word_ids):
        #  print('luanma', img, text, word_ids, text2ids.ids2text(word_ids), len(image_feature), file=sys.stderr)
        #  continue

        word_ids = word_ids[:TEXT_MAX_WORDS]
        if FLAGS.pad:
          word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)

        if not FLAGS.write_sequence_example:
          example = tf.train.Example(features=tf.train.Features(
              feature={
                  'image_name': melt.bytes_feature(img),
                  'image_data': melt.bytes_feature(encoded_image),
                  'text_str': melt.bytes_feature(text),
                  'text': melt.int64_feature(word_ids),
              }))
        else:
          example = tf.train.SequenceExample(
              context=melt.features({
                  'image_name': melt.bytes_feature(img),
                  'image_data': melt.bytes_feature(encoded_image),
                  'text_str': melt.bytes_feature(text),
              }),
              feature_lists=melt.feature_lists(
                  {'text': melt.int64_feature_list(word_ids)}))
        writer.write(example)

        #global counter, max_num_words, sum_words
        with record_counter.get_lock():
          record_counter.value += 1
        if word_ids_length > max_num_words.value:
          with max_num_words.get_lock():
            max_num_words.value = word_ids_length
        with sum_words.get_lock():
          sum_words.value += word_ids_length

        if FLAGS.np_save:
          assert FLAGS.threads == 1
          gtexts.append(word_ids)
          gtext_strs.append(text)

          # Deprecated: image_labels is no longer used
          if img not in image_labels:
            image_labels[img] = set()
          image_labels[img].add(text)

        if is_top_text:
          is_top_text = False
          with image_counter.get_lock():
            image_counter.value += 1

          if FLAGS.np_save:
            if img not in image_labels:
              image_labels[img] = set()
            image_names.append(img)
            # actually save the pic path instead of the image feature
            image_features.append(
                os.path.join(FLAGS.image_dir, img.replace('/', '_')))

          if FLAGS.num_max_records > 0:
            # for a fixed valid set, only take one click text per image
            break

      num += 1
      if num == FLAGS.num_max_records:
        break
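# A hedged parsing sketch for the SequenceExample branch above (TF 1.x): the
# context and feature-list names mirror what deal_file_with_imgdir writes;
# everything else here is an assumption, not the project's actual input pipeline.
import tensorflow as tf

def parse_caption_sequence_example(serialized):
  context, sequence = tf.parse_single_sequence_example(
      serialized,
      context_features={
          'image_name': tf.FixedLenFeature([], tf.string),
          'image_data': tf.FixedLenFeature([], tf.string),
          'text_str': tf.FixedLenFeature([], tf.string),
      },
      sequence_features={
          'text': tf.FixedLenSequenceFeature([], tf.int64),
      })
  return context['image_data'], sequence['text']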
try:
  word_ids = text2ids.text2ids(text)
  seg_text = text2ids.ids2text(word_ids, print_end=False)
  print('label:', text, seg_text)
  words_importance = sim_predictor.words_importance([word_ids])
  words_importance = words_importance[0]
  print('word importance:')
  for i in range(len(word_ids)):
    if word_ids[i] == 0:
      break
    print(vocab.key(int(word_ids[i])), words_importance[i], end='|')
  print()
except Exception:
  print(traceback.format_exc(), file=sys.stderr)

image = melt.read_image(image_path)
word_ids, scores = predictor.word_ids([image])
word_id = word_ids[0]
score = scores[0]
print('best predict:', ids2text.translate(word_id[0]), score[0],
      '/'.join([vocab.key(int(id)) for id in word_id[0] if id != vocab.end_id()]))
l = [id for id in word_id[0] if id != vocab.end_id()]
l = gezi.pad(l, TEXT_MAX_WORDS)
words_importance = sim_predictor.words_importance([l])
words_importance = words_importance[0]
print('word importance:')
for i in range(len(word_id[0])):
  if word_id[0][i] == vocab.end_id():
    break
  print(vocab.key(int(word_id[0][i])), words_importance[i], end='|')
import melt

p = melt.SimplePredictor(
    './mount/temp/cifar10/model/resnet.momentum.decay/epoch/model.ckpt-30.00-10530',
    key='pre_logits')
feature = p.inference([melt.read_image('./mount/data/kaggle/cifar-10/test/10.png')])
print(feature)
def predict(image_name, num_show=1):
  image_path = os.path.join(image_dir, image_name)
  if not os.path.exists(image_path):
    print('path not exists: %s' % image_path)
    return

  img = melt.read_image(image_path)
  feature = image_model.gen_feature(img) if image_model is not None else img

  timer = gezi.Timer()
  init_states = predictor.inference(
      [
          'beam_search_beam_size',
          'beam_search_initial_state',
          'beam_search_initial_ids',
          'beam_search_initial_logprobs',
          'beam_search_initial_alignments'
      ],
      feed_dict={tf.get_collection('feed')[0]: feature})

  step_func = lambda input_feed, state_feed: predictor.inference(
      [
          'beam_search_state',
          'beam_search_ids',
          'beam_search_logprobs',
          'beam_search_alignments',
      ],
      feed_dict={
          # TODO... attention still needs the input_text feed, see rnn_decoder.py beam_search_step,
          # but it does not hurt performance much because the encoder is fast. Is it possible to avoid this?
          # Anyway, without attention the input_text feed is not needed.
          tf.get_collection('feed')[0]: feature,
          tf.get_collection('beam_search_input_feed')[0]: input_feed,
          tf.get_collection('beam_search_state_feed')[0]: state_feed
      })

  beams = melt.seq2seq.beam_search(
      init_states,
      step_func,
      end_id=ids2text.end_id(),
      max_words=decode_max_words,
      length_normalization_factor=1.)

  for i, beam in enumerate(beams):
    print(i, beam.words, ids2text.ids2text(beam.words), math.exp(beam.score))

    # Plot the image with per-word attention weights
    words = beam.words
    img = ndimage.imread(image_path)
    num_features = melt.image.get_num_features(image_model_name)
    dim = int(np.sqrt(num_features))
    #print('dim:', dim)

    n_words = len(words)
    w = np.round(np.sqrt(n_words))
    h = np.ceil(np.float32(n_words) / w)

    plt.subplot(w, h, 1)
    plt.imshow(img)
    plt.axis('off')

    #img = scipy.misc.imresize(img, (dim, dim))
    smooth = True
    # TODO smooth=True seems not to work on non-background pics
    smooth = False

    if i == 0:
      for j in range(len(words)):
        plt.subplot(w, h, j + 2)
        lab = pinyin.Convert(
            ids2text.vocab.key(words[j]).decode('utf8').encode('gbk'))
        lab += '(%0.2f)' % math.exp(beam.logprobs[j])
        plt.text(0, 1, lab, backgroundcolor='white', fontsize=10)
        plt.text(0, 1, lab, color='black', fontsize=10)
        plt.imshow(img)
        if smooth:
          alpha_img = skimage.transform.pyramid_expand(
              beam.alignments_list[j].reshape(dim, dim), upscale=16, sigma=20)
        else:
          alpha_img = skimage.transform.resize(
              beam.alignments_list[j].reshape(dim, dim),
              [img.shape[0], img.shape[1]])
        plt.imshow(alpha_img, alpha=0.8)
        plt.set_cmap(cm.Greys_r)
        plt.axis('off')
    #plt.show()
    plt.savefig('test%d.pdf' % i)
import sys, os

from deepiu.util.sim_predictor import SimPredictor
from deepiu.util import vocabulary
import melt

image_dir = '/home/gezi/data2/data/ai_challenger/image_caption/pic/'
image_file = '6275b5349168ac3fab6a493c509301d023cf39d3.jpg'
if len(sys.argv) > 1:
  image_file = sys.argv[1]
image_path = os.path.join(image_dir, image_file)

image_model_checkpoint_path = '/home/gezi/data/image_model_check_point/inception_resnet_v2_2016_08_30.ckpt'
model_dir = '/home/gezi/new/temp/image-caption/ai-challenger/model/bow/'
vocab_path = '/home/gezi/new/temp/image-caption/ai-challenger/tfrecord/seq-basic/vocab.txt'

vocabulary.init(vocab_path)
vocab = vocabulary.vocab

predictor = SimPredictor(model_dir, image_model_checkpoint_path,
                         image_model_name='InceptionResnetV2')

scores, word_ids = predictor.top_words([melt.read_image(image_path)], 50)
scores = scores[0]
word_ids = word_ids[0]

for word_id, score in zip(word_ids, scores):
  print(vocab.key(int(word_id)), score)
def predicts_txt2im(text_strs, texts, predictor, rank_metrics, exact_predictor=None):
  timer = gezi.Timer('predictor.predict text2im')
  if exact_predictor is None:
    if assistant_predictor:
      exact_predictor = predictor
      predictor = assistant_predictor

  _, img_features = get_image_names_and_features()

  # TODO gpu out-of-mem predict for showandtell
  # NOTICE this might cost too much memory if the image is the original encoded binary, not an image feature
  img_features = img_features[:FLAGS.max_images]
  if isinstance(img_features[0], np.string_):
    assert len(img_features) < 2000  # otherwise too much memory..
    img_features = np.array([melt.read_image(pic_path) for pic_path in img_features])

  step = len(img_features)
  if FLAGS.metric_eval_images_size > 0 and FLAGS.metric_eval_images_size < step:
    step = FLAGS.metric_eval_images_size

  start = 0
  scores = []
  while start < len(img_features):
    end = start + step
    if end > len(img_features):
      end = len(img_features)
    #print('predicts images start:', start, 'end:', end, file=sys.stderr, end='\r')
    # this might not accept a raw image when the bow predictor is the assistant predictor;
    # TODO how to add image processing here to generate the feature first?
    score = predictor.predict(img_features[start:end], texts)
    scores.append(score)
    start = end
  #score = predictor.predict(img_features, texts)
  score = np.concatenate(scores, 0)
  score = score.transpose()
  #print('image_feature_shape:', img_features.shape, 'text_feature_shape:', texts.shape, 'score_shape:', score.shape)
  timer.print()

  text2img = get_bidrectional_lable_map_txt2im()
  num_imgs = img_features.shape[0]

  for i, text_str in enumerate(text_strs):
    indexes = (-score[i]).argsort()

    # rerank the top candidates with the exact (slower but more accurate) predictor
    if exact_predictor:
      top_indexes = indexes[:FLAGS.assistant_rerank_num]
      exact_imgs = img_features[top_indexes]
      exact_score = exact_predictor.elementwise_predict(exact_imgs, [texts[i]])
      exact_score = exact_score[0]
      exact_indexes = (-exact_score).argsort()

      new_indexes = [x for x in indexes]
      for j in range(len(exact_indexes)):
        new_indexes[j] = indexes[exact_indexes[j]]
      indexes = new_indexes

    hits = text2img[text_str]

    num_positions = min(num_imgs, FLAGS.metric_topn)
    #num_positions = num_imgs
    labels = [indexes[j] in hits for j in xrange(num_positions)]

    rank_metrics.add(labels)
def predicts(imgs, img_features, predictor, rank_metrics, exact_predictor=None, exact_ratio=1.):
  # TODO gpu out-of-mem predict for showandtell
  if exact_predictor is None:
    if assistant_predictor is not None:
      exact_predictor = predictor
      predictor = assistant_predictor
  #print(predictor, exact_predictor)

  if isinstance(img_features[0], np.string_):
    assert len(img_features) < 2000  # otherwise too much memory..
    img_features = np.array([melt.read_image(pic_path) for pic_path in img_features])

  img2text = get_bidrectional_lable_map()

  random = True
  need_shuffle = False
  if FLAGS.max_texts > 0 and len(all_distinct_texts) > FLAGS.max_texts:
    assert random
    if not random:
      texts = all_distinct_texts[:FLAGS.max_texts]
    else:
      need_shuffle = True
      all_hits = set()
      for img in imgs:
        hits = img2text[img]
        for hit in hits:
          all_hits.add(hit)
      index = np.random.choice(len(all_distinct_texts), FLAGS.max_texts, replace=False)
      index = [x for x in index if x not in all_hits]
      index = list(all_hits) + index
      index = index[:FLAGS.max_texts]
      index = np.array(index)
      texts = all_distinct_texts[index]
  else:
    texts = all_distinct_texts
  text_strs = all_distinct_text_strs

  step = len(texts)
  if FLAGS.metric_eval_texts_size > 0 and FLAGS.metric_eval_texts_size < step:
    step = FLAGS.metric_eval_texts_size

  start = 0
  scores = []
  while start < len(texts):
    end = start + step
    if end > len(texts):
      end = len(texts)
    #print('predicts texts start:', start, 'end:', end, end='\r', file=sys.stderr)
    score = predictor.predict(img_features, texts[start:end])
    scores.append(score)
    start = end
  score = np.concatenate(scores, 1)
  #print('image_feature_shape:', img_features.shape, 'text_feature_shape:', texts.shape, 'score_shape:', score.shape)

  num_texts = texts.shape[0]

  for i, img in enumerate(imgs):
    indexes = (-score[i]).argsort()

    # rerank the top candidates with the exact (slower but more accurate) predictor
    if exact_predictor:
      top_indexes = indexes[:FLAGS.assistant_rerank_num]
      exact_texts = texts[top_indexes]
      exact_score = exact_predictor.elementwise_predict([img_features[i]], exact_texts)
      exact_score = np.squeeze(exact_score)
      if exact_ratio < 1.:
        for j in range(len(top_indexes)):
          exact_score[j] = exact_ratio * exact_score[j] + (1. - exact_ratio) * score[i][top_indexes[j]]
      #print(exact_score)
      exact_indexes = (-exact_score).argsort()
      #print(exact_indexes)

      new_indexes = [x for x in indexes]
      for j in range(len(exact_indexes)):
        new_indexes[j] = indexes[exact_indexes[j]]
      indexes = new_indexes

    hits = img2text[img]

    if FLAGS.show_info_interval and i % FLAGS.show_info_interval == 0:
      label_text = '|'.join([text_strs[x] for x in hits])
      img_str = img
      if is_img(img):
        img_str = '{0}<p><a href={1} target=_blank><img src={1} height=200></a></p>'.format(
            img, get_img_url(img))
      logging.info('<P>obj: {} label: {}</P>'.format(img_str, label_text))
      for j in range(5):
        is_hit = indexes[j] in hits if not need_shuffle else index[indexes[j]] in hits
        logging.info('<P>{} {} {} {}</P>'.format(
            j, is_hit, ids2text(texts[indexes[j]]),
            exact_score[exact_indexes[j]] if exact_predictor else score[i][indexes[j]]))

    # NOTICE this only works for recall@k or precision@k, not for ndcg@k; ndcg@k must use all positions
    num_positions = min(num_texts, FLAGS.metric_topn)
    #num_positions = num_texts

    if not need_shuffle:
      labels = [indexes[j] in hits for j in xrange(num_positions)]
    else:
      labels = [index[indexes[j]] in hits for j in xrange(num_positions)]

    rank_metrics.add(labels)
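# A minimal numpy sketch (hypothetical scores) of the assistant/exact rerank
# used in predicts and predicts_txt2im: a fast predictor ranks all candidates,
# an exact predictor re-scores only the top-k, and the re-ordered top-k is
# spliced back over the head of the original ranking.
import numpy as np

fast_scores = np.array([0.1, 0.9, 0.4, 0.8, 0.2])
indexes = (-fast_scores).argsort()            # [1, 3, 2, 4, 0]
k = 3
exact_scores = np.array([0.5, 0.7, 0.95])     # exact scores for candidates 1, 3, 2
exact_indexes = (-exact_scores).argsort()     # [2, 1, 0]
new_indexes = list(indexes)
for j in range(k):
  new_indexes[j] = indexes[exact_indexes[j]]  # candidate 2 promoted to rank 0
print(new_indexes)                            # [2, 3, 1, 4, 0]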