num = 0 count = 0 for line in sys.stdin: if num % 1000 == 0: print(num, file=sys.stderr) num += 1 l = line.rstrip().split('\t') img = l[0] img_end = IMAGE_FEATURE_LEN + 1 img_feature = [float(x) for x in l[1:img_end]] texts = [x.split('\x01')[0] for x in l[img_end:]] for text in texts: words = segmentor.Segment(text, FLAGS.seg_method) word_ids = [ vocabulary.id(word) for word in words if vocabulary.has(word) or ENCODE_UNK ] word_ids_length = len(word_ids) if num % 1000 == 0: #print(libgezi.gbk2utf8('\t'.join(words)), file=sys.stderr) print('\t'.join(words), file=sys.stderr) print(word_ids, file=sys.stderr) if len(word_ids) == 0: continue word_ids = word_ids[:TEXT_MAX_WORDS] if FLAGS.pad: word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0) if writer is not None: example = tf.train.Example(features=tf.train.Features( feature={
# Load a trained checkpoint and prepare padded word-id vectors for each test
# text so they can be scored against image features via bulk_predict().
predictor = melt.Predictor('./model.ckpt-12000')
#vocabulary.init()
#vocab = vocabulary.vocab
vocab = Vocabulary(FLAGS.vocab, NUM_RESERVED_IDS)

ids_list = []
text_list = []
# NOTE(review): fixed a resource leak — the original never closed the file.
with open('./test.txt') as f:
    for line in f:
        # The text to score is the last tab-separated field of each line.
        text = line.strip().split('\t')[-1]
        text_list.append(text)
        # Map words to vocab ids; unknown words are kept only when
        # ENCODE_UNK is truthy (presumably mapping to an UNK id — TODO confirm).
        # (Removed an unused `words = line.split()` dead local from the original.)
        ids = [
            vocab.id(word) for word in text.split(WORDS_SEP)
            if vocab.has(word) or ENCODE_UNK
        ]
        # Pad/truncate to a fixed length so rows stack into one array.
        ids = gezi.pad(ids, TEXT_MAX_WORDS)
        ids_list.append(ids)
ids_list = np.array(ids_list)


def bulk_predict(predictor, images, texts):
    """Run the 'score' inference op for a batch of images and texts.

    Args:
        predictor: a melt.Predictor with an `inference(op_name, feed)` method.
        images: batch of image feature vectors fed to the image placeholder.
        texts: batch of padded word-id vectors fed to the text placeholder.

    Returns:
        The scores produced by the 'score' op (shape determined by the model).
    """
    scores = predictor.inference(
        'score', {
            '%s/%s' % (FLAGS.algo, FLAGS.image_feature_place): images,
            '%s/%s' % (FLAGS.algo, FLAGS.text_place): texts
        })
    return scores