def _text2ids(text, max_words):
  word_ids = text2ids.text2ids(text,
                               seg_method=FLAGS.seg_method_,
                               feed_single=FLAGS.feed_single_,
                               allow_all_zero=True,
                               pad=False)
  # Note: unlike the variant below, this one does not truncate to max_words
  # before padding.
  word_ids = gezi.pad(word_ids, max_words, 0)
  return word_ids
def _text2ids(text, max_words):
  word_ids = text2ids.text2ids(text,
                               seg_method=FLAGS.seg_method,
                               feed_single=FLAGS.feed_single,
                               allow_all_zero=True,
                               pad=False)
  word_ids_length = len(word_ids)  # length before truncation (unused here)
  word_ids = word_ids[:max_words]
  word_ids = gezi.pad(word_ids, max_words, 0)
  return word_ids
def _text2ids(text, max_words):
  word_ids = text2ids.text2ids(text,
                               seg_method=FLAGS.seg_method_,
                               feed_single=FLAGS.feed_single_,
                               append_start=False,
                               append_end=False,
                               allow_all_zero=True,
                               pad=True,
                               max_words=max_words)
  return word_ids
def _text2ids(text, max_words):
  # Same as above but with hard-coded segmentation settings instead of FLAGS.
  word_ids = text2ids.text2ids(text,
                               seg_method='basic',
                               feed_single=True,
                               allow_all_zero=True,
                               pad=False)
  word_ids = word_ids[:max_words]
  word_ids = gezi.pad(word_ids, max_words, 0)
  return word_ids
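# A minimal usage sketch for the _text2ids helpers above. The vocab path is
# hypothetical; text2ids.init must have loaded the vocabulary first:
text2ids.init('/path/to/vocab.txt')
ids = _text2ids('雅诗兰黛水润霜', max_words=50)  # "Estee Lauder hydrating cream"
# The truncating variants return exactly max_words entries, 0-padded at the end.
assert len(ids) == 50
print(ids)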
def predict(text):
  timer = gezi.Timer()
  text_ids = text2ids.text2ids(text, FLAGS.seg_method_, feed_single=True)
  print('text_ids', text_ids)

  argmax_encode = predictor.inference(['text_importance'],
                                      feed_dict={'rnn/main/text:0': [text_ids]})
  print('argmax_encode', argmax_encode[0])
  argmax_encode = argmax_encode[0][0]

  text_ids = text2ids.text2ids(text, FLAGS.seg_method_, feed_single=True,
                               append_start=True, append_end=True)
  words = text2ids.ids2words(text_ids)
  # Count tokens up to the first 0 (padding).
  seq_len = 0
  for x in words:
    if x != 0:
      seq_len += 1
    else:
      break
  print(text_ids)

  # Visualize how often the attention argmax lands on each position.
  import matplotlib.pyplot as plt
  argmaxs = [np.sum((argmax_encode == k)) for k in range(seq_len)]
  print('argmaxs', argmaxs, np.sum(argmaxs), seq_len)
  x = range(len(argmax_encode))  # x axis for plotting (plot itself omitted here)
  y = [100.0 * n / np.sum(argmaxs) for n in argmaxs]
  print(text)
  for word, score in zip(words, y):
    print(word, score)
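# A hedged driver sketch for the importance demo above: predict() reads the
# module-level `predictor` and FLAGS, so both paths below are hypothetical and
# melt.Predictor is an assumption about how `predictor` was built.
text2ids.init('/path/to/vocab.txt')
predictor = melt.Predictor('/path/to/model.ckpt')
predict('雅诗兰黛小棕瓶')  # prints each word with its share of attention argmaxes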
def predict(self, inputs, seg_method='basic', feed_single=True, max_words=None):
  if not isinstance(inputs, (list, tuple, np.ndarray)):
    inputs = [inputs]
  # Raw strings (first element of the first item is a character) are converted
  # to id sequences; anything else is assumed to be ids already.
  if isinstance(inputs[0][0], str):
    word_ids = [text2ids.text2ids(text,
                                  seg_method=seg_method,
                                  feed_single=feed_single,
                                  max_words=max_words)
                for text in inputs]
  else:
    word_ids = inputs
  return self.predictor.predict(word_ids), word_ids
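# Usage sketch, assuming `model` is an instance of the class defining predict():
scores, word_ids = model.predict(['雅诗兰黛红石榴', '婷美矿物泉补水精华'],
                                 max_words=50)
# Already-segmented id sequences bypass text2ids and are passed through as-is:
scores_again, _ = model.predict(word_ids)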
END_WORD = '</S>'
NUM_WORD = '<NUM>'

print('seg_method:', FLAGS.seg_method_, file=sys.stderr)

num = 0
for line in sys.stdin:
  if num % 10000 == 0:
    print(num, file=sys.stderr)
  l = line.rstrip().split('\t')
  texts = l[1].split('\x01')
  for text in texts:
    ids = text2ids.text2ids(text,
                            seg_method=FLAGS.seg_method_,
                            feed_single=FLAGS.feed_single_,
                            allow_all_zero=True,
                            pad=False,
                            append_start=True,
                            append_end=True,
                            to_lower=True,
                            norm_digit=True)
    if num % 10000 == 0:
      print(ids, file=sys.stderr)
      print(text2ids.ids2text(ids), file=sys.stderr)
    # Materialize the map so the emptiness check also works under Python 3,
    # where map() returns a lazy (always truthy) iterator.
    ids = list(map(str, ids))
    if ids:
      print('\t'.join(ids))
  num += 1
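# Input is expected as "key\ttext1\x01text2\x01..." per line on stdin; each
# text becomes one tab-joined id sequence on stdout, wrapped with start/end
# ids because append_start/append_end are set. A hypothetical invocation
# (the script name is an assumption):
#
#   cat titles.txt | python gen-text2ids.py > title_ids.txt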
dir = '/home/gezi/new/temp/makeup/title2name/tfrecord/seq-basic/'
text2ids.init(os.path.join(dir, 'vocab.txt'))
vocab = text2ids.vocab

embsim = melt.EmbeddingSim(os.path.join(dir, 'word2vec'), name='w_in')

corpus_pattern = os.path.join('/home/gezi/data/product/makeup/tb/title2name/valid/*')

max_words = 50

#itexts = ['雅诗兰黛水润霜', '雅诗兰黛小棕瓶', '雅诗兰黛红石榴', '婷美矿物泉补水精华',
#          'Adidas阿迪达斯男士香水男士古龙淡香水 冰点男香100ml【京东超市】']
itexts = ['雅诗兰黛anr修护肌透精华露']
left_ids = [text2ids.text2ids(x, seg_method='basic', feed_single=True, max_words=max_words)
            for x in itexts]

lids_ = tf.placeholder(dtype=tf.int32, shape=[None, max_words])
rids_ = tf.placeholder(dtype=tf.int32, shape=[None, max_words])
nids_ = embsim.top_sim(lids_, rids_)
sess = embsim._sess

# Right side: all candidate names from the validation corpus, deduplicated.
corpus_text = []
for file in glob.glob(corpus_pattern):
  corpus_text += open(file).readlines()
corpus_text = [x.strip() for x in corpus_text]
r_text = [x.split('\t')[1] for x in corpus_text]
r_text = list(set(r_text))
right_ids = [text2ids.text2ids(x, seg_method='basic', feed_single=True, max_words=max_words)
             for x in r_text]
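# A hedged sketch of actually running the similarity graph built above. That
# top_sim returns a [num_left, num_right] matrix of scores is an assumption;
# the feed/run mechanics are plain TensorFlow:
sims = sess.run(nids_, feed_dict={lids_: left_ids, rids_: right_ids})
# Under that assumption, rank corpus titles against the single query title:
for title, score in sorted(zip(r_text, sims[0]), key=lambda p: -p[1])[:10]:
  print(title, score)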
image_file = raw_input('image_file like 6275b5349168ac3fab6a493c509301d023cf39d3.jpg:')
if image_file == 'q':
  break
image_path = os.path.join(image_dir, image_file)
print('image_path:', image_path)
if not os.path.exists(image_path):
  print('image path not found!')
  continue
try:
  hits = img2text[image_file]
  texts = [text_strs[hit] for hit in hits]
  for text in texts:
    word_ids = text2ids.text2ids(text)
    seg_text = text2ids.ids2text(word_ids, print_end=False)
    print('label:', text, seg_text)
    words_importance = sim_predictor.words_importance([word_ids])
    words_importance = words_importance[0]
    print('word importance:')
    for i in range(len(word_ids)):
      if word_ids[i] == 0:  # stop at padding
        break
      print(vocab.key(int(word_ids[i])), words_importance[i], end='|')
    print()
except Exception:
  print(traceback.format_exc(), file=sys.stderr)
image = melt.read_image(image_path)
from libword_counter import Vocabulary
from deepiu.util import text2ids

dir = '/home/gezi/new/temp/makeup/title2name/tfrecord/seq-basic/'
text2ids.init(os.path.join(dir, 'vocab.txt'))
vocab = text2ids.vocab

embsim = melt.EmbeddingSim(os.path.join(dir, 'word2vec'), name='w_in')

corpus_file = os.path.join('/home/gezi/data/product/makeup/tb/title2name/valid/name.filtered.rand.valid.txt_0')

max_words = 50

itext = '雅诗兰黛水润霜'  # Estee Lauder hydrating cream
left_ids = text2ids.text2ids(itext, seg_method='basic', feed_single=True, max_words=max_words)

corpus_text = open(corpus_file).readlines()
corpus_text = [x.split()[0] for x in corpus_text]
right_ids = [text2ids.text2ids(x, seg_method='basic', feed_single=True, max_words=max_words)
             for x in corpus_text[:1000]]
print(right_ids)

lids_ = tf.placeholder(dtype=tf.int32, shape=[None, max_words])
rids_ = tf.placeholder(dtype=tf.int32, shape=[None, max_words])
nids_ = embsim.top_sim(lids_, rids_)
sess = embsim._sess
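# Round-trip sanity check, decoding the query ids back to segmented text with
# ids2text (used the same way elsewhere in these scripts):
print(text2ids.ids2text(left_ids))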
vocab = text2ids.vocab

embsim = melt.EmbeddingSim(os.path.join(dir, 'word2vec'), name='w_in')

corpus_pattern = os.path.join('/home/gezi/data/product/makeup/tb/title2name/valid/*')

max_words = 50

itexts = ['雅诗兰黛水润霜',
          '雅诗兰黛小棕瓶',
          '雅诗兰黛红石榴',
          '婷美矿物泉补水精华',
          'Adidas阿迪达斯男士香水男士古龙淡香水 冰点男香100ml【京东超市】']
left_ids = [text2ids.text2ids(x, seg_method='basic', feed_single=True, max_words=max_words)
            for x in itexts]

lids_ = tf.placeholder(dtype=tf.int32, shape=[None, max_words])
rids_ = tf.placeholder(dtype=tf.int32, shape=[None, max_words])
nids_ = embsim.top_sim(lids_, rids_)
sess = embsim._sess

corpus_text = []
for file in glob.glob(corpus_pattern):
  corpus_text += open(file).readlines()
corpus_text = [x.strip() for x in corpus_text]
r_text = [x.split('\t')[1] for x in corpus_text]
r_text = list(set(r_text))
def deal_file(file, thread_index):
  out_file = '{}/{}_{}'.format(FLAGS.output_directory, FLAGS.name, thread_index) \
      if FLAGS.threads > 1 else '{}/{}'.format(FLAGS.output_directory, FLAGS.name)
  print('out_file:', out_file)
  with melt.tfrecords.Writer(out_file) as writer:
    num = 0
    for line in open(file):
      if num % 1000 == 0:
        print(num)
      l = line.rstrip().split('\t')
      img = l[0]
      img_end = IMAGE_FEATURE_LEN + 1
      img_feature = [float(x) for x in l[1:img_end]]
      texts = [x.split('\x01')[0] for x in l[img_end:]]
      for text in texts:
        if text.strip() == '':
          continue
        #@TODO text -> ids should move out so online code can share it for
        #evaluation or use in a feed dict
        word_ids = text2ids.text2ids(text,
                                     seg_method=FLAGS.seg_method,
                                     feed_single=FLAGS.feed_single,
                                     allow_all_zero=True,
                                     pad=False)
        word_ids_length = len(word_ids)
        if num % 1000 == 0:
          print(text, word_ids, text2ids.ids2text(word_ids), file=sys.stderr)
        if len(word_ids) == 0:
          continue
        word_ids = word_ids[:TEXT_MAX_WORDS]
        if FLAGS.pad:
          word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
        if FLAGS.np_save:
          gtexts[thread_index].append(word_ids)
          gtext_strs[thread_index].append(text)
        #add pos info? weight info? or @TODO add click num info
        example = tf.train.Example(features=tf.train.Features(feature={
            'image_name': melt.bytes_feature(img),
            'image_feature': melt.float_feature(img_feature),
            'text': melt.int_feature(word_ids),
            'text_str': melt.bytes_feature(text),
        }))
        writer.write(example)

        # Shared counters across worker processes.
        global counter, max_num_words, sum_words
        with counter.get_lock():
          counter.value += 1
        if word_ids_length > max_num_words.value:
          with max_num_words.get_lock():
            max_num_words.value = word_ids_length
        with sum_words.get_lock():
          sum_words.value += word_ids_length
      num += 1

  texts_dict[thread_index] = gtexts[thread_index]
  text_strs_dict[thread_index] = gtext_strs[thread_index]
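# A hedged sketch of reading one record back to verify what deal_file wrote.
# The record path is hypothetical, and 'text' being an int64_list assumes
# melt.int_feature wraps tf.train.Int64List:
import tensorflow as tf

for record in tf.python_io.tf_record_iterator('/path/to/output_0'):
  example = tf.train.Example()
  example.ParseFromString(record)
  feats = example.features.feature
  print(feats['image_name'].bytes_list.value[0],
        feats['text_str'].bytes_list.value[0],
        list(feats['text'].int64_list.value))
  break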