Example #1
def _text2ids(text, max_words):
    word_ids = text2ids.text2ids(text,
                                 seg_method=FLAGS.seg_method_,
                                 feed_single=FLAGS.feed_single_,
                                 allow_all_zero=True,
                                 pad=False)
    word_ids = gezi.pad(word_ids, max_words, 0)

    return word_ids
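Note that this variant pads to max_words but never truncates, so inputs longer than max_words can yield id lists longer than max_words (unless gezi.pad also truncates); Example #2 below adds an explicit slice before padding. A minimal usage sketch, assuming text2ids.init takes a vocab path as in Examples #8 and #10 and that the FLAGS values are configured elsewhere:

# Sketch only: the vocab path is a placeholder and the input text is arbitrary.
text2ids.init('/path/to/vocab.txt')
ids = _text2ids('some input text', max_words=50)
print(len(ids), ids[:10])  # zero-padded up to max_words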
Example #2
def _text2ids(text, max_words):
  word_ids = text2ids.text2ids(text,
                               seg_method=FLAGS.seg_method,
                               feed_single=FLAGS.feed_single,
                               allow_all_zero=True,
                               pad=False)
  word_ids_length = len(word_ids)
  word_ids = word_ids[:max_words]
  word_ids = gezi.pad(word_ids, max_words, 0)
  return word_ids
Example #3
def _text2ids(text, max_words):
  word_ids = text2ids.text2ids(text, 
                               seg_method=FLAGS.seg_method_, 
                               feed_single=FLAGS.feed_single_, 
                               append_start=False,
                               append_end=False,
                               allow_all_zero=True, 
                               pad=True,
                               max_words=max_words)
  return word_ids
Example #4
def _text2ids(text, max_words):
  word_ids = text2ids.text2ids(text, 
                               seg_method='basic', 
                               feed_single=True, 
                               allow_all_zero=True, 
                               pad=False)
  word_ids = word_ids[:max_words]
  word_ids = gezi.pad(word_ids, max_words, 0)

  return word_ids
Example #5
def predict(text):
    timer = gezi.Timer()
    text_ids = text2ids.text2ids(text, FLAGS.seg_method_, feed_single=True)
    print('text_ids', text_ids)

    #seq_len = 50

    #print('words', words)
    argmax_encode = predictor.inference(
        ['text_importance'], feed_dict={'rnn/main/text:0': [text_ids]})
    print('argmax_encode', argmax_encode[0])

    argmax_encode = argmax_encode[0][0]

    text_ids = text2ids.text2ids(text,
                                 FLAGS.seg_method_,
                                 feed_single=True,
                                 append_start=True,
                                 append_end=True)
    words = text2ids.ids2words(text_ids)

    seq_len = 0
    for x in words:
        if x != 0:
            seq_len += 1
        else:
            break

    print(text_ids)

    # visualize model (matplotlib is imported here, but the plotting code
    # itself appears to have been elided from this excerpt)
    import matplotlib.pyplot as plt
    argmaxs = [np.sum((argmax_encode == k)) for k in range(seq_len)]
    print('argmaxs', argmaxs, np.sum(argmaxs), seq_len)
    x = range(len(argmax_encode))
    y = [100.0 * n / np.sum(argmaxs) for n in argmaxs]
    #print(words, y)
    print(text)
    for word, score in zip(words, y):
        print(word, score)
Example #6
    def predict(self,
                inputs,
                seg_method='basic',
                feed_single=True,
                max_words=None):
        if not isinstance(inputs, (list, tuple, np.ndarray)):
            inputs = [inputs]
        if isinstance(inputs[0][0], str):
            word_ids = [
                text2ids.text2ids(input_text,
                                  seg_method=seg_method,
                                  feed_single=feed_single,
                                  max_words=max_words) for input_text in inputs
            ]
        else:
            word_ids = inputs

        return self.predictor.predict(word_ids), word_ids
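A hedged usage sketch for this method; the enclosing class, its constructor, and its predictor attribute are not shown in this excerpt, so the object below is hypothetical:

# Hypothetical caller; 'model' stands in for an instance of the unshown class.
scores, word_ids = model.predict(['雅诗兰黛小棕瓶'],
                                 seg_method='basic',
                                 feed_single=True,
                                 max_words=50)
print(scores[0])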
Example #7
END_WORD = '</S>'
NUM_WORD = '<NUM>'

print('seg_method:', FLAGS.seg_method_, file=sys.stderr)

num = 0
for line in sys.stdin:
  if num % 10000 == 0:
    print(num, file=sys.stderr)
  l = line.rstrip().split('\t')
  
  texts = l[1].split('\x01')
  
  for text in texts:
    ids = text2ids.text2ids(text, 
        seg_method=FLAGS.seg_method_,
        feed_single=FLAGS.feed_single_, 
        allow_all_zero=True, 
        pad=False, 
        append_start=True,
        append_end=True,
        to_lower=True,
        norm_digit=True)
    if num % 10000 == 0:
      print(ids, file=sys.stderr)
      print(text2ids.ids2text(ids), file=sys.stderr)
    ids = list(map(str, ids))  # map() alone is a lazy, always-truthy iterator in Python 3
    if ids:
      print('\t'.join(ids))
  num += 1
Example #8
dir = '/home/gezi/new/temp/makeup/title2name/tfrecord/seq-basic/'

text2ids.init(os.path.join(dir, 'vocab.txt'))
vocab = text2ids.vocab

embsim = melt.EmbeddingSim(os.path.join(dir, 'word2vec'), name='w_in')

corpus_pattern = os.path.join('/home/gezi/data/product/makeup/tb/title2name/valid/*')

max_words = 50
#itexts = ['雅诗兰黛水润霜', '雅诗兰黛小棕瓶', '雅诗兰黛红石榴', '婷美矿物泉补水精华', 'Adidas阿迪达斯男士香水男士古龙淡香水 冰点男香100ml【京东超市】']

itexts = ['雅诗兰黛anr修护肌透精华露']

left_ids = [text2ids.text2ids(x, seg_method='basic', feed_single=True, max_words=max_words) for x in itexts]


lids_ = tf.placeholder(dtype=tf.int32, shape=[None, max_words]) 
rids_ = tf.placeholder(dtype=tf.int32, shape=[None, max_words]) 
nids_ = embsim.top_sim(lids_, rids_)
sess = embsim._sess 

corpus_text = []
for file in glob.glob(corpus_pattern):
  corpus_text += open(file).readlines()
corpus_text = [x.strip() for x in corpus_text]

r_text = [x.split('\t')[1] for x in corpus_text]
r_text = list(set(r_text))
right_ids = [text2ids.text2ids(x, seg_method='basic', feed_single=True, max_words=max_words) for x in r_text] 
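Presumably the final step is to run the similarity op through the session obtained above; a sketch of that call, with the exact shape and meaning of top_sim's output left as an assumption (melt.EmbeddingSim is not shown here):

# Assumption: nids_ yields similarity results for the left/right id batches.
scores = sess.run(nids_, feed_dict={lids_: left_ids, rids_: right_ids})
print(scores)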
Example #9
  image_file = raw_input('image_file like 6275b5349168ac3fab6a493c509301d023cf39d3.jpg:')
  if image_file == 'q':
    break

  image_path = os.path.join(image_dir, image_file)
  print('image_path:', image_path)

  if not os.path.exists(image_path):
    print('image path not found!')
    continue

  try:
    hits = img2text[image_file]
    texts = [text_strs[hit] for hit in hits]
    for text in texts:
      word_ids = text2ids.text2ids(text)
      seg_text = text2ids.ids2text(word_ids, print_end=False)
      print('label:', text, seg_text)
      words_importance = sim_predictor.words_importance([word_ids])
      words_importance = words_importance[0]
      print('word importance:')
      for i in range(len(word_ids)):
        if word_ids[i] == 0:
          break 
        print(vocab.key(int(word_ids[i])), words_importance[i], end='|')  
      print()
  except Exception:
    print(traceback.format_exc(), file=sys.stderr)    
    pass

  image = melt.read_image(image_path)
Example #10
from libword_counter import Vocabulary

from deepiu.util import text2ids

dir = '/home/gezi/new/temp/makeup/title2name/tfrecord/seq-basic/'

text2ids.init(os.path.join(dir, 'vocab.txt'))
vocab = text2ids.vocab

embsim = melt.EmbeddingSim(os.path.join(dir, 'word2vec'), name='w_in')

corpus_file = os.path.join('/home/gezi/data/product/makeup/tb/title2name/valid/name.filtered.rand.valid.txt_0')

max_words = 50
itext = '雅诗兰黛水润霜'
left_ids = text2ids.text2ids(itext, seg_method='basic', feed_single=True, max_words=max_words)


corpus_text = open(corpus_file).readlines()

corpus_text = [x.split()[0] for x in corpus_text]

right_ids = [text2ids.text2ids(x, seg_method='basic', feed_single=True, max_words=max_words) for x in corpus_text[:1000]]

print(right_ids)


lids_ = tf.placeholder(dtype=tf.int32, shape=[None, max_words]) 
rids_ = tf.placeholder(dtype=tf.int32, shape=[None, max_words]) 
nids_ = embsim.top_sim(lids_, rids_)
sess = embsim._sess 
Example #11
vocab = text2ids.vocab

embsim = melt.EmbeddingSim(os.path.join(dir, 'word2vec'), name='w_in')

corpus_pattern = os.path.join(
    '/home/gezi/data/product/makeup/tb/title2name/valid/*')

max_words = 50
itexts = [
    '雅诗兰黛水润霜', '雅诗兰黛小棕瓶', '雅诗兰黛红石榴', '婷美矿物泉补水精华',
    'Adidas阿迪达斯男士香水男士古龙淡香水 冰点男香100ml【京东超市】'
]

left_ids = [
    text2ids.text2ids(x,
                      seg_method='basic',
                      feed_single=True,
                      max_words=max_words) for x in itexts
]

lids_ = tf.placeholder(dtype=tf.int32, shape=[None, max_words])
rids_ = tf.placeholder(dtype=tf.int32, shape=[None, max_words])
nids_ = embsim.top_sim(lids_, rids_)
sess = embsim._sess

corpus_text = []
for file in glob.glob(corpus_pattern):
    corpus_text += open(file).readlines()
corpus_text = [x.strip() for x in corpus_text]

r_text = [x.split('\t')[1] for x in corpus_text]
r_text = list(set(r_text))
Example #12
def deal_file(file, thread_index):
    if FLAGS.threads > 1:
        out_file = '{}/{}_{}'.format(FLAGS.output_directory, FLAGS.name,
                                     thread_index)
    else:
        out_file = '{}/{}'.format(FLAGS.output_directory, FLAGS.name)
    print('out_file:', out_file)
    with melt.tfrecords.Writer(out_file) as writer:
        num = 0
        for line in open(file):
            if num % 1000 == 0:
                print(num)
            l = line.rstrip().split('\t')
            img = l[0]
            img_end = IMAGE_FEATURE_LEN + 1
            img_feature = [float(x) for x in l[1:img_end]]
            texts = [x.split('\x01')[0] for x in l[img_end:]]
            for text in texts:
                if text.strip() == '':
                    continue
                #@TODO the text -> ids conversion should move out so online code can share it for evaluation or for feeding via feed_dict
                #words = segmentor.Segment(text, FLAGS.seg_method)
                #word_ids = [vocabulary.id(word) for word in words if vocabulary.has(word) or ENCODE_UNK]
                word_ids = text2ids.text2ids(text,
                                             seg_method=FLAGS.seg_method,
                                             feed_single=FLAGS.feed_single,
                                             allow_all_zero=True,
                                             pad=False)
                word_ids_length = len(word_ids)
                if num % 1000 == 0:
                    print(text,
                          word_ids,
                          text2ids.ids2text(word_ids),
                          file=sys.stderr)
                if len(word_ids) == 0:
                    continue
                word_ids = word_ids[:TEXT_MAX_WORDS]
                if FLAGS.pad:
                    word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)

                if FLAGS.np_save:
                    gtexts[thread_index].append(word_ids)
                    gtext_strs[thread_index].append(text)

                #add pos info? weight info? or @TODO add click num info
                example = tf.train.Example(features=tf.train.Features(
                    feature={
                        'image_name': melt.bytes_feature(img),
                        'image_feature': melt.float_feature(img_feature),
                        'text': melt.int_feature(word_ids),
                        'text_str': melt.bytes_feature(text),
                    }))
                writer.write(example)

                global counter, max_num_words, sum_words
                with counter.get_lock():
                    counter.value += 1
                # check and update under the lock to avoid a cross-thread race
                with max_num_words.get_lock():
                    if word_ids_length > max_num_words.value:
                        max_num_words.value = word_ids_length
                with sum_words.get_lock():
                    sum_words.value += word_ids_length
            num += 1

    texts_dict[thread_index] = gtexts[thread_index]
    text_strs_dict[thread_index] = gtext_strs[thread_index]
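For reference, records written by deal_file could be parsed back with the standard TF1 API, mirroring the four feature keys above. This reader sketch assumes melt.tfrecords.Writer wraps a plain TFRecordWriter and that serialized_example comes from a record reader; both are assumptions, since melt itself is not shown:

import tensorflow as tf

# Feature spec mirrors the keys written above; 'text' is variable-length
# because padding is optional (FLAGS.pad).
features = tf.parse_single_example(
    serialized_example,
    features={
        'image_name': tf.FixedLenFeature([], tf.string),
        'image_feature': tf.FixedLenFeature([IMAGE_FEATURE_LEN], tf.float32),
        'text': tf.VarLenFeature(tf.int64),
        'text_str': tf.FixedLenFeature([], tf.string),
    })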