Example 1
def predict(predictor, input_text):
  word_ids = _text2ids(input_text, INPUT_TEXT_MAX_WORDS)
  print('word_ids', word_ids, 'len:', len(word_ids))
  print(text2ids.ids2text(word_ids))

  #tf.while_loop has a debug problem: ValueError: Causality violated in timing relations of debug dumps: seq2seq/main/decode_4/dynamic_rnn_decoder/rnn/while/Merge_7 (1489649052260629): these input(s) are not satisfied: [(u'seq2seq/main/decode_4/dynamic_rnn_decoder/rnn/while/Enter_7', 0), (u'seq2seq/main/decode_4/dynamic_rnn_decoder/rnn/while/NextIteration_7', 0)
  #https://github.com/tensorflow/tensorflow/issues/8337 From your error message, it appears that you are using tf.while_loop. Can you try setting its parallel_iterations parameter to 1 and see if the error still happens?
  #There may be a bug in how tfdbg handles while_loops with parallel_iterations > 1.
  #I think it might be a GPU thing.
  #The example below errors if run as python tf_8337_minimal.py but is fine if run with CUDA_VISIBLE_DEVICES=-1.
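  #fetch the single-best decode ('text', 'text_score') for this input; the beam search variant is run below for comparison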
  timer = gezi.Timer()
  text, score = predictor.inference(['text', 'text_score'], 
                                    feed_dict= {
                                      'seq2seq/model_init_1/input_text:0': [word_ids]
                                      })
  
  for result in text:
    print(result, text2ids.ids2text(result), 'decode time(ms):', timer.elapsed_ms())
  
  timer = gezi.Timer()
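  #beam search decode: fetch the candidate texts and scores; results are batched, so only the first (and only) batch entry is used below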
  texts, scores = predictor.inference(['beam_text', 'beam_text_score'], 
                                    feed_dict= {
                                      'seq2seq/model_init_1/input_text:0': [word_ids]
                                      })

  texts = texts[0]
  scores = scores[0]
  for text, score in zip(texts, scores):
    print(text, text2ids.ids2text(text), score)

  print('beam_search using time(ms):', timer.elapsed_ms())
Example 2
def predict(predictor, image_path):
    timer = gezi.Timer()
    image_feature = image_model.process_one_image(image_path)
    text, score = predictor.inference(
        ['text', 'text_score'],
        feed_dict={
            'show_and_tell/model_init_1/image_feature:0': image_feature
        })

    for result in text:
        print(result, text2ids.ids2text(result), 'decode time(ms):',
              timer.elapsed_ms())

    timer = gezi.Timer()
    texts, scores = predictor.inference(
        ['beam_text', 'beam_text_score'],
        feed_dict={
            'show_and_tell/model_init_1/image_feature:0': image_feature
        })

    texts = texts[0]
    scores = scores[0]
    for text, score in zip(texts, scores):
        print(text, text2ids.ids2text(text), score)

    print('beam_search using time(ms):', timer.elapsed_ms())
Example 3
def read_once(sess, step, ops, neg_ops=None):
  global max_index
  if not hasattr(read_once, "timer"):
    read_once.timer = Timer()

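  #run the reader ops once to pull a single batch of parsed fields from the input pipeline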
  image_name, image_feature, text, text_str, input_text, input_text_str = sess.run(ops)

  
  if step % 100 == 0:
    print('step:', step)
    print('duration:', read_once.timer.elapsed())
    print('image_name:', image_name[0])
    print('text:', text[0])
    print('len(text)', len(text[0]))
    print(text2ids.ids2text(text[0]))
    print('text_str:', text_str[0])
    print('len(text_str):', len(text_str[0]))
    print('input_text:', input_text[0])
    print('len(input_text)', len(input_text[0]))
    print(text2ids.ids2text(input_text[0]))
    print('input_text_str:', input_text_str[0])
    print('len(input_text_str):', len(input_text_str[0]))
    
    

  cur_max_index = np.max(text)
  if cur_max_index > max_index:
    max_index = cur_max_index
Example 4
def _gen_rl_feed_dict():
    global counter
    counter += 1

    image_names, sampled_captions, greedy_captions = sess.run([
        gtrain_image_name, trainer.rl.sampled_caption,
        trainer.rl.greedy_caption
    ])
    #notice: suggest not using tokenize, especially for cn; it is not needed and will print logs
    rewards, baseline = reinforcement_learning.calc_score(
        sampled_captions,
        greedy_captions,
        image_names,
        tokenize=FLAGS.use_tokenize)
    if counter % 100 == 0:
        logging.info('label__caption: {}'.format('|'.join(
            evaluator.refs[image_names[0]])))
        logging.info('sample_caption: {}'.format(
            text2ids.ids2text(sampled_captions[0])))
        logging.info('greedy_caption: {}'.format(
            text2ids.ids2text(greedy_captions[0])))
        logging.info('rewards: {} baseline: {}'.format(
            rewards[0], baseline[0]))
    feed_dict = {
        trainer.rl.rewards_feed: rewards,
        trainer.rl.baseline_feed: baseline
    }
    return feed_dict
Example 5
def print_img_text_negscore(img, i, text, score, text_ids, neg_text=None, neg_score=None, neg_text_ids=None):
  print_img(img, i)
  text_words = ids2text(text_ids)
  if neg_text is not None:
    neg_text_words = ids2text(neg_text_ids)
  logging.info(content_html.format('pos   :[ {} ] {:.6f} {}'.format(text, score, text_words)))
  if neg_text is not None:
    logging.info(content_html.format('neg  :[ {} ] {:.6f} {}'.format(neg_text, neg_score, neg_text_words)))  
Example 6
def predict(predictor, input_text):
  word_ids = _text2ids(input_text, INPUT_TEXT_MAX_WORDS)
  print('word_ids', word_ids, 'len:', len(word_ids))
  print(text2ids.ids2text(word_ids))

  timer = gezi.Timer()
  initial_state, ids, logprobs = predictor.inference([
                                        'beam_search_initial_state', 
                                        'beam_search_initial_ids', 
                                        'beam_search_initial_logprobs'
                                        ], 
                                        feed_dict= {
                                          tf.get_collection('input_text_feed')[0] : [word_ids]
                                        })

  print('initial_state_shape', np.shape(initial_state))
  #[1, beam_size]
  ids = ids[0]
  logprobs = logprobs[0]

  print(ids, text2ids.ids2text(ids))
  print('logprob', logprobs)
  print('prob', [math.exp(x) for x in logprobs])
  print('initial_state', initial_state[0])

  print('first step using time(ms):', timer.elapsed_ms())

  timer = gezi.Timer()

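  #second decode step: feed the first-step candidate ids, replicating the initial state once per candidate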
  input_feed = np.array(ids)
  state_feed = np.array([initial_state[0]] * len(ids))
  print('input_feed_shape', np.shape(input_feed))
  print('state_feed_shape', np.shape(state_feed))
  #state_feed = np.array(initial_state)

  state, ids, logprobs = predictor.inference([
                                        'beam_search_state', 
                                        'beam_search_ids', 
                                        'beam_search_logprobs'
                                        ], 
                                        feed_dict= {
                                          tf.get_collection('beam_search_input_feed')[0] : input_feed,
                                          tf.get_collection('beam_search_state_feed')[0] : state_feed
                                        })

  #print(state)
  print(ids)
  print(logprobs)

  ids = ids[0]
  logprobs = logprobs[0]

  print(ids, text2ids.ids2text(ids))
  print('logprob', logprobs)
  print('prob', [math.exp(x) for x in logprobs])
  print('state', state[0])

  print('second step using time(ms):', timer.elapsed_ms())
Example 7
def print_generated_text_score(generated_text, score, id=-1, name='gen'):
    if id >= 0:
        logging.info(
            content_html.format('{}_{}:[ {} ] {:.6f}'.format(
                name, id, ids2text(generated_text), score)))
    else:
        logging.info(
            content_html.format('{}:[ {} ] {:.6f}'.format(
                name, ids2text(generated_text), score)))
Example 8
def print_img_text_generatedtext(img, i, input_text, input_text_ids, 
                                 text, score, text_ids,
                                 generated_text, generated_text_beam=None):
  print_img(img, i)
  score = math.exp(-score)
  input_text_words = ids2text(input_text_ids)
  text_words = ids2text(text_ids)
  logging.info(content_html.format('in_ [ {} ] {}'.format(input_text, input_text_words)))
  logging.info(content_html.format('pos [ {} ] {:.6f} {}'.format(text, score, text_words)))
  print_generated_text(generated_text)
  if generated_text_beam is not None:
    print_generated_text(generated_text_beam)
Example 9
def predict(predictor, input_text):
  word_ids = _text2ids(input_text, INPUT_TEXT_MAX_WORDS)
  print('word_ids', word_ids, 'len:', len(word_ids))
  print(text2ids.ids2text(word_ids))

  timer = gezi.Timer()
  
  #print(tf.get_collection('beam_search_initial_alignments'))
  #print(tf.get_collection('beam_search_alignments'))
  init_states = predictor.inference([
                                        'beam_search_beam_size',
                                        'beam_search_initial_state', 
                                        'beam_search_initial_ids', 
                                        'beam_search_initial_logprobs',
                                        'beam_search_initial_alignments' 
                                        ], 
                                        feed_dict= {
                                          tf.get_collection('input_text_feed')[0] : [word_ids]
                                        })

  step_func = lambda input_feed, state_feed : predictor.inference([
                                        'beam_search_state', 
                                        'beam_search_ids', 
                                        'beam_search_logprobs',
                                        #'attention_alignments'
                                        'beam_search_alignments', #must use this
                                        ], 
                                        feed_dict= {
                                          #TODO... attention still needs the input_text feed, see rnn_decoder.py beam_search_step
                                          #but it does not hurt performance much because the encoder is fast? Is it possible to avoid this?
                                          #anyway, with no attention the input_text_feed is not needed
                                          tf.get_collection('input_text_feed')[0] : [word_ids],
                                          tf.get_collection('beam_search_input_feed')[0] : input_feed,
                                          tf.get_collection('beam_search_state_feed')[0] : state_feed
                                        })

  max_words = FLAGS.decode_max_words if FLAGS.decode_max_words else TEXT_MAX_WORDS
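  #beam search is driven from Python here: init_states seeds the beams and step_func is called to advance every beam one step at a time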
  beams = melt.seq2seq.beam_search(init_states, 
                                   step_func, 
                                   end_id=text2ids.end_id(), 
                                   max_words=max_words, 
                                   length_normalization_factor=0.)

  for i, beam in enumerate(beams):
    print(i, beam.words, text2ids.ids2text(beam.words), math.exp(beam.logprob), beam.logprob, beam.score, beam.logprobs)
    print(beam.alignments_list)

  print('beam search using time(ms):', timer.elapsed_ms())
Example 10
def read_once(sess, step, ops, neg_ops=None):
    global max_index
    if not hasattr(read_once, "timer"):
        read_once.timer = Timer()

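    #when negative ops are given, append them (plus a squeezed neg_text_str) to ops so everything is fetched in a single sess.run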
    if neg_ops is None:
        image_name, image_feature, text, text_str = sess.run(ops)
    else:
        squeezed_neg_text_str = tf.squeeze(neg_ops[1])
        neg_ops += [squeezed_neg_text_str]
        ops = list(ops)
        ops.extend(neg_ops)

        image_name, image_feature, text, text_str, neg_text, neg_text_str, neg_text_str_squeeze = sess.run(
            ops)

    if step % 100 == 0:
        print('step:', step)
        print('duration:', read_once.timer.elapsed())
        print('image_name:', image_name[0])
        print('text:', text[0])
        print('len(text)', len(text[0]))
        print(text2ids.ids2text(text[0]))
        print('text_str:', text_str[0])
        print('len(text_str):', len(text_str[0]))

    cur_max_index = np.max(text)
    if cur_max_index > max_index:
        max_index = cur_max_index
Example 11
def _deal_debug_results(results):
    if FLAGS.use_weights or FLAGS.use_idf_weights:
        # NOTICE: needs FLAGS.train_loss_per_example = True, so losses are not flattened
        print('targets:', text2ids.ids2text(results[-2][0]))
        print('mask:', results[-1][0])
    print('results:', [x for x in results if len(x.shape) == 0])
    print('shapes:', [x.shape for x in results])
Example 12
def predict(predictor, input_text, text):
  input_word_ids = _text2ids(input_text, INPUT_TEXT_MAX_WORDS)
  print('input_word_ids', input_word_ids, 'len:', len(input_word_ids))
  print(text2ids.ids2text(input_word_ids))
  word_ids = _text2ids(text, INPUT_TEXT_MAX_WORDS)
  print('word_ids', word_ids, 'len:', len(word_ids))
  print(text2ids.ids2text(word_ids))

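  #the three inference calls below feed the same input/target word ids and fetch different scoring ops ('score', 'exact_score', 'exact_prob' with 'seq2seq_logprobs') for comparison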
  timer = gezi.Timer()
  score = predictor.inference(['score'], 
                              feed_dict= {
                                      FLAGS.input_text_name: [input_word_ids],
                                      FLAGS.text_name: [word_ids]
                                      })
  
  print('score:', score)
  print('calc score time(ms):', timer.elapsed_ms())

  timer = gezi.Timer()
  exact_score = predictor.inference(['exact_score'], 
                                    feed_dict= {
                                      FLAGS.input_text_name: [input_word_ids],
                                      FLAGS.text_name: [word_ids]
                                      })
  
  print('exact_score:', exact_score)
  print('calc score time(ms):', timer.elapsed_ms())

  timer = gezi.Timer()
  exact_prob, logprobs = predictor.inference(['exact_prob', 'seq2seq_logprobs'], 
                                    feed_dict= {
                                      FLAGS.input_text_name: [input_word_ids],
                                      FLAGS.text_name: [word_ids]
                                      })
  
  exact_prob = exact_prob[0]
  logprobs = logprobs[0]
  print('exact_prob:', exact_prob, 'exact_logprob:', math.log(exact_prob))
  print('logprobs:', logprobs)
  print('sum_logprobs:', gezi.gen_sum_list(logprobs))
  print('calc prob time(ms):', timer.elapsed_ms())
Example 13
def print_img_text_generatedtext_score(img,
                                       i,
                                       input_text,
                                       input_text_ids,
                                       text,
                                       score,
                                       text_ids,
                                       generated_text,
                                       generated_text_score,
                                       generated_text_beam=None,
                                       generated_text_score_beam=None):
    print_img(img, i)
    score = math.exp(-score)
    input_text_words = ids2text(input_text_ids)
    text_words = ids2text(text_ids)
    logging.info(
        content_html.format('in_ [ {} ] {}'.format(input_text,
                                                   input_text_words)))
    logging.info(
        content_html.format('pos [ {} ] {:.6f} {}'.format(
            text, score, text_words)))

    try:
        print_generated_text_score(generated_text, generated_text_score)
    except Exception:
        for i, text in enumerate(generated_text):
            print_generated_text_score(text,
                                       generated_text_score[i],
                                       name='gen__max',
                                       id=i)

    if generated_text_beam is not None:
        try:
            print_generated_text_score(generated_text_beam,
                                       generated_text_score_beam)
        except Exception:
            for i, text in enumerate(generated_text_beam):
                print_generated_text_score(text,
                                           generated_text_score_beam[i],
                                           name='gen_beam',
                                           id=i)
Example 14
def main(_):
  text2ids.init()
  global_scope = ''
  if FLAGS.add_global_scope:
    global_scope = FLAGS.global_scope if FLAGS.global_scope else FLAGS.algo
 
  global sess
  sess = melt.get_session(log_device_placement=FLAGS.log_device_placement)
  with tf.variable_scope(global_scope):
    predictor =  algos_factory.gen_predictor(FLAGS.algo)
    with tf.variable_scope(FLAGS.main_scope) as scope:
      text, score, beam_text, beam_score = gen_predict_graph(predictor, scope)

  predictor.load(FLAGS.model_dir) 
  #input_text = "王凯整容了吗_王凯整容前后对比照片"
  input_texts = ['包邮买二送一性感女内裤低腰诱惑透视蕾丝露臀大蝴蝶三角内裤女夏-淘宝网',
                 '宝宝太胖怎么办呢',
                 '大棚辣椒果实变小怎么办,大棚辣椒果实变小防治措施',
                 '蛋龟缸,目前4虎纹1剃刀']

  for input_text in input_texts:
    word_ids = _text2ids(input_text, INPUT_TEXT_MAX_WORDS)

    print(word_ids)
    print(text2ids.ids2text(word_ids))

    timer = gezi.Timer()
    text_, score_ = sess.run([text, score], {predictor.input_text_place : [word_ids]})
    print(text_[0], text2ids.ids2text(text_[0]), score_[0], 'time(ms):', timer.elapsed_ms())

    timer = gezi.Timer()
    texts, scores = sess.run([beam_text, beam_score], {predictor.input_text_place : [word_ids]})

    texts = texts[0]
    scores = scores[0]
    for text_, score_ in zip(texts, scores):
      print(text_, text2ids.ids2text(text_), score_)

    print('beam_search using time(ms):', timer.elapsed_ms())
Example 15
def predicts(predictor, input_texts):
  word_ids_list = [_text2ids(input_text, INPUT_TEXT_MAX_WORDS) for input_text in input_texts]
  timer = gezi.Timer()
  texts_list, scores_list = predictor.inference(['beam_text', 'beam_text_score'], 
                                    feed_dict= {
                                      tf.get_collection('lfeed')[-1]: word_ids_list
                                      })

  for texts, scores in zip(texts_list, scores_list):
    for text, score in zip(texts, scores):
      print(text, text2ids.ids2text(text), score, math.log(score))

  print('beam_search using time(ms):', timer.elapsed_ms())
Example 16
def deal_file(file):
  out_file = '{}/{}'.format(FLAGS.output_directory, '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
  print('out_file:', out_file)
  with melt.tfrecords.Writer(out_file) as writer:
    num = 0
    for line in open(file):
      if num % 1000 == 0:
        print(num)
      
      l = line.rstrip('\n').split('\t')

      text = l[FLAGS.text_index]

      input_text = l[FLAGS.input_text_index]
      
      input_words = text2ids.Segmentor.Segment(input_text, FLAGS.seg_method)
      input_word_ids = text2ids.words2ids(input_words, feed_single=FLAGS.feed_single, allow_all_zero=True, pad=False)
      if len(input_word_ids) == 0:
        continue
      input_word_ids = input_word_ids[:INPUT_TEXT_MAX_WORDS]
      if FLAGS.pad:
        input_word_ids = gezi.pad(input_word_ids)

      words = text2ids.Segmentor.Segment(text, FLAGS.seg_method)
      word_ids = text2ids.words2ids(words, feed_single=FLAGS.feed_single, allow_all_zero=True, pad=False)
      word_ids_length = len(word_ids)
      if num % 1000 == 0:
        print(text, word_ids, text2ids.ids2text(word_ids), file=sys.stderr)
      if word_ids_length == 0:
        continue 
      if is_luanma(words, word_ids):
        print('luanma', text, word_ids, text2ids.ids2text(word_ids), file=sys.stderr)
        continue 
                  
      word_ids = word_ids[:TEXT_MAX_WORDS]
      if FLAGS.pad:
        word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
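      #write either a flat Example or a SequenceExample, depending on FLAGS.write_sequence_example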
      if not FLAGS.write_sequence_example:
        example = tf.train.Example(features=tf.train.Features(feature={
          'input_text_str': melt.bytes_feature(input_text),
          'input_text': melt.int64_feature(input_word_ids),
          'text_str': melt.bytes_feature(text),
          'text': melt.int64_feature(word_ids),
          }))
      else:
        example = tf.train.SequenceExample(
              context=melt.features(
              {
                'input_text_str': melt.bytes_feature(input_text),
                'text_str': melt.bytes_feature(text),
              }),
              feature_lists=melt.feature_lists(
              { 
                'input_text': melt.int64_feature_list(input_word_ids),
                'text': melt.int64_feature_list(word_ids)
              }))
      writer.write(example)
      
        #global counter, max_num_words, sum_words
      with record_counter.get_lock():
        record_counter.value += 1
      if word_ids_length > max_num_words.value:
        with max_num_words.get_lock():
          max_num_words.value = word_ids_length
      with sum_words.get_lock():
        sum_words.value += word_ids_length
      
      if FLAGS.np_save:
        assert FLAGS.threads == 1
        gtexts.append(word_ids)
        gtext_strs.append(text)
        
      num += 1   
      if num == FLAGS.num_max_records:
        break
Example 17
def train_process(trainer, predictor=None):
  input_app = InputApp.InputApp()
  input_results = input_app.gen_input()

  with tf.variable_scope(FLAGS.main_scope) as scope:
    ops, gen_feed_dict, deal_results = gen_train(
      input_app, 
      input_results, 
      trainer)
    scope.reuse_variables()

    if predictor is not None and FLAGS.gen_predict:
      beam_text, beam_text_score = gen_predict_graph(predictor)

    eval_ops, gen_eval_feed_dict, deal_eval_results = gen_validate(
      input_app, 
      input_results, 
      trainer, 
      predictor)

    metric_eval_fn = None
    if FLAGS.metric_eval:
      #generative models could do this too, but it is slow, so just ignore it
      if not algos_factory.is_generative(FLAGS.algo): 
        metric_eval_fn = lambda: evaluator.evaluate_scores(predictor, random=True)

  if FLAGS.mode == 'train':
    melt.print_global_varaiables()
    melt.apps.train_flow(ops, 
                         gen_feed_dict_fn=gen_feed_dict,
                         deal_results_fn=deal_results,
                         eval_ops=eval_ops,
                         gen_eval_feed_dict_fn=gen_eval_feed_dict,
                         deal_eval_results_fn=deal_eval_results,
                         optimizer=FLAGS.optimizer,
                         learning_rate=FLAGS.learning_rate,
                         num_steps_per_epoch=input_app.num_steps_per_epoch,
                         model_dir=FLAGS.model_dir,
                         metric_eval_fn=metric_eval_fn,
                         sess=sess)  #notice: if melt.constant is used in the predictor, sess must be passed
  else: #test predict
    predictor.load(FLAGS.model_dir)
    import conf  
    from conf import TEXT_MAX_WORDS, INPUT_TEXT_MAX_WORDS, NUM_RESERVED_IDS, ENCODE_UNK

    print('-------------------------', tf.get_collection('scores'))

    #TODO: now copy from prpare/gen-records.py
    def _text2ids(text, max_words):
      word_ids = text2ids.text2ids(text, 
                                   seg_method=FLAGS.seg_method, 
                                   feed_single=FLAGS.feed_single, 
                                   allow_all_zero=True, 
                                   pad=False)
      word_ids_length = len(word_ids)
      word_ids = word_ids[:max_words]
      word_ids = gezi.pad(word_ids, max_words, 0)
      return word_ids

    input_texts = [
                   #'包邮买二送一性感女内裤低腰诱惑透视蕾丝露臀大蝴蝶三角内裤女夏-淘宝网',
                   '大棚辣椒果实变小怎么办,大棚辣椒果实变小防治措施',
                   ]

    for input_text in input_texts:
      word_ids = _text2ids(input_text, INPUT_TEXT_MAX_WORDS)
      print('word_ids', word_ids, 'len:', len(word_ids))
      print(text2ids.ids2text(word_ids))
      #similar to inference.py; this only works for the no-attention mode TODO FIXME
      texts, scores = sess.run([tf.get_collection('text')[0], tf.get_collection('text_score')[0]], 
                             feed_dict={'seq2seq/model_init_1/input_text:0' : [word_ids]})
      print(texts[0], text2ids.ids2text(texts[0]), scores[0])

      texts, scores  = sess.run([beam_text, beam_text_score], 
                               feed_dict={predictor.input_text_feed: [word_ids]})

      texts = texts[0]
      scores = scores[0]
      for text, score in zip(texts, scores):
        print(text, text2ids.ids2text(text), score)
    
    input_texts = [
                   '大棚辣椒果实变小怎么办,大棚辣椒果实变小防治措施',
                   #'包邮买二送一性感女内裤低腰诱惑透视蕾丝露臀大蝴蝶三角内裤女夏-淘宝网',
                   "宝宝太胖怎么办呢",
                   '大棚辣椒果实变小怎么办,大棚辣椒果实变小防治措施',
                   #'大棚辣椒果实变小怎么办,大棚辣椒果实变小防治措施',
                   #'邹红建是阿拉斯加',
                   ]

    word_ids_list = [_text2ids(input_text, INPUT_TEXT_MAX_WORDS) for input_text in input_texts]
    timer = gezi.Timer()
    texts_list, scores_list = sess.run([beam_text, beam_text_score], 
                               feed_dict={predictor.input_text_feed: word_ids_list})
    
    for texts, scores in zip(texts_list, scores_list):
      for text, score in zip(texts, scores):
        print(text, text2ids.ids2text(text), score, math.log(score))

    print('beam_search using time(ms):', timer.elapsed_ms())
Example 18
print('seg_method:', FLAGS.seg_method_, file=sys.stderr)

out_id = open(FLAGS.out_id, 'w')
out_text = open(FLAGS.out_text, 'w')

num = 0
for line in sys.stdin:
  if num % 10000 == 0:
    print(num, file=sys.stderr)
  text = line.strip()
  ids = text2ids.text2ids(text, 
      seg_method=FLAGS.seg_method_,
      feed_single=FLAGS.feed_single_, 
      allow_all_zero=True, 
      pad=False, 
      append_start=True,
      append_end=True,
      to_lower=True,
      norm_digit=True)
  seg_text = text2ids.ids2text(ids)
  if num % 10000 == 0:
    print(ids, file=sys.stderr)
    print(seg_text, file=sys.stderr)
  if ids:
    out_id.write('\t'.join(map(str, ids)))
    out_id.write('\n')
    out_text.write(seg_text)
    out_text.write('\n')
  num += 1
Example 19
def deal_file(file):
    out_file = '{}/{}'.format(
        FLAGS.output_dir,
        '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
    print('out_file:', out_file)
    with melt.tfrecords.Writer(out_file) as writer:
        num = 0
        for line in open(file):
            #line = line.lower()
            if num % 1000 == 0:
                print(num)
            if FLAGS.max_lines and num >= FLAGS.max_lines:
                break
            l = line.rstrip().split('\t')

            if len(l) != 2:
                continue

            ltext, rtext_list = l

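            #each ltext may map to several rtexts separated by '\x01'; one record is written per (ltext, rtext) pair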
            for rtext in rtext_list.split('\x01'):
                lword_ids = _text2ids(ltext, TEXT_MAX_WORDS)
                rword_ids = _text2ids(rtext, TEXT_MAX_WORDS)

                if not lword_ids or not rword_ids:
                    continue

                if num % 1000 == 0:
                    print(ltext,
                          lword_ids,
                          text2ids.ids2text(lword_ids),
                          file=sys.stderr)
                    print(rtext,
                          rword_ids,
                          text2ids.ids2text(rword_ids),
                          file=sys.stderr)

                example = tf.train.Example(features=tf.train.Features(
                    feature={
                        'ltext_str': melt.bytes_feature(ltext),
                        'ltext': melt.int_feature(lword_ids),
                        'rtext_str': melt.bytes_feature(rtext),
                        'rtext': melt.int_feature(rword_ids),
                    }))
                writer.write(example)

                if FLAGS.np_save:
                    assert FLAGS.threads == 1
                    ltexts.append(lword_ids)
                    ltext_strs.append(ltext)
                    rtexts.append(rword_ids)
                    rtext_strs.append(rtext)

                global counter, max_num_words, sum_words
                with counter.get_lock():
                    counter.value += 1

                word_ids = lword_ids
                word_ids_length = len(word_ids)
                if word_ids_length > max_num_words.value:
                    with max_num_words.get_lock():
                        max_num_words.value = word_ids_length
                with sum_words.get_lock():
                    sum_words.value += word_ids_length
            num += 1
Example 20
END_WORD = '</S>'
NUM_WORD = '<NUM>'

print('seg_method:', FLAGS.seg_method_, file=sys.stderr)

num = 0
for line in sys.stdin:
  if num % 10000 == 0:
    print(num, file=sys.stderr)
  l = line.rstrip().split('\t')
  
  texts = l[1].split('\x01')
  
  for text in texts:
    ids = text2ids.text2ids(text, 
        seg_method=FLAGS.seg_method_,
        feed_single=FLAGS.feed_single_, 
        allow_all_zero=True, 
        pad=False, 
        append_start=True,
        append_end=True,
        to_lower=True,
        norm_digit=True)
    if num % 10000 == 0:
      print(ids, file=sys.stderr)
      print(text2ids.ids2text(ids), file=sys.stderr)
    ids = list(map(str, ids))
    if ids:
      print('\t'.join(ids))
  num += 1
Example 21
def main(_):
  text2ids.init()
  global_scope = ''
  if FLAGS.add_global_scope:
    global_scope = FLAGS.global_scope if FLAGS.global_scope else FLAGS.algo
 
  sess = melt.get_session(log_device_placement=FLAGS.log_device_placement)
  with tf.variable_scope(global_scope):
    predictor =  algos_factory.gen_predictor(FLAGS.algo)
    with tf.variable_scope(FLAGS.main_scope) as scope:
      ##--notice: if the commented lines below are not added, len(tf.get_collection('encode_state')) is 1; adding them makes it 2,
      ## even though init_predict_text(decode_method=SeqDecodeMethod.beam) will call generate_sequence_greedy
      #text, score = predictor.init_predict_text(decode_method=SeqDecodeMethod.greedy, 
      #                                          beam_size=FLAGS.beam_size,
      #                                          convert_unk=False)   
      #scope.reuse_variables()
      #score = predictor.init_predict(exact_loss=True)
      #score = predictor.init_predict(exact_prob=True)
      score = predictor.init_predict()
      scope.reuse_variables()
      beam_text, beam_score = predictor.init_predict_text(decode_method=SeqDecodeMethod.beam, 
                                                          beam_size=FLAGS.beam_size,
                                                          convert_unk=False)  

  predictor.load(FLAGS.model_dir, sess=sess) 

  for item in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES):
    print(item)
  #input_text = "王凯整容了吗_王凯整容前后对比照片"
  input_texts = [
                 #'包邮买二送一性感女内裤低腰诱惑透视蕾丝露臀大蝴蝶三角内裤女夏-淘宝网',
                 #'大棚辣椒果实变小怎么办,大棚辣椒果实变小防治措施',
                 #'宝宝太胖怎么办呢',
                 #'蛋龟缸,目前4虎纹1剃刀',
                 #'大棚辣椒果实变小怎么办,大棚辣椒果实变小防治措施',
                 #'2015羊年中国风年会晚会签到板设计',
                 '完美 玛丽艳脱角质霜'
                 ]

  for input_text in input_texts:
    word_ids = _text2ids(input_text, INPUT_TEXT_MAX_WORDS)

    print(word_ids)
    print(text2ids.ids2text(word_ids))

    #timer = gezi.Timer()
    #text_, score_ = sess.run([text, score], {predictor.input_text_feed : [word_ids]})
    #print(text_[0], text2ids.ids2text(text_[0]), score_[0], 'time(ms):', timer.elapsed_ms())

    timer = gezi.Timer()
    #texts, scores, preids, paids, seqlens = sess.run([beam_text, beam_score, 
    #                         tf.get_collection('preids')[-1], 
    #                         tf.get_collection('paids')[-1],
    #                         tf.get_collection('seqlens')[-1]],
    #                                        {predictor.input_text_feed : [word_ids]})

    #print(preids)
    #print(paids)
    #print(seqlens)

    score_ = sess.run(score, {predictor.input_text_feed: [word_ids], predictor.text_feed: [word_ids]})
    print(score_)

    texts, scores = sess.run([beam_text, beam_score],
                                            {predictor.input_text_feed : [word_ids]})

    texts = texts[0]
    scores = scores[0]
    for text_, score_ in zip(texts, scores):
      print(text_, text2ids.ids2text(text_), score_, math.log(score_))

    print('beam_search using time(ms):', timer.elapsed_ms())
Example 22
def deal_file(file):
    out_file = '{}/{}'.format(
        FLAGS.output_directory,
        '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
    print('out_file:', out_file)
    with melt.tfrecords.Writer(out_file) as writer:
        num = 0
        for line in open(file):
            if num % 1000 == 0:
                print(num)

            l = line.rstrip('\n').split('\t')
            img = l[0]

            texts = l[FLAGS.text_index].split('\x01')

            image_feature = [
                float(x)
                for x in l[FLAGS.image_feature_index].strip().split('\x01')
            ]
            #assert len(image_feature) == IMAGE_FEATURE_LEN, '%s %d'%(img, len(image_feature))
            if len(image_feature) != IMAGE_FEATURE_LEN:
                print('bad line:', line)
                continue

            input_texts = l[FLAGS.input_text_index].split('\x01')
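            #one record is written for every valid (input_text, text) pair of this image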
            for input_text in input_texts:
                input_words = text2ids.Segmentor.Segment(
                    input_text, FLAGS.seg_method)
                input_word_ids = text2ids.words2ids(
                    input_words,
                    feed_single=FLAGS.feed_single,
                    allow_all_zero=True,
                    pad=False)
                if len(input_word_ids) == 0:
                    continue

                input_word_ids = input_word_ids[:INPUT_TEXT_MAX_WORDS]
                if FLAGS.pad:
                    input_word_ids = gezi.pad(input_word_ids)

                is_top_text = True
                for text in texts:
                    if text.strip() == '':
                        continue

                    words = text2ids.Segmentor.Segment(text, FLAGS.seg_method)
                    word_ids = text2ids.words2ids(
                        words,
                        feed_single=FLAGS.feed_single,
                        allow_all_zero=True,
                        pad=False)
                    word_ids_length = len(word_ids)
                    if num % 1000 == 0:
                        print(img,
                              text,
                              word_ids,
                              text2ids.ids2text(word_ids),
                              len(image_feature),
                              file=sys.stderr)
                    if word_ids_length == 0:
                        continue
                    if is_luanma(words, word_ids):
                        print('luanma',
                              img,
                              text,
                              word_ids,
                              text2ids.ids2text(word_ids),
                              len(image_feature),
                              file=sys.stderr)
                        continue

                    word_ids = word_ids[:TEXT_MAX_WORDS]
                    if FLAGS.pad:
                        word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
                    if not FLAGS.write_sequence_example:
                        example = tf.train.Example(features=tf.train.Features(
                            feature={
                                'image_name': melt.bytes_feature(img),
                                'image_feature': melt.float_feature(
                                    image_feature),
                                'input_text_str': melt.bytes_feature(
                                    input_text),
                                'input_text': melt.int64_feature(
                                    input_word_ids),
                                'text_str': melt.bytes_feature(text),
                                'text': melt.int64_feature(word_ids),
                            }))
                    else:
                        example = tf.train.SequenceExample(
                            context=melt.features({
                                'image_name':
                                melt.bytes_feature(img),
                                'image_feature':
                                melt.float_feature(image_feature),
                                'input_text_str':
                                melt.bytes_feature(input_text),
                                'text_str':
                                melt.bytes_feature(text),
                            }),
                            feature_lists=melt.feature_lists({
                                'input_text':
                                melt.int64_feature_list(input_word_ids),
                                'text':
                                melt.int64_feature_list(word_ids)
                            }))
                    writer.write(example)

                #global counter, max_num_words, sum_words
                with record_counter.get_lock():
                    record_counter.value += 1
                if word_ids_length > max_num_words.value:
                    with max_num_words.get_lock():
                        max_num_words.value = word_ids_length
                with sum_words.get_lock():
                    sum_words.value += word_ids_length

                if FLAGS.np_save:
                    assert FLAGS.threads == 1
                    gtexts.append(word_ids)
                    gtext_strs.append(text)

                    if img not in image_labels:
                        image_labels[img] = set()
                    image_labels[img].add(text)

                if is_top_text:
                    is_top_text = False
                    with image_counter.get_lock():
                        image_counter.value += 1

                    if FLAGS.np_save:
                        if img not in image_labels:
                            image_labels[img] = set()

                        image_names.append(img)
                        image_features.append(image_feature)

                    if FLAGS.num_max_records > 0:
                        #for a fixed validation set, only take one click text for each image
                        break

            num += 1
            if num == FLAGS.num_max_records:
                break
Example 23
def deal_file(file, thread_index):
    out_file = '{}/{}_{}'.format(
        FLAGS.output_directory, FLAGS.name,
        thread_index) if FLAGS.threads > 1 else '{}/{}'.format(
            FLAGS.output_directory, FLAGS.name)
    print('out_file:', out_file)
    with melt.tfrecords.Writer(out_file) as writer:
        num = 0
        for line in open(file):
            line = line.lower()
            if num % 1000 == 0:
                print(num)
            if FLAGS.max_lines and num >= FLAGS.max_lines:
                break
            l = line.strip().split('\t')
            #@TODO the text -> ids conversion should be moved out so online code can share it for evaluation or feed_dict use
            #words = segmentor.Segment(text, FLAGS.seg_method)
            #word_ids = [vocabulary.id(word) for word in words if vocabulary.has(word) or ENCODE_UNK]

            #text is what to predict (the decoder target), which right now is clickquery
            #input text is what to predict from (the encoder input); here it may be ct0, title, or real_title

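            #NOTE: clickquery, ct0, title and real_title appear to be parsed from l earlier in the original file; that parsing is not shown in this snippet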
            if title.strip() == '':
                title = real_title

            if clickquery.startswith('http://'):
                clickquery = l[3]

            text = clickquery
            word_ids = _text2ids(text, TEXT_MAX_WORDS)

            if not word_ids:
                continue

            if FLAGS.np_save:
                gtexts[thread_index].append(word_ids)
                gtext_strs[thread_index].append(text)

            ct0_ids = _text2ids(ct0, INPUT_TEXT_MAX_WORDS)

            title_ids = _text2ids(title, INPUT_TEXT_MAX_WORDS)
            real_title_ids = _text2ids(real_title, INPUT_TEXT_MAX_WORDS)

            if len(ct0_ids) == 0:
                ct0_ids = real_title_ids
                ct0 = real_title

            if num % 1000 == 0:
                print(text,
                      word_ids,
                      text2ids.ids2text(word_ids),
                      file=sys.stderr)
                print(ct0,
                      ct0_ids,
                      text2ids.ids2text(ct0_ids),
                      file=sys.stderr)

            image = l[1]
            url = l[2]

            example = tf.train.Example(features=tf.train.Features(
                feature={
                    'image_name': melt.bytes_feature(image),
                    'url': melt.bytes_feature(url),
                    'text_str': melt.bytes_feature(text),
                    'ct0_str': melt.bytes_feature(ct0),
                    'title_str': melt.bytes_feature(title),
                    'real_title_str': melt.bytes_feature(real_title),
                    'text': melt.int_feature(word_ids),
                    'ct0': melt.int_feature(ct0_ids),
                    'title': melt.int_feature(title_ids),
                    'real_title': melt.int_feature(real_title_ids),
                }))
            writer.write(example)

            global counter, max_num_words, sum_words
            with counter.get_lock():
                counter.value += 1
            word_ids_length = len(word_ids)
            if word_ids_length > max_num_words.value:
                with max_num_words.get_lock():
                    max_num_words.value = word_ids_length
            with sum_words.get_lock():
                sum_words.value += word_ids_length
            num += 1

    texts_dict[thread_index] = gtexts[thread_index]
    text_strs_dict[thread_index] = gtext_strs[thread_index]
Example 24
def deal_file(file):
    out_file = '{}/{}'.format(
        FLAGS.output_directory,
        '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
    print('file:', file, 'out_file:', out_file, file=sys.stderr)
    with melt.tfrecords.Writer(out_file) as writer:
        num = 0
        for line in open(file):
            if num % 1000 == 0:
                print(num, file=sys.stderr)

            l = line.rstrip('\n').split('\t')
            img = l[0]

            texts = l[FLAGS.text_index].split('\x01')

            image_feature = [
                float(x)
                for x in l[FLAGS.image_feature_index].strip().split('\x01')
            ]
            #image_feature = [float(x) for x in l[FLAGS.image_feature_index].strip().split(' ')]
            #image_feature = [0.] * IMAGE_FEATURE_LEN
            assert len(image_feature) == IMAGE_FEATURE_LEN, '%s %d' % (
                img, len(image_feature))

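            #is_top_text marks the first kept caption of an image; it is used below to count each image once and store one feature per image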
            is_top_text = True
            for text in texts:
                text = normalize.norm(text)
                if text.strip() == '':
                    print('empty line', line, file=sys.stderr)
                    continue

                word_ids = _text2ids(text, TEXT_MAX_WORDS)
                word_ids_length = len(word_ids)
                if num % 10000 == 0:
                    print(img,
                          text,
                          word_ids,
                          text2ids.ids2text(word_ids),
                          len(image_feature),
                          file=sys.stderr)
                if len(word_ids) == 0:
                    print('empty word ids!', file=sys.stderr)
                    print(img,
                          text,
                          word_ids,
                          text2ids.ids2text(word_ids),
                          len(image_feature),
                          file=sys.stderr)
                    continue
                #if is_luanma(words, word_ids):
                #  print('luanma', img, text, word_ids, text2ids.ids2text(word_ids), len(image_feature), file=sys.stderr)
                #  continue

                word_ids = word_ids[:TEXT_MAX_WORDS]
                if FLAGS.pad:
                    word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
                if not FLAGS.write_sequence_example:
                    example = tf.train.Example(features=tf.train.Features(
                        feature={
                            'image_name': melt.bytes_feature(img),
                            'image_feature': melt.float_feature(image_feature),
                            'text_str': melt.bytes_feature(text),
                            'text': melt.int64_feature(word_ids),
                        }))
                else:
                    example = tf.train.SequenceExample(
                        context=melt.features({
                            'image_name':
                            melt.bytes_feature(img),
                            'image_feature':
                            melt.float_feature(image_feature),
                            'text_str':
                            melt.bytes_feature(text),
                        }),
                        feature_lists=melt.feature_lists(
                            {'text': melt.int64_feature_list(word_ids)}))
                writer.write(example)

                #global counter, max_num_words, sum_words
                with record_counter.get_lock():
                    record_counter.value += 1
                if word_ids_length > max_num_words.value:
                    with max_num_words.get_lock():
                        max_num_words.value = word_ids_length
                with sum_words.get_lock():
                    sum_words.value += word_ids_length

                if FLAGS.np_save:
                    assert FLAGS.threads == 1
                    gtexts.append(word_ids)
                    gtext_strs.append(text)

                    #Deprecated: image_labels is not used
                    if img not in image_labels:
                        image_labels[img] = set()
                    image_labels[img].add(text)

                if is_top_text:
                    is_top_text = False
                    with image_counter.get_lock():
                        image_counter.value += 1

                    if FLAGS.np_save:
                        if img not in image_labels:
                            image_labels[img] = set()

                        image_names.append(img)
                        if FLAGS.small_feature:
                            image_features.append(image_feature)
                        else:
                            #actually save pic path instead of image feature
                            image_features.append(
                                os.path.join(FLAGS.big_feature_image_dir,
                                             img.replace('/', '_')))

                    if FLAGS.num_max_records > 0:
                        #for a fixed validation set, only take one click text for each image
                        break

            num += 1
            if num == FLAGS.num_max_records:
                break
Example 25
def deal_imgtextfile(file):
    """
  since img text or encoded img both big.. say for 2w pic will be 18G, while for image feature (23820, 2048) will only be 373M
  this is not used much, only if you do not want to do metric evaluate(recall@1,... for images), and you do not want to 
  convert and store image binaries from imatext(preprocess)
  """
    out_file = '{}/{}'.format(
        FLAGS.output_directory,
        '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
    print('file:', file, 'out_file:', out_file, file=sys.stderr)
    assert len(pic_info_map) > 0
    with melt.tfrecords.Writer(out_file) as writer:
        num = 0
        for line in open(file):
            if num % 1000 == 0:
                print(num, file=sys.stderr)

            l = line.rstrip('\n').split('\t')
            img = l[0]

            if img not in pic_info_map:
                continue

            img_text = l[-1]
            encoded_image = urllib.unquote_plus(img_text)

            text_info = pic_info_map[img]
            texts = text_info.split('\x01')

            is_top_text = True
            for text in texts:
                text = normalize.norm(text)
                if text.strip() == '':
                    print('empty line', line, file=sys.stderr)
                    continue

                word_ids = _text2ids(text, TEXT_MAX_WORDS)
                word_ids_length = len(word_ids)
                if num % 10000 == 0:
                    print(img,
                          text,
                          word_ids,
                          text2ids.ids2text(word_ids),
                          file=sys.stderr)
                if len(word_ids) == 0:
                    print('empty word ids!', file=sys.stderr)
                    print(img,
                          text,
                          word_ids,
                          text2ids.ids2text(word_ids),
                          file=sys.stderr)
                    continue
                #if is_luanma(words, word_ids):
                #  print('luanma', img, text, word_ids, text2ids.ids2text(word_ids), len(image_feature), file=sys.stderr)
                #  continue

                word_ids = word_ids[:TEXT_MAX_WORDS]
                if FLAGS.pad:
                    word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
                if not FLAGS.write_sequence_example:
                    example = tf.train.Example(features=tf.train.Features(
                        feature={
                            'image_name': melt.bytes_feature(img),
                            'image_data': melt.bytes_feature(encoded_image),
                            'text_str': melt.bytes_feature(text),
                            'text': melt.int64_feature(word_ids),
                        }))
                else:
                    example = tf.train.SequenceExample(
                        context=melt.features({
                            'image_name':
                            melt.bytes_feature(img),
                            'image_data':
                            melt.bytes_feature(encoded_image),
                            'text_str':
                            melt.bytes_feature(text),
                        }),
                        feature_lists=melt.feature_lists(
                            {'text': melt.int64_feature_list(word_ids)}))
                writer.write(example)

                #global counter, max_num_words, sum_words
                with record_counter.get_lock():
                    record_counter.value += 1
                if word_ids_length > max_num_words.value:
                    with max_num_words.get_lock():
                        max_num_words.value = word_ids_length
                with sum_words.get_lock():
                    sum_words.value += word_ids_length

                if FLAGS.np_save:
                    assert FLAGS.threads == 1
                    gtexts.append(word_ids)
                    gtext_strs.append(text)

                    #Deprecated: image_labels is not used
                    if img not in image_labels:
                        image_labels[img] = set()
                    image_labels[img].add(text)

                if is_top_text:
                    is_top_text = False
                    with image_counter.get_lock():
                        image_counter.value += 1

                    if FLAGS.np_save:
                        if img not in image_labels:
                            image_labels[img] = set()

                        image_names.append(img)
                        ##--well, encoded_image is too big, so do not consider it for evaluation? TODO
                        #image_features.append(encoded_image)
                        if FLAGS.image_dir:
                            #actually save pic path instead of image feature
                            image_features.append(
                                os.path.join(FLAGS.image_dir,
                                             img.replace('/', '_')))

                    if FLAGS.num_max_records > 0:
                        #for a fixed validation set, only take one click text for each image
                        break

            num += 1
            if num == FLAGS.num_max_records:
                break
Example 26
def main(_):
  text2ids.init()
  global_scope = ''
  if FLAGS.add_global_scope:
    global_scope = FLAGS.global_scope if FLAGS.global_scope else FLAGS.algo
 
  sess = melt.get_session(log_device_placement=FLAGS.log_device_placement)
  with tf.variable_scope(global_scope):
    predictor =  algos_factory.gen_predictor(FLAGS.algo)
    with tf.variable_scope(FLAGS.main_scope) as scope:
      ##--notice: if the commented lines below are not added, len(tf.get_collection('encode_state')) is 1; adding them makes it 2,
      ## even though init_predict_text(decode_method=SeqDecodeMethod.beam) will call generate_sequence_greedy
      #text, score = predictor.init_predict_text(decode_method=SeqDecodeMethod.greedy, 
      #                                          beam_size=FLAGS.beam_size,
      #                                          convert_unk=False)   
      #scope.reuse_variables()
      beam_text, beam_score = predictor.init_predict_text(decode_method=SeqDecodeMethod.beam, 
                                                          beam_size=FLAGS.beam_size,
                                                          convert_unk=False)  

  predictor.load(FLAGS.model_dir, sess=sess) 

  for item in tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES):
    print(item)
  #input_text = "王凯整容了吗_王凯整容前后对比照片"
  input_texts = [
                 #'包邮买二送一性感女内裤低腰诱惑透视蕾丝露臀大蝴蝶三角内裤女夏-淘宝网',
                 #'大棚辣椒果实变小怎么办,大棚辣椒果实变小防治措施',
                 #'宝宝太胖怎么办呢',
                 '蛋龟缸,目前4虎纹1剃刀',
                 '大棚辣椒果实变小怎么办,大棚辣椒果实变小防治措施',
                 ]

  for input_text in input_texts:
    word_ids = _text2ids(input_text, INPUT_TEXT_MAX_WORDS)

    print(word_ids)
    print(text2ids.ids2text(word_ids))

    #timer = gezi.Timer()
    #text_, score_ = sess.run([text, score], {predictor.input_text_feed : [word_ids]})
    #print(text_[0], text2ids.ids2text(text_[0]), score_[0], 'time(ms):', timer.elapsed_ms())

    timer = gezi.Timer()
    texts, scores = sess.run([beam_text, beam_score], 
                                            {predictor.input_text_feed : [word_ids]})


    texts = texts[0]
    scores = scores[0]
    for text_, score_ in zip(texts, scores):
      print(text_, text2ids.ids2text(text_), score_, math.log(score_))

    print('beam_search using time(ms):', timer.elapsed_ms())

  input_texts = [
                 '大棚辣椒果实变小怎么办,大棚辣椒果实变小防治措施',
                 '包邮买二送一性感女内裤低腰诱惑透视蕾丝露臀大蝴蝶三角内裤女夏-淘宝网',
                 #'包邮买二送一性感女内裤低腰诱惑透视蕾丝露臀大蝴蝶', #same length as lajiao sentence 15
                 #"宝宝太胖怎么办呢",
                 #'大棚辣椒果实变小怎么办,大棚辣椒果实变小防治措施',
                 #'大棚辣椒果实变小怎么办,大棚辣椒果实变小防治措施',
                 #'邹红建是阿拉斯加',
                 ]

  word_ids_list = [_text2ids(input_text, INPUT_TEXT_MAX_WORDS) for input_text in input_texts]
  timer = gezi.Timer()
  texts_list, scores_list = sess.run([beam_text, beam_score], 
                             feed_dict={predictor.input_text_feed: word_ids_list})
  

  for texts, scores in zip(texts_list, scores_list):
    for text, score in zip(texts, scores):
      print(text, text2ids.ids2text(text), score, math.log(score))

  print('beam_search using time(ms):', timer.elapsed_ms())
Example 27
  if image_file == 'q':
    break

  image_path = os.path.join(image_dir, image_file)
  print('image_path:', image_path)

  if not os.path.exists(image_path):
    print('image path not found!')
    continue

  try:
    hits = img2text[image_file]
    texts = [text_strs[hit] for hit in hits]
    for text in texts:
      word_ids = text2ids.text2ids(text)
      seg_text = text2ids.ids2text(word_ids, print_end=False)
      print('label:', text, seg_text)
      words_importance = sim_predictor.words_importance([word_ids])
      words_importance = words_importance[0]
      print('word importance:')
      for i in range(len(word_ids)):
        if word_ids[i] == 0:
          break 
        print(vocab.key(int(word_ids[i])), words_importance[i], end='|')  
      print()
  except Exception:
    print(traceback.format_exc(), file=sys.stderr)    
    pass

  image = melt.read_image(image_path)
  word_ids, scores = predictor.word_ids([image])
Example 28
def print_generated_text(generated_text, id=-1, name='greedy'):
  if id >= 0:
    logging.info(content_html.format('{}_{}:[ {} ]'.format(name, id, ids2text(generated_text))))
  else:
    logging.info(content_html.format('{}:[ {} ]'.format(name, ids2text(generated_text))))
Esempio n. 29
0
def predicts(imgs, img_features, predictor, rank_metrics, exact_predictor=None, exact_ratio=1.):
  # TODO: GPU can run out of memory when predicting with the show-and-tell model
  if exact_predictor is None:
    if assistant_predictor is not None:
      exact_predictor = predictor
      predictor = assistant_predictor

  #print(predictor, exact_predictor)

  if isinstance(img_features[0], np.string_):
    assert len(img_features) < 2000  #otherwise memory usage is too large
    img_features = np.array([melt.read_image(pic_path) for pic_path in img_features])  

  img2text = get_bidrectional_lable_map()

  random = True
  need_shuffle = False
  if FLAGS.max_texts > 0 and len(all_distinct_texts) > FLAGS.max_texts:
    assert random
    if not random:
      texts = all_distinct_texts[:FLAGS.max_texts]
    else:
      need_shuffle = True

      all_hits = set()
      for img in (imgs):
        hits = img2text[img]
        for hit in hits:
          all_hits.add(hit)
      
      index = np.random.choice(len(all_distinct_texts), FLAGS.max_texts, replace=False)
      index = [x for x in index if x not in all_hits]
      index = list(all_hits) + index 
      index = index[:FLAGS.max_texts]
      index = np.array(index)
      texts = all_distinct_texts[index]
  else:
    texts = all_distinct_texts
  text_strs = all_distinct_text_strs

  step = len(texts)
  if FLAGS.metric_eval_texts_size > 0 and FLAGS.metric_eval_texts_size < step:
    step = FLAGS.metric_eval_texts_size
  start = 0
  scores = []
  while start < len(texts):
    end = start + step 
    if end > len(texts):
      end = len(texts)
    #print('predicts texts start:', start, 'end:', end, end='\r', file=sys.stderr)
    score = predictor.predict(img_features, texts[start: end])
    scores.append(score)
    start = end
  score = np.concatenate(scores, 1)
  #print('image_feature_shape:', img_features.shape, 'text_feature_shape:', texts.shape, 'score_shape:', score.shape)
  num_texts = texts.shape[0]

  for i, img in enumerate(imgs):
    indexes = (-score[i]).argsort()
    #rerank
    if exact_predictor:
      top_indexes = indexes[:FLAGS.assistant_rerank_num]
      exact_texts = texts[top_indexes]
      exact_score = exact_predictor.elementwise_predict([img_features[i]], exact_texts)
      exact_score = np.squeeze(exact_score)
      if exact_ratio < 1.:
        for j in range(len(top_indexes)):
          exact_score[j] = exact_ratio * exact_score[j] + (1. - exact_ratio) * score[i][top_indexes[j]]

      #print(exact_score)
      exact_indexes = (-exact_score).argsort()

      #print(exact_indexes)
      
      new_indexes = [x for x in indexes]
      for j in range(len(exact_indexes)):
        new_indexes[j] = indexes[exact_indexes[j]]
      indexes = new_indexes

    hits = img2text[img]

    if FLAGS.show_info_interval and i % FLAGS.show_info_interval == 0:
      label_text = '|'.join([text_strs[x] for x in hits])
      img_str = img
      if is_img(img):
        img_str = '{0}<p><a href={1} target=_blank><img src={1} height=200></a></p>'.format(img, get_img_url(img))
      logging.info('<P>obj: {} label: {}</P>'.format(img_str, label_text))
      for j in range(5):
        is_hit = indexes[j] in hits if not need_shuffle else index[indexes[j]] in hits
        logging.info('<P>{} {} {} {}</P>'.format(j, is_hit, ids2text(texts[indexes[j]]), exact_score[exact_indexes[j]] if exact_predictor else score[i][indexes[j]]))

    #note: truncating to metric_topn only works for recall@k or precision@k, not for ndcg@k; ndcg@k must use all positions
    num_positions = min(num_texts, FLAGS.metric_topn)
    #num_positions = num_texts

    if not need_shuffle:
      labels = [indexes[j] in hits for j in range(num_positions)]
    else:
      labels = [index[indexes[j]] in hits for j in range(num_positions)]

    rank_metrics.add(labels)
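
The predicts function above scores the candidate texts in chunks of at most FLAGS.metric_eval_texts_size and concatenates the per-chunk score matrices along the text axis, which keeps GPU memory bounded for large candidate sets. The same pattern in isolation, as a sketch with a generic score_fn standing in for predictor.predict:

import numpy as np

def chunked_scores(score_fn, img_features, texts, chunk_size):
  #score (images x texts) in text chunks to bound memory use
  pieces = []
  start = 0
  while start < len(texts):
    end = min(start + chunk_size, len(texts))
    pieces.append(score_fn(img_features, texts[start:end]))  #shape [num_images, end - start]
    start = end
  return np.concatenate(pieces, axis=1)  #shape [num_images, num_texts]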
Esempio n. 30
0
def deal_file(file):
    out_file = '{}/{}'.format(
        FLAGS.output_directory,
        '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
    print('out_file:', out_file)
    with melt.tfrecords.Writer(out_file) as writer:
        num = 0
        for line in open(file):
            if num % 1000 == 0:
                print(num)

            l = line.rstrip('\n').split('\t')
            cs = l[0]  #cs
            simid = l[3]
            objurl = l[1]
            fromurl = l[2]
            keyword = l[4].split('\x01')[0]
            extended_keyword = l[5].split('\x01')[0]

            img = objurl
            #img = cs

            idl4w_end = IDL4W_FEATURE_LEN + 6
            idl4w_feature = [float(x) for x in l[6:idl4w_end]]

            titles = l[idl4w_end + 1]
            descs = l[idl4w_end + 2]

            inception_feature = [float(x) for x in l[idl4w_end + 3:]]

            assert len(inception_feature) == INCEPTION_FEATURE_LEN, '%d %s' % (
                len(inception_feature), cs)

            click_query = l[idl4w_end]
            show_str = 'click:{} ex_key:{} key:{} titles:{} descs:{}'.format(
                click_query, extended_keyword, keyword, titles, descs)
            if click_query == 'noclickquery':
                click_query = ''
                #TODO: for now, only keep examples that have a click query
                continue
            else:
                click_queries = click_query.split('$*$')
                is_top_text = True
                for click_query in click_queries:
                    if click_query.strip() == '':
                        continue

                    text_str = '{} {}'.format(click_query, show_str)

                    text = click_query
                    words = text2ids.Segmentor.Segment(text, FLAGS.seg_method)
                    word_ids = text2ids.words2ids(
                        words,
                        feed_single=FLAGS.feed_single,
                        allow_all_zero=True,
                        pad=False)
                    word_ids_length = len(word_ids)
                    if num % 1000 == 0:
                        print(cs,
                              simid,
                              text,
                              word_ids,
                              text2ids.ids2text(word_ids),
                              len(idl4w_feature),
                              len(inception_feature),
                              file=sys.stderr)
                    if len(word_ids) == 0:
                        continue
                    if is_bad(words, word_ids):
                        #print('luan_ma', cs, simid, text, word_ids, text2ids.ids2text(word_ids), len(idl4w_feature), len(inception_feature), file=sys.stderr)
                        continue

                    word_ids = word_ids[:TEXT_MAX_WORDS]
                    if FLAGS.pad:
                        word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
                    if not FLAGS.write_sequence_example:
                        example = tf.train.Example(features=tf.train.Features(
                            feature={
                                'image_name':
                                melt.bytes_feature(img),
                                'idl4w_feature':
                                melt.float_feature(idl4w_feature),
                                'inception_feature':
                                melt.float_feature(inception_feature),
                                'text_str':
                                melt.bytes_feature(text_str),
                                'text':
                                melt.int64_feature(word_ids),
                            }))
                    else:
                        example = tf.train.SequenceExample(
                            context=melt.features({
                                'image_name':
                                melt.bytes_feature(img),
                                'idl4w_feature':
                                melt.float_feature(idl4w_feature),
                                'inception_feature':
                                melt.float_feature(inception_feature),
                                'text_str':
                                melt.bytes_feature(text_str),
                            }),
                            feature_lists=melt.feature_lists(
                                {'text': melt.int64_feature_list(word_ids)}))
                    writer.write(example)

                    #global counter, max_num_words, sum_words
                    with record_counter.get_lock():
                        record_counter.value += 1
                    if word_ids_length > max_num_words.value:
                        with max_num_words.get_lock():
                            max_num_words.value = word_ids_length
                    with sum_words.get_lock():
                        sum_words.value += word_ids_length

                    if FLAGS.np_save:
                        assert FLAGS.threads == 1
                        texts.append(word_ids)
                        text_strs.append(text)

                        if img not in image_labels:
                            image_labels[img] = set()
                        image_labels[img].add(text)

                    if is_top_text:
                        is_top_text = False
                        with image_counter.get_lock():
                            image_counter.value += 1

                        if FLAGS.np_save:
                            if img not in image_labels:
                                image_labels[img] = set()

                            image_names.append(img)
                            #image_features.append(image_feature)
                            idl4w_features.append(idl4w_feature)
                            inception_features.append(inception_feature)

                        if FLAGS.num_max_records > 0:
                            #for a fixed validation set, keep only one click query per image
                            break

            num += 1
            if num == FLAGS.num_max_records:
                break
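
For reference, a record written by the non-sequence branch of deal_file could be read back with TF1-style parsing. This is only a sketch: the fixed feature lengths and the use of VarLenFeature for 'text' depend on IDL4W_FEATURE_LEN, INCEPTION_FEATURE_LEN, and on whether FLAGS.pad was set when the records were produced.

import tensorflow as tf

def parse_example(serialized, idl4w_len, inception_len):
  #parse one tf.train.Example written above (TF1-style API)
  features = tf.parse_single_example(
      serialized,
      features={
          'image_name': tf.FixedLenFeature([], tf.string),
          'idl4w_feature': tf.FixedLenFeature([idl4w_len], tf.float32),
          'inception_feature': tf.FixedLenFeature([inception_len], tf.float32),
          'text_str': tf.FixedLenFeature([], tf.string),
          'text': tf.VarLenFeature(tf.int64),
      })
  features['text'] = tf.sparse_tensor_to_dense(features['text'])
  return features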