Exemple #1
0
def deal_file(file, thread_index):
    """Convert one tab-separated input file into TFRecord examples.

    Every input line is split on tabs: field 0 is the image name, the next
    ``IMAGE_FEATURE_LEN`` fields are parsed as floats (the image feature) and
    each remaining field is a text entry, of which only the part before the
    first 0x01 separator character is used. One ``tf.train.Example`` is
    written per text that is non-blank and maps to at least one word id.

    Side effects on module-level state:
      * ``counter``, ``max_num_words`` and ``sum_words`` are updated under
        their own locks (they expose ``get_lock()``/``value``; presumably
        ``multiprocessing.Value`` objects -- confirm against their
        definition). The word statistics use the id count *before*
        truncation to ``TEXT_MAX_WORDS``.
      * When ``FLAGS.np_save`` is set, the (truncated/padded) ids and the raw
        text are appended to ``gtexts[thread_index]`` and
        ``gtext_strs[thread_index]``.
      * ``texts_dict[thread_index]`` and ``text_strs_dict[thread_index]`` are
        always assigned from those per-worker lists once the file is done.

    Args:
        file: Path of the input file to read.
        thread_index: Index of the worker handling this file; used to suffix
            the output file name when ``FLAGS.threads > 1`` and to select the
            per-worker slots in the shared containers.
    """
    if FLAGS.threads > 1:
        out_file = '{}/{}_{}'.format(FLAGS.output_directory, FLAGS.name,
                                     thread_index)
    else:
        out_file = '{}/{}'.format(FLAGS.output_directory, FLAGS.name)
    print('out_file:', out_file)

    # The writer is opened first (as before); the input file is now managed by
    # the same ``with`` so its handle is closed even if processing fails.
    with melt.tfrecords.Writer(out_file) as writer, open(file) as lines:
        for num, line in enumerate(lines):
            report = num % 1000 == 0
            if report:
                print(num)
            l = line.rstrip().split('\t')
            img = l[0]
            img_end = IMAGE_FEATURE_LEN + 1
            img_feature = [float(x) for x in l[1:img_end]]
            texts = [x.split('\x01')[0] for x in l[img_end:]]
            for text in texts:
                if text.strip() == '':
                    continue
                #@TODO from text -> ids should move out so online code can share it for evaluation or use for feed dict
                word_ids = text2ids.text2ids(text,
                                             seg_method=FLAGS.seg_method,
                                             feed_single=FLAGS.feed_single,
                                             allow_all_zero=True,
                                             pad=False)
                # Length before truncation/padding; this is what the shared
                # statistics below are based on.
                word_ids_length = len(word_ids)
                if report:
                    print(text,
                          word_ids,
                          text2ids.ids2text(word_ids),
                          file=sys.stderr)
                if word_ids_length == 0:
                    continue
                word_ids = word_ids[:TEXT_MAX_WORDS]
                if FLAGS.pad:
                    word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)

                if FLAGS.np_save:
                    gtexts[thread_index].append(word_ids)
                    gtext_strs[thread_index].append(text)

                #add pos info? weght info? or @TODO add click num info
                example = tf.train.Example(features=tf.train.Features(
                    feature={
                        'image_name': melt.bytes_feature(img),
                        'image_feature': melt.float_feature(img_feature),
                        'text': melt.int_feature(word_ids),
                        'text_str': melt.bytes_feature(text),
                    }))
                writer.write(example)

                # No ``global`` statement is needed: the shared objects are
                # only mutated through ``.value``, never rebound.
                with counter.get_lock():
                    counter.value += 1
                # Cheap unlocked pre-check to avoid taking the lock for most
                # texts, then re-check under the lock so a larger value stored
                # by another worker in between is not overwritten.
                if word_ids_length > max_num_words.value:
                    with max_num_words.get_lock():
                        if word_ids_length > max_num_words.value:
                            max_num_words.value = word_ids_length
                with sum_words.get_lock():
                    sum_words.value += word_ids_length

    texts_dict[thread_index] = gtexts[thread_index]
    text_strs_dict[thread_index] = gtext_strs[thread_index]
  
  writer = melt.tfrecords.Writer(outfile)

num = 0
count = 0
for line in sys.stdin:
  if num % 1000 == 0:
   print(num, file=sys.stderr)
  num += 1
  l = line.rstrip().split('\t')
  img = l[0]
  img_end = IMAGE_FEATURE_LEN + 1
  img_feature = [float(x) for x in l[1: img_end]]
  texts = [x.split('\x01')[0] for x in l[img_end:]]
  for text in texts:
    word_ids = text2ids.text2ids(text, seg_method=FLAGS.seg_method, feed_single=FLAGS.feed_single, allow_all_zero=True, pad=False)
    word_ids_length = len(word_ids)
    if num % 1000 == 0:
     #print(libgezi.gbk2utf8('\t'.join(words)), file=sys.stderr)
     #print('\t'.join(words), file=sys.stderr)
     print(word_ids, file=sys.stderr)
    if len(word_ids) == 0:
      continue
    word_ids = word_ids[:TEXT_MAX_WORDS]
    if FLAGS.pad:
      word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)
    
    if writer is not None:
      example = tf.train.Example(features=tf.train.Features(feature={
        'image_name': melt.bytes_feature(img),
        'image_feature': melt.float_feature(img_feature),