def deal_file(file):
  """Convert one tab-separated input file into a TFRecord file.

  Each input line is split on tabs. Column 0 is the image name, column
  ``FLAGS.text_index`` holds ``\\x01``-separated texts, column
  ``FLAGS.image_feature_index`` holds ``\\x01``-separated floats and column
  ``FLAGS.input_text_index`` holds ``\\x01``-separated input texts. One
  record is written for every accepted (input_text, text) pair.

  The output path is ``FLAGS.output_directory/<FLAGS.name>-<suffix>``, where
  ``<suffix>`` is whatever follows the last ``-`` in the input file name.

  Side effects: updates the module-level counters ``record_counter``,
  ``max_num_words``, ``sum_words`` and ``image_counter`` and, when
  ``FLAGS.np_save`` is set, appends to the module-level ``gtexts``,
  ``gtext_strs``, ``image_labels``, ``image_names`` and ``image_features``.

  Args:
    file: path of the input text file.

  Raises:
    AssertionError: if ``FLAGS.np_save`` is set while ``FLAGS.threads != 1``.

  NOTE(review): block nesting was reconstructed from a whitespace-collapsed
  source; confirm against the upstream file.
  """
  out_file = '{}/{}'.format(
      FLAGS.output_directory,
      '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
  print('out_file:', out_file)

  # Open the input through a context manager so the handle is closed on every
  # exit path (it used to be left to the garbage collector).
  with melt.tfrecords.Writer(out_file) as writer, open(file) as reader:
    num = 0
    for line in reader:
      if num % 1000 == 0:
        print(num)

      fields = line.rstrip('\n').split('\t')
      img = fields[0]
      texts = fields[FLAGS.text_index].split('\x01')
      image_feature = [
          float(x)
          for x in fields[FLAGS.image_feature_index].strip().split('\x01')
      ]

      # Rows with an unexpected feature length are reported and skipped
      # rather than asserted on; they do not advance `num`.
      if len(image_feature) != IMAGE_FEATURE_LEN:
        print('bad line:', line)
        continue

      for input_text in fields[FLAGS.input_text_index].split('\x01'):
        input_words = text2ids.Segmentor.Segment(input_text, FLAGS.seg_method)
        input_word_ids = text2ids.words2ids(
            input_words,
            feed_single=FLAGS.feed_single,
            allow_all_zero=True,
            pad=False)
        if len(input_word_ids) == 0:
          continue
        input_word_ids = input_word_ids[:INPUT_TEXT_MAX_WORDS]
        if FLAGS.pad:
          input_word_ids = gezi.pad(input_word_ids)

        # True until the first record for this input_text has been written.
        is_top_text = True
        for text in texts:
          if text.strip() == '':
            continue

          words = text2ids.Segmentor.Segment(text, FLAGS.seg_method)
          word_ids = text2ids.words2ids(
              words,
              feed_single=FLAGS.feed_single,
              allow_all_zero=True,
              pad=False)
          # Length before truncation/padding; this is what the stats use.
          word_ids_length = len(word_ids)

          if num % 1000 == 0:
            print(img, text, word_ids, text2ids.ids2text(word_ids),
                  len(image_feature), file=sys.stderr)

          if word_ids_length == 0:
            continue

          # is_luanma is defined elsewhere; presumably it flags garbled text.
          if is_luanma(words, word_ids):
            print('luanma', img, text, word_ids, text2ids.ids2text(word_ids),
                  len(image_feature), file=sys.stderr)
            continue

          word_ids = word_ids[:TEXT_MAX_WORDS]
          if FLAGS.pad:
            word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)

          if not FLAGS.write_sequence_example:
            example = tf.train.Example(features=tf.train.Features(
                feature={
                    'image_name': melt.bytes_feature(img),
                    'image_feature': melt.float_feature(image_feature),
                    'input_text_str': melt.bytes_feature(input_text),
                    'input_text': melt.int64_feature(input_word_ids),
                    'text_str': melt.bytes_feature(text),
                    'text': melt.int64_feature(word_ids),
                }))
          else:
            example = tf.train.SequenceExample(
                context=melt.features({
                    'image_name': melt.bytes_feature(img),
                    'image_feature': melt.float_feature(image_feature),
                    'input_text_str': melt.bytes_feature(input_text),
                    'text_str': melt.bytes_feature(text),
                }),
                feature_lists=melt.feature_lists({
                    'input_text': melt.int64_feature_list(input_word_ids),
                    'text': melt.int64_feature_list(word_ids)
                }))
          writer.write(example)

          with record_counter.get_lock():
            record_counter.value += 1
          # Compare and store under the same lock so another worker cannot
          # overwrite a larger maximum between the check and the assignment.
          with max_num_words.get_lock():
            if word_ids_length > max_num_words.value:
              max_num_words.value = word_ids_length
          with sum_words.get_lock():
            sum_words.value += word_ids_length

          if FLAGS.np_save:
            assert FLAGS.threads == 1
            gtexts.append(word_ids)
            gtext_strs.append(text)
            if img not in image_labels:
              image_labels[img] = set()
            image_labels[img].add(text)

          if is_top_text:
            is_top_text = False
            with image_counter.get_lock():
              image_counter.value += 1

            if FLAGS.np_save:
              if img not in image_labels:
                image_labels[img] = set()
              image_names.append(img)
              image_features.append(image_feature)

            if FLAGS.num_max_records > 0:
              # With a fixed record budget keep only the first accepted text.
              break

      num += 1
      if num == FLAGS.num_max_records:
        break
def deal_file(file):
  """Convert one tab-separated image/text file into a TFRecord file.

  Each input line is split on tabs. Column 0 is the image name, column
  ``FLAGS.text_index`` holds ``\\x01``-separated texts and column
  ``FLAGS.image_feature_index`` holds ``\\x01``-separated floats. Every text
  is passed through ``normalize.norm`` and ``_text2ids``; one record is
  written per text that yields at least one word id.

  The output path is ``FLAGS.output_directory/<FLAGS.name>-<suffix>``, where
  ``<suffix>`` is whatever follows the last ``-`` in the input file name.

  Side effects: updates the module-level counters ``record_counter``,
  ``max_num_words``, ``sum_words`` and ``image_counter`` and, when
  ``FLAGS.np_save`` is set, appends to the module-level ``gtexts``,
  ``gtext_strs``, ``image_labels``, ``image_names`` and ``image_features``.

  Args:
    file: path of the input text file.

  Raises:
    AssertionError: if a row's feature length differs from
      ``IMAGE_FEATURE_LEN``, or if ``FLAGS.np_save`` is set while
      ``FLAGS.threads != 1``.

  NOTE(review): block nesting was reconstructed from a whitespace-collapsed
  source; confirm against the upstream file.
  """
  out_file = '{}/{}'.format(
      FLAGS.output_directory,
      '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
  print('file:', file, 'out_file:', out_file, file=sys.stderr)

  # Open the input through a context manager so the handle is closed on every
  # exit path (it used to be left to the garbage collector).
  with melt.tfrecords.Writer(out_file) as writer, open(file) as reader:
    num = 0
    for line in reader:
      if num % 1000 == 0:
        print(num, file=sys.stderr)

      fields = line.rstrip('\n').split('\t')
      img = fields[0]
      texts = fields[FLAGS.text_index].split('\x01')
      image_feature = [
          float(x)
          for x in fields[FLAGS.image_feature_index].strip().split('\x01')
      ]
      assert len(image_feature) == IMAGE_FEATURE_LEN, '%s %d' % (
          img, len(image_feature))

      # True until the first record for this image has been written.
      is_top_text = True
      for text in texts:
        text = normalize.norm(text)
        if text.strip() == '':
          print('empty line', line, file=sys.stderr)
          continue

        word_ids = _text2ids(text, TEXT_MAX_WORDS)
        # Length as returned by _text2ids; this is what the stats use.
        word_ids_length = len(word_ids)

        if num % 10000 == 0:
          print(img, text, word_ids, text2ids.ids2text(word_ids),
                len(image_feature), file=sys.stderr)

        if len(word_ids) == 0:
          print('empy wordids!', file=sys.stderr)
          print(img, text, word_ids, text2ids.ids2text(word_ids),
                len(image_feature), file=sys.stderr)
          continue

        # No is_luanma filtering is applied in this variant.
        word_ids = word_ids[:TEXT_MAX_WORDS]
        if FLAGS.pad:
          word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)

        if not FLAGS.write_sequence_example:
          example = tf.train.Example(features=tf.train.Features(
              feature={
                  'image_name': melt.bytes_feature(img),
                  'image_feature': melt.float_feature(image_feature),
                  'text_str': melt.bytes_feature(text),
                  'text': melt.int64_feature(word_ids),
              }))
        else:
          example = tf.train.SequenceExample(
              context=melt.features({
                  'image_name': melt.bytes_feature(img),
                  'image_feature': melt.float_feature(image_feature),
                  'text_str': melt.bytes_feature(text),
              }),
              feature_lists=melt.feature_lists(
                  {'text': melt.int64_feature_list(word_ids)}))
        writer.write(example)

        with record_counter.get_lock():
          record_counter.value += 1
        # Compare and store under the same lock so another worker cannot
        # overwrite a larger maximum between the check and the assignment.
        with max_num_words.get_lock():
          if word_ids_length > max_num_words.value:
            max_num_words.value = word_ids_length
        with sum_words.get_lock():
          sum_words.value += word_ids_length

        if FLAGS.np_save:
          assert FLAGS.threads == 1
          gtexts.append(word_ids)
          gtext_strs.append(text)
          # Deprecated: image_labels is no longer used downstream.
          if img not in image_labels:
            image_labels[img] = set()
          image_labels[img].add(text)

        if is_top_text:
          is_top_text = False
          with image_counter.get_lock():
            image_counter.value += 1

          if FLAGS.np_save:
            if img not in image_labels:
              image_labels[img] = set()
            image_names.append(img)
            if FLAGS.small_feature:
              image_features.append(image_feature)
            else:
              # Store the picture path instead of the feature vector.
              image_features.append(
                  os.path.join(FLAGS.big_feature_image_dir,
                               img.replace('/', '_')))

          if FLAGS.num_max_records > 0:
            # With a fixed record budget keep only the first accepted text.
            break

      num += 1
      if num == FLAGS.num_max_records:
        break
def deal_imgtextfile(file):
  """Write TFRecords that embed the encoded image itself.

  Each input line is split on tabs. Column 0 is the image name and the last
  column is the URL-quoted encoded image. The texts for an image come from
  the module-level ``pic_info_map[img]`` (``\\x01``-separated); lines whose
  image is not in that map are skipped. One record is written per text that
  yields at least one word id.

  Original author's note: image text / encoded images are large (about 18G
  for "2w" pictures, versus about 373M for a (23820, 2048) image-feature
  array). This path is therefore rarely used -- only when you do not need
  image metric evaluation (recall@1, ...) and do not want to convert and
  store image binaries in a preprocessing step.

  Side effects: updates the module-level counters ``record_counter``,
  ``max_num_words``, ``sum_words`` and ``image_counter`` and, when
  ``FLAGS.np_save`` is set, appends to the module-level ``gtexts``,
  ``gtext_strs``, ``image_labels``, ``image_names`` and ``image_features``.

  Args:
    file: path of the input text file.

  Raises:
    AssertionError: if ``pic_info_map`` is empty, or if ``FLAGS.np_save`` is
      set while ``FLAGS.threads != 1``.

  NOTE(review): block nesting was reconstructed from a whitespace-collapsed
  source; confirm against the upstream file.
  """
  out_file = '{}/{}'.format(
      FLAGS.output_directory,
      '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
  print('file:', file, 'out_file:', out_file, file=sys.stderr)
  assert len(pic_info_map) > 0

  # Open the input through a context manager so the handle is closed on every
  # exit path (it used to be left to the garbage collector).
  with melt.tfrecords.Writer(out_file) as writer, open(file) as reader:
    num = 0
    for line in reader:
      if num % 1000 == 0:
        print(num, file=sys.stderr)

      fields = line.rstrip('\n').split('\t')
      img = fields[0]
      # Skipped lines do not advance `num`.
      if img not in pic_info_map:
        continue

      # NOTE(review): urllib.unquote_plus is the Python 2 location of this
      # function; a Python 3 port needs a bytes-safe replacement.
      encoded_image = urllib.unquote_plus(fields[-1])
      texts = pic_info_map[img].split('\x01')

      # True until the first record for this image has been written.
      is_top_text = True
      for text in texts:
        text = normalize.norm(text)
        if text.strip() == '':
          print('empty line', line, file=sys.stderr)
          continue

        word_ids = _text2ids(text, TEXT_MAX_WORDS)
        # Length as returned by _text2ids; this is what the stats use.
        word_ids_length = len(word_ids)

        if num % 10000 == 0:
          print(img, text, word_ids, text2ids.ids2text(word_ids),
                file=sys.stderr)

        if len(word_ids) == 0:
          print('empy wordids!', file=sys.stderr)
          print(img, text, word_ids, text2ids.ids2text(word_ids),
                file=sys.stderr)
          continue

        # No is_luanma filtering is applied in this variant.
        word_ids = word_ids[:TEXT_MAX_WORDS]
        if FLAGS.pad:
          word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)

        if not FLAGS.write_sequence_example:
          example = tf.train.Example(features=tf.train.Features(
              feature={
                  'image_name': melt.bytes_feature(img),
                  'image_data': melt.bytes_feature(encoded_image),
                  'text_str': melt.bytes_feature(text),
                  'text': melt.int64_feature(word_ids),
              }))
        else:
          example = tf.train.SequenceExample(
              context=melt.features({
                  'image_name': melt.bytes_feature(img),
                  'image_data': melt.bytes_feature(encoded_image),
                  'text_str': melt.bytes_feature(text),
              }),
              feature_lists=melt.feature_lists(
                  {'text': melt.int64_feature_list(word_ids)}))
        writer.write(example)

        with record_counter.get_lock():
          record_counter.value += 1
        # Compare and store under the same lock so another worker cannot
        # overwrite a larger maximum between the check and the assignment.
        with max_num_words.get_lock():
          if word_ids_length > max_num_words.value:
            max_num_words.value = word_ids_length
        with sum_words.get_lock():
          sum_words.value += word_ids_length

        if FLAGS.np_save:
          assert FLAGS.threads == 1
          gtexts.append(word_ids)
          gtext_strs.append(text)
          # Deprecated: image_labels is no longer used downstream.
          if img not in image_labels:
            image_labels[img] = set()
          image_labels[img].add(text)

        if is_top_text:
          is_top_text = False
          with image_counter.get_lock():
            image_counter.value += 1

          if FLAGS.np_save:
            if img not in image_labels:
              image_labels[img] = set()
            image_names.append(img)
            # TODO: the encoded image is too large to keep in memory for
            # evaluation, so only a picture path is stored (when configured).
            if FLAGS.image_dir:
              image_features.append(
                  os.path.join(FLAGS.image_dir, img.replace('/', '_')))

          if FLAGS.num_max_records > 0:
            # With a fixed record budget keep only the first accepted text.
            break

      num += 1
      if num == FLAGS.num_max_records:
        break
def _parse_line(line, writer, thread_index=0):
  """Turn one feature line into TF examples and write them.

  The line is split on tabs: column 0 is the image name and the remaining
  columns are parsed as floats (the image feature). Texts for the image come
  from the module-level ``text_map[image_name]``, an iterable of
  ``(text, ori_text)`` pairs; ``text`` is split on ``WORDS_SEP`` and mapped
  through ``vocabulary``. Images missing from ``text_map`` are reported and
  ignored.

  Side effects: writes serialized examples to ``writer``; when
  ``FLAGS.np_save`` is set appends to ``gtexts[thread_index]`` and
  ``gtext_strs[thread_index]``; when ``FLAGS.num_records`` is unset updates
  the module-level counters ``counter``, ``max_num_words`` and ``sum_words``.
  When ``FLAGS.num_records`` is set the process exits once that many distinct
  images have been written.

  Args:
    line: one raw input line.
    writer: object exposing ``write(serialized_example)``.
    thread_index: index into ``gtexts`` / ``gtext_strs`` for this worker.

  NOTE(review): block nesting was reconstructed from a whitespace-collapsed
  source. In particular the counter updates are assumed to belong to the
  ``else`` branch of ``if FLAGS.num_records`` -- confirm against upstream.
  """
  fields = line.rstrip().split('\t')
  image_name = fields[0]
  image_feature = [float(x) for x in fields[1:]]

  if image_name not in text_map:
    print('image ', image_name, 'ignore ', 'name_len ', len(image_name),
          len(image_name.strip()))
    return

  image_path = FLAGS.image_dir + '/' + image_name
  if FLAGS.write_raw_image_bytes:
    # Image files are binary: read them in 'rb' mode so the bytes are not
    # text-decoded (text mode fails on non-UTF-8 data under Python 3).
    with tf.gfile.FastGFile(image_path, "rb") as f:
      encoded_image = f.read()
  else:
    encoded_image = ''

  # JPEG validation via a decoder was tried here and dropped upstream because
  # it hangs when run from multiple processes.

  for text, ori_text in text_map[image_name]:
    word_ids = [
        vocabulary.id(word)
        for word in text.split(WORDS_SEP)
        if vocabulary.has(word) or ENCODE_UNK
    ]
    if not word_ids:
      continue

    # Length before truncation/padding; this is what the stats use.
    word_ids_length = len(word_ids)
    word_ids = word_ids[:TEXT_MAX_WORDS]
    if FLAGS.pad:
      word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS)

    if not FLAGS.write_sequence_example:
      example = tf.train.Example(features=tf.train.Features(feature={
          'image_name': melt.bytes_feature(image_name),
          'image_data': melt.bytes_feature(encoded_image),
          'image_feature': melt.float_feature(image_feature),
          'text': melt.int64_feature(word_ids),
          'text_str': melt.bytes_feature(ori_text),
      }))
    else:
      example = tf.train.SequenceExample(
          context=melt.features({
              'image_name': melt.bytes_feature(image_name),
              'image_data': melt.bytes_feature(encoded_image),
              'image_feature': melt.float_feature(image_feature),
              'text_str': melt.bytes_feature(ori_text),
          }),
          feature_lists=melt.feature_lists({
              'text': melt.int64_feature_list(word_ids)
          }))

    if FLAGS.np_save:
      gtexts[thread_index].append(word_ids)
      gtext_strs[thread_index].append(ori_text)

    # NOTICE: the num_records path is untested for num_threads > 1.
    if FLAGS.num_records:
      # Write only the first example of each not-yet-seen image.
      if image_name not in images:
        images[image_name] = 1
        print(image_name, len(images))
        writer.write(example.SerializeToString())
        if len(images) == FLAGS.num_records:
          print('Done')
          # NOTE(review): exits with status 1 on the normal completion path;
          # confirm callers expect that.
          exit(1)
    else:
      writer.write(example.SerializeToString())
      # The counters are only mutated through `.value`, never rebound, so no
      # `global` declaration is needed.
      with counter.get_lock():
        counter.value += 1
      # Compare and store under the same lock so another worker cannot
      # overwrite a larger maximum between the check and the assignment.
      with max_num_words.get_lock():
        if word_ids_length > max_num_words.value:
          max_num_words.value = word_ids_length
      with sum_words.get_lock():
        sum_words.value += word_ids_length
def deal_file(file):
  """Convert a tab-separated (input_text, text) file into a TFRecord file.

  Each input line is split on tabs; column ``FLAGS.text_index`` is the text
  and column ``FLAGS.input_text_index`` is the input text. Both are segmented
  with ``text2ids.Segmentor.Segment`` and mapped to ids with
  ``text2ids.words2ids``. A line is skipped (without advancing the line
  counter) when either side yields no ids or when ``is_luanma`` rejects the
  text; otherwise exactly one record is written.

  The output path is ``FLAGS.output_directory/<FLAGS.name>-<suffix>``, where
  ``<suffix>`` is whatever follows the last ``-`` in the input file name.

  Side effects: updates the module-level counters ``record_counter``,
  ``max_num_words`` and ``sum_words`` and, when ``FLAGS.np_save`` is set,
  appends to the module-level ``gtexts`` and ``gtext_strs``.

  Args:
    file: path of the input text file.

  Raises:
    AssertionError: if ``FLAGS.np_save`` is set while ``FLAGS.threads != 1``.

  NOTE(review): block nesting was reconstructed from a whitespace-collapsed
  source; confirm against the upstream file.
  """
  out_file = '{}/{}'.format(
      FLAGS.output_directory,
      '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
  print('out_file:', out_file)

  # Open the input through a context manager so the handle is closed on every
  # exit path (it used to be left to the garbage collector).
  with melt.tfrecords.Writer(out_file) as writer, open(file) as reader:
    num = 0
    for line in reader:
      if num % 1000 == 0:
        print(num)

      fields = line.rstrip('\n').split('\t')
      text = fields[FLAGS.text_index]
      input_text = fields[FLAGS.input_text_index]

      input_words = text2ids.Segmentor.Segment(input_text, FLAGS.seg_method)
      input_word_ids = text2ids.words2ids(
          input_words,
          feed_single=FLAGS.feed_single,
          allow_all_zero=True,
          pad=False)
      if len(input_word_ids) == 0:
        continue
      input_word_ids = input_word_ids[:INPUT_TEXT_MAX_WORDS]
      if FLAGS.pad:
        input_word_ids = gezi.pad(input_word_ids)

      words = text2ids.Segmentor.Segment(text, FLAGS.seg_method)
      word_ids = text2ids.words2ids(
          words,
          feed_single=FLAGS.feed_single,
          allow_all_zero=True,
          pad=False)
      # Length before truncation/padding; this is what the stats use.
      word_ids_length = len(word_ids)

      if num % 1000 == 0:
        print(text, word_ids, text2ids.ids2text(word_ids), file=sys.stderr)

      if word_ids_length == 0:
        continue

      # is_luanma is defined elsewhere; presumably it flags garbled text.
      if is_luanma(words, word_ids):
        print('luanma', text, word_ids, text2ids.ids2text(word_ids),
              file=sys.stderr)
        continue

      word_ids = word_ids[:TEXT_MAX_WORDS]
      if FLAGS.pad:
        word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)

      if not FLAGS.write_sequence_example:
        example = tf.train.Example(features=tf.train.Features(feature={
            'input_text_str': melt.bytes_feature(input_text),
            'input_text': melt.int64_feature(input_word_ids),
            'text_str': melt.bytes_feature(text),
            'text': melt.int64_feature(word_ids),
        }))
      else:
        example = tf.train.SequenceExample(
            context=melt.features({
                'input_text_str': melt.bytes_feature(input_text),
                'text_str': melt.bytes_feature(text),
            }),
            feature_lists=melt.feature_lists({
                'input_text': melt.int64_feature_list(input_word_ids),
                'text': melt.int64_feature_list(word_ids)
            }))
      writer.write(example)

      with record_counter.get_lock():
        record_counter.value += 1
      # Compare and store under the same lock so another worker cannot
      # overwrite a larger maximum between the check and the assignment.
      with max_num_words.get_lock():
        if word_ids_length > max_num_words.value:
          max_num_words.value = word_ids_length
      with sum_words.get_lock():
        sum_words.value += word_ids_length

      if FLAGS.np_save:
        assert FLAGS.threads == 1
        gtexts.append(word_ids)
        gtext_strs.append(text)

      num += 1
      if num == FLAGS.num_max_records:
        break
def deal_file(file):
  """Convert a tab-separated click-log file into a TFRecord file.

  Columns as read by this function (tab-separated):
    0: ``cs``; 1: object URL (used as the image name); 3: ``simid``;
    4 / 5: keyword / extended keyword (first ``\\x01``-separated item);
    6 .. 6+IDL4W_FEATURE_LEN-1: idl4w feature floats;
    then click query, titles, descs, and the remaining columns as the
    inception feature floats.

  The click query is split on ``'$*$'``; one record is written per query that
  yields word ids and is not rejected by ``is_bad``. Rows whose click query
  is ``'noclickquery'`` are skipped without advancing the line counter.

  The output path is ``FLAGS.output_directory/<FLAGS.name>-<suffix>``, where
  ``<suffix>`` is whatever follows the last ``-`` in the input file name.

  Side effects: updates the module-level counters ``record_counter``,
  ``max_num_words``, ``sum_words`` and ``image_counter`` and, when
  ``FLAGS.np_save`` is set, appends to the module-level ``texts``,
  ``text_strs``, ``image_labels``, ``image_names``, ``idl4w_features`` and
  ``inception_features``.

  Args:
    file: path of the input text file.

  Raises:
    AssertionError: if the inception feature length differs from
      ``INCEPTION_FEATURE_LEN``, or if ``FLAGS.np_save`` is set while
      ``FLAGS.threads != 1``.

  NOTE(review): block nesting was reconstructed from a whitespace-collapsed
  source; confirm against the upstream file.
  """
  out_file = '{}/{}'.format(
      FLAGS.output_directory,
      '-'.join([FLAGS.name, file.split('/')[-1].split('-')[-1]]))
  print('out_file:', out_file)

  # Open the input through a context manager so the handle is closed on every
  # exit path (it used to be left to the garbage collector).
  with melt.tfrecords.Writer(out_file) as writer, open(file) as reader:
    num = 0
    for line in reader:
      if num % 1000 == 0:
        print(num)

      fields = line.rstrip('\n').split('\t')
      cs = fields[0]
      simid = fields[3]
      objurl = fields[1]
      keyword = fields[4].split('\x01')[0]
      extended_keyword = fields[5].split('\x01')[0]
      # The object URL is used as the image name (cs was the alternative).
      img = objurl

      idl4w_end = IDL4W_FEATURE_LEN + 6
      idl4w_feature = [float(x) for x in fields[6:idl4w_end]]
      titles = fields[idl4w_end + 1]
      descs = fields[idl4w_end + 2]
      inception_feature = [float(x) for x in fields[idl4w_end + 3:]]
      assert len(inception_feature) == INCEPTION_FEATURE_LEN, '%d %s' % (
          len(inception_feature), cs)

      click_query = fields[idl4w_end]
      show_str = 'click:{} ex_key:{} key:{} titles:{} descs:{}'.format(
          click_query, extended_keyword, keyword, titles, descs)

      if click_query == 'noclickquery':
        # TODO: only click queries are considered for now.
        continue

      # True until the first record for this image has been written.
      is_top_text = True
      for query in click_query.split('$*$'):
        if query.strip() == '':
          continue

        text_str = '{} {}'.format(query, show_str)
        text = query
        words = text2ids.Segmentor.Segment(text, FLAGS.seg_method)
        word_ids = text2ids.words2ids(
            words,
            feed_single=FLAGS.feed_single,
            allow_all_zero=True,
            pad=False)
        # Length before truncation/padding; this is what the stats use.
        word_ids_length = len(word_ids)

        if num % 1000 == 0:
          print(cs, simid, text, word_ids, text2ids.ids2text(word_ids),
                len(idl4w_feature), len(inception_feature), file=sys.stderr)

        if len(word_ids) == 0:
          continue

        # is_bad is defined elsewhere; rejected queries are dropped silently.
        if is_bad(words, word_ids):
          continue

        word_ids = word_ids[:TEXT_MAX_WORDS]
        if FLAGS.pad:
          word_ids = gezi.pad(word_ids, TEXT_MAX_WORDS, 0)

        if not FLAGS.write_sequence_example:
          example = tf.train.Example(features=tf.train.Features(
              feature={
                  'image_name': melt.bytes_feature(img),
                  'idl4w_feature': melt.float_feature(idl4w_feature),
                  'inception_feature': melt.float_feature(inception_feature),
                  'text_str': melt.bytes_feature(text_str),
                  'text': melt.int64_feature(word_ids),
              }))
        else:
          example = tf.train.SequenceExample(
              context=melt.features({
                  'image_name': melt.bytes_feature(img),
                  'idl4w_feature': melt.float_feature(idl4w_feature),
                  'inception_feature': melt.float_feature(inception_feature),
                  'text_str': melt.bytes_feature(text_str),
              }),
              feature_lists=melt.feature_lists(
                  {'text': melt.int64_feature_list(word_ids)}))
        writer.write(example)

        with record_counter.get_lock():
          record_counter.value += 1
        # Compare and store under the same lock so another worker cannot
        # overwrite a larger maximum between the check and the assignment.
        with max_num_words.get_lock():
          if word_ids_length > max_num_words.value:
            max_num_words.value = word_ids_length
        with sum_words.get_lock():
          sum_words.value += word_ids_length

        if FLAGS.np_save:
          assert FLAGS.threads == 1
          texts.append(word_ids)
          text_strs.append(text)
          if img not in image_labels:
            image_labels[img] = set()
          image_labels[img].add(text)

        if is_top_text:
          is_top_text = False
          with image_counter.get_lock():
            image_counter.value += 1

          if FLAGS.np_save:
            if img not in image_labels:
              image_labels[img] = set()
            image_names.append(img)
            idl4w_features.append(idl4w_feature)
            inception_features.append(inception_feature)

          if FLAGS.num_max_records > 0:
            # With a fixed record budget keep only the first accepted query.
            break

      num += 1
      if num == FLAGS.num_max_records:
        break