def parse_text_file(text_file):
  """Parse a tab-separated caption file into the globals text_map/text_map_.

  Each input line is: image<TAB>...<TAB>text, where the image field carries
  a '#' suffix that is stripped off.  Populates:
    text_map:  image -> list of (text, ori_text) pairs
    text_map_: image -> set of texts, used only to de-duplicate captions

  Silently returns (with a message on stderr) if text_file does not exist.
  """
  if not os.path.exists(text_file):
    print('No text file', text_file, file=sys.stderr)
    return
  num_lines = gezi.get_num_lines(text_file)
  pb = ProgressBar(num_lines, 'parse text file %s' % text_file)
  for line in open(text_file):
    pb.progress()
    l = line.split('\t')
    image = l[0]
    # image field looks like "name#k"; keep only the name part.
    # NOTE(review): raises ValueError if a line has no '#' — assumed
    # guaranteed by the corpus format; confirm against the data.
    image = image[:image.index('#')]
    text = l[-1].strip()
    # why text and ori_text?  For the cn corpus text is \x01 separated
    # (segmented text); for the en corpus text and ori_text are the same.
    ori_text = l[FLAGS.ori_text_index].strip()
    if text == '':
      continue
    if image not in text_map:
      text_map_[image] = set([text])
      text_map[image] = [(text, ori_text)]
    else:
      # BUGFIX: membership must be tested against the per-image set
      # text_map_[image], not against the dict text_map_ itself (whose
      # keys are images) — otherwise duplicate captions are appended.
      if text not in text_map_[image]:
        text_map_[image].add(text)
        text_map[image].append((text, ori_text))
  for image in text_map:
    text_map[image] = list(text_map[image])
def convert_to(feat_file, name):
  """Convert feat_file to TFRecord shards named '<name>-NNNNN-of-MMMMM'.

  When FLAGS.threads > 1, spawns one process per shard (threads must equal
  shards) over slices of the in-memory line list.  Otherwise streams the
  file in this process, rolling over to a new writer at each shard boundary.
  In debug mode only NUM_DEBUG_LINES lines are processed.
  """
  num_shards = FLAGS.shards
  num_threads = FLAGS.threads
  if FLAGS.threads > 1:
    assert num_threads == num_shards
    # BUGFIX: use a context manager so the input handle is closed
    # (the original open(...).readlines() leaked the file object).
    with open(feat_file) as fin:
      f = fin.readlines()
    num_lines = len(f)
    if FLAGS.debug:
      num_lines = NUM_DEBUG_LINES
    shard_ranges = np.linspace(0, num_lines, num_shards + 1).astype(int)
    record = []
    for i in xrange(num_threads):
      args = (f, name, i, shard_ranges[i], shard_ranges[i + 1])
      process = multiprocessing.Process(target=_convert_to, args=args)
      process.start()
      record.append(process)
    for process in record:
      process.join()
    return
  #--------------single thread
  num_lines = gezi.get_num_lines(feat_file)
  if FLAGS.debug:
    num_lines = NUM_DEBUG_LINES
  shard_ranges = np.linspace(0, num_lines, num_shards + 1).astype(int)
  pb = ProgressBar(num_lines, "convert")
  shard = 0
  count = 0
  output_filename = '%s-%.5d-of-%.5d' % (name, shard, num_shards)
  output_file = os.path.join(FLAGS.output_directory, output_filename)
  print('Writing', output_file, count)
  writer = tf.python_io.TFRecordWriter(output_file)
  # BUGFIX: context manager so the streamed input file is closed too.
  with open(feat_file) as fin:
    for line in fin:
      pb.progress()
      if count >= shard_ranges[shard + 1]:
        # Current shard is full -> close it and start the next output file.
        shard += 1
        writer.close()
        output_filename = '%s-%.5d-of-%.5d' % (name, shard, num_shards)
        output_file = os.path.join(FLAGS.output_directory, output_filename)
        print('Writing', output_file, count)
        writer = tf.python_io.TFRecordWriter(output_file)
      _parse_line(line, writer)
      count += 1
      if FLAGS.debug and count >= NUM_DEBUG_LINES:
        break
  writer.close()
def _convert_to(f, name, thread_index, start, end):
  """Worker: write lines f[start:end] into shard `thread_index` of `name`.

  Each worker owns exactly one output shard; after writing, it publishes
  the per-thread results accumulated by _parse_line into the shared dicts.
  """
  total_shards = FLAGS.shards
  out_name = '%s-%.5d-of-%.5d' % (name, thread_index, total_shards)
  out_path = os.path.join(FLAGS.output_directory, out_name)
  print('Writing', out_path, start)
  writer = tf.python_io.TFRecordWriter(out_path)
  progress = ProgressBar(end - start)
  for idx in xrange(start, end):
    progress.progress()
    _parse_line(f[idx], writer, thread_index)
  # Export this thread's collected texts back to the coordinating process.
  texts_dict[thread_index] = gtexts[thread_index]
  text_strs_dict[thread_index] = gtext_strs[thread_index]
import conf
from conf import IMAGE_FEATURE_LEN
import json
import nltk.tokenize
from libprogress_bar import ProgressBar

# Sentence boundary markers used when tokenizing captions.
START_WORD = '<S>'
END_WORD = '</S>'

# Load the captions json (MSCOCO-style layout with "images" and
# "annotations" keys — presumed from the fields accessed below) and
# print basic counts for a sanity check.
with tf.gfile.FastGFile(FLAGS.captions_file, "r") as f:
  caption_data = json.load(f)
pb = ProgressBar(len(caption_data["annotations"]))
id_to_filename = [(x["id"], x["file_name"]) for x in caption_data["images"]]
print(len(id_to_filename))
ids = set()
for x in caption_data["images"]:
  ids.add(x["id"])
print(len(ids))
print(len(caption_data["annotations"]))
caption_data = None
# NOTE(review): caption_data is set to None on the line above and then
# dereferenced immediately below — as written this loop raises TypeError.
# Presumably the None assignment (to free memory) belongs after the loop;
# confirm against the full file before changing.
for annotation in caption_data["annotations"]: