Code Example #1
from __future__ import print_function

import os
import sys

# text_map, text_map_, FLAGS, gezi and ProgressBar are module-level
# names in the original project.
def parse_text_file(text_file):
    if not os.path.exists(text_file):
        print('No text file', text_file, file=sys.stderr)
        return
    num_lines = gezi.get_num_lines(text_file)
    pb = ProgressBar(num_lines, 'parse text file %s' % text_file)
    for line in open(text_file):
        pb.progress()
        l = line.split('\t')
        # image ids look like "name#k"; keep only the part before '#'
        image = l[0]
        image = image[:image.index('#')]
        text = l[-1].strip()
        # Why both text and ori_text? For the cn corpus, text is
        # \x01-separated (segmented) text; for the en corpus, text and
        # ori_text are the same.
        ori_text = l[FLAGS.ori_text_index].strip()
        if text == '':
            continue
        if image not in text_map:
            text_map_[image] = set([text])
            text_map[image] = [(text, ori_text)]
        # only keep a caption the first time it is seen for this image
        elif text not in text_map_[image]:
            text_map_[image].add(text)
            text_map[image].append((text, ori_text))
    for image in text_map:
        text_map[image] = list(text_map[image])
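The function keeps two parallel structures per image: a set (text_map_) for O(1) duplicate checks and a list (text_map) that preserves the order captions were seen. A minimal standalone sketch of that pattern, with made-up input rows and local stand-ins for the module globals:

text_map, text_map_ = {}, {}

# hypothetical tab-separated rows: "<image>#<k>\t...\t<caption>"
rows = [
    'img001#0\tfoo\ta dog runs',
    'img001#1\tbar\ta dog runs',
    'img001#2\tbaz\ta cat sleeps',
]

for row in rows:
    cols = row.split('\t')
    image = cols[0][:cols[0].index('#')]
    text = cols[-1].strip()
    if image not in text_map:
        text_map_[image] = set([text])
        text_map[image] = [text]
    elif text not in text_map_[image]:
        text_map_[image].add(text)
        text_map[image].append(text)

print(text_map)  # {'img001': ['a dog runs', 'a cat sleeps']}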
Code Example #2
File: gen-records.py  Project: tangqiqi123/hasky
def convert_to(feat_file, name):
  num_shards = FLAGS.shards
  num_threads = FLAGS.threads
  if FLAGS.threads > 1:
    # multi-process path: one worker per shard
    assert num_threads == num_shards
    lines = open(feat_file).readlines()
    num_lines = len(lines)
    if FLAGS.debug:
      num_lines = NUM_DEBUG_LINES
    # num_shards + 1 boundaries -> num_shards [start, end) ranges
    shard_ranges = np.linspace(0,
                               num_lines,
                               num_shards + 1).astype(int)

    processes = []
    for i in xrange(num_threads):
      args = (lines, name, i, shard_ranges[i], shard_ranges[i + 1])
      process = multiprocessing.Process(target=_convert_to, args=args)
      process.start()
      processes.append(process)

    for process in processes:
      process.join()
    return

  # -------------- single-thread path
  num_lines = gezi.get_num_lines(feat_file)
  if FLAGS.debug:
    num_lines = NUM_DEBUG_LINES
  shard_ranges = np.linspace(0,
                             num_lines,
                             num_shards + 1).astype(int)
  pb = ProgressBar(num_lines, "convert")
  shard = 0
  count = 0

  output_filename = '%s-%.5d-of-%.5d' % (name, shard, num_shards)
  output_file = os.path.join(FLAGS.output_directory, output_filename)
  print('Writing', output_file, count)
  writer = tf.python_io.TFRecordWriter(output_file)

  for line in open(feat_file):
    pb.progress()
    # crossed a shard boundary: close the current file and start the next shard
    if count >= shard_ranges[shard + 1]:
      shard += 1
      writer.close()
      output_filename = '%s-%.5d-of-%.5d' % (name, shard, num_shards)
      output_file = os.path.join(FLAGS.output_directory, output_filename)
      print('Writing', output_file, count)
      writer = tf.python_io.TFRecordWriter(output_file)
    _parse_line(line, writer)

    count += 1
    if FLAGS.debug and count >= NUM_DEBUG_LINES:
      break
   
  writer.close()
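The shard boundaries come from np.linspace, which produces num_shards + 1 evenly spaced cut points; adjacent pairs become the per-shard [start, end) ranges. A quick standalone illustration (the line and shard counts here are made up):

import numpy as np

num_lines, num_shards = 10, 4
shard_ranges = np.linspace(0, num_lines, num_shards + 1).astype(int)
print(shard_ranges)  # [ 0  2  5  7 10]

# adjacent boundaries form the per-shard [start, end) ranges
for i in range(num_shards):
    print(i, shard_ranges[i], shard_ranges[i + 1])
# 0 0 2
# 1 2 5
# 2 5 7
# 3 7 10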
Code Example #3
File: gen-records.py  Project: tangqiqi123/hasky
def _convert_to(lines, name, thread_index, start, end):
  num_shards = FLAGS.shards
  # each worker owns exactly one shard
  shard = thread_index
  output_filename = '%s-%.5d-of-%.5d' % (name, shard, num_shards)
  output_file = os.path.join(FLAGS.output_directory, output_filename)
  print('Writing', output_file, start)
  writer = tf.python_io.TFRecordWriter(output_file)
  num_lines = end - start
  pb = ProgressBar(num_lines)
  for i in xrange(start, end):
    pb.progress()
    line = lines[i]
    _parse_line(line, writer, thread_index)

  # hand this worker's accumulated results back through shared dicts
  texts_dict[thread_index] = gtexts[thread_index]
  text_strs_dict[thread_index] = gtext_strs[thread_index]
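Note that texts_dict and text_strs_dict are written from child processes; with multiprocessing (unlike threads) a plain dict in the parent would never see those writes, so shared structures are needed. A minimal sketch of the usual multiprocessing.Manager pattern, with stand-in names, not the project's confirmed setup:

import multiprocessing

def worker(shared, idx):
    # writes through a Manager proxy are visible to the parent process
    shared[idx] = 'result-%d' % idx

if __name__ == '__main__':
    manager = multiprocessing.Manager()
    shared = manager.dict()
    procs = [multiprocessing.Process(target=worker, args=(shared, i))
             for i in range(4)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    print(dict(shared))  # {0: 'result-0', 1: 'result-1', ...}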
Code Example #4
import conf
from conf import IMAGE_FEATURE_LEN

import json
import nltk.tokenize

import tensorflow as tf

from libprogress_bar import ProgressBar

# FLAGS is defined elsewhere in the original script, presumably via tf.app.flags

START_WORD = '<S>'
END_WORD = '</S>'

with tf.gfile.FastGFile(FLAGS.captions_file, "r") as f:
    caption_data = json.load(f)

pb = ProgressBar(len(caption_data["annotations"]))

id_to_filename = [(x["id"], x["file_name"]) for x in caption_data["images"]]

print(len(id_to_filename))

# sanity check: image ids should be unique
ids = set()
for x in caption_data["images"]:
    ids.add(x["id"])
print(len(ids))

print(len(caption_data["annotations"]))


for annotation in caption_data["annotations"]:
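    # NOTE: the original snippet is truncated at this loop. The body below is a
    # plausible sketch only, assuming the standard MSCOCO annotation schema
    # ("image_id", "caption") and the START_WORD/END_WORD markers defined above;
    # it is not the project's confirmed code.
    pb.progress()
    caption = annotation["caption"]
    # bracket the tokenized caption with the sentence markers
    tokens = [START_WORD] + nltk.tokenize.word_tokenize(caption.lower()) + [END_WORD]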