Example #1
def main(_):
    writer = TFRecordWriter(FLAGS.output_path)
    path = os.path.join(os.getcwd(), FLAGS.img_path)
    print(FLAGS.csv_input)
    examples = pd.read_csv(FLAGS.csv_input)
    grouped = split(examples, 'filename')
    for group in grouped:
        tf_example = create_tf_example(group, path)
        writer.write(tf_example.SerializeToString())

    writer.close()
    output_path = os.path.join(os.getcwd(), FLAGS.output_path)
    print('Successfully created the TFRecords: {}'.format(output_path))
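Example #1 depends on module-level context that the snippet does not show: the FLAGS definitions, a split helper that groups annotation rows by filename, and a create_tf_example encoder. A minimal sketch of the first two under those assumptions (flag names, defaults, and the namedtuple layout here are illustrative, not the original code):

from collections import namedtuple

import tensorflow.compat.v1 as tf

flags = tf.app.flags
flags.DEFINE_string('csv_input', '', 'Path to the CSV of annotations')
flags.DEFINE_string('img_path', '', 'Directory containing the images')
flags.DEFINE_string('output_path', '', 'Path to the output TFRecord file')
FLAGS = flags.FLAGS


def split(df, group):
    # Group annotation rows by filename so each image becomes one tf.Example.
    data = namedtuple('data', ['filename', 'object'])
    gb = df.groupby(group)
    return [data(filename, gb.get_group(filename)) for filename in gb.groups]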
Example #2
def write_examples_as_tfrecord(examples,
                               output_filebase,
                               example_encoder,
                               num_shards=1):
    """Serialize examples as a TFRecord dataset.

    Note: Adapted from https://github.com/tensorflow/models/blob/master/research/
        object_detection/g3doc/using_your_own_dataset.md

    Parameters
    ----------
    examples: list(dict-like)
        A list of key/value maps, each map contains relevant info
        for a single data example.
    output_filebase: str
        The base path for all shards.
    example_encoder: func
        A function that encodes an input example as a tf.Example.
    num_shards: int
        The number of shards to divide the examples among. If > 1, multiple
        TFRecord files are created, each with a shard index appended to its name.
    """
    if num_shards == 1:
        writer = TFRecordWriter(output_filebase)
        for example in tqdm(examples):
            tf_example = example_encoder(example)
            writer.write(tf_example.SerializeToString())
        writer.close()
    else:
        with contextlib.ExitStack() as tf_record_close_stack:
            output_tfrecords = open_sharded_output_tfrecords(
                tf_record_close_stack, output_filebase, num_shards)
            for index, example in tqdm(enumerate(examples),
                                       total=len(examples)):
                tf_example = example_encoder(example)
                output_shard_index = index % num_shards
                output_tfrecords[output_shard_index].write(
                    tf_example.SerializeToString())
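A short usage sketch for write_examples_as_tfrecord with a toy encoder; the 'label' feature and the output path are illustrative assumptions, and num_shards is kept at 1 so the sharded branch (which needs open_sharded_output_tfrecords) is not exercised:

import tensorflow as tf

def encode_label(example):
    # Pack one dict-like record into a tf.train.Example with a single float feature.
    return tf.train.Example(features=tf.train.Features(feature={
        'label': tf.train.Feature(
            float_list=tf.train.FloatList(value=[example['label']])),
    }))

toy_examples = [{'label': 0.1}, {'label': 0.7}, {'label': 0.3}]
write_examples_as_tfrecord(toy_examples, 'train.record', encode_label, num_shards=1)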
Example #3
def merge_shards(filename, num_shards_to_merge, out_tmp_dir, batch_size,
                 ensure_batch_multiple):
    tfoptions = TFRecordOptions(TFRecordCompressionType.ZLIB)
    record_writer = TFRecordWriter(filename, tfoptions)

    binaryInputNCHWPackeds = []
    globalInputNCs = []
    policyTargetsNCMoves = []
    globalTargetsNCs = []
    scoreDistrNs = []
    valueTargetsNCHWs = []

    for input_idx in range(num_shards_to_merge):
        shard_filename = os.path.join(out_tmp_dir, str(input_idx) + ".npz")
        with np.load(shard_filename) as npz:
            assert (set(npz.keys()) == set(keys))

            binaryInputNCHWPacked = npz["binaryInputNCHWPacked"]
            globalInputNC = npz["globalInputNC"]
            policyTargetsNCMove = npz["policyTargetsNCMove"].astype(np.float32)
            globalTargetsNC = npz["globalTargetsNC"]
            scoreDistrN = npz["scoreDistrN"].astype(np.float32)
            valueTargetsNCHW = npz["valueTargetsNCHW"].astype(np.float32)

            binaryInputNCHWPackeds.append(binaryInputNCHWPacked)
            globalInputNCs.append(globalInputNC)
            policyTargetsNCMoves.append(policyTargetsNCMove)
            globalTargetsNCs.append(globalTargetsNC)
            scoreDistrNs.append(scoreDistrN)
            valueTargetsNCHWs.append(valueTargetsNCHW)

    ###
    #WARNING - if adding anything here, also add it to joint_shuffle below!
    ###
    binaryInputNCHWPacked = np.concatenate(binaryInputNCHWPackeds)
    globalInputNC = np.concatenate(globalInputNCs)
    policyTargetsNCMove = np.concatenate(policyTargetsNCMoves)
    globalTargetsNC = np.concatenate(globalTargetsNCs)
    scoreDistrN = np.concatenate(scoreDistrNs)
    valueTargetsNCHW = np.concatenate(valueTargetsNCHWs)

    joint_shuffle((binaryInputNCHWPacked, globalInputNC, policyTargetsNCMove,
                   globalTargetsNC, scoreDistrN, valueTargetsNCHW))

    num_rows = binaryInputNCHWPacked.shape[0]
    #Just truncate and lose the batch at the end, it's fine
    num_batches = (
        num_rows //
        (batch_size * ensure_batch_multiple)) * ensure_batch_multiple
    for i in range(num_batches):
        start = i * batch_size
        stop = (i + 1) * batch_size

        example = tfrecordio.make_tf_record_example(
            binaryInputNCHWPacked, globalInputNC, policyTargetsNCMove,
            globalTargetsNC, scoreDistrN, valueTargetsNCHW, start, stop)
        record_writer.write(example.SerializeToString())

    jsonfilename = os.path.splitext(filename)[0] + ".json"
    with open(jsonfilename, "w") as f:
        json.dump({"num_rows": num_rows, "num_batches": num_batches}, f)

    record_writer.close()
    return num_batches * batch_size
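joint_shuffle is not defined in this snippet; judging by the call above, it shuffles several parallel arrays with one shared permutation, in place, so row i stays aligned across all of them. One plausible sketch of such a helper (an assumption, not the original implementation):

import numpy as np

def joint_shuffle(arrays):
    # Apply the same random permutation to every array so rows stay aligned.
    num_rows = arrays[0].shape[0]
    assert all(a.shape[0] == num_rows for a in arrays)
    perm = np.random.permutation(num_rows)
    for a in arrays:
        a[:] = a[perm]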
Example #4
def merge_shards(filename, num_shards_to_merge, out_tmp_dir, batch_size):
  #print("Merging shards for output file: %s (%d shards to merge)" % (filename,num_shards_to_merge))
  tfoptions = TFRecordOptions(TFRecordCompressionType.ZLIB)
  record_writer = TFRecordWriter(filename,tfoptions)

  binaryInputNCHWPackeds = []
  globalInputNCs = []
  policyTargetsNCMoves = []
  globalTargetsNCs = []
  scoreDistrNs = []
  selfBonusScoreNs = []
  valueTargetsNCHWs = []

  for input_idx in range(num_shards_to_merge):
    shard_filename = os.path.join(out_tmp_dir, str(input_idx) + ".npz")
    #print("Merge loading shard: %d (mem usage %dMB)" % (input_idx,memusage_mb()))

    with np.load(shard_filename) as npz:
      assert(set(npz.keys()) == set(keys))

      binaryInputNCHWPacked = npz["binaryInputNCHWPacked"]
      globalInputNC = npz["globalInputNC"]
      policyTargetsNCMove = npz["policyTargetsNCMove"].astype(np.float32)
      globalTargetsNC = npz["globalTargetsNC"]
      scoreDistrN = npz["scoreDistrN"].astype(np.float32)
      selfBonusScoreN = npz["selfBonusScoreN"].astype(np.float32)
      valueTargetsNCHW = npz["valueTargetsNCHW"].astype(np.float32)

      binaryInputNCHWPackeds.append(binaryInputNCHWPacked)
      globalInputNCs.append(globalInputNC)
      policyTargetsNCMoves.append(policyTargetsNCMove)
      globalTargetsNCs.append(globalTargetsNC)
      scoreDistrNs.append(scoreDistrN)
      selfBonusScoreNs.append(selfBonusScoreN)
      valueTargetsNCHWs.append(valueTargetsNCHW)

  ###
  #WARNING - if adding anything here, also add it to joint_shuffle below!
  ###
  #print("Merge concatenating... (mem usage %dMB)" % memusage_mb())
  binaryInputNCHWPacked = np.concatenate(binaryInputNCHWPackeds)
  globalInputNC = np.concatenate(globalInputNCs)
  policyTargetsNCMove = np.concatenate(policyTargetsNCMoves)
  globalTargetsNC = np.concatenate(globalTargetsNCs)
  scoreDistrN = np.concatenate(scoreDistrNs)
  selfBonusScoreN = np.concatenate(selfBonusScoreNs)
  valueTargetsNCHW = np.concatenate(valueTargetsNCHWs)

  #print("Merge shuffling... (mem usage %dMB)" % memusage_mb())
  joint_shuffle((binaryInputNCHWPacked,globalInputNC,policyTargetsNCMove,globalTargetsNC,scoreDistrN,selfBonusScoreN,valueTargetsNCHW))

  #print("Merge writing in batches...")
  num_rows = binaryInputNCHWPacked.shape[0]
  #Just truncate and lose the batch at the end, it's fine
  num_batches = num_rows // batch_size
  for i in range(num_batches):
    start = i*batch_size
    stop = (i+1)*batch_size

    example = tfrecordio.make_tf_record_example(
      binaryInputNCHWPacked,
      globalInputNC,
      policyTargetsNCMove,
      globalTargetsNC,
      scoreDistrN,
      selfBonusScoreN,
      valueTargetsNCHW,
      start,
      stop
    )
    record_writer.write(example.SerializeToString())

  jsonfilename = os.path.splitext(filename)[0] + ".json"
  with open(jsonfilename,"w") as f:
    json.dump({"num_rows":num_rows,"num_batches":num_batches},f)

  #print("Merge done %s (%d rows)" % (filename, num_batches * batch_size))

  record_writer.close()
  return num_batches * batch_size
Example #5
def _bytes_feature(value):
    # Assumed counterpart helper (its definition is not shown in this snippet):
    # wraps raw bytes as a tf.train.Feature.
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def _float_feature(value):
    return tf.train.Feature(float_list=tf.train.FloatList(value=value))


input_roots = '/data/dataTrain/val_*/'
output_name = '/data/dataTrain/val.tfrecords'

writer = TFRecordWriter(output_name)

h5files = glob.glob(os.path.join(input_roots, '*.h5'))

for h5file in tqdm(h5files):
    try:
        data = h5py.File(h5file, 'r')
        for i in range(200):
            img = data['CameraRGB'][i]
            target = data['targets'][i]

            feature_dict = {
                'image': _bytes_feature(img.tobytes()),
                'targets': _float_feature(target)
            }

            example = tf.train.Example(features=tf.train.Features(
                feature=feature_dict))
            writer.write(example.SerializeToString())
        data.close()
    except Exception as e:
        print('Failed to process {}: {}'.format(h5file, e))

writer.close()
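To sanity-check the file written above, the records can be read back with tf.data. The feature spec mirrors the two keys written in the loop; the uint8 dtype passed to decode_raw is an assumption about the CameraRGB data:

import tensorflow as tf

feature_spec = {
    'image': tf.io.FixedLenFeature([], tf.string),
    'targets': tf.io.VarLenFeature(tf.float32),
}

def parse(record):
    parsed = tf.io.parse_single_example(record, feature_spec)
    # decode_raw needs the dtype the image was serialized with (assumed uint8 here).
    image = tf.io.decode_raw(parsed['image'], tf.uint8)
    targets = tf.sparse.to_dense(parsed['targets'])
    return image, targets

dataset = tf.data.TFRecordDataset(output_name).map(parse)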
Example #6
def merge_shards(filename, num_shards_to_merge, out_tmp_dir, batch_size,
                 ensure_batch_multiple, output_npz):
    np.random.seed(
        [int.from_bytes(os.urandom(4), byteorder='little') for i in range(5)])

    if output_npz:
        record_writer = None
    else:
        tfoptions = TFRecordOptions(TFRecordCompressionType.ZLIB)
        record_writer = TFRecordWriter(filename, tfoptions)

    binaryInputNCHWPackeds = []
    globalInputNCs = []
    policyTargetsNCMoves = []
    globalTargetsNCs = []
    scoreDistrNs = []
    valueTargetsNCHWs = []

    for input_idx in range(num_shards_to_merge):
        shard_filename = os.path.join(out_tmp_dir, str(input_idx) + ".npz")
        with np.load(shard_filename) as npz:
            assert (set(npz.keys()) == set(keys))

            binaryInputNCHWPacked = npz["binaryInputNCHWPacked"]
            globalInputNC = npz["globalInputNC"]
            policyTargetsNCMove = npz["policyTargetsNCMove"].astype(np.float32)
            globalTargetsNC = npz["globalTargetsNC"]
            scoreDistrN = npz["scoreDistrN"].astype(np.float32)
            valueTargetsNCHW = npz["valueTargetsNCHW"].astype(np.float32)

            binaryInputNCHWPackeds.append(binaryInputNCHWPacked)
            globalInputNCs.append(globalInputNC)
            policyTargetsNCMoves.append(policyTargetsNCMove)
            globalTargetsNCs.append(globalTargetsNC)
            scoreDistrNs.append(scoreDistrN)
            valueTargetsNCHWs.append(valueTargetsNCHW)

    ###
    #WARNING - if adding anything here, also add it to joint_shuffle below!
    ###
    binaryInputNCHWPacked = np.concatenate(binaryInputNCHWPackeds)
    globalInputNC = np.concatenate(globalInputNCs)
    policyTargetsNCMove = np.concatenate(policyTargetsNCMoves)
    globalTargetsNC = np.concatenate(globalTargetsNCs)
    scoreDistrN = np.concatenate(scoreDistrNs)
    valueTargetsNCHW = np.concatenate(valueTargetsNCHWs)

    num_rows = binaryInputNCHWPacked.shape[0]
    assert (globalInputNC.shape[0] == num_rows)
    assert (policyTargetsNCMove.shape[0] == num_rows)
    assert (globalTargetsNC.shape[0] == num_rows)
    assert (scoreDistrN.shape[0] == num_rows)
    assert (valueTargetsNCHW.shape[0] == num_rows)

    [
        binaryInputNCHWPacked, globalInputNC, policyTargetsNCMove,
        globalTargetsNC, scoreDistrN, valueTargetsNCHW
    ] = (joint_shuffle_take_first_n(num_rows, [
        binaryInputNCHWPacked, globalInputNC, policyTargetsNCMove,
        globalTargetsNC, scoreDistrN, valueTargetsNCHW
    ]))

    assert (binaryInputNCHWPacked.shape[0] == num_rows)
    assert (globalInputNC.shape[0] == num_rows)
    assert (policyTargetsNCMove.shape[0] == num_rows)
    assert (globalTargetsNC.shape[0] == num_rows)
    assert (scoreDistrN.shape[0] == num_rows)
    assert (valueTargetsNCHW.shape[0] == num_rows)

    #Just truncate and lose the batch at the end, it's fine
    num_batches = (
        num_rows //
        (batch_size * ensure_batch_multiple)) * ensure_batch_multiple
    if output_npz:
        start = 0
        stop = num_batches * batch_size
        np.savez_compressed(
            filename,
            binaryInputNCHWPacked=binaryInputNCHWPacked[start:stop],
            globalInputNC=globalInputNC[start:stop],
            policyTargetsNCMove=policyTargetsNCMove[start:stop],
            globalTargetsNC=globalTargetsNC[start:stop],
            scoreDistrN=scoreDistrN[start:stop],
            valueTargetsNCHW=valueTargetsNCHW[start:stop])
    else:
        for i in range(num_batches):
            start = i * batch_size
            stop = (i + 1) * batch_size

            example = tfrecordio.make_tf_record_example(
                binaryInputNCHWPacked, globalInputNC, policyTargetsNCMove,
                globalTargetsNC, scoreDistrN, valueTargetsNCHW, start, stop)
            record_writer.write(example.SerializeToString())

    jsonfilename = os.path.splitext(filename)[0] + ".json"
    with open(jsonfilename, "w") as f:
        json.dump({"num_rows": num_rows, "num_batches": num_batches}, f)

    if record_writer is not None:
        record_writer.close()
    return num_batches * batch_size
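All of these merge_shards variants write ZLIB-compressed records, so any reader has to pass the matching compression type. A minimal read-back sketch (the file name is illustrative, and the record fields themselves come from make_tf_record_example, which is not shown here):

import tensorflow as tf

# compression_type must match the TFRecordOptions used by the writer above.
dataset = tf.data.TFRecordDataset("merged0.tfrecord", compression_type="ZLIB")
for raw_record in dataset.take(1):
    example = tf.train.Example()
    example.ParseFromString(raw_record.numpy())
    print(example.features.feature.keys())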