def run(args):
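    """Convert HDF5 record batches under args.hdf5_path into per-file
    TFRecord files in args.tfrecords_path."""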
    filename_generator = file_helpers.input_filename_generator_hdf5(args.hdf5_path)
    filenames = list(filename_generator)

    if not os.path.isdir(args.tfrecords_path):
        os.makedirs(args.tfrecords_path)
    for filename in filenames:
        hdf5_record_batch = data_record.read_hdf5_records_v2(filename)
        output_filename = os.path.join(args.tfrecords_path, os.path.basename(filename[:-len('.hdf5')]) + '.tfrecords')
        print(output_filename)
        tfrecords_writer = tf.python_io.TFRecordWriter(output_filename)
        for example in data_record.generate_tfrecords_from_batch(hdf5_record_batch):
            tfrecords_writer.write(example.SerializeToString())
        tfrecords_writer.close()
def run():
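    """Iterate over the HDF5 files of the hard-coded dataset directory and
    read their records."""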
    filename_prefix = "datasets/16x16x16_1-2-3-4-5"
    filename_template = os.path.join(filename_prefix,
                                     file_helpers.DEFAULT_HDF5_TEMPLATE)
    filename_generator = file_helpers.input_filename_generator_hdf5(
        filename_template)
    num_files = 0
    for filename in filename_generator:
        # filename_tf = tf.constant(filename, shape=[1])
        # filename_queue = tf.train.string_input_producer(filename_tf)
        # record = data_record.read_and_decode_tf_example(filename_queue)
        records = data_record.read_hdf5_records_as_list(filename)
        print(filename)
        # for record in records:
        #     print("  ", record.action, record.reward)
        num_files += 1
def run(args):
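    """Interactively visualize depth images and input/output occupancy
    grids from shuffled HDF5 records under args.data_path."""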
    filename_generator = file_helpers.input_filename_generator_hdf5(args.data_path)
    filenames = list(filename_generator)
    np.random.shuffle(filenames)
    plt.ion()
    for filename in filenames:
        print("Filename {}".format(filename))
        records = data_record.read_hdf5_records_v4_as_list(filename)
        indices = np.arange(len(records))
        np.random.shuffle(indices)
        for index in indices:
            print("  record # {}".format(index))
            record = records[index]

            depth_image = record.depth_image
            print(np.mean(depth_image))
            if depth_image.shape[0] == 1:
                fig = plt.figure(1)
                plt.clf()
                # plt.plot(np.arange(depth_image.shape[1]), depth_image[0, ...])
                plt.step(np.arange(depth_image.shape[1]), depth_image[0, ...])
                plt.title("Depth image")
                fig.canvas.draw()
                plt.show(block=False)
            else:
                cv2.imshow("depth_image", depth_image / np.max(depth_image))
                cv2.waitKey(50)

            in_grid_3d = record.in_grid_3d[..., 2:4]
            fig = 1
            fig = plot_grid(in_grid_3d[..., 0], in_grid_3d[..., 1], title_prefix="In ", show=False, fig_offset=fig)
            out_grid_3d = record.out_grid_3d[..., 2:4]
            fig = plot_grid(out_grid_3d[..., 0], out_grid_3d[..., 1], title_prefix="Out ", show=False, fig_offset=fig)

            diff_grid_3d = out_grid_3d - in_grid_3d
            plot_diff_grid(diff_grid_3d[..., 0], diff_grid_3d[..., 1], title_prefix="Diff ", show=False, fig_offset=fig)
            print("squared average diff occupancy: {}".format(np.mean(np.square(diff_grid_3d[..., 0]))))
            print("squared average diff observation count: {}".format(np.mean(np.square(diff_grid_3d[..., 1]))))

            mlab.show(stop=True)
def run(args):
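    """Rewrite each input HDF5 file with the requested compression
    settings, optionally verify the rewritten data, then replace the
    original file."""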
    if args.input_path is not None:
        input_path = args.input_path
        input_files = list(
            file_helpers.input_filename_generator_hdf5(
                input_path, file_helpers.DEFAULT_HDF5_PATTERN))
    else:
        input_list_file = args.input_list_file
        with open(input_list_file, "r") as fin:
            input_files = [l.strip() for l in fin.readlines()]

    dataset_kwargs = {}
    if args.compression:
        dataset_kwargs.update({"compression": args.compression})
        if args.compression_level >= 0:
            dataset_kwargs.update({"compression_opts": args.compression_level})

    print("Counting {} input files".format(len(input_files)))

    for i, input_file in enumerate(input_files):
        print("Reading input file #{} out of {}".format(i, len(input_files)))
        field_dict = None
        data, attr_dict = hdf5_utils.read_hdf5_file_to_numpy_dict(
            input_file, field_dict, read_attributes=True)
        print("Writing {} samples with new compression settings".format(
            data[list(data.keys())[0]].shape[0]))
        hdf5_utils.write_numpy_dict_to_hdf5_file(input_file + "_recompressed",
                                                 data, attr_dict,
                                                 **dataset_kwargs)
        if args.check_written_samples:
            print("Reading samples from file {}".format(input_file +
                                                        "_recompressed"))
            written_data, written_attr_dict = hdf5_utils.read_hdf5_file_to_numpy_dict(
                input_file + "_recompressed", field_dict, read_attributes=True)
            for key in data:
                assert (np.all(data[key] == written_data[key]))
            for key in attr_dict:
                assert (np.all(attr_dict[key] == written_attr_dict[key]))
        os.remove(input_file)
        os.rename(input_file + "_recompressed", input_file)
def run(args):
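    """Count the total number of samples (rows of the "scores" dataset)
    across all input HDF5 files."""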
    if args.input_path is not None:
        input_path = args.input_path
        input_files = list(
            file_helpers.input_filename_generator_hdf5(
                input_path, file_helpers.DEFAULT_HDF5_PATTERN))
    else:
        input_list_file = args.input_list_file
        with open(input_list_file, "r") as fin:
            input_files = [l.strip() for l in fin.readlines()]

    print("Counting {} input files".format(len(input_files)))

    # Count total number of samples
    total_num_samples = 0
    for i, input_file in enumerate(input_files):
        print("Reading input file #{} out of {} ({})".format(
            i, len(input_files), input_file))
        field_dict = {"scores": False}
        data, attr = hdf5_utils.read_hdf5_file_to_numpy_dict(
            input_file, field_dict, read_attributes=True)
        total_num_samples += data["scores"].shape[0]

    print("Total number of samples: {}".format(total_num_samples))
def run(args):
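    """Merge samples from the input HDF5 files and split them between two
    output directories according to split_ratio1."""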
    if args.input_path is not None:
        input_path = args.input_path
        input_files = list(
            file_helpers.input_filename_generator_hdf5(
                input_path, file_helpers.DEFAULT_HDF5_PATTERN))
    else:
        input_list_file = args.input_list_file
        with open(input_list_file, "r") as fin:
            input_files = [l.strip() for l in fin.readlines()]

    print("Merging {} input files".format(len(input_files)))

    output_path1 = args.output_path1
    if not os.path.isdir(output_path1):
        os.makedirs(output_path1)
    filename_template1 = os.path.join(output_path1,
                                      file_helpers.DEFAULT_HDF5_TEMPLATE)

    output_path2 = args.output_path2
    if not os.path.isdir(output_path2):
        os.makedirs(output_path2)
    filename_template2 = os.path.join(output_path2,
                                      file_helpers.DEFAULT_HDF5_TEMPLATE)

    samples_per_file = args.samples_per_file
    split_ratio1 = args.split_ratio1
    assert (split_ratio1 > 0)
    assert (split_ratio1 < 1)
    dry_run = args.dry_run

    dataset_kwargs = {}
    if args.compression:
        dataset_kwargs.update({"compression": args.compression})
        if args.compression_level >= 0:
            dataset_kwargs.update({"compression_opts": args.compression_level})

    # Count total number of samples
    num_samples = 0
    for i, input_file in enumerate(input_files):
        print("Reading input file #{} out of {}".format(i, len(input_files)))
        samples = data_record.read_hdf5_records_v4_as_list(input_file)
        num_samples += len(samples)

    num_samples1 = round(split_ratio1 * num_samples)
    num_samples2 = num_samples - num_samples1
    if num_samples1 <= 0 or num_samples2 <= 0:
        import sys
        sys.stderr.write("Data split will result in empty data set\n")
        sys.exit(1)

    def write_samples(samples, next_file_num, filename_template):
        # filename, next_file_num = get_next_output_tf_filename(next_file_num)
        filename, next_file_num = file_helpers.get_next_output_hdf5_filename(
            next_file_num, template=filename_template)
        print("Writing {} samples to file {}".format(len(samples), filename))
        if not dry_run:
            data_record.write_samples_to_hdf5_file(filename, samples,
                                                   attr_dict, **dataset_kwargs)
            if args.check_written_samples:
                print("Reading samples from file {}".format(filename))
                written_samples, written_attr_dict = data_record.read_samples_from_hdf5_file(
                    filename, read_attributes=True)
                assert (len(samples) == len(written_samples))
                for i in range(len(samples)):
                    for key in samples[i]:
                        assert (np.all(
                            samples[i][key] == written_samples[i][key]))
                for key in attr_dict:
                    assert (np.all(attr_dict[key] == written_attr_dict[key]))
        return next_file_num

    # Start with output 1
    filename_template = filename_template1

    finished_output1 = False
    written_samples = 0
    next_file_num = 0
    samples = []
    for i, input_file in enumerate(input_files):
        print("Reading input file #{} out of {}".format(i, len(input_files)))
        field_dict = None
        file_samples, attr_dict = data_record.read_samples_from_hdf5_file(
            input_file, field_dict)
        for sample in file_samples:
            samples.append(sample)

            do_write_samples = len(samples) % samples_per_file == 0
            if not finished_output1 and written_samples + len(
                    samples) >= num_samples1:
                do_write_samples = True

            if do_write_samples:
                next_file_num = write_samples(samples, next_file_num,
                                              filename_template)
                written_samples += len(samples)
                samples = []
            if not finished_output1 and written_samples >= num_samples1:
                finished_output1 = True
                filename_template = filename_template2
                next_file_num = 0

    if len(samples) > 0:
        write_samples(samples, next_file_num, filename_template)
        samples = []
    assert (len(samples) == 0)
def write_hdf5_files_to_lmdb(input_and_target_retriever,
                             tf_cfg,
                             hdf5_path,
                             lmdb_path,
                             max_num_files,
                             batch_size,
                             serialization_name,
                             compression,
                             compression_arg,
                             append=False,
                             max_num_samples=None,
                             verbose=False):
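    """Stream samples from the HDF5 files under hdf5_path through a reader
    pipeline and dump them into an LMDB database at lmdb_path, optionally
    appending to an existing database."""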
    logger.info("Data path: {}".format(hdf5_path))
    filename_generator = file_helpers.input_filename_generator_hdf5(hdf5_path)
    filenames = list(filename_generator)
    # Write filenames to lmdb storage path (for later reference)
    if append:
        file_mode = "a"
    else:
        file_mode = "w"
    with open(lmdb_path + "_filenames", file_mode) as fout:
        for filename in filenames:
            fout.write("{}\n".format(os.path.basename(filename)))

    if len(filenames) == 0:
        raise RuntimeError("No dataset file")
    else:
        logger.info("Found {} dataset files".format(len(filenames)))

    # Limit dataset size?
    if max_num_files > 0:
        filenames = filenames[:max_num_files]
        logger.info("Using {} dataset files".format(len(filenames)))
        # Force recomputation of num of samples if not using all data files
    else:
        logger.info("Using all dataset files")

    if verbose:
        # Retrieve input and target shapes
        tmp_data = input_and_target_retriever.read_data_from_file(filenames[0])
        input_shape = input_and_target_retriever.get_input_from_data(
            tmp_data).shape[1:]
        target_shape = input_and_target_retriever.get_target_from_data(
            tmp_data).shape[1:]
        del tmp_data
        logger.info("Input and target shapes:")
        logger.info("  Shape of input: {}".format(input_shape))
        logger.info("  Shape of target: {}".format(target_shape))

    if verbose:
        input_pipeline.print_data_stats(filenames[0],
                                        input_and_target_retriever)

    coord = tf.train.Coordinator()

    # Create HDF5 readers
    hdf5_pipeline = hdf5_utils.HDF5ReaderProcessCoordinator(
        filenames,
        coord,
        read_data_fn=input_and_target_retriever.read_data_from_file,
        shuffle=False,
        repeats=1,
        num_processes=tf_cfg.cpu_queue_processes,
        data_queue_capacity=tf_cfg.cpu_data_queue_capacity,
        verbose=verbose)

    try:
        custom_threads = []

        hdf5_pipeline.start()
        custom_threads.extend(hdf5_pipeline.threads)

        class HDF5DataFlow(tensorpack.dataflow.DataFlow):
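            """DataFlow yielding single samples from the HDF5 reader
            pipeline, stopping after max_num_samples if given."""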
            def __init__(self, hdf5_pipeline, max_num_samples=None):
                self._hdf5_pipeline = hdf5_pipeline
                self._num_samples = 0
                self._max_num_samples = max_num_samples

            def get_data(self):
                input_field_name = input_and_target_retriever.input_field_name
                target_field_name = input_and_target_retriever.target_field_name
                while True:
                    if verbose:
                        logger.debug(
                            "Fetching next data block from hdf5 pipeline")
                    full_data = self._hdf5_pipeline.get_next_data()
                    if full_data == hdf5_utils.QUEUE_END:
                        return
                    data = {
                        input_field_name: full_data[input_field_name],
                        target_field_name: full_data[target_field_name],
                    }
                    # TODO: Remove this. Kind of a hack.
                    data = {
                        key: np.asarray(array, dtype=np.float32)
                        for key, array in data.items()
                    }
                    #self._num_samples += data[input_field_name].shape[0]
                    if verbose:
                        logger.debug("Iterating through data block")
                    for i in range(data[input_field_name].shape[0]):
                        sample = [{
                            key: batch[i, ...]
                            for key, batch in data.items()
                        }]
                        yield sample
                        self._num_samples += 1
                        if self._max_num_samples is not None and self._max_num_samples > 0:
                            if self._num_samples >= self._max_num_samples:
                                return

            @property
            def num_samples(self):
                return self._num_samples

        if append:
            logger.info(
                "Appending HDF5 data from {} to lmdb database {}".format(
                    hdf5_path, lmdb_path))
            lmdb_df = tensorpack_utils.AutoLMDBData(lmdb_path, shuffle=False)
            initial_num_samples = lmdb_df.size()
            logger.info("initial_num_samples: {}".format(initial_num_samples))
        else:
            logger.info("Writing HDF5 data from {} to lmdb database {}".format(
                hdf5_path, lmdb_path))
            initial_num_samples = 0

        if max_num_samples is not None and max_num_samples > 0:
            logger.info(
                "Limiting num of samples to {}".format(max_num_samples))
            max_num_samples = max_num_samples - initial_num_samples
            if max_num_samples < 0:
                logger.info("WARNING: Database already has enough samples")
                return
            if append:
                logger.info(
                    "Appending at most {} samples".format(max_num_samples))

        hdf5_df = HDF5DataFlow(hdf5_pipeline, max_num_samples)
        tensorpack_utils.dump_compressed_dataflow_to_lmdb(
            hdf5_df,
            lmdb_path,
            batch_size,
            write_frequency=10,
            serialization_name=serialization_name,
            compression=compression,
            compression_arg=compression_arg,
            append=append)

        if batch_size > 0:
            num_dropped_samples = hdf5_df.num_samples % batch_size
        else:
            num_dropped_samples = 0

        if batch_size > 0:
            lmdb_df = tensorpack_utils.LMDBDataWithMetaData(lmdb_path,
                                                            shuffle=False)
            logger.info("Database has {} batches".format(lmdb_df.size()))
            logger.info("hdf5_df.num_samples: {}".format(hdf5_df.num_samples))
            logger.info("num_dropped_samples: {}".format(num_dropped_samples))
            logger.info("batch_size: {}".format(batch_size))
            logger.info("lmdb_df.size(): {}".format(lmdb_df.size()))
            if initial_num_samples + hdf5_df.num_samples - num_dropped_samples != batch_size * lmdb_df.size(
            ):
                logger.info(
                    "initial_num_samples: {}".format(initial_num_samples))
                logger.info("hdf5_df.num_samples: {}".format(
                    hdf5_df.num_samples))
                logger.info(
                    "num_dropped_samples: {}".format(num_dropped_samples))
                logger.info("batch_size: {}".format(batch_size))
                logger.info("lmdb_df.size(): {}".format(lmdb_df.size()))
            assert (initial_num_samples + hdf5_df.num_samples -
                    num_dropped_samples == batch_size * lmdb_df.size())

        lmdb_df = tensorpack_utils.AutoLMDBData(lmdb_path, shuffle=False)
        logger.info("Database has {} samples".format(lmdb_df.size()))
        logger.info("hdf5_df.num_samples: {}".format(hdf5_df.num_samples))
        logger.info("num_dropped_samples: {}".format(num_dropped_samples))
        logger.info("batch_size: {}".format(batch_size))
        logger.info("lmdb_df.size(): {}".format(lmdb_df.size()))
        assert (initial_num_samples + hdf5_df.num_samples -
                num_dropped_samples == lmdb_df.size())

        # Check that we can read data without errors
        lmdb_df.reset_state()
        it = lmdb_df.get_data()
        q = next(it)
        for key in q[0]:
            logger.info(q[0][key].shape)
            logger.info(q[0][key].dtype)
            assert (q[0][key].dtype == np.float32)

        if num_dropped_samples > 0:
            logger.warn("Dropped {} samples from input dataset".format(
                num_dropped_samples))

        hdf5_pipeline.stop()

    except Exception as exc:
        logger.info(
            "Exception while converting hdf5 data to LMDB database: {}".format(
                exc))
        traceback.print_exc()
        coord.request_stop(exc)
        raise exc
    finally:
        logger.info("Requesting stop")
        coord.request_stop()
        coord.join(custom_threads, stop_grace_period_secs=10)
def run(args):
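    """Merge samples from the input HDF5 files into output HDF5 files
    holding samples_per_file samples each."""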
    if args.input_path is not None:
        input_path = args.input_path
        input_files = list(
            file_helpers.input_filename_generator_hdf5(
                input_path, file_helpers.DEFAULT_HDF5_PATTERN))
    else:
        input_list_file = args.input_list_file
        with open(input_list_file, "r") as fin:
            input_files = [l.strip() for l in fin.readlines()]

    print("Merging {} input files".format(len(input_files)))

    output_path = args.output_path
    if not os.path.isdir(output_path):
        os.makedirs(output_path)
    filename_template = os.path.join(output_path,
                                     file_helpers.DEFAULT_HDF5_TEMPLATE)

    samples_per_file = args.samples_per_file
    check_written_samples = args.check_written_samples
    dry_run = args.dry_run

    dataset_kwargs = {}
    if args.compression:
        dataset_kwargs.update({"compression": args.compression})
        if args.compression_level >= 0:
            dataset_kwargs.update({"compression_opts": args.compression_level})

    def write_samples(samples, next_file_num):
        # filename, next_file_num = get_next_output_tf_filename(next_file_num)
        filename, next_file_num = file_helpers.get_next_output_hdf5_filename(
            next_file_num, template=filename_template)
        print("Writing {} samples to file {}".format(len(samples), filename))
        if not dry_run:
            data_record.write_samples_to_hdf5_file(filename, samples,
                                                   attr_dict, **dataset_kwargs)
            if args.check_written_samples:
                print("Reading samples from file {}".format(filename))
                written_samples, written_attr_dict = data_record.read_samples_from_hdf5_file(
                    filename, read_attributes=True)
                assert (len(samples) == len(written_samples))
                for i in range(len(samples)):
                    for key in samples[i]:
                        assert (np.all(
                            samples[i][key] == written_samples[i][key]))
                for key in attr_dict:
                    assert (np.all(attr_dict[key] == written_attr_dict[key]))
        return next_file_num

    total_num_samples = 0
    next_file_num = 0
    samples = []
    for i, input_file in enumerate(input_files):
        print("Reading input file #{} out of {}".format(i, len(input_files)))
        field_dict = None
        file_samples, attr_dict = data_record.read_samples_from_hdf5_file(
            input_file, field_dict)
        for sample in file_samples:
            samples.append(sample)

            if len(samples) % samples_per_file == 0:
                next_file_num = write_samples(samples, next_file_num)
                total_num_samples += len(samples)
                samples = []

    if len(samples) > 0:
        write_samples(samples, next_file_num)
        total_num_samples += len(samples)
        samples = []
    assert (len(samples) == 0)

    print("Total number of written samples: {}".format(total_num_samples))