Esempio n. 1
0
def convert_to_recordio_files(data_frame, dir_name, records_per_shard):
    """
    Convert a pandas DataFrame to recordio files.

    Rows are visited in ``data_frame.iterrows()`` order. Every
    ``records_per_shard`` rows a new file named ``data-<shard index, 5
    digits>`` is started inside ``dir_name``. Each row is turned into a
    serialized ``tf.train.Example`` via ``convert_series_to_tf_feature``.

    Args:
        data_frame: A pandas DataFrame to convert_to_recordio_files.
        dir_name: A directory to put the generated recordio files. It is
            created (including parents) if it does not exist.
        records_per_shard: The record number per shard.
    """
    pathlib.Path(dir_name).mkdir(parents=True, exist_ok=True)

    writer = None
    try:
        # enumerate() supplies the running row count; the DataFrame index
        # label itself is not needed.
        for row_num, (_, row) in enumerate(data_frame.iterrows()):
            if row_num % records_per_shard == 0:
                if writer is not None:
                    writer.close()
                    # Forget the closed writer so the `finally` below never
                    # closes it twice if opening the next shard fails.
                    writer = None

                shard = row_num // records_per_shard
                file_path_name = os.path.join(dir_name, "data-%05d" % shard)
                writer = recordio.Writer(file_path_name)

            feature = convert_series_to_tf_feature(
                row, data_frame.columns, data_frame.dtypes
            )
            result_string = tf.train.Example(
                features=tf.train.Features(feature=feature)
            ).SerializeToString()
            writer.write(result_string)
    finally:
        # Close the current shard even when conversion or writing raises,
        # so a failure does not leak the open writer.
        if writer is not None:
            writer.close()

    print("Finish data conversion in {}".format(dir_name))
Esempio n. 2
0
def convert(x, y, args, subdir):
    """Convert pairs of image and label in NumPy arrays into a set of
    RecordIO files.

    Rows ``0 .. x.shape[0] * args.fraction`` (exclusive) are written, each
    as a ``tf.train.Example`` with a float ``"image"`` feature
    (``x[row]`` flattened) and an int64 ``"label"`` feature (``y[row]``
    flattened). A new shard ``data-<5-digit index>`` is started every
    ``args.records_per_shard`` rows under
    ``<args.dir>/<args.dataset>/<subdir>``.

    Args:
        x: Array of images, indexed by row along the first axis.
        y: Array of labels, indexed the same way as ``x``.
        args: Object providing ``dir``, ``dataset``, ``fraction`` and
            ``records_per_shard`` attributes.
        subdir: Name of the sub-directory that receives the shards.
    """
    logger = logging.getLogger("image_label::convert")
    logger.setLevel("INFO")

    # Both values are loop-invariant, so compute them once.
    out_dir = os.path.join(args.dir, args.dataset, subdir)
    row_limit = x.shape[0] * args.fraction

    row = 0
    shard = 0
    w = None
    try:
        while row < row_limit:
            if row % args.records_per_shard == 0:
                if w is not None:
                    w.close()
                    # Avoid a second close() in `finally` if opening the
                    # next shard fails.
                    w = None
                # exist_ok avoids the check-then-create race of the
                # original `os.path.exists` test.
                os.makedirs(out_dir, exist_ok=True)
                fn = os.path.join(out_dir, "data-%05d" % shard)
                logger.info("Writing %s ...", fn)
                w = recordio.Writer(fn)
                shard += 1

            image_feature = tf.train.Feature(
                float_list=tf.train.FloatList(value=x[row].flatten()))
            label_feature = tf.train.Feature(
                int64_list=tf.train.Int64List(value=y[row].flatten()))
            example = tf.train.Example(features=tf.train.Features(
                feature={"image": image_feature, "label": label_feature}))
            w.write(example.SerializeToString())
            row += 1
    finally:
        # `w` stays None when no row was written (empty input or a zero
        # fraction); the original unconditional close() crashed there.
        if w is not None:
            w.close()

    logger.info("Wrote {} of total {} records into {} files".format(
        row, x.shape[0], shard))
def convert_to_recordio_files(file_path, dir_name, records_per_shard):
    """
    Convert a text data file to recordio files, one record per line.

    Each line of ``file_path`` is passed to ``convert_series_to_tf_feature``
    together with ``DAC_COLUMNS`` and ``DAC_DTYPES``; the result is wrapped
    in a ``tf.train.Example`` and written in serialized form. A new shard
    file named ``data-<shard index, 5 digits>`` is started every
    ``records_per_shard`` lines.

    Args:
        file_path: A path of the data file
        dir_name: A directory to put the generated recordio files. It is
            created (including parents) if it does not exist.
        records_per_shard: The record number per shard.
    """
    pathlib.Path(dir_name).mkdir(parents=True, exist_ok=True)

    writer = None
    try:
        with open(file_path, "r") as f:
            for index, row in enumerate(f):
                if index % records_per_shard == 0:
                    if writer is not None:
                        writer.close()
                        # Prevent a double close in `finally` if opening
                        # the next shard fails.
                        writer = None

                    shard = index // records_per_shard
                    file_path_name = os.path.join(
                        dir_name, "data-%05d" % shard
                    )
                    writer = recordio.Writer(file_path_name)

                feature = convert_series_to_tf_feature(
                    row, DAC_COLUMNS, DAC_DTYPES
                )
                result_string = tf.train.Example(
                    features=tf.train.Features(feature=feature)
                ).SerializeToString()
                writer.write(result_string)
    finally:
        # Close the last shard even if reading, converting or writing
        # raised, so the writer is never leaked.
        if writer is not None:
            writer.close()

    print("Finish data conversion in {}".format(dir_name))
Esempio n. 4
0
def convert_to_recordio_files(file_path, dir_name, records_per_shard=10240):
    """
    Convert a CSV file to recordio files.

    Every line of the file is converted with
    ``convert_data_to_tf_example(row, COLUMNS)`` and the result is written
    as one record. A new shard file named ``data-<shard index, 5 digits>``
    is started every ``records_per_shard`` lines.

    Args:
        file_path: A path of the CSV file
        dir_name: A directory to put the generated recordio files. It is
            created (including parents) if it does not exist.
        records_per_shard: The record number per shard.
    """
    pathlib.Path(dir_name).mkdir(parents=True, exist_ok=True)

    writer = None
    try:
        with open(file_path, "r") as f:
            for index, row in enumerate(f):
                if index % records_per_shard == 0:
                    if writer is not None:
                        writer.close()
                        # Prevent a double close in `finally` if opening
                        # the next shard fails.
                        writer = None

                    shard = index // records_per_shard
                    file_path_name = os.path.join(
                        dir_name, "data-%05d" % shard
                    )
                    writer = recordio.Writer(file_path_name)
                example = convert_data_to_tf_example(row, COLUMNS)
                writer.write(example)
    finally:
        # Always release the open shard, including on conversion or
        # write errors.
        if writer is not None:
            writer.close()
Esempio n. 5
0
def create_imagenet_recordio_file(size, shape):
    """Create a temporary RecordIO file of random JPEG images and labels.

    Each of the ``size`` records is a serialized ``tf.train.Example`` with
    an ``"image"`` bytes feature (a JPEG-encoded random uint8 array of the
    given ``shape``) and an ``"label"`` int64 feature drawn from 1..10.

    Args:
        size: The number of records to write.
        shape: The shape of each image array handed to
            ``tf.image.encode_jpeg``.

    Returns:
        The name of the temporary file. The file is created with
        ``delete=False``, so the caller is responsible for removing it.
    """
    # Only the path is needed; close the handle immediately instead of
    # leaving it open next to the RecordIO writer.
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        temp_path = temp_file.name

    with closing(recordio.Writer(temp_path)) as f:
        for _ in range(size):
            # image: random uint8 -> tensor -> JPEG bytes.
            # Casting np.random.rand() (values in [0, 1)) to uint8, as the
            # original did, always produced an all-zero image.
            image = np.random.randint(0, 256, size=shape, dtype=np.uint8)
            image = tf.image.encode_jpeg(tf.convert_to_tensor(value=image))
            image = image.numpy()
            # A plain integer; the original wrapped a 1-element ndarray in
            # a list, relying on implicit array-to-scalar conversion.
            label = np.random.randint(1, 11)
            example_dict = {
                "image": tf.train.Feature(
                    bytes_list=tf.train.BytesList(value=[image])
                ),
                "label": tf.train.Feature(
                    int64_list=tf.train.Int64List(value=[label])
                ),
            }
            example = tf.train.Example(
                features=tf.train.Features(feature=example_dict)
            )
            f.write(example.SerializeToString())
    return temp_path
Esempio n. 6
0
def create_recordio_file(size):
    """Create a temporary RecordIO file holding ``size`` linear samples.

    Every record is a serialized ``tf.train.Example`` with two float
    features: ``"x"``, a single random float32 value, and ``"y"``, computed
    as ``2 * x + 1``.

    Args:
        size: The number of records to write.

    Returns:
        The name of the temporary file (created with ``delete=False``).
    """
    tmp = tempfile.NamedTemporaryFile(delete=False)
    with closing(recordio.Writer(tmp.name)) as writer:
        for _ in range(size):
            x_value = np.random.rand(1).astype(np.float32)
            y_value = 2 * x_value + 1
            x_feature = tf.train.Feature(
                float_list=tf.train.FloatList(value=x_value))
            y_feature = tf.train.Feature(
                float_list=tf.train.FloatList(value=y_value))
            features = tf.train.Features(
                feature={"x": x_feature, "y": y_feature})
            serialized = tf.train.Example(
                features=features).SerializeToString()
            writer.write(serialized)
    return tmp.name
def write_recordio_shards_from_iterator(records_iter, features_list,
                                        output_dir, records_per_shard):
    """Writes RecordIO files from Python iterator of numpy arrays.

    The first item of ``records_iter`` is inspected to decide whether each
    item is a batch of records (it contains at least one ``list``) or a
    single record. Every record is converted with
    ``_parse_row_to_example`` and written in serialized form; a new shard
    ``data-<5-digit index>`` is started in ``output_dir`` every
    ``records_per_shard`` records.

    Args:
        records_iter: An iterator (``next()`` is called on it) yielding
            records or batches of records.
        features_list: Passed through to ``_parse_row_to_example``.
        output_dir: Directory that receives the shard files.
        records_per_shard: The number of records per shard.

    An empty iterator writes nothing and returns normally.
    """
    import itertools

    # Take the first record batch to check whether it contains multiple
    # items. An exhausted iterator simply means there is nothing to write;
    # the original let the bare StopIteration escape to the caller.
    try:
        first_record_batch = next(records_iter)
    except StopIteration:
        return

    is_multi_items_per_batch = any(
        isinstance(i, list) for i in first_record_batch)

    # Find the features of different types that will be used
    # in `_parse_row_to_example()` later
    record = (first_record_batch[0]
              if is_multi_items_per_batch else first_record_batch)
    feature_indices = _find_feature_indices_from_record(record)

    writer = None
    rows_written = 0
    shards_written = 0
    try:
        # Re-attach the already consumed first batch in front of the rest.
        # A plain `for` handles exhaustion, so a StopIteration raised by a
        # helper is no longer mistaken for the end of the data.
        for record_batch in itertools.chain([first_record_batch],
                                            records_iter):
            if not is_multi_items_per_batch:
                record_batch = [record_batch]

            # Write each record in the batch to a RecordIO shard
            for record in record_batch:
                # Initialize the writer for the new shard
                if rows_written % records_per_shard == 0:
                    if writer is not None:
                        writer.close()
                        # Avoid a double close in `finally` if opening the
                        # next shard fails.
                        writer = None
                    shard_file_path = os.path.join(
                        output_dir, "data-%05d" % shards_written)
                    writer = recordio.Writer(shard_file_path)
                    shards_written += 1

                writer.write(
                    _parse_row_to_example(record, features_list,
                                          feature_indices).SerializeToString())
                rows_written += 1
    finally:
        # The writer is still None if no record was produced; otherwise
        # close the last shard even when an error interrupted the loop.
        if writer is not None:
            writer.close()
Esempio n. 8
0
def create_frappe_recordio_file(size, shape, input_dim):
    """Create a temporary RecordIO file of random integer features.

    Each of the ``size`` records is a serialized ``tf.train.Example`` with
    a ``"feature"`` int64 list of random integers in ``[0, input_dim)``
    and a ``"label"`` int64 list holding a single random 0/1 value.

    Args:
        size: The number of records to write.
        shape: Used as ``size=(shape,)`` for the random feature array
            (presumably an int giving the feature count; confirm with
            callers).
        input_dim: Exclusive upper bound of the random feature values.

    Returns:
        The name of the temporary file. The file is created with
        ``delete=False``, so the caller is responsible for removing it.
    """
    # Only the path is needed; close the handle immediately instead of
    # leaving it open next to the RecordIO writer.
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        temp_path = temp_file.name

    with closing(recordio.Writer(temp_path)) as f:
        for _ in range(size):
            feature = np.random.randint(input_dim, size=(shape, ))
            label = np.random.randint(2, size=(1, ))
            example_dict = {
                "feature":
                tf.train.Feature(int64_list=tf.train.Int64List(value=feature)),
                # Pass the 1-element array itself, like "feature" above.
                # Wrapping it in a list made the single list element an
                # ndarray that had to be coerced to a scalar.
                "label":
                tf.train.Feature(int64_list=tf.train.Int64List(value=label)),
            }
            example = tf.train.Example(features=tf.train.Features(
                feature=example_dict))
            f.write(example.SerializeToString())
    return temp_path
Esempio n. 9
0
def create_recordio_file(size, shape):
    """Create a temporary RecordIO file of random flat images and labels.

    Each of the ``size`` records is a serialized ``tf.train.Example`` with
    an ``"image"`` float list of ``prod(shape)`` random float32 values and
    a ``"label"`` int64 list holding one random integer from 0..9.

    Args:
        size: The number of records to write.
        shape: Iterable of dimensions; only their product is used.

    Returns:
        The name of the temporary file. The file is created with
        ``delete=False``, so the caller is responsible for removing it.
    """
    # int() keeps the element count an integer even for an empty shape,
    # where np.prod returns the float 1.0.
    image_size = int(np.prod(shape))

    # Only the path is needed; close the handle immediately instead of
    # leaving it open next to the RecordIO writer.
    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
        temp_path = temp_file.name

    with closing(recordio.Writer(temp_path)) as f:
        for _ in range(size):
            image = np.random.rand(image_size).astype(np.float32)
            # A plain integer; the original wrapped a 1-element ndarray in
            # a list, relying on implicit array-to-scalar conversion.
            label = np.random.randint(0, 10)
            example_dict = {
                "image":
                tf.train.Feature(float_list=tf.train.FloatList(value=image)),
                "label":
                tf.train.Feature(int64_list=tf.train.Int64List(value=[label])),
            }
            example = tf.train.Example(features=tf.train.Features(
                feature=example_dict))
            f.write(example.SerializeToString())
    return temp_path
Esempio n. 10
0
def create_recordio_file(size, dataset_name, shape, temp_dir=None):
    """Creates a temporary file containing data of `recordio` format.

    Every record is a serialized `tf.train.Example` whose features depend
    on `dataset_name`:

    * `IMAGENET`: JPEG-encoded random uint8 image and a label in 1..10.
    * `FRAPPE`: random int64 features in [0, 5383) and a 0/1 label.
    * `TEST_MODULE`: random float32 `x` and `y = 2 * x + 1`.
    * `IMAGE_DEFAULT`: flat random float32 image and a label in 0..9.

    Args:
        size: The number of records in the temporary file.
        dataset_name: A dataset name from `DatasetName`.
        shape: The shape of records to be created.
        temp_dir: The storage path of the temporary file.

    Returns:
        A python string indicating the temporary file name.

    Raises:
        ValueError: If `dataset_name` is not one of the names above. The
            check happens while building the first record.
    """
    # Only the path is needed; close the handle immediately instead of
    # leaving it open next to the RecordIO writer.
    with tempfile.NamedTemporaryFile(delete=False, dir=temp_dir) as temp_file:
        temp_path = temp_file.name

    with closing(recordio.Writer(temp_path)) as f:
        for _ in range(size):
            if dataset_name == DatasetName.IMAGENET:
                image = np.random.randint(255, size=shape, dtype=np.uint8)
                image = tf.image.encode_jpeg(tf.convert_to_tensor(value=image))
                image = image.numpy()
                # Plain integer label; the original wrapped a 1-element
                # ndarray in a list and relied on scalar coercion.
                label = np.random.randint(1, 11)
                example_dict = {
                    "image":
                    tf.train.Feature(bytes_list=tf.train.BytesList(
                        value=[image])),
                    "label":
                    tf.train.Feature(int64_list=tf.train.Int64List(
                        value=[label])),
                }
            elif dataset_name == DatasetName.FRAPPE:
                feature = np.random.randint(5383, size=(shape, ))
                label = np.random.randint(2, size=(1, ))
                example_dict = {
                    "feature":
                    tf.train.Feature(int64_list=tf.train.Int64List(
                        value=feature)),
                    # The 1-element array is passed directly, like
                    # "feature", instead of being nested in a list.
                    "label":
                    tf.train.Feature(int64_list=tf.train.Int64List(
                        value=label)),
                }
            elif dataset_name == DatasetName.TEST_MODULE:
                x = np.random.rand(shape).astype(np.float32)
                y = 2 * x + 1
                example_dict = {
                    "x":
                    tf.train.Feature(float_list=tf.train.FloatList(value=x)),
                    "y":
                    tf.train.Feature(float_list=tf.train.FloatList(value=y)),
                }
            elif dataset_name == DatasetName.IMAGE_DEFAULT:
                image = np.random.rand(np.prod(shape)).astype(np.float32)
                label = np.random.randint(0, 10)
                example_dict = {
                    "image":
                    tf.train.Feature(float_list=tf.train.FloatList(
                        value=image)),
                    "label":
                    tf.train.Feature(int64_list=tf.train.Int64List(
                        value=[label])),
                }
            else:
                raise ValueError("Unknown dataset name %s." % dataset_name)

            example = tf.train.Example(features=tf.train.Features(
                feature=example_dict))
            f.write(example.SerializeToString())
    return temp_path
Esempio n. 11
0
def _make_bytes_feature(value):
    """Wrap one bytes value in a `tf.train.Feature`."""
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


def _make_float_feature(values):
    """Wrap an iterable of numbers in a float-list `tf.train.Feature`."""
    return tf.train.Feature(float_list=tf.train.FloatList(value=values))


def _make_int64_feature(values):
    """Wrap an iterable of integers in an int64-list `tf.train.Feature`."""
    return tf.train.Feature(int64_list=tf.train.Int64List(value=values))


def _random_example_dict(dataset_name, shape):
    """Build the feature dict of one random record for `dataset_name`."""
    if dataset_name == DatasetName.IMAGENET:
        image = np.random.randint(255, size=shape, dtype=np.uint8)
        image = tf.image.encode_jpeg(tf.convert_to_tensor(value=image))
        # Plain integer label; the original wrapped a 1-element ndarray in
        # a list and relied on scalar coercion.
        label = np.random.randint(1, 11)
        return {
            "image": _make_bytes_feature(image.numpy()),
            "label": _make_int64_feature([label]),
        }
    if dataset_name == DatasetName.FRAPPE:
        feature = np.random.randint(5383, size=(shape, ))
        label = np.random.randint(2, size=(1, ))
        return {
            "feature": _make_int64_feature(feature),
            # The 1-element array is passed directly, like "feature".
            "label": _make_int64_feature(label),
        }
    if dataset_name == DatasetName.TEST_MODULE:
        x = np.random.rand(shape).astype(np.float32)
        y = 2 * x + 1
        return {
            "x": _make_float_feature(x),
            "y": _make_float_feature(y),
        }
    if dataset_name == DatasetName.IMAGE_DEFAULT:
        image = np.random.rand(np.prod(shape)).astype(np.float32)
        label = np.random.randint(0, 10)
        return {
            "image": _make_float_feature(image),
            "label": _make_int64_feature([label]),
        }
    if dataset_name == DatasetName.CENSUS:
        # Categorical columns are constant; only the numeric columns and
        # the label are random. The draws keep the original order.
        return {
            "workclass": _make_bytes_feature(b"Private"),
            "education": _make_bytes_feature(b"HS-grad"),
            "marital-status": _make_bytes_feature(b"Widowed"),
            "occupation": _make_bytes_feature(b"Exec-managerial"),
            "relationship": _make_bytes_feature(b"Not-in-family"),
            "race": _make_bytes_feature(b"White"),
            "sex": _make_bytes_feature(b"Female"),
            "native-country": _make_bytes_feature(b"United-States"),
            "age": _make_float_feature([np.random.randint(10, 100)]),
            "capital-gain": _make_float_feature(
                [np.random.randint(100, 4000)]),
            "capital-loss": _make_float_feature(
                [np.random.randint(2000, 7000)]),
            "hours-per-week": _make_float_feature(
                [np.random.randint(10, 70)]),
            "label": _make_int64_feature([np.random.randint(0, 2)]),
        }
    raise ValueError("Unknown dataset name %s." % dataset_name)


def create_recordio_file(size, dataset_name, shape, temp_dir=None):
    """Creates a temporary file containing data of `recordio` format.

    Every record is a serialized `tf.train.Example` whose features depend
    on `dataset_name` (`IMAGENET`, `FRAPPE`, `TEST_MODULE`,
    `IMAGE_DEFAULT` or `CENSUS`).

    Args:
        size: The number of records in the temporary file.
        dataset_name: A dataset name from `DatasetName`.
        shape: The shape of records to be created.
        temp_dir: The storage path of the temporary file.

    Returns:
        A python string indicating the temporary file name.

    Raises:
        ValueError: If `dataset_name` is not a supported name. The check
            happens while building the first record.
    """
    # Only the path is needed; close the handle immediately instead of
    # leaving it open next to the RecordIO writer.
    with tempfile.NamedTemporaryFile(delete=False, dir=temp_dir) as temp_file:
        temp_path = temp_file.name

    with closing(recordio.Writer(temp_path)) as f:
        for _ in range(size):
            example_dict = _random_example_dict(dataset_name, shape)
            example = tf.train.Example(features=tf.train.Features(
                feature=example_dict))
            f.write(example.SerializeToString())
    return temp_path
Esempio n. 12
0
def write_to_recordio(filename, data_list):
    """Write every item of `data_list` as one record to `filename`.

    Args:
        filename: Path of the RecordIO file to create.
        data_list: Iterable of records; each item is passed unchanged to
            the writer's `write()`.
    """
    # The original message had no placeholder for its argument, so the
    # logging module failed to format the record instead of logging it.
    logger.info("Writing to file: %s", filename)
    with closing(recordio.Writer(filename)) as f:
        for d in data_list:
            f.write(d)