Ejemplo n.º 1
0
def read_a_sample_for_criteo(args, slot_num=1):
    """
    Read the first sample from a binary criteo dataset file.

    The file at ``args.dataset`` is opened in binary mode, the 69-byte
    (4 + 64 + 1) data header is skipped, and a single record is decoded
    with native ``struct`` formats in this order:

        int length, int label,
        slot_num x (int nnz, nnz x long long keys),
        char check_bit

    This record layout carries no dense features.

    Args:
        args: object whose ``dataset`` attribute is the path of the file.
        slot_num: number of (nnz, keys) groups stored in the record.

    Returns:
        Tuple ``(label, dense, keys)``: ``label`` is an ``np.int64``
        scalar, ``dense`` is always an empty list, and ``keys`` is an
        ``np.int64`` array of shape ``(1, 1, 39)``.

    Raises:
        struct.error: if the file ends before the record is complete.
        ValueError: if the slots do not hold exactly 39 keys in total.
    """
    with open(args.dataset, 'rb') as file:
        # skip data_header
        file.seek(4 + 64 + 1, 0)

        # one sample: the record length is stored first but is not needed
        # to decode the record, so only the label is kept.
        _length, label = struct.unpack("ii", file.read(8))

        # no dense

        keys = []
        for _ in range(slot_num):
            nnz = struct.unpack("i", file.read(4))[0]
            # nnz * long long
            keys.extend(struct.unpack(str(nnz) + "q", file.read(8 * nnz)))

        # The value of the trailing check bit is not used, but decoding it
        # makes a record truncated before this byte fail with struct.error.
        struct.unpack("c", file.read(1))

    # The shape is passed positionally: the ``newshape`` keyword of
    # np.reshape is deprecated in recent NumPy releases.
    keys = np.reshape(np.array(keys, dtype=np.int64),
                      (1, 1, 39))  # [batch, slot_num, nnz]

    return np.int64(label), [], keys
Ejemplo n.º 2
0
def read_a_sample_for_dcn(args, slot_num=26):
    """
    Read the first sample, including dense features, from a binary
    criteo dataset file.

    The file at ``args.dataset`` is opened in binary mode, the 69-byte
    (4 + 64 + 1) data header is skipped, and a single record is decoded
    with native ``struct`` formats in this order:

        int length, int label, 13 x float dense,
        slot_num x (int nnz, nnz x long long keys),
        char check_bit

    Args:
        args: object whose ``dataset`` attribute is the path of the file.
        slot_num: number of (nnz, keys) groups stored in the record.

    Returns:
        Tuple ``(label, dense, keys)``: ``label`` is an ``np.int64``
        scalar, ``dense`` is an ``np.float32`` array of shape ``(1, 13)``
        and ``keys`` is an ``np.int64`` array of shape ``(1, 26, 1)``.

    Raises:
        struct.error: if the file ends before the record is complete.
        ValueError: if the slots do not hold exactly 26 keys in total.
    """
    with open(args.dataset, 'rb') as file:
        # skip data_header
        file.seek(4 + 64 + 1, 0)

        # one sample: the record length is stored first but is not needed
        # to decode the record, so only the label is kept.
        _length, label = struct.unpack("ii", file.read(8))

        # dense_dim * float
        dense = struct.unpack("13f", file.read(4 * 13))

        keys = []
        for _ in range(slot_num):
            nnz = struct.unpack("i", file.read(4))[0]
            # nnz * long long
            keys.extend(struct.unpack(str(nnz) + "q", file.read(8 * nnz)))

        # The value of the trailing check bit is not used, but decoding it
        # makes a record truncated before this byte fail with struct.error.
        struct.unpack("c", file.read(1))

    # Shapes are passed positionally: the ``newshape`` keyword of
    # np.reshape is deprecated in recent NumPy releases.
    label = np.int64(label)
    dense = np.reshape(np.array(dense, dtype=np.float32), (1, 13))
    keys = np.reshape(np.array(keys, dtype=np.int64), (1, 26, 1))

    return label, dense, keys