def read_a_sample_for_criteo(args, slot_num=1):
    """Read the first sample of a Criteo-format binary dataset file.

    The file is opened in binary mode, the data header (``4 + 64 + 1``
    bytes) is skipped, and one record is decoded with ``struct`` in native
    byte order: an int length, an int label, then for every slot an int
    ``nnz`` followed by ``nnz`` long-long keys, and finally a one-byte check
    bit. This record layout carries no dense features.

    Args:
        args: Object whose ``dataset`` attribute is the path of the file.
        slot_num: Number of slots stored in the record.

    Returns:
        A tuple ``(label, dense, keys)`` where ``label`` is an ``np.int64``,
        ``dense`` is an empty list, and ``keys`` is an ``np.int64`` array of
        shape ``(1, slot_num, nnz)``, i.e. ``[batch, slot_num, nnz]``.

    Raises:
        struct.error: If the file ends before the record is complete.
        ValueError: If the slots do not all hold the same number of keys, so
            the keys cannot form a rectangular array.
    """
    keys = []
    nnz_per_slot = []
    with open(args.dataset, 'rb') as file:
        # Skip data_header.
        file.seek(4 + 64 + 1, 0)

        # One sample: the int length field is consumed but not needed.
        file.read(4)
        label = struct.unpack("i", file.read(4))[0]

        # No dense values in this layout.
        for _ in range(slot_num):
            nnz = struct.unpack("i", file.read(4))[0]
            # nnz * long long
            keys.extend(struct.unpack(str(nnz) + "q", file.read(8 * nnz)))
            nnz_per_slot.append(nnz)

        # Consume the trailing check bit; unpacking it makes a truncated
        # record fail loudly instead of being accepted silently.
        struct.unpack("c", file.read(1))

    if len(set(nnz_per_slot)) > 1:
        raise ValueError(
            "slots hold different numbers of keys: {}".format(nnz_per_slot))
    nnz = nnz_per_slot[0] if nnz_per_slot else 0

    label = np.int64(label)
    dense = []
    # [batch, slot_num, nnz]; the shape is passed positionally because the
    # ``newshape`` keyword of np.reshape is deprecated in recent NumPy.
    keys = np.array(keys, dtype=np.int64).reshape((1, slot_num, nnz))
    return label, dense, keys
def read_a_sample_for_dcn(args, slot_num=26):
    """Read the first sample of a Criteo-format binary dataset file for DCN.

    The file is opened in binary mode, the data header (``4 + 64 + 1``
    bytes) is skipped, and one record is decoded with ``struct`` in native
    byte order: an int length, an int label, 13 float dense values, then for
    every slot an int ``nnz`` followed by ``nnz`` long-long keys, and finally
    a one-byte check bit.

    Args:
        args: Object whose ``dataset`` attribute is the path of the file.
        slot_num: Number of slots stored in the record.

    Returns:
        A tuple ``(label, dense, keys)`` where ``label`` is an ``np.int64``,
        ``dense`` is an ``np.float32`` array of shape ``(1, 13)``, and
        ``keys`` is an ``np.int64`` array of shape ``(1, slot_num, nnz)``.

    Raises:
        struct.error: If the file ends before the record is complete.
        ValueError: If the slots do not all hold the same number of keys, so
            the keys cannot form a rectangular array.
    """
    dense_dim = 13
    keys = []
    nnz_per_slot = []
    with open(args.dataset, 'rb') as file:
        # Skip data_header.
        file.seek(4 + 64 + 1, 0)

        # One sample: the int length field is consumed but not needed.
        file.read(4)
        label = struct.unpack('i', file.read(4))[0]

        # dense_dim * float
        dense = struct.unpack(str(dense_dim) + "f", file.read(4 * dense_dim))

        for _ in range(slot_num):
            nnz = struct.unpack("i", file.read(4))[0]
            # nnz * long long
            keys.extend(struct.unpack(str(nnz) + "q", file.read(8 * nnz)))
            nnz_per_slot.append(nnz)

        # Consume the trailing check bit; unpacking it makes a truncated
        # record fail loudly instead of being accepted silently.
        struct.unpack("c", file.read(1))

    if len(set(nnz_per_slot)) > 1:
        raise ValueError(
            "slots hold different numbers of keys: {}".format(nnz_per_slot))
    nnz = nnz_per_slot[0] if nnz_per_slot else 0

    label = np.int64(label)
    # Shapes are passed positionally because the ``newshape`` keyword of
    # np.reshape is deprecated in recent NumPy.
    dense = np.array(dense, dtype=np.float32).reshape((1, dense_dim))
    # [batch, slot_num, nnz]
    keys = np.array(keys, dtype=np.int64).reshape((1, slot_num, nnz))
    return label, dense, keys