def test_clone():
    context = pa.SerializationContext()

    class Foo(object):
        pass

    def custom_serializer(obj):
        return 0

    def custom_deserializer(serialized_obj):
        return (serialized_obj, 'a')

    context.register_type(Foo, 'Foo',
                          custom_serializer=custom_serializer,
                          custom_deserializer=custom_deserializer)

    new_context = context.clone()

    f = Foo()
    serialized = pa.serialize(f, context=context)
    deserialized = serialized.deserialize(context=context)
    assert deserialized == (0, 'a')

    serialized = pa.serialize(f, context=new_context)
    deserialized = serialized.deserialize(context=new_context)
    assert deserialized == (0, 'a')

def test_serialize_recursive_objects():
    class ClassA(object):
        pass

    # Make a list that contains itself.
    lst = []
    lst.append(lst)

    # Make an object that contains itself as a field.
    a1 = ClassA()
    a1.field = a1

    # Make two objects that contain each other as fields.
    a2 = ClassA()
    a3 = ClassA()
    a2.field = a3
    a3.field = a2

    # Make a dictionary that contains itself.
    d1 = {}
    d1["key"] = d1

    # Make a numpy array that contains itself.
    arr = np.array([None], dtype=object)
    arr[0] = arr

    # Create a list of recursive objects.
    recursive_objects = [lst, a1, a2, a3, d1, arr]

    # Check that exceptions are thrown when we serialize the recursive
    # objects.
    for obj in recursive_objects:
        with pytest.raises(Exception):
            pa.serialize(obj).deserialize()

def test_serialization_callback_error():
    class TempClass(object):
        pass

    # Pass a SerializationContext into serialize, but TempClass
    # is not registered
    serialization_context = pa.SerializationContext()
    val = TempClass()
    with pytest.raises(pa.SerializationCallbackError) as err:
        serialized_object = pa.serialize(val, serialization_context)
    assert err.value.example_object == val

    serialization_context.register_type(TempClass, "TempClass")
    serialized_object = pa.serialize(TempClass(), serialization_context)
    deserialization_context = pa.SerializationContext()

    # Pass a SerializationContext into deserialize, but TempClass
    # is not registered
    with pytest.raises(pa.DeserializationCallbackError) as err:
        serialized_object.deserialize(deserialization_context)
    assert err.value.type_id == "TempClass"

    class TempClass2(object):
        pass

    # Make sure that we receive an error when we use an inappropriate value
    # for the type_id argument.
    with pytest.raises(TypeError):
        serialization_context.register_type(TempClass2, 1)

def test_numpy_subclass_serialization():
    # Check that we can properly serialize subclasses of np.ndarray.
    class CustomNDArray(np.ndarray):
        def __new__(cls, input_array):
            array = np.asarray(input_array).view(cls)
            return array

    def serializer(obj):
        return {'numpy': obj.view(np.ndarray)}

    def deserializer(data):
        array = data['numpy'].view(CustomNDArray)
        return array

    context = pa.default_serialization_context()
    context.register_type(CustomNDArray, 'CustomNDArray',
                          custom_serializer=serializer,
                          custom_deserializer=deserializer)

    x = CustomNDArray(np.zeros(3))
    serialized = pa.serialize(x, context=context).to_buffer()
    new_x = pa.deserialize(serialized, context=context)
    assert type(new_x) == CustomNDArray
    assert np.alltrue(new_x.view(np.ndarray) == np.zeros(3))

def test_serialize_subclasses():
    # This test shows how subclasses can be handled in an idiomatic way
    # by having only a serializer for the base class.
    # This technique should however be used with care, since pickling
    # type(obj) with cloudpickle will include the full class definition
    # in the serialized representation.
    # This means the class definition is part of every instance of the
    # object, which in general is not desirable; registering all subclasses
    # with register_type will result in faster and more memory-efficient
    # serialization.
    context = pa.default_serialization_context()
    context.register_type(
        Serializable, "Serializable",
        custom_serializer=serialize_serializable,
        custom_deserializer=deserialize_serializable)

    a = SerializableClass()
    serialized = pa.serialize(a, context=context)

    deserialized = serialized.deserialize(context=context)
    assert type(deserialized).__name__ == SerializableClass.__name__
    assert deserialized.value == 3

def _check_component_roundtrip(value, context=global_serialization_context):
    # Test to/from components
    serialized = pa.serialize(value, context=context)
    components = serialized.to_components()
    from_comp = pa.SerializedPyObject.from_components(components)
    recons = from_comp.deserialize(context=context)
    assert_equal(value, recons)

def dumps_pyarrow(obj):
    """
    Serialize an object.

    Returns:
        Implementation-dependent bytes-like object
    """
    return pa.serialize(obj).to_buffer()

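# The helper above only covers the write path; a matching read helper is
# sketched here for illustration (an assumption, not part of the original
# source). It relies on pa.deserialize accepting the buffer produced by
# pa.serialize(...).to_buffer().
def loads_pyarrow(buf):
    """
    Deserialize a bytes-like object produced by dumps_pyarrow.
    """
    return pa.deserialize(buf)
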
def test_integer_limits(large_buffer):
    # Check that Numpy scalars can be represented up to their limit values
    # (except np.uint64 which is limited to 2**63 - 1)
    for dt in [np.int8, np.int16, np.int32, np.int64,
               np.uint8, np.uint16, np.uint32, np.uint64]:
        scal = dt(np.iinfo(dt).min)
        serialization_roundtrip(scal, large_buffer)
        if dt is not np.uint64:
            scal = dt(np.iinfo(dt).max)
            serialization_roundtrip(scal, large_buffer)
        else:
            scal = dt(2**63 - 1)
            serialization_roundtrip(scal, large_buffer)
            for v in (2**63, 2**64 - 1):
                scal = dt(v)
                with pytest.raises(pa.ArrowInvalid):
                    pa.serialize(scal)

def pack(data):
    if LZ4_ENABLED:
        data = pyarrow.serialize(data).to_buffer().to_pybytes()
        data = lz4.frame.compress(data)
        # TODO(ekl) we shouldn't need to base64 encode this data, but this
        # seems to not survive a transfer through the object store if we
        # don't.
        data = base64.b64encode(data).decode("ascii")
    return data

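# Hypothetical counterpart to pack() above, added as an illustration only
# (not from the original source): it undoes the base64 -> lz4 -> pyarrow
# pipeline in reverse order, guarded by the same LZ4_ENABLED flag.
def unpack(data):
    if LZ4_ENABLED:
        data = base64.b64decode(data)
        data = lz4.frame.decompress(data)
        data = pyarrow.deserialize(data)
    return data
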
def test_serialization_callback_numpy():
    class DummyClass(object):
        pass

    def serialize_dummy_class(obj):
        x = np.zeros(4)
        return x

    def deserialize_dummy_class(serialized_obj):
        return serialized_obj

    context = pa.default_serialization_context()
    context.register_type(DummyClass, "DummyClass",
                          custom_serializer=serialize_dummy_class,
                          custom_deserializer=deserialize_dummy_class)

    pa.serialize(DummyClass(), context=context)

def pack(data):
    if SNAPPY_ENABLED:
        data = snappy.compress(
            pyarrow.serialize(data).to_buffer().to_pybytes())
        # TODO(ekl) we shouldn't need to base64 encode this data, but this
        # seems to not survive a transfer through the object store if we
        # don't.
        return base64.b64encode(data)
    else:
        return data

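# A possible inverse of the snappy-based pack() above, sketched for
# illustration (not part of the original source): base64-decode, then
# snappy-decompress, then let pyarrow deserialize the resulting bytes.
def unpack(data):
    if SNAPPY_ENABLED:
        return pyarrow.deserialize(snappy.decompress(base64.b64decode(data)))
    else:
        return data
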
def test_serialize_with_pandas_objects():
    df = pd.DataFrame({'a': [1, 2, 3]}, index=[1, 2, 3])

    data = {
        'a_series': df['a'],
        'a_frame': df
    }

    serialized = pa.serialize(data).to_buffer()
    deserialized = pa.deserialize(serialized)
    assert_frame_equal(deserialized['a_frame'], df)
    assert_series_equal(deserialized['a_series'], df['a'])

def test_tensor_alignment():
    # Deserialized numpy arrays should be 64-byte aligned.
    x = np.random.normal(size=(10, 20, 30))
    y = pa.deserialize(pa.serialize(x).to_buffer())
    assert y.ctypes.data % 64 == 0

    xs = [np.random.normal(size=i) for i in range(100)]
    ys = pa.deserialize(pa.serialize(xs).to_buffer())
    for y in ys:
        assert y.ctypes.data % 64 == 0

    xs = [np.random.normal(size=i * (1,)) for i in range(20)]
    ys = pa.deserialize(pa.serialize(xs).to_buffer())
    for y in ys:
        assert y.ctypes.data % 64 == 0

    xs = [np.random.normal(size=i * (5,)) for i in range(1, 8)]
    xs = [xs[i][(i + 1) * (slice(1, 3),)] for i in range(len(xs))]
    ys = pa.deserialize(pa.serialize(xs).to_buffer())
    for y in ys:
        assert y.ctypes.data % 64 == 0

def test_serialization_callback_error():
    class TempClass(object):
        pass

    # Pass a SerializationContext into serialize, but TempClass
    # is not registered
    serialization_context = pa.SerializationContext()
    val = TempClass()
    with pytest.raises(pa.SerializationCallbackError) as err:
        serialized_object = pa.serialize(val, serialization_context)
    assert err.value.example_object == val

    serialization_context.register_type(TempClass, 20 * b"\x00")
    serialized_object = pa.serialize(TempClass(), serialization_context)
    deserialization_context = pa.SerializationContext()

    # Pass a SerializationContext into deserialize, but TempClass
    # is not registered
    with pytest.raises(pa.DeserializationCallbackError) as err:
        serialized_object.deserialize(deserialization_context)
    assert err.value.type_id == 20 * b"\x00"

def test_deserialize_buffer_in_different_process():
    import tempfile

    f = tempfile.NamedTemporaryFile(delete=False)
    b = pa.serialize(pa.py_buffer(b'hello')).to_buffer()
    f.write(b.to_pybytes())
    f.close()

    subprocess_env = test_util.get_modified_env_with_pythonpath()

    dir_path = os.path.dirname(os.path.realpath(__file__))
    python_file = os.path.join(dir_path, 'deserialize_buffer.py')
    subprocess.check_call([sys.executable, python_file, f.name],
                          env=subprocess_env)

def test_set_pickle():
    # Use a custom type to trigger pickling.
    class Foo(object):
        pass

    context = pa.SerializationContext()
    context.register_type(Foo, 'Foo', pickle=True)

    test_object = Foo()

    # Define a custom serializer and deserializer to use in place of pickle.
    def dumps1(obj):
        return b'custom'

    def loads1(serialized_obj):
        return serialized_obj + b' serialization 1'

    # Test that setting a custom pickler changes the behavior.
    context.set_pickle(dumps1, loads1)
    serialized = pa.serialize(test_object, context=context).to_buffer()
    deserialized = pa.deserialize(serialized.to_pybytes(), context=context)
    assert deserialized == b'custom serialization 1'

    # Define another custom serializer and deserializer.
    def dumps2(obj):
        return b'custom'

    def loads2(serialized_obj):
        return serialized_obj + b' serialization 2'

    # Test that setting another custom pickler changes the behavior again.
    context.set_pickle(dumps2, loads2)
    serialized = pa.serialize(test_object, context=context).to_buffer()
    deserialized = pa.deserialize(serialized.to_pybytes(), context=context)
    assert deserialized == b'custom serialization 2'

def test_numpy_base_object(tmpdir):
    # ARROW-2040: deserialized Numpy array should keep a reference to the
    # owner of its memory
    path = os.path.join(str(tmpdir), 'zzz.bin')
    data = np.arange(12, dtype=np.int32)

    with open(path, 'wb') as f:
        f.write(pa.serialize(data).to_buffer())

    serialized = pa.read_serialized(pa.OSFile(path))
    result = serialized.deserialize()
    assert_equal(result, data)
    serialized = None
    assert_equal(result, data)
    assert result.base is not None

def test_buffer_serialization():
    class BufferClass(object):
        pass

    def serialize_buffer_class(obj):
        return pa.frombuffer(b"hello")

    def deserialize_buffer_class(serialized_obj):
        return serialized_obj

    pa._default_serialization_context.register_type(
        BufferClass, "BufferClass", pickle=False,
        custom_serializer=serialize_buffer_class,
        custom_deserializer=deserialize_buffer_class)

    b = pa.serialize(BufferClass()).to_buffer()
    assert pa.deserialize(b).to_pybytes() == b"hello"

def test_serialize_with_pandas_objects():
    df = pd.DataFrame({'a': [1, 2, 3]}, index=[1, 2, 3])
    s = pd.Series([1, 2, 3, 4])

    data = {
        'a_series': df['a'],
        'a_frame': df,
        's_series': s
    }

    serialized = pa.serialize(data).to_buffer()
    deserialized = pa.deserialize(serialized)
    assert_frame_equal(deserialized['a_frame'], df)

    assert_series_equal(deserialized['a_series'], df['a'])
    assert deserialized['a_series'].name == 'a'

    assert_series_equal(deserialized['s_series'], s)
    assert deserialized['s_series'].name is None

def test_buffer_serialization():
    class BufferClass(object):
        pass

    def serialize_buffer_class(obj):
        return pa.py_buffer(b"hello")

    def deserialize_buffer_class(serialized_obj):
        return serialized_obj

    context = pa.default_serialization_context()
    context.register_type(
        BufferClass, "BufferClass",
        custom_serializer=serialize_buffer_class,
        custom_deserializer=deserialize_buffer_class)

    b = pa.serialize(BufferClass(), context=context).to_buffer()
    assert pa.deserialize(b, context=context).to_pybytes() == b"hello"

def test_fallback_to_subclasses():
    class SubFoo(Foo):
        def __init__(self):
            Foo.__init__(self)

    # should be able to serialize/deserialize an instance
    # if a base class has been registered
    serialization_context = pa.SerializationContext()
    serialization_context.register_type(Foo, "Foo")

    subfoo = SubFoo()
    # should fall back to the Foo serializer
    serialized_object = pa.serialize(subfoo, serialization_context)
    reconstructed_object = serialized_object.deserialize(
        serialization_context
    )
    assert type(reconstructed_object) == Foo

def test_numpy_matrix_serialization(tmpdir):
    class CustomType(object):
        def __init__(self, val):
            self.val = val

    path = os.path.join(str(tmpdir),
                        'pyarrow_npmatrix_serialization_test.bin')
    array = np.random.randint(low=-1, high=1, size=(2, 2))

    for data_type in [str, int, float, CustomType]:
        matrix = np.matrix(array.astype(data_type))

        with open(path, 'wb') as f:
            f.write(pa.serialize(matrix).to_buffer())

        serialized = pa.read_serialized(pa.OSFile(path))
        result = serialized.deserialize()
        assert_equal(result, matrix)
        assert_equal(result.dtype, matrix.dtype)
        serialized = None
        assert_equal(result, matrix)
        assert result.base is not None

def test_deserialize_in_different_process():
    from multiprocessing import Process, Queue
    import re

    regex = re.compile(r"\d+\.\d*")

    serialization_context = pa.SerializationContext()
    serialization_context.register_type(type(regex), "Regex", pickle=True)

    serialized = pa.serialize(regex, serialization_context)
    serialized_bytes = serialized.to_buffer().to_pybytes()

    def deserialize_regex(serialized, q):
        import pyarrow as pa
        q.put(pa.deserialize(serialized))

    q = Queue()
    p = Process(target=deserialize_regex, args=(serialized_bytes, q))
    p.start()
    assert q.get().pattern == regex.pattern
    p.join()

def test_deserialize_components_in_different_process():
    arr = pa.array([1, 2, 5, 6], type=pa.int8())
    ser = pa.serialize(arr)
    data = pickle.dumps(ser.to_components(), protocol=-1)

    code = """if 1:
        import pickle

        import pyarrow as pa

        data = {0!r}
        components = pickle.loads(data)
        arr = pa.deserialize_components(components)

        assert arr.to_pylist() == [1, 2, 5, 6], arr
        """.format(data)

    subprocess_env = test_util.get_modified_env_with_pythonpath()
    print("** sys.path =", sys.path)
    print("** setting PYTHONPATH to:", subprocess_env['PYTHONPATH'])
    subprocess.check_call(["python", "-c", code], env=subprocess_env)

def serialize(data):
    return pyarrow.serialize(data, mars_serialize_context())

context = pa.default_serialization_context()

start_time = timeit.default_timer()
serialized_df = context.serialize(df)
print(timeit.default_timer() - start_time)

df_components = serialized_df.to_components()

start_time = timeit.default_timer()
original_df = context.deserialize_components(df_components)
print(timeit.default_timer() - start_time)

original_df

data = {i: np.random.randn(500, 500) for i in range(100)}
buf = pa.serialize(data).to_buffer()
type(buf)
buf.size

restored_data = pa.deserialize(buf)
restored_data[0]

feather.write_feather(df, 'example.feather')
read_df = feather.read_feather('example.feather')

with open('example2.feather', 'wb') as f:
    feather.write_feather(df, f)

with open('example2.feather', 'rb') as f:
    read_df = feather.read_feather(f)

# StringIO buffer
buffer = StringIO()
df.to_csv(buffer)

print()

t0 = time()
sframe_nocopy = carr.sframe
t1 = time()
print("Time for serializing array in-memory (caterva, no-copy): %.3fs"
      % (t1 - t0))

t0 = time()
sframe_copy = carr.to_sframe()
t1 = time()
print("Time for serializing array in-memory (caterva, copy): %.3fs"
      % (t1 - t0))

t0 = time()
serialized = pa.serialize(arr)
pyarrow_nocopy = serialized.to_components()
t1 = time()
print("Time for serializing array in-memory (arrow, no-copy): %.3fs"
      % (t1 - t0))

t0 = time()
pyarrow_copy = pa.serialize(arr).to_buffer().to_pybytes()
t1 = time()
print("Time for serializing array in-memory (arrow, copy): %.3fs"
      % (t1 - t0))

t0 = time()
frame_pickle = pickle.dumps(arr, protocol=4)
t1 = time()
print("Time for serializing array in-memory (pickle4, copy): %.3fs"
      % (t1 - t0))

def __init__(self, *args, **kwargs):
    super(LocalDiskArrowTableCache, self).__init__(*args, **kwargs)

    # Workaround for https://issues.apache.org/jira/browse/ARROW-5260
    # Unless we try to serialize something before deserialize_components
    # is called, we would crash with a sigsegv.
    pa.serialize(0)

def dumps_pyarrow(obj):
    return pyarrow.serialize(obj).to_buffer()

def test_serialize_to_buffer():
    for nthreads in [1, 4]:
        for value in COMPLEX_OBJECTS:
            buf = pa.serialize(value).to_buffer(nthreads=nthreads)
            result = pa.deserialize(buf)
            assert_equal(value, result)

def valueCart(valueInput, valueLabel):
    return compress(serialize((valueInput, valueLabel)).to_buffer())

def __init__(self, opt, is_training=True, is_testing=False, live_test=False):
    self.node_dim = opt.node_dim
    self.state_dim = opt.state_dim
    self.is_training = is_training
    self.is_testing = is_testing

    if live_test:
        all_data_node_id, all_data_node_type = load_single_program(
            opt.test_graph_path)
        all_data_node_id = np.array(
            all_data_node_id)[0:len(all_data_node_id)]
        all_data_node_type = np.array(
            all_data_node_type)[0:len(all_data_node_type)]
    else:
        base_name = os.path.basename(opt.path)
        if is_training:
            saved_input_filename = "%s/%s-%d-train.pkl" % (
                opt.path, base_name, opt.n_classes)
        if is_testing:
            saved_input_filename = "%s/%s-%d-test.pkl" % (
                opt.path, base_name, opt.n_classes)
        if os.path.exists(saved_input_filename):
            input_file = open(saved_input_filename, 'rb')
            buf = input_file.read()
            all_data_node_id, all_data_node_type = pyarrow.deserialize(buf)
            input_file.close()
        else:
            all_data_node_id, all_data_node_type = load_program_graphs_from_directory(
                opt.path, is_training, is_testing, opt.n_classes)
            all_data_node_id = np.array(
                all_data_node_id)[0:len(all_data_node_id)]
            all_data_node_type = np.array(
                all_data_node_type)[0:len(all_data_node_type)]
            buf = pyarrow.serialize(
                (all_data_node_id, all_data_node_type)).to_buffer()
            out = pyarrow.OSFile(saved_input_filename, 'wb')
            out.write(buf)
            out.close()

    self.pretrained_embeddings = opt.pretrained_embeddings
    self.batch_size = opt.train_batch_size

    label_lookup = {
        label: _onehot(label, opt.n_classes)
        for label in range(0, opt.n_classes)
    }
    self.label_lookup = label_lookup

    # if is_train == True:
    print("Number of all data : " + str(len(all_data_node_id)))
    # else:
    #     print("Number of all testing data : " + str(len(all_data_node_id)))

    # self.n_edge_types = find_max_edge_id(all_data_node_id)
    self.n_edge_types = 7
    # print("Edge types : " + str(self.n_edge_types))

    max_node_id = find_max_node_id(all_data_node_id)
    min_node_id = find_min_node_id(all_data_node_id)
    print("Max node id in data : " + str(max_node_id))
    print("Min node id in data : " + str(min_node_id))

    max_node_type = find_max_node_id(all_data_node_type)
    min_node_type = find_min_node_id(all_data_node_type)
    print("Max node type in data : " + str(max_node_type))
    print("Min node type in data : " + str(min_node_type))

    # print("Max node id : " + str(max_node_id))
    # print("Max node type : " + str(max_node_type))
    self.n_node_by_id = max_node_id
    self.n_node_by_type = max_node_type

    all_data_node_id = convert_program_data(all_data_node_id)
    all_data_node_type = convert_program_data(all_data_node_type)

    self.all_data_node_id = all_data_node_id
    self.all_data_node_type = all_data_node_type

    self.data = self.process_raw_graphs()

def pa_serialize(obj):
    return pyarrow.serialize(obj).to_buffer()

def make_lmdb_gesture_dataset(base_path):
    gesture_path = os.path.join(base_path, 'Motion')
    audio_path = os.path.join(base_path, 'Audio')
    text_path = os.path.join(base_path, 'Transcripts')
    out_path = os.path.join(base_path, 'lmdb')
    if not os.path.exists(out_path):
        os.makedirs(out_path)

    map_size = 1024 * 20  # in MB
    map_size <<= 20  # in B
    db = [
        lmdb.open(os.path.join(out_path, 'lmdb_train'), map_size=map_size),
        lmdb.open(os.path.join(out_path, 'lmdb_test'), map_size=map_size)
    ]

    # delete existing files
    for i in range(2):
        with db[i].begin(write=True) as txn:
            txn.drop(db[i].open_db())

    all_poses = []
    bvh_files = sorted(glob.glob(gesture_path + "/*.bvh"))
    for v_i, bvh_file in enumerate(bvh_files):
        name = os.path.split(bvh_file)[1][:-4]
        print(name)

        # load skeletons and subtitles
        poses, poses_mirror = process_bvh(bvh_file)
        subtitle = SubtitleWrapper(
            os.path.join(text_path, name + '.json')).get()

        # load audio
        audio_raw, audio_sr = librosa.load(
            os.path.join(audio_path, '{}.wav'.format(name)),
            mono=True, sr=16000, res_type='kaiser_fast')

        # process
        clips = [{'vid': name, 'clips': []},  # train
                 {'vid': name, 'clips': []}]  # validation

        # split
        if v_i == 0:
            dataset_idx = 1  # validation
        else:
            dataset_idx = 0  # train

        # word preprocessing
        word_list = []
        for wi in range(len(subtitle)):
            word_s = float(subtitle[wi]['start_time'][:-1])
            word_e = float(subtitle[wi]['end_time'][:-1])
            word = subtitle[wi]['word']

            word = normalize_string(word)
            if len(word) > 0:
                word_list.append([word, word_s, word_e])

        # save subtitles and skeletons
        poses = np.asarray(poses, dtype=np.float16)
        clips[dataset_idx]['clips'].append({
            'words': word_list,
            'poses': poses,
            'audio_raw': audio_raw
        })
        poses_mirror = np.asarray(poses_mirror, dtype=np.float16)
        clips[dataset_idx]['clips'].append({
            'words': word_list,
            'poses': poses_mirror,
            'audio_raw': audio_raw
        })

        # write to db
        for i in range(2):
            with db[i].begin(write=True) as txn:
                if len(clips[i]['clips']) > 0:
                    k = '{:010}'.format(v_i).encode('ascii')
                    v = pyarrow.serialize(clips[i]).to_buffer()
                    txn.put(k, v)

        all_poses.append(poses)

    # close db
    for i in range(2):
        db[i].sync()
        db[i].close()

    # calculate data mean
    all_poses = np.vstack(all_poses)
    pose_mean = np.mean(all_poses, axis=0)
    pose_std = np.std(all_poses, axis=0)

    print('data mean/std')
    print(str(["{:0.5f}".format(e) for e in pose_mean]).replace("'", ""))
    print(str(["{:0.5f}".format(e) for e in pose_std]).replace("'", ""))

def serialize(data):
    buf = pa.serialize(data).to_buffer()
    return buf

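# Matching deserialize helper, shown here only as a sketch (an assumption,
# not part of the original source): pa.deserialize accepts the Buffer
# returned by serialize() above.
def deserialize(buf):
    return pa.deserialize(buf)
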
def benchmark(procnum, send_end):
    resnet_url = "http://54.87.17.51:1337/resnet101-app/predict"
    inception_url = "http://3.235.164.133:1337/inceptionv3-app/predict"
    predict_url = "http://34.232.48.232:1337/predict-app/predict"
    headers = {"Content-type": "application/json"}
    latencies = []
    post_serial_latencies = []
    resnet_latencies = []
    incept_latencies = []
    # pred_latencies = []
    y = os.listdir("imagenet_sample/imagenet/")
    x = random.choices(y, k=1000)
    count = 0
    for filename in x:
        start = time.time()
        # print(filename)

        # Creating image input
        req_json = json.dumps({
            "input": base64.b64encode(
                open("imagenet_sample/imagenet/" + filename,
                     "rb").read()).decode()
        })
        serial_start = time.time()

        # Calling resnet
        r = requests.post(resnet_url, headers=headers, data=req_json)
        resnet_output = r.json()['output']
        if r.json()['default']:
            print("ERROR", os.getpid())
            return
        resnet_end = time.time()

        # Calling inception
        incept_start = time.time()
        input_bytes = pa.serialize([
            np.asarray(
                Image.open("imagenet_sample/imagenet/" +
                           filename).convert("RGB")),
            resnet_output[1]
        ]).to_buffer().to_pybytes()
        req_json = json.dumps(
            {"input": base64.b64encode(input_bytes).decode()})
        r2 = requests.post(inception_url, headers=headers, data=req_json)
        inception_output = r2.json()['output']
        incept_end = time.time()

        # calling predict
        predict_input = [
            resnet_output[1], inception_output[1],
            float(resnet_output[0]),
            float(inception_output[1])
        ]
        req_json = json.dumps({"input": predict_input})
        r3 = requests.post(predict_url, headers=headers, data=req_json)
        end = time.time()
        incept_end = end

        latency = (end - start)
        # print("'%s', %f ms" % (r.text, latency))
        latencies.append(latency)
        resnet_latencies.append(resnet_end - serial_start)
        incept_latencies.append(incept_end - resnet_end)
        post_serial_latencies.append(end - serial_start)

    send_end.send([
        latencies, post_serial_latencies, count, resnet_latencies,
        incept_latencies
    ])

def test_serialization_determinism():
    for obj in COMPLEX_OBJECTS:
        buf1 = pa.serialize(obj).to_buffer()
        buf2 = pa.serialize(obj).to_buffer()
        assert buf1.to_pybytes() == buf2.to_pybytes()

def _sample_from_clip(self, vid, clip):
    clip_skeleton = clip['skeletons_3d']
    clip_audio = clip['audio_feat']
    clip_audio_raw = clip['audio_raw']
    clip_word_list = clip['words']
    clip_s_f, clip_e_f = clip['start_frame_no'], clip['end_frame_no']
    clip_s_t, clip_e_t = clip['start_time'], clip['end_time']

    n_filtered_out = defaultdict(int)

    # skeleton resampling
    clip_skeleton = utils.data_utils.resample_pose_seq(
        clip_skeleton, clip_e_t - clip_s_t, self.skeleton_resampling_fps)

    # divide
    aux_info = []
    sample_skeletons_list = []
    sample_words_list = []
    sample_audio_list = []
    sample_spectrogram_list = []

    num_subdivision = math.floor(
        (len(clip_skeleton) - self.n_poses)
        / self.subdivision_stride) + 1  # floor((K - (N+M)) / S) + 1
    expected_audio_length = utils.data_utils.calc_spectrogram_length_from_motion_length(
        len(clip_skeleton), self.skeleton_resampling_fps)
    assert abs(expected_audio_length - clip_audio.shape[1]) <= 5, \
        'audio and skeleton lengths are different'

    for i in range(num_subdivision):
        start_idx = i * self.subdivision_stride
        fin_idx = start_idx + self.n_poses

        sample_skeletons = clip_skeleton[start_idx:fin_idx]
        subdivision_start_time = clip_s_t + start_idx / self.skeleton_resampling_fps
        subdivision_end_time = clip_s_t + fin_idx / self.skeleton_resampling_fps
        sample_words = self.get_words_in_time_range(
            word_list=clip_word_list,
            start_time=subdivision_start_time,
            end_time=subdivision_end_time)

        # spectrogram
        audio_start = math.floor(
            start_idx / len(clip_skeleton) * clip_audio.shape[1])
        audio_end = audio_start + self.spectrogram_sample_length
        if audio_end > clip_audio.shape[1]:
            # correct size mismatch between poses and audio
            # logging.info('expanding audio array, audio start={}, end={}, clip_length={}'.format(
            #     audio_start, audio_end, clip_audio.shape[1]))
            n_padding = audio_end - clip_audio.shape[1]
            padded_data = np.pad(clip_audio, ((0, 0), (0, n_padding)),
                                 mode='symmetric')
            sample_spectrogram = padded_data[:, audio_start:audio_end]
        else:
            sample_spectrogram = clip_audio[:, audio_start:audio_end]

        # raw audio
        audio_start = math.floor(
            start_idx / len(clip_skeleton) * len(clip_audio_raw))
        audio_end = audio_start + self.audio_sample_length
        if audio_end > len(clip_audio_raw):
            # correct size mismatch between poses and audio
            # logging.info('expanding audio array, audio start={}, end={}, clip_length={}'.format(
            #     audio_start, audio_end, len(clip_audio_raw)))
            n_padding = audio_end - len(clip_audio_raw)
            padded_data = np.pad(clip_audio_raw, (0, n_padding),
                                 mode='symmetric')
            sample_audio = padded_data[audio_start:audio_end]
        else:
            sample_audio = clip_audio_raw[audio_start:audio_end]

        if len(sample_words) >= 2:
            # filtering motion skeleton data
            sample_skeletons, filtering_message = MotionPreprocessor(
                sample_skeletons, self.mean_pose).get()
            is_correct_motion = (sample_skeletons != [])
            motion_info = {
                'vid': vid,
                'start_frame_no': clip_s_f + start_idx,
                'end_frame_no': clip_s_f + fin_idx,
                'start_time': subdivision_start_time,
                'end_time': subdivision_end_time,
                'is_correct_motion': is_correct_motion,
                'filtering_message': filtering_message
            }

            if is_correct_motion or self.disable_filtering:
                sample_skeletons_list.append(sample_skeletons)
                sample_words_list.append(sample_words)
                sample_audio_list.append(sample_audio)
                sample_spectrogram_list.append(sample_spectrogram)
                aux_info.append(motion_info)
            else:
                n_filtered_out[filtering_message] += 1

    if len(sample_skeletons_list) > 0:
        with self.dst_lmdb_env.begin(write=True) as txn:
            for words, poses, audio, spectrogram, aux in zip(
                    sample_words_list, sample_skeletons_list,
                    sample_audio_list, sample_spectrogram_list, aux_info):
                # preprocessing for poses
                poses = np.asarray(poses)
                dir_vec = utils.data_utils.convert_pose_seq_to_dir_vec(poses)
                normalized_dir_vec = self.normalize_dir_vec(
                    dir_vec, self.mean_dir_vec)

                # save
                k = '{:010}'.format(self.n_out_samples).encode('ascii')
                v = [words, poses, normalized_dir_vec, audio, spectrogram,
                     aux]
                v = pyarrow.serialize(v).to_buffer()
                txn.put(k, v)
                self.n_out_samples += 1

    return n_filtered_out

    train_wav_files, test_wav_files = train_test_split(wav_files,
                                                       train_size=split,
                                                       random_state=1234)
    if 'train' in mode:
        wav_files = train_wav_files
    elif 'test' in mode:
        wav_files = test_wav_files
    else:
        raise NotImplementedError(
            'Other modes are not implemented in split mode! (mode: %s, split: %.4f)'
            % (mode, split))

    return wav_files


# serialize data function
serialize_data = lambda arr: pa.serialize(arr).to_buffer().to_pybytes()
deserialize_data = lambda bin_data: pa.deserialize(bin_data)


def load_arrow_file(file_path):
    with open(file_path, 'rb') as rb:
        return deserialize_data(rb.read())


def normalize_0_1(values, max, min):
    normalized = np.clip((values - min) / (max - min), 0, 1)
    return normalized


def denormalize_0_1(normalized, max, min):
    values = np.clip(normalized, 0, 1) * (max - min) + min
    return values