def transform_job(self, chunk_store):
    """Load one chunk, transform it, and store the result beside the chunk.

    The chunk is read with ``chunk_store.load()``, passed through
    ``self.transform_chunk`` and written to a new ``HdfDataStore`` built from
    ``self.output_schema()`` and ``chunk_store.hdf_file``.  The peak resident
    set size of the process is sampled before and after each stage and a
    summary is printed.

    Args:
        chunk_store: store exposing ``load()`` and an ``hdf_file`` attribute.

    Returns:
        The ``HdfDataStore`` the transformed chunk was written to.
    """
    import sys  # local import: only needed for the platform check below

    # ru_maxrss is reported in bytes on macOS but in kilobytes on Linux, so a
    # fixed divisor of 10**6 only yields megabytes on macOS.
    rss_units_per_mb = 10**6 if sys.platform == 'darwin' else 10**3

    def peak_rss_mb():
        # Collect garbage first, then sample the peak RSS in megabytes.
        # NOTE: ru_maxrss is the *maximum* resident set size, so the deltas
        # printed below show growth of the peak and are never negative.
        gc.collect()
        usage = resource.getrusage(resource.RUSAGE_SELF)
        return usage.ru_maxrss / rss_units_per_mb

    start = peak_rss_mb()

    data = chunk_store.load()
    finished_load = peak_rss_mb()

    result = self.transform_chunk(data)
    finished_transform = peak_rss_mb()

    t_hdf_store = HdfDataStore(self.output_schema(), chunk_store.hdf_file)
    t_hdf_store.store(result)
    finished_store = peak_rss_mb()

    print('started with', start, 'mb, ended with', finished_store,
          'difference =', finished_store - start)
    print('loading used', finished_load - start, 'mb')
    print('transforming used', finished_transform - finished_load, 'mb')
    print('storing used', finished_store - finished_transform, 'mb')
    return t_hdf_store
def df_to_hdf5(file_name, df):
    """Store ``df`` as an HDF file in local storage and return a link to it.

    ``file_name`` gets a ``.hdf`` extension appended when it does not already
    end with one.  The file is written inside the directory returned by
    ``mimic_login.get_local_storage_dir()`` through an ``HdfDataStore`` whose
    schema is ``PartialSchema(<file name without the .hdf extension>)``.

    Args:
        file_name: target file name, with or without the ``.hdf`` extension.
        df: the data to store.

    Returns:
        A ``FileLink`` pointing at the stored file, expressed relative to the
        current working directory.
    """
    if os.path.splitext(file_name)[1] != '.hdf':
        # Append the extension.  os.path.join(file_name, '.hdf') would have
        # produced '<file_name>/.hdf', i.e. a hidden file inside a directory.
        file_name = file_name + '.hdf'
    schema_name = os.path.splitext(file_name)[0]

    local_storage_dir = mimic_login.get_local_storage_dir()
    file_path = os.path.join(local_storage_dir, file_name)

    store = HdfDataStore(PartialSchema(schema_name), file_path)
    store.store(df)
    return FileLink(os.path.relpath(file_path),
                    result_html_prefix='Right-click and save: ')
def df_to_hdf5(file_name, df):
    """Write ``df`` to an HDF file in local storage and return a download link.

    A ``.hdf`` extension is appended to ``file_name`` unless it is already
    present.  The file lives in the directory returned by
    ``mimic_login.get_local_storage_dir()`` and is written through an
    ``HdfDataStore`` whose schema is ``PartialSchema`` of the file name
    without its ``.hdf`` extension.

    Args:
        file_name: target file name, with or without the ``.hdf`` extension.
        df: the data to store.

    Returns:
        A ``FileLink`` to the stored file, relative to the current working
        directory.
    """
    _, extension = os.path.splitext(file_name)
    if extension != '.hdf':
        # Plain concatenation: os.path.join(file_name, '.hdf') treats '.hdf'
        # as a path component and yields '<file_name>/.hdf'.
        file_name = file_name + '.hdf'
    schema_name = os.path.splitext(file_name)[0]

    local_storage_dir = mimic_login.get_local_storage_dir()
    file_path = os.path.join(local_storage_dir, file_name)

    store = HdfDataStore(PartialSchema(schema_name), file_path)
    store.store(df)
    return FileLink(os.path.relpath(file_path),
                    result_html_prefix='Right-click and save: ')
def _transform(self, data):
    """Transform ``data`` in parallel via temporary HDF stores.

    Steps:
      1. Split ``data`` into groups with ``self._group_iter`` and write each
         non-empty group to an ``HdfDataStore`` backed by a temporary file.
      2. Run ``self.store_chunks_job`` on every group store in parallel.
      3. Run ``self.transform_job`` on every chunk store in parallel.
      4. Load the resulting stores and merge them with ``from_chunks``.

    The group stores (and their chunks) are deleted in a ``finally`` clause,
    whether or not the transform succeeded.

    Args:
        data: the data to transform; must support ``len()`` and
            ``memory_usage(index=True)``.

    Returns:
        Whatever ``from_chunks`` builds from the loaded result stores.
    """
    started_at = time.time()
    print('transforming data of size',
          data.memory_usage(index=True).sum(), 'bytes')

    group_stores = []
    try:
        print('splitting data into large groups')
        # Group size falls back to self.chunksize when the integer division
        # yields 0 (fewer rows than jobs).
        for group in self._group_iter(
                data, len(data) // self.n_jobs or self.chunksize):
            if group.empty:
                continue
            tmp_path = temp_file.make_temporary_file()
            group_store = HdfDataStore(self.input_schema(), tmp_path)
            # Register before storing so the finally clause cleans it up even
            # if the write fails.
            group_stores.append(group_store)
            group_store.store(group)

        chunking_jobs = [
            joblib.delayed(self.store_chunks_job)(group_store)
            for group_store in group_stores
        ]
        print('breaking data into chunks in parallel')
        joblib.Parallel(n_jobs=self.n_jobs)(chunking_jobs)

        transform_jobs = [
            joblib.delayed(self.transform_job)(chunk_store)
            for group_store in group_stores
            for chunk_store in group_store.chunk_stores()
        ]
        print('running transforms in', len(transform_jobs), 'parallel jobs')
        result_stores = joblib.Parallel(n_jobs=self.n_jobs)(transform_jobs)

        print('loading and merging the results')
        results = from_chunks(
            result_store.load() for result_store in result_stores)
        print('finished merge')
    finally:
        for group_store in group_stores:
            group_store.delete_chunks()
            group_store.delete()

    finished_at = time.time()
    print('took', finished_at - started_at,
          'seconds to transform all data in parallel')
    return results
def _transform(self, data):
    """Run the transform over ``data`` in parallel using temporary HDF stores.

    ``data`` is split into groups by ``self._group_iter``; each non-empty
    group is written to an ``HdfDataStore`` on a temporary file.  The group
    stores are then chunked in parallel (``self.store_chunks_job``), every
    chunk store is transformed in parallel (``self.transform_job``), and the
    loaded results are merged with ``from_chunks``.

    All group stores and their chunks are deleted in a ``finally`` clause.

    Args:
        data: the data to transform; must support ``len()`` and
            ``memory_usage(index=True)``.

    Returns:
        The merged result produced by ``from_chunks``.
    """
    t0 = time.time()
    print("transforming data of size", data.memory_usage(index=True).sum(), "bytes")

    temp_stores = []
    try:
        print("splitting data into large groups")
        # ``or`` supplies self.chunksize when the division result is 0.
        groups = self._group_iter(data, len(data) // self.n_jobs or self.chunksize)
        for part in groups:
            if part.empty:
                continue
            backing_file = temp_file.make_temporary_file()
            part_store = HdfDataStore(self.input_schema(), backing_file)
            # Track the store before writing so cleanup always covers it.
            temp_stores.append(part_store)
            part_store.store(part)

        split_jobs = [joblib.delayed(self.store_chunks_job)(s) for s in temp_stores]
        print("breaking data into chunks in parallel")
        joblib.Parallel(n_jobs=self.n_jobs)(split_jobs)

        transform_jobs = [
            joblib.delayed(self.transform_job)(chunk)
            for s in temp_stores
            for chunk in s.chunk_stores()
        ]
        print("running transforms in", len(transform_jobs), "parallel jobs")
        transformed_stores = joblib.Parallel(n_jobs=self.n_jobs)(transform_jobs)

        print("loading and merging the results")
        results = from_chunks(ts.load() for ts in transformed_stores)
        print("finished merge")
    finally:
        for s in temp_stores:
            s.delete_chunks()
            s.delete()

    t1 = time.time()
    print("took", t1 - t0, "seconds to transform all data in parallel")
    return results
def transform_job(self, chunk_store):
    """Transform a single stored chunk and write the result to an HDF store.

    Loads the chunk via ``chunk_store.load()``, applies
    ``self.transform_chunk`` and stores the outcome in an ``HdfDataStore``
    created from ``self.output_schema()`` and ``chunk_store.hdf_file``.
    Peak resident set size is sampled around each stage and reported with
    ``print``.

    Args:
        chunk_store: store exposing ``load()`` and an ``hdf_file`` attribute.

    Returns:
        The ``HdfDataStore`` holding the transformed chunk.
    """
    import sys  # local import: only needed for the platform check below

    # ru_maxrss is expressed in bytes on macOS and in kilobytes on Linux;
    # dividing by 10**6 unconditionally mislabels the Linux figures as "mb".
    divisor = 10 ** 6 if sys.platform == "darwin" else 10 ** 3

    def sample_peak_mb():
        # Run the garbage collector, then read the peak RSS in megabytes.
        # NOTE: ru_maxrss is a maximum, so stage deltas reflect growth of the
        # peak only and cannot go negative.
        gc.collect()
        return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / divisor

    start = sample_peak_mb()

    data = chunk_store.load()
    finished_load = sample_peak_mb()

    result = self.transform_chunk(data)
    finished_transform = sample_peak_mb()

    t_hdf_store = HdfDataStore(self.output_schema(), chunk_store.hdf_file)
    t_hdf_store.store(result)
    finished_store = sample_peak_mb()

    print("started with", start, "mb, ended with", finished_store, "difference =", finished_store - start)
    print("loading used", finished_load - start, "mb")
    print("transforming used", finished_transform - finished_load, "mb")
    print("storing used", finished_store - finished_transform, "mb")
    return t_hdf_store
def df_to_hdf5(file_path, df, schema):
    """Store ``df`` in an ``HdfDataStore`` at ``file_path`` and link to it.

    Args:
        file_path: path of the HDF file to write.
        df: the data to store.
        schema: schema passed to ``HdfDataStore``.

    Returns:
        A ``FileLink`` for ``file_path``.
    """
    HdfDataStore(schema, file_path).store(df)
    download_link = FileLink(file_path,
                             result_html_prefix='Right-click and save: ')
    return download_link
def df_to_hdf5(file_path, df, schema):
    """Write ``df`` to ``file_path`` through an ``HdfDataStore``.

    Args:
        file_path: path of the HDF file to write.
        df: the data to store.
        schema: schema handed to ``HdfDataStore``.

    Returns:
        A ``FileLink`` pointing at ``file_path``.
    """
    target_store = HdfDataStore(schema, file_path)
    target_store.store(df)

    link_options = {'result_html_prefix': 'Right-click and save: '}
    return FileLink(file_path, **link_options)
def df_to_hdf5(file_path, df, schema):
    """Store ``df`` in an ``HdfDataStore`` located at ``file_path``.

    Args:
        file_path: path of the HDF file to write.
        df: the data to store.
        schema: schema passed to ``HdfDataStore``.

    Returns:
        None.
    """
    HdfDataStore(schema, file_path).store(df)