def transform_job(self, chunk_store):
    gc.collect()
    start = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 10**6  # sys.getallocatedblocks()
    data = chunk_store.load()
    gc.collect()
    finished_load = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 10**6
    result = self.transform_chunk(data)
    gc.collect()
    finished_transform = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 10**6
    t_hdf_store = HdfDataStore(self.output_schema(), chunk_store.hdf_file)
    t_hdf_store.store(result)
    gc.collect()
    finished_store = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 10**6
    print('started with', start, 'mb, ended with', finished_store,
          'difference =', finished_store - start)
    print('loading used', finished_load - start, 'mb')
    print('transforming used', finished_transform - finished_load, 'mb')
    print('storing used', finished_store - finished_transform, 'mb')
    return t_hdf_store
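# Hedged, self-contained sketch of the memory-accounting pattern above: snapshot ru_maxrss
# around a step and report the delta. Note that ru_maxrss is reported in bytes on macOS but
# in kilobytes on Linux, so dividing by 10**6 only yields megabytes on macOS. The workload
# below is a placeholder.
import gc
import resource

def peak_rss_mb():
    gc.collect()
    return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 10**6

before = peak_rss_mb()
workload = list(range(10**6))  # placeholder workload
after = peak_rss_mb()
print('workload grew peak RSS by', after - before, 'mb')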
def _load(self):
    tmp_path = temp_file.make_temporary_file()
    with temp_file.deleting(tmp_path):
        print('loading from s3')
        load_file_from_s3(self.boto_bucket, self.schema.name, tmp_path)
        print('loading from hdf')
        store = HdfDataStore(self.schema, tmp_path)
        return store._load()
def _store(self, df):
    tmp_path = temp_file.make_temporary_file()
    with temp_file.deleting(tmp_path):
        print('storing to temp hdf')
        store = HdfDataStore(self.schema, tmp_path)
        store._store(df)
        print('saving to s3')
        store_file_to_s3(self.boto_bucket, self.schema.name, tmp_path)
def df_to_hdf5(file_name, df):
    if not os.path.splitext(file_name)[1] == '.hdf':
        file_name = file_name + '.hdf'  # append the extension; os.path.join would insert a path separator
    sp = os.path.splitext(file_name)[0]
    local_storage_dir = mimic_login.get_local_storage_dir()
    file_path = os.path.join(local_storage_dir, file_name)
    store = HdfDataStore(PartialSchema(sp), file_path)
    store.store(df)
    return FileLink(os.path.relpath(file_path),
                    result_html_prefix='Right-click and save: ')
def _store_chunks(self, chunks):
    for i, chunk in enumerate(chunks):
        k = self._chunk_key(i)
        tmp_path = temp_file.make_temporary_file()
        with temp_file.deleting(tmp_path):
            print('storing chunk to temp hdf')
            store = HdfDataStore(self.schema, tmp_path)
            store._store(chunk)
            print('saving chunk to s3')
            store_file_to_s3(self.boto_bucket, k, tmp_path)
def _load_chunks(self):
    for i in count():
        k = self._chunk_key(i)
        if not key_exists(self.boto_bucket, k):
            break
        tmp_path = temp_file.make_temporary_file()
        with temp_file.deleting(tmp_path):
            print('loading from s3')
            load_file_from_s3(self.boto_bucket, k, tmp_path)
            print('loading from hdf')
            store = HdfDataStore(self.schema, tmp_path)
            chunk = store._load()
            yield chunk
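# Hedged usage sketch (not from the source): assuming the S3-backed datastore exposes
# _load_chunks() through a public load_chunks() method (as HdfDataStore does in the script
# further below), the yielded chunks can be concatenated with from_chunks. The bucket name
# 'chatto' and the appointments schema are borrowed from the other snippets here.
import boto
from chatto_transform.datastores.s3_datastore import S3DataStore
from chatto_transform.schema.ss.ss_sql_raw_schema import appointments
from chatto_transform.lib.chunks import from_chunks

conn = boto.connect_s3()
bucket = conn.get_bucket('chatto')
ds = S3DataStore(appointments, bucket)
df = from_chunks(ds.load_chunks())  # each chunk is fetched to a temp hdf file, loaded, then deleted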
def _transform(self, data):
    start = time.time()
    print('transforming data of size', data.memory_usage(index=True).sum(), 'bytes')
    store_chunks_jobs = []
    transform_jobs = []
    hdf_stores = []
    try:
        print('splitting data into large groups')
        group_iter = self._group_iter(data, len(data) // self.n_jobs or self.chunksize)
        for group_data in group_iter:
            if group_data.empty:
                continue
            f = temp_file.make_temporary_file()
            hdf_store = HdfDataStore(self.input_schema(), f)
            hdf_stores.append(hdf_store)
            hdf_store.store(group_data)
            store_chunks_jobs.append(joblib.delayed(self.store_chunks_job)(hdf_store))
            # transform_jobs.append(joblib.delayed(self.transform_job)(hdf_store))
        print('breaking data into chunks in parallel')
        joblib.Parallel(n_jobs=self.n_jobs)(store_chunks_jobs)

        chunk_stores = chain.from_iterable(store.chunk_stores() for store in hdf_stores)
        transform_jobs = [joblib.delayed(self.transform_job)(chunk_store)
                          for chunk_store in chunk_stores]
        print('running transforms in', len(transform_jobs), 'parallel jobs')
        result_hdf_stores = joblib.Parallel(n_jobs=self.n_jobs)(transform_jobs)

        print('loading and merging the results')
        results = from_chunks(r_hdf_store.load() for r_hdf_store in result_hdf_stores)
        print('finished merge')
    finally:
        for hdf_store in hdf_stores:
            hdf_store.delete_chunks()
            hdf_store.delete()
    end = time.time()
    print('took', end - start, 'seconds to transform all data in parallel')
    return results
def load_table(schema, condition=None):
    loader = _get_table_loader(schema, condition)
    local_storage_dir = mimic_login.get_local_storage_dir()
    if local_storage_dir:
        query_f_name = schema.name
        if condition is not None:
            query_f_name += '_' + condition
        query_f_name += '.hdf'
        query_f_name = os.path.join(local_storage_dir, query_f_name)
        cache = HdfDataStore(schema, query_f_name, fixed=True)
        loader = CachingDataStore(schema, loader, cache)
    return loader.load()
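# Hedged usage sketch (not from the source): load_table fetches a table through
# _get_table_loader and, when mimic_login reports a local storage directory, caches the
# result as a fixed-format hdf file named after the schema and condition. The schema and
# condition below are placeholders, and running this needs the database connection that
# _get_table_loader wraps.
from chatto_transform.schema.ss.ss_sql_raw_schema import appointments

df_all = load_table(appointments)                      # cached as <local_storage_dir>/appointments.hdf
df_sub = load_table(appointments, condition='recent')  # cached as <local_storage_dir>/appointments_recent.hdf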
def df_to_hdf5(file_path, df, schema):
    store = HdfDataStore(schema, file_path)
    store.store(df)
    return FileLink(file_path, result_html_prefix='Right-click and save: ')
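# Hedged usage sketch (not from the source): when the FileLink returned by df_to_hdf5 is
# the last expression in an IPython/Jupyter cell, it renders as a clickable download link
# prefixed with 'Right-click and save: '. The DataFrame, path, and column name below are
# placeholders; real data would need to match the columns of the chosen schema.
import pandas as pd
from chatto_transform.schema.ss.ss_sql_raw_schema import appointments

df = pd.DataFrame({'appointment_id': [1, 2, 3]})  # placeholder frame
link = df_to_hdf5('exports/appointments.hdf', df, appointments)
link  # display in the notebook cell to get the download link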
from chatto_transform.config import config
from chatto_transform.schema.ss.ss_sql_raw_schema import appointments
from chatto_transform.datastores.hdf_datastore import HdfDataStore
from chatto_transform.lib.chunks import from_chunks

import time

ds = HdfDataStore(appointments, config.data_dir + 'test.hdf')
chunks = ds.load_chunks()

start = time.time()
df = from_chunks(chunks)
end = time.time()
print('took', end - start, 'seconds to load and concatenate all data')
def load_hdf(file_path, schema):
    store = HdfDataStore(schema, file_path)
    df = store.load()
    return df
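# Hedged round-trip sketch (not from the source): store a frame with df_to_hdf5 and read
# it back with load_hdf using the same path and schema. The path, column name, and data
# are placeholders; a real frame would need to match the schema's columns.
import pandas as pd
from chatto_transform.schema.ss.ss_sql_raw_schema import appointments

original = pd.DataFrame({'appointment_id': [1, 2, 3]})  # placeholder frame
df_to_hdf5('/tmp/appointments_roundtrip.hdf', original, appointments)
restored = load_hdf('/tmp/appointments_roundtrip.hdf', appointments)
assert len(restored) == len(original)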
import boto

from chatto_transform.datastores.s3_datastore import S3DataStore
from chatto_transform.datastores.hdf_datastore import HdfDataStore
from chatto_transform.schema.ss.ss_sql_raw_schema import appointments

conn = boto.connect_s3()
bucket = conn.get_bucket('chatto')

print('loading data from hdf')
data = HdfDataStore(appointments, '/Users/dan/dev/data/test.hdf_chunk_0').load()

ds = S3DataStore(appointments, bucket)
ds.store(data)
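# Hedged follow-up sketch (not from the source): reading the same table back from S3.
# S3DataStore.load() is assumed here to be the public counterpart of the _load() helper
# shown earlier; these snippets do not confirm that method name.
roundtrip = S3DataStore(appointments, bucket).load()
print('loaded', len(roundtrip), 'rows from s3')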
def df_to_hdf5(file_path, df, schema):
    store = HdfDataStore(schema, file_path)
    store.store(df)