def _load(self):
    # Pull the serialized frame down from S3 into a scratch file, then
    # hand off to an HdfDataStore to deserialize it.
    tmp_path = temp_file.make_temporary_file()
    with temp_file.deleting(tmp_path):
        print('loading from s3')
        load_file_from_s3(self.boto_bucket, self.schema.name, tmp_path)
        print('loading from hdf')
        store = HdfDataStore(self.schema, tmp_path)
        return store._load()
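# The S3 helpers used here (load_file_from_s3, store_file_to_s3, and
# key_exists below) are defined outside this excerpt. A minimal sketch of
# what they might look like, assuming the boto 2 API (the real helpers
# may differ):
#
#     from boto.s3.key import Key
#
#     def store_file_to_s3(bucket, key_name, path):
#         # Upload a local file to the given key in the bucket.
#         Key(bucket, key_name).set_contents_from_filename(path)
#
#     def load_file_from_s3(bucket, key_name, path):
#         # Download the key's contents into a local file.
#         bucket.get_key(key_name).get_contents_to_filename(path)
#
#     def key_exists(bucket, key_name):
#         # get_key returns None when the key is absent.
#         return bucket.get_key(key_name) is not None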
def _store(self, df):
    # Serialize the frame to a scratch HDF file, then upload that file
    # to S3 under the schema's name.
    tmp_path = temp_file.make_temporary_file()
    with temp_file.deleting(tmp_path):
        print('storing to temp hdf')
        store = HdfDataStore(self.schema, tmp_path)
        store._store(df)
        print('saving to s3')
        store_file_to_s3(self.boto_bucket, self.schema.name, tmp_path)
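# temp_file is a small scratch-file helper module. A minimal sketch of
# the two functions used throughout this class, assuming plain
# tempfile/os semantics (the real module may behave differently):
#
#     import contextlib
#     import os
#     import tempfile
#
#     def make_temporary_file():
#         # Create a named file and return its path; caller owns cleanup.
#         fd, path = tempfile.mkstemp()
#         os.close(fd)
#         return path
#
#     @contextlib.contextmanager
#     def deleting(path):
#         # Remove the file when the block exits, even on error.
#         try:
#             yield path
#         finally:
#             if os.path.exists(path):
#                 os.remove(path)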
def _store_chunks(self, chunks):
    # Write each chunk to its own sequentially numbered S3 key so that
    # _load_chunks can stream them back in order.
    for i, chunk in enumerate(chunks):
        k = self._chunk_key(i)
        tmp_path = temp_file.make_temporary_file()
        with temp_file.deleting(tmp_path):
            print('storing chunk to temp hdf')
            store = HdfDataStore(self.schema, tmp_path)
            store._store(chunk)
            print('saving chunk to s3')
            store_file_to_s3(self.boto_bucket, k, tmp_path)
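# _chunk_key is not shown in this excerpt. A hypothetical sketch: any
# naming scheme works as long as keys are sequential, because
# _load_chunks probes keys 0, 1, 2, ... until one is missing.
#
#     def _chunk_key(self, i):
#         # Hypothetical; the real key format may differ.
#         return '%s.chunk.%d' % (self.schema.name, i)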
def _transform(self, data):
    start = time.time()
    print('transforming data of size',
          data.memory_usage(index=True).sum(), 'bytes')
    store_chunks_jobs = []
    transform_jobs = []
    hdf_stores = []
    try:
        # Phase 1: split the frame into one large group per worker and
        # spill each group to its own temporary HDF store.
        print('splitting data into large groups')
        group_iter = self._group_iter(
            data, len(data) // self.n_jobs or self.chunksize)
        for group_data in group_iter:
            if group_data.empty:
                continue
            f = temp_file.make_temporary_file()
            hdf_store = HdfDataStore(self.input_schema(), f)
            hdf_stores.append(hdf_store)
            hdf_store.store(group_data)
            store_chunks_jobs.append(
                joblib.delayed(self.store_chunks_job)(hdf_store))
            # transform_jobs.append(joblib.delayed(self.transform_job)(hdf_store))
        # Phase 2: break each group into chunks, in parallel.
        print('breaking data into chunks in parallel')
        joblib.Parallel(n_jobs=self.n_jobs)(store_chunks_jobs)
        chunk_stores = chain.from_iterable(
            store.chunk_stores() for store in hdf_stores)
        # Phase 3: transform every chunk in parallel; results come back
        # as HDF stores that are loaded and merged afterwards.
        transform_jobs = [
            joblib.delayed(self.transform_job)(chunk_store)
            for chunk_store in chunk_stores
        ]
        print('running transforms in', len(transform_jobs), 'parallel jobs')
        result_hdf_stores = joblib.Parallel(
            n_jobs=self.n_jobs)(transform_jobs)
        print('loading and merging the results')
        results = from_chunks(
            r_hdf_store.load() for r_hdf_store in result_hdf_stores)
        print('finished merge')
    finally:
        # Always clean up the temporary stores, even if a job failed.
        for hdf_store in hdf_stores:
            hdf_store.delete_chunks()
            hdf_store.delete()
    end = time.time()
    print('took', end - start, 'seconds to transform all data in parallel')
    return results
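# For reference, the joblib idiom _transform builds on: delayed(f)(*args)
# records the call without executing it, and Parallel runs the recorded
# calls across worker processes, returning results in input order. A toy
# example (illustrative, not part of this module):
#
#     import joblib
#     jobs = [joblib.delayed(pow)(2, n) for n in range(4)]
#     print(joblib.Parallel(n_jobs=2)(jobs))  # [1, 2, 4, 8]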
def _load_chunks(self):
    # Probe sequential chunk keys until one is missing, downloading and
    # deserializing each chunk lazily as the caller iterates.
    for i in count():
        k = self._chunk_key(i)
        if not key_exists(self.boto_bucket, k):
            break
        tmp_path = temp_file.make_temporary_file()
        with temp_file.deleting(tmp_path):
            print('loading from s3')
            load_file_from_s3(self.boto_bucket, k, tmp_path)
            print('loading from hdf')
            store = HdfDataStore(self.schema, tmp_path)
            chunk = store._load()
            yield chunk
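# Illustrative round trip through the chunked interface (hypothetical
# driver code; the class constructor is outside this excerpt):
#
#     chunks = (df.iloc[i:i + size] for i in range(0, len(df), size))
#     store._store_chunks(chunks)
#     df_again = pd.concat(store._load_chunks())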