def _load(self):
        """Download this store's HDF file from S3 and return its contents.

        The object identified by ``self.schema.name`` in ``self.boto_bucket``
        is fetched into a temporary file, which is then read through an
        ``HdfDataStore``. All work happens inside ``temp_file.deleting``
        (presumably this removes the temporary file on exit -- confirm
        against ``temp_file``).

        Returns:
            Whatever ``HdfDataStore._load`` returns for the downloaded file.
        """
        local_copy = temp_file.make_temporary_file()
        with temp_file.deleting(local_copy):
            print('loading from s3')
            load_file_from_s3(self.boto_bucket, self.schema.name, local_copy)

            print('loading from hdf')
            return HdfDataStore(self.schema, local_copy)._load()
Example #2
0
    def _load(self):
        """Download this store's HDF file from S3 and return its contents.

        The object identified by ``self.schema.name`` in ``self.boto_bucket``
        is fetched into a temporary file, which is then read through an
        ``HdfDataStore``. All work happens inside ``temp_file.deleting``
        (presumably this removes the temporary file on exit -- confirm
        against ``temp_file``).

        Returns:
            Whatever ``HdfDataStore._load`` returns for the downloaded file.
        """
        local_copy = temp_file.make_temporary_file()
        with temp_file.deleting(local_copy):
            print('loading from s3')
            load_file_from_s3(self.boto_bucket, self.schema.name, local_copy)

            print('loading from hdf')
            return HdfDataStore(self.schema, local_copy)._load()
    def _store(self, df):
        """Write ``df`` to a temporary HDF file and upload that file to S3.

        ``df`` is first persisted locally through an ``HdfDataStore`` built
        on a fresh temporary path; the resulting file is then sent to
        ``self.boto_bucket`` under ``self.schema.name``. Both steps run
        inside ``temp_file.deleting`` for the temporary path.

        Args:
            df: The data to persist; passed unchanged to
                ``HdfDataStore._store``.
        """
        staging_path = temp_file.make_temporary_file()
        with temp_file.deleting(staging_path):
            print('storing to temp hdf')
            HdfDataStore(self.schema, staging_path)._store(df)

            print('saving to s3')
            store_file_to_s3(self.boto_bucket, self.schema.name, staging_path)
Example #4
0
    def _store(self, df):
        """Write ``df`` to a temporary HDF file and upload that file to S3.

        ``df`` is first persisted locally through an ``HdfDataStore`` built
        on a fresh temporary path; the resulting file is then sent to
        ``self.boto_bucket`` under ``self.schema.name``. Both steps run
        inside ``temp_file.deleting`` for the temporary path.

        Args:
            df: The data to persist; passed unchanged to
                ``HdfDataStore._store``.
        """
        staging_path = temp_file.make_temporary_file()
        with temp_file.deleting(staging_path):
            print('storing to temp hdf')
            HdfDataStore(self.schema, staging_path)._store(df)

            print('saving to s3')
            store_file_to_s3(self.boto_bucket, self.schema.name, staging_path)
Example #5
0
    def _store_chunks(self, chunks):
        """Upload every chunk in ``chunks`` to S3 as its own HDF file.

        Chunks are numbered from 0 in iteration order, and chunk ``n`` is
        uploaded to ``self.boto_bucket`` under ``self._chunk_key(n)``. Each
        chunk gets its own temporary file, written through an
        ``HdfDataStore`` and uploaded inside ``temp_file.deleting``.

        Args:
            chunks: Iterable of chunk objects, each passed unchanged to
                ``HdfDataStore._store``.
        """
        for index, piece in enumerate(chunks):
            # Resolve the key before creating the temp file, as the original did.
            s3_key = self._chunk_key(index)
            staging_path = temp_file.make_temporary_file()

            with temp_file.deleting(staging_path):
                print('storing chunk to temp hdf')
                HdfDataStore(self.schema, staging_path)._store(piece)

                print('saving chunk to s3')
                store_file_to_s3(self.boto_bucket, s3_key, staging_path)
    def _transform(self, data):
        """Transform ``data`` in parallel and return the merged result.

        The work proceeds in four stages:

        1. ``self._group_iter`` splits ``data`` into groups; every non-empty
           group is written to its own temporary ``HdfDataStore``.
        2. ``self.store_chunks_job`` runs on each group store in parallel.
        3. ``self.transform_job`` runs in parallel on every chunk store
           reported by the group stores' ``chunk_stores()``.
        4. Each returned result store is loaded and the pieces are merged
           with ``from_chunks``.

        The group stores and their chunks are deleted on the way out,
        whether the method succeeds or raises.

        Args:
            data: Input data. Must support ``len()`` and
                ``memory_usage(index=True)`` (presumably a pandas
                DataFrame -- confirm against callers).

        Returns:
            The value produced by ``from_chunks`` over the loaded results.
        """
        start = time.time()
        print('transforming data of size',
              data.memory_usage(index=True).sum(), 'bytes')

        hdf_stores = []
        try:
            print('splitting data into large groups')
            # Aim for one group per job; fall back to ``self.chunksize`` when
            # the integer division yields 0 (fewer rows than jobs).
            # NOTE(review): a negative ``n_jobs`` (joblib's "all CPUs"
            # convention) would give a negative group size here -- confirm
            # ``n_jobs`` is always positive.
            group_size = len(data) // self.n_jobs or self.chunksize

            store_chunks_jobs = []
            for group_data in self._group_iter(data, group_size):
                if group_data.empty:
                    continue
                f = temp_file.make_temporary_file()
                hdf_store = HdfDataStore(self.input_schema(), f)
                # Register the store before writing so the cleanup below
                # still sees it if ``store`` raises.
                hdf_stores.append(hdf_store)
                hdf_store.store(group_data)

                store_chunks_jobs.append(
                    joblib.delayed(self.store_chunks_job)(hdf_store))

            print('breaking data into chunks in parallel')
            joblib.Parallel(n_jobs=self.n_jobs)(store_chunks_jobs)

            chunk_stores = chain.from_iterable(store.chunk_stores()
                                               for store in hdf_stores)

            transform_jobs = [
                joblib.delayed(self.transform_job)(chunk_store)
                for chunk_store in chunk_stores
            ]

            print('running transforms in', len(transform_jobs),
                  'parallel jobs')
            result_hdf_stores = joblib.Parallel(
                n_jobs=self.n_jobs)(transform_jobs)

            print('loading and merging the results')
            results = from_chunks(r_hdf_store.load()
                                  for r_hdf_store in result_hdf_stores)
            print('finished merge')
        finally:
            for hdf_store in hdf_stores:
                # Always attempt ``delete()`` even when ``delete_chunks()``
                # raises, so a failed chunk cleanup does not also skip
                # deleting the group store itself.
                try:
                    hdf_store.delete_chunks()
                finally:
                    hdf_store.delete()

        end = time.time()
        print('took', end - start, 'seconds to transform all data in parallel')
        return results
Example #7
0
    def _load_chunks(self):
        """Yield stored chunks from S3 one at a time, in index order.

        Keys are produced by ``self._chunk_key`` for 0, 1, 2, ...; iteration
        ends at the first key that ``key_exists`` reports as absent from
        ``self.boto_bucket``. Each chunk is downloaded to its own temporary
        file and read through an ``HdfDataStore``.

        The ``yield`` sits inside the ``temp_file.deleting`` block, so that
        block is only exited once the consumer resumes or closes the
        generator.

        Yields:
            The value returned by ``HdfDataStore._load`` for each chunk file.
        """
        for s3_key in map(self._chunk_key, count()):
            if not key_exists(self.boto_bucket, s3_key):
                return

            local_copy = temp_file.make_temporary_file()
            with temp_file.deleting(local_copy):
                print('loading from s3')
                load_file_from_s3(self.boto_bucket, s3_key, local_copy)

                print('loading from hdf')
                yield HdfDataStore(self.schema, local_copy)._load()
    def _transform(self, data):
        """Transform ``data`` in parallel and return the merged result.

        The work proceeds in four stages:

        1. ``self._group_iter`` splits ``data`` into groups; every non-empty
           group is written to its own temporary ``HdfDataStore``.
        2. ``self.store_chunks_job`` runs on each group store in parallel.
        3. ``self.transform_job`` runs in parallel on every chunk store
           reported by the group stores' ``chunk_stores()``.
        4. Each returned result store is loaded and the pieces are merged
           with ``from_chunks``.

        The group stores and their chunks are deleted on the way out,
        whether the method succeeds or raises.

        Args:
            data: Input data. Must support ``len()`` and
                ``memory_usage(index=True)`` (presumably a pandas
                DataFrame -- confirm against callers).

        Returns:
            The value produced by ``from_chunks`` over the loaded results.
        """
        start = time.time()
        print("transforming data of size", data.memory_usage(index=True).sum(), "bytes")

        hdf_stores = []
        try:
            print("splitting data into large groups")
            # Aim for one group per job; fall back to ``self.chunksize`` when
            # the integer division yields 0 (fewer rows than jobs).
            # NOTE(review): a negative ``n_jobs`` (joblib's "all CPUs"
            # convention) would give a negative group size here -- confirm
            # ``n_jobs`` is always positive.
            group_size = len(data) // self.n_jobs or self.chunksize

            store_chunks_jobs = []
            for group_data in self._group_iter(data, group_size):
                if group_data.empty:
                    continue
                f = temp_file.make_temporary_file()
                hdf_store = HdfDataStore(self.input_schema(), f)
                # Register the store before writing so the cleanup below
                # still sees it if ``store`` raises.
                hdf_stores.append(hdf_store)
                hdf_store.store(group_data)

                store_chunks_jobs.append(joblib.delayed(self.store_chunks_job)(hdf_store))

            print("breaking data into chunks in parallel")
            joblib.Parallel(n_jobs=self.n_jobs)(store_chunks_jobs)

            chunk_stores = chain.from_iterable(store.chunk_stores() for store in hdf_stores)

            transform_jobs = [joblib.delayed(self.transform_job)(chunk_store) for chunk_store in chunk_stores]

            print("running transforms in", len(transform_jobs), "parallel jobs")
            result_hdf_stores = joblib.Parallel(n_jobs=self.n_jobs)(transform_jobs)

            print("loading and merging the results")
            results = from_chunks(r_hdf_store.load() for r_hdf_store in result_hdf_stores)
            print("finished merge")
        finally:
            for hdf_store in hdf_stores:
                # Always attempt ``delete()`` even when ``delete_chunks()``
                # raises, so a failed chunk cleanup does not also skip
                # deleting the group store itself.
                try:
                    hdf_store.delete_chunks()
                finally:
                    hdf_store.delete()

        end = time.time()
        print("took", end - start, "seconds to transform all data in parallel")
        return results