def transform_job(self, chunk_store):
    """Load one chunk, transform it, store the result and report memory use.

    The chunk is loaded from ``chunk_store``, passed through
    ``self.transform_chunk`` and stored through an ``HdfDataStore`` built
    from ``self.output_schema()`` and ``chunk_store.hdf_file``.  After each
    step the process's peak resident set size is sampled and the deltas
    are printed.

    Note that ``ru_maxrss`` is a high-water mark: the printed deltas show
    how much each step raised the peak, not how much memory is currently
    held, and they can never be negative.

    Args:
        chunk_store: Object providing ``load()`` and an ``hdf_file``
            attribute.

    Returns:
        The ``HdfDataStore`` the transformed chunk was stored through.
    """
    import sys  # local import: only needed for the platform check below

    def peak_rss_mb():
        # getrusage reports ru_maxrss in bytes on macOS but in kilobytes
        # on Linux and the BSDs; normalise to bytes before converting so
        # the "mb" label in the report is correct on every platform.
        peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        if sys.platform != 'darwin':
            peak *= 1024
        return peak / 10**6

    gc.collect()
    start = peak_rss_mb()

    data = chunk_store.load()
    gc.collect()
    finished_load = peak_rss_mb()

    result = self.transform_chunk(data)
    gc.collect()
    finished_transform = peak_rss_mb()

    t_hdf_store = HdfDataStore(self.output_schema(), chunk_store.hdf_file)
    t_hdf_store.store(result)
    gc.collect()
    finished_store = peak_rss_mb()

    print('started with', start, 'mb, ended with', finished_store,
          'difference =', finished_store - start)
    print('loading used', finished_load - start, 'mb')
    print('transforming used', finished_transform - finished_load, 'mb')
    print('storing used', finished_store - finished_transform, 'mb')
    return t_hdf_store
def df_to_hdf5(file_name, df):
    """Store ``df`` in an HDF file in the local storage directory.

    Args:
        file_name: Name of the target file.  ``.hdf`` is appended when the
            name does not already end with that extension.
        df: The data to store; it is handed to ``HdfDataStore.store``.

    Returns:
        A ``FileLink`` pointing at the written file, expressed relative to
        the current working directory.
    """
    if os.path.splitext(file_name)[1] != '.hdf':
        # Append the extension.  os.path.join() would instead produce
        # "<file_name>/.hdf", i.e. a hidden file inside a sub-directory.
        file_name = file_name + '.hdf'

    # The schema is named after the file name without its extension.
    schema_name = os.path.splitext(file_name)[0]
    local_storage_dir = mimic_login.get_local_storage_dir()
    file_path = os.path.join(local_storage_dir, file_name)

    store = HdfDataStore(PartialSchema(schema_name), file_path)
    store.store(df)
    return FileLink(os.path.relpath(file_path), result_html_prefix='Right-click and save: ')
Example #3
0
def df_to_hdf5(file_name, df):
    """Write ``df`` to ``<local storage dir>/<file_name>`` and link to it.

    Args:
        file_name: Target file name; the ``.hdf`` extension is added when
            it is missing.
        df: The data passed to ``HdfDataStore.store``.

    Returns:
        A ``FileLink`` to the stored file, using a path relative to the
        current working directory.
    """
    _, extension = os.path.splitext(file_name)
    if extension != '.hdf':
        # Plain concatenation is required here: os.path.join(name, '.hdf')
        # builds the path "name/.hdf" rather than "name.hdf".
        file_name += '.hdf'

    # Schema name is the file name with the ".hdf" extension stripped.
    sp = os.path.splitext(file_name)[0]
    local_storage_dir = mimic_login.get_local_storage_dir()
    file_path = os.path.join(local_storage_dir, file_name)

    store = HdfDataStore(PartialSchema(sp), file_path)
    store.store(df)
    return FileLink(os.path.relpath(file_path),
                    result_html_prefix='Right-click and save: ')
    def _transform(self, data):
        """Transform ``data`` in parallel, staging it through temporary stores.

        The data is split into groups, each non-empty group is written to
        its own temporary ``HdfDataStore``, the stores are broken into
        chunks by parallel ``store_chunks_job`` calls, every chunk is run
        through ``transform_job`` in parallel, and the per-chunk results
        are loaded and merged with ``from_chunks``.  The temporary stores
        and their chunks are deleted even when one of the steps fails.

        Args:
            data: Input to transform.  It must support ``len()`` and
                ``memory_usage(index=True)`` (presumably a pandas
                DataFrame -- confirm against callers).

        Returns:
            Whatever ``from_chunks`` builds from the loaded chunk results.
        """
        started_at = time.time()
        print('transforming data of size',
              data.memory_usage(index=True).sum(), 'bytes')

        group_stores = []
        chunking_jobs = []
        try:
            print('splitting data into large groups')
            # Fall back to the configured chunk size when the integer
            # division yields 0 (fewer rows than jobs).
            group_size = len(data) // self.n_jobs or self.chunksize

            for group in self._group_iter(data, group_size):
                if not group.empty:
                    scratch_file = temp_file.make_temporary_file()
                    group_store = HdfDataStore(self.input_schema(),
                                               scratch_file)
                    # Register the store before writing to it so that the
                    # cleanup below covers it even if store() fails.
                    group_stores.append(group_store)
                    group_store.store(group)
                    chunking_jobs.append(
                        joblib.delayed(self.store_chunks_job)(group_store))

            print('breaking data into chunks in parallel')
            joblib.Parallel(n_jobs=self.n_jobs)(chunking_jobs)

            # One transform job per chunk, in store order.
            transform_jobs = [
                joblib.delayed(self.transform_job)(chunk_store)
                for group_store in group_stores
                for chunk_store in group_store.chunk_stores()
            ]

            print('running transforms in', len(transform_jobs),
                  'parallel jobs')
            run_transforms = joblib.Parallel(n_jobs=self.n_jobs)
            result_hdf_stores = run_transforms(transform_jobs)

            print('loading and merging the results')
            results = from_chunks(
                result_store.load() for result_store in result_hdf_stores)
            print('finished merge')
        finally:
            for group_store in group_stores:
                group_store.delete_chunks()
                group_store.delete()

        print('took', time.time() - started_at,
              'seconds to transform all data in parallel')
        return results
    def _transform(self, data):
        """Run the chunk transform over ``data`` using parallel jobs.

        Steps, in order:

        1. Split ``data`` into groups via ``self._group_iter`` and write
           each non-empty group to a temporary ``HdfDataStore``.
        2. Run ``self.store_chunks_job`` on every store in parallel.
        3. Run ``self.transform_job`` on every chunk store in parallel.
        4. Load every result store and merge them with ``from_chunks``.

        The temporary stores and their chunks are always deleted, whether
        or not the steps above succeed.

        Args:
            data: Input to transform; needs ``len()`` and
                ``memory_usage(index=True)`` (presumably a pandas
                DataFrame -- confirm against callers).

        Returns:
            The merged result produced by ``from_chunks``.
        """
        t0 = time.time()
        size_in_bytes = data.memory_usage(index=True).sum()
        print("transforming data of size", size_in_bytes, "bytes")

        pending_chunk_jobs = []
        temp_stores = []
        try:
            print("splitting data into large groups")
            # ``or`` supplies self.chunksize when the division result is 0.
            rows_per_group = len(data) // self.n_jobs or self.chunksize
            groups = self._group_iter(data, rows_per_group)

            for group_df in groups:
                if group_df.empty:
                    continue
                tmp_path = temp_file.make_temporary_file()
                tmp_store = HdfDataStore(self.input_schema(), tmp_path)
                # Track the store first so it is cleaned up on failure too.
                temp_stores.append(tmp_store)
                tmp_store.store(group_df)
                pending_chunk_jobs.append(joblib.delayed(self.store_chunks_job)(tmp_store))

            print("breaking data into chunks in parallel")
            joblib.Parallel(n_jobs=self.n_jobs)(pending_chunk_jobs)

            all_chunk_stores = chain.from_iterable(s.chunk_stores() for s in temp_stores)
            jobs = []
            for chunk_store in all_chunk_stores:
                jobs.append(joblib.delayed(self.transform_job)(chunk_store))

            print("running transforms in", len(jobs), "parallel jobs")
            result_hdf_stores = joblib.Parallel(n_jobs=self.n_jobs)(jobs)

            print("loading and merging the results")
            loaded_chunks = (out_store.load() for out_store in result_hdf_stores)
            results = from_chunks(loaded_chunks)
            print("finished merge")
        finally:
            for tmp_store in temp_stores:
                tmp_store.delete_chunks()
                tmp_store.delete()

        elapsed = time.time() - t0
        print("took", elapsed, "seconds to transform all data in parallel")
        return results
    def transform_job(self, chunk_store):
        """Transform a single chunk and print how much each step raised peak memory.

        The chunk is read with ``chunk_store.load()``, transformed by
        ``self.transform_chunk`` and stored through an ``HdfDataStore``
        created from ``self.output_schema()`` and ``chunk_store.hdf_file``.

        Memory figures come from ``ru_maxrss``, which is the process's
        peak resident set size.  Because it is a high-water mark, each
        printed delta is the amount by which a step pushed the peak up; it
        does not drop when memory is released.

        Args:
            chunk_store: Object exposing ``load()`` and ``hdf_file``.

        Returns:
            The ``HdfDataStore`` the transformed chunk was stored through.
        """
        import sys  # local import: only used for the platform check below

        def peak_rss_mb():
            # ru_maxrss is in bytes on macOS and in kilobytes on Linux and
            # the BSDs.  Convert to bytes first so that the value printed
            # with the "mb" label really is megabytes everywhere.
            rss = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
            rss_bytes = rss if sys.platform == "darwin" else rss * 1024
            return rss_bytes / 10 ** 6

        gc.collect()
        start = peak_rss_mb()

        data = chunk_store.load()
        gc.collect()
        finished_load = peak_rss_mb()

        result = self.transform_chunk(data)
        gc.collect()
        finished_transform = peak_rss_mb()

        t_hdf_store = HdfDataStore(self.output_schema(), chunk_store.hdf_file)
        t_hdf_store.store(result)
        gc.collect()
        finished_store = peak_rss_mb()

        print("started with", start, "mb, ended with", finished_store, "difference =", finished_store - start)
        print("loading used", finished_load - start, "mb")
        print("transforming used", finished_transform - finished_load, "mb")
        print("storing used", finished_store - finished_transform, "mb")
        return t_hdf_store
Example #7
0
def df_to_hdf5(file_path, df, schema):
    """Store ``df`` at ``file_path`` and return a link to that file.

    Args:
        file_path: Path handed to ``HdfDataStore`` and to the returned link.
        df: The data passed to ``HdfDataStore.store``.
        schema: Schema object the ``HdfDataStore`` is constructed with.

    Returns:
        A ``FileLink`` for ``file_path``.
    """
    hdf_store = HdfDataStore(schema, file_path)
    hdf_store.store(df)

    link_prefix = 'Right-click and save: '
    return FileLink(file_path, result_html_prefix=link_prefix)
def df_to_hdf5(file_path, df, schema):
    """Write ``df`` through an ``HdfDataStore`` and link to the result.

    Args:
        file_path: Location given to the store; also the link target.
        df: Data to store.
        schema: Schema used to construct the ``HdfDataStore``.

    Returns:
        A ``FileLink`` pointing at ``file_path``.
    """
    target_store = HdfDataStore(schema, file_path)
    target_store.store(df)
    download_link = FileLink(
        file_path,
        result_html_prefix='Right-click and save: ',
    )
    return download_link
def df_to_hdf5(file_path, df, schema):
    store = HdfDataStore(schema, file_path)
    store.store(df)