def transform_job(self, chunk_store):
        """Transform one stored chunk and print how much peak memory each stage added.

        The chunk is loaded from ``chunk_store``, passed through
        ``self.transform_chunk`` and written to an ``HdfDataStore`` built from
        ``self.output_schema()`` and ``chunk_store.hdf_file``.

        Memory figures come from ``ru_maxrss``, the process's maximum resident
        set size. That is a high-water mark, so the printed differences show
        how far the peak grew during a stage, not how much memory is still
        held afterwards.

        Args:
            chunk_store: Store providing ``load()`` and an ``hdf_file``
                attribute for the chunk to transform.

        Returns:
            The ``HdfDataStore`` the transformed chunk was written to.
        """
        # Function-scope import so this block does not depend on a file-level
        # ``import sys``.
        import sys

        # ru_maxrss is reported in bytes on macOS but in kilobytes on Linux.
        # Normalise to bytes first so the 'mb' labels below are right on both.
        bytes_per_unit = 1 if sys.platform == 'darwin' else 1024

        def peak_rss_mb():
            # Collect garbage before every reading, as each stage boundary
            # should be measured after unreachable objects are released.
            gc.collect()
            usage = resource.getrusage(resource.RUSAGE_SELF)
            return usage.ru_maxrss * bytes_per_unit / 10**6

        start = peak_rss_mb()

        data = chunk_store.load()
        finished_load = peak_rss_mb()

        result = self.transform_chunk(data)
        finished_transform = peak_rss_mb()

        t_hdf_store = HdfDataStore(self.output_schema(), chunk_store.hdf_file)
        t_hdf_store.store(result)
        finished_store = peak_rss_mb()

        print('started with', start, 'mb, ended with', finished_store,
              'difference =', finished_store - start)
        print('loading used', finished_load - start, 'mb')
        print('transforming used', finished_transform - finished_load, 'mb')
        print('storing used', finished_store - finished_transform, 'mb')
        return t_hdf_store
Example #2
0
    def _load(self):
        """Fetch this schema's HDF file from S3 and return its loaded contents.

        The object stored under ``self.schema.name`` in ``self.boto_bucket``
        is downloaded to a temporary file and read through an
        ``HdfDataStore``. Both steps run inside ``temp_file.deleting``
        (presumably removing the temporary file on exit -- confirm).
        """
        local_copy = temp_file.make_temporary_file()
        with temp_file.deleting(local_copy):
            print('loading from s3')
            load_file_from_s3(self.boto_bucket, self.schema.name, local_copy)

            print('loading from hdf')
            return HdfDataStore(self.schema, local_copy)._load()
    def _store(self, df):
        """Write ``df`` to a temporary HDF file and upload that file to S3.

        The upload uses ``self.schema.name`` as the key in
        ``self.boto_bucket``. Both steps run inside ``temp_file.deleting``
        (presumably removing the temporary file on exit -- confirm).
        """
        staging_path = temp_file.make_temporary_file()
        with temp_file.deleting(staging_path):
            print('storing to temp hdf')
            HdfDataStore(self.schema, staging_path)._store(df)

            print('saving to s3')
            store_file_to_s3(self.boto_bucket, self.schema.name, staging_path)
Example #4
0
    def _store(self, df):
        """Write ``df`` to a temporary HDF file and upload that file to S3.

        The upload uses ``self.schema.name`` as the key in
        ``self.boto_bucket``. Both steps run inside ``temp_file.deleting``
        (presumably removing the temporary file on exit -- confirm).
        """
        staging_path = temp_file.make_temporary_file()
        with temp_file.deleting(staging_path):
            print('storing to temp hdf')
            HdfDataStore(self.schema, staging_path)._store(df)

            print('saving to s3')
            store_file_to_s3(self.boto_bucket, self.schema.name, staging_path)
    def _load(self):
        """Fetch this schema's HDF file from S3 and return its loaded contents.

        The object stored under ``self.schema.name`` in ``self.boto_bucket``
        is downloaded to a temporary file and read through an
        ``HdfDataStore``. Both steps run inside ``temp_file.deleting``
        (presumably removing the temporary file on exit -- confirm).
        """
        local_copy = temp_file.make_temporary_file()
        with temp_file.deleting(local_copy):
            print('loading from s3')
            load_file_from_s3(self.boto_bucket, self.schema.name, local_copy)

            print('loading from hdf')
            return HdfDataStore(self.schema, local_copy)._load()
def df_to_hdf5(file_name, df):
    """Save ``df`` as an HDF file in the local storage directory and link to it.

    Args:
        file_name: Name of the target file. ``.hdf`` is appended when the
            name does not already carry that extension.
        df: Data handed to ``HdfDataStore.store``.

    Returns:
        A ``FileLink`` to the written file, with the path made relative to
        the current working directory.
    """
    # Append the extension to the name itself. os.path.join(file_name, '.hdf')
    # would produce "<file_name>/.hdf" -- a hidden file inside a directory
    # named after the table -- and the splitext below would then find no
    # extension to strip.
    if os.path.splitext(file_name)[1] != '.hdf':
        file_name += '.hdf'

    # The schema is named after the file name without its extension.
    schema_name = os.path.splitext(file_name)[0]

    # NOTE(review): load_table treats get_local_storage_dir() as possibly
    # falsy; a None here would make os.path.join raise TypeError -- confirm
    # callers only use this when local storage is configured.
    local_storage_dir = mimic_login.get_local_storage_dir()
    file_path = os.path.join(local_storage_dir, file_name)

    store = HdfDataStore(PartialSchema(schema_name), file_path)
    store.store(df)
    return FileLink(os.path.relpath(file_path), result_html_prefix='Right-click and save: ')
Example #7
0
def df_to_hdf5(file_name, df):
    """Save ``df`` as an HDF file in the local storage directory and link to it.

    Args:
        file_name: Name of the target file. ``.hdf`` is appended when the
            name does not already carry that extension.
        df: Data handed to ``HdfDataStore.store``.

    Returns:
        A ``FileLink`` to the written file, with the path made relative to
        the current working directory.
    """
    # Append the extension to the name itself. os.path.join(file_name, '.hdf')
    # would produce "<file_name>/.hdf" -- a hidden file inside a directory
    # named after the table -- and the splitext below would then find no
    # extension to strip.
    if os.path.splitext(file_name)[1] != '.hdf':
        file_name += '.hdf'

    # The schema is named after the file name without its extension.
    schema_name = os.path.splitext(file_name)[0]

    # NOTE(review): load_table treats get_local_storage_dir() as possibly
    # falsy; a None here would make os.path.join raise TypeError -- confirm
    # callers only use this when local storage is configured.
    local_storage_dir = mimic_login.get_local_storage_dir()
    file_path = os.path.join(local_storage_dir, file_name)

    store = HdfDataStore(PartialSchema(schema_name), file_path)
    store.store(df)
    return FileLink(os.path.relpath(file_path),
                    result_html_prefix='Right-click and save: ')
Example #8
0
    def _store_chunks(self, chunks):
        """Upload each chunk to S3 under its own key.

        For the chunk at position ``i`` the key is ``self._chunk_key(i)``.
        Every chunk is first written to a fresh temporary HDF file, which is
        then uploaded to ``self.boto_bucket``.
        """
        for position, chunk in enumerate(chunks):
            chunk_key = self._chunk_key(position)

            staging_path = temp_file.make_temporary_file()
            with temp_file.deleting(staging_path):
                print('storing chunk to temp hdf')
                HdfDataStore(self.schema, staging_path)._store(chunk)

                print('saving chunk to s3')
                store_file_to_s3(self.boto_bucket, chunk_key, staging_path)
Example #9
0
    def _load_chunks(self):
        """Yield the chunks stored in S3 one at a time, in index order.

        Keys are ``self._chunk_key(0)``, ``self._chunk_key(1)``, ... and
        iteration stops at the first key for which ``key_exists`` is false.
        Each existing key is downloaded to a temporary file, read through an
        ``HdfDataStore`` and yielded.
        """
        for position in count():
            chunk_key = self._chunk_key(position)
            if not key_exists(self.boto_bucket, chunk_key):
                # The first missing key marks the end of the stored chunks.
                return

            local_copy = temp_file.make_temporary_file()
            with temp_file.deleting(local_copy):
                print('loading from s3')
                load_file_from_s3(self.boto_bucket, chunk_key, local_copy)

                print('loading from hdf')
                # The yield stays inside the context, so the context is only
                # exited once the consumer resumes (or closes) the generator.
                yield HdfDataStore(self.schema, local_copy)._load()
    def _transform(self, data):
        """Transform ``data`` in parallel through temporary HDF stores and merge.

        Steps, in order:
          1. Split ``data`` with ``self._group_iter`` and write every
             non-empty group to its own temporary ``HdfDataStore``.
          2. Run ``self.store_chunks_job`` on each group store in parallel.
          3. Run ``self.transform_job`` on every chunk store in parallel.
          4. Load each result store and combine them with ``from_chunks``.

        Whether or not a step fails, ``delete_chunks()`` and ``delete()`` are
        called on every group store that was created. Progress and the total
        elapsed time are printed.

        Returns:
            Whatever ``from_chunks`` produces from the loaded results.
        """
        started_at = time.time()
        print('transforming data of size',
              data.memory_usage(index=True).sum(), 'bytes')

        group_stores = []
        try:
            print('splitting data into large groups')
            # One group per job; falls back to self.chunksize when the
            # integer division comes out as 0.
            group_size = len(data) // self.n_jobs or self.chunksize

            for group in self._group_iter(data, group_size):
                if group.empty:
                    continue
                group_store = HdfDataStore(self.input_schema(),
                                           temp_file.make_temporary_file())
                # Register before writing so the cleanup below still sees
                # this store if store() raises.
                group_stores.append(group_store)
                group_store.store(group)

            print('breaking data into chunks in parallel')
            chunking_jobs = [
                joblib.delayed(self.store_chunks_job)(group_store)
                for group_store in group_stores
            ]
            joblib.Parallel(n_jobs=self.n_jobs)(chunking_jobs)

            transform_jobs = [
                joblib.delayed(self.transform_job)(chunk_store)
                for group_store in group_stores
                for chunk_store in group_store.chunk_stores()
            ]

            print('running transforms in', len(transform_jobs),
                  'parallel jobs')
            result_stores = joblib.Parallel(
                n_jobs=self.n_jobs)(transform_jobs)

            print('loading and merging the results')
            # Pass a generator so results are loaded as from_chunks asks
            # for them.
            results = from_chunks(result_store.load()
                                  for result_store in result_stores)
            print('finished merge')
        finally:
            for group_store in group_stores:
                group_store.delete_chunks()
                group_store.delete()

        print('took', time.time() - started_at,
              'seconds to transform all data in parallel')
        return results
Example #11
0
def load_table(schema, condition=None):
    """Load the table for ``schema``, going through a local HDF cache if possible.

    When ``mimic_login.get_local_storage_dir()`` returns a truthy directory,
    the table loader is wrapped in a ``CachingDataStore`` backed by an
    ``HdfDataStore`` file named ``<schema.name>[_<condition>].hdf`` in that
    directory. Otherwise the table loader is used directly.

    Args:
        schema: Schema of the table to load; its ``name`` forms the cache
            file name.
        condition: Optional string passed to the table loader and appended to
            the cache file name.

    Returns:
        The result of ``load()`` on the chosen loader.
    """
    loader = _get_table_loader(schema, condition)

    local_storage_dir = mimic_login.get_local_storage_dir()
    if not local_storage_dir:
        # No local storage configured: nothing to cache into.
        return loader.load()

    name_parts = [schema.name]
    if condition is not None:
        name_parts.append('_' + condition)
    name_parts.append('.hdf')
    cache_path = os.path.join(local_storage_dir, ''.join(name_parts))

    cache = HdfDataStore(schema, cache_path, fixed=True)
    return CachingDataStore(schema, loader, cache).load()
    def transform_job(self, chunk_store):
        """Transform one stored chunk and print how much peak memory each stage added.

        The chunk is loaded from ``chunk_store``, passed through
        ``self.transform_chunk`` and written to an ``HdfDataStore`` built from
        ``self.output_schema()`` and ``chunk_store.hdf_file``.

        Memory figures come from ``ru_maxrss``, the process's maximum resident
        set size. That is a high-water mark, so the printed differences show
        how far the peak grew during a stage, not how much memory is still
        held afterwards.

        Args:
            chunk_store: Store providing ``load()`` and an ``hdf_file``
                attribute for the chunk to transform.

        Returns:
            The ``HdfDataStore`` the transformed chunk was written to.
        """
        # Function-scope import so this block does not depend on a file-level
        # ``import sys``.
        import sys

        # ru_maxrss is reported in bytes on macOS but in kilobytes on Linux.
        # Normalise to bytes first so the "mb" labels below are right on both.
        bytes_per_unit = 1 if sys.platform == "darwin" else 1024

        def peak_rss_mb():
            # Collect garbage before every reading, as each stage boundary
            # should be measured after unreachable objects are released.
            gc.collect()
            usage = resource.getrusage(resource.RUSAGE_SELF)
            return usage.ru_maxrss * bytes_per_unit / 10 ** 6

        start = peak_rss_mb()

        data = chunk_store.load()
        finished_load = peak_rss_mb()

        result = self.transform_chunk(data)
        finished_transform = peak_rss_mb()

        t_hdf_store = HdfDataStore(self.output_schema(), chunk_store.hdf_file)
        t_hdf_store.store(result)
        finished_store = peak_rss_mb()

        print("started with", start, "mb, ended with", finished_store, "difference =", finished_store - start)
        print("loading used", finished_load - start, "mb")
        print("transforming used", finished_transform - finished_load, "mb")
        print("storing used", finished_store - finished_transform, "mb")
        return t_hdf_store
    def _transform(self, data):
        """Transform ``data`` in parallel through temporary HDF stores and merge.

        Steps, in order:
          1. Split ``data`` with ``self._group_iter`` and write every
             non-empty group to its own temporary ``HdfDataStore``.
          2. Run ``self.store_chunks_job`` on each group store in parallel.
          3. Run ``self.transform_job`` on every chunk store in parallel.
          4. Load each result store and combine them with ``from_chunks``.

        Whether or not a step fails, ``delete_chunks()`` and ``delete()`` are
        called on every group store that was created. Progress and the total
        elapsed time are printed.

        Returns:
            Whatever ``from_chunks`` produces from the loaded results.
        """
        started_at = time.time()
        print("transforming data of size", data.memory_usage(index=True).sum(), "bytes")

        group_stores = []
        try:
            print("splitting data into large groups")
            # One group per job; falls back to self.chunksize when the
            # integer division comes out as 0.
            group_size = len(data) // self.n_jobs or self.chunksize

            for group in self._group_iter(data, group_size):
                if group.empty:
                    continue
                group_store = HdfDataStore(self.input_schema(), temp_file.make_temporary_file())
                # Register before writing so the cleanup below still sees
                # this store if store() raises.
                group_stores.append(group_store)
                group_store.store(group)

            print("breaking data into chunks in parallel")
            chunking_jobs = [joblib.delayed(self.store_chunks_job)(group_store) for group_store in group_stores]
            joblib.Parallel(n_jobs=self.n_jobs)(chunking_jobs)

            transform_jobs = [
                joblib.delayed(self.transform_job)(chunk_store)
                for group_store in group_stores
                for chunk_store in group_store.chunk_stores()
            ]

            print("running transforms in", len(transform_jobs), "parallel jobs")
            result_stores = joblib.Parallel(n_jobs=self.n_jobs)(transform_jobs)

            print("loading and merging the results")
            # Pass a generator so results are loaded as from_chunks asks
            # for them.
            results = from_chunks(result_store.load() for result_store in result_stores)
            print("finished merge")
        finally:
            for group_store in group_stores:
                group_store.delete_chunks()
                group_store.delete()

        print("took", time.time() - started_at, "seconds to transform all data in parallel")
        return results
def df_to_hdf5(file_path, df, schema):
    """Store ``df`` in an HDF store at ``file_path`` and return a link to the file.

    Args:
        file_path: Location of the HDF file; also the target of the link.
        df: Data handed to ``HdfDataStore.store``.
        schema: Schema the ``HdfDataStore`` is constructed with.

    Returns:
        A ``FileLink`` pointing at ``file_path``.
    """
    HdfDataStore(schema, file_path).store(df)
    link = FileLink(file_path, result_html_prefix='Right-click and save: ')
    return link
from chatto_transform.config import config
from chatto_transform.schema.ss.ss_sql_raw_schema import appointments
from chatto_transform.datastores.hdf_datastore import HdfDataStore
from chatto_transform.lib.chunks import from_chunks

import time

# Benchmark script: time how long from_chunks takes on the chunks of one
# HDF store.

# The path is built by plain concatenation, so config.data_dir is expected to
# end with a path separator (assuming it is a string -- confirm).
ds = HdfDataStore(appointments, config.data_dir+'test.hdf')

# Called outside the timed region; presumably returns a lazy iterable so the
# actual reads happen inside from_chunks -- confirm.
chunks = ds.load_chunks()

# Only the from_chunks call is inside the timer.
start = time.time()
df = from_chunks(chunks)
end = time.time()

print('took', end - start, 'seconds to load and concatenate all data')
Example #16
0
from chatto_transform.config import config
from chatto_transform.schema.ss.ss_sql_raw_schema import appointments
from chatto_transform.datastores.hdf_datastore import HdfDataStore
from chatto_transform.lib.chunks import from_chunks

import time

# Benchmark script: time how long from_chunks takes on the chunks of one
# HDF store.

# The path is built by plain concatenation, so config.data_dir is expected to
# end with a path separator (assuming it is a string -- confirm).
ds = HdfDataStore(appointments, config.data_dir + 'test.hdf')

# Called outside the timed region; presumably returns a lazy iterable so the
# actual reads happen inside from_chunks -- confirm.
chunks = ds.load_chunks()

# Only the from_chunks call is inside the timer.
start = time.time()
df = from_chunks(chunks)
end = time.time()

print('took', end - start, 'seconds to load and concatenate all data')
Example #17
0
def load_hdf(file_path, schema):
    """Load and return the contents of the HDF store at ``file_path``.

    Args:
        file_path: Location of the HDF file to read.
        schema: Schema the ``HdfDataStore`` is constructed with.

    Returns:
        Whatever ``HdfDataStore.load`` returns for that file.
    """
    return HdfDataStore(schema, file_path).load()
Example #18
0
def df_to_hdf5(file_path, df, schema):
    """Store ``df`` in an HDF store at ``file_path`` and return a link to the file.

    Args:
        file_path: Location of the HDF file; also the target of the link.
        df: Data handed to ``HdfDataStore.store``.
        schema: Schema the ``HdfDataStore`` is constructed with.

    Returns:
        A ``FileLink`` pointing at ``file_path``.
    """
    HdfDataStore(schema, file_path).store(df)
    link = FileLink(file_path, result_html_prefix='Right-click and save: ')
    return link
import boto

from chatto_transform.datastores.s3_datastore import S3DataStore
from chatto_transform.datastores.hdf_datastore import HdfDataStore

from chatto_transform.schema.ss.ss_sql_raw_schema import appointments

# One-off script: read one HDF chunk file from local disk and upload it to S3
# through S3DataStore.

# connect_s3() is called without arguments, so credentials are expected to
# come from boto's own configuration -- confirm for the target environment.
conn = boto.connect_s3()

bucket = conn.get_bucket('chatto')

print('loading data from hdf')
# NOTE(review): hard-coded absolute path on a developer machine; this script
# only runs where that file exists.
data = HdfDataStore(appointments, '/Users/dan/dev/data/test.hdf_chunk_0').load()

# Store the loaded data in the bucket under the `appointments` schema.
ds = S3DataStore(appointments, bucket)
ds.store(data)
def df_to_hdf5(file_path, df, schema):
    """Store ``df`` in an HDF store at ``file_path``.

    Args:
        file_path: Location of the HDF file to write.
        df: Data handed to ``HdfDataStore.store``.
        schema: Schema the ``HdfDataStore`` is constructed with.

    Returns:
        None; the result of ``store`` is discarded.
    """
    HdfDataStore(schema, file_path).store(df)
def load_hdf(file_path, schema):
    """Load and return the contents of the HDF store at ``file_path``.

    Args:
        file_path: Location of the HDF file to read.
        schema: Schema the ``HdfDataStore`` is constructed with.

    Returns:
        Whatever ``HdfDataStore.load`` returns for that file.
    """
    return HdfDataStore(schema, file_path).load()