import tempfile
import time
import pickle

import pandas as pd
import numpy as np

from kitchensink import setup_client, client, do, du, dp
# `integration` (providing url1/url2/url3) and the helper functions used in
# these tests are fixtures defined elsewhere; a sketch of the helpers follows
# the tests.


def test_data_routing_multiple_sources():
    # test data routing - first call
    # we set up a large data source on url1
    # and a smaller source on url2
    # execute 3 calls: url1 should get first priority, url2 second, url3 third
    setup_client(integration.url1)
    c = client()
    name1 = tempfile.NamedTemporaryFile(prefix='ks-test').name
    name2 = tempfile.NamedTemporaryFile(prefix='ks-test').name
    len1 = 1000000
    len2 = 100000
    with open(name1, 'wb+') as f:
        f.write(len1 * b'0')
    with open(name2, 'wb+') as f:
        f.write(len2 * b'0')
    remote1 = dp(name1)
    remote1.rpc_url = integration.url1
    remote2 = dp(name2)
    remote2.rpc_url = integration.url2
    remote1.save()
    remote2.save()
    c.bc(routing_func, remote1, remote2)
    c.bc(routing_func, remote1, remote2)
    c.bc(routing_func, remote1, remote2)
    c.execute()
    results = c.br()
    assert results == [integration.url1, integration.url2, integration.url3]
def test_rpc():
    # test simple execution of a dummy function
    setup_client(integration.url1)
    c = client()
    c.bc(dummy_func, 1)
    c.execute()
    result = c.br()
    assert result == [1]
def test_bulk_calls():
    # test sleep function, which takes 1 second to execute.
    # calls should be parallelized, so all 3 should finish in ~1 second
    setup_client(integration.url1)
    c = client()
    st = time.time()
    c.bc(sleep_func)
    c.bc(sleep_func)
    c.bc(sleep_func)
    c.execute()
    result = c.br()
    ed = time.time()
    print ed - st
    assert ed - st < 2
    assert len(result) == 3
def test_data_routing():
    # test data routing - the first call should end up on the node
    # the data is on; the other 2 calls should be parallelized on
    # the other 2 nodes
    setup_client(integration.url1)
    c = client()
    df1 = pd.DataFrame({'a': np.arange(100000)})
    remote1 = do(obj=df1)
    remote1.rpc_url = integration.url2
    remote1.save()
    c.bc(routing_func, remote1)
    c.bc(routing_func, remote1)
    c.bc(routing_func, remote1)
    c.execute()
    results = c.br()
    assert results[0] == integration.url2
    assert set(results) == set([integration.url1, integration.url2, integration.url3])
def test_remote_data_source_conversions():
    ### remote data sources can be accessed as an object, local path, or raw data
    ### test conversions of all
    setup_client(integration.url1)
    c = client()
    df1 = pd.DataFrame({'a': np.arange(100)})
    shape = df1.shape
    obj = do(df1)
    obj.save()

    c.bc(remote_obj_func, du(obj.data_url))
    c.execute()
    result = c.br()[0]
    assert result.shape == df1.shape

    obj = dp(obj.local_path())
    obj.save()
    c.bc(remote_file, du(obj.data_url))
    c.execute()
    result = c.br()[0]
    result = pickle.loads(result)
    assert result.shape == df1.shape
def test_remote_data_sources():
    ### test grabbing obj, local_path, and raw data from a remote data source
    setup_client(integration.url1)
    c = client()
    df1 = pd.DataFrame({'a': np.arange(100000)})
    shape = df1.shape
    obj = do(df1)
    obj.save()
    data_url = obj.data_url

    copy = du(data_url)
    assert copy.obj().shape == shape

    copy = du(data_url)
    df = pickle.loads(copy.raw())
    assert df.shape == shape

    copy = du(data_url)
    path = copy.local_path()
    with open(path, "rb") as f:
        df = pickle.load(f)
    assert df.shape == shape
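# The tests above reference helper functions (dummy_func, sleep_func,
# routing_func, remote_obj_func, remote_file) that are not shown in this
# excerpt. Below is a minimal sketch of what they might look like, inferred
# from the assertions above -- these are assumptions, not the actual fixtures;
# in particular, the `settings.rpc_url` attribute is a guess at how a worker
# would report its own url.

def dummy_func(x):
    # echo the argument back; test_rpc asserts the result equals the input
    return x


def sleep_func():
    # sleep ~1 second; test_bulk_calls asserts 3 parallel calls finish in < 2s
    time.sleep(1)


def routing_func(*remotes):
    # return the url of the node this call landed on, so the routing tests
    # can assert where each call was executed (assumed attribute)
    from kitchensink import settings
    return settings.rpc_url


def remote_obj_func(remote):
    # deserialize a remote data source into an object and return it
    return remote.obj()


def remote_file(remote):
    # return the raw (pickled) bytes of the remote file; the caller unpickles
    return remote.raw()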
import os
from os.path import relpath, join, basename

from kitchensink import setup_client, client, do, du, dp

setup_client('http://localhost:6323/')
c = client()
c.bc('bootstrap', 'taxi/big.hdf5', data_type='file', _rpc_name='data')
c.execute()
c.br()
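# A possible follow-up, assuming the bootstrap call registers the file on the
# cluster under its 'taxi/big.hdf5' path (path_search is part of the client
# API, as used in the parse example further below):
urls = c.path_search('taxi/*')
print urls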
    try:
        ds1 = f[xfield]
        xdata = smartslice(ds1, start, end, bvector)
        ds2 = f[yfield]
        ydata = smartslice(ds2, start, end, bvector)
    finally:
        f.close()
    with timethis('project'):
        mark = mark.astype('float64')
        args = (xdata, ydata, grid) + grid_data_bounds + (mark,)
        fast_project(*args)
    return grid


if __name__ == "__main__":
    setup_client('http://power:6323/')
    #client().reducetree('taxi/partitioned*')
    #client().reducetree('taxi/cleaned*')
    #client().reducetree('taxi/index*')
    #client().reducetree('taxi/projections*')
    #client().reducetree('taxi/raw/projections*')
    import matplotlib.cm as cm
    st = time.time()
    ds = ARDataset()
    ds.partitions()
    #filters = ds.query({'trip_time_in_secs' : [lambda x : (x >= 1999) & (x <= 2000)]})
    filters = None
    global_bounds = ds.gbounds
    local_bounds = global_bounds
    local_indexes, (grid_shape, results) = ds.project(
import logging

import pandas as pd
import numpy as np

from kitchensink import client, setup_client, do, du, dp

"""single node setup

This example illustrates basic usage of remote data sources.
The first example works with a remote file; the second example
works with a remote object (stored via pickle).
"""

setup_client("http://localhost:6323/")
c = client()

df = pd.DataFrame({'a': np.arange(2000000)})
store = pd.HDFStore('test.hdf5')
store['df'] = df
store.close()

"""dp is a convenience function, equivalent to RemoteData(local_path=<path>).

We construct a remote data object and save the data to the server (which
generates a url). Then we create a new RemoteData pointer with du (short for
data url, equivalent to RemoteData(data_url=<data_url>)) and use that in a
function call.
"""

remote = dp("test.hdf5")
remote.save(prefix="testdata/test")
print remote.data_url
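# The docstring above promises a function call using the du pointer, but the
# excerpt stops before showing one. A minimal sketch of that last step,
# assuming a hypothetical head() helper that reads the HDF5 file on the node
# that stores it:

def head(remote):
    # runs remotely: open the file at its local path and return the first rows
    import pandas as pd
    store = pd.HDFStore(remote.local_path())
    result = store['df'].head()
    store.close()
    return result

new_pointer = du(remote.data_url)
c.bc(head, new_pointer)
c.execute()
print c.br()[0]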
import os
from os.path import relpath, join, basename
import cStringIO

import pandas as pd
import numpy as np

from kitchensink import setup_client, client, do, du, dp, Client

setup_client('http://power:6323/')
c = client(rpc_name='data', queue_name='data')

fields = [
    'posted_date', 'location_1', 'location_2', 'department', 'title',
    'salary', 'start', 'duration', 'job_type', 'applications', 'company',
    'contact', 'phone', 'fax', 'translated_location', 'latitude',
    'longitude', 'date_first_seen', 'url', 'date_last_seen'
]

tsvs = [du(x) for x in c.path_search('*employment*tsv')]


def parse(tsv):
    # read one remote tab-separated file into a DataFrame
    data = cStringIO.StringIO(tsv.raw())
    raw = pd.read_csv(
        data,
        sep="\t",
        names=fields,
        parse_dates=['posted_date', 'date_first_seen', 'date_last_seen'],
        index_col=False)
    return raw
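# parse is defined but never invoked in this excerpt; a sketch of how it could
# be fanned out across the cluster with the bulk-call API used in the tests
# above (assumes the parsed frames fit in memory on the client):
for tsv in tsvs:
    c.bc(parse, tsv)
c.execute()
frames = c.br()
print len(frames)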