def parse_and_save(tsv):
    raw = parse(tsv)
    raw = raw[['latitude', 'longitude', 'posted_date', 'date_first_seen',
               'date_last_seen', 'translated_location', 'job_type',
               'duration']]
    url = join('employment', 'pickled',
               basename(tsv.data_url).replace('.tsv', '.pkl'))
    do(raw).save(url=url)
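# A hedged usage sketch, not from the source: fanning parse_and_save out
# over a list of remote .tsv sources, following the bc/execute/br pattern
# used throughout the tests below. `tsv_sources` is an assumed variable.
c = client()
for tsv in tsv_sources:
    c.bc(parse_and_save, tsv)
c.execute()
c.br()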
def boolfilter(source, start, end, query_dict, prefilter=None):
    """If query_dict is present, we load the dataset into memory and do the
    filtering; if it is not present, we resort to smart slicing.

    prefilter is a boolean vector.
    """
    if prefilter is None:
        boolvect = np.ones(end - start, dtype=bool)
    else:
        boolvect = prefilter.obj()
    with h5py.File(source.local_path(), 'r') as f:
        for field, operations in query_dict.items():
            with timethis('load_%s' % field):
                ds = f[field]
                data = ds[start:end]
            with timethis('filter_%s' % field):
                for op in operations:
                    boolvect = boolvect & op(data)
    with timethis('saving'):
        obj = do(boolvect, fmt='bloscpickle')
        obj.save(prefix='index')
    return obj
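# A minimal sketch (not from the source) of the query_dict shape boolfilter
# expects: each field maps to a list of vectorized predicates, each taking
# the column's numpy array and returning a boolean array that is AND-ed
# into the running mask. Field names and thresholds here are hypothetical.
query_dict = {
    'pickup_latitude': [lambda data: (data > 40.5) & (data < 41.0)],
    'trip_distance': [lambda data: data > 0],
}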
def test_remote_data_source_conversions():
    ### remote data sources can be accessed as an object, local path, or raw data
    ### test conversions of all
    df1 = pd.DataFrame({'a': np.arange(100000)})
    shape = df1.shape

    # start with a python object - we should be able to convert to raw and local path
    obj = do(df1)
    path = obj.local_path()
    with open(path, "rb") as f:
        df = pickle.load(f)
    assert df.shape == shape
    df = pickle.loads(obj.raw())
    assert df.shape == shape

    # start with raw data - we should be able to convert to obj and local path
    obj = dr(obj.raw())
    assert obj.obj().shape == shape
    path = obj.local_path()
    with open(path, 'rb') as f:
        df = pickle.load(f)
    assert df.shape == shape

    # start with a file - we should be able to convert to obj and raw
    obj = dp(obj.local_path())
    assert obj.obj().shape == shape
    df = pickle.loads(obj.raw())
    assert df.shape == shape
def cleaned_data(self):
    if self._cleaned:
        return self._cleaned
    c = client()
    if c.path_search('taxi/cleaned'):
        self._cleaned = du('taxi/cleaned').obj()
        return self._cleaned
    chunked = self.chunked()
    cleaned = chunked.query({
        'pickup_latitude': [self.clean_lat],
        'pickup_longitude': [self.clean_long],
        'dropoff_latitude': [self.clean_lat],
        'dropoff_longitude': [self.clean_long],
    })
    self._cleaned = cleaned
    do(self._cleaned).save(url='taxi/cleaned')
    return self._cleaned
def test_read_only():
    c = Client(integration.url3)
    c.bc(lambda: do(None).save(url="test_read_only"), _queue_name="default|node3")
    c.execute()
    c.br()
    active_hosts, results = c.data_info(["test_read_only"])
    location_info, data_info = results["test_read_only"]
    assert "node3" not in location_info
    assert len(location_info) == 1
def aggregate(results, grid_shape):
    with timethis('data_loading'):
        bigdata = np.zeros(grid_shape)
        for source in results:
            path = source.local_path()
            with h5py.File(path, 'r') as f:
                bigdata += f['data'][:, :]
    with timethis('saving_result'):
        obj = do(bigdata)
        obj.save(prefix='taxi/aggregate')
    return obj
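# Hypothetical driver for aggregate() above: fan per-chunk grid computations
# out to the cluster, then sum the resulting grids on one node.
# `binned_histogram` is an assumed per-chunk function; only aggregate()
# itself comes from the source.
def aggregate_all(chunks, grid_shape):
    c = client()
    for source, start, end in chunks:
        c.bc(binned_histogram, source, start, end, grid_shape)
    c.execute()
    results = c.br()
    return aggregate(results, grid_shape)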
def query(self, query_dict):
    c = client()
    chunked = self.chunked()
    for source, start, end in chunked.chunks:
        c.bc(boolfilter, source, start, end, query_dict,
             _intermediate_results=ksdebug,
             _no_route_data=no_route_data)
    c.execute()
    results = c.br(profile='profile_query')
    output = {}
    for result, (source, start, end) in zip(results, chunked.chunks):
        output[(source.data_url, start, end)] = result
    output = do(output)
    output.save(prefix='taxi/query')
    return output
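# Hypothetical consumer of query()'s result: the saved object wraps a dict
# mapping (data_url, start, end) chunk keys to remote boolean-mask objects,
# so each chunk's mask can be pulled locally on demand. `ds` is an assumed
# dataset instance.
output = ds.query(query_dict)
for (data_url, start, end), remote_mask in output.obj().items():
    mask = remote_mask.obj()  # boolean numpy array for this chunk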
def test_data_routing():
    # test data routing - the first call should end up on the node the
    # data is on; the other 2 calls should be parallelized on the other
    # 2 nodes
    setup_client(integration.url1)
    c = client()
    df1 = pd.DataFrame({'a': np.arange(100000)})
    remote1 = do(obj=df1)
    remote1.rpc_url = integration.url2
    remote1.save()
    c.bc(routing_func, remote1)
    c.bc(routing_func, remote1)
    c.bc(routing_func, remote1)
    c.execute()
    results = c.br()
    assert results[0] == integration.url2
    assert set(results) == set([integration.url1, integration.url2, integration.url3])
def test_remote_data_source_conversions():
    ### remote data sources can be accessed as an object, local path, or raw data
    ### test conversions of all
    setup_client(integration.url1)
    c = client()
    df1 = pd.DataFrame({'a': np.arange(100)})
    shape = df1.shape

    obj = do(df1)
    obj.save()
    c.bc(remote_obj_func, du(obj.data_url))
    c.execute()
    result = c.br()[0]
    assert result.shape == df1.shape

    obj = dp(obj.local_path())
    obj.save()
    c.bc(remote_file, du(obj.data_url))
    c.execute()
    result = c.br()[0]
    result = pickle.loads(result)
    assert result.shape == df1.shape
def test_remote_data_sources():
    ### test grabbing obj, local_path, and raw data from a remote data source
    setup_client(integration.url1)
    c = client()
    df1 = pd.DataFrame({'a': np.arange(100000)})
    shape = df1.shape
    obj = do(df1)
    obj.save()
    data_url = obj.data_url

    copy = du(data_url)
    assert copy.obj().shape == shape

    copy = du(data_url)
    df = pickle.loads(copy.raw())
    assert df.shape == shape

    copy = du(data_url)
    path = copy.local_path()
    with open(path, "rb") as f:
        df = pickle.load(f)
    assert df.shape == shape
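# A summary sketch of the four RemoteData constructors exercised in these
# tests (the equivalences are inferred from their usage here):
#   do(obj)      - wrap an in-memory Python object
#   dp(path)     - wrap a local file path
#   dr(raw)      - wrap raw (pickled) bytes
#   du(data_url) - point at data already saved on the server
obj = do(pd.DataFrame({'a': np.arange(10)}))
obj.save()                       # uploads the object and assigns obj.data_url
same = du(obj.data_url).obj()    # fetch it back as a Python object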
"""We construct a remote data object and save the data to the server (which
generates a url). Then we create a new RemoteData pointer with du (short for
data url, equivalent to RemoteData(data_url=<data_url>)) and use that in a
function call.
"""
remote = dp("test.hdf5")
remote.save(prefix="testdata/test")
print(remote.data_url)
new_remote = du(remote.data_url)

def head(obj, name):
    store = pd.HDFStore(obj.local_path())
    return store.select(name).head(10)

c.bc(head, new_remote, 'df')
c.execute()
result = c.br()[0]
print(result)

"""do is short for data object, equivalent to RemoteData(obj=<obj>)."""
remote = do(df)
remote.save()

def head(obj):
    return obj.obj().head(10)

new_remote = du(remote.data_url)
c.bc(head, new_remote)
c.execute()
print(c.br()[0])
import time

import numpy as np
import pandas as pd

from kitchensink import client, setup_client, do

setup_client("http://localhost:6323/")
c = client()
df = pd.DataFrame({'a': np.arange(2000000)})
obj = do(df)
obj.save()
# slicing a saved RemoteData yields another remote object; .obj() pulls
# just that slice down locally
print(obj[100:110].obj())
print(obj[100:110]['a'].obj())