def test_remote_data_source_conversions():
    ### remote data sources can be accessed as an object, local path, or raw data
    ### test conversions of all
    setup_client(integration.url1)
    c = client()
    df1 = pd.DataFrame({'a': np.arange(100)})
    shape = df1.shape
    obj = do(df1)
    obj.save()
    c.bc(remote_obj_func, du(obj.data_url))
    c.execute()
    result = c.br()[0]
    assert result.shape == df1.shape
    obj = dp(obj.local_path())
    obj.save()
    c.bc(remote_file, du(obj.data_url))
    c.execute()
    result = c.br()[0]
    result = pickle.loads(result)
    assert result.shape == df1.shape
def chunked(self):
    if self._chunked:
        return self._chunked
    c = client()
    urls = c.path_search('taxi/big.hdf5')
    urls.sort()
    objs = [du(x) for x in urls]
    chunked = Chunked(objs)
    #compute the property, for kicks
    chunked.chunks
    self._chunked = chunked
    return self._chunked
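# Usage sketch (TaxiDataset is a hypothetical name for the class these methods
# live on): chunked() lazily builds and caches the Chunked view over the
# taxi/big.hdf5 shards, and the result supports the query(...) form used by
# cleaned_data below.
#
#     ds = TaxiDataset()
#     chunked = ds.chunked()
#     trips = chunked.query({'pickup_latitude': [ds.clean_lat]})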
def test_remote_data_sources():
    ### test grabbing obj, local_path, and raw data from a remote data source
    setup_client(integration.url1)
    c = client()
    df1 = pd.DataFrame({'a': np.arange(100000)})
    shape = df1.shape
    obj = do(df1)
    obj.save()
    data_url = obj.data_url
    copy = du(data_url)
    assert copy.obj().shape == shape
    copy = du(data_url)
    df = pickle.loads(copy.raw())
    assert df.shape == shape
    copy = du(data_url)
    path = copy.local_path()
    with open(path, "rb") as f:
        df = pickle.load(f)
    assert df.shape == shape
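# The three access modes exercised above, side by side (a sketch; assumes the
# client has already been configured via setup_client and that data_url points
# at a DataFrame saved with do(...).save()):
def fetch_all_ways(data_url):
    remote = du(data_url)
    as_object = remote.obj()                    # deserialized on the client
    as_bytes = pickle.loads(remote.raw())       # raw bytes, unpickled manually
    with open(remote.local_path(), "rb") as f:  # materialized local copy
        as_file = pickle.load(f)
    return as_object, as_bytes, as_file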
def taxidatatimehist():
    try:
        data = request.get_json()
    except BadRequest:
        data = {}
    if data.get('filter_url'):
        filters = du(data.get('filter_url'))
    else:
        filters = None
    print 'HIST', filters
    data = get_time_histogram(filters)
    data['y_bounds'] = [min(data['counts']), max(data['counts'])]
    result = make_json(ujson.dumps(data))
    return result
def cleaned_data(self):
    if self._cleaned:
        return self._cleaned
    c = client()
    if c.path_search('taxi/cleaned'):
        self._cleaned = du('taxi/cleaned').obj()
        return self._cleaned
    chunked = self.chunked()
    cleaned = chunked.query({
        'pickup_latitude': [self.clean_lat],
        'pickup_longitude': [self.clean_long],
        'dropoff_latitude': [self.clean_lat],
        'dropoff_longitude': [self.clean_long],
    })
    self._cleaned = cleaned
    do(self._cleaned).save(url='taxi/cleaned')
    return self._cleaned
def aggregate(self, results, grid_shape):
    c = client()
    data_urls = [x.data_url for x in results]
    hosts, info = c.data_info(data_urls)
    process_dict = {}
    for u in data_urls:
        hosts, meta = info[u]
        assert len(hosts) == 1
        process_dict.setdefault(list(hosts)[0], []).append(u)
    c = client()
    for k, v in process_dict.items():
        v = [du(x) for x in v]
        queue_name = c.queue('default', host=k)
        c.bc(aggregate, v, grid_shape,
             _intermediate_results=ksdebug,
             _queue_name=queue_name,
             _no_route_data=no_route_data)
    c.execute()
    results = c.br(profile='aggregate')
    results = [x.obj() for x in results]
    results = sum(results)
    return results
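# The module-level aggregate function submitted via c.bc above is not shown
# here. A minimal sketch of what it might look like, assuming each remote
# object holds a partial 2D count grid that should be summed host-locally into
# a single array of shape grid_shape:
import numpy as np

def aggregate(objs, grid_shape):
    total = np.zeros(grid_shape)
    for o in objs:
        total += o.obj()   # pull each partial grid and accumulate
    return total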
def taxidata(pickup):
    st = time.time()
    if pickup == 'pickup':
        pickup = True
    else:
        pickup = False
    try:
        data = request.get_json()
    except BadRequest:
        data = {}
    if data.get('filter_url'):
        filters = du(data.get('filter_url'))
    else:
        filters = None
    if data:
        bounds = (data['xmin'], data['xmax'], data['ymin'], data['ymax'])
        bounds = (max(bounds[0], gbounds[0]), min(bounds[1], gbounds[1]),
                  max(bounds[2], gbounds[2]), min(bounds[3], gbounds[3]))
    else:
        bounds = gbounds
    with timethis('regular project'):
        data = get_data(pickup, bounds, filters)
    data = data ** 0.2
    data = data - data.min()
    data = data / data.max()
    data = data * 255
    data = data.astype('int64')
    xmin, xmax, ymin, ymax = bounds
    output = dict(x=[xmin], y=[ymin], dw=[xmax - xmin], dh=[ymax - ymin],
                  palette=["Greys-256"])
    print output
    output['image'] = [data.tolist()]
    result = make_json(ujson.dumps(output))
    return result
        ds[orig:] = col
    except:
        print 'exception', path, x
        raise
    finally:
        f.close()

def parse(path):
    new_path = path.replace('.csv', '.hdf5')
    if exists(new_path):
        return
    iterobj = pd.read_csv(path, chunksize=500000, skipinitialspace=True,
                          dtype={'store_and_fwd_flag': 'S4'},
                          parse_dates=['pickup_datetime', 'dropoff_datetime'],
                          )
    for idx, df in enumerate(iterobj):
        print idx, 'chunk', new_path
        to_hdf(df, new_path)

def ksparse(obj):
    parse(obj.local_path())

#parse("/data/taxi/trip_data_5.csv")
zips = [du(x) for x in c.path_search('taxi/trip_data*') if '12' not in x]
for z in zips:
    c.bc(ksparse, z)
c.execute()
c.br()
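# The head of to_hdf is cut off above; only its except/finally tail survives.
# A minimal sketch of what it might look like, assuming h5py-backed resizable
# one-dimensional datasets with one dataset per column (the names ds, orig,
# col and x in the surviving tail suggest a per-column append loop):
import h5py

def to_hdf(df, path):
    f = h5py.File(path, 'a')
    try:
        for x in df.columns:
            col = df[x].values
            if col.dtype.kind == 'M':
                # h5py has no native datetime type; store as int64 nanoseconds
                col = col.astype('int64')
            if x not in f:
                f.create_dataset(x, data=col, maxshape=(None,))
            else:
                ds = f[x]
                orig = ds.shape[0]
                ds.resize((orig + len(col),))
                ds[orig:] = col
    except:
        print 'exception', path, x
        raise
    finally:
        f.close()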
store = pd.HDFStore('test.hdf5')
store['df'] = df
store.close()

"""dp is a convenience function, equivalent to RemoteData(local_path=<path>).
We construct a remote data object and save the data to the server (which
generates a url).  Then we create a new RemoteData pointer with du (short for
data url, equivalent to RemoteData(data_url=<data_url>)) and we use that in a
function call.
"""
remote = dp("test.hdf5")
remote.save(prefix="testdata/test")
print remote.data_url
new_remote = du(remote.data_url)

def head(obj, name):
    store = pd.HDFStore(obj.local_path())
    return store.select(name).head(10)

c.bc(head, new_remote, 'df')
c.execute()
result = c.br()[0]
print result

"""do is short for data object, equivalent to RemoteData(obj=<obj>).
"""
remote = do(df)
remote.save()

def head(obj):
    return obj.obj().head(10)
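# The do example above stops just short of running the call. Completing it with
# the same submit/execute/retrieve pattern used for the dp/du example (a sketch
# that only mirrors calls already shown, nothing new):
c.bc(head, remote)
c.execute()
result = c.br()[0]
print result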
from kitchensink import setup_client, client, do, du, dp, Client
import cStringIO
from os.path import join, basename
import pandas as pd
import numpy as np

setup_client('http://power:6323/')
c = client(rpc_name='data', queue_name='data')

fields = ['posted_date', 'location_1', 'location_2', 'department', 'title',
          'salary', 'start', 'duration', 'job_type', 'applications',
          'company', 'contact', 'phone', 'fax', 'translated_location',
          'latitude', 'longitude', 'date_first_seen', 'url', 'date_last_seen']

tsvs = [du(x) for x in c.path_search('*employment*tsv')]

def parse(tsv):
    data = cStringIO.StringIO(tsv.raw())
    raw = pd.read_csv(data, sep="\t", names=fields,
                      parse_dates=['posted_date', 'date_first_seen',
                                   'date_last_seen'],
                      index_col=False)
    return raw

def parse_and_save(tsv):
    raw = parse(tsv)
    raw = raw[['latitude', 'longitude', 'posted_date', 'date_first_seen',
               'date_last_seen', 'translated_location', 'job_type',
               'duration']]
    url = join('employment', 'pickled',
               basename(tsv.data_url).replace('.tsv', '.pkl'))
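# parse_and_save is cut off above right after the target url is built. A sketch
# of the likely remainder plus the fan-out over all tsvs, following the
# do(...).save(url=...) and bc/execute/br pattern used elsewhere in this code:
#
#     (inside parse_and_save)
#     do(raw).save(url=url)
#
# for t in tsvs:
#     c.bc(parse_and_save, t)
# c.execute()
# c.br()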
def taxidatavsregular(pickup):
    st = time.time()
    if pickup == 'pickup':
        pickup = True
    else:
        pickup = False
    try:
        data = request.get_json()
    except BadRequest:
        data = {}
    if data.get('filter_url'):
        filters = du(data.get('filter_url'))
    else:
        filters = None
    if data and data.get('xmin'):
        bounds = (data['xmin'], data['xmax'], data['ymin'], data['ymax'])
        bounds = (max(bounds[0], gbounds[0]), min(bounds[1], gbounds[1]),
                  max(bounds[2], gbounds[2]), min(bounds[3], gbounds[3]))
    else:
        bounds = gbounds
    if filters:
        with timethis('unfiltered'):
            unfiltered = get_data(pickup, bounds, None)
        with timethis('filtered'):
            filtered = get_data(pickup, bounds, filters)
        # marker = np.array([[1,1], [1,1]])
        # filtered = scipy.ndimage.convolve(filtered, marker)
        # unfiltered = scipy.ndimage.convolve(unfiltered, marker)
        # import cPickle as pickle
        # with open("unfiltered.pkl", "w+") as f:
        #     pickle.dump(unfiltered, f, -1)
        # with open("filtered.pkl", "w+") as f:
        #     pickle.dump(filtered, f, -1)
        with timethis('processing'):
            percents = np.percentile(unfiltered[unfiltered != 0], np.arange(100))
            unfiltered = np.interp(unfiltered, percents, np.arange(100))
            percents = np.percentile(filtered[filtered != 0], np.arange(100))
            filtered = np.interp(filtered, percents, np.arange(100))
            ## truncate data
            filtered[filtered < 50] = 0
            unfiltered[filtered < 50] = 0
            data = filtered - unfiltered
            # flatten the dynamic range
            #data = data ** 0.333
            data[data > 0] = data[data > 0] ** 0.333
            # linearize it
            data[data > 0] = data[data > 0] / data.max()
            data[data < 0] = -(data[data < 0] / data.min())
            # convert to ints
            data = data - data.min()
            data = data / data.max()
            data = data * 255
            data = data.astype('int64')
        palette = 'seismic-256'
    else:
        data = get_data(pickup, bounds, None)
        data = data ** 0.2
        data = data - data.min()
        data = data / data.max()
        data = data * 255
        data = data.astype('int64')
        palette = 'Greys-256'
    xmin, xmax, ymin, ymax = bounds
    output = dict(x=[xmin], y=[ymin], dw=[xmax - xmin], dh=[ymax - ymin],
                  palette=[palette])
    print output
    output['image'] = [data.tolist()]
    with timethis('json'):
        result = make_json(ujson.dumps(output))
    return result
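# The percentile/interp pair above is essentially a histogram equalization: each
# nonzero count is remapped to its percentile rank (0-99), so the filtered and
# unfiltered grids become comparable before differencing. A standalone sketch of
# that step (not the handler's own helper):
def equalize(grid):
    ref = np.percentile(grid[grid != 0], np.arange(100))
    return np.interp(grid, ref, np.arange(100))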