def test_remote_data_source_conversions():
    ### remote data sources can be accessed as an object, local path, or raw data
    ### test conversions of all
    setup_client(integration.url1)
    c = client()
    df1 = pd.DataFrame({'a': np.arange(100)})
    shape = df1.shape
    obj = do(df1)
    obj.save()
    c.bc(remote_obj_func, du(obj.data_url))
    c.execute()
    result = c.br()[0]
    assert result.shape == df1.shape

    obj = dp(obj.local_path())
    obj.save()
    c.bc(remote_file, du(obj.data_url))
    c.execute()
    result = c.br()[0]
    result = pickle.loads(result)
    assert result.shape == df1.shape
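# remote_obj_func and remote_file are not defined in this excerpt. A minimal
# sketch of what they might look like, inferred purely from how the test
# calls them: remote_obj_func returns something with .shape, and remote_file
# returns pickled bytes that the test loads with pickle.loads.
def remote_obj_func(remote):
    # deserialize the remote data source back into its Python object
    return remote.obj()

def remote_file(remote):
    # read the pickled bytes straight from a locally materialized copy
    with open(remote.local_path(), 'rb') as f:
        return f.read()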
Example #2
 def chunked(self):
     if self._chunked:
         return self._chunked
     c = client()
     urls = c.path_search('taxi/big.hdf5')
     urls.sort()
     objs = [du(x) for x in urls]
     chunked = Chunked(objs)
     # compute the chunks property up front so it is cached
     chunked.chunks
     self._chunked = chunked
     return self._chunked
def test_remote_data_sources():
    ### test grabbing obj, local_path, and raw data from a remote data source
    setup_client(integration.url1)
    c = client()
    df1 = pd.DataFrame({'a': np.arange(100000)})
    shape = df1.shape
    obj = do(df1)
    obj.save()
    data_url = obj.data_url
    copy = du(data_url)
    assert copy.obj().shape == shape

    copy = du(data_url)
    df = pickle.loads(copy.raw())
    assert df.shape == shape

    copy = du(data_url)
    path = copy.local_path()
    with open(path, "rb") as f:
        df = pickle.load(f)
    assert df.shape == shape
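# The three access modes exercised above, condensed into one helper. This is
# a restatement of the test, not additional API surface; 'data_url' stands in
# for any kitchensink data url.
def fetch_all_ways(data_url):
    handle = du(data_url)
    as_obj = handle.obj()                  # deserialized pandas object
    as_raw = pickle.loads(handle.raw())    # raw pickled bytes, loaded manually
    with open(handle.local_path(), 'rb') as f:
        as_file = pickle.load(f)           # read from a local file copy
    return as_obj, as_raw, as_file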
Example #5
File: app.py Project: hhuuggoo/ar
def taxidatatimehist():
    try:
        data = request.get_json()
    except BadRequest:
        data = {}
    if data.get('filter_url'):
        filters = du(data.get('filter_url'))
    else:
        filters = None
    print('HIST', filters)
    data = get_time_histogram(filters)
    data['y_bounds'] = [min(data['counts']), max(data['counts'])]
    result = make_json(ujson.dumps(data))
    return result
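# A hypothetical client-side call for the handler above. The route and port
# are assumptions (the Flask decorator is not shown in this excerpt); the only
# field the handler reads from the JSON body is 'filter_url', which should be
# a kitchensink data url.
import requests

resp = requests.post('http://localhost:5000/taxidatatimehist',
                     json={'filter_url': 'taxi/filters/last_query'})
histogram = resp.json()   # contains 'counts' plus the computed 'y_bounds'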
Example #6
 def cleaned_data(self):
     if self._cleaned:
         return self._cleaned
     c = client()
     if c.path_search('taxi/cleaned'):
         self._cleaned = du('taxi/cleaned').obj()
         return self._cleaned
     chunked = self.chunked()
     cleaned = chunked.query({
         'pickup_latitude': [self.clean_lat],
         'pickup_longitude': [self.clean_long],
         'dropoff_latitude': [self.clean_lat],
         'dropoff_longitude': [self.clean_long],
     })
     self._cleaned = cleaned
     do(self._cleaned).save(url='taxi/cleaned')
     return self._cleaned
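# cleaned_data() above is an instance of a general load-or-compute pattern:
# probe the server for a materialized result, otherwise compute it and save
# it under a fixed url. A reduced sketch using only calls that appear on this
# page (path_search, du(...).obj(), do(...).save(url=...)):
def load_or_compute(url, compute):
    c = client()
    if c.path_search(url):        # already materialized on the server?
        return du(url).obj()
    value = compute()
    do(value).save(url=url)       # materialize for subsequent calls
    return value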
Example #8
 def aggregate(self, results, grid_shape):
     c = client()
     data_urls = [x.data_url for x in results]
     _, info = c.data_info(data_urls)
     # group the data urls by the single host that holds each one
     process_dict = {}
     for u in data_urls:
         hosts, meta = info[u]
         assert len(hosts) == 1
         process_dict.setdefault(list(hosts)[0], []).append(u)
     # route each host's share of the work to a queue on that host
     for k, v in process_dict.items():
         v = [du(x) for x in v]
         queue_name = c.queue('default', host=k)
         c.bc(aggregate, v, grid_shape,
              _intermediate_results=ksdebug,
              _queue_name=queue_name,
              _no_route_data=no_route_data)
     c.execute()
     results = c.br(profile='aggregate')
     # pull each host's partial grid back and sum into the final result
     results = [x.obj() for x in results]
     results = sum(results)
     return results
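# The worker-side aggregate() dispatched above is not shown in this excerpt.
# A speculative sketch, inferred from the call site: it receives a list of du
# handles plus the grid shape, and returns one partial grid per host, which
# the driver then sums.
import numpy as np

def aggregate(objs, grid_shape):
    total = np.zeros(grid_shape)
    for o in objs:
        total += o.obj()   # each handle deserializes to a grid-shaped array
    return total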
Example #9
File: app.py Project: hhuuggoo/ar
def taxidata(pickup):
    st = time.time()
    pickup = (pickup == 'pickup')
    try:
        data = request.get_json()
    except BadRequest:
        data = {}
    if data.get('filter_url'):
        filters = du(data.get('filter_url'))
    else:
        filters = None
    if data:
        bounds = (data['xmin'], data['xmax'], data['ymin'], data['ymax'])
        bounds = (max(bounds[0], gbounds[0]),
                  min(bounds[1], gbounds[1]),
                  max(bounds[2], gbounds[2]),
                  min(bounds[3], gbounds[3]))
    else:
        bounds = gbounds
    with timethis('regular project'):
        data = get_data(pickup, bounds, filters)
    # flatten the dynamic range, then rescale to 0..255 ints for the image
    data = data ** 0.2
    data = data - data.min()
    data = data / data.max()
    data = data * 255
    data = data.astype('int64')

    xmin, xmax, ymin, ymax = bounds
    output = dict(x=[xmin],
                  y=[ymin],
                  dw=[xmax-xmin],
                  dh=[ymax-ymin],
                  palette=["Greys-256"])
    print(output)
    output['image'] = [data.tolist()]
    result = make_json(ujson.dumps(output))
    return result
Example #11
                ds[orig:] = col
    except:
        print('exception', path, x)
        raise
    finally:
        f.close()
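
# The dangling try/finally above is presumably the tail of the to_hdf helper
# that parse() below calls. A speculative reconstruction, assuming h5py-style
# appendable datasets; only 'ds[orig:] = col' and the close survive in the
# excerpt, so treat this as illustration rather than the original code
# (datetime columns would need conversion before storing in h5py).
def to_hdf(df, path):
    import h5py
    f = h5py.File(path, 'a')
    try:
        for name in df.columns:
            col = df[name].values
            if name in f:
                # append to an existing dataset by resizing it first
                ds = f[name]
                orig = ds.shape[0]
                ds.resize((orig + len(col),))
            else:
                ds = f.create_dataset(name, shape=col.shape,
                                      maxshape=(None,), dtype=col.dtype)
                orig = 0
            ds[orig:] = col
    finally:
        f.close()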


def parse(path):
    new_path = path.replace('.csv', '.hdf5')
    if exists(new_path):
        return
    iterobj = pd.read_csv(
        path,
        chunksize=500000,
        skipinitialspace=True,
        dtype={'store_and_fwd_flag': 'S4'},
        parse_dates=['pickup_datetime', 'dropoff_datetime'],
    )
    for idx, df in enumerate(iterobj):
        print(idx, 'chunk', new_path)
        to_hdf(df, new_path)

def ksparse(obj):
    parse(obj.local_path())
#parse("/data/taxi/trip_data_5.csv")
zips = [du(x) for x in c.path_search('taxi/trip_data*') if '12' not in x]
for z in zips:
    c.bc(ksparse, z)
c.execute()
c.br()
Example #12
store = pd.HDFStore('test.hdf5')
store['df'] = df
store.close()

"""dp is a convenience function, equivalent to RemoteData(local_path=<path>)
We construct a remote data object, and save the data to the server
(which  generates a url).  Then we create a new RemoteData pointer with du
(short for data url, equivalent to RemoteData(data_url=<data_url>)
and we use that in a function call
"""

remote = dp("test.hdf5")
remote.save(prefix="testdata/test")
print(remote.data_url)

new_remote = du(remote.data_url)
def head(obj, name):
    store = pd.HDFStore(obj.local_path())
    return store.select(name).head(10)

c.bc(head, new_remote, 'df')
c.execute()
result = c.br()[0]
print(result)

"""do is short for dataobject, equivalent to RemoteData(obj=<obj>)
"""
remote = do(df)
remote.save()
def head(obj):
    return obj.obj().head(10)
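# The example is cut off here; presumably it finishes like the HDF5 variant
# above. This continuation is an assumption, not part of the excerpt:
c.bc(head, remote)
c.execute()
print(c.br()[0])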
Example #14
File: parse.py Project: hhuuggoo/ar
from io import BytesIO
from os.path import join, basename

from kitchensink import setup_client, client, do, du, dp, Client
import pandas as pd
import numpy as np

setup_client('http://power:6323/')
c = client(rpc_name='data', queue_name='data')

fields = ['posted_date', 'location_1', 'location_2', 'department',
          'title', 'salary', 'start', 'duration', 'job_type',
          'applications', 'company', 'contact', 'phone',
          'fax', 'translated_location', 'latitude',
          'longitude', 'date_first_seen', 'url',
          'date_last_seen']

tsvs = [du(x) for x in c.path_search('*employment*tsv')]
def parse(tsv):
    # tsv.raw() returns the file's raw bytes; wrap them for read_csv
    data = BytesIO(tsv.raw())
    raw = pd.read_csv(data, sep="\t",
                      names=fields,
                      parse_dates=['posted_date', 'date_first_seen', 'date_last_seen'],
                      index_col=False)
    return raw

def parse_and_save(tsv):
    raw = parse(tsv)
    raw = raw[['latitude', 'longitude', 'posted_date',
               'date_first_seen', 'date_last_seen',
               'translated_location', 'job_type', 'duration']]
    url = join('employment', 'pickled', basename(tsv.data_url).replace('.tsv', '.pkl'))
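# The example is cut off here. Based on the save pattern used elsewhere on
# this page (do(...).save(url=...)), the function presumably finishes by
# materializing the trimmed frame under that url, e.g.:
#     do(raw).save(url=url)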
Example #15
File: app.py Project: hhuuggoo/ar
def taxidatavsregular(pickup):
    st = time.time()
    pickup = (pickup == 'pickup')
    try:
        data = request.get_json()
    except BadRequest:
        data = {}
    if data.get('filter_url'):
        filters = du(data.get('filter_url'))
    else:
        filters = None
    if data and data.get('xmin'):
        bounds = (data['xmin'], data['xmax'], data['ymin'], data['ymax'])
        bounds = (max(bounds[0], gbounds[0]),
                  min(bounds[1], gbounds[1]),
                  max(bounds[2], gbounds[2]),
                  min(bounds[3], gbounds[3]))
    else:
        bounds = gbounds
    if filters:
        with timethis('unfiltered'):
            unfiltered = get_data(pickup, bounds, None)
        with timethis('filtered'):
            filtered = get_data(pickup, bounds, filters)
        # marker = np.array([[1,1], [1,1]])
        # filtered = scipy.ndimage.convolve(filtered, marker)
        # unfiltered = scipy.ndimage.convolve(unfiltered, marker)
        # import cPickle as pickle
        # with open("unfiltered.pkl", "w+") as f:
        #     pickle.dump(unfiltered, f, -1)

        # with open("filtered.pkl", "w+") as f:
        #     pickle.dump(filtered, f, -1)
        with timethis('processing'):
            # rank-normalize both grids so they are comparable before differencing
            percents = np.percentile(unfiltered[unfiltered != 0], np.arange(100))
            unfiltered = np.interp(unfiltered, percents, np.arange(100))
            percents = np.percentile(filtered[filtered != 0], np.arange(100))
            filtered = np.interp(filtered, percents, np.arange(100))

            # truncate data
            filtered[filtered < 50] = 0
            unfiltered[filtered < 50] = 0

            data = filtered - unfiltered

            # flatten the dynamic range
            #data = data ** 0.333
            data[data > 0] = data[data > 0] ** 0.333

            # linearize it
            data[data > 0] = data[data > 0] / data.max()
            data[data < 0] = -(data[data < 0] / data.min())

            # convert to ints
            data = data - data.min()
            data = data / data.max()
            data = data * 255
            data = data.astype('int64')
            palette = 'seismic-256'
            palette = 'seismic-256'
    else:
        data = get_data(pickup, bounds, None)
        data = data ** 0.2
        data = data - data.min()
        data = data / data.max()
        data = data * 255
        data = data.astype('int64')
        palette = 'Greys-256'
    xmin, xmax, ymin, ymax = bounds
    output = dict(x=[xmin],
                  y=[ymin],
                  dw=[xmax-xmin],
                  dh=[ymax-ymin],
                  palette=[palette])
    print(output)
    output['image'] = [data.tolist()]
    with timethis('json'):
        result = make_json(ujson.dumps(output))
    return result