Example #1
0
def test_column(sql):
    """Selecting one field yields scalars; projecting it yields one-tuples."""
    interactive = Data(sql)

    # Single-field access produces a flat list of values.
    column = compute(interactive['x'])
    assert column == [1, 10, 100]
    # A one-field projection wraps each value in a tuple instead.
    assert compute(interactive[['x']]) == [(1,), (10,), (100,)]

    assert compute(interactive.count()) == 3
Example #2
0
def test_column(sql):
    """Column access, single-column projection, and count over a SQL source."""
    handle = Data(sql)

    assert compute(handle['x']) == [1, 10, 100]

    # Projection form keeps row structure: one-tuples per row.
    projected = compute(handle[['x']])
    assert projected == [(1,), (10,), (100,)]

    assert compute(handle.count()) == 3
Example #3
0
def test_column(sql):
    """Coercing interactive expressions via list() and int() gives plain values."""
    tbl = Data(sql)

    # list() materializes a column expression directly.
    column_values = list(tbl['x'])
    assert column_values == [1, 10, 100]
    assert list(tbl[['x']]) == [(1,), (10,), (100,)]

    # int() forces the scalar count expression.
    assert int(tbl.count()) == 3
Example #4
0
def test_swap_resources_into_scope():
    """After the swap, the expression holds no resources and the leaf is not in scope."""
    from blaze import Data

    leaf = Data([1, 2, 3], dshape='3 * int', name='t')
    expr, scope = swap_resources_into_scope(leaf.head(2), {leaf: leaf.data})

    # The original symbol still carries its backing data...
    assert leaf._resources()
    # ...while the swapped expression has been stripped of resources,
    # and the symbol itself was removed from the returned scope.
    assert not expr._resources()
    assert leaf not in scope
def test_swap_resources_into_scope():
    """swap_resources_into_scope detaches resources from the expression tree."""
    from blaze import Data

    symbol = Data([1, 2, 3], dshape='3 * int', name='t')
    expr, scope = swap_resources_into_scope(symbol.head(2),
                                            {symbol: symbol.data})

    assert symbol._resources()
    assert not expr._resources()
    # The scope returned maps replacement leaves, not the original symbol.
    assert symbol not in scope
Example #6
0
def test_add_data_to_empty_server(empty_server, serial):
    """POSTing to /add registers a dataset; /datashape and /compute then see it.

    NOTE(review): ``temp_server()`` is entered but its value ``test`` is never
    used -- every request goes through ``empty_server``.  Confirm whether the
    context manager is needed only for setup/teardown side effects.
    """
    # add data
    with temp_server() as test:
        iris_path = example('iris.csv')
        # Serialize the name->resource mapping with the fixture's serializer.
        blob = serial.dumps({'iris': iris_path})
        response1 = empty_server.post(
            '/add',
            headers=mimetype(serial),
            data=blob,
        )
        assert 'OK' in response1.status
        assert response1.status_code == 200

        # check for expected server datashape
        response2 = empty_server.get('/datashape')
        expected2 = str(discover({'iris': resource(iris_path)}))
        assert response2.data.decode('utf-8') == expected2

        # compute on added data
        t = Data({'iris': resource(iris_path)})
        expr = t.iris.petal_length.sum()

        response3 = empty_server.post(
            '/compute',
            data=serial.dumps({'expr': to_tree(expr)}),
            headers=mimetype(serial)
        )

        # The server's answer must match computing the same expression locally.
        result3 = serial.loads(response3.data)['data']
        expected3 = compute(expr, {'iris': resource(iris_path)})
        assert result3 == expected3
def test_interactive_dshape_works():
    """An explicit dshape passed to Data round-trips onto the interactive object."""
    ds = 'var * {name: string, amount: int64}'
    try:
        d = Data('mongodb://localhost:27017/test_db::bank', dshape=ds)
    except pymongo.errors.ConnectionFailure:
        # No reachable mongo instance; not a failure of the code under test.
        pytest.skip('No mongo server running')
    assert d.dshape == dshape(ds)
Example #8
0
def test_into_cds_mixed():
    """Round-trip a mixed-dtype (unicode strings + ints) CSV into a Bokeh ColumnDataSource."""
    pytest.importorskip('bokeh')
    from bokeh.objects import ColumnDataSource

    nrows = 25
    frame = pd.DataFrame({
        'first': np.random.choice(list('abc'), size=nrows),
        'second': np.random.choice(['cachaça', 'tres leches', 'pizza'],
                                   size=nrows),
        'third': list(range(nrows)),
    })
    with tmpfile('.csv') as fn:
        frame.to_csv(fn, header=None, index=False, encoding='utf8')
        csv = CSV(fn, columns=['first', 'second', 'third'], encoding='utf8')
        t = Data(csv)

        # Whole table: all three columns survive conversion.
        cds = into(ColumnDataSource, t)
        assert isinstance(cds, ColumnDataSource)
        assert cds.data == {k: into(list, csv[:, k])
                            for k in ['first', 'second', 'third']}

        # Two-column projection.
        cds = into(ColumnDataSource, t[['first', 'second']])
        assert isinstance(cds, ColumnDataSource)
        assert cds.data == {k: into(list, csv[:, k])
                            for k in ['first', 'second']}

        # Single column.
        cds = into(ColumnDataSource, t['first'])
        assert isinstance(cds, ColumnDataSource)
        assert cds.data == {'first': into(list, csv[:, 'first'])}
def test_interactive_dshape_works(mongo_host_port):
    """The dshape supplied to Data is preserved on the interactive object."""
    ds = 'var * {name: string, amount: int64}'
    uri = 'mongodb://{}:{}/test_db::bank'.format(*mongo_host_port)
    try:
        d = Data(uri, dshape=ds)
    except pymongo.errors.ConnectionFailure:
        pytest.skip('No mongo server running')
    assert d.dshape == dshape(ds)
def test_url_csv_data(iris_local):
    """A Data object over a remote CSV URL computes to the same frame as the local copy."""
    remote = Data(iris_url)
    # The backend should be recognized as a URL-wrapped CSV resource.
    assert isinstance(remote.data, URL(CSV))

    remote_frame = compute(remote)
    assert isinstance(remote_frame, pd.DataFrame)
    tm.assert_frame_equal(remote_frame, compute(iris_local))
Example #11
0
def test_series_single_column_projection():
    """into(pd.Series, ...) of a one-column projection matches the raw column."""
    rows = [('Alice', -200.0, 1), ('Bob', -300.0, 2)]
    t = Data(rows, '2 * {name: string, amount: float64, id: int64}')

    series = into(pd.Series, t[['name']])
    assert isinstance(series, pd.Series)

    # Build the expected Series straight from the raw rows and compare reprs.
    expected = pd.DataFrame(rows, columns=t.schema.measure.names).name
    assert str(series) == str(expected)
 def test_detect_types(self):
     """al.types() classifies columns as configured for every dataset/URI."""
     for dataset in config['datasets']:
         for uri in dataset['uris']:
             detected = al.types(Data(uri))
             self.check_type(detected, dataset['types'], dataset['table'])
         print('for %s on %s' %
               (dataset['table'], getengine(dataset['uris'])))
 def test_numeric_cols(self):
     """get_numeric_cols finds exactly the configured numeric columns."""
     for dataset in config['datasets']:
         for uri in dataset['uris']:
             numeric = al.get_numeric_cols(Data(uri).dshape)
             eq_(set(numeric), set(dataset['types']['numbers']))
         print('for %s on %s' %
               (dataset['table'], getengine(dataset['uris'])))
 def test_gains(self):
     """groupmeans reproduces the expected gain for every dataset and URI."""
     for dataset in config['datasets']:
         for uri in dataset['uris']:
             handle = Data(uri)
             detected = al.types(handle)
             gains = al.groupmeans(handle, detected['groups'],
                                   detected['numbers'])
             self.check_gain(gains, dataset['groupmeans']['gain'], uri)
         print('for %s on %s' %
               (dataset['table'], getengine(dataset['uris'])))
Example #15
0
 def to_blaze(self):
     """Rebuild this object's blaze expression bound to a server-backed leaf."""
     from blaze.server.client import Client
     from blaze.server import from_tree
     from blaze import Data

     # hacky - blaze urls have `compute.json` in it, but we need to strip it off
     # to feed it into the blaze client lib
     base_url = self.data_url.rsplit('compute.json', 1)[0]
     leaf = Data(Client(base_url))
     return from_tree(self.expr, {':leaf': leaf})
 def test_stats(self):
     """crosstabs output matches the expected crosstab frame for each dataset."""
     for dataset in config['datasets']:
         # Expected frame and group list depend only on the dataset, so hoist
         # them out of the per-URI loop.
         groups = dataset['types']['groups']
         expected = pd.DataFrame(dataset['crosstabs'])
         for uri in dataset['uris']:
             result = al.crosstabs(Data(uri), groups, details=False)
             self.check_stats(result, expected, uri)
         print('for %s on %s' %
               (dataset['table'], getengine(dataset['uris'])))
Example #17
0
def test_expr_client_interactive():
    """Interactive expressions against a remote Client server compute correctly."""
    t = Data(Client('localhost:6363'))

    assert compute(t.accounts.name) == ['Alice', 'Bob']

    # Group amounts by name; min == max here because each name has one row.
    grouped = by(t.accounts.name,
                 min=t.accounts.amount.min(),
                 max=t.accounts.amount.max())
    assert into(set, compute(grouped)) == set([('Alice', 100, 100),
                                               ('Bob', 200, 200)])
 def test_gains_changed_types(self):
     """Regression test for issue #24: groupmeans honors overridden column types."""
     for dataset in config['datasets']:
         # Only datasets that declare type overrides participate.
         if 'changedtypes' not in dataset:
             continue
         for uri in dataset['uris']:
             overrides = dataset['changedtypes']
             result = al.groupmeans(Data(uri), overrides['groups'],
                                    overrides['numbers'])
             self.check_gain(result, overrides['groupmeans']['gain'], uri)
         print('for %s on %s' %
               (dataset['table'], getengine(dataset['uris'])))
Example #19
0
def test_expressions():
    """A projection built on each source converts correctly to each target type."""
    skip_sources = [nd.array, CSV, Data]
    skip_targets = [Data, CSV, Collection, nd.array, PyTable, SQL]
    sources = [v for k, v in data if k not in skip_sources]
    targets = [v for k, v in no_date if k not in skip_targets]

    for src in sources:
        for tgt in targets:
            # Wrap the raw source interactively, then project three fields.
            expr = Data(
                src,
                "var * {amount: int64, id: int64, name: string, timestamp: datetime}"
            )[['amount', 'id', 'name']]
            assert normalize(into(type(tgt), expr)) == normalize(tgt)
def test_pre_compute_on_multiple_datasets_is_selective():
    """Computing on one dataset of a CachedDataset yields that dataset's result."""
    from odo import CSV
    from blaze import Data
    from blaze.cached import CachedDataset

    rows = [[1, 'Alice', 100],
            [2, 'Bob', -200],
            [3, 'Charlie', 300],
            [4, 'Denis', 400],
            [5, 'Edith', -500]]
    df = pd.DataFrame(rows, columns=['id', 'name', 'amount'])

    # Two datasets side by side: an in-memory frame and a CSV-backed one.
    dset = CachedDataset({'df': df, 'iris': CSV(example('iris.csv'))})

    d = Data(dset)
    assert str(compute(d.df.amount)) == str(df.amount)
def setUpModule():
    """Download test data files into data/ target folder.

    Then register the configured database URLs that are reachable and load
    each downloaded dataset into those databases, so tests can run against
    both CSV files and database tables.
    """

    # Set root logger logging level to INFO
    logging.basicConfig(level=logging.INFO)

    # Download datasets
    for dataset in config['datasets']:
        dataset['path'] = os.path.join(DATA_DIR, dataset['table'] + '.csv')
        dataset['uris'] = [dataset['path']]
        if not os.path.exists(dataset['path']):
            logging.info('Downloading %s', dataset['table'])
            # Round-trip through pandas so the saved CSV is normalized,
            # keeping the cp1252 encoding of the source.
            pd.read_csv(dataset['url'],
                        encoding='cp1252').to_csv(dataset['path'],
                                                  index=False,
                                                  encoding='cp1252')

    # Create autolysis databases (sqlite3 data directory is DATA_DIR)
    os.chdir(DATA_DIR)
    dburl = {}
    for db, url in config['databases'].items():
        # Skip databases whose server is not reachable right now.
        if not server_exists(url):
            continue
        dburl[db] = url

    # Load datasets into databases
    for dataset in config['datasets']:
        for db in dataset.get('databases', []):
            if db not in dburl:
                logging.warning('%s cannot use unconfigured DB %s',
                                dataset['table'], db)
                continue
            url = dburl[db]

            # Don't load data if a non-empty table already exists
            target = dburl[db] + '::' + dataset['table']
            dataset['uris'].append(target)
            engine = sa.create_engine(url, encoding='utf-8')
            if engine.dialect.has_table(engine.connect(), dataset['table']):
                if Data(target).count() > 0:
                    continue
            logging.info('Creating table %s on %s', dataset['table'], db)
            try:
                odo(dataset['path'], target)
            except sa.exc.InternalError:
                # Best-effort load: log and continue so the remaining
                # datasets/databases still get populated.
                logging.warning('Loading %s into %s failed: %s',
                                dataset['table'], db, traceback.format_exc(0))
Example #22
0
def test_add_data_to_server(serial):
    """POSTing /add extends a running server's datashape and makes the new
    dataset computable.

    Checks, in order: the /add request succeeds, /datashape gains the new
    'iris' field, and a sum computed server-side matches a local compute.
    (Also drops an unused ``pprint`` import left in the original.)
    """
    with temp_server(data) as test:
        # Snapshot the datashape before adding anything so we can prove it changed.
        initial_datashape = datashape.dshape(test.get('/datashape').data.decode('utf-8'))
        iris_path = example('iris.csv')
        blob = serial.dumps({'iris': iris_path})
        response1 = test.post(
            '/add',
            headers=mimetype(serial),
            data=blob,
        )
        assert 'OK' in response1.status
        assert response1.status_code == 200

        # check for expected server datashape
        new_datashape = datashape.dshape(test.get('/datashape').data.decode('utf-8'))
        data2 = data.copy()
        data2.update({'iris': resource(iris_path)})
        expected2 = datashape.dshape(discover(data2))
        assert_dshape_equal(new_datashape, expected2)
        assert new_datashape.measure.fields != initial_datashape.measure.fields

        # compute on added data
        t = Data({'iris': resource(iris_path)})
        expr = t.iris.petal_length.sum()

        response3 = test.post(
            '/compute',
            data=serial.dumps({'expr': to_tree(expr)}),
            headers=mimetype(serial)
        )

        # The server's result must agree with computing the expression locally.
        result3 = serial.loads(response3.data)['data']
        expected3 = compute(expr, {'iris': resource(iris_path)})
        assert result3 == expected3
Example #23
0
# Script: render a synthetic 2-D image through a Bokeh server, with the
# pixel data served by a blaze server and downsampled server-side.
# NOTE(review): relies on `np` (numpy) being imported earlier in the file.
from bokeh.plotting import figure, show, output_server
from bokeh.transforms import image_downsample
from blaze.server.client import Client
from blaze import Data

# Resolution of the synthetic image (N x N grid).
N = 1000

x = np.linspace(0, 10, N)
y = np.linspace(0, 10, N)
xx, yy = np.meshgrid(x, y)
d = np.sin(xx) * np.cos(yy)

output_server("remote_image")

# NOTE(review): `d` is rebound here -- from the local numpy array above to a
# blaze Data handle on the server; the plot reads the server-side `array`.
c = Client('http://localhost:5006')
d = Data(c)
source = image_downsample.source()
source.from_blaze(d.array, local=True)

plot = figure(
    x_range=[0, 10],
    y_range=[0, 10],
)
plot.image(source=source,
           image="image",
           x="x",
           y="y",
           dw="dw",
           dh="dh",
           palette="Spectral11",
           tools="pan,wheel_zoom,box_zoom,reset,previewsave")
def test_interactive_len(sql):
    """len() of an interactive Data object agrees with its count() expression."""
    interactive = Data(sql)
    assert len(interactive) == int(interactive.count())
def test_Data_construct_with_table(bank):
    """Data over a mongo collection URI exposes its fields and row count."""
    coll = Data('mongodb://localhost/test_db::bank')
    assert set(coll.fields) == {'name', 'amount'}
    assert int(coll.count()) == 5
def test_Data_construct(bank, points):
    """Data over a mongo database URI lists its collections as record fields."""
    db = Data('mongodb://localhost/test_db')
    for collection in ('bank', 'points'):
        assert collection in db.fields
    assert isinstance(db.dshape.measure, Record)
def test_Data_construct_with_table(bank, mongo_host_port):
    """A parameterized mongo collection URI exposes the expected fields and size."""
    coll = Data('mongodb://{}:{}/test_db::bank'.format(*mongo_host_port))
    assert set(coll.fields) == {'name', 'amount'}
    assert int(coll.count()) == 5
def test_Data_construct(bank, points, mongo_host_port):
    """A parameterized mongo database URI lists its collections; measure is a Record."""
    db = Data('mongodb://{}:{}/test_db'.format(*mongo_host_port))
    for collection in ('bank', 'points'):
        assert collection in db.fields
    assert isinstance(db.dshape.measure, Record)
Example #29
0
# Script: load previously captured tweets, inspect them through blaze, and
# export them to JSON lines via odo.
# NOTE(review): relies on `os`, `logger`, `pd`, `IO_twitter`, `Data`, and
# `odo` being imported/defined earlier in the file.
import IO

dataPath = os.getcwd() + '/data'

# Reload the raw twitter JSON captured earlier and parse it into records.
twts_data = IO.IO_json(dataPath, 'twitterData').load()
logger.debug('Type of loaded json is %s' % str(type(twts_data)))
twts_read = IO_twitter.TwitterAPI().parseTweets(twts_data)

fields = ['id', 'created_at', 'user_id', 'user_name', 'tweet_text', 'url']

twts_pd_df = pd.DataFrame(twts_read, columns=fields)

twts_pd_df.head()
twts_pd_df.describe()

# Wrap the pandas frame in an interactive blaze Data object.
twts_bz_df = Data(twts_pd_df)

twts_bz_df.schema
twts_bz_df.dshape

twts_bz_df.data

tweet_text_distinct = twts_bz_df.tweet_text.distinct()
tweet_text_distinct

twts_bz_df[['id', 'user_name', 'tweet_text']].distinct()

twts_odo_df = Data(twts_pd_df)
# print() call works on both Python 2 and 3 (was a py2-only print statement).
print(twts_odo_df.dshape)

odo(twts_bz_df, 'jsonlines://{}/{}.json'.format(dataPath, 'jsonData'))
Example #30
0
def test_into_list_Column():
    """A single interactive column converts into a plain Python list."""
    with filetext('Alice,1\nBob,2') as fn:
        source = CSV(fn, columns=['name', 'id'])
        interactive = Data(source)
        assert into(list, interactive.name) == ['Alice', 'Bob']
Example #31
0
def test_into_numpy_from_tableexpr_with_option_types():
    """Option ints and fixed-width ascii strings map to the expected numpy dtype."""
    t = Data([[1, 'Alice'], [2, 'Bob']],
             '2 * {id: ?int32, name: string[5, "ascii"]}')
    expected_dtype = np.dtype([('id', 'i4'), ('name', 'S5')])
    assert into(np.ndarray, t).dtype == expected_dtype
def test_Data_construct_with_table(bank, mongo_host_port):
    """A mongo collection URI yields a Data object with the right fields and count."""
    d = Data('mongodb://{}:{}/test_db::bank'.format(*mongo_host_port))
    expected_fields = {'name', 'amount'}
    assert set(d.fields) == expected_fields
    assert int(d.count()) == 5
Example #33
0
def test_into_csv_blaze_table(good_csv):
    """Projecting two columns of a CSV-backed Data yields a frame with just those columns."""
    interactive = Data(CSV(good_csv))
    frame = into(pd.DataFrame, interactive[['userid', 'text']])
    assert list(frame.columns) == ['userid', 'text']
Example #34
0
def test_Data_construct_with_table(bank):
    """A localhost mongo collection URI exposes 'name'/'amount' over 5 rows."""
    d = Data('mongodb://localhost/test_db::bank')
    assert set(d.fields) == {'name', 'amount'}
    assert int(d.count()) == 5