def create_plot(team="LAA", year=2012):
    expr = bz.by(db.Salaries.teamID,
                 avg=db.Salaries.salary.mean(),
                 max=db.Salaries.salary.max(),
                 ratio=db.Salaries.salary.max() / db.Salaries.salary.min())
    expr = expr.sort('ratio', ascending=False)

    df_salary_gb = into(pd.DataFrame, expr)
    source1 = into(ColumnDataSource, df_salary_gb[["teamID", "avg"]])

    plot1 = plt.figure(title="Salary ratio by team",
                       x_range=list(df_salary_gb["teamID"]))
    plot1.scatter(x="teamID", y="avg", source=source1, size=20)
    plot1.xaxis.major_label_orientation = np.pi / 3

    df = into(pd.DataFrame, db.Salaries)
    df = df[df["teamID"] == team]
    df = df[df["yearID"] == year]
    df = df[["playerID", "salary"]].sort('salary')
    source_team = into(ColumnDataSource, df)

    p_team = plt.figure(title="Salary of players for %s during %s" % (team, year),
                        x_range=list(df["playerID"]))  # , tools=TOOLS)
    p_team.scatter(x="playerID", y="salary", source=source_team, size=20)
    p_team.xaxis.major_label_orientation = np.pi / 3

    p = plt.gridplot([[plot1, p_team]])
    return p
def concrete_head(expr, n=10):
    """ Return head of computed expression """
    if not expr._resources():
        raise ValueError("Expression does not contain data resources")
    if not iscollection(expr.dshape):
        return compute(expr)

    head = expr.head(n + 1)

    if isrecord(expr.dshape.measure):
        return into(DataFrame, head)
    else:
        df = into(DataFrame, head)
        df.columns = [expr._name]
        return df
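
# A hedged usage sketch for concrete_head (not part of the original module):
# the data, names, and `Data` import below are illustrative assumptions.
# Note that concrete_head deliberately fetches n + 1 rows so callers can
# tell whether the underlying data was truncated.
def _example_concrete_head():
    from blaze import Data
    t = Data([(1, 'Alice'), (2, 'Bob'), (3, 'Charlie')],
             dshape='var * {id: int64, name: string}')
    df = concrete_head(t, n=1)              # DataFrame with up to 2 rows
    assert list(df.columns) == ['id', 'name']
    assert concrete_head(t.id.sum()) == 6   # scalar exprs compute directly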
def test_failing_argument():
    tbl = 'testtable_into_2'
    csv = CSV(file_name)
    sql = resource(url, tbl, dshape=ds)
    into(sql, csv, skipinitialspace="alpha")  # failing call
def test_varlen_dtypes():
    y = np.array([('Alice', 100), ('Bob', 200)],
                 dtype=[('name', 'O'), ('amount', 'i4')])
    with tmpfile('.hdf5') as fn:
        dset = into(fn + '::/data', y)
        assert into(list, dset) == into(list, y)
def test_simple_into(tbl):
    csv = CSV(file_name)
    sql = resource(url, tbl, dshape=ds)
    into(sql, csv, dshape=ds)
    assert into(list, sql) == data
def test_no_header_no_columns(tbl):
    csv = CSV(file_name)
    sql = resource(url, tbl, dshape=ds)
    into(sql, csv, dshape=ds)
    assert into(list, sql) == data
def test_failing_argument():
    tbl = 'testtable_into_2'
    csv = CSV(file_name, columns=['a', 'b'])
    sql = resource(url + '::' + tbl, dshape=csv.dshape)
    into(sql, csv, if_exists="replace", skipinitialspace="alpha")  # failing call
def test_no_header_no_columns():
    tbl = 'testtable_into_2'
    csv = CSV(file_name)
    sql = resource(url + '::' + tbl, dshape=csv.dshape)
    into(sql, csv, if_exists="replace")
    assert into(list, sql) == [(1, 2), (10, 20), (100, 200)]
def test_simple_into():
    tbl = 'testtable_into_2'
    csv = CSV(file_name)
    sql = resource(url, tbl, dshape=ds)
    into(sql, csv, dshape=ds)
    assert into(list, sql) == data
def eq(a, b):
    if isinstance(a, pd.DataFrame):
        a = into(np.ndarray, a)
    if isinstance(b, pd.DataFrame):
        b = into(np.ndarray, b)
    c = a == b
    if isinstance(c, np.ndarray):
        c = c.all()
    return c
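
# A minimal sketch of how the `eq` helper above is meant to be used
# (hypothetical data, not a fixture from this module): it normalizes pandas
# DataFrames to ndarrays via `into` so results held in mixed container
# types compare cleanly.
def _example_eq():
    df = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
    arr = into(np.ndarray, df)
    assert eq(df, arr)   # both sides coerce to the same ndarray
    assert not eq(df['a'].values, np.array([9, 9]))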
def test_table_resource():
    with tmpfile('csv') as filename:
        ds = dshape('var * {a: int, b: int}')
        csv = CSV(filename)
        append(csv, [[1, 2], [10, 20]], dshape=ds)

        t = Data(filename)
        assert isinstance(t.data, CSV)
        assert into(list, compute(t)) == into(list, csv)
def test_simple_into():
    tbl = 'testtable_into_2'
    csv = CSV(file_name, columns=['a', 'b'])
    sql = resource(url + '::' + tbl, dshape=csv.dshape)
    into(sql, csv, if_exists="replace")
    assert into(list, sql) == [(1, 2), (10, 20), (100, 200)]
def test_simple_float_into():
    tbl = 'testtable_into_float'
    csv = CSV(file_name_floats, columns=['a', 'b'])
    sql = resource(url + '::' + tbl, dshape=csv.dshape)
    into(sql, csv, if_exists="replace")
    assert into(list, sql) == \
        [(1.02, 2.02), (102.02, 202.02), (1002.02, 2002.02)]
def test_data_on_iterator_reifies_data():
    data = [1, 2, 3]
    d = Data(iter(data))

    assert into(list, d) == data
    assert into(list, d) == data

    # in context
    with Data(iter(data)) as d:
        assert d is not None
def test_tryexcept_into():
    tbl = 'testtable_into_2'
    csv = CSV(file_name)
    sql = resource(url, tbl, dshape=ds)
    into(sql, csv, quotechar="alpha")  # uses multi-byte character and
                                       # fails over to using sql.extend()
    assert into(list, sql) == data
def test_datetimes():
    from into import into
    import numpy as np
    data = [{'a': 1, 'dt': datetime.datetime(2001, 1, 1)},
            {'a': 2, 'dt': datetime.datetime(2002, 2, 2)}]
    with tmpfile('json') as fn:
        j = JSONLines(fn)
        append(j, data)
        assert str(into(np.ndarray, j)) == str(into(np.ndarray, data))
def test_into_sqlite():
    data = [('Alice', 100), ('Bob', 200)]
    ds = datashape.dshape('var * {name: string, amount: int}')

    with tmpfile('.db') as dbpath:
        with tmpfile('.csv') as csvpath:
            csv = into(csvpath, data, dshape=ds, has_header=False)
            sql = resource('sqlite:///%s::mytable' % dbpath, dshape=ds)
            append_csv_to_sql_table(sql, csv)
            assert into(list, sql) == data
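
# A hedged variant of the round trip above, letting a single `into` call
# drive the data -> SQLite path directly rather than staging through CSV by
# hand (the table name and data here are illustrative):
def _example_into_sqlite_direct():
    data = [('Alice', 100), ('Bob', 200)]
    ds = datashape.dshape('var * {name: string, amount: int}')
    with tmpfile('.db') as dbpath:
        sql = into('sqlite:///%s::mytable' % dbpath, data, dshape=ds)
        assert into(list, sql) == data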
def test_tryexcept_into():
    tbl = 'testtable_into_2'
    csv = CSV(file_name, columns=['a', 'b'])
    sql = resource(url + '::' + tbl, dshape=csv.dshape)
    into(sql, csv, if_exists="replace",
         QUOTE="alpha", FORMAT="csv")  # uses multi-byte character and
                                       # fails over to using sql.extend()
    assert into(list, sql) == [(1, 2), (10, 20), (100, 200)]
def test_month():
    dts = [datetime(2000, 7, 1), datetime(2000, 6, 30),
           datetime(2000, 6, 1), datetime(2000, 5, 31)]
    dts = into(np.ndarray, dts)
    assert eq(compute(s.truncate(1, 'month'), dts),
              into(np.ndarray, [date(2000, 7, 1), date(2000, 6, 1),
                                date(2000, 6, 1), date(2000, 5, 1)]))
def test_copy_remote_csv():
    with tmpfile('csv') as target:
        with filetext('name,balance\nAlice,100\nBob,200',
                      extension='csv') as fn:
            csv = resource(fn)
            scsv = into('ssh://localhost:foo.csv', csv)
            assert isinstance(scsv, SSH(CSV))
            assert discover(scsv) == discover(csv)

            # Round trip
            csv2 = into(target, scsv)
            assert into(list, csv) == into(list, csv2)
def test_hour():
    dts = [datetime(2000, 6, 20, 1, 00, 00),
           datetime(2000, 6, 20, 12, 59, 59),
           datetime(2000, 6, 20, 12, 00, 00),
           datetime(2000, 6, 20, 11, 59, 59)]
    dts = into(np.ndarray, dts)
    assert eq(compute(s.truncate(1, 'hour'), dts),
              into(np.ndarray, [datetime(2000, 6, 20, 1, 0),
                                datetime(2000, 6, 20, 12, 0),
                                datetime(2000, 6, 20, 12, 0),
                                datetime(2000, 6, 20, 11, 0)]))
def test_pandas_csv_naive_behavior_results_in_columns():
    df = pd.DataFrame([[1, 'Alice', 100],
                       [2, 'Bob', -200],
                       [3, 'Charlie', 300],
                       [4, 'Denis', 400],
                       [5, 'Edith', -500]],
                      columns=['id', 'name', 'amount'])
    with tmpfile('.csv') as fn:
        os.remove(fn)
        into(fn, df)

        with open(fn) as f:
            assert next(f).strip() == 'id,name,amount'
def csvs(n=3):
    path = tempfile.mktemp()
    os.mkdir(path)

    fns = [os.path.join(path, 'file_%d.csv' % i) for i in range(n)]

    for i, fn in enumerate(fns):
        into(fn, [{'a': i, 'b': j} for j in range(5)])

    try:
        yield path + os.path.sep
    finally:
        shutil.rmtree(path)
def test_complex_into():
    # data from: http://dummydata.me/generate
    this_dir = os.path.dirname(__file__)
    file_name = os.path.join(this_dir, 'dummydata.csv')
    tbl = 'testtable_into_complex'

    ds = dshape('var * {Name: string, RegistrationDate: date, '
                'ZipCode: int32, Consts: float64}')

    csv = CSV(file_name, has_header=True)
    sql = resource(url, tbl, dshape=ds)
    into(sql, csv)

    assert_allclose(into(list, sql), into(list, csv))
def compute_up(t, df, **kwargs):
    grouper = get_grouper(t, t.grouper, df)
    result = compute_by(t, t.apply, grouper, df)
    result2 = post_compute_by(t.apply, into(DataFrame, result))
    if isinstance(result2, DataFrame):
        result2.columns = t.fields
    return result2
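
# A hedged sketch of the grouped-computation path this dispatch implements
# (the symbol, data, and expression below are illustrative, not fixtures
# from this module):
def _example_by_on_dataframe():
    from blaze import symbol, by, compute
    import pandas as pd
    t = symbol('t', 'var * {name: string, amount: int64}')
    df = pd.DataFrame([['Alice', 100], ['Bob', 200], ['Alice', 50]],
                      columns=['name', 'amount'])
    result = compute(by(t.name, total=t.amount.sum()), df)
    assert list(result.columns) == ['name', 'total']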
def test_first_csv_establishes_consistent_dshape():
    d = {'accounts1.csv': 'name,when\nAlice,one\nBob,two',
         'accounts2.csv': 'name,when\nAlice,300\nBob,400'}
    with filetexts(d) as fns:
        L = into(list, 'accounts*.csv')
        assert len(L) == 4
        assert all(isinstance(val, (str, unicode)) for name, val in L)
def test_complex_into():
    # data from: http://dummydata.me/generate
    this_dir = os.path.dirname(__file__)
    file_name = os.path.join(this_dir, 'dummydata.csv')
    tbl = 'testtable_into_complex'

    csv = CSV(file_name,
              schema='{Name: string, RegistrationDate: date, '
                     'ZipCode: int64, Consts: float64}')
    sql = resource(url + '::' + tbl, dshape=csv.dshape)
    into(sql, csv, if_exists="replace")

    df = pd.read_csv(file_name, parse_dates=['RegistrationDate'])

    assert into(list, sql) == into(list, csv)
def test_resource_existing_ctable():
    with tmpfile('.bcolz') as fn:
        r = into(fn, y)
        r.flush()

        r2 = resource(fn)
        assert eq(r2[:], y)
def test_simple_into(csv):
    tbl = 'testtable'
    with tmpfile('db') as filename:
        engine = sqlalchemy.create_engine('sqlite:///' + filename)
        t = resource('sqlite:///' + filename + '::' + tbl, dshape=ds)

        into(t, csv, dshape=ds)
        conn = engine.raw_connection()
        cursor = conn.cursor()
        cursor.execute("SELECT name FROM sqlite_master "
                       "WHERE type='table' and name='{0}';".format(tbl))

        sqlite_tbl_names = cursor.fetchall()
        assert sqlite_tbl_names[0][0] == tbl
        assert into(list, t) == data
def test_sparksql_with_literals():
    srdd = into(sqlContext, data, schema=t.schema)
    expr = t[t.amount >= 100]
    result = compute(expr, srdd)
    assert isinstance(result, SchemaRDD)
    assert set(map(tuple, result.collect())) == \
        set(map(tuple, compute(expr, data)))
def test_into_sparksql_from_other():
    srdd = into(sqlContext, df)
    assert isinstance(srdd, SchemaRDD)
    assert into(list, srdd) == into(list, df)
def test_into_SparkSQL_from_PySpark():
    srdd = into(sqlContext, data, schema=t.schema)
    assert isinstance(srdd, SchemaRDD)
    assert into(list, rdd) == into(list, srdd)
from datashape.predicates import isscalar, iscollection, isrecord
from blaze.expr import symbol, by
from blaze.interactive import Data
from blaze.compute import compute
from blaze.expr.functions import sin, exp

sources = []

t = symbol('t', 'var * {amount: int64, id: int64, name: string}')

L = [[100, 1, 'Alice'],
     [200, 2, 'Bob'],
     [300, 3, 'Charlie'],
     [400, 4, 'Dan'],
     [500, 5, 'Edith']]

df = DataFrame(L, columns=['amount', 'id', 'name'])

x = into(np.ndarray, df)

sources = [df, x]

try:
    import sqlalchemy
    sql = resource('sqlite:///:memory:::accounts', dshape=t.dshape)
    into(sql, L)
    sources.append(sql)
except:
    sql = None

try:
    import bcolz
    bc = into(bcolz.ctable, df)
    sources.append(bc)
except ImportError:
    bc = None
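
# A hedged sketch of how `sources` is typically consumed downstream (the
# expression here is illustrative): evaluate the same expression against
# every available backend and check that the answers agree.
def _example_backend_consistency():
    expr = t.amount.sum()
    results = [compute(expr, s) for s in sources]
    assert len(set(int(r) for r in results)) == 1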
def test_chunks():
    assert len(list(chunks(b, chunksize=2))) == 2
    assert (next(chunks(b, chunksize=2)) == into(np.ndarray, b)[:2]).all()
def test_tryexcept_into(tbl):
    csv = CSV(file_name)
    sql = resource(url, tbl, dshape=ds)
    into(sql, csv, quotechar="alpha")  # uses multi-byte character
    assert into(list, sql) == data
def compute_down(expr, data, **kwargs):
    # If every node between the leaf and the root is Cheap (e.g. head or
    # an element-wise operation), stream the data as a Python iterator
    # rather than loading it wholesale; otherwise defer to another backend.
    leaf = expr._leaves()[0]
    if all(isinstance(e, Cheap) for e in path(expr, leaf)):
        return compute(expr, {leaf: into(Iterator, data)}, **kwargs)
    else:
        raise MDNotImplementedError()
def test_into_resource():
    with tmpfile('.hdf5') as fn:
        d = into(fn + '::/x', x)
        assert d.shape == x.shape
        assert eq(d[:], x[:])
def intonumpy(data, dtype=None, **kwargs):
    # TODO: Don't ignore other kwargs like copy
    result = into(np.ndarray, data)
    if dtype and result.dtype != dtype:
        result = result.astype(dtype)
    return result
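
# A hedged usage sketch for intonumpy (illustrative data; the `Data`
# constructor import is an assumption, not part of the original module).
# intonumpy backs numpy's __array__ protocol for expressions, and an
# explicit dtype coerces the result.
def _example_intonumpy():
    from blaze import Data
    d = Data([1, 2, 3], dshape='3 * int64')
    arr = intonumpy(d, dtype=np.float64)
    assert arr.dtype == np.float64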
def into(a, b, **kwargs):
    # Overload of `into` for expressions (presumably registered via
    # @dispatch in the original module): compute the expression first, then
    # re-dispatch `into` on the concrete result.
    result = compute(b, **kwargs)
    kwargs['dshape'] = b.dshape
    return into(a, result, **kwargs)
Expr.__repr__ = expr_repr
Expr._repr_html_ = lambda x: to_html(x)
Expr.__len__ = table_length


def intonumpy(data, dtype=None, **kwargs):
    # TODO: Don't ignore other kwargs like copy
    result = into(np.ndarray, data)
    if dtype and result.dtype != dtype:
        result = result.astype(dtype)
    return result


def convert_base(typ, x):
    x = compute(x)
    try:
        return typ(x)
    except:
        return typ(into(typ, x))

Expr.__array__ = intonumpy
Expr.__int__ = lambda x: convert_base(int, x)
Expr.__float__ = lambda x: convert_base(float, x)
Expr.__complex__ = lambda x: convert_base(complex, x)
Expr.__bool__ = lambda x: convert_base(bool, x)
Expr.__nonzero__ = lambda x: convert_base(bool, x)
Expr.__iter__ = into(Iterator)
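
# A hedged sketch of what the monkey-patching above enables: interactive
# expressions now satisfy the standard Python and numpy coercion protocols.
# The data and `Data` import below are illustrative assumptions.
def _example_expr_protocols():
    from blaze import Data
    amounts = Data([100, 200, 300], dshape='3 * int64')
    assert int(amounts.sum()) == 600                      # __int__
    assert list(amounts) == [100, 200, 300]               # __iter__
    assert np.array(amounts).tolist() == [100, 200, 300]  # __array__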
def test_movie(hdf_filename, base_output_name, ncols=None, interval=None,
               max_blockshape=(1e5, 100)):
    f = h5py.File(hdf_filename, 'r')
    img_shape = np.array(f['img_shape'], dtype=np.int)
    f.close()

    m = min(max_blockshape[0], reduce(mul, img_shape))
    if interval is not None:
        n = min(max_blockshape[1], -reduce(sub, interval))
    else:
        n = max_blockshape[1]
    m = int(m)
    n = int(n)

    data = into(Array, hdf_filename + '::/data', blockshape=(m, n))
    if interval is not None:
        data = data[:, interval[0]:interval[1]]
    data = np.array(data)

    if ncols is None:
        ncols = data.shape[1] / 120

    print(data.shape, ncols, m, n)

    t = timeit.default_timer()
    cols, mat_h, error = csnmf.snmf.compute(data, ncols, 'SPA', compress=True)
    t = timeit.default_timer() - t
    print(error)

    data = np.array(data)
    error = mrnmf.nnls_frob(data, cols)[1]

    def argsort(seq):
        return sorted(range(len(seq)), key=seq.__getitem__)

    cols_order = argsort(cols)
    cols = sorted(cols)
    mat_h = mat_h[cols_order, :]

    res_dict = {'cols': cols, 'error': error, 'time': t}
    base_str = 'error {error:.4f}; time {time:.2f}; cols {cols}'
    print(base_str.format(**res_dict))

    if interval is not None and ncols <= 10:
        colors = ['#a6cee3', '#1f78b4', '#b2df8a', '#33a02c', '#fb9a99',
                  '#e31a1c', '#fdbf6f', '#ff7f00', '#cab2d6', '#6a3d9a']
        cmap = ListedColormap(colors)

        fourcc = cv2.cv.CV_FOURCC(*'mp4v')
        out = cv2.VideoWriter(base_output_name + '.avi', fourcc, 8.0,
                              (img_shape[1], img_shape[0]), True)

        max_val = np.argmax(mat_h, axis=0)
        for i in range(data.shape[1]):
            img = np.reshape(data[:, i], img_shape) * 255
            img = img.astype(np.uint8)
            norm_idx = float(max_val[i]) / ncols
            c = map(lambda x: int(x * 255), cmap(norm_idx))[::-1]
            cv2.rectangle(img, (img_shape[1] - 50, img_shape[0] - 50),
                          (img_shape[1], img_shape[0]), c, cv2.cv.CV_FILLED)
            out.write(img)
        out.release()

        border_width = 40
        arrangement = int(math.ceil(math.sqrt(ncols)))
        plt.figure()
        for i, c in enumerate(cols):
            img = np.reshape(data[:, c], img_shape)
            norm_idx = float(i) / ncols
            ax = plt.subplot(arrangement, arrangement, i + 1,
                             axisbg=cmap(norm_idx))
            ax.imshow(img, aspect='equal', origin='lower',
                      extent=(border_width, img_shape[1] - border_width,
                              border_width, img_shape[0] - border_width))
            ax.imshow(img, alpha=0)
            ax.get_xaxis().set_visible(False)
            ax.get_yaxis().set_visible(False)
        plt.tight_layout()
        plt.savefig(base_output_name + '_representatives.pdf', dpi=300)

        mat_h_norm = mat_h / np.sum(mat_h, axis=0)
        plt.figure()
        ax = plt.axes()
        for i in range(ncols):
            bottom = np.sum(mat_h_norm[:i, :], axis=0)
            norm_idx = float(i) / ncols
            ax.bar(range(data.shape[1]), mat_h_norm[i, :], 1,
                   color=cmap(norm_idx), linewidth=0, bottom=bottom)
        ax.set_ylim(0, 1)
        plt.savefig(base_output_name + '_activation.pdf', dpi=300)

        for i, c in enumerate(cols):
            img = np.reshape(data[:, c], img_shape)
            plt.figure()
            ax = plt.axes()
            ax.imshow(img)
            ax.get_xaxis().set_visible(False)
            ax.get_yaxis().set_visible(False)
            plt.savefig(base_output_name + '_representative_{0}.png'.format(i))
            plt.close()

    plt.close('all')
def df_eq(a, b):
    return (list(a.columns) == list(b.columns)
            # and list(a.dtypes) == list(b.dtypes)
            and into(set, into(list, a)) == into(set, into(list, b)))
def convert_base(typ, x):
    x = compute(x)
    try:
        return typ(x)
    except:
        return typ(into(typ, x))
def compute_up(expr, data, **kwargs):
    from blaze import into, np
    return Series(compute_up(expr, into(np.ndarray, data), **kwargs))
def test_into_resource():
    with tmpfile('.hdf5') as fn:
        d = into('hdfstore://' + fn + '::/x', df)
        assert discover(d) == discover(df)
        assert eq(into(pd.DataFrame, d), df)
def test_copy_with_into():
    with tmpfile('.hdf5') as fn:
        dset = into(fn + '::/data', [1, 2, 3])
        assert dset.shape == (3,)
        assert eq(dset[:], [1, 2, 3])
def test_into():
    assert into(list, t) == into(list, data)
def test_failing_argument(tbl):
    # this will start to fail if we ever restrict kwargs
    csv = CSV(file_name)
    sql = resource(url, tbl, dshape=ds)
    into(sql, csv, skipinitialspace="alpha")  # failing call
def test_outer_join():
    L = symbol('L', 'var * {id: int, name: string, amount: real}')
    R = symbol('R', 'var * {city: string, id: int}')

    with tmpfile('db') as fn:
        uri = 'sqlite:///' + fn
        engine = resource(uri)

        _left = [(1, 'Alice', 100),
                 (2, 'Bob', 200),
                 (4, 'Dennis', 400)]
        left = resource(uri, 'left', dshape=L.dshape)
        into(left, _left)

        _right = [('NYC', 1),
                  ('Boston', 1),
                  ('LA', 3),
                  ('Moscow', 4)]
        right = resource(uri, 'right', dshape=R.dshape)
        into(right, _right)

        conn = engine.connect()

        query = compute(join(L, R, how='inner'),
                        {L: left, R: right},
                        post_compute=False)
        result = list(map(tuple, conn.execute(query).fetchall()))

        assert set(result) == set(
            [(1, 'Alice', 100, 'NYC'),
             (1, 'Alice', 100, 'Boston'),
             (4, 'Dennis', 400, 'Moscow')])

        query = compute(join(L, R, how='left'),
                        {L: left, R: right},
                        post_compute=False)
        result = list(map(tuple, conn.execute(query).fetchall()))

        assert set(result) == set(
            [(1, 'Alice', 100, 'NYC'),
             (1, 'Alice', 100, 'Boston'),
             (2, 'Bob', 200, None),
             (4, 'Dennis', 400, 'Moscow')])

        query = compute(join(L, R, how='right'),
                        {L: left, R: right},
                        post_compute=False)
        print(query)
        result = list(map(tuple, conn.execute(query).fetchall()))
        print(result)

        assert set(result) == set(
            [(1, 'Alice', 100, 'NYC'),
             (1, 'Alice', 100, 'Boston'),
             (3, None, None, 'LA'),
             (4, 'Dennis', 400, 'Moscow')])

        # SQLAlchemy doesn't support full outer join
        """
        query = compute(join(L, R, how='outer'),
                        {L: left, R: right},
                        post_compute=False)
        result = list(map(tuple, conn.execute(query).fetchall()))

        assert set(result) == set(
            [(1, 'Alice', 100, 'NYC'),
             (1, 'Alice', 100, 'Boston'),
             (2, 'Bob', 200, None),
             (3, None, None, 'LA'),
             (4, 'Dennis', 400, 'Moscow')])
        """

        conn.close()
def pre_compute(expr, data, **kwargs):
    seq = into(Iterator, data, **kwargs)
    leaf = expr._leaves()[0]
    return records_to_tuples(leaf.dshape, seq)
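
# A hedged sketch of the normalization above (illustrative data): streaming
# backends compute over tuples, so record-shaped input such as dicts is
# flattened into tuples in the leaf's field order first.
def _example_records_to_tuples():
    from datashape import dshape
    seq = iter([{'name': 'Alice', 'amount': 100},
                {'name': 'Bob', 'amount': 200}])
    out = records_to_tuples(dshape('var * {name: string, amount: int64}'),
                            seq)
    assert list(out) == [('Alice', 100), ('Bob', 200)]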
def test_spark_into():
    seq = [1, 2, 3]
    assert isinstance(into(rdd, seq), RDD)
    assert into([], into(rdd, seq)) == seq
def test_into_np_ndarray_column():
    t = Data(L, fields=['id', 'name', 'balance'])
    expr = t[t.balance < 0].name
    colarray = into(np.ndarray, expr)
    assert len(list(compute(expr))) == len(colarray)
def test_into_nd_array_column_failure():
    tble = Data(L, fields=['id', 'name', 'balance'])
    expr = tble[tble['balance'] < 0]
    colarray = into(np.ndarray, expr)
    assert len(list(compute(expr))) == len(colarray)
def test_into_nd_array_selection():
    t = Data(L, fields=['id', 'name', 'balance'])
    expr = t[t['balance'] < 0]
    selarray = into(np.ndarray, expr)
    assert len(list(compute(expr))) == len(selarray)
def test_append_other():
    with tmpfile('.hdf5') as fn:
        x = into(np.ndarray, df)
        dset = into('hdfstore://' + fn + '::/data', x)
        assert discover(dset) == discover(x)