def test_dask_convert_dataframes(): """The dask executor auto-converts any pandas dataframes.""" stores = pd.DataFrame({ 'country': [0, 0, 1, 1], 'id': [1, 2, 3, 4], }) sales = pd.DataFrame({'store_id': [1, 2, 3, 4], 'sales': [5, 6, 7, 8]}) sales_by_country = fq.execute(q, scope={ 'stores': stores, 'sales': sales }, model="dask") assert type(sales_by_country) is dd.DataFrame sales_by_country = sales_by_country.compute() pdt.assert_frame_equal( sales_by_country, pd.DataFrame({ 'country': [0, 1], 'sales': [11, 15], }), )
def test_readme_example(): stores = pd.DataFrame({ 'country': [0, 0, 1, 1], 'id': [1, 2, 3, 4], }) sales = pd.DataFrame({'store_id': [1, 2, 3, 4], 'sales': [5, 6, 7, 8]}) # usage to prevent flake8 message print("shapes: ", stores.shape, sales.shape) import framequery as fq sales_by_country = fq.execute(""" SELECT country, sum(sales) as sales FROM sales JOIN stores ON sales.store_id = stores.id GROUP BY country """) pdt.assert_frame_equal( sales_by_country, pd.DataFrame({ 'country': [0, 1], 'sales': [11, 15], }), )
def test_example(query, expected, model): if model == 'dask': sc = {k: dd.from_pandas(df, npartitions=3) for k, df in scope.items()} actual = fq.execute(query, scope=sc, model=model) actual = actual.compute() else: actual = fq.execute(query, scope=scope, model=model) expected = expected() # set empty columns in expected to the ones in actual expected.columns = [ e or a for a, e in zip(actual.columns, expected.columns) ] actual = actual.reset_index(drop=True) expected = actual.reset_index(drop=True) pdt.assert_frame_equal(actual, expected, check_dtype=False)
def test_select(setup, model, query): db, scope = setup if model == 'dask': scope = { k: dd.from_pandas(df, npartitions=3) for (k, df) in scope.items() } actual = fq.execute(query, scope, model=model) expected = _norm_result(db.execute(query.replace('%', '%%')).fetchall()) actual = _norm_result(row for _, row in actual.iterrows()) print('expected', expected, 'actual', actual, sep='\n') pdt.assert_frame_equal(actual, expected, check_dtype=False, check_less_precise=True)
import framequery as fq # assign labels to the generated features features = fq.execute(""" SELECT f.* ,case when g.type = 'P' then 1 else 0 end as label FROM features f JOIN game_df g on f.game_id = g.game_id """) from sklearn.linear_model import LogisticRegression from sklearn.metrics import roc_auc_score # create inputs for sklearn y = features['label'] X = features.drop(['label', 'game_id'], axis=1).fillna(0) # train a classifier lr = LogisticRegression() model = lr.fit(X, y) # Results print("Accuracy: " + str(model.score(X, y))) print("ROC" + str(roc_auc_score(y, model.predict_proba(X)[:, 1])))
import framequery as fq import pandas as pd stores = pd.read_csv('data/stores.csv') sales = pd.read_csv('data/sales.csv') sales_by_country = fq.execute(""" SELECT country, sum(sales) as total_sales FROM sales JOIN stores ON sales.store_id = stores.id GROUP BY country """) print(sales_by_country)