Beispiel #1
0
def test_dask_convert_dataframes():
    """The dask executor auto-converts any pandas dataframes."""
    stores = pd.DataFrame({
        'country': [0, 0, 1, 1],
        'id': [1, 2, 3, 4],
    })

    sales = pd.DataFrame({'store_id': [1, 2, 3, 4], 'sales': [5, 6, 7, 8]})

    sales_by_country = fq.execute(q,
                                  scope={
                                      'stores': stores,
                                      'sales': sales
                                  },
                                  model="dask")

    assert type(sales_by_country) is dd.DataFrame
    sales_by_country = sales_by_country.compute()

    pdt.assert_frame_equal(
        sales_by_country,
        pd.DataFrame({
            'country': [0, 1],
            'sales': [11, 15],
        }),
    )
def test_readme_example():
    stores = pd.DataFrame({
        'country': [0, 0, 1, 1],
        'id': [1, 2, 3, 4],
    })

    sales = pd.DataFrame({'store_id': [1, 2, 3, 4], 'sales': [5, 6, 7, 8]})

    # usage to prevent flake8 message
    print("shapes: ", stores.shape, sales.shape)

    import framequery as fq

    sales_by_country = fq.execute("""
        SELECT country, sum(sales) as sales

        FROM sales

        JOIN stores
        ON sales.store_id = stores.id

        GROUP BY country
    """)

    pdt.assert_frame_equal(
        sales_by_country,
        pd.DataFrame({
            'country': [0, 1],
            'sales': [11, 15],
        }),
    )
def test_example(query, expected, model):
    if model == 'dask':
        sc = {k: dd.from_pandas(df, npartitions=3) for k, df in scope.items()}
        actual = fq.execute(query, scope=sc, model=model)
        actual = actual.compute()

    else:
        actual = fq.execute(query, scope=scope, model=model)

    expected = expected()

    # set empty columns in expected to the ones in actual
    expected.columns = [
        e or a for a, e in zip(actual.columns, expected.columns)
    ]

    actual = actual.reset_index(drop=True)
    expected = actual.reset_index(drop=True)

    pdt.assert_frame_equal(actual, expected, check_dtype=False)
def test_select(setup, model, query):
    db, scope = setup

    if model == 'dask':
        scope = {
            k: dd.from_pandas(df, npartitions=3)
            for (k, df) in scope.items()
        }

    actual = fq.execute(query, scope, model=model)

    expected = _norm_result(db.execute(query.replace('%', '%%')).fetchall())
    actual = _norm_result(row for _, row in actual.iterrows())

    print('expected', expected, 'actual', actual, sep='\n')
    pdt.assert_frame_equal(actual,
                           expected,
                           check_dtype=False,
                           check_less_precise=True)
Beispiel #5
0
import framequery as fq
# assign labels to the generated features
features = fq.execute("""
SELECT f.*
,case when g.type = 'P' then 1 else 0 end as label
FROM features f
JOIN game_df g
on f.game_id = g.game_id
""")

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
# create inputs for sklearn
y = features['label']
X = features.drop(['label', 'game_id'], axis=1).fillna(0)
# train a classifier
lr = LogisticRegression()
model = lr.fit(X, y)
# Results
print("Accuracy: " + str(model.score(X, y)))
print("ROC" + str(roc_auc_score(y, model.predict_proba(X)[:, 1])))
Beispiel #6
0
import framequery as fq
import pandas as pd

stores = pd.read_csv('data/stores.csv')
sales = pd.read_csv('data/sales.csv')

sales_by_country = fq.execute("""
    SELECT country, sum(sales) as total_sales

    FROM sales
    JOIN stores
    ON sales.store_id = stores.id

    GROUP BY country
""")

print(sales_by_country)