Example #1
0
def run_stripe_test(api_key: str):
    """Import and clean Stripe charges through a two-node graph, then sanity-check output."""
    from snapflow_stripe import module as stripe

    key = api_key or TEST_API_KEY  # fall back to the shared test key
    db_url = get_tmp_sqlite_db_url()
    env = Environment(DataspaceCfg(metadata_storage="sqlite://", storages=[db_url]))
    env.add_module(stripe)

    # Initial graph: raw importer feeding the cleaner
    importer = GraphCfg(
        key="import_charges",
        function="stripe.import_charges",
        params={"api_key": key},
    )
    cleaner = GraphCfg(
        key="clean_charges",
        function="stripe.clean_charges",
        input="import_charges",
    )
    graph = GraphCfg(nodes=[importer, cleaner])
    results = env.produce(
        cleaner.key, graph, target_storage=db_url, execution_timelimit_seconds=1
    )
    records = results[0].stdout().as_records()
    assert len(records) >= 100
    assert records[0]["amount"] > 0
Example #2
0
def get_env(key="_test", db_url=None):
    """Build a test Environment with the core module and test schemas registered.

    A temp sqlite url is created when `db_url` is not supplied.
    """
    metadata_url = get_tmp_sqlite_db_url() if db_url is None else db_url
    env = Environment(key=key, metadata_storage=metadata_url)
    env.add_module(core)
    for schema in (Customer, Metric):
        env.add_schema(schema)
    return env
Example #3
0
def test_simple_import():
    """Round-trip a dataframe through a single import node and compare the output."""
    env = Environment(metadata_storage=get_tmp_sqlite_db_url())
    graph = Graph(env)
    env.add_module(core)
    frame = pd.DataFrame({"a": range(10), "b": range(10)})
    graph.create_node(
        key="n1", function="import_dataframe", params={"dataframe": frame}
    )
    blocks = env.produce("n1", graph)
    # dtype may legitimately change on the way through storage
    assert_almost_equal(blocks[0].as_dataframe(), frame, check_dtype=False)
Example #4
0
def produce_function_output_for_static_input(
        function: DataFunction,
        params: Dict[str, Any] = None,
        input: Any = None,
        inputs: Any = None,
        env: Optional[Environment] = None,
        module: Optional[SnapflowModule] = None,
        target_storage: Optional[Storage] = None,
        upstream: Any = None,  # TODO: DEPRECATED
) -> Iterator[List[DataBlock]]:
    """Run `function` once against static test input(s) and yield its output blocks.

    Creates a throwaway sqlite-backed Environment unless `env` is given,
    builds one `core.import_dataframe` node per declared input, wires those
    into `function`, and yields the blocks produced by a single
    (non-exhaustive) `env.produce` run.

    `input`, `inputs`, and the deprecated `upstream` are aliases; the first
    truthy one wins. A non-dict value is only valid when the function
    declares exactly one non-recursive input.
    """
    # Collapse the three aliased input parameters into one value.
    inputs = input or inputs or upstream
    if env is None:
        # No caller-supplied environment: spin up a temp sqlite-backed one.
        db = get_tmp_sqlite_db_url()
        env = Environment(metadata_storage=db)
    if module:
        env.add_module(module)
    # provide_test_storages may create (and later tear down) a temp database
    # when the function requires a particular storage engine/class.
    with provide_test_storages(function, target_storage) as target_storage:
        if target_storage:
            target_storage = env.add_storage(target_storage)
        with env.md_api.begin():
            g = Graph(env)
            input_datas = inputs
            input_nodes: Dict[str, Node] = {}
            pi = function.get_interface()
            if not isinstance(inputs, dict):
                # Single bare value: map it onto the function's only
                # non-recursive input by name.
                assert len(pi.get_non_recursive_inputs()) == 1
                input_datas = {
                    pi.get_single_non_recursive_input().name: inputs
                }
            for inpt in pi.inputs.values():
                if inpt.is_self_reference:
                    continue
                assert inpt.name is not None
                input_data = input_datas[inpt.name]
                if isinstance(input_data, str):
                    # Raw string data: wrap so it can be rendered as a dataframe.
                    input_data = DataInput(data=input_data)
                assert isinstance(input_data, DataInput)
                # One import node per declared input, fed from the static data.
                n = g.create_node(
                    key=f"_input_{inpt.name}",
                    function="core.import_dataframe",
                    params={
                        "dataframe": input_data.as_dataframe(env),
                        "schema": input_data.get_schema_key(),
                    },
                )
                input_nodes[inpt.name] = n
            # The node under test, wired to all the import nodes above.
            test_node = g.create_node(
                key=f"{function.name}",
                function=function,
                params=params,
                inputs=input_nodes,
            )
            blocks = env.produce(test_node,
                                 to_exhaustion=False,
                                 target_storage=target_storage)
            yield blocks
Example #5
0
def make_test_env(**kwargs) -> Environment:
    """Build a test Environment with the `_test` module and test schemas registered.

    Defaults to a temp sqlite metadata storage unless one is passed in `kwargs`.
    """
    if "metadata_storage" not in kwargs:
        kwargs["metadata_storage"] = Storage.from_url(get_tmp_sqlite_db_url())
    settings = SnapflowSettings(abort_on_function_error=True)
    env = Environment(settings=settings, **kwargs)
    for schema in (TestSchema1, TestSchema2, TestSchema3, TestSchema4):
        env.add_schema(schema)
    env.add_module(SnapflowModule("_test"))
    return env
Example #6
0
def test_database_handler():
    """Check field-name and field-type inference for the database table handler."""
    store = Storage(get_tmp_sqlite_db_url())
    table = "_test"
    handler_cls = get_handler(DatabaseTableFormat, store.storage_engine)
    handler_cls().create_empty(table, store, test_records_schema)
    store.get_api().bulk_insert_records(table, test_records)
    # Inferred column names must match the keys of the inserted records
    assert list(handler_cls().infer_field_names(table, store)) == list(
        test_records[0].keys()
    )
    # Per-field expected inferred types (table-driven)
    expected_types = {
        "f1": Text(),
        "f2": Integer(),
        "f3": DEFAULT_FIELD_TYPE,
        "f4": Date(),
        "f5": DEFAULT_FIELD_TYPE,
    }
    for field, expected in expected_types.items():
        assert handler_cls().infer_field_type(table, store, field) == expected
Example #7
0
def test_conform():
    """Both dataframe and sql conform functions map TestSchemaB onto TestSchemaA."""
    from snapflow.modules import core

    TestSchemaA = create_quick_schema(
        "TestSchemaA", [("a", "Integer"), ("b", "Integer")], namespace="core"
    )
    # B implements A via the field mapping b -> c
    TestSchemaB = create_quick_schema(
        "TestSchemaB",
        [("a", "Integer"), ("c", "Integer"), ("d", "Text")],
        implementations=[Implementation("TestSchemaA", {"b": "c"})],
        namespace="core",
    )
    for schema in (TestSchemaA, TestSchemaB):
        core.add_schema(schema)

    input_data = """
        a,c,d
        1,2,i
        1,3,i
        1,4,i
        2,2,i
    """
    expected = """
        a,b
        1,2
        1,3
        1,4
        2,2
    """
    # expected_df = str_as_dataframe(expected, schema=core.schemas.CoreTestSchema)
    data_input = DataInput(input_data, schema=TestSchemaB)
    storage_url = get_tmp_sqlite_db_url()
    for conform_fn in (
        dataframe_conform_to_schema,
        sql_conform_to_schema,
    ):
        with produce_snap_output_for_static_input(
            conform_fn,
            input=data_input,
            target_storage=storage_url,
            params={"schema": "TestSchemaA"},
        ) as dbs:
            assert len(dbs) == 1
            db = dbs[0]
            expected_df = DataInput(expected, schema=TestSchemaA).as_dataframe(
                db.manager.env
            )
            df = db.as_dataframe()
            print(expected_df)
            print(df)
Example #8
0
def provide_test_storages(
        function: DataFunction,
        target_storage: Storage) -> Iterator[Optional[Storage]]:
    """Yield a Storage suitable for running `function` in a test, or None.

    Preference order: the caller-supplied `target_storage`; a temp local
    database for a required storage engine; a temp sqlite db when the
    function requires the "database" storage class; otherwise None.

    Raises:
        TestFeatureNotImplementedError: when a required storage engine is
            unsupported in the test environment.
    """
    if target_storage:
        yield target_storage  # TODO
    elif function.required_storage_engines:
        # TODO: multiple engines -- is it AND or OR?? each entry is AND and inside entry commas delim OR
        eng = get_engine_for_scheme(function.required_storage_engines[0])
        api_cls = eng.get_api_cls()
        if issubclass(api_cls, DatabaseApi):
            if not api_cls.dialect_is_supported():
                raise TestFeatureNotImplementedError(eng)
            with api_cls.temp_local_database() as url:
                yield Storage(url)
        else:
            # Bug fix: this branch previously fell through without yielding,
            # which makes a @contextmanager wrapper raise a bare
            # RuntimeError("generator didn't yield"). Fail explicitly with
            # the same error used for unsupported dialects instead.
            raise TestFeatureNotImplementedError(eng)
    elif "database" in function.required_storage_classes:
        yield Storage(get_tmp_sqlite_db_url())
    else:
        yield None
Example #9
0
def test_app():
    """Smoke-test the CLI: read-only commands, schema generation, and init."""
    db_url = get_tmp_sqlite_db_url()
    runner = CliRunner()
    # Read-only commands should all succeed against an empty metadata db
    for command in ("logs", "nodes", "blocks"):
        result = runner.invoke(app, ["-m", db_url, command])
        assert result.exit_code == 0
    result = runner.invoke(
        app, ["-m", db_url, "generate", "schema"], input='{"f1": 1, "f2": "hi"}'
    )
    assert result.exit_code == 0
    with runner.isolated_filesystem():
        result = runner.invoke(app, ["-m", db_url, "init"])
        assert result.exit_code == 0
        # init must write the project file into the current directory
        project_file = os.path.join(os.getcwd(), SNAPFLOW_PROJECT_FILE_NAME)
        assert os.path.exists(project_file)
Example #10
0
def test_shopify():
    """Import Shopify orders through a single-node graph and check records arrive."""
    from snapflow_shopify import module as shopify

    api_key = ensure_api_key()
    storage = get_tmp_sqlite_db_url()
    env = Environment(
        DataspaceCfg(metadata_storage="sqlite://", storages=[storage]))
    # Bug fix: the module was imported but never registered, unlike the
    # parallel stripe test, so the "shopify.import_orders" function key
    # could not resolve.
    env.add_module(shopify)

    # Initial graph
    orders = GraphCfg(
        key="import_orders",
        function="shopify.import_orders",
        params={"admin_url": api_key},
    )
    g = GraphCfg(nodes=[orders])
    results = env.produce(orders.key,
                          g,
                          target_storage=storage,
                          execution_timelimit_seconds=1)
    records = results[0].stdout().as_records()
    assert len(records) > 0
Example #11
0
def test_multi_env():
    """Two environments sharing one metadata db must not see each other's blocks."""
    db_url = get_tmp_sqlite_db_url()

    def build_env() -> Environment:
        # Fresh random key per environment so metadata is namespaced apart
        cfg = EnvironmentConfiguration(
            key=f"_test_{rand_str()}",
            metadata_storage_url=db_url,
            settings=SnapflowSettings(add_core_module=False),
        )
        return Environment.from_config(cfg)

    env1 = build_env()
    with env1.md_api.begin():
        env1.md_api.add(DataBlockMetadata(realized_schema_key="Any"))
        env1.md_api.flush()
        assert env1.md_api.count(select(DataBlockMetadata)) == 1

    env2 = build_env()
    with env2.md_api.begin():
        # env1's block must be invisible to env2 despite the shared db
        assert env2.md_api.count(select(DataBlockMetadata)) == 0
        env2.md_api.add(DataBlockMetadata(realized_schema_key="Any"))
        env2.md_api.flush()
        assert env2.md_api.count(select(DataBlockMetadata)) == 1
Example #12
0
def test_dedupe():
    """Dedupe keeps only the newest row per unique (k1, k2) key pair."""
    from snapflow.modules import core

    input_data = """
        k1,k2,f1,f2,f3,f4
        1,2,abc,1.1,1,2012-01-01
        1,2,def,1.1,{"1":2},2012-01-02
        1,3,abc,1.1,2,2012-01-01
        1,4,,,"[1,2,3]",2012-01-01
        2,2,1.0,2.1,"[1,2,3]",2012-01-01
    """
    expected = """
        k1,k2,f1,f2,f3,f4
        1,2,def,1.1,{"1":2},2012-01-02
        1,3,abc,1.1,2,2012-01-01
        1,4,,,"[1,2,3]",2012-01-01
        2,2,1.0,2.1,"[1,2,3]",2012-01-01
    """
    # expected_df = str_as_dataframe(expected, schema=core.schemas.CoreTestSchema)
    data_input = DataInput(input_data, schema="CoreTestSchema", module=core)
    storage_url = get_tmp_sqlite_db_url()
    dedupe_fns = (
        # sql_dedupe_unique_keep_newest_row,
        dataframe_dedupe_unique_keep_newest_row,
    )
    for dedupe_fn in dedupe_fns:
        with produce_snap_output_for_static_input(
            dedupe_fn, input=data_input, target_storage=storage_url
        ) as dbs:
            assert len(dbs) == 1
            db = dbs[0]
            expected_df = DataInput(
                expected, schema="CoreTestSchema", module=core
            ).as_dataframe(db.manager.env)
            df = db.as_dataframe()
            assert_dataframes_are_almost_equal(
                df, expected_df, schema=core.schemas.CoreTestSchema
            )
Example #13
0
 def temp_local_database(cls) -> Iterator[str]:
     # Generator-style provider (presumably wrapped by @contextmanager above
     # this view — confirm): yields a throwaway sqlite db url for tests.
     # No teardown needed for sqlite temp files.
     db_url = get_tmp_sqlite_db_url("__test_dcp_sqlite")
     yield db_url