def test_multi_env():
    env1 = get_env(key="e1")
    g = Graph(env1)
    s = env1._local_python_storage
    # Initial graph
    batches = 2
    source = g.create_node(customer_source, params={"batches": batches})
    metrics = g.create_node(shape_metrics, input=source)
    # Run first time
    blocks = produce(metrics, graph=g, target_storage=s, env=env1)
    assert len(blocks) == 1
    with env1.md_api.begin():
        assert env1.md_api.count(select(DataFunctionLog)) == 2
        assert env1.md_api.count(select(DataBlockLog)) == 3

    # Second env shares env1's metadata database but has its own key
    env2 = get_env(key="e2", db_url=env1.metadata_storage.url)
    g = Graph(env2)
    s = env2._local_python_storage
    # Initial graph
    batches = 2
    source = g.create_node(customer_source, params={"batches": batches})
    metrics = g.create_node(shape_metrics, input=source)
    # Run first time
    blocks = produce(metrics, graph=g, target_storage=s, env=env2)
    assert len(blocks) == 1
    with env2.md_api.begin():
        # Counts are unchanged from env1's run: each env sees only its own logs
        assert env2.md_api.count(select(DataFunctionLog)) == 2
        assert env2.md_api.count(select(DataBlockLog)) == 3
def test_node_reset():
    env = get_env()
    g = Graph(env)
    s = env._local_python_storage
    # Initial graph
    batches = 2
    cfg = {"batches": batches}
    source = g.create_node(customer_source, params=cfg)
    accum = g.create_node("core.accumulator", input=source)
    metrics = g.create_node(shape_metrics, input=accum)
    # Run first time
    produce(source, graph=g, target_storage=s, env=env)
    # Now reset node
    with env.md_api.begin():
        state = source.get_state(env)
        assert state.state is not None
        source.reset(env)
        state = source.get_state(env)
        assert state is None
    blocks = produce(metrics, graph=g, target_storage=s, env=env)
    assert len(blocks) == 1
    records = blocks[0].as_records()
    expected_records = [
        {"metric": "row_count", "value": batch_size},  # Just one run of source, not two
        {"metric": "col_count", "value": 3},
    ]
    assert records == expected_records
def test_function_failure():
    env = get_env()
    g = Graph(env)
    s = env._local_python_storage
    # Initial graph
    batches = 2
    cfg = {"batches": batches, "fail": True}
    source = g.create_node(customer_source, params=cfg)
    blocks = produce(source, graph=g, target_storage=s, env=env)
    assert len(blocks) == 1
    records = blocks[0].as_records()
    assert len(records) == 2
    with env.md_api.begin():
        assert env.md_api.count(select(DataFunctionLog)) == 1
        assert env.md_api.count(select(DataBlockLog)) == 1
        pl = env.md_api.execute(select(DataFunctionLog)).scalar_one_or_none()
        assert pl.node_key == source.key
        assert pl.graph_id == g.get_metadata_obj().hash
        assert pl.node_start_state == {}
        assert pl.node_end_state == {"records_imported": chunk_size}
        assert pl.function_key == source.function.key
        assert pl.function_params == cfg
        assert pl.error is not None
        assert FAIL_MSG in pl.error["error"]
        ns = env.md_api.execute(
            select(NodeState).filter(NodeState.node_key == pl.node_key)
        ).scalar_one_or_none()
        assert ns.state == {"records_imported": chunk_size}

    # Run again without failing, should see different result
    source.params["fail"] = False
    blocks = produce(source, graph=g, target_storage=s, env=env)
    assert len(blocks) == 1
    records = blocks[0].as_records()
    assert len(records) == batch_size
    with env.md_api.begin():
        assert env.md_api.count(select(DataFunctionLog)) == 2
        assert env.md_api.count(select(DataBlockLog)) == 2
        pl = (
            env.md_api.execute(
                select(DataFunctionLog).order_by(DataFunctionLog.completed_at.desc())
            )
            .scalars()
            .first()
        )
        assert pl.node_key == source.key
        assert pl.graph_id == g.get_metadata_obj().hash
        assert pl.node_start_state == {"records_imported": chunk_size}
        assert pl.node_end_state == {"records_imported": chunk_size + batch_size}
        assert pl.function_key == source.function.key
        assert pl.function_params == cfg
        assert pl.error is None
        ns = env.md_api.execute(
            select(NodeState).filter(NodeState.node_key == pl.node_key)
        ).scalar_one_or_none()
        assert ns.state == {"records_imported": chunk_size + batch_size}
def test_ref_input():
    env = get_env()
    g = Graph(env)
    s = env._local_python_storage
    # Initial graph
    batches = 2
    cfg = {"batches": batches}
    source = g.create_node(customer_source, params=cfg)
    accum = g.create_node("core.accumulator", input=source)
    metrics = g.create_node(shape_metrics, input=accum)
    join_ref = g.create_node(
        with_latest_metrics, inputs={"metrics": metrics, "cust": source}
    )
    join = g.create_node(
        with_latest_metrics_no_ref, inputs={"metrics": metrics, "cust": source}
    )
    # Run once, for one metrics output
    output = produce(metrics, graph=g, target_storage=s, env=env)
    # Both joins work
    output = env.run_node(join_ref, g, target_storage=s)
    assert output.output_blocks
    output = env.run_node(join, g, target_storage=s)
    assert output.output_blocks
    # Run source to create new customers, but NOT new metrics
    output = env.run_node(source, g, target_storage=s)
    # This time only ref will still have a metrics input
    output = env.run_node(join_ref, g, target_storage=s)
    assert output.output_blocks
    output = env.run_node(join, g, target_storage=s)
    assert not output.output_blocks  # Regular join has exhausted metrics
def test_alternate_apis():
    env = get_env()
    g = Graph(env)
    s = env._local_python_storage
    # Initial graph
    batches = 2
    source = g.create_node(customer_source, params={"batches": batches})
    metrics = g.create_node(shape_metrics, input=source)
    # Run first time
    blocks = produce(metrics, graph=g, target_storage=s, env=env)
    assert len(blocks) == 1
    output = blocks[0]
    assert output.nominal_schema_key.endswith("Metric")
    records = blocks[0].as_records()
    expected_records = [
        {"metric": "row_count", "value": batch_size},
        {"metric": "col_count", "value": 3},
    ]
    assert records == expected_records