def run_scenario(self, input, func):
    """Check `func` against eager, deferred and pipeline-based execution.

    The expected value is `func(input)`. The same `func` is then applied to
    a deferred frame wrapping a placeholder (evaluated in a session that
    binds the placeholder to `input`) and to a deferred frame created from
    a PCollection holding the even- and odd-positioned rows of `input`.
    Results are compared to the expected value through `check_correct`, and
    the proxy of the pipeline output is checked against the expected value.
    """
    expected = func(input)

    # A zero-row slice of the input is used as the proxy.
    proxy = input.iloc[0:0]
    placeholder = expressions.PlaceholderExpression(proxy)
    deferred_input = frame_base.DeferredFrame.wrap(placeholder)
    deferred_expr = func(deferred_input)._expr
    session = expressions.Session({placeholder: input})
    check_correct(expected, deferred_expr.evaluate_at(session))

    def verify(actual):
        return check_correct(expected, concat(actual))

    # Everything below must stay inside the `with` block so that it is part
    # of the pipeline before the context manager exits.
    with beam.Pipeline() as pipeline:
        partitions = [input.iloc[::2], input.iloc[1::2]]
        input_pcoll = pipeline | beam.Create(partitions)
        output_df = func(convert.to_dataframe(input_pcoll, proxy=proxy))

        output_proxy = output_df._expr.proxy()
        if not isinstance(output_proxy, pd.core.generic.NDFrame):
            # Non-frame proxies are only compared by type.
            self.assertEqual(type(output_proxy), type(expected))
        else:
            self.assertTrue(
                output_proxy.iloc[:0].equals(expected.iloc[:0]),
                ('Output proxy is incorrect:\n'
                 f'Expected:\n{expected.iloc[:0]}\n\n'
                 f'Actual:\n{output_proxy.iloc[:0]}'))

        output_pcoll = convert.to_pcollection(
            output_df, yield_elements='pandas')
        assert_that(output_pcoll, verify)
def evaluate(partition, stage=self.stage):
    """Yield one tagged output for every output expression of `stage`.

    Each input expression of the stage is bound to `partition[expr._id]`,
    and every output expression is evaluated in the resulting session. The
    output is tagged with the id of the expression that produced it.

    `stage` defaults to `self.stage`, which is read once, when this
    function is defined.
    """
    bindings = {}
    for input_expr in stage.inputs:
        bindings[input_expr] = partition[input_expr._id]
    session = expressions.Session(bindings)

    for output_expr in stage.outputs:
        result = output_expr.evaluate_at(session)
        yield beam.pvalue.TaggedOutput(output_expr._id, result)
def run_scenario(self, input, func):
    """Check that `func` gives the same frame eagerly, deferred and in Beam.

    The expected value is `func(input)`. It is compared, after sorting both
    sides by index, with the result of evaluating `func` on a deferred
    frame and with the concatenated output of a `DataframeTransform` run
    over the even- and odd-positioned rows of `input`.
    """
    expected = func(input)

    # A zero-row slice of the input is used as the proxy.
    proxy = input[0:0]
    placeholder = expressions.PlaceholderExpression(proxy)
    deferred_input = frame_base.DeferredFrame.wrap(placeholder)
    deferred_expr = func(deferred_input)._expr
    session = expressions.Session({placeholder: input})
    actual_deferred = deferred_expr.evaluate_at(session)

    def check_correct(actual):
        """Raise AssertionError unless `actual` matches `expected`."""
        if actual is None:
            raise AssertionError(
                'Empty frame but expected: \n\n%s' % (expected))
        actual_sorted = actual.sort_index()
        expected_sorted = expected.sort_index()
        if actual_sorted.equals(expected_sorted):
            return
        raise AssertionError(
            'Dataframes not equal: \n\n%s\n\n%s' %
            (actual_sorted, expected_sorted))

    def verify(parts):
        # An empty collection of parts is reported as a missing frame.
        combined = pd.concat(parts) if parts else None
        return check_correct(combined)

    check_correct(actual_deferred)

    with beam.Pipeline() as pipeline:
        partitions = [input[::2], input[1::2]]
        input_pcoll = pipeline | beam.Create(partitions)
        transform = transforms.DataframeTransform(func, proxy=proxy)
        output_pcoll = input_pcoll | transform
        assert_that(output_pcoll, verify)
def test_computed_expression(self):
    """A computed 'add' expression over two bound placeholders yields 3."""
    lhs = expressions.PlaceholderExpression(0)
    rhs = expressions.PlaceholderExpression(0)
    total = expressions.ComputedExpression(
        'add', lambda a, b: a + b, [lhs, rhs])

    # Bind the placeholders to concrete values for evaluation.
    bindings = {lhs: 1, rhs: 2}
    session = expressions.Session(bindings)

    self.assertEqual(session.evaluate(total), 3)
def compute_using_session(self, to_compute):
    """Evaluate the expression of each frame in `to_compute`.

    A single session is built from `self._env._inputs` and shared by all
    evaluations.

    Returns:
        A dict mapping each name in `to_compute` to the value obtained by
        evaluating that frame's `_expr` in the session.
    """
    session = expressions.Session(self._env._inputs)
    results = {}
    for name, frame in to_compute.items():
        results[name] = frame._expr.evaluate_at(session)
    return results
def evaluate(partition, stage=self.stage, **side_inputs):
    """Yield one tagged output for every output expression of `stage`.

    Expressions in `tabular_inputs` are bound to `partition[expr._id]` and
    expressions in `scalar_inputs` to `side_inputs[expr._id]`. Every output
    expression is evaluated in the resulting session and tagged with its
    own id.

    `stage` defaults to `self.stage`, which is read once, when this
    function is defined.
    """
    bindings = {}
    for tabular_expr in tabular_inputs:
        bindings[tabular_expr] = partition[tabular_expr._id]
    # Scalar bindings are added second, so they win on a duplicate key.
    for scalar_expr in scalar_inputs:
        bindings[scalar_expr] = side_inputs[scalar_expr._id]
    session = expressions.Session(bindings)

    for output_expr in stage.outputs:
        result = output_expr.evaluate_at(session)
        yield beam.pvalue.TaggedOutput(output_expr._id, result)
def _run_test(self, func, *args):
    """Assert that `func` gives equal results on eager and deferred args.

    Each argument is wrapped in a deferred frame backed by a constant
    expression whose proxy is a zero-length slice of the argument. The
    deferred result is evaluated in an empty session and compared to
    `func(*args)` with `equals`.
    """
    deferred_args = []
    for arg in args:
        constant = expressions.ConstantExpression(arg, arg[0:0])
        deferred_args.append(frame_base.DeferredFrame.wrap(constant))

    expected = func(*args)
    session = expressions.Session({})
    actual = session.evaluate(func(*deferred_args)._expr)

    matches = expected.equals(actual)
    message = 'Expected:\n\n%r\n\nActual:\n\n%r' % (expected, actual)
    self.assertTrue(matches, message)
def evaluate(partition, stage=self.stage, **side_inputs):
    """Yield one tagged output for every output expression of `stage`.

    Expressions in `tabular_inputs` are bound to their entry in
    `partition`, falling back to a zero-row slice of the expression's proxy
    when that entry is None. Expressions in `scalar_inputs` are bound to
    `side_inputs[expr._id]`. Every output expression is evaluated in the
    resulting session and tagged with its own id.

    `stage` defaults to `self.stage`, which is read once, when this
    function is defined.
    """
    def lookup(expr):
        # Use proxy if there's no data in this partition
        data = partition[expr._id]
        if data is None:
            return expr.proxy().iloc[:0]
        return data

    bindings = {}
    for tabular_expr in tabular_inputs:
        bindings[tabular_expr] = lookup(tabular_expr)
    # Scalar bindings are added second, so they win on a duplicate key.
    for scalar_expr in scalar_inputs:
        bindings[scalar_expr] = side_inputs[scalar_expr._id]
    session = expressions.Session(bindings)

    for output_expr in stage.outputs:
        result = output_expr.evaluate_at(session)
        yield beam.pvalue.TaggedOutput(output_expr._id, result)
def test_elementwise_func(self):
    """An elementwise subtraction accepts deferred, scalar and Series args.

    The wrapped function is called with two deferred series, with a
    deferred series and a scalar on either side, and with a deferred series
    mixed with a concrete Series. Each evaluated result is compared to the
    same subtraction done directly on the concrete Series.
    """
    series_a = pd.Series([1, 2, 3])
    series_b = pd.Series([100, 200, 300])
    proxy = series_a[:0]
    deferred_a = frames.DeferredSeries(
        expressions.PlaceholderExpression(proxy))
    deferred_b = frames.DeferredSeries(
        expressions.PlaceholderExpression(proxy))
    subtract = frame_base._elementwise_function(lambda x, y: x - y)
    session = expressions.Session({
        deferred_a._expr: series_a,
        deferred_b._expr: series_b,
    })

    def evaluated(deferred):
        return deferred._expr.evaluate_at(session)

    # Deferred minus deferred.
    self.assertTrue(
        evaluated(subtract(deferred_a, deferred_b)).equals(
            series_a - series_b))
    # Deferred combined with a scalar, on either side.
    self.assertTrue(
        evaluated(subtract(deferred_a, 1)).equals(series_a - 1))
    self.assertTrue(
        evaluated(subtract(1, deferred_a)).equals(1 - series_a))
    # Deferred combined with a concrete Series, on either side.
    self.assertTrue(
        evaluated(subtract(deferred_a, series_b)).equals(
            series_a - series_b))
    self.assertTrue(
        evaluated(subtract(series_a, deferred_b)).equals(
            series_a - series_b))
def run_scenario(self, input, func):
    """Check `func` against eager, deferred and pipeline-based execution.

    The expected value is `func(input)`. It is compared through
    `check_correct` with the result of evaluating `func` on a deferred
    frame, and with the concatenated output of a `DataframeTransform` run
    over the even- and odd-positioned rows of `input`.
    """
    expected = func(input)

    # A zero-row slice of the input is used as the proxy.
    proxy = input[0:0]
    placeholder = expressions.PlaceholderExpression(proxy)
    deferred_input = frame_base.DeferredFrame.wrap(placeholder)
    deferred_expr = func(deferred_input)._expr
    session = expressions.Session({placeholder: input})
    check_correct(expected, deferred_expr.evaluate_at(session))

    def verify(actual):
        return check_correct(expected, concat(actual))

    with beam.Pipeline() as pipeline:
        partitions = [input[::2], input[1::2]]
        input_pcoll = pipeline | beam.Create(partitions)
        transform = transforms.DataframeTransform(
            func, proxy=proxy, yield_elements='pandas')
        output_pcoll = input_pcoll | transform
        assert_that(output_pcoll, verify)
def run_scenario(self, input, func):
    """Check that `func` gives the same result eagerly, deferred and in Beam.

    The expected value is `func(input)`. When it is a pandas NDFrame the
    comparison sorts both sides by index and uses `equals`; otherwise the
    values are compared with `!=`. The check is applied to the result of
    evaluating `func` on a deferred frame and to the combined output of a
    `DataframeTransform` run over the even- and odd-positioned rows of
    `input`.
    """
    expected = func(input)

    # A zero-row slice of the input is used as the proxy.
    proxy = input[0:0]
    placeholder = expressions.PlaceholderExpression(proxy)
    deferred_input = frame_base.DeferredFrame.wrap(placeholder)
    deferred_expr = func(deferred_input)._expr
    session = expressions.Session({placeholder: input})
    actual_deferred = deferred_expr.evaluate_at(session)

    def concat(parts):
        """Combine `parts` into one value, or None when there are none."""
        count = len(parts)
        if count > 1:
            return pd.concat(parts)
        if count == 1:
            return parts[0]
        return None

    def check_correct(actual):
        """Raise AssertionError unless `actual` matches `expected`."""
        if actual is None:
            raise AssertionError(
                'Empty frame but expected: \n\n%s' % (expected))
        if not isinstance(expected, pd.core.generic.NDFrame):
            if actual != expected:
                raise AssertionError(
                    'Scalars not equal: %s != %s' % (actual, expected))
            return
        actual_sorted = actual.sort_index()
        expected_sorted = expected.sort_index()
        if not actual_sorted.equals(expected_sorted):
            raise AssertionError(
                'Dataframes not equal: \n\n%s\n\n%s' %
                (actual_sorted, expected_sorted))

    def verify(actual):
        return check_correct(concat(actual))

    check_correct(actual_deferred)

    with beam.Pipeline() as pipeline:
        partitions = [input[::2], input[1::2]]
        input_pcoll = pipeline | beam.Create(partitions)
        transform = transforms.DataframeTransform(func, proxy=proxy)
        output_pcoll = input_pcoll | transform
        assert_that(output_pcoll, verify)
def test_constant_expresion(self):
    """A constant expression evaluates to its value in an empty session."""
    constant = expressions.ConstantExpression(2)
    empty_session = expressions.Session({})
    value = empty_session.evaluate(constant)
    self.assertEqual(value, 2)
def test_placeholder_expression(self):
    """Each placeholder evaluates to the value the session binds it to."""
    first = expressions.PlaceholderExpression(None)
    second = expressions.PlaceholderExpression(None)
    session = expressions.Session({first: 1, second: 2})

    # Both placeholders have a None proxy but are bound to distinct values.
    for placeholder, bound_value in ((first, 1), (second, 2)):
        self.assertEqual(session.evaluate(placeholder), bound_value)