def test_computed_expression(self):
    """An 'add' expression over placeholders bound to 1 and 2 evaluates to 3."""
    lhs = expressions.PlaceholderExpression(0)
    rhs = expressions.PlaceholderExpression(0)
    total = expressions.ComputedExpression(
        'add', lambda a, b: a + b, [lhs, rhs])

    # Bind concrete values to the placeholders and evaluate the sum.
    bindings = {lhs: 1, rhs: 2}
    session = expressions.Session(bindings)
    self.assertEqual(session.evaluate(total), 3)
def test_elementwise_func(self):
    """Check an elementwise subtraction over deferred, concrete and scalar args.

    Each case applies the wrapped function, evaluates the resulting
    expression in a session binding the two deferred series, and compares
    the outcome with the same subtraction done directly in pandas.
    """
    left_values = pd.Series([1, 2, 3])
    right_values = pd.Series([100, 200, 300])
    proxy = left_values[:0]
    deferred_left = frames.DeferredSeries(
        expressions.PlaceholderExpression(proxy))
    deferred_right = frames.DeferredSeries(
        expressions.PlaceholderExpression(proxy))
    subtract = frame_base._elementwise_function(lambda x, y: x - y)
    session = expressions.Session({
        deferred_left._expr: left_values,
        deferred_right._expr: right_values,
    })

    # (first operand, second operand, expected pandas result)
    cases = (
        (deferred_left, deferred_right, left_values - right_values),
        (deferred_left, 1, left_values - 1),
        (1, deferred_left, 1 - left_values),
        (deferred_left, right_values, left_values - right_values),
        (left_values, deferred_right, left_values - right_values),
    )
    for first, second, expected in cases:
        evaluated = subtract(first, second)._expr.evaluate_at(session)
        self.assertTrue(evaluated.equals(expected))
def run_scenario(self, input, func):
    """Check that `func` agrees across eager, deferred and pipeline execution.

    `func` is applied to `input` directly (the expected value), to a deferred
    frame evaluated in a session, and to a deferred frame inside a Beam
    pipeline fed with `input` split into two interleaved partitions. The
    proxy of the pipeline's output expression is also checked.
    """
    expected = func(input)

    # Zero-row slice of the input serves as the proxy.
    empty = input.iloc[0:0]
    input_placeholder = expressions.PlaceholderExpression(empty)
    deferred_result = func(frame_base.DeferredFrame.wrap(input_placeholder))
    session = expressions.Session({input_placeholder: input})
    check_correct(expected, deferred_result._expr.evaluate_at(session))

    def matcher(actual):
        return check_correct(expected, concat(actual))

    with beam.Pipeline() as p:
        partitions = [input.iloc[::2], input.iloc[1::2]]
        # NOTE(review): the convert helpers may derive default step labels
        # from the caller's variable names, so `input_pcoll`, `input_df` and
        # `output_df` are deliberately left unrenamed - confirm.
        input_pcoll = p | beam.Create(partitions)
        input_df = convert.to_dataframe(input_pcoll, proxy=empty)
        output_df = func(input_df)

        output_proxy = output_df._expr.proxy()
        if not isinstance(output_proxy, pd.core.generic.NDFrame):
            self.assertEqual(type(output_proxy), type(expected))
        else:
            self.assertTrue(
                output_proxy.iloc[:0].equals(expected.iloc[:0]),
                ('Output proxy is incorrect:\n'
                 f'Expected:\n{expected.iloc[:0]}\n\n'
                 f'Actual:\n{output_proxy.iloc[:0]}'))

        output_pcoll = convert.to_pcollection(
            output_df, yield_elements='pandas')

        assert_that(output_pcoll, matcher)
def to_dataframe(
    pcoll,  # type: pvalue.PCollection
    proxy=None,  # type: pandas.core.generic.NDFrame
):
    # type: (...) -> frame_base.DeferredFrame

    """Converts a PCollection to a deferred dataframe-like object, which can
    be manipulated with pandas methods like `filter` and `groupby`.

    For example, one might write::

      pcoll = ...
      df = to_dataframe(pcoll, proxy=...)
      result = df.groupby('col').sum()
      pcoll_result = to_pcollection(result)

    A proxy object must be given if the schema for the PCollection is not
    known; otherwise a ValueError is raised.
    """
    if proxy is not None:
        # Caller supplied the proxy: use the PCollection as-is.
        return frame_base.DeferredFrame.wrap(
            expressions.PlaceholderExpression(proxy, pcoll))

    element_type = pcoll.element_type
    if element_type is None:
        raise ValueError(
            "Cannot infer a proxy because the input PCollection does not have a "
            "schema defined. Please make sure a schema type is specified for "
            "the input PCollection, or provide a proxy.")

    # No proxy was given, so assume this is an element-wise schema-aware
    # PCollection that needs to be batched.
    proxy = schemas.generate_proxy(element_type)
    batched = pcoll | 'BatchElements' >> schemas.BatchRowsAsDataFrame()
    return frame_base.DeferredFrame.wrap(
        expressions.PlaceholderExpression(proxy, batched))
def run_scenario(self, input, func):
    """Check that `func` gives the same frame eagerly, deferred and in Beam.

    The expected value is `func(input)`. It is compared (after sorting both
    sides by index) with the session-evaluated deferred result and with the
    concatenated output of a `DataframeTransform` pipeline fed with `input`
    split into two interleaved partitions.
    """
    expected = func(input)

    # Zero-row slice of the input serves as the proxy.
    empty = input[0:0]
    input_placeholder = expressions.PlaceholderExpression(empty)
    deferred_result = func(frame_base.DeferredFrame.wrap(input_placeholder))
    session = expressions.Session({input_placeholder: input})
    actual_deferred = deferred_result._expr.evaluate_at(session)

    def check_correct(actual):
        if actual is None:
            raise AssertionError(
                'Empty frame but expected: \n\n%s' % (expected))
        # Compare order-insensitively by sorting both sides on the index.
        got = actual.sort_index()
        want = expected.sort_index()
        if got.equals(want):
            return
        raise AssertionError(
            'Dataframes not equal: \n\n%s\n\n%s' % (got, want))

    def matcher(actual):
        # An empty pipeline output is reported as None.
        return check_correct(pd.concat(actual) if actual else None)

    check_correct(actual_deferred)

    with beam.Pipeline() as p:
        partitions = [input[::2], input[1::2]]
        input_pcoll = p | beam.Create(partitions)
        output_pcoll = input_pcoll | transforms.DataframeTransform(
            func, proxy=empty)

        assert_that(output_pcoll, matcher)
class ConstructionTimeTest(unittest.TestCase):
    """Tests for operations that can be executed eagerly."""

    # Concrete frame used both as test data and as the placeholder's proxy.
    DF = pd.DataFrame({
        'str_col': ['foo', 'bar'],
        'int_col': [1, 2],
        'flt_col': [1.1, 2.2],
    })
    DEFERRED_DF = frame_base.DeferredFrame.wrap(
        expressions.PlaceholderExpression(DF))

    def _run_test(self, fn):
        """Assert `fn` returns equal values for the deferred and concrete frame."""
        self.assertEqual(fn(self.DEFERRED_DF), fn(self.DF))

    @parameterized.expand(DF.columns)
    def test_series_name(self, col_name):
        # Compare the series *name*, as the test name states. Comparing the
        # series objects themselves would hand `assertEqual` a deferred series
        # and a concrete one, which is not a meaningful equality check.
        self._run_test(lambda df: df[col_name].name)

    @parameterized.expand(DF.columns)
    def test_series_dtype(self, col_name):
        self._run_test(lambda df: df[col_name].dtype)
        self._run_test(lambda df: df[col_name].dtypes)

    def test_dataframe_columns(self):
        self._run_test(lambda df: list(df.columns))

    def test_dataframe_dtypes(self):
        self._run_test(lambda df: list(df.dtypes))
def to_dataframe(
    pcoll,  # type: pvalue.PCollection
    proxy=None,  # type: Optional[pandas.core.generic.NDFrame]
    label=None,  # type: Optional[str]
):
    # type: (...) -> frame_base.DeferredFrame

    """Converts a PCollection to a deferred dataframe-like object, which can
    be manipulated with pandas methods like `filter` and `groupby`.

    For example, one might write::

      pcoll = ...
      df = to_dataframe(pcoll, proxy=...)
      result = df.groupby('col').sum()
      pcoll_result = to_pcollection(result)

    A proxy object must be given if the schema for the PCollection is not
    known; otherwise a ValueError is raised. `label` names the batching
    step that is applied when the proxy is inferred.
    """
    if proxy is not None:
        # Caller supplied the proxy: use the PCollection as-is.
        return frame_base.DeferredFrame.wrap(
            expressions.PlaceholderExpression(proxy, pcoll))

    element_type = pcoll.element_type
    if element_type is None:
        raise ValueError(
            "Cannot infer a proxy because the input PCollection does not have a "
            "schema defined. Please make sure a schema type is specified for "
            "the input PCollection, or provide a proxy.")

    # No proxy was given, so assume this is an element-wise schema-aware
    # PCollection that needs to be batched.
    if label is None:
        # Attempt to come up with a reasonable, stable label by retrieving
        # the name of these variables in the calling context.
        # NOTE(review): the `2` looks like a stack depth relative to this
        # function body; keep the call here rather than in a helper - confirm
        # against `_var_name`.
        label = 'BatchElements(%s)' % _var_name(pcoll, 2)
    proxy = schemas.generate_proxy(element_type)
    batched = pcoll | label >> schemas.BatchRowsAsDataFrame(proxy=proxy)
    return frame_base.DeferredFrame.wrap(
        expressions.PlaceholderExpression(proxy, batched))
def _replace_with_cached_recur(
        self,
        expr: expressions.Expression,
        replaced_inputs: Dict[str, expressions.Expression]) -> None:
    """Recursive call for `replace_with_cached`.

    Walks the arguments of `expr`; every argument whose cached PCollection
    is fully computed is swapped for a `PlaceholderExpression` reading from
    the cache, while the others are kept and recursed into. `expr._args` is
    overwritten in place and `replaced_inputs` (keyed by expression id) is
    updated with any newly created placeholders.
    """
    new_args = []

    for arg in expr.args():
        pc = self._get_cached(arg)

        # Only read from the cache when the PCollection has been fully
        # computed, so that no partial results are used.
        if not self._is_computed(pc):
            new_args.append(arg)
            self._replace_with_cached_recur(arg, replaced_inputs)
            continue

        # Reuse previously seen cached expressions so that the same value
        # isn't cached multiple times.
        arg_id = arg._id
        if arg_id not in replaced_inputs:
            replaced_inputs[arg_id] = expressions.PlaceholderExpression(
                arg.proxy(), self._pcollection_cache[arg_id])
        new_args.append(replaced_inputs[arg_id])

    expr._args = tuple(new_args)
def __call__(self, *args, **kwargs):
    """Call the wrapped pandas object, deferring results of mapped types.

    If the exact type of the result is a key of
    `DeferredBase._pandas_type_map`, the result is registered as a test
    input behind a placeholder (with a zero-row slice as proxy) and the
    wrapped placeholder is returned; any other result is returned unchanged.
    """
    outcome = self._pandas_obj(*args, **kwargs)
    if type(outcome) not in DeferredBase._pandas_type_map:
        return outcome

    placeholder = expressions.PlaceholderExpression(outcome[0:0])
    self._test_env._inputs[placeholder] = outcome
    return DeferredBase.wrap(placeholder)
def expand(self, input_pcolls):
    """Apply the user function to deferred frames and return PCollection(s).

    `input_pcolls` may be a dict, a tuple or a single PCollection; the user
    function is called with keyword, positional or single arguments
    accordingly. The return value mirrors the container shape of whatever
    the user function returned (dict, tuple or single value).
    """
    def wrap_as_dict(values):
        # Normalise dict / tuple / single value to a dict; a single value is
        # stored under the key None.
        if isinstance(values, dict):
            return values
        if isinstance(values, tuple):
            return dict(enumerate(values))
        return {None: values}

    # TODO: Infer the proxy from the input schema.
    def proxy(key):
        return self._proxy if key is None else self._proxy[key]

    # The input can be a dictionary, tuple, or plain PCollection.
    # Wrap as a dict for homogeneity.
    # TODO: Possibly inject batching here.
    inputs_by_key = wrap_as_dict(input_pcolls)
    placeholders = {}
    for key in inputs_by_key.keys():
        placeholders[key] = frame_base.DeferredFrame.wrap(
            expressions.PlaceholderExpression(proxy(key)))

    # The calling convention of the user-supplied func varies according to
    # the type of the input.
    if isinstance(input_pcolls, dict):
        result_frames = self._func(**placeholders)
    elif isinstance(input_pcolls, tuple):
        ordered = [placeholders[key] for key in sorted(placeholders)]
        result_frames = self._func(*ordered)
    else:
        result_frames = self._func(placeholders[None])

    # Likewise the output may be a dict, tuple, or raw (deferred) Dataframe.
    results_by_key = wrap_as_dict(result_frames)
    input_exprs = {
        placeholders[key]._expr: pcoll
        for key, pcoll in inputs_by_key.items()
    }
    output_exprs = {key: df._expr for key, df in results_by_key.items()}
    result_pcolls = self._apply_deferred_ops(input_exprs, output_exprs)

    # Convert the result back into a set of PCollections.
    if isinstance(result_frames, dict):
        return result_pcolls
    if isinstance(result_frames, tuple):
        return tuple(result_pcolls[key] for key in sorted(result_pcolls))
    return result_pcolls[None]
def test_maybe_inplace(self):
    """Check that `maybe_inplace` only rebinds `_expr` when `inplace=True`.

    A temporary `add_one` method is attached to `frames.DeferredSeries` for
    the duration of the test and removed afterwards.
    """
    @frame_base.maybe_inplace
    def add_one(frame):
        return frame + 1

    frames.DeferredSeries.add_one = add_one
    # Undo the monkeypatch when the test finishes (pass or fail) so the
    # helper method does not leak into other tests sharing this class.
    self.addCleanup(delattr, frames.DeferredSeries, 'add_one')

    original_expr = expressions.PlaceholderExpression(pd.Series([1, 2, 3]))
    x = frames.DeferredSeries(original_expr)

    # Default and explicit inplace=False leave the wrapped expression alone.
    x.add_one()
    self.assertIs(x._expr, original_expr)
    x.add_one(inplace=False)
    self.assertIs(x._expr, original_expr)

    # inplace=True replaces the wrapped expression.
    x.add_one(inplace=True)
    self.assertIsNot(x._expr, original_expr)
def run_scenario(self, input, func):
    """Check that `func` agrees across eager, deferred and pipeline execution.

    The expected value is `func(input)`. It is checked against the
    session-evaluated deferred result and against the concatenated output of
    a `DataframeTransform` (yielding pandas objects) fed with `input` split
    into two interleaved partitions.
    """
    expected = func(input)

    # Zero-row slice of the input serves as the proxy.
    empty = input[0:0]
    input_placeholder = expressions.PlaceholderExpression(empty)
    deferred_result = func(frame_base.DeferredFrame.wrap(input_placeholder))
    session = expressions.Session({input_placeholder: input})
    check_correct(expected, deferred_result._expr.evaluate_at(session))

    def matcher(actual):
        return check_correct(expected, concat(actual))

    with beam.Pipeline() as p:
        partitions = [input[::2], input[1::2]]
        input_pcoll = p | beam.Create(partitions)
        output_pcoll = input_pcoll | transforms.DataframeTransform(
            func, proxy=empty, yield_elements='pandas')

        assert_that(output_pcoll, matcher)
def to_dataframe(
    pcoll,  # type: pvalue.PCollection
    proxy,  # type: pandas.core.generic.NDFrame
):
    # type: (...) -> frame_base.DeferredFrame

    """Converts a PCollection to a deferred dataframe-like object, which can
    be manipulated with pandas methods like `filter` and `groupby`.

    For example, one might write::

      pcoll = ...
      df = to_dataframe(pcoll, proxy=...)
      result = df.groupby('col').sum()
      pcoll_result = to_pcollection(result)

    A proxy object must be given if the schema for the PCollection is not
    known.
    """
    # The placeholder pairs the proxy with the PCollection it stands for.
    placeholder = expressions.PlaceholderExpression(proxy, pcoll)
    return frame_base.DeferredFrame.wrap(placeholder)
def run_scenario(self, input, func):
    """Check that `func` agrees across eager, deferred and pipeline execution.

    The expected value is `func(input)`, which may be a pandas object or a
    scalar. It is compared with the session-evaluated deferred result and
    with the combined output of a `DataframeTransform` pipeline fed with
    `input` split into two interleaved partitions.
    """
    expected = func(input)

    # Zero-row slice of the input serves as the proxy.
    empty = input[0:0]
    input_placeholder = expressions.PlaceholderExpression(empty)
    deferred_result = func(frame_base.DeferredFrame.wrap(input_placeholder))
    session = expressions.Session({input_placeholder: input})
    actual_deferred = deferred_result._expr.evaluate_at(session)

    def concat(parts):
        # Combine pipeline outputs: None for no parts, the sole part when
        # there is exactly one, otherwise a pandas concatenation.
        count = len(parts)
        if count == 0:
            return None
        if count == 1:
            return parts[0]
        return pd.concat(parts)

    def check_correct(actual):
        if actual is None:
            raise AssertionError(
                'Empty frame but expected: \n\n%s' % (expected))
        if not isinstance(expected, pd.core.generic.NDFrame):
            # Non-pandas results are compared directly.
            if actual != expected:
                raise AssertionError(
                    'Scalars not equal: %s != %s' % (actual, expected))
            return
        # Compare order-insensitively by sorting both sides on the index.
        got = actual.sort_index()
        want = expected.sort_index()
        if not got.equals(want):
            raise AssertionError(
                'Dataframes not equal: \n\n%s\n\n%s' % (got, want))

    def matcher(actual):
        return check_correct(concat(actual))

    check_correct(actual_deferred)

    with beam.Pipeline() as p:
        partitions = [input[::2], input[1::2]]
        input_pcoll = p | beam.Create(partitions)
        output_pcoll = input_pcoll | transforms.DataframeTransform(
            func, proxy=empty)

        assert_that(output_pcoll, matcher)
def _use_non_parallel_operation(self):
    """Invoke `replace(..., limit=1)` on a deferred series, discarding it.

    NOTE(review): judging by the name, this call is presumably treated as
    a non-parallelizable operation by the framework - confirm.
    """
    series = pd.Series([1, 2, 3])
    deferred = frame_base.DeferredFrame.wrap(
        expressions.PlaceholderExpression(series))
    _ = deferred.replace('a', 'b', limit=1)
def wrapper(*args, **kwargs):
    """Build a concrete pandas object and return its deferred counterpart.

    The concrete object is recorded in `self._inputs` under a placeholder
    whose proxy is a zero-row slice of that object.
    """
    concrete = pandas_type(*args, **kwargs)
    key = expressions.PlaceholderExpression(concrete[0:0])
    self._inputs[key] = concrete
    return deferred_type(key)
def to_pcollection(
    *dataframes,  # type: Union[frame_base.DeferredFrame, pd.DataFrame, pd.Series]
    label=None,
    always_return_tuple=False,
    yield_elements='schemas',
    include_indexes=False,
    pipeline=None
) -> Union[pvalue.PCollection, Tuple[pvalue.PCollection, ...]]:
    """Converts one or more deferred dataframe-like objects back to a PCollection.

    This method creates and applies the actual Beam operations that compute
    the given deferred dataframes, returning a PCollection of their results.
    By default the resulting PCollections are schema-aware PCollections where
    each element is one row from the output dataframes, excluding indexes.
    This behavior can be modified with the `yield_elements` and
    `include_indexes` arguments.

    Also accepts non-deferred pandas dataframes, which are converted to
    deferred, schema'd PCollections. In this case the contents of the entire
    dataframe are serialized into the graph, so for large amounts of data it
    is preferable to write them to disk and read them with one of the read
    methods.

    If more than one (related) result is desired, it can be more efficient to
    pass them all at the same time to this method.

    Args:
      label: (optional, default "ToPCollection(...)") the label to use for
        the conversion transform.
      always_return_tuple: (optional, default: False) If true, always return
        a tuple of PCollections, even if there's only one output.
      yield_elements: (optional, default: "schemas") If set to "pandas",
        return PCollections containing the raw Pandas objects (DataFrames or
        Series), if set to "schemas", return an element-wise PCollection,
        where DataFrame and Series instances are expanded to one element per
        row. DataFrames are converted to schema-aware PCollections, where
        column values can be accessed by attribute.
      include_indexes: (optional, default: False) When
        yield_elements="schemas", if include_indexes=True, attempt to include
        index columns in the output schema for expanded DataFrames. Raises an
        error if any of the index levels are unnamed (name=None), or if any
        of the names are not unique among all column and index names.
      pipeline: (optional, unless non-deferred dataframes are passed) Used
        when creating a PCollection from a non-deferred dataframe.

    Raises:
      ValueError: if `yield_elements` is not "pandas" or "schemas", or if a
        non-deferred dataframe is passed without `pipeline`.
      TypeError: if an argument is neither deferred nor a pandas
        Series/DataFrame, or if an expression root does not reference a
        PCollection.
    """
    if yield_elements not in ("pandas", "schemas"):
        raise ValueError(
            "Invalid value for yield_elements argument, '%s'. "
            "Allowed values are 'pandas' and 'schemas'" % yield_elements)

    if label is None:
        # Attempt to come up with a reasonable, stable label by retrieving the
        # name of these variables in the calling context.
        # NOTE(review): the `3` looks like a stack depth that counts the
        # generator expression's own frame; keep this exact call shape -
        # confirm against `_var_name`.
        label = 'ToPCollection(%s)' % ', '.join(
            _var_name(e, 3) for e in dataframes)

    # Support for non-deferred dataframes.
    deferred_dataframes = []
    for ix, df in enumerate(dataframes):
        if isinstance(df, frame_base.DeferredBase):
            # TODO(robertwb): Maybe extract pipeline object?
            deferred_dataframes.append(df)
            continue
        if not isinstance(df, (pd.Series, pd.DataFrame)):
            raise TypeError(
                'Unable to convert objects of type %s to a PCollection' %
                type(df))
        if pipeline is None:
            raise ValueError(
                'Pipeline keyword required for non-deferred dataframe conversion.'
            )
        # Embed the concrete frame in the graph as a one-element PCollection.
        step_name = '%s_Defer%s' % (label, ix)
        deferred = pipeline | step_name >> beam.Create([df])
        deferred_dataframes.append(
            frame_base.DeferredFrame.wrap(
                expressions.PlaceholderExpression(df.iloc[:0], deferred)))
    dataframes = tuple(deferred_dataframes)

    def extract_input(placeholder):
        source = placeholder._reference
        if not isinstance(source, pvalue.PCollection):
            raise TypeError(
                'Expression roots must have been created with to_dataframe.')
        return source

    placeholders = frozenset.union(
        frozenset(), *[df._expr.placeholders() for df in dataframes])

    # Exclude any dataframes that have already been converted to
    # PCollections. We only want to convert each DF expression once, then
    # re-use.
    new_dataframes = [
        df for df in dataframes if df._expr._id not in TO_PCOLLECTION_CACHE
    ]
    if new_dataframes:
        inputs = {p: extract_input(p) for p in placeholders}
        new_exprs = {ix: df._expr for (ix, df) in enumerate(new_dataframes)}
        new_results = (
            inputs
            | label >> transforms._DataframeExpressionsTransform(new_exprs)
        )  # type: Dict[Any, pvalue.PCollection]

        TO_PCOLLECTION_CACHE.update({
            new_dataframes[ix]._expr._id: pc
            for ix, pc in new_results.items()
        })

    raw_results = {
        ix: TO_PCOLLECTION_CACHE[df._expr._id]
        for ix, df in enumerate(dataframes)
    }

    if yield_elements == "schemas":

        def maybe_unbatch(pc, value):
            # Deferred scalars are passed through without unbatching.
            if isinstance(value, frame_base._DeferredScalar):
                return pc
            return _make_unbatched_pcoll(pc, value._expr, include_indexes)

        results = {
            ix: maybe_unbatch(pc, dataframes[ix])
            for (ix, pc) in raw_results.items()
        }
    else:
        results = raw_results

    if len(results) == 1 and not always_return_tuple:
        return results[0]
    return tuple(results[ix] for ix in sorted(results))
def test_expression_proxy_error(self):
    """Building 'add' over an int proxy and a str proxy raises TypeError."""
    int_placeholder = expressions.PlaceholderExpression(1)
    str_placeholder = expressions.PlaceholderExpression('s')
    operands = [int_placeholder, str_placeholder]

    # The error is expected at construction time, not at evaluation.
    with self.assertRaises(TypeError):
        expressions.ComputedExpression(
            'add', lambda a, b: a + b, operands)
def test_expression_proxy(self):
    """The proxy of 'add' over placeholders with proxies 1 and 2 is 3."""
    one = expressions.PlaceholderExpression(1)
    two = expressions.PlaceholderExpression(2)
    summed = expressions.ComputedExpression(
        'add', lambda a, b: a + b, [one, two])

    self.assertEqual(summed.proxy(), 3)
def test_placeholder_expression(self):
    """A session evaluates each placeholder to the value bound to it."""
    first = expressions.PlaceholderExpression(None)
    second = expressions.PlaceholderExpression(None)
    session = expressions.Session({first: 1, second: 2})

    for placeholder, bound_value in ((first, 1), (second, 2)):
        self.assertEqual(session.evaluate(placeholder), bound_value)