Example #1
def test_computed_expression(self):
  a = expressions.PlaceholderExpression(0)
  b = expressions.PlaceholderExpression(0)
  a_plus_b = expressions.ComputedExpression('add', lambda a, b: a + b,
                                            [a, b])
  session = expressions.Session({a: 1, b: 2})
  self.assertEqual(session.evaluate(a_plus_b), 3)
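Session evaluation is recursive: binding only the leaf placeholders is enough to evaluate any expression built on top of them. A minimal sketch (assuming only Beam's apache_beam.dataframe.expressions module; the 'double' expression is illustrative):

from apache_beam.dataframe import expressions

a = expressions.PlaceholderExpression(0)  # the proxy 0 stands in for "an int"
b = expressions.PlaceholderExpression(0)
a_plus_b = expressions.ComputedExpression('add', lambda a, b: a + b, [a, b])
doubled = expressions.ComputedExpression('double', lambda x: x * 2, [a_plus_b])

# Only the leaves need bindings; intermediate results are computed on demand.
session = expressions.Session({a: 1, b: 2})
assert session.evaluate(doubled) == 6  # (1 + 2) * 2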
Example #2
  def test_elementwise_func(self):
    a = pd.Series([1, 2, 3])
    b = pd.Series([100, 200, 300])
    empty_proxy = a[:0]
    x = frames.DeferredSeries(expressions.PlaceholderExpression(empty_proxy))
    y = frames.DeferredSeries(expressions.PlaceholderExpression(empty_proxy))
    sub = frame_base._elementwise_function(lambda x, y: x - y)

    session = expressions.Session({x._expr: a, y._expr: b})
    self.assertTrue(sub(x, y)._expr.evaluate_at(session).equals(a - b))
    self.assertTrue(sub(x, 1)._expr.evaluate_at(session).equals(a - 1))
    self.assertTrue(sub(1, x)._expr.evaluate_at(session).equals(1 - a))
    self.assertTrue(sub(x, b)._expr.evaluate_at(session).equals(a - b))
    self.assertTrue(sub(a, y)._expr.evaluate_at(session).equals(a - b))
Example #3
    def run_scenario(self, input, func):
        expected = func(input)

        empty = input.iloc[0:0]
        input_placeholder = expressions.PlaceholderExpression(empty)
        input_deferred = frame_base.DeferredFrame.wrap(input_placeholder)
        actual_deferred = func(input_deferred)._expr.evaluate_at(
            expressions.Session({input_placeholder: input}))

        check_correct(expected, actual_deferred)

        with beam.Pipeline() as p:
            input_pcoll = p | beam.Create([input.iloc[::2], input.iloc[1::2]])
            input_df = convert.to_dataframe(input_pcoll, proxy=empty)
            output_df = func(input_df)

            output_proxy = output_df._expr.proxy()
            if isinstance(output_proxy, pd.core.generic.NDFrame):
                self.assertTrue(
                    output_proxy.iloc[:0].equals(expected.iloc[:0]),
                    ('Output proxy is incorrect:\n'
                     f'Expected:\n{expected.iloc[:0]}\n\n'
                     f'Actual:\n{output_proxy.iloc[:0]}'))
            else:
                self.assertEqual(type(output_proxy), type(expected))

            output_pcoll = convert.to_pcollection(output_df,
                                                  yield_elements='pandas')

            assert_that(output_pcoll,
                        lambda actual: check_correct(expected, concat(actual)))
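This scenario calls module-level check_correct and concat helpers that are not part of the snippet. A hedged reconstruction, modeled on the local definitions shown in Example #14:

import pandas as pd

def concat(parts):
    if len(parts) > 1:
        return pd.concat(parts)
    elif len(parts) == 1:
        return parts[0]
    else:
        return None

def check_correct(expected, actual):
    if actual is None:
        raise AssertionError('Empty frame but expected: \n\n%s' % expected)
    if isinstance(expected, pd.core.generic.NDFrame):
        sorted_actual = actual.sort_index()
        sorted_expected = expected.sort_index()
        if not sorted_actual.equals(sorted_expected):
            raise AssertionError('Dataframes not equal: \n\n%s\n\n%s' %
                                 (sorted_actual, sorted_expected))
    elif actual != expected:
        raise AssertionError('Scalars not equal: %s != %s' % (actual, expected))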
Example #4
def to_dataframe(
        pcoll,  # type: pvalue.PCollection
        proxy=None,  # type: pandas.core.generic.NDFrame
):
    # type: (...) -> frame_base.DeferredFrame
    """Convers a PCollection to a deferred dataframe-like object, which can
  manipulated with pandas methods like `filter` and `groupby`.

  For example, one might write::

    pcoll = ...
    df = to_dataframe(pcoll, proxy=...)
    result = df.groupby('col').sum()
    pcoll_result = to_pcollection(result)

  A proxy object must be given if the schema for the PCollection is not known.
  """
    if proxy is None:
        if pcoll.element_type is None:
            raise ValueError(
                "Cannot infer a proxy because the input PCollection does not have a "
                "schema defined. Please make sure a schema type is specified for "
                "the input PCollection, or provide a proxy.")
        # If no proxy is given, assume this is an element-wise schema-aware
        # PCollection that needs to be batched.
        proxy = schemas.generate_proxy(pcoll.element_type)
        pcoll = pcoll | 'BatchElements' >> schemas.BatchRowsAsDataFrame()
    return frame_base.DeferredFrame.wrap(
        expressions.PlaceholderExpression(proxy, pcoll))
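A hedged usage sketch of the schema-aware path above, where the proxy is inferred from the PCollection's element type (the Row NamedTuple is illustrative, not part of the original):

import typing
import apache_beam as beam
from apache_beam.dataframe import convert

class Row(typing.NamedTuple):
    col: str
    value: int

with beam.Pipeline() as p:
    pcoll = p | beam.Create([Row('a', 1), Row('a', 2)]).with_output_types(Row)
    df = convert.to_dataframe(pcoll)  # no proxy needed: inferred from schema
    _ = convert.to_pcollection(df.groupby('col').sum()) | beam.Map(print)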
Example #5
  def run_scenario(self, input, func):
    expected = func(input)

    empty = input[0:0]
    input_placeholder = expressions.PlaceholderExpression(empty)
    input_deferred = frame_base.DeferredFrame.wrap(input_placeholder)
    actual_deferred = func(input_deferred)._expr.evaluate_at(
        expressions.Session({input_placeholder: input}))

    def check_correct(actual):
      if actual is None:
        raise AssertionError('Empty frame but expected: \n\n%s' % (expected))
      sorted_actual = actual.sort_index()
      sorted_expected = expected.sort_index()
      if not sorted_actual.equals(sorted_expected):
        raise AssertionError(
            'Dataframes not equal: \n\n%s\n\n%s' %
            (sorted_actual, sorted_expected))

    check_correct(actual_deferred)

    with beam.Pipeline() as p:
      input_pcoll = p | beam.Create([input[::2], input[1::2]])
      output_pcoll = input_pcoll | transforms.DataframeTransform(
          func, proxy=empty)
      assert_that(
          output_pcoll,
          lambda actual: check_correct(pd.concat(actual) if actual else None))
Example #6
class ConstructionTimeTest(unittest.TestCase):
    """Tests for operations that can be executed eagerly."""
    DF = pd.DataFrame({
        'str_col': ['foo', 'bar'],
        'int_col': [1, 2],
        'flt_col': [1.1, 2.2],
    })
    DEFERRED_DF = frame_base.DeferredFrame.wrap(
        expressions.PlaceholderExpression(DF))

    def _run_test(self, fn):
        self.assertEqual(fn(self.DEFERRED_DF), fn(self.DF))

    @parameterized.expand(DF.columns)
    def test_series_name(self, col_name):
        self._run_test(lambda df: df[col_name])

    @parameterized.expand(DF.columns)
    def test_series_dtype(self, col_name):
        self._run_test(lambda df: df[col_name].dtype)
        self._run_test(lambda df: df[col_name].dtypes)

    def test_dataframe_columns(self):
        self._run_test(lambda df: list(df.columns))

    def test_dataframe_dtypes(self):
        self._run_test(lambda df: list(df.dtypes))
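These tests pass without running a pipeline because attributes like columns and dtype are read eagerly from the proxy. A standalone sketch (assuming the same imports as the test above):

DF = pd.DataFrame({'str_col': ['foo', 'bar'], 'int_col': [1, 2]})
DEFERRED = frame_base.DeferredFrame.wrap(
    expressions.PlaceholderExpression(DF))

print(list(DEFERRED.columns))     # ['str_col', 'int_col'], no pipeline run
print(DEFERRED['int_col'].dtype)  # int64, read straight off the proxy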
Example #7
def to_dataframe(
    pcoll,  # type: pvalue.PCollection
    proxy=None,  # type: Optional[pandas.core.generic.NDFrame]
    label=None,  # type: Optional[str]
):
  # type: (...) -> frame_base.DeferredFrame

  """Converts a PCollection to a deferred dataframe-like object, which can
  be manipulated with pandas methods like `filter` and `groupby`.

  For example, one might write::

    pcoll = ...
    df = to_dataframe(pcoll, proxy=...)
    result = df.groupby('col').sum()
    pcoll_result = to_pcollection(result)

  A proxy object must be given if the schema for the PCollection is not known.
  """
  if proxy is None:
    if pcoll.element_type is None:
      raise ValueError(
          "Cannot infer a proxy because the input PCollection does not have a "
          "schema defined. Please make sure a schema type is specified for "
          "the input PCollection, or provide a proxy.")
    # If no proxy is given, assume this is an element-wise schema-aware
    # PCollection that needs to be batched.
    if label is None:
      # Attempt to come up with a reasonable, stable label by retrieving
      # the name of these variables in the calling context.
      label = 'BatchElements(%s)' % _var_name(pcoll, 2)
    proxy = schemas.generate_proxy(pcoll.element_type)
    pcoll = pcoll | label >> schemas.BatchRowsAsDataFrame(proxy=proxy)
  return frame_base.DeferredFrame.wrap(
      expressions.PlaceholderExpression(proxy, pcoll))
Example #8
    def _replace_with_cached_recur(
            self, expr: expressions.Expression,
            replaced_inputs: Dict[str, expressions.Expression]) -> None:
        """Recursive call for `replace_with_cached`.

    Recurses through the expression tree and replaces any cached inputs with
    `PlaceholderExpression`s.
    """

        final_inputs = []

        for input in expr.args():
            pc = self._get_cached(input)

            # Only read from the cache when the PCollection has been fully
            # computed, so that no partial results are used.
            if self._is_computed(pc):

                # Reuse previously seen cached expressions. This is so that the same
                # value isn't cached multiple times.
                if input._id in replaced_inputs:
                    cached = replaced_inputs[input._id]
                else:
                    cached = expressions.PlaceholderExpression(
                        input.proxy(), self._pcollection_cache[input._id])

                    replaced_inputs[input._id] = cached
                final_inputs.append(cached)
            else:
                final_inputs.append(input)
                self._replace_with_cached_recur(input, replaced_inputs)
        expr._args = tuple(final_inputs)
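The effect of the substitution, as a toy sketch (values hypothetical; assuming only the expressions module): a computed subtree is swapped for a PlaceholderExpression bound to its previously materialized result.

a = expressions.PlaceholderExpression(0)
expensive = expressions.ComputedExpression('expensive', lambda x: x + 1, [a])
root = expressions.ComputedExpression('double', lambda x: x * 2, [expensive])

# Pretend `expensive` was already computed and cached as 41.
cached = expressions.PlaceholderExpression(expensive.proxy())
root._args = (cached,)  # what the recursion above effectively does

assert expressions.Session({cached: 41}).evaluate(root) == 82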
Example #9
def __call__(self, *args, **kwargs):
    result = self._pandas_obj(*args, **kwargs)
    if type(result) in DeferredBase._pandas_type_map.keys():
        placeholder = expressions.PlaceholderExpression(result[0:0])
        self._test_env._inputs[placeholder] = result
        return DeferredBase.wrap(placeholder)
    else:
        return result
Example #10
    def expand(self, input_pcolls):
        def wrap_as_dict(values):
            if isinstance(values, dict):
                return values
            elif isinstance(values, tuple):
                return dict(enumerate(values))
            else:
                return {None: values}

        # TODO: Infer the proxy from the input schema.
        def proxy(key):
            if key is None:
                return self._proxy
            else:
                return self._proxy[key]

        # The input can be a dictionary, tuple, or plain PCollection.
        # Wrap as a dict for homogeneity.
        # TODO: Possibly inject batching here.
        input_dict = wrap_as_dict(input_pcolls)
        placeholders = {
            key: frame_base.DeferredFrame.wrap(
                expressions.PlaceholderExpression(proxy(key)))
            for key in input_dict.keys()
        }

        # The calling convention of the user-supplied func varies according to the
        # type of the input.
        if isinstance(input_pcolls, dict):
            result_frames = self._func(**placeholders)
        elif isinstance(input_pcolls, tuple):
            result_frames = self._func(
                *(value for _, value in sorted(placeholders.items())))
        else:
            result_frames = self._func(placeholders[None])

        # Likewise the output may be a dict, tuple, or raw (deferred) Dataframe.
        result_dict = wrap_as_dict(result_frames)

        result_pcolls = self._apply_deferred_ops(
            {
                placeholders[key]._expr: pcoll
                for key, pcoll in input_dict.items()
            }, {key: df._expr
                for key, df in result_dict.items()})

        # Convert the result back into a set of PCollections.
        if isinstance(result_frames, dict):
            return result_pcolls
        elif isinstance(result_frames, tuple):
            return tuple((value for _, value in sorted(result_pcolls.items())))
        else:
            return result_pcolls[None]
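A hedged usage sketch of the dict calling convention handled above (the names x and y are illustrative): dict inputs become keyword arguments to the user function, and the proxy is keyed the same way.

import apache_beam as beam
import pandas as pd
from apache_beam.dataframe import transforms

with beam.Pipeline() as p:
    xs = p | 'xs' >> beam.Create([pd.Series([1, 2, 3])])
    ys = p | 'ys' >> beam.Create([pd.Series([10, 20, 30])])
    # The func receives deferred frames as x= and y=; the result is a single
    # PCollection because the output is a plain (deferred) Series.
    sums = {'x': xs, 'y': ys} | transforms.DataframeTransform(
        lambda x, y: x + y,
        proxy={'x': pd.Series(dtype='int64'), 'y': pd.Series(dtype='int64')})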
Example #11
    def test_maybe_inplace(self):
        @frame_base.maybe_inplace
        def add_one(frame):
            return frame + 1

        frames.DeferredSeries.add_one = add_one
        original_expr = expressions.PlaceholderExpression(pd.Series([1, 2, 3]))
        x = frames.DeferredSeries(original_expr)
        x.add_one()
        self.assertIs(x._expr, original_expr)
        x.add_one(inplace=False)
        self.assertIs(x._expr, original_expr)
        x.add_one(inplace=True)
        self.assertIsNot(x._expr, original_expr)
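A hedged sketch of what a maybe_inplace-style decorator does (simplified; the real one lives in apache_beam.dataframe.frame_base): with inplace=True the result's expression replaces the receiver's, which is what the assertIsNot above detects.

import functools

def maybe_inplace(func):
    @functools.wraps(func)
    def wrapper(self, inplace=False, **kwargs):
        result = func(self, **kwargs)
        if inplace:
            self._expr = result._expr  # mutate the receiver's expression
        else:
            return result
    return wrapper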
Example #12
    def run_scenario(self, input, func):
        expected = func(input)

        empty = input[0:0]
        input_placeholder = expressions.PlaceholderExpression(empty)
        input_deferred = frame_base.DeferredFrame.wrap(input_placeholder)
        actual_deferred = func(input_deferred)._expr.evaluate_at(
            expressions.Session({input_placeholder: input}))

        check_correct(expected, actual_deferred)

        with beam.Pipeline() as p:
            input_pcoll = p | beam.Create([input[::2], input[1::2]])
            output_pcoll = input_pcoll | transforms.DataframeTransform(
                func, proxy=empty, yield_elements='pandas')
            assert_that(output_pcoll,
                        lambda actual: check_correct(expected, concat(actual)))
Example #13
def to_dataframe(
        pcoll,  # type: pvalue.PCollection
        proxy,  # type: pandas.core.generic.NDFrame
):
    # type: (...) -> frame_base.DeferredFrame
    """Convers a PCollection to a deferred dataframe-like object, which can
  manipulated with pandas methods like `filter` and `groupby`.

  For example, one might write::

    pcoll = ...
    df = to_dataframe(pcoll, proxy=...)
    result = df.groupby('col').sum()
    pcoll_result = to_pcollection(result)

  A proxy object must be given if the schema for the PCollection is not known.
  """
    return frame_base.DeferredFrame.wrap(
        expressions.PlaceholderExpression(proxy, pcoll))
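A hedged sketch of building the required proxy: any empty pandas object with the right columns and dtypes works, for example a zero-row slice of a representative frame (all names here are illustrative).

import apache_beam as beam
import pandas as pd
from apache_beam.dataframe import convert

template = pd.DataFrame({'col': ['a'], 'value': [1.0]})
proxy = template.iloc[:0]  # zero rows, but keeps column names and dtypes

with beam.Pipeline() as p:
    # A PCollection whose elements are DataFrames with the proxy's layout.
    pcoll = p | beam.Create([template, template])
    df = convert.to_dataframe(pcoll, proxy=proxy)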
Example #14
    def run_scenario(self, input, func):
        expected = func(input)

        empty = input[0:0]
        input_placeholder = expressions.PlaceholderExpression(empty)
        input_deferred = frame_base.DeferredFrame.wrap(input_placeholder)
        actual_deferred = func(input_deferred)._expr.evaluate_at(
            expressions.Session({input_placeholder: input}))

        def concat(parts):
            if len(parts) > 1:
                return pd.concat(parts)
            elif len(parts) == 1:
                return parts[0]
            else:
                return None

        def check_correct(actual):
            if actual is None:
                raise AssertionError('Empty frame but expected: \n\n%s' %
                                     (expected))
            if isinstance(expected, pd.core.generic.NDFrame):
                sorted_actual = actual.sort_index()
                sorted_expected = expected.sort_index()
                if not sorted_actual.equals(sorted_expected):
                    raise AssertionError('Dataframes not equal: \n\n%s\n\n%s' %
                                         (sorted_actual, sorted_expected))
            else:
                if actual != expected:
                    raise AssertionError('Scalars not equal: %s != %s' %
                                         (actual, expected))

        check_correct(actual_deferred)

        with beam.Pipeline() as p:
            input_pcoll = p | beam.Create([input[::2], input[1::2]])
            output_pcoll = input_pcoll | transforms.DataframeTransform(
                func, proxy=empty)
            assert_that(output_pcoll,
                        lambda actual: check_correct(concat(actual)))
Example #15
def _use_non_parallel_operation(self):
    _ = frame_base.DeferredFrame.wrap(
        expressions.PlaceholderExpression(pd.Series([1, 2, 3]))).replace(
            'a', 'b', limit=1)
Example #16
 def wrapper(*args, **kwargs):
   df = pandas_type(*args, **kwargs)
   placeholder = expressions.PlaceholderExpression(df[0:0])
   self._inputs[placeholder] = df
   return deferred_type(placeholder)
Example #17
def to_pcollection(
    *dataframes,  # type: Union[frame_base.DeferredFrame, pd.DataFrame, pd.Series]
    label=None,
    always_return_tuple=False,
    yield_elements='schemas',
    include_indexes=False,
    pipeline=None
) -> Union[pvalue.PCollection, Tuple[pvalue.PCollection, ...]]:
    """Converts one or more deferred dataframe-like objects back to a PCollection.

  This method creates and applies the actual Beam operations that compute
  the given deferred dataframes, returning a PCollection of their results. By
  default the resulting PCollections are schema-aware PCollections where each
  element is one row from the output dataframes, excluding indexes. This
  behavior can be modified with the `yield_elements` and `include_indexes`
  arguments.

  Also accepts non-deferred pandas dataframes, which are converted to deferred,
  schema'd PCollections. In this case the contents of the entire dataframe are
  serialized into the graph, so for large amounts of data it is preferable to
  write them to disk and read them with one of the read methods.

  If more than one (related) result is desired, it can be more efficient to
  pass them all at the same time to this method.

  Args:
    label: (optional, default "ToPCollection(...)") the label to use for the
        conversion transform.
    always_return_tuple: (optional, default: False) If true, always return
        a tuple of PCollections, even if there's only one output.
    yield_elements: (optional, default: "schemas") If set to "pandas", return
        PCollections containing the raw Pandas objects (DataFrames or Series);
        if set to "schemas", return an element-wise PCollection, where DataFrame
        and Series instances are expanded to one element per row. DataFrames are
        converted to schema-aware PCollections, where column values can be
        accessed by attribute.
    include_indexes: (optional, default: False) When yield_elements="schemas",
        if include_indexes=True, attempt to include index columns in the output
        schema for expanded DataFrames. Raises an error if any of the index
        levels are unnamed (name=None), or if any of the names are not unique
        among all column and index names.
    pipeline: (optional, unless non-deferred dataframes are passed) Used when
        creating a PCollection from a non-deferred dataframe.
  """
    if yield_elements not in ("pandas", "schemas"):
        raise ValueError("Invalid value for yield_elements argument, '%s'. "
                         "Allowed values are 'pandas' and 'schemas'" %
                         yield_elements)
    if label is None:
        # Attempt to come up with a reasonable, stable label by retrieving the name
        # of these variables in the calling context.
        label = 'ToPCollection(%s)' % ', '.join(
            _var_name(e, 3) for e in dataframes)

    # Support for non-deferred dataframes.
    deferred_dataframes = []
    for ix, df in enumerate(dataframes):
        if isinstance(df, frame_base.DeferredBase):
            # TODO(robertwb): Maybe extract pipeline object?
            deferred_dataframes.append(df)
        elif isinstance(df, (pd.Series, pd.DataFrame)):
            if pipeline is None:
                raise ValueError(
                    'Pipeline keyword required for non-deferred dataframe conversion.'
                )
            deferred = pipeline | '%s_Defer%s' % (label, ix) >> beam.Create(
                [df])
            deferred_dataframes.append(
                frame_base.DeferredFrame.wrap(
                    expressions.PlaceholderExpression(df.iloc[:0], deferred)))
        else:
            raise TypeError(
                'Unable to convert objects of type %s to a PCollection' %
                type(df))
    dataframes = tuple(deferred_dataframes)

    def extract_input(placeholder):
        if not isinstance(placeholder._reference, pvalue.PCollection):
            raise TypeError(
                'Expression roots must have been created with to_dataframe.')
        return placeholder._reference

    placeholders = frozenset.union(
        frozenset(), *[df._expr.placeholders() for df in dataframes])

    # Exclude any dataframes that have already been converted to PCollections.
    # We only want to convert each DF expression once, then re-use.
    new_dataframes = [
        df for df in dataframes if df._expr._id not in TO_PCOLLECTION_CACHE
    ]
    if len(new_dataframes):
        new_results = {p: extract_input(p)
                       for p in placeholders
                       } | label >> transforms._DataframeExpressionsTransform({
                           ix: df._expr
                           for (ix, df) in enumerate(new_dataframes)
                       })  # type: Dict[Any, pvalue.PCollection]

        TO_PCOLLECTION_CACHE.update({
            new_dataframes[ix]._expr._id: pc
            for ix, pc in new_results.items()
        })

    raw_results = {
        ix: TO_PCOLLECTION_CACHE[df._expr._id]
        for ix, df in enumerate(dataframes)
    }

    if yield_elements == "schemas":

        def maybe_unbatch(pc, value):
            if isinstance(value, frame_base._DeferredScalar):
                return pc
            else:
                return _make_unbatched_pcoll(pc, value._expr, include_indexes)

        results = {
            ix: maybe_unbatch(pc, dataframes[ix])
            for (ix, pc) in raw_results.items()
        }
    else:
        results = raw_results

    if len(results) == 1 and not always_return_tuple:
        return results[0]
    else:
        return tuple(value for key, value in sorted(results.items()))
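A hedged usage sketch of the non-deferred path described above: plain pandas inputs require the pipeline keyword, and each row comes back as one schema'd element.

import apache_beam as beam
import pandas as pd
from apache_beam.dataframe import convert

with beam.Pipeline() as p:
    pc = convert.to_pcollection(
        pd.DataFrame({'a': [1, 2], 'b': ['x', 'y']}), pipeline=p)
    _ = pc | beam.Map(print)  # one schema'd element per row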
Example #18
 def test_expression_proxy_error(self):
   a = expressions.PlaceholderExpression(1)
   b = expressions.PlaceholderExpression('s')
   with self.assertRaises(TypeError):
     expressions.ComputedExpression('add', lambda a, b: a + b, [a, b])
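The TypeError comes from proxy inference: with no proxy given, ComputedExpression applies the func to the argument proxies, and 1 + 's' fails. A hedged sketch of sidestepping inference by passing proxy= explicitly (the keyword exists in the Beam source, but this exact usage is illustrative):

a = expressions.PlaceholderExpression(1)
b = expressions.PlaceholderExpression('s')
expr = expressions.ComputedExpression(
    'concat', lambda a, b: str(a) + b, [a, b], proxy='')
assert expressions.Session({a: 4, b: '2'}).evaluate(expr) == '42'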
Example #19
 def test_expression_proxy(self):
   a = expressions.PlaceholderExpression(1)
   b = expressions.PlaceholderExpression(2)
   a_plus_b = expressions.ComputedExpression('add', lambda a, b: a + b, [a, b])
   self.assertEqual(a_plus_b.proxy(), 3)
Example #20
 def test_placeholder_expression(self):
   a = expressions.PlaceholderExpression(None)
   b = expressions.PlaceholderExpression(None)
   session = expressions.Session({a: 1, b: 2})
   self.assertEqual(session.evaluate(a), 1)
   self.assertEqual(session.evaluate(b), 2)