Example #1
0
 def aggregate(self, func, axis=0, *args, **kwargs):
     """Deferred ``Series.aggregate``/``agg``.

     A list of more than one function is computed one function at a time and
     the single-function results are concatenated. A single function is
     either pre-aggregated per partition (when associative and called with no
     extra arguments) or aggregated in one non-parallel step.

     NOTE(review): ``axis`` is accepted for pandas signature compatibility
     but is never referenced in this body.
     """
     if isinstance(func, list) and len(func) > 1:
         # Aggregate each column separately, then stick them all together.
         rows = [self.agg([f], *args, **kwargs) for f in func]
         return frame_base.DeferredFrame.wrap(
             expressions.ComputedExpression('join_aggregate',
                                            lambda *rows: pd.concat(rows),
                                            [row._expr for row in rows]))
     else:
         # We're only handling a single column.
         base_func = func[0] if isinstance(func, list) else func
         # An associative aggregation with no extra arguments can be partially
         # applied to each partition in parallel; only the final recombine
         # then needs to run non-parallel.
         if _is_associative(base_func) and not args and not kwargs:
             intermediate = expressions.elementwise_expression(
                 'pre_aggregate',
                 lambda s: s.agg([base_func], *args, **kwargs),
                 [self._expr])
             allow_nonparallel_final = True
         else:
             intermediate = self._expr
             allow_nonparallel_final = None  # i.e. don't change the value
         with expressions.allow_non_parallel_operations(
                 allow_nonparallel_final):
             return frame_base.DeferredFrame.wrap(
                 expressions.ComputedExpression(
                     'aggregate',
                     lambda s: s.agg(func, *args, **kwargs), [intermediate],
                     preserves_partition_by=partitionings.Singleton(),
                     requires_partition_by=partitionings.Singleton()))
Example #2
0
def _run_patched(func, *args, **kwargs):
    """Invoke *func* (a doctest entry point) with pandas display options set
    and ``doctest.DocTestRunner`` temporarily swapped for a
    ``BeamDataframeDoctestRunner``; the original runner is always restored.
    """
    set_pandas_options()

    # https://github.com/pandas-dev/pandas/blob/1.0.x/setup.cfg#L63
    flags = kwargs.pop('optionflags', 0) | (
        doctest.NORMALIZE_WHITESPACE | doctest.IGNORE_EXCEPTION_DETAIL)

    env = TestEnvironment()
    runner_opts = {
        'use_beam': kwargs.pop('use_beam', True),
        'skip': kwargs.pop('skip', {}),
        'wont_implement_ok': kwargs.pop('wont_implement_ok', {}),
        'not_implemented_ok': kwargs.pop('not_implemented_ok', {}),
    }
    extraglobs = dict(kwargs.pop('extraglobs', {}))
    extraglobs['pd'] = env.fake_pandas_module()

    saved_runner = doctest.DocTestRunner
    try:
        # Unfortunately the runner is not injectable.
        doctest.DocTestRunner = lambda **kw: BeamDataframeDoctestRunner(
            env, **runner_opts, **kw)
        with expressions.allow_non_parallel_operations():
            return func(*args,
                        extraglobs=extraglobs,
                        optionflags=flags,
                        **kwargs)
    finally:
        doctest.DocTestRunner = saved_runner
Example #3
0
def _run_patched(func, *args, **kwargs):
    """Invoke *func* (a doctest entry point) with pandas doctest settings and
    ``doctest.DocTestRunner`` temporarily swapped for a
    ``BeamDataframeDoctestRunner``; the original runner is always restored.
    """
    # Capture the runner to restore *before* entering the try block.
    # Previously this assignment sat after the setup code inside the try, so
    # any exception raised during setup (e.g. in kwargs handling or
    # TestEnvironment()) caused the finally clause to fail with
    # UnboundLocalError, masking the original error.
    original_doc_test_runner = doctest.DocTestRunner
    try:
        # See
        # https://github.com/pandas-dev/pandas/blob/a00202d12d399662b8045a8dd3fdac04f18e1e55/doc/source/conf.py#L319
        np.random.seed(123456)
        np.set_printoptions(precision=4, suppress=True)
        pd.options.display.max_rows = 15

        # https://github.com/pandas-dev/pandas/blob/1.0.x/setup.cfg#L63
        optionflags = kwargs.pop('optionflags', 0)
        optionflags |= (doctest.NORMALIZE_WHITESPACE
                        | doctest.IGNORE_EXCEPTION_DETAIL)

        env = TestEnvironment()
        use_beam = kwargs.pop('use_beam', True)
        skip = kwargs.pop('skip', {})
        wont_implement_ok = kwargs.pop('wont_implement_ok', {})
        extraglobs = dict(kwargs.pop('extraglobs', {}))
        extraglobs['pd'] = env.fake_pandas_module()
        # Unfortunately the runner is not injectable.
        doctest.DocTestRunner = lambda **kwargs: BeamDataframeDoctestRunner(
            env,
            use_beam=use_beam,
            wont_implement_ok=wont_implement_ok,
            skip=skip,
            **kwargs)
        with expressions.allow_non_parallel_operations():
            return func(*args,
                        extraglobs=extraglobs,
                        optionflags=optionflags,
                        **kwargs)
    finally:
        doctest.DocTestRunner = original_doc_test_runner
Example #4
0
def teststring(text, report=True, **runner_kwargs):
  """Run the doctests in *text* under a Beam dataframe doctest runner.

  Returns the runner's result; prints a summary when *report* is true.
  """
  flags = runner_kwargs.pop('optionflags', 0)
  flags |= doctest.NORMALIZE_WHITESPACE | doctest.IGNORE_EXCEPTION_DETAIL

  # A truthy *_ok flag whitelists every example of this anonymous test.
  wont_implement = runner_kwargs.pop('wont_implement_ok', False)
  not_implemented = runner_kwargs.pop('not_implemented_ok', False)

  runner = BeamDataframeDoctestRunner(
      TestEnvironment(),
      optionflags=flags,
      wont_implement_ok={'<string>': ['*']} if wont_implement else None,
      not_implemented_ok={'<string>': ['*']} if not_implemented else None,
      **runner_kwargs)
  globs = {'pd': runner.fake_pandas_module(), 'np': np}
  test = doctest.DocTestParser().get_doctest(
      text, globs, '<string>', '<string>', 0)
  with expressions.allow_non_parallel_operations():
    outcome = runner.run(test)
  if report:
    runner.summarize()
  return outcome
Example #5
0
  def dot(self, other):
    """Deferred ``DataFrame.dot``.

    ``other`` may be another deferred frame (collapsed to one partition) or a
    plain in-memory value; either way it is wrapped in ``AsScalar`` so the
    whole right hand side is broadcast to every partition of the left.
    """
    # We want to broadcast the right hand side to all partitions of the left.
    # This is OK, as its index must be the same size as the columns set of self,
    # so cannot be too large.
    class AsScalar(object):
      # Marker wrapper: treat the held value as a single scalar-like side
      # input rather than a partitioned frame.
      def __init__(self, value):
        self.value = value

    if isinstance(other, frame_base.DeferredFrame):
      proxy = other._expr.proxy()
      # Collapsing `other` onto a single partition needs the non-parallel
      # escape hatch.
      with expressions.allow_non_parallel_operations():
        side = expressions.ComputedExpression(
            'as_scalar',
            lambda df: AsScalar(df),
            [other._expr],
            requires_partition_by=partitionings.Singleton())
    else:
      # NOTE(review): assumes `other` is a 2-d, indexable sequence (e.g. a
      # list of rows) — confirm against callers.
      proxy = pd.DataFrame(columns=range(len(other[0])))
      side = expressions.ConstantExpression(AsScalar(other))

    return frame_base.DeferredFrame.wrap(
        expressions.ComputedExpression(
            'dot',
            lambda left, right: left @ right.value,
            [self._expr, side],
            requires_partition_by=partitionings.Nothing(),
            preserves_partition_by=partitionings.Index(),
            proxy=proxy))
Example #6
0
 def test_sum_mean(self):
     """Grouped sum/mean on a small frame should match pandas."""
     frame = pd.DataFrame({
         'Animal': ['Falcon', 'Falcon', 'Parrot', 'Parrot'],
         'Speed': [380., 370., 24., 26.],
     })
     with expressions.allow_non_parallel_operations():
         for aggregation in ('sum', 'mean'):
             self.run_scenario(
                 frame,
                 lambda df, agg=aggregation: getattr(
                     df.groupby('Animal'), agg)())
Example #7
0
 def aggregate(self, func, axis=0, *args, **kwargs):
     """Deferred ``DataFrame.aggregate``/``agg``.

     Dispatches on ``axis`` and on the shape of ``func``:

     * ``axis=None``: aggregate across columns first, then across rows.
     * ``axis in (1, 'columns')``: an elementwise, per-row aggregation.
     * no columns, or extra args/kwargs: colocate everything and defer to
       pandas in a single non-parallel step.
     * otherwise: aggregate each column separately and recombine; the result
       is a DataFrame when any column uses a list of aggregators, else a
       Series.
     """
     if axis is None:
         # Aggregate across all elements by first aggregating across columns,
         # then across rows.
         return self.agg(func, *args, **dict(kwargs, axis=1)).agg(
             func, *args, **dict(kwargs, axis=0))
     elif axis in (1, 'columns'):
         # This is an easy elementwise aggregation.
         return frame_base.DeferredFrame.wrap(
             expressions.ComputedExpression(
                 'aggregate',
                 lambda df: df.agg(func, axis=1, *args, **kwargs),
                 [self._expr],
                 requires_partition_by=partitionings.Nothing()))
     elif len(self._expr.proxy().columns) == 0 or args or kwargs:
         # For these corner cases, just colocate everything.
         return frame_base.DeferredFrame.wrap(
           expressions.ComputedExpression(
               'aggregate',
               lambda df: df.agg(func, *args, **kwargs),
               [self._expr],
               requires_partition_by=partitionings.Singleton()))
     else:
         # In the general case, compute the aggregation of each column separately,
         # then recombine.
         if not isinstance(func, dict):
             # Normalize to a {column: func} mapping over all proxy columns.
             col_names = list(self._expr.proxy().columns)
             func = {col: func for col in col_names}
         else:
             col_names = list(func.keys())
         aggregated_cols = []
         for col in col_names:
             funcs = func[col]
             if not isinstance(funcs, list):
                 funcs = [funcs]
             # Per-column aggregation defers to the Series implementation.
             aggregated_cols.append(self[col].agg(funcs, *args, **kwargs))
         # The final shape is different depending on whether any of the columns
         # were aggregated by a list of aggregators.
         with expressions.allow_non_parallel_operations():
             if any(isinstance(funcs, list) for funcs in func.values()):
                 return frame_base.DeferredFrame.wrap(
                     expressions.ComputedExpression(
                         'join_aggregate',
                         lambda *cols: pd.DataFrame(
                             {col: value for col, value in zip(col_names, cols)}),
                         [col._expr for col in aggregated_cols],
                         requires_partition_by=partitionings.Singleton()))
             else:
                 return frame_base.DeferredFrame.wrap(
                   expressions.ComputedExpression(
                       'join_aggregate',
                         lambda *cols: pd.Series(
                             {col: value[0] for col, value in zip(col_names, cols)}),
                       [col._expr for col in aggregated_cols],
                       requires_partition_by=partitionings.Singleton(),
                       proxy=self._expr.proxy().agg(func, *args, **kwargs)))
    def test_scalar(self):
        """Scalar aggregations, alone and consumed by downstream stages."""
        with expressions.allow_non_parallel_operations():
            series = pd.Series([1, 2, 6])
            self.run_scenario(series, lambda a: a.agg(sum))
            self.run_scenario(series, lambda a: a / a.agg(sum))

            # A scalar result may also feed a later stage of the pipeline.
            frame = pd.DataFrame({'key': ['a', 'a', 'b'], 'val': [1, 2, 6]})
            self.run_scenario(
                frame,
                lambda df: df.groupby('key').sum().val / df.val.agg(sum))
Example #9
0
 def apply(self, func, name=None, args=()):
     """Apply *func* to this deferred scalar (plus any deferred *args*).

     When every extra argument is itself a deferred scalar the operation is
     permitted to run non-parallel; otherwise the ambient setting is kept.
     """
     if name is None:
         name = func.__name__
     all_scalar_args = all(isinstance(arg, _DeferredScalar) for arg in args)
     with expressions.allow_non_parallel_operations(
             True if all_scalar_args else None):
         inputs = [self._expr] + [arg._expr for arg in args]
         return DeferredFrame.wrap(
             expressions.ComputedExpression(
                 name,
                 func,
                 inputs,
                 requires_partition_by=partitionings.Singleton()))
Example #10
0
  def test_rename(self):
    """rename over columns and index, with and without errors='raise'."""
    frame = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]})
    relabeling = dict(columns={'B': 'C'}, index={0: 2, 2: 0})

    self.run_scenario(frame, lambda df: df.rename(**relabeling))

    # errors='raise' needs the non-parallel escape hatch.
    with expressions.allow_non_parallel_operations():
      self.run_scenario(
          frame, lambda df: df.rename(errors='raise', **relabeling))
Example #11
0
  def test_filter(self):
    """DataFrame.filter by items/regex, plus whole-frame aggregations."""
    frame = pd.DataFrame({
        'Animal': ['Aardvark', 'Ant', 'Elephant', 'Zebra'],
        'Speed': [5, 2, 35, 40],
    })
    for scenario in (
        lambda df: df.filter(items=['Animal']),
        lambda df: df.filter(regex='Anim.*'),
        lambda df: df.set_index('Animal').filter(regex='F.*', axis='index'),
    ):
      self.run_scenario(frame, scenario)

    # Aggregations collapse to one partition, so allow non-parallel ops.
    with expressions.allow_non_parallel_operations():
      single_col = pd.DataFrame({'col': [1, 2, 3]})
      self.run_scenario(single_col, lambda a: a.agg(sum))
      self.run_scenario(single_col, lambda a: a.agg(['mean', 'min', 'max']))
Example #12
0
 def nsmallest(self, **kwargs):
   """Deferred ``nsmallest``.

   Computed as the n smallest of each partition, recombined, then the n
   smallest of the combined result.

   As with the sibling ``nlargest``, ``keep='any'`` is accepted as an
   order-insensitive Beam alternative (evaluated as pandas' ``'first'``);
   other order-sensitive values of ``keep`` are rejected. Previously
   ``'any'`` was rejected here, inconsistently with ``nlargest``.
   """
   if kwargs.get('keep') == 'any':
     # Mirror nlargest: 'any' means we don't care which ties survive.
     kwargs['keep'] = 'first'
   elif 'keep' in kwargs and kwargs['keep'] != 'all':
     raise frame_base.WontImplementError('order-sensitive')
   # First take the n smallest within each partition in parallel...
   per_partition = expressions.ComputedExpression(
       'nsmallest-per-partition',
       lambda df: df.nsmallest(**kwargs), [self._expr],
       preserves_partition_by=partitionings.Singleton(),
       requires_partition_by=partitionings.Nothing())
   # ...then recombine and take the n smallest of those candidates.
   with expressions.allow_non_parallel_operations(True):
     return frame_base.DeferredFrame.wrap(
         expressions.ComputedExpression(
             'nsmallest',
             lambda df: df.nsmallest(**kwargs), [per_partition],
             preserves_partition_by=partitionings.Singleton(),
             requires_partition_by=partitionings.Singleton()))
Example #13
0
def teststrings(texts, report=False, **runner_kwargs):
    """Run each named doctest string in *texts* under the Beam runner.

    *texts* maps test names to doctest source; returns the overall result.
    """
    flags = runner_kwargs.pop('optionflags', 0)
    flags |= doctest.NORMALIZE_WHITESPACE | doctest.IGNORE_EXCEPTION_DETAIL

    runner = BeamDataframeDoctestRunner(
        TestEnvironment(), optionflags=flags, **runner_kwargs)
    globs = {
        'pd': runner.fake_pandas_module(),
        'np': np,
        'option_context': pd.option_context,
    }
    parser = doctest.DocTestParser()
    with expressions.allow_non_parallel_operations():
        for name, text in texts.items():
            runner.run(parser.get_doctest(text, globs, name, name, 0))
    if report:
        runner.summarize()
    return runner.summary().result()
Example #14
0
 def nlargest(self, keep, **kwargs):
   """Deferred ``nlargest``.

   Only order-insensitive values of ``keep`` are supported: ``'all'``, or
   the Beam-specific ``'any'`` (evaluated as pandas' ``'first'``).
   """
   # TODO(robertwb): Document 'any' option.
   # TODO(robertwb): Consider (conditionally) defaulting to 'any' if no
   # explicit keep parameter is requested.
   if keep not in ('any', 'all'):
     raise frame_base.WontImplementError('order-sensitive')
   kwargs['keep'] = 'first' if keep == 'any' else keep
   # n largest within each partition, computed in parallel...
   local_tops = expressions.ComputedExpression(
       'nlargest-per-partition',
       lambda df: df.nlargest(**kwargs), [self._expr],
       preserves_partition_by=partitionings.Singleton(),
       requires_partition_by=partitionings.Nothing())
   # ...then the n largest of the combined per-partition candidates.
   with expressions.allow_non_parallel_operations(True):
     return frame_base.DeferredFrame.wrap(
         expressions.ComputedExpression(
             'nlargest',
             lambda df: df.nlargest(**kwargs), [local_tops],
             preserves_partition_by=partitionings.Singleton(),
             requires_partition_by=partitionings.Singleton()))
Example #15
0
  def dot(self, other):
    """Deferred ``Series.dot`` against a deferred Series or DataFrame.

    Raises:
      frame_base.WontImplementError: if ``other`` is not a deferred frame
        (the result would not be deferred).
    """
    left = self._expr
    if isinstance(other, DeferredSeries):
      # Lift the Series to a DataFrame so both cases share the
      # matrix-product path below.
      right = expressions.ComputedExpression(
          'to_dataframe',
          pd.DataFrame, [other._expr],
          requires_partition_by=partitionings.Nothing(),
          preserves_partition_by=partitionings.Index())
      right_is_series = True
    elif isinstance(other, DeferredDataFrame):
      right = other._expr
      right_is_series = False
    else:
      raise frame_base.WontImplementError('non-deferred result')

    # Per-partition partial products, transposed so the final reduction is a
    # plain column-wise sum.
    dots = expressions.ComputedExpression(
        'dot',
        # Transpose so we can sum across rows.
        (lambda left, right: pd.DataFrame(left @ right).T),
        [left, right],
        requires_partition_by=partitionings.Index())
    with expressions.allow_non_parallel_operations(True):
      # Combine the partial products on a single partition.
      sums = expressions.ComputedExpression(
          'sum',
          lambda dots: dots.sum(),  #
          [dots],
          requires_partition_by=partitionings.Singleton())

      if right_is_series:
        # Series @ Series is a scalar: extract it from column 0.
        result = expressions.ComputedExpression(
            'extract',
            lambda df: df[0], [sums],
            requires_partition_by=partitionings.Singleton())
      else:
        result = sums
      return frame_base.DeferredFrame.wrap(result)
Example #16
0
 def test_aggregate(self):
     """Whole-frame aggregations require the non-parallel escape hatch."""
     frame = pd.DataFrame({'col': [1, 2, 3]})
     with expressions.allow_non_parallel_operations():
         self.run_scenario(frame, lambda a: a.agg(sum))
         self.run_scenario(frame, lambda a: a.agg(['mean', 'min', 'max']))