Example #1
    def test_unbatch_with_index(self, df_or_series, rows):
        proxy = df_or_series[:0]

        with TestPipeline() as p:
            res = (p | beam.Create([df_or_series[::2], df_or_series[1::2]])
                   | schemas.UnbatchPandas(proxy, include_indexes=True))

            assert_that(res, equal_to(rows))
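`df_or_series` and `rows` are supplied by a parameterization decorator that is not shown in this excerpt. A hypothetical instantiation of those parameters, assuming a small DataFrame with a single named index level:

import pandas as pd

# Hypothetical parameters for the test above (not part of the original
# source): a DataFrame with a named index, which the test splits into
# even/odd row batches before unbatching.
df_or_series = pd.DataFrame({'foo': [1, 2, 3, 4]},
                            index=pd.Index([10, 20, 30, 40], name='idx'))

# With include_indexes=True, each unbatched element exposes the index level
# as a field alongside the columns, roughly: (idx=10, foo=1), (idx=20, foo=2), ...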
Example #2
    def test_unbatch_include_index_unnamed_index_raises(self):
        df = pd.DataFrame({'foo': [1, 2, 3, 4]})
        proxy = df[:0]

        with TestPipeline() as p:
            pc = p | beam.Create([df[::2], df[1::2]])

            with self.assertRaisesRegex(ValueError, 'unnamed'):
                _ = pc | schemas.UnbatchPandas(proxy, include_indexes=True)
Example #3
    def test_unbatch_include_index_column_conflict_raises(self):
        df = pd.DataFrame({'foo': [1, 2, 3, 4]})
        df.index = pd.Index([4, 3, 2, 1], name='foo')
        proxy = df[:0]

        with TestPipeline() as p:
            pc = p | beam.Create([df[::2], df[1::2]])

            with self.assertRaisesRegex(ValueError, 'foo'):
                _ = pc | schemas.UnbatchPandas(proxy, include_indexes=True)
Example #4
    def test_unbatch_include_index_nonunique_index_raises(self):
        df = pd.DataFrame({'foo': [1, 2, 3, 4]})
        df.index = pd.MultiIndex.from_arrays([[1, 2, 3, 4], [4, 3, 2, 1]],
                                             names=['bar', 'bar'])
        proxy = df[:0]

        with TestPipeline() as p:
            pc = p | beam.Create([df[::2], df[1::2]])

            with self.assertRaisesRegex(ValueError, 'bar'):
                _ = pc | schemas.UnbatchPandas(proxy, include_indexes=True)
Example #5
    def test_unbatch_no_index(self, df_or_series, rows, beam_type):
        proxy = df_or_series[:0]

        with TestPipeline() as p:
            res = (p | beam.Create([df_or_series[::2], df_or_series[1::2]])
                   | schemas.UnbatchPandas(proxy))

            # Verify that the unbatched PCollection has the expected typehint
            # TODO(BEAM-8538): typehints should support NamedTuple so we can use
            # typehints.is_consistent_with here instead
            self.assert_typehints_equal(res.element_type, beam_type)

            assert_that(res, equal_to(rows))
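As in Example #1, the test parameters come from a decorator that is not shown; `beam_type` is the element typehint the unbatched PCollection is expected to carry. A hypothetical parameter pair for a one-column DataFrame (the exact dtype mapping is an assumption, not taken from the original source):

import typing

import numpy as np
import pandas as pd

# Hypothetical parameters: a plain int64 column, and the NamedTuple-style
# schema type that UnbatchPandas is expected to infer for it, assuming
# Beam's pandas-to-schema conversion maps int64 columns to np.int64 fields.
df_or_series = pd.DataFrame({'foo': [1, 2, 3, 4]})
beam_type = typing.NamedTuple('BeamSchema', [('foo', np.int64)])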
Example #6
def _make_unbatched_pcoll(pc: pvalue.PCollection, expr: expressions.Expression,
                          include_indexes: bool):
    label = f"Unbatch '{expr._id}'"
    if include_indexes:
        label += " with indexes"

    if label not in UNBATCHED_CACHE:
        UNBATCHED_CACHE[label] = pc | label >> schemas.UnbatchPandas(
            expr.proxy(), include_indexes=include_indexes)

    # Note unbatched cache is keyed by the expression id as well as parameters
    # for the unbatching (i.e. include_indexes)
    return UNBATCHED_CACHE[label]
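This helper assumes a module-level `UNBATCHED_CACHE` dict, as in Beam's `apache_beam.dataframe.convert`. A minimal sketch of that state and of the memoization it provides:

from typing import Dict

from apache_beam import pvalue

# Module-level cache assumed by _make_unbatched_pcoll above: maps the
# generated label (expression id plus unbatching parameters) to the
# already-unbatched PCollection.
UNBATCHED_CACHE = {}  # type: Dict[str, pvalue.PCollection]

# Repeated calls with the same expression and the same include_indexes flag
# return the identical cached PCollection, so the Unbatch transform is only
# applied once per (expression, parameters) pair:
#   first = _make_unbatched_pcoll(pc, expr, include_indexes=True)
#   second = _make_unbatched_pcoll(pc, expr, include_indexes=True)
#   assert first is second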
Example #7
    def test_unbatch_datetime(self):
        s = pd.Series(
            pd.date_range('1/1/2000',
                          periods=100,
                          freq='m',
                          tz='America/Los_Angeles'))
        proxy = s[:0]

        with TestPipeline() as p:
            res = (p | beam.Create([s[::2], s[1::2]])
                   | schemas.UnbatchPandas(proxy, include_indexes=True))

            assert_that(res, equal_to(list(s)))
Example #8
def to_pcollection(
    *dataframes,  # type: frame_base.DeferredFrame
    **kwargs):
  # type: (...) -> Union[pvalue.PCollection, Tuple[pvalue.PCollection, ...]]

  """Converts one or more deferred dataframe-like objects back to a PCollection.

  This method creates and applies the actual Beam operations that compute
  the given deferred dataframes, returning a PCollection of their results. By
  default the resulting PCollections are schema-aware PCollections where each
  element is one row from the output dataframes, excluding indexes. This
  behavior can be modified with the `yield_elements` and `include_indexes`
  arguments.

  If more than one (related) result is desired, it can be more efficient to
  pass them all at the same time to this method.

  Args:
    always_return_tuple: (optional, default: False) If true, always return
        a tuple of PCollections, even if there's only one output.
    yield_elements: (optional, default: "schemas") If set to "pandas", return
        PCollections containing the raw Pandas objects (DataFrames or Series),
        if set to "schemas", return an element-wise PCollection, where DataFrame
        and Series instances are expanded to one element per row. DataFrames are
        converted to schema-aware PCollections, where column values can be
        accessed by attribute.
    include_indexes: (optional, default: False) When yield_elements="schemas",
        if include_indexes=True, attempt to include index columns in the output
        schema for expanded DataFrames. Raises an error if any of the index
        levels are unnamed (name=None), or if any of the names are not unique
        among all column and index names.
    label: (optional) A label for the applied conversion transform. If not
        specified, a label is generated from the names of the dataframe
        variables in the calling context.
  """
  label = kwargs.pop('label', None)
  always_return_tuple = kwargs.pop('always_return_tuple', False)
  yield_elements = kwargs.pop('yield_elements', 'schemas')
  if yield_elements not in ("pandas", "schemas"):
    raise ValueError(
        "Invalid value for yield_elements argument, '%s'. "
        "Allowed values are 'pandas' and 'schemas'" % yield_elements)
  include_indexes = kwargs.pop('include_indexes', False)
  assert not kwargs  # TODO(BEAM-7372): Use PEP 3102
  if label is None:
    # Attempt to come up with a reasonable, stable label by retrieving the name
    # of these variables in the calling context.
    current_frame = inspect.currentframe()
    if current_frame is None:
      label = 'ToDataframe(...)'
    else:
      previous_frame = current_frame.f_back

      def name(obj):
        for key, value in previous_frame.f_locals.items():
          if obj is value:
            return key
        for key, value in previous_frame.f_globals.items():
          if obj is value:
            return key
        return '...'

      label = 'ToDataframe(%s)' % ', '.join(name(e) for e in dataframes)

  def extract_input(placeholder):
    if not isinstance(placeholder._reference, pvalue.PCollection):
      raise TypeError(
          'Expression roots must have been created with to_dataframe.')
    return placeholder._reference

  placeholders = frozenset.union(
      frozenset(), *[df._expr.placeholders() for df in dataframes])
  results = {p: extract_input(p)
             for p in placeholders
             } | label >> transforms._DataframeExpressionsTransform(
                 dict((ix, df._expr) for ix, df in enumerate(
                     dataframes)))  # type: Dict[Any, pvalue.PCollection]

  if yield_elements == "schemas":
    results = {
        key: pc
        | "Unbatch '%s'" % dataframes[key]._expr._id >> schemas.UnbatchPandas(
            dataframes[key]._expr.proxy(), include_indexes=include_indexes)
        for (key, pc) in results.items()
    }

  if len(results) == 1 and not always_return_tuple:
    return results[0]
  else:
    return tuple(value for key, value in sorted(results.items()))
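A round-trip usage sketch for `to_pcollection`. The `beam.Row` elements, the filter, and the field names here are illustrative only; `to_dataframe` and `to_pcollection` are the real conversion entry points in `apache_beam.dataframe.convert`:

import apache_beam as beam
from apache_beam.dataframe.convert import to_dataframe, to_pcollection

with beam.Pipeline() as p:
    # A schema-aware PCollection: beam.Row elements carry field names/types.
    pets = p | beam.Create([
        beam.Row(animal='cat', weight=4.0),
        beam.Row(animal='dog', weight=9.5),
    ])
    df = to_dataframe(pets)      # deferred DataFrame backed by `pets`
    heavy = df[df.weight > 5.0]  # deferred filter; nothing is computed yet
    # Back to an element-wise, schema-aware PCollection. Indexes are
    # excluded by default; pass include_indexes=True to keep named levels.
    result = to_pcollection(heavy)
    result | beam.Map(print)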
Example #9
def maybe_unbatch(pc, value):
    # Deferred scalars are already single elements and pass through as-is;
    # DataFrame and Series results are expanded to one element per row.
    # include_indexes is closed over from the enclosing scope.
    if isinstance(value, frame_base._DeferredScalar):
        return pc
    else:
        return pc | "Unbatch '%s'" % value._expr._id >> schemas.UnbatchPandas(
            value._expr.proxy(), include_indexes=include_indexes)