Exemple #1
0
    def test_batch_with_df_transform(self):
        """Run a groupby/mean DataframeTransform over batched Animal rows.

        Two pipelines are exercised with the same input and the same expected
        output: the first passes ``include_indexes=True``, the second calls
        ``reset_index()`` so that 'animal' ends up as a regular column.
        """
        expected_rows = (('Falcon', 375.), ('Parrot', 25.))

        def make_animals():
            # Build a fresh input list for every pipeline.
            return [
                Animal('Falcon', 380.0),
                Animal('Falcon', 370.0),
                Animal('Parrot', 24.0),
                Animal('Parrot', 26.0),
            ]

        # Variant 1: the group key stays in the index.
        with TestPipeline() as p:
            rows = p | beam.Create(make_animals())
            batches = rows | schemas.BatchRowsAsDataFrame()
            # TODO: Generate proxy in this case as well
            res = batches | transforms.DataframeTransform(
                lambda df: df.groupby('animal').mean(),
                proxy=schemas.generate_proxy(Animal),
                include_indexes=True)
            assert_that(res, equal_to(list(expected_rows)))

        # Variant 2: same aggregation, but reset_index() makes sure 'animal'
        # is included.
        with TestPipeline() as p:
            with beam.dataframe.allow_non_parallel_operations():
                rows = p | beam.Create(make_animals())
                batches = rows | schemas.BatchRowsAsDataFrame()
                # TODO: Generate proxy in this case as well
                res = batches | transforms.DataframeTransform(
                    lambda df: df.groupby('animal').mean().reset_index(),
                    proxy=schemas.generate_proxy(Animal))
                assert_that(res, equal_to(list(expected_rows)))
Exemple #2
0
    def test_generate_proxy(self):
        """Check generate_proxy(Animal) against an empty, typed DataFrame."""
        # Empty columns that only carry the dtypes the proxy should have.
        animal_column = pd.Series(dtype=pd.StringDtype())
        max_speed_column = pd.Series(dtype=np.float64)
        expected = pd.DataFrame({
            'animal': animal_column,
            'max_speed': max_speed_column,
        })

        actual = schemas.generate_proxy(Animal)

        self.assertTrue(actual.equals(expected))
def to_dataframe(
    pcoll,  # type: pvalue.PCollection
    proxy=None,  # type: Optional[pandas.core.generic.NDFrame]
    label=None,  # type: Optional[str]
):
  # type: (...) -> frame_base.DeferredFrame

  """Converts a PCollection to a deferred dataframe-like object, which can
  be manipulated with pandas methods like `filter` and `groupby`.

  For example, one might write::

    pcoll = ...
    df = to_dataframe(pcoll, proxy=...)
    result = df.groupby('col').sum()
    pcoll_result = to_pcollection(result)

  A proxy object must be given if the schema for the PCollection is not known.
  When no proxy is given, one is generated from ``pcoll.element_type`` and the
  PCollection is first passed through ``schemas.BatchRowsAsDataFrame`` under
  ``label`` (a label derived from the caller's variable name when omitted).

  Raises:
    ValueError: if no proxy is given and ``pcoll.element_type`` is None.
  """
  if proxy is not None:
    # The caller supplied the proxy: wrap the PCollection unchanged.
    return frame_base.DeferredFrame.wrap(
        expressions.PlaceholderExpression(proxy, pcoll))

  element_type = pcoll.element_type
  if element_type is None:
    raise ValueError(
        "Cannot infer a proxy because the input PCollection does not have a "
        "schema defined. Please make sure a schema type is specified for "
        "the input PCollection, or provide a proxy.")

  # No proxy was given, so assume this is an element-wise schema-aware
  # PCollection that needs to be batched.
  if label is None:
    # Attempt to come up with a reasonable, stable label by retrieving
    # the name of these variables in the calling context.
    label = 'BatchElements(%s)' % _var_name(pcoll, 2)
  inferred_proxy = schemas.generate_proxy(element_type)
  batched = pcoll | label >> schemas.BatchRowsAsDataFrame(proxy=inferred_proxy)
  return frame_base.DeferredFrame.wrap(
      expressions.PlaceholderExpression(inferred_proxy, batched))
    def test_generate_proxy(self):
        """Check generate_proxy(Animal) against an empty, typed DataFrame."""
        # Empty columns that only carry the dtypes the proxy should have.
        animal_column = pd.Series(dtype=unicode)
        max_speed_column = pd.Series(dtype=float)
        expected = pd.DataFrame({
            'animal': animal_column,
            'max_speed': max_speed_column,
        })

        actual = schemas.generate_proxy(Animal)

        self.assertTrue(actual.equals(expected))
Exemple #5
0
def to_dataframe(
        pcoll,  # type: pvalue.PCollection
        proxy=None,  # type: pandas.core.generic.NDFrame
        label=None,  # type: str
):
    # type: (...) -> frame_base.DeferredFrame
    """Converts a PCollection to a deferred dataframe-like object, which can
    be manipulated with pandas methods like `filter` and `groupby`.

    For example, one might write::

      pcoll = ...
      df = to_dataframe(pcoll, proxy=...)
      result = df.groupby('col').sum()
      pcoll_result = to_pcollection(result)

    A proxy object must be given if the schema for the PCollection is not
    known.

    Args:
      pcoll: the PCollection to convert.
      proxy: optional proxy object describing the frame. When omitted, it is
        generated from ``pcoll.element_type`` and the PCollection is batched
        with ``schemas.BatchRowsAsDataFrame``.
      label: optional label for the batching step applied when no proxy is
        given. Defaults to ``'BatchElements'``. Pass distinct labels when
        converting more than one PCollection in the same pipeline, since
        every call would otherwise apply the batching step under the same
        label.

    Returns:
      The result of ``frame_base.DeferredFrame.wrap`` on a placeholder
      expression built from the proxy and the (possibly batched) PCollection.

    Raises:
      ValueError: if no proxy is given and ``pcoll.element_type`` is None.
    """
    if proxy is None:
        if pcoll.element_type is None:
            raise ValueError(
                "Cannot infer a proxy because the input PCollection does not have a "
                "schema defined. Please make sure a schema type is specified for "
                "the input PCollection, or provide a proxy.")
        # If no proxy is given, assume this is an element-wise schema-aware
        # PCollection that needs to be batched.
        if label is None:
            # Keep the historical label so existing pipelines are unchanged.
            label = 'BatchElements'
        proxy = schemas.generate_proxy(pcoll.element_type)
        pcoll = pcoll | label >> schemas.BatchRowsAsDataFrame()
    return frame_base.DeferredFrame.wrap(
        expressions.PlaceholderExpression(proxy, pcoll))
Exemple #6
0
    def test_bytes_proxy_roundtrip(self):
        """Round-trip a proxy with a bytes-typed column through a schema.

        The proxy is turned into an element type and back into a proxy; the
        resulting 'bytes' column must report dtype kind 'S'.
        """
        proxy = pd.DataFrame({'bytes': []})
        proxy['bytes'] = proxy['bytes'].astype(bytes)

        element_type = schemas.element_type_from_dataframe(proxy)
        roundtripped = schemas.generate_proxy(element_type)

        self.assertEqual(roundtripped.bytes.dtype.kind, 'S')
 def test_batch_with_df_transform(self):
     """Run a groupby/mean DataframeTransform over batched Animal rows.

     The output is compared, via ``matches_df``, with a frame holding the
     per-animal mean of max_speed and indexed by 'animal'.
     """
     expected_index = pd.Index(data=['Falcon', 'Parrot'], name='animal')
     expected = pd.DataFrame({'max_speed': [375.0, 25.0]},
                             index=expected_index)

     with TestPipeline() as p:
         rows = p | beam.Create([
             Animal('Falcon', 380.0),
             Animal('Falcon', 370.0),
             Animal('Parrot', 24.0),
             Animal('Parrot', 26.0),
         ])
         batches = rows | schemas.BatchRowsAsDataFrame()
         # TODO: Generate proxy in this case as well
         res = batches | transforms.DataframeTransform(
             lambda df: df.groupby('animal').mean(),
             proxy=schemas.generate_proxy(Animal))
         assert_that(res, matches_df(expected))
Exemple #8
0
 def test_nice_types_proxy_roundtrip(self):
     """NICE_TYPES_PROXY must be unchanged by a proxy -> schema -> proxy trip."""
     element_type = schemas.element_type_from_dataframe(NICE_TYPES_PROXY)
     roundtripped = schemas.generate_proxy(element_type)

     self.assertTrue(roundtripped.equals(NICE_TYPES_PROXY))