Example #1
    def test_batch_with_df_transform(self):
        with TestPipeline() as p:
            res = (
                p
                | beam.Create([
                    Animal('Falcon', 380.0),
                    Animal('Falcon', 370.0),
                    Animal('Parrot', 24.0),
                    Animal('Parrot', 26.0)
                ])
                | schemas.BatchRowsAsDataFrame()
                | transforms.DataframeTransform(
                    lambda df: df.groupby('animal').mean(),
                    # TODO: Generate proxy in this case as well
                    proxy=schemas.generate_proxy(Animal),
                    include_indexes=True))
            assert_that(res, equal_to([('Falcon', 375.), ('Parrot', 25.)]))

        # Do the same thing, but use reset_index() to make sure 'animal' is included
        with TestPipeline() as p:
            with beam.dataframe.allow_non_parallel_operations():
                res = (
                    p
                    | beam.Create([
                        Animal('Falcon', 380.0),
                        Animal('Falcon', 370.0),
                        Animal('Parrot', 24.0),
                        Animal('Parrot', 26.0)
                    ])
                    | schemas.BatchRowsAsDataFrame()
                    | transforms.DataframeTransform(
                        lambda df: df.groupby('animal').mean().reset_index(),
                        # TODO: Generate proxy in this case as well
                        proxy=schemas.generate_proxy(Animal)))
                assert_that(res, equal_to([('Falcon', 375.), ('Parrot', 25.)]))
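
The two pipelines in this test differ only in how the grouping key reappears in the output: include_indexes=True asks DataframeTransform to emit the index as a field, while reset_index() turns it back into an ordinary column before the result is converted. A plain-pandas sketch of that distinction, using the same fixture values as the test:

import pandas as pd

df = pd.DataFrame({'animal': ['Falcon', 'Falcon', 'Parrot', 'Parrot'],
                   'max_speed': [380.0, 370.0, 24.0, 26.0]})

grouped = df.groupby('animal').mean()  # 'animal' ends up as the index
flat = grouped.reset_index()           # 'animal' restored as a regular column

print(list(grouped.columns))  # ['max_speed']
print(list(flat.columns))     # ['animal', 'max_speed']
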
def to_dataframe(
    pcoll,  # type: pvalue.PCollection
    proxy=None,  # type: Optional[pandas.core.generic.NDFrame]
    label=None,  # type: Optional[str]
):
  # type: (...) -> frame_base.DeferredFrame

  """Converts a PCollection to a deferred dataframe-like object, which can
  manipulated with pandas methods like `filter` and `groupby`.

  For example, one might write::

    pcoll = ...
    df = to_dataframe(pcoll, proxy=...)
    result = df.groupby('col').sum()
    pcoll_result = to_pcollection(result)

  A proxy object must be given if the schema for the PCollection is not known.
  """
  if proxy is None:
    if pcoll.element_type is None:
      raise ValueError(
          "Cannot infer a proxy because the input PCollection does not have a "
          "schema defined. Please make sure a schema type is specified for "
          "the input PCollection, or provide a proxy.")
    # If no proxy is given, assume this is an element-wise schema-aware
    # PCollection that needs to be batched.
    if label is None:
      # Attempt to come up with a reasonable, stable label by retrieving
      # the name of these variables in the calling context.
      label = 'BatchElements(%s)' % _var_name(pcoll, 2)
    proxy = schemas.generate_proxy(pcoll.element_type)
    pcoll = pcoll | label >> schemas.BatchRowsAsDataFrame(proxy=proxy)
  return frame_base.DeferredFrame.wrap(
      expressions.PlaceholderExpression(proxy, pcoll))
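
As the docstring example suggests, `to_dataframe` is typically paired with `to_pcollection` from the same `apache_beam.dataframe.convert` module. A minimal end-to-end sketch, assuming a schema-aware PCollection built from `beam.Row` elements so that no explicit proxy is needed:

import apache_beam as beam
from apache_beam.dataframe.convert import to_dataframe, to_pcollection

with beam.Pipeline() as p:
    # beam.Row elements give the PCollection a schema, so to_dataframe can
    # infer the proxy instead of requiring one to be passed in.
    pcoll = p | beam.Create([
        beam.Row(col='a', value=1),
        beam.Row(col='a', value=2),
        beam.Row(col='b', value=3),
    ])
    df = to_dataframe(pcoll)
    result = df.groupby('col').sum()
    # Convert the deferred result back to a PCollection and print its rows.
    _ = to_pcollection(result) | beam.Map(print)
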
Example #3
def to_dataframe(
        pcoll,  # type: pvalue.PCollection
        proxy=None,  # type: pandas.core.generic.NDFrame
):
    # type: (...) -> frame_base.DeferredFrame
    """Convers a PCollection to a deferred dataframe-like object, which can
  manipulated with pandas methods like `filter` and `groupby`.

  For example, one might write::

    pcoll = ...
    df = to_dataframe(pcoll, proxy=...)
    result = df.groupby('col').sum()
    pcoll_result = to_pcollection(result)

  A proxy object must be given if the schema for the PCollection is not known.
  """
    if proxy is None:
        if pcoll.element_type is None:
            raise ValueError(
                "Cannot infer a proxy because the input PCollection does not have a "
                "schema defined. Please make sure a schema type is specified for "
                "the input PCollection, or provide a proxy.")
        # If no proxy is given, assume this is an element-wise schema-aware
        # PCollection that needs to be batched.
        proxy = schemas.generate_proxy(pcoll.element_type)
        pcoll = pcoll | 'BatchElements' >> schemas.BatchRowsAsDataFrame()
    return frame_base.DeferredFrame.wrap(
        expressions.PlaceholderExpression(proxy, pcoll))
Example #4
  def test_simple_df(self):
    expected = pd.DataFrame(
        {
            'name': [str(i) for i in range(5)],
            'id': list(range(5)),
            'height': [float(i) for i in range(5)]
        },
        columns=['name', 'id', 'height'])

    with TestPipeline() as p:
      res = (
          p
          | beam.Create([
              Simple(name=str(i), id=i, height=float(i)) for i in range(5)
          ])
          | schemas.BatchRowsAsDataFrame(min_batch_size=10, max_batch_size=10))
      assert_that(res, matches_df(expected))
Example #5
    def test_simple_df_with_beam_row(self):
        expected = pd.DataFrame(
            {
                'name': list(str(i) for i in range(5)),
                'id': list(range(5)),
                'height': list(float(i) for i in range(5))
            },
            columns=['name', 'id', 'height'])

        with TestPipeline() as p:
            res = (p
                   | beam.Create([(str(i), i, float(i)) for i in range(5)])
                   | beam.Select(name=lambda r: str(r[0]),
                                 id=lambda r: int(r[1]),
                                 height=lambda r: float(r[2]))
                   | schemas.BatchRowsAsDataFrame(min_batch_size=10,
                                                  max_batch_size=10))
            assert_that(res, matches_df(expected))
Example #6
    def test_batch_with_df_transform(self):
        with TestPipeline() as p:
            res = (
                p
                | beam.Create([
                    Animal('Falcon', 380.0),
                    Animal('Falcon', 370.0),
                    Animal('Parrot', 24.0),
                    Animal('Parrot', 26.0)
                ])
                | schemas.BatchRowsAsDataFrame()
                | transforms.DataframeTransform(
                    lambda df: df.groupby('animal').mean(),
                    # TODO: Generate proxy in this case as well
                    proxy=schemas.generate_proxy(Animal)))
            assert_that(
                res,
                matches_df(
                    pd.DataFrame({'max_speed': [375.0, 25.0]},
                                 index=pd.Index(data=['Falcon', 'Parrot'],
                                                name='animal'))))
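
The Animal-based tests above never show how Animal is defined. A hedged sketch of what it presumably looks like (a NamedTuple with the two fields the tests reference) and of what schemas.generate_proxy produces from it:

import typing

from apache_beam.dataframe import schemas

# Assumed definition; the real test module defines its own Animal type with
# these two fields ('animal' and 'max_speed').
Animal = typing.NamedTuple('Animal', [('animal', str), ('max_speed', float)])

proxy = schemas.generate_proxy(Animal)
print(list(proxy.columns))  # ['animal', 'max_speed'] -- an empty DataFrame
print(proxy.dtypes)         # dtypes derived from the schema field types
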