def test_batch_with_df_transform(self):
  """Verifies a batched DataframeTransform groups rows and emits the key.

  Runs the same groupby/mean twice: once relying on include_indexes to
  surface the 'animal' group key, and once using reset_index() instead.
  """
  animals = [
      Animal('Falcon', 380.0),
      Animal('Falcon', 370.0),
      Animal('Parrot', 24.0),
      Animal('Parrot', 26.0),
  ]

  with TestPipeline() as p:
    result = (
        p
        | beam.Create(animals)
        | schemas.BatchRowsAsDataFrame()
        | transforms.DataframeTransform(
            lambda df: df.groupby('animal').mean(),
            # TODO: Generate proxy in this case as well
            proxy=schemas.generate_proxy(Animal),
            include_indexes=True))
    assert_that(result, equal_to([('Falcon', 375.), ('Parrot', 25.)]))

  # Do the same thing, but use reset_index() to make sure 'animal' is
  # included.
  with TestPipeline() as p:
    with beam.dataframe.allow_non_parallel_operations():
      result = (
          p
          | beam.Create(animals)
          | schemas.BatchRowsAsDataFrame()
          | transforms.DataframeTransform(
              lambda df: df.groupby('animal').mean().reset_index(),
              # TODO: Generate proxy in this case as well
              proxy=schemas.generate_proxy(Animal)))
      assert_that(result, equal_to([('Falcon', 375.), ('Parrot', 25.)]))
def to_dataframe(
    pcoll,  # type: pvalue.PCollection
    proxy=None,  # type: Optional[pandas.core.generic.NDFrame]
    label=None,  # type: Optional[str]
):
  # type: (...) -> frame_base.DeferredFrame

  """Converts a PCollection to a deferred dataframe-like object, which can be
  manipulated with pandas methods like `filter` and `groupby`.

  For example, one might write::

    pcoll = ...
    df = to_dataframe(pcoll, proxy=...)
    result = df.groupby('col').sum()
    pcoll_result = to_pcollection(result)

  A proxy object must be given if the schema for the PCollection is not known.
  """
  # Explicit proxy: wrap the PCollection as-is, no batching required.
  if proxy is not None:
    return frame_base.DeferredFrame.wrap(
        expressions.PlaceholderExpression(proxy, pcoll))

  if pcoll.element_type is None:
    raise ValueError(
        "Cannot infer a proxy because the input PCollection does not have a "
        "schema defined. Please make sure a schema type is specified for "
        "the input PCollection, or provide a proxy.")

  # No proxy was given: assume an element-wise, schema-aware PCollection
  # whose rows must be batched into dataframes first.
  if label is None:
    # Attempt a reasonable, stable label by looking up the variable name
    # bound to `pcoll` in the calling context (depth 2 = our caller's frame;
    # this call must stay directly inside to_dataframe).
    label = 'BatchElements(%s)' % _var_name(pcoll, 2)
  proxy = schemas.generate_proxy(pcoll.element_type)
  batched = pcoll | label >> schemas.BatchRowsAsDataFrame(proxy=proxy)
  return frame_base.DeferredFrame.wrap(
      expressions.PlaceholderExpression(proxy, batched))
def to_dataframe(
    pcoll,  # type: pvalue.PCollection
    proxy=None,  # type: Optional[pandas.core.generic.NDFrame]
):
  # type: (...) -> frame_base.DeferredFrame

  """Converts a PCollection to a deferred dataframe-like object, which can be
  manipulated with pandas methods like `filter` and `groupby`.

  For example, one might write::

    pcoll = ...
    df = to_dataframe(pcoll, proxy=...)
    result = df.groupby('col').sum()
    pcoll_result = to_pcollection(result)

  A proxy object must be given if the schema for the PCollection is not known.

  Args:
    pcoll: The PCollection to convert.
    proxy: An empty pandas object (DataFrame/Series) describing the expected
      shape and dtypes of the data; inferred from the PCollection's schema
      when omitted.

  Raises:
    ValueError: If no proxy is given and the PCollection has no schema from
      which one could be inferred.
  """
  if proxy is None:
    if pcoll.element_type is None:
      raise ValueError(
          "Cannot infer a proxy because the input PCollection does not have a "
          "schema defined. Please make sure a schema type is specified for "
          "the input PCollection, or provide a proxy.")
    # If no proxy is given, assume this is an element-wise schema-aware
    # PCollection that needs to be batched.
    proxy = schemas.generate_proxy(pcoll.element_type)
    pcoll = pcoll | 'BatchElements' >> schemas.BatchRowsAsDataFrame()
  return frame_base.DeferredFrame.wrap(
      expressions.PlaceholderExpression(proxy, pcoll))
def test_simple_df(self):
  """Rows of a schema'd PCollection batch into the expected dataframe.

  Uses str() rather than the Python-2-only unicode() builtin, matching
  test_simple_df_with_beam_row.
  """
  expected = pd.DataFrame({
      'name': list(str(i) for i in range(5)),
      'id': list(range(5)),
      'height': list(float(i) for i in range(5))
  },
                          columns=['name', 'id', 'height'])
  with TestPipeline() as p:
    res = (
        p
        | beam.Create([
            Simple(name=str(i), id=i, height=float(i)) for i in range(5)
        ])
        # min/max batch size of 10 forces all five rows into one dataframe.
        | schemas.BatchRowsAsDataFrame(min_batch_size=10, max_batch_size=10))
    assert_that(res, matches_df(expected))
def test_simple_df_with_beam_row(self):
  """Rows produced via beam.Select batch into the expected dataframe."""
  expected = pd.DataFrame(
      {
          'name': [str(i) for i in range(5)],
          'id': list(range(5)),
          'height': [float(i) for i in range(5)]
      },
      columns=['name', 'id', 'height'])

  with TestPipeline() as p:
    rows = (
        p
        | beam.Create([(str(i), i, float(i)) for i in range(5)])
        | beam.Select(
            name=lambda r: str(r[0]),
            id=lambda r: int(r[1]),
            height=lambda r: float(r[2])))
    # A fixed batch size of 10 collects all five rows into one dataframe.
    batched = rows | schemas.BatchRowsAsDataFrame(
        min_batch_size=10, max_batch_size=10)
    assert_that(batched, matches_df(expected))
def test_batch_with_df_transform(self):
  """A batched DataframeTransform computes per-group means.

  groupby('animal').mean() leaves 'animal' as the frame index, so the
  expected dataframe is built with a named index.
  """
  expected = pd.DataFrame({'max_speed': [375.0, 25.0]},
                          index=pd.Index(data=['Falcon', 'Parrot'],
                                         name='animal'))

  with TestPipeline() as p:
    result = (
        p
        | beam.Create([
            Animal('Falcon', 380.0),
            Animal('Falcon', 370.0),
            Animal('Parrot', 24.0),
            Animal('Parrot', 26.0),
        ])
        | schemas.BatchRowsAsDataFrame()
        | transforms.DataframeTransform(
            lambda df: df.groupby('animal').mean(),
            # TODO: Generate proxy in this case as well
            proxy=schemas.generate_proxy(Animal)))
    assert_that(result, matches_df(expected))