def test_batch_with_df_transform(self):
    """Schema rows batched into DataFrames survive a DataframeTransform.

    Runs the same grouped-mean aggregation twice: first keeping the group
    key in the index (include_indexes=True), then flattening it back into
    a column with reset_index().
    """
    animals = [
        Animal('Falcon', 380.0),
        Animal('Falcon', 370.0),
        Animal('Parrot', 24.0),
        Animal('Parrot', 26.0),
    ]
    expected = [('Falcon', 375.), ('Parrot', 25.)]

    with TestPipeline() as p:
        result = (
            p
            | beam.Create(animals)
            | schemas.BatchRowsAsDataFrame()
            | transforms.DataframeTransform(
                lambda df: df.groupby('animal').mean(),
                # TODO: Generate proxy in this case as well
                proxy=schemas.generate_proxy(Animal),
                include_indexes=True))
        assert_that(result, equal_to(expected))

    # Do the same thing, but use reset_index() to make sure 'animal' is
    # included as a regular column rather than the index.
    with TestPipeline() as p:
        with beam.dataframe.allow_non_parallel_operations():
            result = (
                p
                | beam.Create(animals)
                | schemas.BatchRowsAsDataFrame()
                | transforms.DataframeTransform(
                    lambda df: df.groupby('animal').mean().reset_index(),
                    # TODO: Generate proxy in this case as well
                    proxy=schemas.generate_proxy(Animal)))
            assert_that(result, equal_to(expected))
def test_generate_proxy(self):
    """generate_proxy(Animal) yields an empty frame with the schema's dtypes."""
    want = pd.DataFrame({
        'animal': pd.Series(dtype=pd.StringDtype()),
        'max_speed': pd.Series(dtype=np.float64),
    })
    got = schemas.generate_proxy(Animal)
    self.assertTrue(got.equals(want))
def to_dataframe(
        pcoll,  # type: pvalue.PCollection
        proxy=None,  # type: Optional[pandas.core.generic.NDFrame]
        label=None,  # type: Optional[str]
):
    # type: (...) -> frame_base.DeferredFrame
    """Converts a PCollection to a deferred dataframe-like object, which can
    be manipulated with pandas methods like `filter` and `groupby`.

    For example, one might write::

      pcoll = ...
      df = to_dataframe(pcoll, proxy=...)
      result = df.groupby('col').sum()
      pcoll_result = to_pcollection(result)

    A proxy object must be given if the schema for the PCollection is not known.
    """
    if proxy is not None:
        # Caller supplied the proxy; the PCollection is assumed to already
        # contain dataframe batches.
        return frame_base.DeferredFrame.wrap(
            expressions.PlaceholderExpression(proxy, pcoll))

    if pcoll.element_type is None:
        raise ValueError(
            "Cannot infer a proxy because the input PCollection does not have a "
            "schema defined. Please make sure a schema type is specified for "
            "the input PCollection, or provide a proxy.")
    # No proxy given: treat the input as an element-wise schema-aware
    # PCollection that must first be batched into DataFrames.
    if label is None:
        # Derive a reasonable, stable label from the name of the PCollection
        # variable in the calling context.
        label = 'BatchElements(%s)' % _var_name(pcoll, 2)
    proxy = schemas.generate_proxy(pcoll.element_type)
    batched = pcoll | label >> schemas.BatchRowsAsDataFrame(proxy=proxy)
    return frame_base.DeferredFrame.wrap(
        expressions.PlaceholderExpression(proxy, batched))
def test_generate_proxy(self):
    """generate_proxy(Animal) yields an empty frame with the schema's dtypes.

    Fix: the original used ``dtype=unicode``, a Python 2-only builtin that
    raises NameError on Python 3. ``str`` produces the same resulting
    object dtype for an empty Series.
    """
    expected = pd.DataFrame({
        'animal': pd.Series(dtype=str),
        'max_speed': pd.Series(dtype=float),
    })
    self.assertTrue(schemas.generate_proxy(Animal).equals(expected))
def to_dataframe(
        pcoll,  # type: pvalue.PCollection
        proxy=None,  # type: pandas.core.generic.NDFrame
):
    # type: (...) -> frame_base.DeferredFrame
    """Converts a PCollection to a deferred dataframe-like object, which can
    be manipulated with pandas methods like `filter` and `groupby`.

    For example, one might write::

      pcoll = ...
      df = to_dataframe(pcoll, proxy=...)
      result = df.groupby('col').sum()
      pcoll_result = to_pcollection(result)

    A proxy object must be given if the schema for the PCollection is not known.
    """
    if proxy is not None:
        # Caller supplied the proxy; the PCollection is assumed to already
        # contain dataframe batches.
        return frame_base.DeferredFrame.wrap(
            expressions.PlaceholderExpression(proxy, pcoll))

    if pcoll.element_type is None:
        raise ValueError(
            "Cannot infer a proxy because the input PCollection does not have a "
            "schema defined. Please make sure a schema type is specified for "
            "the input PCollection, or provide a proxy.")
    # No proxy given: treat the input as an element-wise schema-aware
    # PCollection that must first be batched into DataFrames.
    proxy = schemas.generate_proxy(pcoll.element_type)
    batched = pcoll | 'BatchElements' >> schemas.BatchRowsAsDataFrame()
    return frame_base.DeferredFrame.wrap(
        expressions.PlaceholderExpression(proxy, batched))
def test_bytes_proxy_roundtrip(self):
    """A bytes column keeps a bytes ('S') dtype through a proxy round trip."""
    original = pd.DataFrame({'bytes': []})
    original['bytes'] = original['bytes'].astype(bytes)
    element_type = schemas.element_type_from_dataframe(original)
    roundtripped = schemas.generate_proxy(element_type)
    self.assertEqual(roundtripped.bytes.dtype.kind, 'S')
def test_batch_with_df_transform(self):
    """Batched schema rows through DataframeTransform produce the grouped
    means, with the group key preserved as the frame's index."""
    animals = [
        Animal('Falcon', 380.0),
        Animal('Falcon', 370.0),
        Animal('Parrot', 24.0),
        Animal('Parrot', 26.0),
    ]
    with TestPipeline() as p:
        result = (
            p
            | beam.Create(animals)
            | schemas.BatchRowsAsDataFrame()
            | transforms.DataframeTransform(
                lambda df: df.groupby('animal').mean(),
                # TODO: Generate proxy in this case as well
                proxy=schemas.generate_proxy(Animal)))
        assert_that(
            result,
            matches_df(
                pd.DataFrame({'max_speed': [375.0, 25.0]},
                             index=pd.Index(data=['Falcon', 'Parrot'],
                                            name='animal'))))
def test_nice_types_proxy_roundtrip(self):
    """element_type_from_dataframe inverts generate_proxy for nice dtypes."""
    element_type = schemas.element_type_from_dataframe(NICE_TYPES_PROXY)
    roundtripped = schemas.generate_proxy(element_type)
    self.assertTrue(roundtripped.equals(NICE_TYPES_PROXY))