def test_unbatch_with_index(self, df_or_series, rows):
  """Unbatching with include_indexes=True yields the expected rows."""
  proxy = df_or_series[:0]
  with TestPipeline() as p:
    # Feed the input in two interleaved batches to exercise re-assembly.
    batches = p | beam.Create([df_or_series[::2], df_or_series[1::2]])
    unbatched = batches | schemas.UnbatchPandas(proxy, include_indexes=True)
    assert_that(unbatched, equal_to(rows))
def test_unbatch_include_index_unnamed_index_raises(self):
  """include_indexes=True must reject a DataFrame whose index is unnamed."""
  frame = pd.DataFrame({'foo': [1, 2, 3, 4]})
  proxy = frame[:0]
  with TestPipeline() as p:
    batches = p | beam.Create([frame[::2], frame[1::2]])
    with self.assertRaisesRegex(ValueError, 'unnamed'):
      _ = batches | schemas.UnbatchPandas(proxy, include_indexes=True)
def test_unbatch_include_index_column_conflict_raises(self):
  """include_indexes=True must reject an index name that shadows a column."""
  frame = pd.DataFrame({'foo': [1, 2, 3, 4]})
  # Name the index 'foo' so it collides with the 'foo' column.
  frame.index = pd.Index([4, 3, 2, 1], name='foo')
  proxy = frame[:0]
  with TestPipeline() as p:
    batches = p | beam.Create([frame[::2], frame[1::2]])
    with self.assertRaisesRegex(ValueError, 'foo'):
      _ = batches | schemas.UnbatchPandas(proxy, include_indexes=True)
def test_unbatch_include_index_nonunique_index_raises(self):
  """include_indexes=True must reject duplicate index level names."""
  frame = pd.DataFrame({'foo': [1, 2, 3, 4]})
  # Both MultiIndex levels are named 'bar', which is not allowed.
  frame.index = pd.MultiIndex.from_arrays(
      [[1, 2, 3, 4], [4, 3, 2, 1]], names=['bar', 'bar'])
  proxy = frame[:0]
  with TestPipeline() as p:
    batches = p | beam.Create([frame[::2], frame[1::2]])
    with self.assertRaisesRegex(ValueError, 'bar'):
      _ = batches | schemas.UnbatchPandas(proxy, include_indexes=True)
def test_unbatch_no_index(self, df_or_series, rows, beam_type):
  """Unbatching without indexes yields rows with the expected typehint."""
  proxy = df_or_series[:0]
  with TestPipeline() as p:
    batches = p | beam.Create([df_or_series[::2], df_or_series[1::2]])
    unbatched = batches | schemas.UnbatchPandas(proxy)
    # Verify that the unbatched PCollection has the expected typehint
    # TODO(BEAM-8538): typehints should support NamedTuple so we can use
    # typehints.is_consistent_with here instead
    self.assert_typehints_equal(unbatched.element_type, beam_type)
    assert_that(unbatched, equal_to(rows))
def _make_unbatched_pcoll(
    pc: pvalue.PCollection,
    expr: expressions.Expression,
    include_indexes: bool):
  """Unbatches ``pc``, memoizing the result per expression and flag.

  The cache key is the transform label, which encodes both the expression
  id and the include_indexes flag, so the same expression unbatched with
  different parameters gets distinct entries.
  """
  label = f"Unbatch '{expr._id}'"
  if include_indexes:
    label += " with indexes"
  # Return the memoized result on a cache hit; only apply the transform
  # (which mutates the pipeline) on a miss.
  if label in UNBATCHED_CACHE:
    return UNBATCHED_CACHE[label]
  unbatched = pc | label >> schemas.UnbatchPandas(
      expr.proxy(), include_indexes=include_indexes)
  UNBATCHED_CACHE[label] = unbatched
  return unbatched
def test_unbatch_datetime(self):
  """Timezone-aware datetime values survive a round trip through unbatch."""
  series = pd.Series(
      pd.date_range(
          '1/1/2000', periods=100, freq='m', tz='America/Los_Angeles'))
  proxy = series[:0]
  with TestPipeline() as p:
    batches = p | beam.Create([series[::2], series[1::2]])
    unbatched = batches | schemas.UnbatchPandas(proxy, include_indexes=True)
    assert_that(unbatched, equal_to(list(series)))
def to_pcollection(
    *dataframes,  # type: frame_base.DeferredFrame
    **kwargs):
  # type: (...) -> Union[pvalue.PCollection, Tuple[pvalue.PCollection, ...]]
  """Converts one or more deferred dataframe-like objects back to a PCollection.

  This method creates and applies the actual Beam operations that compute
  the given deferred dataframes, returning a PCollection of their results. By
  default the resulting PCollections are schema-aware PCollections where each
  element is one row from the output dataframes, excluding indexes. This
  behavior can be modified with the `yield_elements` and `include_indexes`
  arguments.

  If more than one (related) result is desired, it can be more efficient to
  pass them all at the same time to this method.

  Args:
    always_return_tuple: (optional, default: False) If true, always return
        a tuple of PCollections, even if there's only one output.
    yield_elements: (optional, default: "schemas") If set to "pandas", return
        PCollections containing the raw Pandas objects (DataFrames or Series),
        if set to "schemas", return an element-wise PCollection, where DataFrame
        and Series instances are expanded to one element per row. DataFrames are
        converted to schema-aware PCollections, where column values can be
        accessed by attribute.
    include_indexes: (optional, default: False) When yield_elements="schemas",
        if include_indexes=True, attempt to include index columns in the output
        schema for expanded DataFrames. Raises an error if any of the index
        levels are unnamed (name=None), or if any of the names are not unique
        among all column and index names.

  Raises:
    ValueError: If ``yield_elements`` is not "pandas" or "schemas".
    TypeError: If an unrecognized keyword argument is passed, or if any
        expression root was not created with to_dataframe.
  """
  label = kwargs.pop('label', None)
  always_return_tuple = kwargs.pop('always_return_tuple', False)
  yield_elements = kwargs.pop('yield_elements', 'schemas')
  # Idiomatic membership test ("x not in xs" rather than "not x in xs").
  if yield_elements not in ("pandas", "schemas"):
    raise ValueError(
        "Invalid value for yield_elements argument, '%s'. "
        "Allowed values are 'pandas' and 'schemas'" % yield_elements)
  include_indexes = kwargs.pop('include_indexes', False)
  # TODO(BEAM-7372): Use PEP 3102 keyword-only arguments.
  # Raise instead of assert: asserts are stripped under -O, and bad keyword
  # arguments are caller input, not an internal invariant.
  if kwargs:
    raise TypeError('Unexpected keyword arguments: %s' % sorted(kwargs))

  if label is None:
    # Attempt to come up with a reasonable, stable label by retrieving the name
    # of these variables in the calling context.
    current_frame = inspect.currentframe()
    if current_frame is None:
      label = 'ToDataframe(...)'
    else:
      previous_frame = current_frame.f_back

      def name(obj):
        # Search the caller's locals, then globals, for a variable bound to
        # this exact object; fall back to an ellipsis placeholder.
        for key, value in previous_frame.f_locals.items():
          if obj is value:
            return key
        for key, value in previous_frame.f_globals.items():
          if obj is value:
            return key
        return '...'

      label = 'ToDataframe(%s)' % ', '.join(name(e) for e in dataframes)

  def extract_input(placeholder):
    # Each placeholder must be rooted at a PCollection produced by
    # to_dataframe; anything else cannot be evaluated here.
    if not isinstance(placeholder._reference, pvalue.PCollection):
      raise TypeError(
          'Expression roots must have been created with to_dataframe.')
    return placeholder._reference

  placeholders = frozenset.union(
      frozenset(), *[df._expr.placeholders() for df in dataframes])
  results = {p: extract_input(p)
             for p in placeholders
             } | label >> transforms._DataframeExpressionsTransform(
                 {ix: df._expr
                  for (ix, df) in enumerate(dataframes)
                  })  # type: Dict[Any, pvalue.PCollection]
  if yield_elements == "schemas":
    results = {
        key: pc | "Unbatch '%s'" % dataframes[key]._expr._id >>
        schemas.UnbatchPandas(
            dataframes[key]._expr.proxy(), include_indexes=include_indexes)
        for (key, pc) in results.items()
    }
  if len(results) == 1 and not always_return_tuple:
    return results[0]
  else:
    return tuple(value for key, value in sorted(results.items()))
def maybe_unbatch(pc, value):
  """Unbatches ``pc`` unless ``value`` is a deferred scalar.

  NOTE(review): reads ``include_indexes`` from an enclosing scope not
  visible in this chunk — presumably defined by the surrounding function.
  """
  # Scalars are passed through untouched; only frame-like values are
  # expanded into element-wise PCollections.
  if not isinstance(value, frame_base._DeferredScalar):
    pc = pc | "Unbatch '%s'" % value._expr._id >> schemas.UnbatchPandas(
        value._expr.proxy(), include_indexes=include_indexes)
  return pc