def compute_using_beam(self, to_compute):
    """Evaluate deferred frames by running them through a Beam pipeline.

    Every input registered in ``self._env._inputs`` is fed to the pipeline as
    a two-element PCollection made of its ``[::2]`` and ``[1::2]`` slices.
    The expressions behind ``to_compute`` are applied with
    ``_DataframeExpressionsTransform`` and each output element is handed to
    an in-memory recorder.

    Args:
      to_compute: mapping of result name to an object exposing ``_expr``.

    Returns:
      A dict mapping each name in ``to_compute`` to what was recorded for it.
      When more than one piece was recorded the pieces are joined with
      ``pd.concat``; otherwise the first recorded piece is returned as is.
    """
    def join_parts(parts):
        # Only call pd.concat when there is actually more than one piece.
        return pd.concat(parts) if len(parts) > 1 else parts[0]

    with _InMemoryResultRecorder() as recorder:
        with beam.Pipeline() as p:
            input_pcolls = {}
            for placeholder, data in self._env._inputs.items():
                create = 'Create%s' % placeholder >> beam.Create(
                    [data[::2], data[1::2]])
                input_pcolls[placeholder] = p | create

            expressions_by_name = {
                name: frame._expr for name, frame in to_compute.items()
            }
            output_pcolls = (
                input_pcolls
                | transforms._DataframeExpressionsTransform(
                    expressions_by_name))

            for name, output_pcoll in output_pcolls.items():
                record = 'Record%s' % name >> beam.FlatMap(
                    recorder.record_fn(name))
                _ = output_pcoll | record
        # pipeline runs, side effects recorded

        return {
            name: join_parts(recorder.get_recorded(name))
            for name in to_compute
        }
def to_pcollection(
    *dataframes,  # type: frame_base.DeferredFrame
    **kwargs):
    # type: (...) -> Union[pvalue.PCollection, Tuple[pvalue.PCollection, ...]]

    """Converts one or more deferred dataframe-like objects back to a PCollection.

    This method creates and applies the actual Beam operations that compute
    the given deferred dataframes, returning a PCollection of their results.

    If more than one (related) result is desired, it can be more efficient to
    pass them all at the same time to this method.

    Args:
      *dataframes: the deferred objects to convert; each must expose ``_expr``.
      label: (optional) label for the conversion transform. When omitted, a
        label is derived from the names the arguments are bound to in the
        calling frame.
      always_return_tuple: (optional, default: False) If true, always return
        a tuple of PCollections, even if there's only one output.

    Returns:
      A single PCollection when there is exactly one result and
      ``always_return_tuple`` is false; otherwise a tuple of PCollections
      ordered like ``dataframes``.

    Raises:
      TypeError: if an unknown keyword argument is passed, or if an
        expression root does not reference a PCollection.
    """
    label = kwargs.pop('label', None)
    always_return_tuple = kwargs.pop('always_return_tuple', False)
    if kwargs:
        # Raised explicitly rather than asserted so the check is not stripped
        # under `python -O`, and so the caller learns which name was wrong.
        raise TypeError(
            'to_pcollection() got unexpected keyword argument(s): %s' %
            ', '.join(sorted(kwargs)))
    # TODO(Py3): Use PEP 3102

    if label is None:
        # Attempt to come up with a reasonable, stable label by retrieving the
        # name of these variables in the calling context.
        # NOTE(review): the prefix reads 'ToDataframe' although this converts
        # to a PCollection; left untouched because it is a runtime label.
        current_frame = inspect.currentframe()
        try:
            caller_frame = (
                None if current_frame is None else current_frame.f_back)
            if caller_frame is None:
                label = 'ToDataframe(...)'
            else:
                # Locals are searched before globals.
                scopes = (caller_frame.f_locals, caller_frame.f_globals)

                def name(obj):
                    for scope in scopes:
                        for key, value in scope.items():
                            if obj is value:
                                return key
                    return '...'

                label = 'ToDataframe(%s)' % ', '.join(
                    name(e) for e in dataframes)
        finally:
            # A frame stored in its own locals is a reference cycle; drop it
            # so this call's locals are released deterministically.
            del current_frame

    def extract_input(placeholder):
        if not isinstance(placeholder._reference, pvalue.PCollection):
            raise TypeError(
                'Expression roots must have been created with to_dataframe.')
        return placeholder._reference

    placeholders = frozenset.union(
        frozenset(), *[df._expr.placeholders() for df in dataframes])
    # `>>` binds tighter than `|`: the labelled transform is built first and
    # then applied to the dict of input PCollections.
    results = {p: extract_input(p)
               for p in placeholders
               } | label >> transforms._DataframeExpressionsTransform(
                   {ix: df._expr for ix, df in enumerate(dataframes)}
               )  # type: Dict[Any, pvalue.PCollection]

    if len(results) == 1 and not always_return_tuple:
        return results[0]
    else:
        return tuple(value for key, value in sorted(results.items()))
def to_pcollection(
    *dataframes,  # type: frame_base.DeferredFrame
    **kwargs):
    # type: (...) -> Union[pvalue.PCollection, Tuple[pvalue.PCollection, ...]]

    """Converts one or more deferred dataframe-like objects back to a PCollection.

    This method creates and applies the actual Beam operations that compute
    the given deferred dataframes, returning a PCollection of their results. By
    default the resulting PCollections are schema-aware PCollections where each
    element is one row from the output dataframes, excluding indexes. This
    behavior can be modified with the `yield_elements` and `include_indexes`
    arguments.

    If more than one (related) result is desired, it can be more efficient to
    pass them all at the same time to this method.

    Args:
      label: (optional) label for the conversion transform. Defaults to
        "ToPCollection(...)" built from the argument names returned by
        ``_var_name``.
      always_return_tuple: (optional, default: False) If true, always return
        a tuple of PCollections, even if there's only one output.
      yield_elements: (optional, default: "schemas") If set to "pandas", return
        PCollections containing the raw Pandas objects (DataFrames or Series),
        if set to "schemas", return an element-wise PCollection, where
        DataFrame and Series instances are expanded to one element per row.
        DataFrames are converted to schema-aware PCollections, where column
        values can be accessed by attribute.
      include_indexes: (optional, default: False) When
        yield_elements="schemas", if include_indexes=True, attempt to include
        index columns in the output schema for expanded DataFrames. Raises an
        error if any of the index levels are unnamed (name=None), or if any of
        the names are not unique among all column and index names.

    Returns:
      A single PCollection when there is exactly one result and
      ``always_return_tuple`` is false; otherwise a tuple of PCollections
      ordered like ``dataframes``.

    Raises:
      ValueError: if ``yield_elements`` is not "pandas" or "schemas".
      TypeError: if an unknown keyword argument is passed, or if an
        expression root does not reference a PCollection.
    """
    label = kwargs.pop('label', None)
    always_return_tuple = kwargs.pop('always_return_tuple', False)
    yield_elements = kwargs.pop('yield_elements', 'schemas')
    if yield_elements not in ("pandas", "schemas"):
        # Format a 1-tuple so a tuple-valued argument still produces this
        # ValueError instead of a string-formatting TypeError.
        raise ValueError(
            "Invalid value for yield_elements argument, '%s'. "
            "Allowed values are 'pandas' and 'schemas'" % (yield_elements, ))
    include_indexes = kwargs.pop('include_indexes', False)
    if kwargs:
        # Raised explicitly rather than asserted so the check is not stripped
        # under `python -O`, and so the caller learns which name was wrong.
        raise TypeError(
            'to_pcollection() got unexpected keyword argument(s): %s' %
            ', '.join(sorted(kwargs)))
    # TODO(BEAM-7372): Use PEP 3102

    if label is None:
        # Attempt to come up with a reasonable, stable label by retrieving the
        # name of these variables in the calling context.
        # NOTE(review): the depth argument 3 presumably counts frames from
        # _var_name through this generator expression and this function up to
        # the caller - keep the call inside a generator expression here and
        # confirm against _var_name before restructuring.
        label = 'ToPCollection(%s)' % ', '.join(
            _var_name(e, 3) for e in dataframes)

    def extract_input(placeholder):
        if not isinstance(placeholder._reference, pvalue.PCollection):
            raise TypeError(
                'Expression roots must have been created with to_dataframe.')
        return placeholder._reference

    placeholders = frozenset.union(
        frozenset(), *[df._expr.placeholders() for df in dataframes])

    # Exclude any dataframes that have already been converted to PCollections.
    # We only want to convert each DF expression once, then re-use.
    new_dataframes = [
        df for df in dataframes if df._expr._id not in TO_PCOLLECTION_CACHE
    ]
    if new_dataframes:
        # `>>` binds tighter than `|`: the labelled transform is built first
        # and then applied to the dict of input PCollections.
        new_results = {p: extract_input(p)
                       for p in placeholders
                       } | label >> transforms._DataframeExpressionsTransform({
                           ix: df._expr
                           for (ix, df) in enumerate(new_dataframes)
                       })  # type: Dict[Any, pvalue.PCollection]

        # Result keys are positions in new_dataframes; the cache is keyed by
        # expression id.
        TO_PCOLLECTION_CACHE.update({
            new_dataframes[ix]._expr._id: pc
            for ix, pc in new_results.items()
        })

    # Every requested expression is in the cache at this point, whether it
    # was just computed or converted by an earlier call.
    raw_results = {
        ix: TO_PCOLLECTION_CACHE[df._expr._id]
        for ix, df in enumerate(dataframes)
    }

    if yield_elements == "schemas":

        def maybe_unbatch(pc, value):
            # Deferred scalars are passed through without unbatching.
            if isinstance(value, frame_base._DeferredScalar):
                return pc
            else:
                return _make_unbatched_pcoll(pc, value._expr, include_indexes)

        results = {
            ix: maybe_unbatch(pc, dataframes[ix])
            for (ix, pc) in raw_results.items()
        }
    else:
        results = raw_results

    if len(results) == 1 and not always_return_tuple:
        return results[0]
    else:
        return tuple(value for key, value in sorted(results.items()))
def to_pcollection(
    *dataframes,  # type: frame_base.DeferredFrame
    **kwargs):
    # type: (...) -> Union[pvalue.PCollection, Tuple[pvalue.PCollection, ...]]

    """Converts one or more deferred dataframe-like objects back to a PCollection.

    This method creates and applies the actual Beam operations that compute
    the given deferred dataframes, returning a PCollection of their results. By
    default the resulting PCollections are schema-aware PCollections where each
    element is one row from the output dataframes, excluding indexes. This
    behavior can be modified with the `yield_elements` and `include_indexes`
    arguments.

    If more than one (related) result is desired, it can be more efficient to
    pass them all at the same time to this method.

    Args:
      label: (optional) label for the conversion transform. When omitted, a
        label is derived from the names the arguments are bound to in the
        calling frame.
      always_return_tuple: (optional, default: False) If true, always return
        a tuple of PCollections, even if there's only one output.
      yield_elements: (optional, default: "schemas") If set to "pandas", return
        PCollections containing the raw Pandas objects (DataFrames or Series),
        if set to "schemas", return an element-wise PCollection, where
        DataFrame and Series instances are expanded to one element per row.
        DataFrames are converted to schema-aware PCollections, where column
        values can be accessed by attribute.
      include_indexes: (optional, default: False) When
        yield_elements="schemas", if include_indexes=True, attempt to include
        index columns in the output schema for expanded DataFrames. Raises an
        error if any of the index levels are unnamed (name=None), or if any of
        the names are not unique among all column and index names.

    Returns:
      A single PCollection when there is exactly one result and
      ``always_return_tuple`` is false; otherwise a tuple of PCollections
      ordered like ``dataframes``.

    Raises:
      ValueError: if ``yield_elements`` is not "pandas" or "schemas".
      TypeError: if an unknown keyword argument is passed, or if an
        expression root does not reference a PCollection.
    """
    label = kwargs.pop('label', None)
    always_return_tuple = kwargs.pop('always_return_tuple', False)
    yield_elements = kwargs.pop('yield_elements', 'schemas')
    if yield_elements not in ("pandas", "schemas"):
        # Format a 1-tuple so a tuple-valued argument still produces this
        # ValueError instead of a string-formatting TypeError.
        raise ValueError(
            "Invalid value for yield_elements argument, '%s'. "
            "Allowed values are 'pandas' and 'schemas'" % (yield_elements, ))
    include_indexes = kwargs.pop('include_indexes', False)
    if kwargs:
        # Raised explicitly rather than asserted so the check is not stripped
        # under `python -O`, and so the caller learns which name was wrong.
        raise TypeError(
            'to_pcollection() got unexpected keyword argument(s): %s' %
            ', '.join(sorted(kwargs)))
    # TODO(BEAM-7372): Use PEP 3102

    if label is None:
        # Attempt to come up with a reasonable, stable label by retrieving the
        # name of these variables in the calling context.
        # NOTE(review): the prefix reads 'ToDataframe' although this converts
        # to a PCollection; left untouched because it is a runtime label.
        current_frame = inspect.currentframe()
        try:
            caller_frame = (
                None if current_frame is None else current_frame.f_back)
            if caller_frame is None:
                label = 'ToDataframe(...)'
            else:
                # Locals are searched before globals.
                scopes = (caller_frame.f_locals, caller_frame.f_globals)

                def name(obj):
                    for scope in scopes:
                        for key, value in scope.items():
                            if obj is value:
                                return key
                    return '...'

                label = 'ToDataframe(%s)' % ', '.join(
                    name(e) for e in dataframes)
        finally:
            # A frame stored in its own locals is a reference cycle; drop it
            # so this call's locals are released deterministically.
            del current_frame

    def extract_input(placeholder):
        if not isinstance(placeholder._reference, pvalue.PCollection):
            raise TypeError(
                'Expression roots must have been created with to_dataframe.')
        return placeholder._reference

    placeholders = frozenset.union(
        frozenset(), *[df._expr.placeholders() for df in dataframes])
    # `>>` binds tighter than `|`: the labelled transform is built first and
    # then applied to the dict of input PCollections.
    results = {p: extract_input(p)
               for p in placeholders
               } | label >> transforms._DataframeExpressionsTransform(
                   {ix: df._expr for ix, df in enumerate(dataframes)}
               )  # type: Dict[Any, pvalue.PCollection]

    if yield_elements == "schemas":
        # Result keys are positions in `dataframes`, so each key selects the
        # expression whose proxy is handed to UnbatchPandas.
        results = {
            key: pc
            | "Unbatch '%s'" % dataframes[key]._expr._id >>
            schemas.UnbatchPandas(
                dataframes[key]._expr.proxy(),
                include_indexes=include_indexes)
            for (key, pc) in results.items()
        }

    if len(results) == 1 and not always_return_tuple:
        return results[0]
    else:
        return tuple(value for key, value in sorted(results.items()))
def to_pcollection(
    *dataframes,  # type: Union[frame_base.DeferredFrame, pd.DataFrame, pd.Series]
    label=None,
    always_return_tuple=False,
    yield_elements='schemas',
    include_indexes=False,
    pipeline=None
) -> Union[pvalue.PCollection, Tuple[pvalue.PCollection, ...]]:
    """Turn deferred (or plain pandas) dataframe-like objects into PCollections.

    Applies the Beam operations that compute the given deferred dataframes
    and returns PCollections of their results. By default each resulting
    PCollection is schema-aware with one element per output row, excluding
    indexes; `yield_elements` and `include_indexes` adjust that.

    Non-deferred pandas DataFrames and Series are accepted as well. They are
    placed into the pipeline with `beam.Create`, so the whole object is
    serialized into the graph; for large amounts of data it is preferable to
    write them to disk and read them with one of the read methods.

    When several (related) results are wanted, passing them together in one
    call can be more efficient.

    Args:
      label: (optional, default "ToPCollection(...)") the label to use for
        the conversion transform.
      always_return_tuple: (optional, default: False) If true, always return
        a tuple of PCollections, even if there's only one output.
      yield_elements: (optional, default: "schemas") If set to "pandas", return
        PCollections containing the raw Pandas objects (DataFrames or Series),
        if set to "schemas", return an element-wise PCollection, where
        DataFrame and Series instances are expanded to one element per row.
        DataFrames are converted to schema-aware PCollections, where column
        values can be accessed by attribute.
      include_indexes: (optional, default: False) When
        yield_elements="schemas", if include_indexes=True, attempt to include
        index columns in the output schema for expanded DataFrames. Raises an
        error if any of the index levels are unnamed (name=None), or if any of
        the names are not unique among all column and index names.
      pipeline: (optional, unless non-deferred dataframes are passed) Used
        when creating a PCollection from a non-deferred dataframe.

    Returns:
      A single PCollection when there is exactly one result and
      ``always_return_tuple`` is false; otherwise a tuple of PCollections
      ordered like ``dataframes``.

    Raises:
      ValueError: if ``yield_elements`` is invalid, or a non-deferred
        dataframe is passed without ``pipeline``.
      TypeError: if an argument is neither deferred nor a pandas
        DataFrame/Series, or an expression root is not a PCollection.
    """
    if yield_elements not in ("pandas", "schemas"):
        raise ValueError(
            "Invalid value for yield_elements argument, '%s'. "
            "Allowed values are 'pandas' and 'schemas'" % yield_elements)

    if label is None:
        # Attempt to come up with a reasonable, stable label by retrieving the
        # name of these variables in the calling context.
        # NOTE(review): the depth argument 3 presumably counts frames from
        # _var_name through this generator expression and this function up to
        # the caller - the call shape is deliberately left as it was.
        label = 'ToPCollection(%s)' % ', '.join(
            _var_name(e, 3) for e in dataframes)

    def as_deferred(position, frame):
        """Return `frame` unchanged if deferred, else wrap it via `pipeline`."""
        if isinstance(frame, frame_base.DeferredBase):
            # TODO(robertwb): Maybe extract pipeline object?
            return frame
        if not isinstance(frame, (pd.Series, pd.DataFrame)):
            raise TypeError(
                'Unable to convert objects of type %s to a PCollection' %
                type(frame))
        if pipeline is None:
            raise ValueError(
                'Pipeline keyword required for non-deferred dataframe conversion.'
            )
        created = pipeline | '%s_Defer%s' % (label, position) >> beam.Create(
            [frame])
        # The empty slice frame.iloc[:0] is what gets passed alongside the
        # created PCollection to the placeholder expression.
        return frame_base.DeferredFrame.wrap(
            expressions.PlaceholderExpression(frame.iloc[:0], created))

    # Support for non-deferred dataframes.
    deferred_frames = tuple(
        [as_deferred(ix, df) for ix, df in enumerate(dataframes)])

    def root_pcollection(placeholder):
        reference = placeholder._reference
        if isinstance(reference, pvalue.PCollection):
            return reference
        raise TypeError(
            'Expression roots must have been created with to_dataframe.')

    placeholders = frozenset().union(
        *[df._expr.placeholders() for df in deferred_frames])

    # Skip anything already converted by an earlier call: each DF expression
    # is converted once and the resulting PCollection is re-used.
    uncached = [
        df for df in deferred_frames
        if df._expr._id not in TO_PCOLLECTION_CACHE
    ]
    if uncached:
        roots = {p: root_pcollection(p) for p in placeholders}
        expressions_by_position = {
            ix: df._expr for ix, df in enumerate(uncached)
        }
        convert = label >> transforms._DataframeExpressionsTransform(
            expressions_by_position)
        computed = roots | convert  # type: Dict[Any, pvalue.PCollection]

        # Result keys are positions in `uncached`; the cache is keyed by
        # expression id.
        freshly_cached = {
            uncached[ix]._expr._id: pc for ix, pc in computed.items()
        }
        TO_PCOLLECTION_CACHE.update(freshly_cached)

    raw_results = {
        ix: TO_PCOLLECTION_CACHE[df._expr._id]
        for ix, df in enumerate(deferred_frames)
    }

    if yield_elements == "schemas":
        results = {}
        for ix, pc in raw_results.items():
            frame = deferred_frames[ix]
            if isinstance(frame, frame_base._DeferredScalar):
                # Deferred scalars are passed through without unbatching.
                results[ix] = pc
            else:
                results[ix] = _make_unbatched_pcoll(
                    pc, frame._expr, include_indexes)
    else:
        results = raw_results

    if always_return_tuple or len(results) != 1:
        return tuple(results[ix] for ix in sorted(results))
    return results[0]