Example #1
    def compute_using_beam(self, to_compute):
        with _InMemoryResultRecorder() as recorder:
            with beam.Pipeline() as p:
                input_pcolls = {
                    placeholder: p
                    | 'Create%s' % placeholder >> beam.Create(
                        [input[::2], input[1::2]])
                    for placeholder, input in self._env._inputs.items()
                }
                output_pcolls = (input_pcolls
                                 | transforms._DataframeExpressionsTransform({
                                     name: frame._expr
                                     for name, frame in to_compute.items()
                                 }))
                for name, output_pcoll in output_pcolls.items():
                    _ = output_pcoll | 'Record%s' % name >> beam.FlatMap(
                        recorder.record_fn(name))
            # pipeline runs, side effects recorded

            def concat(values):
                if len(values) > 1:
                    return pd.concat(values)
                else:
                    return values[0]

            return {
                name: concat(recorder.get_recorded(name))
                for name in to_compute.keys()
            }
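Note that the recorder above is exercised through only two calls: record_fn(name), whose result is applied via FlatMap purely for its side effects, and get_recorded(name). The following is a minimal sketch compatible with those calls; it is reconstructed from the usage above, not from Beam's actual _InMemoryResultRecorder, and assumes a runner that shares memory with the driver (e.g. the in-process direct runner).

import uuid


class _InMemoryResultRecorder(object):
    """Hypothetical sketch; Beam's real class may differ in detail."""

    # Class-level store so results outlive pickled copies of the closure
    # (valid only for in-process runners such as the direct runner).
    _ALL_RESULTS = {}

    def __init__(self):
        self._id = uuid.uuid4().hex

    def __enter__(self):
        self._ALL_RESULTS[self._id] = {}
        return self

    def __exit__(self, *unused_exc_info):
        del self._ALL_RESULTS[self._id]

    def record_fn(self, name):
        def record(value):
            # Append the pandas partition and emit nothing downstream.
            self._ALL_RESULTS[self._id].setdefault(name, []).append(value)
            return []
        return record

    def get_recorded(self, name):
        return self._ALL_RESULTS[self._id].get(name, [])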
Example #2
def to_pcollection(
        *dataframes,  # type: frame_base.DeferredFrame
        **kwargs):
    # type: (...) -> Union[pvalue.PCollection, Tuple[pvalue.PCollection, ...]]
    """Converts one or more deferred dataframe-like objects back to a PCollection.

    This method creates and applies the actual Beam operations that compute
    the given deferred dataframes, returning a PCollection of their results.

    If more than one (related) result is desired, it can be more efficient to
    pass them all at the same time to this method.
    """
    label = kwargs.pop('label', None)
    always_return_tuple = kwargs.pop('always_return_tuple', False)
    assert not kwargs  # TODO(Py3): Use PEP 3102
    if label is None:
        # Attempt to come up with a reasonable, stable label by retrieving the name
        # of these variables in the calling context.
        current_frame = inspect.currentframe()
        if current_frame is None:
            label = 'ToDataframe(...)'

        else:
            previous_frame = current_frame.f_back

            def name(obj):
                for key, value in previous_frame.f_locals.items():
                    if obj is value:
                        return key
                for key, value in previous_frame.f_globals.items():
                    if obj is value:
                        return key
                return '...'

            label = 'ToDataframe(%s)' % ', '.join(name(e) for e in dataframes)

    def extract_input(placeholder):
        if not isinstance(placeholder._reference, pvalue.PCollection):
            raise TypeError(
                'Expression roots must have been created with to_dataframe.')
        return placeholder._reference

    placeholders = frozenset.union(
        frozenset(), *[df._expr.placeholders() for df in dataframes])
    results = {
        p: extract_input(p)
        for p in placeholders
    } | label >> transforms._DataframeExpressionsTransform(
        {ix: df._expr for ix, df in enumerate(dataframes)}
    )  # type: Dict[Any, pvalue.PCollection]
    if len(results) == 1 and not always_return_tuple:
        return results[0]
    else:
        return tuple(value for key, value in sorted(results.items()))
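For context, a typical call site for this version might look as follows. The pipeline, column names, and label are illustrative only; in this early revision the resulting PCollections contain raw pandas objects (partitions of the result) rather than individual rows, since per-row expansion only appears in the later revisions below.

import apache_beam as beam
from apache_beam.dataframe.convert import to_dataframe, to_pcollection

with beam.Pipeline() as p:
    rows = p | beam.Create([
        beam.Row(word='cat', count=1),
        beam.Row(word='dog', count=2),
    ])
    df = to_dataframe(rows)             # deferred frame rooted at `rows`
    counts = df.groupby('word').sum()
    # Passing related frames in one call lets them share computation.
    raw, totals = to_pcollection(df, counts, label='WordCounts')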
Example #3
def to_pcollection(
    *dataframes,  # type: frame_base.DeferredFrame
    **kwargs):
  # type: (...) -> Union[pvalue.PCollection, Tuple[pvalue.PCollection, ...]]

  """Converts one or more deferred dataframe-like objects back to a PCollection.

  This method creates and applies the actual Beam operations that compute
  the given deferred dataframes, returning a PCollection of their results. By
  default the resulting PCollections are schema-aware PCollections where each
  element is one row from the output dataframes, excluding indexes. This
  behavior can be modified with the `yield_elements` and `include_indexes`
  arguments.

  If more than one (related) result is desired, it can be more efficient to
  pass them all at the same time to this method.

  Args:
    always_return_tuple: (optional, default: False) If true, always return
        a tuple of PCollections, even if there's only one output.
    yield_elements: (optional, default: "schemas") If set to "pandas", return
        PCollections containing the raw Pandas objects (DataFrames or Series);
        if set to "schemas", return an element-wise PCollection, where DataFrame
        and Series instances are expanded to one element per row. DataFrames are
        converted to schema-aware PCollections, where column values can be
        accessed by attribute.
    include_indexes: (optional, default: False) When yield_elements="schemas",
        if include_indexes=True, attempt to include index columns in the output
        schema for expanded DataFrames. Raises an error if any of the index
        levels are unnamed (name=None), or if any of the names are not unique
        among all column and index names.
  """
  label = kwargs.pop('label', None)
  always_return_tuple = kwargs.pop('always_return_tuple', False)
  yield_elements = kwargs.pop('yield_elements', 'schemas')
  if yield_elements not in ("pandas", "schemas"):
    raise ValueError(
        "Invalid value for yield_elements argument, '%s'. "
        "Allowed values are 'pandas' and 'schemas'" % yield_elements)
  include_indexes = kwargs.pop('include_indexes', False)
  assert not kwargs  # TODO(BEAM-7372): Use PEP 3102
  if label is None:
    # Attempt to come up with a reasonable, stable label by retrieving the name
    # of these variables in the calling context.
    label = 'ToPCollection(%s)' % ', '.join(_var_name(e, 3) for e in dataframes)

  def extract_input(placeholder):
    if not isinstance(placeholder._reference, pvalue.PCollection):
      raise TypeError(
          'Expression roots must have been created with to_dataframe.')
    return placeholder._reference

  placeholders = frozenset.union(
      frozenset(), *[df._expr.placeholders() for df in dataframes])

  # Exclude any dataframes that have already been converted to PCollections.
  # We only want to convert each DF expression once, then re-use.
  new_dataframes = [
      df for df in dataframes if df._expr._id not in TO_PCOLLECTION_CACHE
  ]
  if new_dataframes:
    new_results = {p: extract_input(p)
                   for p in placeholders
                   } | label >> transforms._DataframeExpressionsTransform({
                       ix: df._expr
                       for (ix, df) in enumerate(new_dataframes)
                   })  # type: Dict[Any, pvalue.PCollection]

    TO_PCOLLECTION_CACHE.update(
        {new_dataframes[ix]._expr._id: pc
         for ix, pc in new_results.items()})

  raw_results = {
      ix: TO_PCOLLECTION_CACHE[df._expr._id]
      for ix, df in enumerate(dataframes)
  }

  if yield_elements == "schemas":

    def maybe_unbatch(pc, value):
      if isinstance(value, frame_base._DeferredScalar):
        return pc
      else:
        return _make_unbatched_pcoll(pc, value._expr, include_indexes)

    results = {
        ix: maybe_unbatch(pc, dataframes[ix])
        for (ix, pc) in raw_results.items()
    }
  else:
    results = raw_results

  if len(results) == 1 and not always_return_tuple:
    return results[0]
  else:
    return tuple(value for key, value in sorted(results.items()))
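This revision is where the yield_elements switch and the TO_PCOLLECTION_CACHE first appear: each expression is converted at most once, and the resulting PCollection is reused on later calls. A rough usage sketch of the two modes, with illustrative names throughout:

import apache_beam as beam
from apache_beam.dataframe.convert import to_dataframe, to_pcollection

with beam.Pipeline() as p:
    df = to_dataframe(
        p | beam.Create([beam.Row(x=1, y=10), beam.Row(x=2, y=20)]))

    # Default 'schemas': one element per row, columns become attributes.
    per_row = to_pcollection(df * 2)
    _ = per_row | 'Xs' >> beam.Map(lambda row: row.x)

    # 'pandas': elements are whole DataFrame partitions instead.
    frames = to_pcollection(df + 1, yield_elements='pandas', label='AsPandas')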
Example #4
def to_pcollection(
    *dataframes,  # type: frame_base.DeferredFrame
    **kwargs):
  # type: (...) -> Union[pvalue.PCollection, Tuple[pvalue.PCollection, ...]]

  """Converts one or more deferred dataframe-like objects back to a PCollection.

  This method creates and applies the actual Beam operations that compute
  the given deferred dataframes, returning a PCollection of their results. By
  default the resulting PCollections are schema-aware PCollections where each
  element is one row from the output dataframes, excluding indexes. This
  behavior can be modified with the `yield_elements` and `include_indexes`
  arguments.

  If more than one (related) result is desired, it can be more efficient to
  pass them all at the same time to this method.

  Args:
    always_return_tuple: (optional, default: False) If true, always return
        a tuple of PCollections, even if there's only one output.
    yield_elements: (optional, default: "schemas") If set to "pandas", return
        PCollections containing the raw Pandas objects (DataFrames or Series);
        if set to "schemas", return an element-wise PCollection, where DataFrame
        and Series instances are expanded to one element per row. DataFrames are
        converted to schema-aware PCollections, where column values can be
        accessed by attribute.
    include_indexes: (optional, default: False) When yield_elements="schemas",
        if include_indexes=True, attempt to include index columns in the output
        schema for expanded DataFrames. Raises an error if any of the index
        levels are unnamed (name=None), or if any of the names are not unique
        among all column and index names.
  """
  label = kwargs.pop('label', None)
  always_return_tuple = kwargs.pop('always_return_tuple', False)
  yield_elements = kwargs.pop('yield_elements', 'schemas')
  if yield_elements not in ("pandas", "schemas"):
    raise ValueError(
        "Invalid value for yield_elements argument, '%s'. "
        "Allowed values are 'pandas' and 'schemas'" % yield_elements)
  include_indexes = kwargs.pop('include_indexes', False)
  assert not kwargs  # TODO(BEAM-7372): Use PEP 3102
  if label is None:
    # Attempt to come up with a reasonable, stable label by retrieving the name
    # of these variables in the calling context.
    current_frame = inspect.currentframe()
    if current_frame is None:
      label = 'ToDataframe(...)'

    else:
      previous_frame = current_frame.f_back

      def name(obj):
        for key, value in previous_frame.f_locals.items():
          if obj is value:
            return key
        for key, value in previous_frame.f_globals.items():
          if obj is value:
            return key
        return '...'

      label = 'ToDataframe(%s)' % ', '.join(name(e) for e in dataframes)

  def extract_input(placeholder):
    if not isinstance(placeholder._reference, pvalue.PCollection):
      raise TypeError(
          'Expression roots must have been created with to_dataframe.')
    return placeholder._reference

  placeholders = frozenset.union(
      frozenset(), *[df._expr.placeholders() for df in dataframes])
  results = {
      p: extract_input(p)
      for p in placeholders
  } | label >> transforms._DataframeExpressionsTransform(
      {ix: df._expr for ix, df in enumerate(dataframes)}
  )  # type: Dict[Any, pvalue.PCollection]

  if yield_elements == "schemas":
    results = {
        key: pc
        | "Unbatch '%s'" % dataframes[key]._expr._id >> schemas.UnbatchPandas(
            dataframes[key]._expr.proxy(), include_indexes=include_indexes)
        for (key, pc) in results.items()
    }

  if len(results) == 1 and not always_return_tuple:
    return results[0]
  else:
    return tuple(value for key, value in sorted(results.items()))
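In this variant the per-row expansion is done inline with schemas.UnbatchPandas. As the docstring warns, include_indexes only works when every index level is named; a sketch like the following (names illustrative) would surface the index as an ordinary schema field:

import apache_beam as beam
from apache_beam.dataframe.convert import to_dataframe, to_pcollection

with beam.Pipeline() as p:
    df = to_dataframe(
        p | beam.Create([beam.Row(key='a', value=1),
                         beam.Row(key='b', value=2)]))
    # set_index names the index level 'key', so it can enter the schema;
    # with an unnamed (name=None) index level this would raise an error.
    rows = to_pcollection(df.set_index('key'), include_indexes=True)
    # Each element now carries both .key (from the index) and .value.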
Example #5
def to_pcollection(
    *dataframes,  # type: Union[frame_base.DeferredFrame, pd.DataFrame, pd.Series]
    label=None,
    always_return_tuple=False,
    yield_elements='schemas',
    include_indexes=False,
    pipeline=None
) -> Union[pvalue.PCollection, Tuple[pvalue.PCollection, ...]]:
    """Converts one or more deferred dataframe-like objects back to a PCollection.

    This method creates and applies the actual Beam operations that compute
    the given deferred dataframes, returning a PCollection of their results. By
    default the resulting PCollections are schema-aware PCollections where each
    element is one row from the output dataframes, excluding indexes. This
    behavior can be modified with the `yield_elements` and `include_indexes`
    arguments.

    Also accepts non-deferred pandas dataframes, which are converted to
    deferred, schema'd PCollections. In this case the contents of the entire
    dataframe are serialized into the graph, so for large amounts of data it is
    preferable to write them to disk and read them with one of the read
    methods.

    If more than one (related) result is desired, it can be more efficient to
    pass them all at the same time to this method.

    Args:
      label: (optional, default: "ToPCollection(...)") the label to use for the
          conversion transform.
      always_return_tuple: (optional, default: False) If true, always return
          a tuple of PCollections, even if there's only one output.
      yield_elements: (optional, default: "schemas") If set to "pandas", return
          PCollections containing the raw Pandas objects (DataFrames or
          Series); if set to "schemas", return an element-wise PCollection,
          where DataFrame and Series instances are expanded to one element per
          row. DataFrames are converted to schema-aware PCollections, where
          column values can be accessed by attribute.
      include_indexes: (optional, default: False) When yield_elements="schemas",
          if include_indexes=True, attempt to include index columns in the
          output schema for expanded DataFrames. Raises an error if any of the
          index levels are unnamed (name=None), or if any of the names are not
          unique among all column and index names.
      pipeline: (optional, unless non-deferred dataframes are passed) Used when
          creating a PCollection from a non-deferred dataframe.
    """
    if yield_elements not in ("pandas", "schemas"):
        raise ValueError("Invalid value for yield_elements argument, '%s'. "
                         "Allowed values are 'pandas' and 'schemas'" %
                         yield_elements)
    if label is None:
        # Attempt to come up with a reasonable, stable label by retrieving the name
        # of these variables in the calling context.
        label = 'ToPCollection(%s)' % ', '.join(
            _var_name(e, 3) for e in dataframes)

    # Support for non-deferred dataframes.
    deferred_dataframes = []
    for ix, df in enumerate(dataframes):
        if isinstance(df, frame_base.DeferredBase):
            # TODO(robertwb): Maybe extract pipeline object?
            deferred_dataframes.append(df)
        elif isinstance(df, (pd.Series, pd.DataFrame)):
            if pipeline is None:
                raise ValueError(
                    'Pipeline keyword required for non-deferred dataframe conversion.'
                )
            deferred = pipeline | '%s_Defer%s' % (label, ix) >> beam.Create(
                [df])
            deferred_dataframes.append(
                frame_base.DeferredFrame.wrap(
                    expressions.PlaceholderExpression(df.iloc[:0], deferred)))
        else:
            raise TypeError(
                'Unable to convert objects of type %s to a PCollection' %
                type(df))
    dataframes = tuple(deferred_dataframes)

    def extract_input(placeholder):
        if not isinstance(placeholder._reference, pvalue.PCollection):
            raise TypeError(
                'Expression roots must have been created with to_dataframe.')
        return placeholder._reference

    placeholders = frozenset.union(
        frozenset(), *[df._expr.placeholders() for df in dataframes])

    # Exclude any dataframes that have already been converted to PCollections.
    # We only want to convert each DF expression once, then re-use.
    new_dataframes = [
        df for df in dataframes if df._expr._id not in TO_PCOLLECTION_CACHE
    ]
    if new_dataframes:
        new_results = {p: extract_input(p)
                       for p in placeholders
                       } | label >> transforms._DataframeExpressionsTransform({
                           ix: df._expr
                           for (ix, df) in enumerate(new_dataframes)
                       })  # type: Dict[Any, pvalue.PCollection]

        TO_PCOLLECTION_CACHE.update({
            new_dataframes[ix]._expr._id: pc
            for ix, pc in new_results.items()
        })

    raw_results = {
        ix: TO_PCOLLECTION_CACHE[df._expr._id]
        for ix, df in enumerate(dataframes)
    }

    if yield_elements == "schemas":

        def maybe_unbatch(pc, value):
            if isinstance(value, frame_base._DeferredScalar):
                return pc
            else:
                return _make_unbatched_pcoll(pc, value._expr, include_indexes)

        results = {
            ix: maybe_unbatch(pc, dataframes[ix])
            for (ix, pc) in raw_results.items()
        }
    else:
        results = raw_results

    if len(results) == 1 and not always_return_tuple:
        return results[0]
    else:
        return tuple(value for key, value in sorted(results.items()))
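Finally, the pipeline argument added in this last revision makes round-tripping a literal pandas object possible. A small sketch of that path, with illustrative frame contents; as the docstring notes, the whole frame is serialized into the graph, so keep it small:

import apache_beam as beam
import pandas as pd
from apache_beam.dataframe.convert import to_pcollection

with beam.Pipeline() as p:
    literal = pd.DataFrame({'animal': ['cat', 'dog'], 'legs': [4, 4]})
    # Non-deferred input, so the `pipeline` keyword is required.
    rows = to_pcollection(literal, pipeline=p)
    _ = rows | beam.Map(print)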