Esempio n. 1
0
 def expand(self, pcoll):
     """Batch the input and turn every batch into a pandas DataFrame.

     The DataFrame column names are the field names reported by
     ``named_fields_from_element_type`` for ``pcoll.element_type``.
     """
     field_names = [
         field_name for field_name, _ in named_fields_from_element_type(
             pcoll.element_type)
     ]
     batched = pcoll | self._batch_elements_transform
     return batched | beam.Map(
         lambda rows: pd.DataFrame.from_records(rows, columns=field_names))
Esempio n. 2
0
def elements_to_df(elements, include_window_info=False, element_type=None):
  # type: (List[WindowedValue], bool, Any) -> DataFrame

  """Builds a DataFrame out of the values carried by the given elements.

  Each element contributes its ``value`` as one row. Column names are taken
  from the named fields of ``element_type``; when those cannot be derived
  (``named_fields_from_element_type`` raises ``TypeError``) the columns are
  left for pandas to infer.

  If include_window_info is True, three extra columns are appended to the
  right of the value columns: ``event_time`` (the element timestamp in
  microseconds), ``windows`` and ``pane_info``.
  """
  try:
    fields = named_fields_from_element_type(element_type)
    header = [field_name for field_name, _ in fields]
  except TypeError:
    # No usable schema for this element type; let pandas pick the columns.
    header = None

  values = []
  window_rows = []
  # Single pass so that one-shot iterables are consumed only once.
  for windowed_value in elements:
    values.append(windowed_value.value)
    if not include_window_info:
      continue
    window_rows.append([
        windowed_value.timestamp.micros,
        windowed_value.windows,
        windowed_value.pane_info,
    ])

  values_df = pd.DataFrame(values, columns=header)
  if not include_window_info:
    return values_df

  window_df = pd.DataFrame(
      window_rows, columns=['event_time', 'windows', 'pane_info'])
  # Column-wise concatenation: value columns first, then windowing columns.
  return pd.concat([values_df, window_df], axis=1)
Esempio n. 3
0
def generate_proxy(element_type):
    # type: (type) -> pd.DataFrame
    """Generate a proxy pandas object for the given PCollection element_type.

    Currently only supports generating a DataFrame proxy from a schema-aware
    PCollection.

    Args:
        element_type: The element type whose named fields define the columns
            of the proxy.

    Returns:
        An empty DataFrame with one column per named field. Each column is
        cast to the dtype that BEAM_TO_PANDAS maps the field's type hint to,
        or to ``object`` when there is no mapping.
    """
    # Materialize once: the fields are walked twice below (column names, then
    # dtypes), which would silently yield nothing the second time if the
    # helper ever returned a one-shot iterator.
    fields = list(named_fields_from_element_type(element_type))
    proxy = pd.DataFrame(columns=[name for name, _ in fields])

    for name, typehint in fields:
        # Default to object. This is lossy, we won't be able to recover the
        # type at the output. The builtin `object` is used instead of the
        # `np.object` alias, which was deprecated in NumPy 1.20 and removed in
        # NumPy 1.24 (accessing it raises AttributeError).
        dtype = BEAM_TO_PANDAS.get(typehint, object)
        proxy[name] = proxy[name].astype(dtype)

    return proxy