def expand(self, pcoll):
    """Turn batches of the input PCollection's elements into DataFrames.

    The column names are taken from the named fields of
    ``pcoll.element_type``. The input is first passed through
    ``self._batch_elements_transform``; each resulting batch is then
    converted with ``pd.DataFrame.from_records`` using those column names.
    """
    field_names = [
        field_name
        for field_name, _ in named_fields_from_element_type(pcoll.element_type)
    ]
    batched = pcoll | self._batch_elements_transform
    return batched | beam.Map(
        lambda batch: pd.DataFrame.from_records(batch, columns=field_names))
def elements_to_df(elements, include_window_info=False, element_type=None):
    # type: (List[WindowedValue], bool, Any) -> DataFrame
    """Build a DataFrame from a list of WindowedValues.

    Each element's ``value`` becomes one row. When ``element_type`` yields
    named fields, their names are used as the column names; if looking them
    up raises ``TypeError``, ``columns=None`` is passed to pandas instead.

    If ``include_window_info`` is True, the columns ``event_time`` (the
    element timestamp in micros), ``windows`` and ``pane_info`` are
    concatenated to the right of the value columns.
    """
    try:
        schema_fields = named_fields_from_element_type(element_type)
        header = [field_name for field_name, _ in schema_fields]
    except TypeError:
        # No field names could be derived from element_type.
        header = None

    values = []
    window_rows = []
    # Single pass over ``elements`` so the values and their windowing
    # metadata stay aligned row by row.
    for windowed_value in elements:
        values.append(windowed_value.value)
        if include_window_info:
            window_rows.append([
                windowed_value.timestamp.micros,
                windowed_value.windows,
                windowed_value.pane_info,
            ])

    values_df = pd.DataFrame(values, columns=header)
    if not include_window_info:
        return values_df

    window_df = pd.DataFrame(
        window_rows, columns=['event_time', 'windows', 'pane_info'])
    return pd.concat([values_df, window_df], axis=1)
def generate_proxy(element_type):
    # type: (type) -> pd.DataFrame
    """Generate a proxy pandas object for the given PCollection element_type.

    Currently only supports generating a DataFrame proxy from a schema-aware
    PCollection.

    The result is an empty DataFrame with one column per named field of
    ``element_type``; each column is cast to the dtype that
    ``BEAM_TO_PANDAS`` maps the field's type hint to.
    """
    # The fields are walked twice (column names, then dtypes), so materialize
    # them in case the helper returns a one-shot iterable.
    fields = list(named_fields_from_element_type(element_type))
    proxy = pd.DataFrame(columns=[name for name, _ in fields])

    for name, typehint in fields:
        # Default to object. This is lossy, we won't be able to recover the
        # type at the output.
        # The builtin ``object`` is used rather than the ``np.object`` alias,
        # which was deprecated in NumPy 1.20 and removed in 1.24; the alias
        # referred to the same builtin, so the resulting dtype is unchanged.
        dtype = BEAM_TO_PANDAS.get(typehint, object)
        proxy[name] = proxy[name].astype(dtype)

    return proxy