Ejemplo n.º 1
0
    def test_convert_yield_pandas(self):
        """Convert deferred results back to PCollections of pandas objects.

        Two Series are passed through ``to_dataframe``, combined
        arithmetically, and converted back with ``yield_elements='pandas'``.
        Each output is compared with the same arithmetic applied directly to
        the in-memory Series.
        """
        with beam.Pipeline() as pipeline:
            small = pd.Series([1, 2, 3])
            large = pd.Series([100, 200, 300])

            small_pc = pipeline | 'A' >> beam.Create([small])
            large_pc = pipeline | 'B' >> beam.Create([large])

            # An empty slice of each Series is supplied as the proxy.
            small_df = convert.to_dataframe(small_pc, proxy=small[:0])
            large_df = convert.to_dataframe(large_pc, proxy=large[:0])

            doubled_df = 2 * small_df
            tripled_df = 3 * small_df
            product_df = small_df * large_df

            # Converting multiple results at a time can be more efficient.
            doubled_pc, product_pc = convert.to_pcollection(
                doubled_df, product_df, yield_elements='pandas')
            # But separate conversions can be done as well.
            tripled_pc = convert.to_pcollection(
                tripled_df, yield_elements='pandas')

            checks = (
                ('Check2a', doubled_pc, 2 * small),
                ('Check3a', tripled_pc, 3 * small),
                ('Checkab', product_pc, small * large),
            )
            for check_label, actual_pc, expected in checks:
                assert_that(actual_pc,
                            equal_to_unordered_series(expected),
                            label=check_label)
Ejemplo n.º 2
0
  def test_convert_yield_pandas(self):
    """Check deferred arithmetic when results are yielded as pandas objects.

    The nested matcher concatenates the batches it receives and compares the
    sorted values with the sorted expected values, so element order and
    batching do not matter.
    """
    def equal_to_unordered_series(expected):
      """Return a matcher that compares batches to ``expected`` ignoring order."""
      def check(actual):
        combined = pd.concat(actual)
        if sorted(expected) == sorted(combined):
          return
        raise AssertionError(
            'Series not equal: \n%s\n%s\n' % (expected, combined))

      return check

    with beam.Pipeline() as pipeline:
      small = pd.Series([1, 2, 3])
      large = pd.Series([100, 200, 300])

      small_pc = pipeline | 'A' >> beam.Create([small])
      large_pc = pipeline | 'B' >> beam.Create([large])

      # An empty slice of each Series is supplied as the proxy.
      small_df = convert.to_dataframe(small_pc, proxy=small[:0])
      large_df = convert.to_dataframe(large_pc, proxy=large[:0])

      twice_df = 2 * small_df
      thrice_df = 3 * small_df
      product_df = small_df * large_df

      # Converting multiple results at a time can be more efficient.
      twice_pc, product_pc = convert.to_pcollection(
          twice_df, product_df, yield_elements='pandas')
      # But separate conversions can be done as well.
      thrice_pc = convert.to_pcollection(thrice_df, yield_elements='pandas')

      matcher_2a = equal_to_unordered_series(2 * small)
      assert_that(twice_pc, matcher_2a, label='Check2a')
      matcher_3a = equal_to_unordered_series(3 * small)
      assert_that(thrice_pc, matcher_3a, label='Check3a')
      matcher_ab = equal_to_unordered_series(small * large)
      assert_that(product_pc, matcher_ab, label='Checkab')
Ejemplo n.º 3
0
 def test_convert_with_none(self):
     """Convert an in-memory frame whose object columns contain None."""
     # Ensure the logical Any type allows (nullable) None, see BEAM-12587.
     frame = pd.DataFrame({'A': ['str', 10, None], 'B': [None, 'str', 20]})
     with beam.Pipeline() as pipeline:
         converted = convert.to_pcollection(frame, pipeline=pipeline)
         as_tuples = converted | beam.Map(tuple)
         expected_rows = [(row.A, row.B) for _, row in frame.iterrows()]
         assert_that(as_tuples, equal_to(expected_rows))
Ejemplo n.º 4
0
  def _watch(self, pcolls):
    # type: (List[beam.pvalue.PCollection]) -> None

    """Watch any of the given PCollections that are not already watched.

    This allows for the underlying caching layer to identify the PCollection as
    something to be cached. Watched deferred DataFrames are converted to
    PCollections first, so a PCollection that backs a watched DataFrame is
    treated as already watched.
    """

    watched_values = [
        val for watching in ie.current_env().watching() for _, val in watching
    ]
    watched_pcollections = {
        val
        for val in watched_values if isinstance(val, beam.pvalue.PCollection)
    }
    watched_dataframes = {
        val
        for val in watched_values
        if not isinstance(val, beam.pvalue.PCollection) and
        isinstance(val, DeferredBase)
    }

    # Convert them one-by-one to generate a unique label for each. This allows
    # caching at a more fine-grained granularity.
    #
    # TODO(BEAM-12388): investigate the mixing pcollections in multiple
    # pipelines error when using the default label.
    for deferred in watched_dataframes:
      watched_pcollections.add(
          to_pcollection(
              deferred, yield_elements='pandas', label=str(deferred._expr)))

    unwatched = [pc for pc in pcolls if pc not in watched_pcollections]
    for pc in unwatched:
      anonymous_name = 'anonymous_pcollection_{}'.format(id(pc))
      ie.current_env().watch({anonymous_name: pc})
    def run_scenario(self, input, func):
        """Run ``func`` eagerly, via deferred evaluation, and in a pipeline.

        Args:
          input: the pandas object ``func`` is applied to.
          func: a callable that accepts either ``input`` or a deferred frame
            built from it.

        The eager result ``func(input)`` is the reference. It is compared with
        the result of evaluating the deferred expression in a session, with
        the proxy of the deferred output, and with the concatenated output of
        a Beam pipeline fed with two interleaved slices of ``input``.
        """
        expected = func(input)

        proxy = input.iloc[0:0]
        placeholder = expressions.PlaceholderExpression(proxy)
        wrapped = frame_base.DeferredFrame.wrap(placeholder)
        deferred_result = func(wrapped)
        session = expressions.Session({placeholder: input})
        evaluated = deferred_result._expr.evaluate_at(session)

        check_correct(expected, evaluated)

        def verify(actual):
            # Pipeline output arrives in batches; compare the concatenation.
            return check_correct(expected, concat(actual))

        with beam.Pipeline() as pipeline:
            halves = [input.iloc[::2], input.iloc[1::2]]
            source = pipeline | beam.Create(halves)
            deferred_input = convert.to_dataframe(source, proxy=proxy)
            deferred_output = func(deferred_input)

            output_proxy = deferred_output._expr.proxy()
            if not isinstance(output_proxy, pd.core.generic.NDFrame):
                self.assertEqual(type(output_proxy), type(expected))
            else:
                proxies_match = output_proxy.iloc[:0].equals(
                    expected.iloc[:0])
                failure_message = ('Output proxy is incorrect:\n'
                                   f'Expected:\n{expected.iloc[:0]}\n\n'
                                   f'Actual:\n{output_proxy.iloc[:0]}')
                self.assertTrue(proxies_match, failure_message)

            batches = convert.to_pcollection(deferred_output,
                                             yield_elements='pandas')

            assert_that(batches, verify)
Ejemplo n.º 6
0
  def expand(self, input_pcolls):
    """Apply ``self._func`` to deferred frames built from ``input_pcolls``.

    The inputs are flattened, wrapped as deferred frames using the matching
    entries of ``self._proxy``, and handed to ``self._func`` as keyword
    arguments (dict input), positional arguments (tuple input) or a single
    argument. The resulting frames are converted back to PCollections and
    returned in the same structure that ``self._func`` produced.
    """
    # Avoid circular import.
    from apache_beam.dataframe import convert

    # Convert inputs to a flat dict.
    flat_inputs = _flatten(input_pcolls)  # type: Dict[Any, PCollection]
    flat_proxies = _flatten(self._proxy)
    deferred_inputs = {}  # type: Dict[Any, DeferredFrame]
    for key, pcoll in flat_inputs.items():
      deferred_inputs[key] = convert.to_dataframe(pcoll, flat_proxies[key])

    # Apply the function.
    structured_inputs = _substitute(input_pcolls, deferred_inputs)
    if isinstance(structured_inputs, dict):
      result_frames = self._func(**structured_inputs)
    elif isinstance(structured_inputs, tuple):
      result_frames = self._func(*structured_inputs)
    else:
      result_frames = self._func(structured_inputs)

    # Compute results as a tuple.
    flat_results = _flatten(result_frames)
    ordered_keys = list(flat_results.keys())
    ordered_frames = tuple(flat_results[key] for key in ordered_keys)
    ordered_pcolls = convert.to_pcollection(
        *ordered_frames, label='Eval', always_return_tuple=True)

    # Convert back to the structure returned by self._func.
    pcolls_by_key = dict(zip(ordered_keys, ordered_pcolls))
    return _substitute(result_frames, pcolls_by_key)
Ejemplo n.º 7
0
def ReadCSVToPandas(
    p: beam.Pipeline,
    *args,
    **kwargs,
) -> PCollection[pd.DataFrame]:
    """Read CSV input and return it as a PCollection of pandas objects.

    Args:
      p: the pipeline the read is attached to.
      *args: forwarded unchanged to ``df_io.read_csv``.
      **kwargs: forwarded unchanged to ``df_io.read_csv``.

    Returns:
      The result of converting the deferred frame with
      ``yield_elements='pandas'``.
    """
    read_step = "Read CSV" >> df_io.read_csv(*args, **kwargs)
    deferred_frame = p | read_step
    return df_convert.to_pcollection(deferred_frame, yield_elements='pandas')
Ejemplo n.º 8
0
    def test_convert_non_deferred(self):
        """Convert an in-memory Series together with a deferred one."""
        with beam.Pipeline() as pipeline:
            in_memory = pd.Series([1, 2, 3])
            source = pipeline | beam.Create([100, 200, 300])
            deferred = convert.to_dataframe(source)

            # The pipeline is passed explicitly alongside both inputs.
            converted = convert.to_pcollection(in_memory,
                                               deferred,
                                               pipeline=pipeline)
            in_memory_pc, deferred_pc = converted
            assert_that(in_memory_pc,
                        equal_to([1, 2, 3]),
                        label='CheckNonDeferred')
            assert_that(deferred_pc,
                        equal_to([100, 200, 300]),
                        label='CheckDeferred')
Ejemplo n.º 9
0
Archivo: io.py Proyecto: nielm/beam
 def expand(self, p):
     """Apply the reader to ``p`` and convert the result to a PCollection.

     When ``self._objects_as_strings`` is set, every column whose dtype
     compares equal to ``object`` is cast to ``pd.StringDtype()`` before the
     conversion. ``self._include_indexes`` is forwarded to ``to_pcollection``.
     """
     from apache_beam.dataframe import convert  # avoid circular import
     frame = p | self._reader
     if self._objects_as_strings:
         for name, dtype in zip(frame.columns, frame.dtypes):
             if not dtype == object:
                 continue
             frame[name] = frame[name].astype(pd.StringDtype())
     include_indexes = self._include_indexes
     return convert.to_pcollection(frame, include_indexes=include_indexes)
Ejemplo n.º 10
0
    def test_convert_memoization(self):
        """Check that repeated conversions of a frame reuse one PCollection.

        The check is done twice, once with the default element format and once
        with ``yield_elements='pandas'``, and the two formats are expected to
        produce distinct PCollections.
        """
        with beam.Pipeline() as pipeline:
            small = pd.Series([1, 2, 3])
            large = pd.Series([100, 200, 300])

            small_pc = pipeline | 'A' >> beam.Create([small])
            large_pc = pipeline | 'B' >> beam.Create([large])

            small_df = convert.to_dataframe(small_pc, proxy=small[:0])
            large_df = convert.to_dataframe(large_pc, proxy=large[:0])

            doubled = 2 * small_df
            tripled = 3 * small_df
            product = small_df * large_df

            # Two calls to to_pcollection with the same Dataframe should produce the
            # same PCollection(s)
            first_doubled, first_product = convert.to_pcollection(
                doubled, product)
            tripled_pc, second_doubled, second_product = (
                convert.to_pcollection(tripled, doubled, product))

            self.assertIs(second_doubled, first_doubled)
            self.assertIs(second_product, first_product)
            self.assertIsNot(tripled_pc, second_doubled)
            self.assertIsNot(tripled_pc, second_product)

            # The same conversions without the unbatching transform should also cache
            # PCollections
            first_doubled_pd, first_product_pd = convert.to_pcollection(
                doubled, product, yield_elements='pandas')
            tripled_pd, second_doubled_pd, second_product_pd = (
                convert.to_pcollection(tripled,
                                       doubled,
                                       product,
                                       yield_elements='pandas'))

            self.assertIs(second_doubled_pd, first_doubled_pd)
            self.assertIs(second_product_pd, first_product_pd)
            self.assertIsNot(tripled_pd, second_doubled_pd)
            self.assertIsNot(tripled_pd, second_product_pd)

            # .. but the cached PCollections should be different
            self.assertIsNot(second_doubled_pd, second_doubled)
            self.assertIsNot(second_product_pd, second_product)
            self.assertIsNot(tripled_pd, tripled_pc)
    def test_batching_beam_row_to_dataframe(self):
        """Group ``beam.Row`` elements by animal and check the mean speeds.

        The result is converted with ``include_indexes=True`` and compared
        with ``(animal, mean speed)`` tuples.
        """
        with beam.Pipeline() as pipeline:
            observations = [(u'Falcon', 380.), (u'Falcon', 370.),
                            (u'Parrot', 24.), (u'Parrot', 26.)]
            rows = (
                pipeline
                | beam.Create(observations)
                | beam.Map(lambda tpl: beam.Row(Animal=tpl[0], Speed=tpl[1])))
            deferred = convert.to_dataframe(rows)

            mean_speed = deferred.groupby('Animal').mean()
            result = convert.to_pcollection(mean_speed, include_indexes=True)

            expected = [('Falcon', 375.), ('Parrot', 25.)]
            assert_that(result, equal_to(expected))
Ejemplo n.º 12
0
  def test_read_fwf(self):
    input = self.temp_dir(
        {'all.fwf': '''
A     B
11a   0
37a   1
389a  2
    '''.strip()})
    with beam.Pipeline() as p:
      df = p | io.read_fwf(input + 'all.fwf')
      rows = convert.to_pcollection(df) | beam.Map(tuple)
      assert_that(rows, equal_to([('11a', 0), ('37a', 1), ('389a', 2)]))
Ejemplo n.º 13
0
def deferred_df_to_pcollection(df):
  """Convert a deferred DataFrame to a PCollection and return its proxy too.

  Args:
    df: a ``DeferredBase`` instance; anything else fails the assertion.

  Returns:
    A ``(pcollection, proxy)`` tuple. The PCollection is produced with
    ``yield_elements='pandas'`` and labelled with ``str(df._expr)``.
  """
  is_deferred = isinstance(df, DeferredBase)
  assert is_deferred, '{} is not a DeferredBase'.format(df)

  expression_cache = ExpressionCache()
  expression_cache.replace_with_cached(df._expr)

  # The proxy is used to output a DataFrame with the correct columns.
  #
  # TODO(BEAM-11064): Once type hints are implemented for pandas, use those
  # instead of the proxy.
  proxy = df._expr.proxy()
  pcoll = to_pcollection(df, yield_elements='pandas', label=str(df._expr))
  return pcoll, proxy
Ejemplo n.º 14
0
    def test_convert(self):
        """Round-trip element-wise PCollections through deferred frames.

        No proxy is passed to ``to_dataframe`` here, and the converted
        outputs are compared with plain lists of the expected values.
        """
        with beam.Pipeline() as pipeline:
            small = pd.Series([1, 2, 3])
            large = pd.Series([100, 200, 300])

            small_pc = pipeline | 'A' >> beam.Create(small)
            large_pc = pipeline | 'B' >> beam.Create(large)

            small_df = convert.to_dataframe(small_pc)
            large_df = convert.to_dataframe(large_pc)

            doubled_df = 2 * small_df
            tripled_df = 3 * small_df
            product_df = small_df * large_df

            # Converting multiple results at a time can be more efficient.
            doubled_pc, product_pc = convert.to_pcollection(
                doubled_df, product_df)
            # But separate conversions can be done as well.
            tripled_pc = convert.to_pcollection(tripled_df)

            checks = (
                ('Check2a', doubled_pc, 2 * small),
                ('Check3a', tripled_pc, 3 * small),
                ('Checkab', product_pc, small * large),
            )
            for check_label, actual_pc, expected in checks:
                assert_that(actual_pc,
                            equal_to(list(expected)),
                            label=check_label)
Ejemplo n.º 15
0
def run(argv=None):
    """Main entry point; defines and runs the wordcount pipeline.

    Args:
      argv: argument list handed to ``parse_known_args``. ``--input`` and
        ``--output`` are consumed here; everything else is passed on as
        pipeline options.
    """
    arg_parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    arg_parser.add_argument(
        '--output',
        dest='output',
        required=True,
        help='Output file to write results to.')
    cli_args, beam_args = arg_parser.parse_known_args(argv)

    # Import this here to avoid pickling the main session.
    import re

    pipeline_options = PipelineOptions(beam_args)

    # The pipeline will be run on exiting the with block.
    with beam.Pipeline(options=pipeline_options) as pipeline:

        # [START DataFrame_wordcount]

        # Read the text file[pattern] into a PCollection.
        text_lines = pipeline | 'Read' >> ReadFromText(cli_args.input)

        tokens = text_lines | 'Split' >> beam.FlatMap(
            lambda line: re.findall(r'[\w]+', line)).with_output_types(str)
        # Map to Row objects to generate a schema suitable for conversion
        # to a dataframe.
        word_rows = tokens | 'ToRows' >> beam.Map(
            lambda word: beam.Row(word=word))

        word_frame = to_dataframe(word_rows)
        word_frame['count'] = 1
        totals = word_frame.groupby('word').sum()
        totals.to_csv(cli_args.output)

        # Deferred DataFrames can also be converted back to schema'd PCollections
        totals_pc = to_pcollection(totals, include_indexes=True)

        # [END DataFrame_wordcount]

        # Print out every word that occurred >50 times
        _ = (totals_pc
             | beam.Filter(lambda row: row.count > 50)
             | beam.Map(lambda row: f'{row.word}: {row.count}')
             | beam.Map(print))
Ejemplo n.º 16
0
  def test_batching_beam_row_to_dataframe(self):
    """Group ``beam.Row`` elements by animal and check the mean speeds.

    The converted result is compared, via ``df_equal_to``, with a frame
    indexed by ``Animal``.
    """
    with beam.Pipeline() as pipeline:
      observations = [(u'Falcon', 380.), (u'Falcon', 370.),
                      (u'Parrot', 24.), (u'Parrot', 26.)]
      rows = (
          pipeline
          | beam.Create(observations)
          | beam.Map(lambda tpl: beam.Row(Animal=tpl[0], Speed=tpl[1])))
      deferred = convert.to_dataframe(rows)

      result = convert.to_pcollection(deferred.groupby('Animal').mean())

      expected_columns = {
          'Animal': ['Falcon', 'Parrot'], 'Speed': [375., 25.]
      }
      expected = pd.DataFrame(expected_columns).set_index('Animal')
      assert_that(result, df_equal_to(expected))
Ejemplo n.º 17
0
    def test_dataframes(self):
        """Check that ``ib.collect`` gives the same frame for three inputs.

        The inputs are a PCollection of ``beam.Row`` elements, the deferred
        DataFrame built from it, and the PCollection obtained by converting
        that DataFrame back. Each is collected with ``n=10`` and compared with
        the expected squares and cubes of 1, 2 and 3.
        """
        # The pipeline is built on an InteractiveRunner wrapping a DirectRunner.
        p = beam.Pipeline(runner=interactive_runner.InteractiveRunner(
            direct_runner.DirectRunner()))
        data = p | beam.Create([
            1, 2, 3
        ]) | beam.Map(lambda x: beam.Row(square=x * x, cube=x * x * x))
        df = to_dataframe(data)
        pcoll = to_pcollection(df)

        # Watch the local scope for Interactive Beam so that values will be cached.
        # NOTE: locals() hands over every local defined so far (self, p, data,
        # df, pcoll) keyed by variable name, so renaming or adding locals above
        # this line changes what is watched.
        ib.watch(locals())

        # This is normally done in the interactive_utils when a transform is
        # applied but needs an IPython environment. So we manually run this here.
        ie.current_env().track_user_pipelines()

        # All three handles are expected to materialize to the same frame.
        df_expected = pd.DataFrame({'square': [1, 4, 9], 'cube': [1, 8, 27]})
        pd.testing.assert_frame_equal(df_expected, ib.collect(data, n=10))
        pd.testing.assert_frame_equal(df_expected, ib.collect(df, n=10))
        pd.testing.assert_frame_equal(df_expected, ib.collect(pcoll, n=10))
Ejemplo n.º 18
0
    def _watch(self, pcolls):
        # type: (List[beam.pvalue.PCollection]) -> None
        """Watch any of the given PCollections that are not already watched.

        This allows for the underlying caching layer to identify the
        PCollection as something to be cached. Watched deferred DataFrames
        are converted to PCollections in one call, and those PCollections
        count as already watched.
        """

        seen_pcolls = set()
        seen_frames = set()
        for scope in ie.current_env().watching():
            for _, candidate in scope:
                if isinstance(candidate, beam.pvalue.PCollection):
                    seen_pcolls.add(candidate)
                elif isinstance(candidate, DeferredBase):
                    seen_frames.add(candidate)
        # Convert them all in a single step for efficiency.
        converted = to_pcollection(*seen_frames, always_return_tuple=True)
        seen_pcolls.update(converted)
        for pcoll in pcolls:
            if pcoll in seen_pcolls:
                continue
            anonymous_name = 'anonymous_pcollection_{}'.format(id(pcoll))
            ie.current_env().watch({anonymous_name: pcoll})
Ejemplo n.º 19
0
 def test_convert_scalar(self):
     """Convert the deferred result of ``sum()`` back to a PCollection."""
     with beam.Pipeline() as pipeline:
         numbers = pipeline | 'A' >> beam.Create([1, 2, 3])
         deferred = convert.to_dataframe(numbers)
         total = deferred.sum()
         total_pc = convert.to_pcollection(total)
         assert_that(total_pc, equal_to([6]))
Ejemplo n.º 20
0
def read_csv_as_pcoll(pipeline, path):
    """Read the CSV at ``path`` and return it as a PCollection.

    Both transform labels are suffixed with the base name of ``path``.
    """
    label = os.path.basename(path)
    read_step = f"ReadCSV{label}" >> df_io.read_csv(path)
    deferred_frame = pipeline | read_step
    convert_label = f"ToPColl{label}"
    return df_convert.to_pcollection(deferred_frame,
                                     pipeline=pipeline,
                                     label=convert_label)
Ejemplo n.º 21
0
def run():
    """Build and run the pipeline that enriches immigration records.

    Immigration records (Parquet), city demographics (CSV) and airport codes
    (CSV) are read from the ``input_dir`` option and converted to deferred
    DataFrames. The immigration frame is filtered on non-null ``i94addr`` and
    ``i94port``, joined with the city and airport frames, converted back to a
    PCollection and written as text to the ``output_dir`` option.

    The pipeline runs when the ``with`` block exits.
    """

    options = MyOptions()

    with beam.Pipeline(options=options) as p:
        # Resolve the input location once; it is used by all three reads.
        input_dir = p.options.input_dir.get()

        # The backslash is escaped explicitly: "\*" is an invalid escape
        # sequence and raises a SyntaxWarning on recent Python versions. The
        # resulting pattern is unchanged.
        # NOTE(review): the pattern contains a literal backslash before '*';
        # confirm whether "data.parquet/*" was intended.
        immigration_data = (
            p
            | "Read Immigration Data" >> beam.io.parquetio.ReadFromParquet(
                input_dir + "data.parquet\\*").with_output_types(
                    ImmigrationData)
            | "Immigration dictionary collection to row" >>
            beam.Map(ToRowImmigration))

        df_immigration = to_dataframe(immigration_data)

        cities_data = (
            p
            | "Read city data" >> beam.io.ReadFromText(
                input_dir + "us-cities-demographics.csv",
                skip_header_lines=1,
            )
            | "Parse city data" >> beam.ParDo(SplitCityData())
            | "Get average demographics per State" >> beam.CombinePerKey(
                AverageDictFn())
            | "Key to Column" >> beam.ParDo(
                OrganizeCityData()).with_output_types(CityData)
            | "City dictionary collection to row" >> beam.Map(ToRowCity))

        df_cities = to_dataframe(cities_data)

        # Drop rows without a value in the column checked before each join.
        df_immigration = df_immigration[df_immigration["i94addr"].notna()]
        df_immigration = df_immigration.join(df_cities, rsuffix="_city")

        airport_data = (
            p
            | "Read airport data" >> beam.io.ReadFromText(
                input_dir + "airport-codes_csv_2.csv",
                skip_header_lines=1,
            )
            | "Parse airport data" >> beam.ParDo(
                SplitAirportData()).with_output_types(AirportData)
            | "Airport dictionary collection to row" >> beam.Map(ToRowAirport))

        df_airport = to_dataframe(airport_data)

        df_immigration = df_immigration[df_immigration["i94port"].notna()]
        df_joined = df_immigration.join(df_airport, rsuffix="_airport")

        # Kept under a separate name so the deferred frame and the
        # PCollection are not confused.
        joined_pcoll = to_pcollection(df_joined, include_indexes=False)

        # ---#---#---# TODO #---#---#---#
        # -------- Join temperature data to dataset --------

        # temperature_data = (
        #     p | "Read city data" >> beam.io.ReadFromText('GlobalLandTemperaturesByCity.csv', skip_header_lines=1) |
        #     "Parse temperature data" >> beam.ParDo(SplitTempData()).with_output_types(TemperatureData)  |
        #     'Temperature dictionary filtered and to row' >> beam.ParDo(FilterAndToRowTemperature())
        # )

        # -------- NotImplementedError: grouby(as_index = False) and drop_duplicates()--------
        # df_immigration_2 = df_immigration.filter(
        #         items=['arrdate','i94mon','municipality','i94port']
        #     ).groupby(
        #         by = ['arrdate','i94mon','municipality','i94port'], as_index = False
        #     )

        # df_temperature = to_dataframe(temperature_data)

        # df_temperature_2 = df_immigration_2.join(df_temperature,
        #                                 (df_immigration_2.municipality == df_temperature.municipality) \
        #                                 & (df_immigration_2.i94mon == df_temperature.month ), 'left')

        # The write result is not used afterwards.
        _ = joined_pcoll | "Save data to file" >> beam.io.WriteToText(
            p.options.output_dir.get())
Ejemplo n.º 22
0
def collect(pcoll, n='inf', duration='inf', include_window_info=False):
    """Materializes the elements from a PCollection into a Dataframe.

    This reads each element from file and reads only the amount that it needs
    into memory. The user can specify either the max number of elements to
    read or the maximum duration of elements to read. When a limiter is not
    supplied, it is assumed to be infinite.

    Args:
      pcoll: the PCollection to materialize. A deferred DataFrame is
          converted to a PCollection first.
      n: (optional) max number of elements to visualize. Default 'inf'.
      duration: (optional) max duration of elements to read in integer
          seconds or a string duration. Default 'inf'.
      include_window_info: (optional) forwarded to ``elements_to_df``.
          Default False.

    Returns:
      The DataFrame built by ``elements_to_df``, or an empty DataFrame if
      reading is interrupted with a KeyboardInterrupt.

    For example::

      p = beam.Pipeline(InteractiveRunner())
      init = p | 'Init' >> beam.Create(range(10))
      square = init | 'Square' >> beam.Map(lambda x: x * x)

      # Run the pipeline and bring the PCollection into memory as a Dataframe.
      in_memory_square = collect(square, n=5)
    """
    pcoll = to_pcollection(pcoll) if isinstance(pcoll, DeferredBase) else pcoll

    is_pcollection = isinstance(pcoll, beam.pvalue.PCollection)
    assert is_pcollection, (
        '{} is not an apache_beam.pvalue.PCollection.'.format(pcoll))

    # Validate the limiters before translating 'inf' into a float.
    if isinstance(n, str):
        assert n == 'inf', (
            'Currently only the string \'inf\' is supported. This denotes reading '
            'elements until the recording is stopped via a kernel interrupt.')
    elif isinstance(n, int):
        assert n > 0, 'n needs to be positive or the string \'inf\''

    if isinstance(duration, int):
        assert duration > 0, (
            'duration needs to be positive, a duration string, '
            'or the string \'inf\'')

    n = float('inf') if n == 'inf' else n
    duration = float('inf') if duration == 'inf' else duration

    owning_pipeline = pcoll.pipeline
    manager = ie.current_env().get_recording_manager(owning_pipeline,
                                                     create_if_absent=True)
    recording = manager.record([pcoll], max_n=n, max_duration=duration)

    try:
        elements = list(recording.stream(pcoll).read())
    except KeyboardInterrupt:
        # Stop the recording and hand back an empty frame.
        recording.cancel()
        return pd.DataFrame()

    frame = elements_to_df(elements,
                           include_window_info=include_window_info,
                           element_type=pcoll.element_type)
    return frame
Ejemplo n.º 23
0
def collect(pcoll, n='inf', duration='inf', include_window_info=False):
    """Materializes the elements from a PCollection into a Dataframe.

    This reads each element from file and reads only the amount that it needs
    into memory. The user can specify either the max number of elements to
    read or the maximum duration of elements to read. When a limiter is not
    supplied, it is assumed to be infinite.

    Args:
      pcoll: the PCollection to materialize. A deferred DataFrame is
          converted to a PCollection of pandas objects and watched under an
          anonymous name first.
      n: (optional) max number of elements to visualize. Default 'inf'.
      duration: (optional) max duration of elements to read in integer
          seconds or a string duration. Default 'inf'.
      include_window_info: (optional) if True, appends the windowing
          information to each row. Default False.

    Returns:
      The DataFrame built by ``elements_to_df``, sliced to at most ``n`` rows,
      or an empty DataFrame if reading is interrupted with a
      KeyboardInterrupt.

    For example::

      p = beam.Pipeline(InteractiveRunner())
      init = p | 'Init' >> beam.Create(range(10))
      square = init | 'Square' >> beam.Map(lambda x: x * x)

      # Run the pipeline and bring the PCollection into memory as a Dataframe.
      in_memory_square = collect(square, n=5)
    """
    # Remember the element type so we can make an informed decision on how to
    # collect the result in elements_to_df.
    if not isinstance(pcoll, DeferredBase):
        element_type = pcoll.element_type
    else:
        # Get the proxy so we can get the output shape of the DataFrame.
        # TODO(BEAM-11064): Once type hints are implemented for pandas, use those
        # instead of the proxy.
        element_type = pcoll._expr.proxy()
        pcoll = to_pcollection(pcoll,
                               yield_elements='pandas',
                               label=str(pcoll._expr))
        anonymous_name = 'anonymous_pcollection_{}'.format(id(pcoll))
        watch({anonymous_name: pcoll})

    is_pcollection = isinstance(pcoll, beam.pvalue.PCollection)
    assert is_pcollection, (
        '{} is not an apache_beam.pvalue.PCollection.'.format(pcoll))

    # Validate the limiters before translating 'inf' into a float.
    if isinstance(n, str):
        assert n == 'inf', (
            'Currently only the string \'inf\' is supported. This denotes reading '
            'elements until the recording is stopped via a kernel interrupt.')
    elif isinstance(n, int):
        assert n > 0, 'n needs to be positive or the string \'inf\''

    if isinstance(duration, int):
        assert duration > 0, (
            'duration needs to be positive, a duration string, '
            'or the string \'inf\'')

    n = float('inf') if n == 'inf' else n
    duration = float('inf') if duration == 'inf' else duration

    owning_pipeline = pcoll.pipeline
    manager = ie.current_env().get_recording_manager(owning_pipeline,
                                                     create_if_absent=True)
    recording = manager.record([pcoll], max_n=n, max_duration=duration)

    try:
        elements = list(recording.stream(pcoll).read())
    except KeyboardInterrupt:
        # Stop the recording and hand back an empty frame.
        recording.cancel()
        return pd.DataFrame()

    # An unbounded limit becomes None so the slice below keeps everything.
    row_limit = None if n == float('inf') else n

    # Collecting DataFrames may have a length > n, so slice again to be sure. Note
    # that array[:None] returns everything.
    frame = elements_to_df(elements,
                           include_window_info=include_window_info,
                           element_type=element_type)
    return frame[:row_limit]
Ejemplo n.º 24
0
def _as_pc(df, label=None):
    """Convert ``df`` to a PCollection of pandas objects.

    ``label`` is forwarded to ``convert.to_pcollection`` unchanged.
    """
    from apache_beam.dataframe import convert  # avoid circular import
    # TODO(robertwb): Amortize the computation for multiple writes?
    conversion_options = {'yield_elements': 'pandas', 'label': label}
    return convert.to_pcollection(df, **conversion_options)
Ejemplo n.º 25
0
def write_csv(df, path, *args, **kwargs):
  """Write ``df`` to ``path`` using ``pd.DataFrame.to_csv``.

  ``args`` and ``kwargs`` are handed to ``_WriteToPandas`` as a tuple and a
  dict. Returns the result of applying that transform to the converted
  PCollection.
  """
  from apache_beam.dataframe import convert  # avoid circular import
  # TODO(robertwb): Amortize the computation for multiple writes?
  as_pcoll = convert.to_pcollection(df)
  writer = _WriteToPandas(
      pd.DataFrame.to_csv, path, args, kwargs, incremental=True, binary=False)
  return as_pcoll | writer
Ejemplo n.º 26
0
 def __ror__(self, other, label=None):
     """Apply this transform, converting a deferred frame to a PCollection.

     Anything that is not a ``frame_base.DeferredBase`` is passed to the
     parent implementation untouched.
     """
     if not isinstance(other, frame_base.DeferredBase):
         return super(_WriteToPandas, self).__ror__(other, label)

     from apache_beam.dataframe import convert  # avoid circular import
     # TODO(robertwb): Amortize the computation for multiple writes?
     as_pcoll = convert.to_pcollection(other, yield_elements='pandas')
     return super(_WriteToPandas, self).__ror__(as_pcoll, label)
Ejemplo n.º 27
0
    def _run_read_write_test(self,
                             format,
                             read_kwargs=None,
                             write_kwargs=None,
                             check_options=None,
                             requires=()):
        """Write a frame with ``to_<format>`` and read it back with ``read_<format>``.

        The round trip is run for a small and a large frame, each in a fresh
        temporary directory. Writing and reading use two separate pipelines.

        Args:
          format: suffix selecting the ``to_<format>`` method on the deferred
            frame and the ``read_<format>`` function on ``io``.
          read_kwargs: keyword arguments for the read function. Defaults to
            none.
          write_kwargs: keyword arguments for the write method. Defaults to
            none.
          check_options: keyword arguments for the comparison helper
            (``check_index``, ``check_names``). Defaults to none.
          requires: module names that must be importable.

        Raises:
          unittest.SkipTest: if a module listed in ``requires`` cannot be
            imported.
        """
        # None sentinels replace the former mutable ``{}`` defaults, so no
        # dict object is shared between calls.
        read_kwargs = {} if read_kwargs is None else read_kwargs
        write_kwargs = {} if write_kwargs is None else write_kwargs
        check_options = {} if check_options is None else check_options

        for module in requires:
            try:
                importlib.import_module(module)
            except ImportError:
                raise unittest.SkipTest('Missing dependency: %s' % module)
        small = pd.DataFrame({
            'label': ['11a', '37a', '389a'],
            'rank': [0, 1, 2]
        })
        big = pd.DataFrame({'number': list(range(1000))})
        big['float'] = big.number.map(math.sqrt)
        big['text'] = big.number.map(lambda n: 'f' + 'o' * n)

        def frame_equal_to(expected_, check_index=True, check_names=True):
            """Return a matcher comparing the concatenated batches to ``expected_``."""
            def check(actual):
                expected = expected_
                try:
                    actual = pd.concat(actual)
                    if not check_index:
                        # Ignore the index: order both frames by value.
                        expected = expected.sort_values(list(
                            expected.columns)).reset_index(drop=True)
                        actual = actual.sort_values(list(
                            actual.columns)).reset_index(drop=True)
                    if not check_names:
                        # Ignore column names: relabel positionally.
                        actual = actual.rename(columns=dict(
                            zip(actual.columns, expected.columns)))
                    return assert_frame_equal(expected,
                                              actual,
                                              check_like=True)
                except Exception:
                    # Print both sides to aid debugging, then re-raise.
                    print("EXPECTED")
                    print(expected)
                    print("ACTUAL")
                    print(actual)
                    raise

            return check

        for df in (small, big):
            with tempfile.TemporaryDirectory() as tmp_dir:
                dest = os.path.join(tmp_dir, 'out')
                try:
                    with beam.Pipeline() as p:
                        # Feed the frame as three interleaved slices.
                        deferred_df = convert.to_dataframe(
                            p | beam.Create([df[::3], df[1::3], df[2::3]]),
                            proxy=df[:0])
                        # This does the write.
                        getattr(deferred_df, 'to_%s' % format)(dest,
                                                               **write_kwargs)
                    with beam.Pipeline() as p:
                        # Now do the read.
                        # TODO(robertwb): Allow reading from pcoll of paths to do it all in
                        # one pipeline.

                        result = convert.to_pcollection(
                            p | getattr(io, 'read_%s' % format)(dest + '*', **
                                                                read_kwargs),
                            yield_elements='pandas')
                        assert_that(result,
                                    frame_equal_to(df, **check_options))
                except Exception:
                    # Dump the head of the written files for debugging. The
                    # path comes from tempfile, not from external input.
                    os.system('head -n 100 ' + dest + '*')
                    raise
Ejemplo n.º 28
0
def show(*pcolls,
         include_window_info=False,
         visualize_data=False,
         n='inf',
         duration='inf'):
    # type: (*Union[Dict[Any, PCollection], Iterable[PCollection], PCollection], bool, bool, Union[int, str], Union[int, str]) -> None
    """Visualizes the given PCollections in an interactive, exploratory way.

    Within a notebook the data is rendered (and, while the computation is
    still running, dynamically updated). Within a plain ipython shell the
    data is plotted statically. Outside of those environments no
    visualization is produced.

    Each positional argument may be a dictionary of PCollections (as values),
    an iterable of PCollections, or a plain PCollection. Deferred DataFrames
    and Series are accepted wherever a PCollection is: they are converted to
    PCollections first and their proxy is used as the element type for
    visualization.

    Args:
      include_window_info: (optional) if True, windowing information of the
          data will be visualized too. Default is False.
      visualize_data: (optional) by default, the visualization contains data
          tables rendering data from the given pcolls separately as if they
          were converted into dataframes. If visualize_data is True, there
          will additionally be a more dive-in widget and a statistical
          overview widget of the data. Only used within a notebook.
      n: (optional) max number of elements to visualize. Either a positive
          integer or the string 'inf'. Default 'inf'.
      duration: (optional) max duration of elements to read, as a positive
          integer number of seconds or a duration string. Default 'inf'.

    The user can limit the read either by number of elements with `n` or by
    time with `duration`. When a limiter is not supplied, it is assumed to be
    infinite.

    Ad hoc builds a pipeline fragment including only the transforms that are
    necessary to produce data for the given pcolls, runs the pipeline
    fragment to compute data for those pcolls and then visualizes the data.

    The function is always blocking. If used within a notebook, the data
    visualized might be dynamically updated before the function returns as
    more and more data gets processed and emitted while the pipeline fragment
    is being executed. If used within an ipython shell, there is no dynamic
    plotting but a static plotting at the end of the pipeline fragment
    execution. A KeyboardInterrupt cancels the recording gracefully.

    The PCollections given must belong to the same pipeline.

    Raises:
      ValueError: if a positional argument is neither a dict, a PCollection,
          a deferred DataFrame/Series, nor an iterable.
      AssertionError: if a flattened item is not a PCollection, if no
          PCollection is given at all, or if `n` / `duration` are invalid.

    For example::

      p = beam.Pipeline(InteractiveRunner())
      init = p | 'Init' >> beam.Create(range(1000))
      square = init | 'Square' >> beam.Map(lambda x: x * x)
      cube = init | 'Cube' >> beam.Map(lambda x: x ** 3)

      # Below builds a pipeline fragment from the defined pipeline `p` that
      # contains only applied transforms of `Init` and `Square`. Then the
      # interactive runner runs the pipeline fragment implicitly to compute
      # data represented by PCollection `square` and visualizes it.
      show(square)

      # This is equivalent to `show(square)` because `square` depends on
      # `init` and `init` is included in the pipeline fragment and computed
      # anyway.
      show(init, square)

      # Below is similar to running `p.run()`. It computes data for both
      # PCollection `square` and PCollection `cube`, then visualizes them.
      show(square, cube)
    """
    # Flatten every positional argument into one list of candidates.
    flatten_pcolls = []
    for pcoll_container in pcolls:
        if isinstance(pcoll_container, dict):
            flatten_pcolls.extend(pcoll_container.values())
        elif isinstance(pcoll_container,
                        (beam.pvalue.PCollection, DeferredBase)):
            flatten_pcolls.append(pcoll_container)
        else:
            try:
                flatten_pcolls.extend(iter(pcoll_container))
            except TypeError:
                raise ValueError(
                    'The given pcoll %s is not a dict, an iterable or a PCollection.'
                    % pcoll_container)

    # Iterate through the given PCollections and convert any deferred DataFrames
    # or Series into PCollections.
    pcolls = []

    # The element type is used to help visualize the given PCollection. For the
    # deferred DataFrame/Series case it is the proxy of the frame.
    element_types = {}
    for pcoll in flatten_pcolls:
        is_deferred = isinstance(pcoll, DeferredBase)
        if is_deferred:
            proxy = pcoll._expr.proxy()
            pcoll = to_pcollection(pcoll,
                                   yield_elements='pandas',
                                   label=str(pcoll._expr))
            watch({'anonymous_pcollection_{}'.format(id(pcoll)): pcoll})

        # Validate BEFORE reading PCollection-only attributes or using the
        # value as a dict key, so a wrong input fails with this descriptive
        # message instead of an unrelated AttributeError/TypeError.
        assert isinstance(pcoll, beam.pvalue.PCollection), (
            '{} is not an apache_beam.pvalue.PCollection.'.format(pcoll))

        element_types[pcoll] = proxy if is_deferred else pcoll.element_type
        pcolls.append(pcoll)

    assert len(pcolls) > 0, (
        'Need at least 1 PCollection to show data visualization.')

    # All pcolls are required to share one pipeline, so the first one's
    # pipeline is used to look up the recording manager.
    user_pipeline = pcolls[0].pipeline

    if isinstance(n, str):
        assert n == 'inf', (
            'Currently only the string \'inf\' is supported. This denotes reading '
            'elements until the recording is stopped via a kernel interrupt.')
    elif isinstance(n, int):
        assert n > 0, 'n needs to be positive or the string \'inf\''

    if isinstance(duration, int):
        assert duration > 0, (
            'duration needs to be positive, a duration string, '
            'or the string \'inf\'')

    # Translate the 'inf' sentinel into an actual infinite limit.
    if n == 'inf':
        n = float('inf')

    if duration == 'inf':
        duration = float('inf')

    recording_manager = ie.current_env().get_recording_manager(
        user_pipeline, create_if_absent=True)
    recording = recording_manager.record(pcolls,
                                         max_n=n,
                                         max_duration=duration)

    # Catch a KeyboardInterrupt to gracefully cancel the recording and
    # visualizations.
    try:
        # If in notebook, static plotting computed pcolls as computation is done.
        if ie.current_env().is_in_notebook:
            for stream in recording.computed().values():
                visualize(stream,
                          include_window_info=include_window_info,
                          display_facets=visualize_data,
                          element_type=element_types[stream.pcoll])
        elif ie.current_env().is_in_ipython:
            for stream in recording.computed().values():
                visualize(stream,
                          include_window_info=include_window_info,
                          element_type=element_types[stream.pcoll])

        # Everything was already computed: nothing left to wait for.
        if recording.is_computed():
            return

        # If in notebook, dynamic plotting as computation goes.
        if ie.current_env().is_in_notebook:
            for stream in recording.uncomputed().values():
                visualize(stream,
                          dynamic_plotting_interval=1,
                          include_window_info=include_window_info,
                          display_facets=visualize_data,
                          element_type=element_types[stream.pcoll])

        # Invoke wait_until_finish to ensure the blocking nature of this API without
        # relying on the run to be blocking.
        recording.wait_until_finish()

        # If just in ipython shell, plotting once when the computation is completed.
        if ie.current_env(
        ).is_in_ipython and not ie.current_env().is_in_notebook:
            for stream in recording.computed().values():
                # Pass the element type here as well, matching the other
                # visualize calls, so deferred DataFrame/Series results keep
                # their proxy when plotted after the run completes.
                visualize(stream,
                          include_window_info=include_window_info,
                          element_type=element_types[stream.pcoll])

    except KeyboardInterrupt:
        if recording:
            recording.cancel()