Code example #1
File: convert_test.py  Project: piter75/apache-beam
    def test_convert_yield_pandas(self):
        with beam.Pipeline() as p:
            a = pd.Series([1, 2, 3])
            b = pd.Series([100, 200, 300])

            pc_a = p | 'A' >> beam.Create([a])
            pc_b = p | 'B' >> beam.Create([b])

            df_a = convert.to_dataframe(pc_a, proxy=a[:0])
            df_b = convert.to_dataframe(pc_b, proxy=b[:0])

            df_2a = 2 * df_a
            df_3a = 3 * df_a
            df_ab = df_a * df_b

            # Converting multiple results at a time can be more efficient.
            pc_2a, pc_ab = convert.to_pcollection(df_2a,
                                                  df_ab,
                                                  yield_elements='pandas')
            # But separate conversions can be done as well.
            pc_3a = convert.to_pcollection(df_3a, yield_elements='pandas')

            assert_that(pc_2a,
                        equal_to_unordered_series(2 * a),
                        label='Check2a')
            assert_that(pc_3a,
                        equal_to_unordered_series(3 * a),
                        label='Check3a')
            assert_that(pc_ab,
                        equal_to_unordered_series(a * b),
                        label='Checkab')
Code example #2
File: convert_test.py  Project: comtef/beam
  def test_convert_yield_pandas(self):
    def equal_to_unordered_series(expected):
      def check(actual):
        actual = pd.concat(actual)
        if sorted(expected) != sorted(actual):
          raise AssertionError(
              'Series not equal: \n%s\n%s\n' % (expected, actual))

      return check

    with beam.Pipeline() as p:
      a = pd.Series([1, 2, 3])
      b = pd.Series([100, 200, 300])

      pc_a = p | 'A' >> beam.Create([a])
      pc_b = p | 'B' >> beam.Create([b])

      df_a = convert.to_dataframe(pc_a, proxy=a[:0])
      df_b = convert.to_dataframe(pc_b, proxy=b[:0])

      df_2a = 2 * df_a
      df_3a = 3 * df_a
      df_ab = df_a * df_b

      # Converting multiple results at a time can be more efficient.
      pc_2a, pc_ab = convert.to_pcollection(df_2a, df_ab,
                                            yield_elements='pandas')
      # But separate conversions can be done as well.
      pc_3a = convert.to_pcollection(df_3a, yield_elements='pandas')

      assert_that(pc_2a, equal_to_unordered_series(2 * a), label='Check2a')
      assert_that(pc_3a, equal_to_unordered_series(3 * a), label='Check3a')
      assert_that(pc_ab, equal_to_unordered_series(a * b), label='Checkab')
Code example #3
    def test_dataframes_with_multi_index_get_result(self):
        p = beam.Pipeline(runner=interactive_runner.InteractiveRunner(
            direct_runner.DirectRunner()))

        data = [
            Record('a', 20, 170),
            Record('a', 30, 170),
            Record('b', 22, 180),
            Record('c', 18, 150)
        ]

        aggregate = lambda df: df.groupby(['name', 'height']).mean()['age']

        deferred_df = aggregate(to_dataframe(p | beam.Create(data)))
        df_expected = aggregate(pd.DataFrame(data))

        # Watch the local scope for Interactive Beam so that values will be cached.
        ib.watch(locals())

        # This is normally done in the interactive_utils when a transform is
        # applied but needs an IPython environment. So we manually run this here.
        ie.current_env().track_user_pipelines()

        pd.testing.assert_series_equal(df_expected,
                                       ib.collect(deferred_df, n=10))
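Record is not defined in this snippet. Judging from the constructor calls and the groupby(['name', 'height']).mean()['age'] aggregation, it is presumably a NamedTuple along these lines (a hypothetical sketch, not necessarily the exact definition used by the original test):

from typing import NamedTuple


# Hypothetical Record type, inferred from Record('a', 20, 170) and the
# groupby(['name', 'height']).mean()['age'] call above.
class Record(NamedTuple):
    name: str
    age: int
    height: int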
Code example #4
def run(argv=None):
    """Main entry point; defines and runs the wordcount pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # Import this here to avoid pickling the main session.
    import re

    # The pipeline will be run on exiting the with block.
    with beam.Pipeline(options=PipelineOptions(pipeline_args)) as p:

        # Read the text file[pattern] into a PCollection.
        lines = p | 'Read' >> ReadFromText(known_args.input)

        words = (
            lines
            | 'Split' >> beam.FlatMap(
                lambda line: re.findall(r'[\w]+', line)).with_output_types(str)
            # Map to Row objects to generate a schema suitable for conversion
            # to a dataframe.
            | 'ToRows' >> beam.Map(lambda word: beam.Row(word=word)))

        df = to_dataframe(words)
        df['count'] = 1
        counted = df.groupby('word').sum()
        counted.to_csv(known_args.output)
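The excerpt assumes the usual imports from the Beam DataFrame wordcount example; a sketch of what it presumably relies on (not shown in the excerpt):

# Presumed imports for the run() function above.
import argparse

import apache_beam as beam
from apache_beam.dataframe.convert import to_dataframe
from apache_beam.io import ReadFromText
from apache_beam.options.pipeline_options import PipelineOptions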
Code example #5
  def expand(self, input_pcolls):
    # Avoid circular import.
    from apache_beam.dataframe import convert

    # Convert inputs to a flat dict.
    input_dict = _flatten(input_pcolls)  # type: Dict[Any, PCollection]
    proxies = _flatten(self._proxy)
    input_frames = {
        k: convert.to_dataframe(pc, proxies[k])
        for k, pc in input_dict.items()
    }  # type: Dict[Any, DeferredFrame]

    # Apply the function.
    frames_input = _substitute(input_pcolls, input_frames)
    if isinstance(frames_input, dict):
      result_frames = self._func(**frames_input)
    elif isinstance(frames_input, tuple):
      result_frames = self._func(*frames_input)
    else:
      result_frames = self._func(frames_input)

    # Compute results as a tuple.
    result_frames_dict = _flatten(result_frames)
    keys = list(result_frames_dict.keys())
    result_frames_tuple = tuple(result_frames_dict[key] for key in keys)
    result_pcolls_tuple = convert.to_pcollection(
        *result_frames_tuple, label='Eval', always_return_tuple=True)

    # Convert back to the structure returned by self._func.
    result_pcolls_dict = dict(zip(keys, result_pcolls_tuple))
    return _substitute(result_frames, result_pcolls_dict)
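This expand appears to belong to apache_beam.dataframe.transforms.DataframeTransform, which applies a pandas-style function to schema'd PCollections. A minimal usage sketch under that assumption (the field names are illustrative):

import apache_beam as beam
from apache_beam.dataframe.transforms import DataframeTransform

with beam.Pipeline() as p:
    _ = (
        p
        | beam.Create([
            beam.Row(word='cat', count=1),
            beam.Row(word='dog', count=1),
            beam.Row(word='cat', count=1),
        ])
        # The callable receives deferred DataFrame(s) and must return
        # deferred DataFrame(s); expand() above wires up the conversions.
        | DataframeTransform(lambda df: df.groupby('word').sum())
        # With default options the grouped index ('word') is not included
        # in the output rows, only the summed fields.
        | beam.Map(print))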
Code example #6
    def run_scenario(self, input, func):
        expected = func(input)

        empty = input.iloc[0:0]
        input_placeholder = expressions.PlaceholderExpression(empty)
        input_deferred = frame_base.DeferredFrame.wrap(input_placeholder)
        actual_deferred = func(input_deferred)._expr.evaluate_at(
            expressions.Session({input_placeholder: input}))

        check_correct(expected, actual_deferred)

        with beam.Pipeline() as p:
            input_pcoll = p | beam.Create([input.iloc[::2], input.iloc[1::2]])
            input_df = convert.to_dataframe(input_pcoll, proxy=empty)
            output_df = func(input_df)

            output_proxy = output_df._expr.proxy()
            if isinstance(output_proxy, pd.core.generic.NDFrame):
                self.assertTrue(
                    output_proxy.iloc[:0].equals(expected.iloc[:0]),
                    ('Output proxy is incorrect:\n'
                     f'Expected:\n{expected.iloc[:0]}\n\n'
                     f'Actual:\n{output_proxy.iloc[:0]}'))
            else:
                self.assertEqual(type(output_proxy), type(expected))

            output_pcoll = convert.to_pcollection(output_df,
                                                  yield_elements='pandas')

            assert_that(output_pcoll,
                        lambda actual: check_correct(expected, concat(actual)))
Code example #7
    def expand(self, root):
        # TODO(robertwb): Handle streaming (with explicit schema).
        paths_pcoll = root | beam.Create([self.path])
        first = io.filesystems.FileSystems.match(
            [self.path], limits=[1])[0].metadata_list[0].path
        with io.filesystems.FileSystems.open(first) as handle:
            if not self.binary:
                handle = TextIOWrapper(handle)
            if self.incremental:
                sample = next(
                    self.reader(handle, *self.args,
                                **dict(self.kwargs, chunksize=100)))
            else:
                sample = self.reader(handle, *self.args, **self.kwargs)

        pcoll = (paths_pcoll
                 | fileio.MatchFiles(self.path)
                 | beam.Reshuffle()
                 | fileio.ReadMatches()
                 | beam.ParDo(
                     _ReadFromPandasDoFn(self.reader, self.args, self.kwargs,
                                         self.binary, self.incremental,
                                         self.splitter)))
        from apache_beam.dataframe import convert
        return convert.to_dataframe(pcoll,
                                    proxy=_prefix_range_index_with(
                                        ':', sample[:0]))
Code example #8
    def expand(self, root):
        paths_pcoll = root | beam.Create([self.path])
        match = io.filesystems.FileSystems.match([self.path], limits=[1])[0]
        if not match.metadata_list:
            # TODO(BEAM-12031): This should be allowed for streaming pipelines if
            # user provides an explicit schema.
            raise FileNotFoundError(f"Found no files that match {self.path!r}")
        first_path = match.metadata_list[0].path
        with io.filesystems.FileSystems.open(first_path) as handle:
            if not self.binary:
                handle = TextIOWrapper(handle)
            if self.incremental:
                sample = next(
                    self.reader(handle, *self.args,
                                **dict(self.kwargs, chunksize=100)))
            else:
                sample = self.reader(handle, *self.args, **self.kwargs)

        pcoll = (paths_pcoll
                 | fileio.MatchFiles(self.path)
                 | beam.Reshuffle()
                 | fileio.ReadMatches()
                 | beam.ParDo(
                     _ReadFromPandasDoFn(self.reader, self.args, self.kwargs,
                                         self.binary, self.incremental,
                                         self.splitter)))
        from apache_beam.dataframe import convert
        return convert.to_dataframe(pcoll,
                                    proxy=_prefix_range_index_with(
                                        ':', sample[:0]))
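This expand appears to back the pandas-style readers in apache_beam.dataframe.io (read_csv and friends), which return a deferred DataFrame directly. A minimal usage sketch; the path and column name are illustrative:

import apache_beam as beam
from apache_beam.dataframe.io import read_csv

with beam.Pipeline() as p:
    # read_csv expands roughly as above: it samples the first matching file
    # to build a proxy, then reads all matching files in parallel.
    df = p | read_csv('/tmp/input*.csv')  # illustrative path
    totals = df.groupby('category').sum()  # illustrative column name
    totals.to_csv('/tmp/output.csv')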
Code example #9
File: io_test.py  Project: justsh/incubator-beam
    def test_windowed_write(self):
        output = self.temp_dir()
        with beam.Pipeline() as p:
            pc = (p | beam.Create(
                [MyRow(timestamp=i, value=i % 3) for i in range(20)])
                  | beam.Map(lambda v: beam.window.TimestampedValue(
                      v, v.timestamp)).with_output_types(MyRow)
                  | beam.WindowInto(
                      beam.window.FixedWindows(10)).with_output_types(MyRow))

            deferred_df = convert.to_dataframe(pc)
            deferred_df.to_csv(output + 'out.csv', index=False)

        first_window_files = (f'{output}out.csv-'
                              f'{datetime.utcfromtimestamp(0).isoformat()}*')
        self.assertCountEqual(
            ['timestamp,value'] + [f'{i},{i%3}' for i in range(10)],
            set(self.read_all_lines(first_window_files, delete=True)))

        second_window_files = (f'{output}out.csv-'
                               f'{datetime.utcfromtimestamp(10).isoformat()}*')
        self.assertCountEqual(
            ['timestamp,value'] + [f'{i},{i%3}' for i in range(10, 20)],
            set(self.read_all_lines(second_window_files, delete=True)))

        # Check that we've read (and removed) every output file
        self.assertEqual(len(glob.glob(f'{output}out.csv*')), 0)
Code example #10
File: convert_test.py  Project: piter75/apache-beam
    def test_convert_non_deferred(self):
        with beam.Pipeline() as p:
            s1 = pd.Series([1, 2, 3])
            s2 = convert.to_dataframe(p | beam.Create([100, 200, 300]))

            pc1, pc2 = convert.to_pcollection(s1, s2, pipeline=p)
            assert_that(pc1, equal_to([1, 2, 3]), label='CheckNonDeferred')
            assert_that(pc2, equal_to([100, 200, 300]), label='CheckDeferred')
Code example #11
File: io.py  Project: nielm/beam
  def expand(self, pcoll):
    from apache_beam.dataframe import convert  # avoid circular import
    return {
        'files_written': self._writer_func(
            convert.to_dataframe(pcoll), *self._args, **self._kwargs)
        | beam.Map(
            lambda file_result: file_result.file_name).with_output_types(str)
    }
Code example #12
  def test_parse_dataframes(self):
    """Tests that it correctly parses a DataFrame.
    """
    deferred = to_dataframe(beam.Pipeline() | beam.Create([Record(0, 0, 0)]))

    els = [windowed_value(pd.DataFrame([Record(n, 0, 0)])) for n in range(10)]

    actual_df = utils.elements_to_df(
        els, element_type=deferred._expr.proxy()).reset_index(drop=True)
    expected_df = pd.concat([e.value for e in els], ignore_index=True)
    pd.testing.assert_frame_equal(actual_df, expected_df)
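windowed_value is a helper from the same test module that is not shown here; a hypothetical sketch consistent with how e.value is consumed above:

from apache_beam.transforms.window import GlobalWindow
from apache_beam.utils.windowed_value import WindowedValue


def windowed_value(value):
    # Hypothetical helper: wrap the DataFrame in a WindowedValue (global
    # window, timestamp 0) so that e.value yields the original frame.
    return WindowedValue(value, 0, [GlobalWindow()])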
Code example #13
def run(argv=None):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/shakespeare/kinglear.txt',
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      required=True,
      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # Import this here to avoid pickling the main session.
  import re

  # The pipeline will be run on exiting the with block.
  with beam.Pipeline(options=PipelineOptions(pipeline_args)) as p:

    # Read the text file[pattern] into a PCollection.
    lines = p | 'Read' >> ReadFromText(known_args.input)

    words = (
        lines
        | 'Split' >> beam.FlatMap(
            lambda line: re.findall(r'[\w]+', line)).with_output_types(str)
        # Map to Row objects to generate a schema suitable for conversion
        # to a dataframe.
        | 'ToRows' >> beam.Map(lambda word: beam.Row(word=word)))

    df = to_dataframe(words)
    df['count'] = 1
    counted = df.groupby('word').sum()
    counted.to_csv(known_args.output)

    # Deferred DataFrames can also be converted back to schema'd PCollections
    counted_pc = to_pcollection(counted, include_indexes=True)

    # Print out every word that occurred >50 times
    _ = (
        counted_pc
        | beam.Filter(lambda row: row.count > 50)
        | beam.Map(lambda row: f'{row.word}: {row.count}')
        | beam.Map(print))
    """

if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()
Code example #14
File: convert_test.py  Project: piter75/apache-beam
    def test_convert_memoization(self):
        with beam.Pipeline() as p:
            a = pd.Series([1, 2, 3])
            b = pd.Series([100, 200, 300])

            pc_a = p | 'A' >> beam.Create([a])
            pc_b = p | 'B' >> beam.Create([b])

            df_a = convert.to_dataframe(pc_a, proxy=a[:0])
            df_b = convert.to_dataframe(pc_b, proxy=b[:0])

            df_2a = 2 * df_a
            df_3a = 3 * df_a
            df_ab = df_a * df_b

            # Two calls to to_pcollection with the same Dataframe should produce the
            # same PCollection(s)
            pc_2a_, pc_ab_ = convert.to_pcollection(df_2a, df_ab)
            pc_3a, pc_2a, pc_ab = convert.to_pcollection(df_3a, df_2a, df_ab)

            self.assertIs(pc_2a, pc_2a_)
            self.assertIs(pc_ab, pc_ab_)
            self.assertIsNot(pc_3a, pc_2a)
            self.assertIsNot(pc_3a, pc_ab)

            # The same conversions without the unbatching transform should also cache
            # PCollections
            pc_2a_pandas_, pc_ab_pandas_ = convert.to_pcollection(
                df_2a, df_ab, yield_elements='pandas')
            pc_3a_pandas, pc_2a_pandas, pc_ab_pandas = convert.to_pcollection(
                df_3a, df_2a, df_ab, yield_elements='pandas')

            self.assertIs(pc_2a_pandas, pc_2a_pandas_)
            self.assertIs(pc_ab_pandas, pc_ab_pandas_)
            self.assertIsNot(pc_3a_pandas, pc_2a_pandas)
            self.assertIsNot(pc_3a_pandas, pc_ab_pandas)

            # .. but the cached PCollections should be different
            self.assertIsNot(pc_2a_pandas, pc_2a)
            self.assertIsNot(pc_ab_pandas, pc_ab)
            self.assertIsNot(pc_3a_pandas, pc_3a)
Code example #15
    def test_batching_beam_row_to_dataframe(self):
        with beam.Pipeline() as p:
            df = convert.to_dataframe(
                p
                | beam.Create([(u'Falcon', 380.), (u'Falcon', 370.),
                               (u'Parrot', 24.), (u'Parrot', 26.)])
                | beam.Map(lambda tpl: beam.Row(Animal=tpl[0], Speed=tpl[1])))

            result = convert.to_pcollection(df.groupby('Animal').mean(),
                                            include_indexes=True)

            assert_that(result, equal_to([('Falcon', 375.), ('Parrot', 25.)]))
Code example #16
File: io_test.py  Project: mszb/beam
    def test_double_write(self):
        output = self.temp_dir()
        with beam.Pipeline() as p:
            pc1 = p | 'create pc1' >> beam.Create(
                [SimpleRow(value=i) for i in [1, 2]])
            pc2 = p | 'create pc2' >> beam.Create(
                [SimpleRow(value=i) for i in [3, 4]])

            deferred_df1 = convert.to_dataframe(pc1)
            deferred_df2 = convert.to_dataframe(pc2)

            deferred_df1.to_csv(f'{output}out1.csv',
                                transform_label="Writing to csv PC1",
                                index=False)
            deferred_df2.to_csv(f'{output}out2.csv',
                                transform_label="Writing to csv PC2",
                                index=False)

        self.assertCountEqual(['value', '1', '2'],
                              set(self.read_all_lines(output + 'out1.csv*')))
        self.assertCountEqual(['value', '3', '4'],
                              set(self.read_all_lines(output + 'out2.csv*')))
Code example #17
File: flight_delays.py  Project: nielm/beam
def run_flight_delay_pipeline(pipeline,
                              start_date=None,
                              end_date=None,
                              output=None):
    query = f"""
  SELECT
    FlightDate AS date,
    IATA_CODE_Reporting_Airline AS airline,
    Origin AS departure_airport,
    Dest AS arrival_airport,
    DepDelay AS departure_delay,
    ArrDelay AS arrival_delay
  FROM `apache-beam-testing.airline_ontime_data.flights`
  WHERE
    FlightDate >= '{start_date}' AND FlightDate <= '{end_date}' AND
    DepDelay IS NOT NULL AND ArrDelay IS NOT NULL
  """

    # Import this here to avoid pickling the main session.
    import time
    from apache_beam import window

    def to_unixtime(s):
        return time.mktime(s.timetuple())

    # The pipeline will be run on exiting the with block.
    with pipeline as p:
        tbl = (
            p
            | 'read table' >> beam.io.ReadFromBigQuery(query=query,
                                                       use_standard_sql=True)
            | 'assign timestamp' >> beam.Map(
                lambda x: window.TimestampedValue(x, to_unixtime(x['date'])))
            # Use beam.Select to make sure data has a schema
            # The casts in lambdas ensure data types are properly inferred
            | 'set schema' >> beam.Select(
                date=lambda x: str(x['date']),
                airline=lambda x: str(x['airline']),
                departure_airport=lambda x: str(x['departure_airport']),
                arrival_airport=lambda x: str(x['arrival_airport']),
                departure_delay=lambda x: float(x['departure_delay']),
                arrival_delay=lambda x: float(x['arrival_delay'])))

        daily = tbl | 'daily windows' >> beam.WindowInto(
            beam.window.FixedWindows(60 * 60 * 24))

        # group the flights data by carrier
        df = to_dataframe(daily)
        result = df.groupby('airline').apply(get_mean_delay_at_top_airports)
        result.to_csv(output)
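get_mean_delay_at_top_airports is defined elsewhere in flight_delays.py and is not reproduced here. A hypothetical stand-in with the same contract (it receives one airline's rows as a DataFrame and returns a DataFrame of aggregated delays), purely to illustrate what groupby('airline').apply(...) expects:

def get_mean_delay_at_top_airports(airline_df):
    # Hypothetical stand-in, not the implementation from flight_delays.py:
    # average the delay columns selected above, per departure airport.
    return (
        airline_df
        .groupby('departure_airport')[['departure_delay', 'arrival_delay']]
        .mean())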
Code example #18
File: convert_test.py  Project: piter75/apache-beam
    def test_convert(self):
        with beam.Pipeline() as p:
            a = pd.Series([1, 2, 3])
            b = pd.Series([100, 200, 300])

            pc_a = p | 'A' >> beam.Create(a)
            pc_b = p | 'B' >> beam.Create(b)

            df_a = convert.to_dataframe(pc_a)
            df_b = convert.to_dataframe(pc_b)

            df_2a = 2 * df_a
            df_3a = 3 * df_a
            df_ab = df_a * df_b

            # Converting multiple results at a time can be more efficient.
            pc_2a, pc_ab = convert.to_pcollection(df_2a, df_ab)
            # But separate conversions can be done as well.
            pc_3a = convert.to_pcollection(df_3a)

            assert_that(pc_2a, equal_to(list(2 * a)), label='Check2a')
            assert_that(pc_3a, equal_to(list(3 * a)), label='Check3a')
            assert_that(pc_ab, equal_to(list(a * b)), label='Checkab')
Code example #19
  def test_batching_beam_row_to_dataframe(self):
    with beam.Pipeline() as p:
      df = convert.to_dataframe(
          p
          | beam.Create([(u'Falcon', 380.), (u'Falcon', 370.),
                         (u'Parrot', 24.), (u'Parrot', 26.)])
          | beam.Map(lambda tpl: beam.Row(Animal=tpl[0], Speed=tpl[1])))

      result = convert.to_pcollection(df.groupby('Animal').mean())

      assert_that(
          result,
          df_equal_to(
              pd.DataFrame({
                  'Animal': ['Falcon', 'Parrot'], 'Speed': [375., 25.]
              }).set_index('Animal')))
Code example #20
  def expand(self, root):
    # TODO(robertwb): Handle streaming (with explicit schema).
    paths_pcoll = root | beam.Create([self.path])
    first = io.filesystems.FileSystems.match(
        [self.path], limits=[1])[0].metadata_list[0].path
    with io.filesystems.FileSystems.open(first) as handle:
      df = next(self.reader(handle, *self.args, chunksize=100, **self.kwargs))

    pcoll = (
        paths_pcoll
        | fileio.MatchFiles(self.path)
        | fileio.ReadMatches()
        | beam.ParDo(_ReadFromPandasDoFn(self.reader, self.args, self.kwargs)))
    from apache_beam.dataframe import convert
    return convert.to_dataframe(
        pcoll, proxy=_prefix_range_index_with(':', df[:0]))
Code example #21
  def test_dataframes(self):
    p = beam.Pipeline(
        runner=interactive_runner.InteractiveRunner(
            direct_runner.DirectRunner()))
    data = p | beam.Create(
        [1, 2, 3]) | beam.Map(lambda x: beam.Row(square=x * x, cube=x * x * x))
    df = to_dataframe(data)

    # Watch the local scope for Interactive Beam so that values will be cached.
    ib.watch(locals())

    # This is normally done in the interactive_utils when a transform is
    # applied but needs an IPython environment. So we manually run this here.
    ie.current_env().track_user_pipelines()

    df_expected = pd.DataFrame({'square': [1, 4, 9], 'cube': [1, 8, 27]})
    pd.testing.assert_frame_equal(
        df_expected, ib.collect(df, n=10).reset_index(drop=True))
Code example #22
File: io.py  Project: fernando-wizeline/beam
    def expand(self, root):
        paths_pcoll = root | beam.Create([self.path])
        match = io.filesystems.FileSystems.match([self.path], limits=[1])[0]
        if not match.metadata_list:
            # TODO(BEAM-12031): This should be allowed for streaming pipelines if
            # user provides an explicit schema.
            raise FileNotFoundError(f"Found no files that match {self.path!r}")
        first_path = match.metadata_list[0].path
        with io.filesystems.FileSystems.open(first_path) as handle:
            if not self.binary:
                handle = TextIOWrapper(handle)
            if self.incremental:
                sample = next(
                    self.reader(handle, *self.args,
                                **dict(self.kwargs, chunksize=100)))
            else:
                sample = self.reader(handle, *self.args, **self.kwargs)

        matches_pcoll = paths_pcoll | fileio.MatchAll()
        indices_pcoll = (
            matches_pcoll.pipeline
            | 'DoOnce' >> beam.Create([None])
            | beam.Map(
                lambda _, paths:
                {path: ix
                 for ix, path in enumerate(sorted(paths))},
                paths=beam.pvalue.AsList(matches_pcoll
                                         | beam.Map(lambda match: match.path)))
        )

        pcoll = (matches_pcoll
                 | beam.Reshuffle()
                 | fileio.ReadMatches()
                 | beam.ParDo(
                     _ReadFromPandasDoFn(self.reader, self.args, self.kwargs,
                                         self.binary, self.incremental,
                                         self.splitter),
                     path_indices=beam.pvalue.AsSingleton(indices_pcoll)))
        from apache_beam.dataframe import convert
        return convert.to_dataframe(pcoll, proxy=sample[:0])
Code example #23
    def test_dataframe_caching(self, cell):

        # Create a pipeline that exercises the DataFrame API. This will also use
        # caching in the background.
        with cell:  # Cell 1
            p = beam.Pipeline(interactive_runner.InteractiveRunner())
            ib.watch({'p': p})

        with cell:  # Cell 2
            data = p | beam.Create([
                1, 2, 3
            ]) | beam.Map(lambda x: beam.Row(square=x * x, cube=x * x * x))

            with beam.dataframe.allow_non_parallel_operations():
                df = to_dataframe(data).reset_index(drop=True)

            ib.collect(df)

        with cell:  # Cell 3
            df['output'] = df['square'] * df['cube']
            ib.collect(df)

        with cell:  # Cell 4
            df['output'] = 0
            ib.collect(df)

        # We use a trace through the graph to perform an isomorphism test. The end
        # output should look like a linear graph. This indicates that the dataframe
        # transform was correctly broken into separate pieces to cache. If caching
        # isn't enabled, all the dataframe computation nodes are connected to a
        # single shared node.
        trace = []

        # Only look at the top-level transforms for the isomorphism. The test
        # doesn't care about the transform implementations, just the overall shape.
        class TopLevelTracer(beam.pipeline.PipelineVisitor):
            def _find_root_producer(self,
                                    node: beam.pipeline.AppliedPTransform):
                if node is None or not node.full_label:
                    return None

                parent = self._find_root_producer(node.parent)
                if parent is None:
                    return node

                return parent

            def _add_to_trace(self, node, trace):
                if '/' not in str(node):
                    if node.inputs:
                        producer = self._find_root_producer(
                            node.inputs[0].producer)
                        producer_name = producer.full_label if producer else ''
                        trace.append((producer_name, node.full_label))

            def visit_transform(self, node: beam.pipeline.AppliedPTransform):
                self._add_to_trace(node, trace)

            def enter_composite_transform(
                    self, node: beam.pipeline.AppliedPTransform):
                self._add_to_trace(node, trace)

        p.visit(TopLevelTracer())

        # Do the isomorphism test which states that the topological sort of the
        # graph yields a linear graph.
        trace_string = '\n'.join(str(t) for t in trace)
        prev_producer = ''
        for producer, consumer in trace:
            self.assertEqual(producer, prev_producer, trace_string)
            prev_producer = consumer
Code example #24
File: convert_test.py  Project: piter75/apache-beam
    def test_convert_scalar(self):
        with beam.Pipeline() as p:
            pc = p | 'A' >> beam.Create([1, 2, 3])
            s = convert.to_dataframe(pc)
            pc_sum = convert.to_pcollection(s.sum())
            assert_that(pc_sum, equal_to([6]))
Code example #25
def run():

    options = MyOptions()

    with beam.Pipeline(options=options) as p:

        immigration_data = (
            p
            | "Read Immigration Data" >> beam.io.parquetio.ReadFromParquet(
                p.options.input_dir.get() +
                "data.parquet\*").with_output_types(ImmigrationData)
            | "Immigration dictionary collection to row" >>
            beam.Map(ToRowImmigration))

        df_immigration = to_dataframe(immigration_data)

        cities_data = (
            p
            | "Read city data" >> beam.io.ReadFromText(
                p.options.input_dir.get() + "us-cities-demographics.csv",
                skip_header_lines=1,
            )
            | "Parse city data" >> beam.ParDo(SplitCityData())
            | "Get average demographics per State" >> beam.CombinePerKey(
                AverageDictFn())
            | "Key to Column" >> beam.ParDo(
                OrganizeCityData()).with_output_types(CityData)
            | "City dictionary collection to row" >> beam.Map(ToRowCity))

        df_cities = to_dataframe(cities_data)

        df_immigration = df_immigration[df_immigration["i94addr"].notna()]
        df_immigration = df_immigration.join(df_cities, rsuffix="_city")

        airport_data = (
            p
            | "Read airport data" >> beam.io.ReadFromText(
                p.options.input_dir.get() + "airport-codes_csv_2.csv",
                skip_header_lines=1,
            )
            | "Parse airport data" >> beam.ParDo(
                SplitAirportData()).with_output_types(AirportData)
            | "Airport dictionary collection to row" >> beam.Map(ToRowAirport))

        df_airport = to_dataframe(airport_data)

        df_immigration = df_immigration[df_immigration["i94port"].notna()]
        join_data = df_immigration.join(df_airport, rsuffix="_airport")

        join_data = to_pcollection(join_data, include_indexes=False)

        # ---#---#---# TODO #---#---#---#
        # -------- Join temperature data to dataset --------

        # temperature_data = (
        #     p | "Read city data" >> beam.io.ReadFromText('GlobalLandTemperaturesByCity.csv', skip_header_lines=1) |
        #     "Parse temperature data" >> beam.ParDo(SplitTempData()).with_output_types(TemperatureData)  |
        #     'Temperature dictionary filtered and to row' >> beam.ParDo(FilterAndToRowTemperature())
        # )

        # -------- NotImplementedError: groupby(as_index=False) and drop_duplicates() --------
        # df_immigration_2 = df_immigration.filter(
        #         items=['arrdate','i94mon','municipality','i94port']
        #     ).groupby(
        #         by = ['arrdate','i94mon','municipality','i94port'], as_index = False
        #     )

        # df_temperature = to_dataframe(temperature_data)

        # df_temperature_2 = df_immigration_2.join(df_temperature,
        #                                 (df_immigration_2.municipality == df_temperature.municipality) \
        #                                 & (df_immigration_2.i94mon == df_temperature.month ), 'left')

        output = join_data | "Save data to file" >> beam.io.WriteToText(
            p.options.output_dir.get())
Code example #26
    def _run_read_write_test(self,
                             format,
                             read_kwargs={},
                             write_kwargs={},
                             check_options={},
                             requires=()):

        for module in requires:
            try:
                importlib.import_module(module)
            except ImportError:
                raise unittest.SkipTest('Missing dependency: %s' % module)
        small = pd.DataFrame({
            'label': ['11a', '37a', '389a'],
            'rank': [0, 1, 2]
        })
        big = pd.DataFrame({'number': list(range(1000))})
        big['float'] = big.number.map(math.sqrt)
        big['text'] = big.number.map(lambda n: 'f' + 'o' * n)

        def frame_equal_to(expected_, check_index=True, check_names=True):
            def check(actual):
                expected = expected_
                try:
                    actual = pd.concat(actual)
                    if not check_index:
                        expected = expected.sort_values(list(
                            expected.columns)).reset_index(drop=True)
                        actual = actual.sort_values(list(
                            actual.columns)).reset_index(drop=True)
                    if not check_names:
                        actual = actual.rename(columns=dict(
                            zip(actual.columns, expected.columns)))
                    return assert_frame_equal(expected,
                                              actual,
                                              check_like=True)
                except:
                    print("EXPECTED")
                    print(expected)
                    print("ACTUAL")
                    print(actual)
                    raise

            return check

        for df in (small, big):
            with tempfile.TemporaryDirectory() as dir:
                dest = os.path.join(dir, 'out')
                try:
                    with beam.Pipeline() as p:
                        deferred_df = convert.to_dataframe(
                            p | beam.Create([df[::3], df[1::3], df[2::3]]),
                            proxy=df[:0])
                        # This does the write.
                        getattr(deferred_df, 'to_%s' % format)(dest,
                                                               **write_kwargs)
                    with beam.Pipeline() as p:
                        # Now do the read.
                        # TODO(robertwb): Allow reading from pcoll of paths to do it all in
                        # one pipeline.

                        result = convert.to_pcollection(
                            p | getattr(io, 'read_%s' % format)(
                                dest + '*', **read_kwargs),
                            yield_elements='pandas')
                        assert_that(result,
                                    frame_equal_to(df, **check_options))
                except:
                    os.system('head -n 100 ' + dest + '*')
                    raise
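A concrete test would call this helper with a format name and per-format options. A hypothetical invocation (the option choices are illustrative and not taken from the original test module):

    def test_csv_roundtrip(self):
        # Hypothetical invocation: write with to_csv (no index column) and
        # read back with read_csv; ignore row order and index when comparing.
        self._run_read_write_test(
            'csv',
            write_kwargs={'index': False},
            check_options={'check_index': False})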