Esempio n. 1
0
def run(argv=None):
    """Build and run the streaming word-count pipeline.

    Reads messages from a Pub/Sub topic or subscription, counts the
    occurrences of each word in fixed 15-second windows, and publishes every
    "<word>: <count>" result to the output Pub/Sub topic.

    Args:
        argv: Command-line arguments to parse. ``None`` makes argparse use
            ``sys.argv[1:]``. Arguments not declared here are forwarded to
            ``PipelineOptions``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--output_topic',
                        required=True,
                        help=('Output PubSub topic of the form '
                              '"projects/<PROJECT>/topics/<TOPIC>".'))
    # Exactly one input source must be supplied: a topic or a subscription.
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--input_topic',
                       help=('Input PubSub topic of the form '
                             '"projects/<PROJECT>/topics/<TOPIC>".'))
    group.add_argument(
        '--input_subscription',
        help=('Input PubSub subscription of the form '
              '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>".'))
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).streaming = True
    p = beam.Pipeline(options=pipeline_options)

    # Read from PubSub into a PCollection.
    # NOTE(review): ReadFromPubSub emits raw message payloads; confirm that
    # WordExtractingDoFn accepts that type, otherwise decode before 'split'.
    if known_args.input_subscription:
        lines = p | beam.io.ReadFromPubSub(
            subscription=known_args.input_subscription)
    else:
        lines = p | beam.io.ReadFromPubSub(topic=known_args.input_topic)

    # Count the occurrences of each word.
    def count_ones(word_ones):
        (word, ones) = word_ones
        return (word, sum(ones))

    counts = (
        lines
        | 'split' >>
        (beam.ParDo(WordExtractingDoFn()).with_output_types(six.text_type))
        | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
        | beam.WindowInto(window.FixedWindows(15, 0))
        | 'group' >> beam.GroupByKey()
        | 'count' >> beam.Map(count_ones))

    # Format the counts into a PCollection of strings.
    def format_result(word_count):
        (word, count) = word_count
        return '%s: %d' % (word, count)

    def encode_result(text):
        # WriteToPubSub publishes bytes payloads: encode text results and
        # pass through anything that is already a byte string.
        if isinstance(text, six.text_type):
            return text.encode('utf-8')
        return text

    output = (
        counts
        | 'format' >> beam.Map(format_result)
        | 'encode' >> beam.Map(encode_result).with_output_types(bytes))

    # Write to PubSub.
    # pylint: disable=expression-not-assigned
    output | beam.io.WriteToPubSub(known_args.output_topic)

    # Block until the (streaming) pipeline terminates.
    result = p.run()
    result.wait_until_finish()
def run(argv=None):
  """Build and run the streaming word-count debugging pipeline.

  Reads from a Pub/Sub topic or subscription, counts words in fixed 5-second
  windows, writes "<word>: <count>" strings to the output topic, and attaches
  ``assert_that`` checks on the output format and on the per-window contents.

  Args:
    argv: Command-line arguments to parse. ``None`` makes argparse use
      ``sys.argv[1:]``. Arguments not declared here are forwarded to
      ``PipelineOptions``.
  """
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--output_topic', required=True,
      help=('Output PubSub topic of the form '
            '"projects/<PROJECT>/topics/<TOPIC>".'))
  # Exactly one input source must be supplied: a topic or a subscription.
  group = parser.add_mutually_exclusive_group(required=True)
  group.add_argument(
      '--input_topic',
      help=('Input PubSub topic of the form '
            '"projects/<PROJECT>/topics/<TOPIC>".'))
  group.add_argument(
      '--input_subscription',
      help=('Input PubSub subscription of the form '
            '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>".'))
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  pipeline_options.view_as(StandardOptions).streaming = True
  with beam.Pipeline(options=pipeline_options) as p:

    # Read from PubSub into a PCollection.
    if known_args.input_subscription:
      lines = p | beam.io.ReadFromPubSub(
          subscription=known_args.input_subscription)
    else:
      lines = p | beam.io.ReadFromPubSub(topic=known_args.input_topic)

    # Count the occurrences of each word.
    def count_ones(word_ones):
      (word, ones) = word_ones
      return (word, sum(ones))

    counts = (lines
              | 'AddTimestampFn' >> beam.ParDo(AddTimestampFn())
              | 'After AddTimestampFn' >> beam.ParDo(
                  PrintFn('After AddTimestampFn'))
              | 'Split' >> (beam.ParDo(WordExtractingDoFn())
                            .with_output_types(unicode))
              | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
              | beam.WindowInto(window.FixedWindows(5, 0))
              | 'GroupByKey' >> beam.GroupByKey()
              | 'CountOnes' >> beam.Map(count_ones))

    # Format the counts into a PCollection of strings.
    def format_result(word_count):
      (word, count) = word_count
      return '%s: %d' % (word, count)

    output = counts | 'format' >> beam.Map(format_result)

    # Write to PubSub.
    # pylint: disable=expression-not-assigned
    output | beam.io.WriteStringsToPubSub(known_args.output_topic)

    def check_gbk_format():
      # A matcher that checks that the output of GBK is of the form word: count.
      def matcher(elements):
        # Each item is an (elements, window) pair; only the elements are
        # inspected here.
        actual_elements_in_window, _ = elements
        for elm in actual_elements_in_window:
          assert re.match(r'\S+:\s+\d+', elm) is not None
      return matcher

    # Check that the format of the output is correct.
    assert_that(
        output,
        check_gbk_format(),
        use_global_window=False,
        label='Assert word:count format.')

    # Check also that elements are output in the right window.
    # This expects exactly 1 occurrence of any subset of the elements
    # 150, 151, 152, 153, 154 in the window [150, 155)
    # or exactly 1 occurrence of any subset of the elements
    # 210, 211, 212, 213, 214 in the window [210, 215).
    expected_window_to_elements = {
        window.IntervalWindow(150, 155): [
            '150: 1', '151: 1', '152: 1', '153: 1', '154: 1',
        ],
        window.IntervalWindow(210, 215): [
            '210: 1', '211: 1', '212: 1', '213: 1', '214: 1',
        ],
    }

    # To pass, publish numbers in [150-155) or [210-215) with no repeats.
    # To fail, publish a repeated number in one of the ranges above.
    # For example: '210 213 151 213'
    assert_that(
        output,
        equal_to_per_window(expected_window_to_elements),
        use_global_window=False,
        label='Assert correct streaming windowing.')
Esempio n. 3
0
def run(argv=None):
    """Build and run the pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--output', required=True, help=('Output' 'op.csv'))
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--input_topic',
                       help=('Input PubSub topic of the form '
                             '"projects/<PROJECT>/topics/<TOPIC>".'))
    group.add_argument(
        '--input_subscription',
        help=('Input PubSub subscription of the form '
              '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."'))
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).streaming = True
    p = beam.Pipeline(options=pipeline_options)

    # Read from PubSub into a PCollection.
    if known_args.input_subscription:
        messages = (
            p
            | beam.io.ReadFromPubSub(subscription=known_args.input_subscription
                                     ).with_output_types(bytes))
    else:
        messages = (p
                    | beam.io.ReadFromPubSub(
                        topic=known_args.input_topic).with_output_types(bytes))

    lines = messages | 'decode' >> beam.Map(lambda x: x.decode('utf-8'))

    # Count the occurrences of each word.
    def count_ones(word_ones):
        (word, ones) = word_ones
        return (word, sum(ones))

    def format_result(word_count):
        (word, count) = word_count
        return '%s: %d' % (word, count)

    counts = (
        lines
        | 'split' >>
        (beam.ParDo(WordExtractingDoFn()).with_output_types(unicode))
        | 'pair_with_one' >> beam.Map(lambda x: (x, 1))

        #DISCARDING
        #| beam.WindowInto(window.SlidingWindows(30, 1))
        | 'window' >> beam.WindowInto(window.FixedWindows(30),
                                      trigger=AfterProcessingTime(20))
        #,trigger=AfterProcessingTime(20),accumulation_mode=AccumulationMode.DISCARDING)
        #| 'window' >> beam.WindowInto(window.GlobalWindows(),trigger=Repeatedly(AfterCount(3)),accumulation_mode=AccumulationMode.ACCUMULATING)
        | 'group' >> beam.GroupByKey()
        | 'count' >> beam.Map(count_ones)
        | 'format' >> beam.Map(format_result))

    counts | 'write' >> WriteToText(known_args.output)

    result = p.run()
    result.wait_until_finish()