Example #1
    def test_read_messages_timestamp_attribute_rfc3339_success(
            self, mock_pubsub):
        payload = 'payload'
        message_id = 'message_id'
        attributes = {'time': '2018-03-12T13:37:01.234567Z'}
        publish_time = '2018-03-12T13:37:01.234567Z'
        data = [
            create_client_message(payload, message_id, attributes,
                                  publish_time)
        ]
        expected_data = [
            TestWindowedValue(
                PubsubMessage(payload, attributes),
                timestamp.Timestamp.from_rfc3339(attributes['time']),
                [window.GlobalWindow()]),
        ]

        mock_pubsub.Client = functools.partial(FakePubsubClient, data)
        mock_pubsub.subscription.AutoAck = FakeAutoAck

        p = TestPipeline()
        p.options.view_as(StandardOptions).streaming = True
        pcoll = (p
                 | ReadFromPubSub('projects/fakeprj/topics/a_topic',
                                  None,
                                  'a_label',
                                  with_attributes=True,
                                  timestamp_attribute='time'))
        assert_that(pcoll, equal_to(expected_data), reify_windows=True)
        p.run()
Example #2
    def _run_pubsub_bq_pipeline(self, method, triggering_frequency=None):
        numbers = list(range(self._SIZE))

        matchers = [
            PipelineStateMatcher(PipelineState.RUNNING),
            BigqueryFullResultStreamingMatcher(project=self.project,
                                               query="SELECT number FROM %s" %
                                               self.output_table,
                                               data=[(i, ) for i in numbers])
        ]

        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=hc.all_of(*matchers),
            wait_until_finish_duration=self.WAIT_UNTIL_FINISH_DURATION,
            experiments='use_beam_bq_sink',
            streaming=True)

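        # Wrap each Pub/Sub payload in a row dict keyed by the BigQuery column
        # name ('number') expected by the result matcher above.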
        def add_schema_info(element):
            yield {'number': element}

        messages = [str(i).encode('utf-8') for i in numbers]
        for message in messages:
            self.pub_client.publish(self.input_topic.name, message)

        with beam.Pipeline(argv=args) as p:
            rows = (p
                    | ReadFromPubSub(subscription=self.input_sub.name)
                    | beam.ParDo(add_schema_info))
            _ = rows | WriteToBigQuery(
                self.output_table,
                schema=self.SCHEMA,
                method=method,
                triggering_frequency=triggering_frequency)
Example #3
    def test_expand_with_subscription(self):
        p = TestPipeline()
        p.options.view_as(StandardOptions).streaming = True
        pcoll = (p
                 | ReadFromPubSub(
                     None,
                     'projects/fakeprj/subscriptions/a_subscription',
                     'a_label',
                     with_attributes=False,
                     timestamp_attribute=None)
                 | beam.Map(lambda x: x))
        self.assertEqual(bytes, pcoll.element_type)

        # Apply the necessary PTransformOverrides.
        overrides = _get_transform_overrides(p.options)
        p.replace_all(overrides)

        # Note that the direct output of ReadFromPubSub will be replaced
        # by a PTransformOverride, so we use a no-op Map.
        read_transform = pcoll.producer.inputs[0].producer.transform

        # Ensure that the properties passed through correctly
        source = read_transform._source
        self.assertEqual('a_subscription', source.subscription_name)
        self.assertEqual('a_label', source.id_label)
Example #4
    def expand(self, input):
        # [START EXERCISE 3]:
        # Docs: https://beam.apache.org/documentation/sdks/pydoc/2.5.0/apache_beam.io.gcp.pubsub.html

        # Determine whether to use files or topic based on options.
        if self.args.input:
            return (
                input
                # Read game events from files (see exercise 2). Remember to parse the
                # events and to assign timestamps with the TimestampedValue transform.
                | beam.io.ReadFromText(self.args.input)
                | ParDo(ParseEventFn())
                | beam.Map(lambda element: TimestampedValue(
                    element, element[self.TIMESTAMP_ATTRIBUTE])))
        else:
            return (
                input
                # Read game events from the Pub/Sub topic self.args.topic, using custom
                # timestamps extracted from the Pub/Sub attribute TIMESTAMP_ATTRIBUTE.
                # ReadFromPubSub() takes the topic and timestamp_attribute parameters.
                # https://beam.apache.org/documentation/sdks/python-streaming/
                | ReadFromPubSub(self.args.topic,
                                 timestamp_attribute=self.TIMESTAMP_ATTRIBUTE)

                # Parse the messages the same way as when they come from the text file.
                # Note that we no longer need a timestamp-assignment step, since the
                # timestamps are already set by ReadFromPubSub.
                | ParDo(ParseEventFn()))
Example #5
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input_topic',
        dest='input_topic',
        help='Input topic in the form projects/<project>/topics/<topic>')
    parser.add_argument('--output',
                        dest='output_file',
                        help='Output file to write to')
    parser.add_argument('--table', dest='table_name', help='BQ table name')
    parser.add_argument('--dataset', dest='dataset_id', help='BQ dataset')
    parser.add_argument('--project_id', dest='project_id', help='Project ID')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend(['--project=<your-project>'])
    pipeline_options = PipelineOptions(pipeline_args)

    with beam.Pipeline(options=pipeline_options) as p:
        lines = p | ReadFromPubSub(topic=known_args.input_topic)

        def str_to_dict(str_line):
            import pandas as pd
            df_rows = eval(str_line)
            pd.DataFrame.from_dict(df_rows)
            bq_rows = eval(re.sub(r'\[|\]', '', str_line.decode('utf-8')))
            logging.info(bq_rows)
            return bq_rows

        lines = lines | 'String to dict' >> beam.Map(str_to_dict)
        lines = lines | 'Output to BQ' >> WriteToBigQuery(
            table=known_args.table_name,
            dataset=known_args.dataset_id,
            project=known_args.project_id)
Example #6
    def test_read_messages_timestamp_attribute_fail_parse(self, mock_pubsub):
        data = b'data'
        attributes = {'time': '1337 unparseable'}
        publish_time_secs = 1520861821
        publish_time_nanos = 234567000
        ack_id = 'ack_id'
        pull_response = test_utils.create_pull_response([
            test_utils.PullResponseMessage(data, attributes, publish_time_secs,
                                           publish_time_nanos, ack_id)
        ])
        mock_pubsub.return_value.pull.return_value = pull_response

        options = PipelineOptions([])
        options.view_as(StandardOptions).streaming = True
        p = TestPipeline(options=options)
        _ = (p
             | ReadFromPubSub('projects/fakeprj/topics/a_topic',
                              None,
                              None,
                              with_attributes=True,
                              timestamp_attribute='time'))
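        # The 'time' attribute value is neither millis-since-epoch nor RFC 3339,
        # so the read fails with a parse error and nothing gets acknowledged.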
        with self.assertRaisesRegex(ValueError, r'parse'):
            p.run()
        mock_pubsub.return_value.acknowledge.assert_not_called()

        mock_pubsub.return_value.api.transport.channel.close.assert_has_calls(
            [mock.call()])
Example #7
    def test_read_messages_timestamp_attribute_milli_success(
            self, mock_pubsub):
        data = b'data'
        attributes = {'time': '1337'}
        publish_time_secs = 1520861821
        publish_time_nanos = 234567000
        ack_id = 'ack_id'
        pull_response = test_utils.create_pull_response([
            test_utils.PullResponseMessage(data, attributes, publish_time_secs,
                                           publish_time_nanos, ack_id)
        ])
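        # A numeric timestamp attribute is interpreted as milliseconds since the
        # epoch, hence the conversion to microseconds below.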
        expected_elements = [
            TestWindowedValue(
                PubsubMessage(data, attributes),
                timestamp.Timestamp(micros=int(attributes['time']) * 1000),
                [window.GlobalWindow()]),
        ]
        mock_pubsub.return_value.pull.return_value = pull_response

        options = PipelineOptions([])
        options.view_as(StandardOptions).streaming = True
        with TestPipeline(options=options) as p:
            pcoll = (p
                     | ReadFromPubSub('projects/fakeprj/topics/a_topic',
                                      None,
                                      None,
                                      with_attributes=True,
                                      timestamp_attribute='time'))
            assert_that(pcoll, equal_to(expected_elements), reify_windows=True)
        mock_pubsub.return_value.acknowledge.assert_has_calls(
            [mock.call(subscription=mock.ANY, ack_ids=[ack_id])])

        mock_pubsub.return_value.close.assert_has_calls([mock.call()])
Example #8
def run(argv=None, save_main_session=True):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      dest='input',
                      required=True,
                      help='Input Pub/Sub subscription to read from.')
  parser.add_argument('--output',
                      dest='output',
                      required=True,
                      help='Output BigQuery table to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
  p = beam.Pipeline(options=pipeline_options)

  # Read the text file[pattern] into a PCollection.
  (p | 'read' >> ReadFromPubSub(subscription=known_args.input)
     | 'extract words' >> beam.FlatMap(extract_words)
     | 'transform to kv' >> beam.Map(lambda x: (x,1))
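     # Fixed 5-second windows, triggered 10 seconds of processing time after
     # the first element in each pane; fired panes are discarded.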
     | 'window per minute' >> beam.WindowInto(
                                window.FixedWindows(5),
                                trigger=trigger.AfterProcessingTime(delay=10),
                                accumulation_mode=trigger.AccumulationMode.DISCARDING)
     | 'group by words' >> beam.GroupByKey()
     | 'count ones' >> beam.Map(count_ones)
     | 'format for bq' >> beam.Map(format_for_bigquery)
     | 'write to bigquery' >> WriteToBigQuery(table=known_args.output))

  result = p.run()
  result.wait_until_finish()
Example #9
  def test_expand(self):
    options = PipelineOptions([])
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)
    pcoll = (
        p
        | ReadFromPubSub('projects/fakeprj/topics/baz')
        | WriteToPubSub(
            'projects/fakeprj/topics/a_topic', with_attributes=True)
        | beam.Map(lambda x: x))

    # Apply the necessary PTransformOverrides.
    overrides = _get_transform_overrides(options)
    p.replace_all(overrides)

    # Note that the direct output of ReadFromPubSub will be replaced
    # by a PTransformOverride, so we use a no-op Map.
    write_transform = pcoll.producer.inputs[0].producer.transform

    # Ensure that the properties passed through correctly
    self.assertEqual('a_topic', write_transform.dofn.short_topic_name)
    self.assertEqual(True, write_transform.dofn.with_attributes)
    # TODO(BEAM-4275): These properties aren't supported yet in direct runner.
    self.assertEqual(None, write_transform.dofn.id_label)
    self.assertEqual(None, write_transform.dofn.timestamp_attribute)
Example #10
def run(run_local):
    JOB_NAME = 'firestore-stream-{}'.format(
        datetime.datetime.now().strftime('%Y-%m-%d-%H%M%S'))

    pipeline_options = {
        'project': PROJECT,
        'staging_location': 'gs://' + BUCKET + '/staging',
        'runner': 'DataflowRunner',
        'job_name': JOB_NAME,
        'disk_size_gb': 100,
        'temp_location': 'gs://' + BUCKET + '/temp',
        'save_main_session': True,
        'requirements_file': 'requirements.txt',
        'streaming': True
    }

    if run_local:
        pipeline_options['runner'] = 'DirectRunner'

    options = PipelineOptions.from_dictionary(pipeline_options)

    p = beam.Pipeline(options=options)
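    # Pub/Sub delivers raw bytes; each payload is parsed as JSON downstream.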
    crawled_features = (p
                        | 'ReadPubsub' >> ReadFromPubSub(
                            topic=PUBSUB_TOPIC).with_output_types(bytes)
                        | 'JSONParse' >> beam.Map(lambda x: json.loads(x)))

    firebase_stream = (crawled_features
                       | 'WriteFirebase' >> beam.ParDo(FirestoreWriteDoFn()))

    p.run()
Example #11
    def test_expand_with_other_options(self):
        options = PipelineOptions([])
        options.view_as(StandardOptions).streaming = True
        p = TestPipeline(options=options)
        pcoll = (p
                 | ReadFromPubSub('projects/fakeprj/topics/a_topic',
                                  None,
                                  'a_label',
                                  with_attributes=True,
                                  timestamp_attribute='time')
                 | beam.Map(lambda x: x))
        self.assertEqual(PubsubMessage, pcoll.element_type)

        # Apply the necessary PTransformOverrides.
        overrides = _get_transform_overrides(options)
        p.replace_all(overrides)

        # Note that the direct output of ReadFromPubSub will be replaced
        # by a PTransformOverride, so we use a no-op Map.
        read_transform = pcoll.producer.inputs[0].producer.transform

        # Ensure that the properties passed through correctly
        source = read_transform._source
        self.assertTrue(source.with_attributes)
        self.assertEqual('time', source.timestamp_attribute)
Example #12
def main():
    # bq_source = BigQuerySource(query="""
    #                            SELECT created_at, text
    #                            FROM got_sentiment.got_tweets
    #                            """,
    #                            validate=False, coder=None,
    #                            use_standard_sql=True, flatten_results=True,
    #                            kms_key=None)

    # Removed attributes from ReadFromPubSub:
    #                              with_attributes=False,
    #                             timestamp_attribute='created_at'

    # Create the Pipeline with the specified options.
    with Pipeline(options=options) as p:
        results = (
            p | 'read_from_topic' >> ReadFromPubSub(topic=PUBSUB_TOPIC)
            | 'Window' >> WindowInto(window.FixedWindows(60))
            | 'Emit_needed_values' >> FlatMap(emit_values, entity_map)
            | 'Combine' >> CombinePerKey(EntityScoreCombine())
            | 'Add Window Timestamp' >> beam.ParDo(AddWindowTimestampFn())
            | 'FormatForWrite' >> Map(format_for_write)
            | 'Write' >> WriteToBigQuery('streaming_scores',
                                         dataset=BQ_DATASET,
                                         project=PROJECT_ID,
                                         create_disposition='CREATE_IF_NEEDED',
                                         write_disposition='WRITE_APPEND',
                                         batch_size=20))
Example #13
    def test_read_messages_success(self, mock_pubsub):
        data = 'data'
        publish_time_secs = 1520861821
        publish_time_nanos = 234567000
        attributes = {'key': 'value'}
        ack_id = 'ack_id'
        pull_response = test_utils.create_pull_response([
            test_utils.PullResponseMessage(data, attributes, publish_time_secs,
                                           publish_time_nanos, ack_id)
        ])
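        # With no timestamp attribute configured, the Pub/Sub publish time
        # (secs + nanos) becomes the element timestamp.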
        expected_elements = [
            TestWindowedValue(PubsubMessage(data, attributes),
                              timestamp.Timestamp(1520861821.234567),
                              [window.GlobalWindow()])
        ]
        mock_pubsub.return_value.pull.return_value = pull_response

        options = PipelineOptions([])
        options.view_as(StandardOptions).streaming = True
        p = TestPipeline(options=options)
        pcoll = (p
                 | ReadFromPubSub('projects/fakeprj/topics/a_topic',
                                  None,
                                  None,
                                  with_attributes=True))
        assert_that(pcoll, equal_to(expected_elements), reify_windows=True)
        p.run()
        mock_pubsub.return_value.acknowledge.assert_has_calls(
            [mock.call(mock.ANY, [ack_id])])
Example #14
    def test_read_messages_timestamp_attribute_missing(self, mock_pubsub):
        data = b'data'
        attributes = {}
        publish_time_secs = 1520861821
        publish_time_nanos = 234567000
        publish_time = '2018-03-12T13:37:01.234567Z'
        ack_id = 'ack_id'
        pull_response = test_utils.create_pull_response([
            test_utils.PullResponseMessage(data, attributes, publish_time_secs,
                                           publish_time_nanos, ack_id)
        ])
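        # The requested timestamp attribute is absent from the message, so the
        # element timestamp falls back to the Pub/Sub publish time.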
        expected_elements = [
            TestWindowedValue(PubsubMessage(data, attributes),
                              timestamp.Timestamp.from_rfc3339(publish_time),
                              [window.GlobalWindow()]),
        ]
        mock_pubsub.return_value.pull.return_value = pull_response

        options = PipelineOptions([])
        options.view_as(StandardOptions).streaming = True
        with TestPipeline(options=options) as p:
            pcoll = (p
                     | ReadFromPubSub('projects/fakeprj/topics/a_topic',
                                      None,
                                      None,
                                      with_attributes=True,
                                      timestamp_attribute='nonexistent'))
            assert_that(pcoll, equal_to(expected_elements), reify_windows=True)
        mock_pubsub.return_value.acknowledge.assert_has_calls(
            [mock.call(mock.ANY, [ack_id])])

        mock_pubsub.return_value.api.transport.channel.close.assert_has_calls(
            [mock.call()])
Example #15
    def test_read_messages_timestamp_attribute_rfc3339_success(
            self, mock_pubsub):
        data = 'data'
        attributes = {'time': '2018-03-12T13:37:01.234567Z'}
        publish_time_secs = 1337000000
        publish_time_nanos = 133700000
        ack_id = 'ack_id'
        pull_response = test_utils.create_pull_response([
            test_utils.PullResponseMessage(data, attributes, publish_time_secs,
                                           publish_time_nanos, ack_id)
        ])
        expected_elements = [
            TestWindowedValue(
                PubsubMessage(data, attributes),
                timestamp.Timestamp.from_rfc3339(attributes['time']),
                [window.GlobalWindow()]),
        ]
        mock_pubsub.return_value.pull.return_value = pull_response

        p = TestPipeline()
        p.options.view_as(StandardOptions).streaming = True
        pcoll = (p
                 | ReadFromPubSub('projects/fakeprj/topics/a_topic',
                                  None,
                                  None,
                                  with_attributes=True,
                                  timestamp_attribute='time'))
        assert_that(pcoll, equal_to(expected_elements), reify_windows=True)
        p.run()
        mock_pubsub.return_value.acknowledge.assert_has_calls(
            [mock.call(mock.ANY, [ack_id])])
Example #16
  def test_read_messages_timestamp_attribute_missing(self, mock_pubsub):
    data = 'data'
    message_id = 'message_id'
    attributes = {}
    publish_time = '2018-03-12T13:37:01.234567Z'
    payloads = [
        create_client_message(data, message_id, attributes, publish_time)]
    expected_elements = [
        TestWindowedValue(
            PubsubMessage(data, attributes),
            timestamp.Timestamp.from_rfc3339(publish_time),
            [window.GlobalWindow()]),
    ]

    mock_pubsub.Client = functools.partial(FakePubsubClient, payloads)
    mock_pubsub.subscription.AutoAck = FakeAutoAck

    p = TestPipeline()
    p.options.view_as(StandardOptions).streaming = True
    pcoll = (p
             | ReadFromPubSub(
                 'projects/fakeprj/topics/a_topic', None, None,
                 with_attributes=True, timestamp_attribute='nonexistent'))
    assert_that(pcoll, equal_to(expected_elements), reify_windows=True)
    p.run()
Example #17
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--subscription",
        dest="subscription",
        required=True,
        help=
        'Input PubSub subscription of the form "projects/<PROJECT>/subscriptions/<SUBSCRIPTION>".',
    )
    parser.add_argument(
        "--bigquery_table",
        dest="bigquery_table",
        required=True,
        help="The fully-qualified BigQuery table to which to write.",
    )
    parser.add_argument(
        "--bigquery_table_for_failed_rows",
        dest="bigquery_table_for_failed_rows",
        required=True,
        help=
        "The fully-qualified BigQuery table to which to write failed inserts.",
    )

    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(StandardOptions).streaming = True

    p = beam.Pipeline(options=pipeline_options)
    # yapf: disable
    messages = (
        p
        | "ReadFromPubSub" >> ReadFromPubSub(subscription=known_args.subscription)
            .with_output_types(bytes)
        | "ParseAndValidateMessages" >> beam.ParDo(ValidateMessages())
            .with_outputs(ValidateMessages.OUTPUT_TAG, main="valid_messages"))

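    # with_outputs() yields a DoOutputsTuple; index it by tag to get each
    # output PCollection.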
    valid_messages = messages["valid_messages"]
    invalid_messages = messages[ValidateMessages.OUTPUT_TAG]

    (invalid_messages
        | "InvalidMessages:TupleToDict" >> beam.Map(tuple_to_dict)
        | "InvalidMessages:WriteToBigQuery" >> WriteRowsToBigQuery(
            table_name=known_args.bigquery_table_for_failed_rows))

    failed_rows = (
        valid_messages
        | "ValidMessages:WriteToBigQuery" >> WriteRowsToBigQuery(
            table_name=known_args.bigquery_table))

    failed_rows_pcoll = failed_rows["FailedRows"]

    (failed_rows_pcoll
        | "FailedInserts:TupleToDict" >> beam.Map(tuple_to_dict)
        | "FailedInserts:WriteToBigQuery" >> WriteRowsToBigQuery(
            table_name=known_args.bigquery_table_for_failed_rows))
    # yapf: enable
    result = p.run()
    result.wait_until_finish()
Example #18
 def test_expand_with_both_topic_and_subscription(self):
     with self.assertRaisesRegex(
             ValueError,
             "Only one of topic or subscription should be provided."):
         ReadFromPubSub('a_topic',
                        'a_subscription',
                        'a_label',
                        timestamp_attribute=None)
Example #19
 def test_expand_with_no_topic_or_subscription(self):
     with self.assertRaisesRegex(
             ValueError,
             "Either a topic or subscription must be provided."):
         ReadFromPubSub(None,
                        None,
                        'a_label',
                        with_attributes=False,
                        timestamp_attribute=None)
Example #20
 def test_read_message_id_label_unsupported(self, unused_mock_pubsub):
     # id_label is unsupported in DirectRunner.
     options = PipelineOptions([])
     options.view_as(StandardOptions).streaming = True
     with self.assertRaisesRegex(NotImplementedError,
                                 r'id_label is not supported'):
         with TestPipeline(options=options) as p:
             _ = (p | ReadFromPubSub('projects/fakeprj/topics/a_topic',
                                     None, 'a_label'))
Example #21
def run(argv=None):
    class MessageParser(beam.DoFn):
        # It is required to parse messages for GBK operation.
        # Otherwise there are encoding problems.
        def process(self, item):
            if item.attributes:
                k, v = item.attributes.popitem()
                yield (str(k), str(v))

    class ParserToBytes(beam.DoFn):
        # Parsing to bytes is required for saving in PubSub.
        def process(self, item):
            _, v = item
            yield bytes(v, encoding='utf8')

    parser = argparse.ArgumentParser()
    parser.add_argument('--output_topic',
                        required=True,
                        help=('Output PubSub topic of the form '
                              '"projects/<PROJECT>/topic/<TOPIC>".'))
    parser.add_argument(
        '--input_subscription',
        help=('Input PubSub subscription of the form '
              '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."'))
    parser.add_argument('--metrics_namespace',
                        help='Namespace for the metrics (a string).')
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).streaming = True
    p = beam.Pipeline(options=pipeline_options)

    # pylint: disable=expression-not-assigned
    (p
     | ReadFromPubSub(subscription=known_args.input_subscription,
                      with_attributes=True)
     | 'Window' >> beam.WindowInto(window.FixedWindows(1000, 0))
     | 'Measure time: Start' >> beam.ParDo(
         MeasureTime(known_args.metrics_namespace))
     | 'Count messages' >> beam.ParDo(
         CountMessages(known_args.metrics_namespace))
     | 'Parse' >> beam.ParDo(MessageParser())
     | 'GroupByKey' >> beam.GroupByKey()
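     # Expand each grouped (key, iterable) pair back into individual
     # (key, value) elements.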
     | 'Ungroup' >> beam.FlatMap(lambda elm: [(elm[0], v) for v in elm[1]])
     | 'Measure time: End' >> beam.ParDo(
         MeasureTime(known_args.metrics_namespace))
     | 'Parse to bytes' >> beam.ParDo(ParserToBytes())
     | 'Write' >> beam.io.WriteToPubSub(topic=known_args.output_topic))

    result = p.run()
    result.wait_until_finish()
    logging.error(result)
    return result
Example #22
def run():
    with beam.Pipeline(options=PipelineOptions(streaming=True)) as p:
        pc = (p | ReadFromPubSub(topic=get_topic_path())
              | beam.WindowInto(
                  window.FixedWindows(WINDOW_SIZE),
                  accumulation_mode=trigger.AccumulationMode.DISCARDING)
              | 'AddWindowInfo' >> beam.ParDo(add_window_info)
              | beam.CombinePerKey(sum)
              | beam.ParDo(prepare_element)
              | 'Print' >> beam.ParDo(print_fn)
              | WriteToBigQuery(BIGQUERY_TABLE_ID))
Example #23
  def test_read_data_success(self, mock_pubsub):
    data_encoded = u'🤷 ¯\\_(ツ)_/¯'.encode('utf-8')
    publish_time = '2018-03-12T13:37:01.234567Z'
    payloads = [create_client_message(data_encoded, None, None, publish_time)]
    expected_elements = [data_encoded]

    mock_pubsub.Client = functools.partial(FakePubsubClient, payloads)
    mock_pubsub.subscription.AutoAck = FakeAutoAck

    p = TestPipeline()
    p.options.view_as(StandardOptions).streaming = True
    pcoll = (p
             | ReadFromPubSub('projects/fakeprj/topics/a_topic', None, None))
    assert_that(pcoll, equal_to(expected_elements))
    p.run()
Example #24
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        help='Input topic to process.')
    parser.add_argument('--output',
                        dest='output',
                        help='Output log name to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    isScreen = lambda event, name: event['jsonPayload']['name'] == name
    isEvent = lambda event, eventName: (
        event['jsonPayload']['event'] == eventName)

    syncStart = lambda event: isScreen(event, 'Sync')
    syncFinish = lambda event: (
        event['jsonPayload']['type'] == 'screen' and
        type(event['jsonPayload']['properties']) == dict and
        event['jsonPayload']['properties']['previousScreen'] == 'Sync' and
        event['jsonPayload']['properties']['currentScreen'] != 'Sync')

    loadStart = lambda event: isEvent(event, 'APP_LOADED')
    loadFinish = lambda event: isEvent(event, 'COMPONENT_MOUNT')

    transactionStart = lambda event: isScreen(event, 'Send')
    transactionFinish = lambda event: isEvent(event, 'send_invite') or isEvent(
        event, 'send_dollar_confirm')

    with beam.Pipeline(options=pipeline_options) as p:

        events = (p
                  | ReadFromPubSub(known_args.input, with_attributes=True)
                  | ParsePubSubJson())

        time_to_sync_measurements = events | TimeBetween(
            'time_to_sync', 2 * 60, syncStart, syncFinish)
        time_to_load = events | TimeBetween('time_to_load', 60, loadStart,
                                            loadFinish)
        time_to_send_transaction = events | TimeBetween(
            'time_to_send_transaction', 5 * 60, transactionStart,
            transactionFinish)

        ((time_to_sync_measurements, time_to_load, time_to_send_transaction)
         | beam.Flatten()
         | WriteToStackdriverLogging(known_args.output))
Example #25
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        help='Input topic to process.')
    parser.add_argument('--output',
                        dest='output',
                        help='Output log name to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=pipeline_options) as p:

        (p
         | ReadFromPubSub(known_args.input, with_attributes=True)
         | ParsePubSubJson()
         | WriteToStackdriverLogging(known_args.output))
Example #26
    def test_read_data_success(self, mock_pubsub):
        data_encoded = u'🤷 ¯\\_(ツ)_/¯'.encode('utf-8')
        ack_id = 'ack_id'
        pull_response = test_utils.create_pull_response(
            [test_utils.PullResponseMessage(data_encoded, ack_id=ack_id)])
        expected_elements = [data_encoded]
        mock_pubsub.return_value.pull.return_value = pull_response

        p = TestPipeline()
        p.options.view_as(StandardOptions).streaming = True
        pcoll = (p
                 | ReadFromPubSub('projects/fakeprj/topics/a_topic', None,
                                  None))
        assert_that(pcoll, equal_to(expected_elements))
        p.run()
        mock_pubsub.return_value.acknowledge.assert_has_calls(
            [mock.call(mock.ANY, [ack_id])])
Example #27
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input_topic',
        dest='input_topic',
        help='Input topic in the form projects/<project>/topics/<topic>')
    parser.add_argument('--output',
                        dest='output_file',
                        help='Output file to write to')
    parser.add_argument('--table', dest='table_name', help='BQ table name')
    parser.add_argument('--dataset', dest='dataset_id', help='BQ dataset')
    parser.add_argument('--project_id', dest='project_id', help='Project ID')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend(['--project=main-training-project', '--streaming'])
    """
    pipeline_args.extend(['--runner=DataflowRunner',
                             '--project=yourprojectid',
                             '--staging_location=gs://yourgsbucket',
                             '--temp_location=gs://yourgsbucket',
                         '--job_name=your-job-name'])
    """
    pipeline_options = PipelineOptions(pipeline_args)
    #pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as p:
        lines = p | ReadFromPubSub(topic=known_args.input_topic)

        def str_to_dict(str_line):
            import pandas as pd
            import nonpypimodule
            import changecommentfield

            df_rows = eval(str_line)
            pd.DataFrame.from_dict(df_rows)
            bq_rows = eval(re.sub(r'\[|\]', '', str_line.decode('utf-8')))
            bq_rows['post'] = nonpypimodule.return_sentence()
            bq_rows = changecommentfield.change_field(bq_rows)
            logging.info(bq_rows)
            return bq_rows

        lines = lines | 'String to dict' >> beam.Map(str_to_dict)
        lines = lines | 'Output to BQ' >> WriteToBigQuery(
            table=known_args.table_name,
            dataset=known_args.dataset_id,
            project=known_args.project_id)
Example #28
    def test_expand_deprecated(self):
        p = TestPipeline()
        p.options.view_as(StandardOptions).streaming = True
        pcoll = (p
                 | ReadFromPubSub('projects/fakeprj/topics/baz')
                 | WriteStringsToPubSub('projects/fakeprj/topics/a_topic')
                 | beam.Map(lambda x: x))

        # Apply the necessary PTransformOverrides.
        overrides = _get_transform_overrides(p.options)
        p.replace_all(overrides)

        # Note that the direct output of ReadFromPubSub will be replaced
        # by a PTransformOverride, so we use a no-op Map.
        write_transform = pcoll.producer.inputs[0].producer.transform

        # Ensure that the properties passed through correctly
        self.assertEqual('a_topic', write_transform.dofn.short_topic_name)
Example #29
  def test_read_message_id_label_unsupported(self, mock_pubsub):
    # id_label is unsupported in DirectRunner.
    data = 'data'
    message_id = 'message_id'
    attributes = {'time': '1337 unparseable'}
    publish_time = '2018-03-12T13:37:01.234567Z'
    payloads = [
        create_client_message(data, message_id, attributes, publish_time)]

    mock_pubsub.Client = functools.partial(FakePubsubClient, payloads)
    mock_pubsub.subscription.AutoAck = FakeAutoAck

    p = TestPipeline()
    p.options.view_as(StandardOptions).streaming = True
    _ = (p | ReadFromPubSub('projects/fakeprj/topics/a_topic', None, 'a_label'))
    with self.assertRaisesRegex(NotImplementedError,
                                r'id_label is not supported'):
      p.run()
Example #30
  def test_read_messages_timestamp_attribute_fail_parse(self, mock_pubsub):
    data = 'data'
    message_id = 'message_id'
    attributes = {'time': '1337 unparseable'}
    publish_time = '2018-03-12T13:37:01.234567Z'
    payloads = [
        create_client_message(data, message_id, attributes, publish_time)]

    mock_pubsub.Client = functools.partial(FakePubsubClient, payloads)
    mock_pubsub.subscription.AutoAck = FakeAutoAck

    p = TestPipeline()
    p.options.view_as(StandardOptions).streaming = True
    _ = (p
         | ReadFromPubSub(
             'projects/fakeprj/topics/a_topic', None, 'a_label',
             with_attributes=True, timestamp_attribute='time'))
    with self.assertRaisesRegex(ValueError, r'parse'):
      p.run()