def test_proto_conversion(self):
    """Round-trip a message through its proto string and compare the fields."""
    payload = 'data'
    attrs = {'k1': 'v1', 'k2': 'v2'}
    original = PubsubMessage(payload, attrs)

    serialized = original._to_proto_str()
    restored = PubsubMessage._from_proto_str(serialized)

    self.assertEqual(restored.data, payload)
    self.assertEqual(restored.attributes, attrs)
def test_read_messages_timestamp_attribute_missing(self, mock_pubsub):
    """Expect the publish time when the configured timestamp attribute is absent.

    The pulled message has no attributes while the read is configured with
    ``timestamp_attribute='nonexistent'``; the expected element carries the
    message's publish time, and the message must be acknowledged.
    """
    data = b'data'
    attributes = {}
    ack_id = 'ack_id'
    # The same instant twice: seconds/nanos feed the fake pull response, the
    # RFC 3339 text builds the expected element timestamp.
    publish_time_secs = 1520861821
    publish_time_nanos = 234567000
    publish_time = '2018-03-12T13:37:01.234567Z'

    pulled = test_utils.PullResponseMessage(
        data, attributes, publish_time_secs, publish_time_nanos, ack_id)
    pull_response = test_utils.create_pull_response([pulled])

    expected_timestamp = timestamp.Timestamp.from_rfc3339(publish_time)
    expected_elements = [
        TestWindowedValue(
            PubsubMessage(data, attributes),
            expected_timestamp,
            [window.GlobalWindow()]),
    ]
    mock_pubsub.return_value.pull.return_value = pull_response

    options = PipelineOptions([])
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)
    read = ReadFromPubSub(
        'projects/fakeprj/topics/a_topic', None, None,
        with_attributes=True, timestamp_attribute='nonexistent')
    pcoll = p | read
    assert_that(pcoll, equal_to(expected_elements), reify_windows=True)
    p.run()

    expected_acks = [mock.call(mock.ANY, [ack_id])]
    mock_pubsub.return_value.acknowledge.assert_has_calls(expected_acks)
def test_read_messages_timestamp_attribute_milli_success(self, mock_pubsub):
    """Expect a millisecond-valued timestamp attribute to set the element time.

    The 'time' attribute holds '1337'; the expected element timestamp is that
    value times 1000, expressed in microseconds. The test also checks that the
    message is acked and that the client's transport channel is closed.
    """
    data = b'data'
    attributes = {'time': '1337'}
    publish_time_secs = 1520861821
    publish_time_nanos = 234567000
    ack_id = 'ack_id'

    pulled = test_utils.PullResponseMessage(
        data, attributes, publish_time_secs, publish_time_nanos, ack_id)
    pull_response = test_utils.create_pull_response([pulled])

    expected_micros = int(attributes['time']) * 1000
    expected_elements = [
        TestWindowedValue(
            PubsubMessage(data, attributes),
            timestamp.Timestamp(micros=expected_micros),
            [window.GlobalWindow()]),
    ]
    mock_pubsub.return_value.pull.return_value = pull_response

    options = PipelineOptions([])
    options.view_as(StandardOptions).streaming = True
    p = TestPipeline(options=options)
    read = ReadFromPubSub(
        'projects/fakeprj/topics/a_topic', None, None,
        with_attributes=True, timestamp_attribute='time')
    pcoll = p | read
    assert_that(pcoll, equal_to(expected_elements), reify_windows=True)
    p.run()

    client = mock_pubsub.return_value
    client.acknowledge.assert_has_calls([mock.call(mock.ANY, [ack_id])])
    client.api.transport.channel.close.assert_has_calls([mock.call()])
def test_write_messages_unsupported_features(self, mock_pubsub):
    """Check that unsupported WriteToPubSub options fail when the pipeline runs.

    Each case builds a fresh streaming pipeline that writes one message with
    a single unsupported keyword (``id_label`` or ``timestamp_attribute``) and
    expects ``p.run()`` to raise ``NotImplementedError`` with a matching text.
    """
    data = b'data'
    attributes = {'key': 'value'}
    payloads = [PubsubMessage(data, attributes)]

    # (extra WriteToPubSub keyword arguments, expected error pattern)
    unsupported_cases = [
        ({'id_label': 'a_label'}, r'id_label is not supported'),
        ({'timestamp_attribute': 'timestamp'},
         r'timestamp_attribute is not supported'),
    ]
    for write_kwargs, error_pattern in unsupported_cases:
        options = PipelineOptions([])
        options.view_as(StandardOptions).streaming = True
        p = TestPipeline(options=options)
        _ = (
            p
            | Create(payloads)
            | WriteToPubSub('projects/fakeprj/topics/a_topic', **write_kwargs))
        # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias,
        # which no longer exists on Python 3.12+.
        with self.assertRaisesRegex(NotImplementedError, error_pattern):
            p.run()
def _wait_for_messages(self, subscription, expected_num, timeout):
    """Pull from ``subscription`` until enough messages arrive or time runs out.

    Args:
      subscription: object providing ``full_name``, ``pull(max_messages=...)``
        yielding ``(ack_id, message)`` pairs, and ``acknowledge(ack_ids)``.
      expected_num: number of collected messages at which polling stops.
      timeout: polling budget in seconds, measured with ``time.time()``.

    Returns:
      The list of collected items: raw ``message.data`` values when
      ``self.with_attributes`` is false, otherwise ``PubsubMessage`` objects
      with the ``self.strip_attributes`` keys removed. On timeout the list
      may be shorter than ``expected_num``.
    """
    logging.debug('Start pulling messages from %s', subscription.full_name)
    total_messages = []
    start_time = time.time()
    while time.time() - start_time <= timeout:
        pulled = subscription.pull(max_messages=MAX_MESSAGES_IN_ONE_PULL)
        for ack_id, message in pulled:
            if self.with_attributes:
                msg = PubsubMessage._from_message(message)
                for attr in self.strip_attributes or ():
                    try:
                        del msg.attributes[attr]
                    except KeyError:
                        # Leave a marker so the later comparison fails with a
                        # message naming the attribute that was missing.
                        msg.attributes[attr] = (
                            'PubSubMessageMatcher error: '
                            'expected attribute not found.')
                total_messages.append(msg)
            else:
                total_messages.append(message.data)
            # Acknowledge on both paths. Previously the data-only path skipped
            # the ack, so those messages could be redelivered and collected
            # again.
            subscription.acknowledge([ack_id])
        if len(total_messages) >= expected_num:
            return total_messages
        time.sleep(1)

    logging.error(
        'Timeout after %d sec. Received %d messages from %s.',
        timeout, len(total_messages), subscription.full_name)
    return total_messages
def test_message_matcher_attributes_success(self, mock_get_sub, unsued_mock):
    """Matcher passes when the pulled message equals the expected one."""
    expected = [PubsubMessage(b'a', {'k': 'v'})]
    self.init_matcher(expected_msg=expected, with_attributes=True)

    subscription = mock_get_sub.return_value
    response = create_pull_response([PullResponseMessage(b'a', {'k': 'v'})])
    subscription.pull.side_effect = [response]

    hc_assert_that(self.mock_presult, self.pubsub_matcher)

    self.assertEqual(subscription.pull.call_count, 1)
    self.assertEqual(subscription.acknowledge.call_count, 1)
def test_message_matcher_attributes_fail(self, mock_get_sub, unsued_mock):
    """Matcher must fail when the pulled message has an unexpected attribute.

    The expected message has no attributes while the pulled one carries
    ``k=v``; the matcher's assertion error is expected to mention
    'Unexpected'.
    """
    self.init_matcher(with_attributes=True)
    self.pubsub_matcher.expected_msg = [PubsubMessage('a', {})]
    mock_sub = mock_get_sub.return_value
    msg_a = pubsub.message.Message(b'a', 'unused_id')
    msg_a.attributes['k'] = 'v'  # Unexpected.
    mock_sub.pull.side_effect = [[(1, msg_a)]]

    # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias,
    # which no longer exists on Python 3.12+.
    with self.assertRaisesRegex(AssertionError, r'Unexpected'):
        hc_assert_that(self.mock_presult, self.pubsub_matcher)
    self.assertEqual(mock_sub.pull.call_count, 1)
def test_message_matcher_attributes_success(self, mock_get_sub, unsued_mock):
    """Matcher passes when data and attributes of the pulled message match."""
    self.init_matcher(with_attributes=True)
    self.pubsub_matcher.expected_msg = [PubsubMessage('a', {'k': 'v'})]

    subscription = mock_get_sub.return_value
    pulled = pubsub.message.Message(b'a', 'unused_id')
    pulled.attributes['k'] = 'v'
    # One pull that yields a single (ack_id, message) pair.
    subscription.pull.side_effect = [[(1, pulled)]]

    hc_assert_that(self.mock_presult, self.pubsub_matcher)
    self.assertEqual(subscription.pull.call_count, 1)
def test_write_to_pubsub_with_attributes(self):
    """write_to_pubsub publishes data plus attributes and waits on the result."""
    publisher = mock.Mock()
    topic_path = "project/fakeproj/topics/faketopic"
    data = b'data'
    attributes = {'key': 'value'}
    message = PubsubMessage(data, attributes)

    utils.write_to_pubsub(
        publisher, topic_path, [message], with_attributes=True)

    expected_calls = [
        mock.call(topic_path, data, **attributes),
        mock.call().result(),
    ]
    publisher.publish.assert_has_calls(expected_calls)
def _inject_numbers(self, topic, num_messages):
    """Publish ``num_messages`` copies of a fixed test message to PubSub.

    Every published message has the payload ``b'conall_0'`` and a single
    ``timestamp`` attribute with the value ``'1608051184000'``.

    Args:
      topic: currently unused; messages always go to ``self.input_topic``.
        NOTE(review): confirm whether callers expect ``topic`` to be honoured.
      num_messages: how many times the message is published.
    """
    # The message is identical for every iteration, so build it once.
    msg = PubsubMessage(b'conall_0', {'timestamp': '1608051184000'})
    for _ in range(num_messages):
        self.pub_client.publish(
            self.input_topic.name, msg.data, **msg.attributes)
def test_message_matcher_attributes_fail(self, mock_get_sub, unsued_mock):
    """Matcher fails when the pulled message carries an unexpected attribute."""
    expected = [PubsubMessage(b'a', {})]
    self.init_matcher(expected_msg=expected, with_attributes=True)

    subscription = mock_get_sub.return_value
    # The pulled message has attribute 'k', which the expectation lacks.
    unexpected = PullResponseMessage(b'a', {'k': 'v'})
    subscription.pull.side_effect = [create_pull_response([unexpected])]

    with self.assertRaisesRegex(AssertionError, r'Unexpected'):
        hc_assert_that(self.mock_presult, self.pubsub_matcher)

    self.assertEqual(subscription.pull.call_count, 1)
    self.assertEqual(subscription.acknowledge.call_count, 1)
def test_message_matcher_strip_fail(self, mock_get_sub, unsued_mock):
    """Matcher must fail when an attribute that should be stripped is missing.

    The matcher is told to strip 'id' and 'timestamp', but the pulled message
    only has 'id' (plus 'k'); the assertion error is expected to mention
    'Stripped attributes'.
    """
    self.init_matcher(with_attributes=True,
                      strip_attributes=['id', 'timestamp'])
    self.pubsub_matcher.expected_msg = [PubsubMessage('a', {'k': 'v'})]
    mock_sub = mock_get_sub.return_value

    # msg_a is missing attribute 'timestamp'.
    msg_a = pubsub.message.Message(b'a', 'unused_id')
    msg_a.attributes['id'] = 'foo'
    msg_a.attributes['k'] = 'v'
    mock_sub.pull.side_effect = [[(1, msg_a)]]

    # assertRaisesRegex replaces the deprecated assertRaisesRegexp alias,
    # which no longer exists on Python 3.12+.
    with self.assertRaisesRegex(AssertionError, r'Stripped attributes'):
        hc_assert_that(self.mock_presult, self.pubsub_matcher)
    self.assertEqual(mock_sub.pull.call_count, 1)
def test_message_matcher_strip_fail(self, mock_get_sub, unsued_mock):
    """Matcher fails when a to-be-stripped attribute is absent from the message."""
    expected = [PubsubMessage(b'a', {'k': 'v'})]
    self.init_matcher(
        expected_msg=expected,
        with_attributes=True,
        strip_attributes=['id', 'timestamp'])

    subscription = mock_get_sub.return_value
    # The pulled message has 'id' but no 'timestamp' attribute.
    incomplete = PullResponseMessage(b'a', {'id': 'foo', 'k': 'v'})
    subscription.pull.side_effect = [create_pull_response([incomplete])]

    with self.assertRaisesRegex(AssertionError, r'Stripped attributes'):
        hc_assert_that(self.mock_presult, self.pubsub_matcher)

    self.assertEqual(subscription.pull.call_count, 1)
    self.assertEqual(subscription.acknowledge.call_count, 1)
def test_write_messages_with_attributes_success(self, mock_pubsub):
    """Writing with attributes publishes the data with attributes as kwargs."""
    data = b'data'
    attributes = {'key': 'value'}
    payloads = [PubsubMessage(data, attributes)]

    options = PipelineOptions([])
    options.view_as(StandardOptions).streaming = True
    # Leaving the ``with`` block runs the pipeline.
    with TestPipeline(options=options) as p:
        write = WriteToPubSub(
            'projects/fakeprj/topics/a_topic', with_attributes=True)
        _ = p | Create(payloads) | write

    expected_publishes = [mock.call(mock.ANY, data, **attributes)]
    mock_pubsub.return_value.publish.assert_has_calls(expected_publishes)
def _get_element(message):
    """Return ``(timestamp, parsed_message)`` for a pulled Pub/Sub message.

    When ``timestamp_attribute`` is set and present on the message, its value
    is parsed first as RFC 3339 text and then as integer milliseconds;
    otherwise the message's ``service_timestamp`` is parsed as RFC 3339.

    Raises:
      ValueError: if the attribute value parses as neither format.
    """
    parsed_message = PubsubMessage._from_message(message)
    attribute_present = (
        timestamp_attribute
        and timestamp_attribute in parsed_message.attributes)
    if not attribute_present:
        fallback = Timestamp.from_rfc3339(message.service_timestamp)
        return fallback, parsed_message

    raw_value = parsed_message.attributes[timestamp_attribute]
    try:
        event_time = Timestamp.from_rfc3339(raw_value)
    except ValueError:
        try:
            # Not RFC 3339: treat as milliseconds, converted to microseconds.
            event_time = Timestamp(micros=int(raw_value) * 1000)
        except ValueError as e:
            raise ValueError('Bad timestamp value: %s' % e)
    return event_time, parsed_message
def test_write_messages_with_attributes_success(self, mock_pubsub):
    """Write one message with attributes through the fake Pub/Sub client."""
    data = 'data'
    attributes = {'key': 'value'}
    payloads = [PubsubMessage(data, attributes)]
    expected_payloads = [[data, attributes]]

    # The fake client receives the payloads it should see being written.
    fake_client_factory = functools.partial(
        FakePubsubClient, messages_write=expected_payloads)
    mock_pubsub.Client = fake_client_factory

    p = TestPipeline()
    p.options.view_as(StandardOptions).streaming = True
    write = WriteToPubSub(
        'projects/fakeprj/topics/a_topic', with_attributes=True)
    _ = p | Create(payloads) | write
    p.run()
def test_read_messages_success(self, mock_pubsub):
    """Read one message through the fake client and compare the parsed result."""
    payload = 'payload'
    message_id = 'message_id'
    attributes = {'attribute': 'value'}

    incoming = pubsub.message.Message(payload, message_id, attributes)
    data = [incoming]
    expected_data = [PubsubMessage(payload, message_id, attributes, None)]

    mock_pubsub.Client = functools.partial(FakePubsubClient, data)
    mock_pubsub.subscription.AutoAck = FakeAutoAck

    p = TestPipeline()
    p.options.view_as(StandardOptions).streaming = True
    read = ReadMessagesFromPubSub(
        'projects/fakeprj/topics/a_topic', None, 'a_label')
    pcoll = p | read
    assert_that(pcoll, equal_to(expected_data))
    p.run()
def test_read_from_pubsub_with_attributes(self):
    """read_from_pubsub returns full messages and acks what it pulled."""
    subscriber = mock.Mock()
    subscription_path = "project/fakeproj/subscriptions/fakesub"
    data = b'data'
    ack_id = 'ack_id'
    attributes = {'key': 'value'}
    message = PubsubMessage(data, attributes)

    pulled = test_utils.PullResponseMessage(data, attributes, ack_id=ack_id)
    subscriber.pull.return_value = test_utils.create_pull_response([pulled])

    output = utils.read_from_pubsub(
        subscriber,
        subscription_path,
        with_attributes=True,
        number_of_elements=1)

    self.assertEqual([message], output)
    subscriber.acknowledge.assert_called_once_with(
        subscription_path, [ack_id])
def _get_element(message):
    """Return ``(timestamp, parsed_message)`` for a pulled Pub/Sub message.

    When ``timestamp_attribute`` is set and present on the message, its value
    is parsed first as RFC 3339 text and then as integer milliseconds;
    otherwise the timestamp is built from ``message.publish_time`` (seconds
    plus nanos floor-divided by 1000).

    Raises:
      ValueError: if the attribute value parses as neither format.
    """
    parsed_message = PubsubMessage._from_message(message)
    attrs = parsed_message.attributes

    if timestamp_attribute and timestamp_attribute in attrs:
        raw_value = attrs[timestamp_attribute]
        try:
            event_time = Timestamp.from_rfc3339(raw_value)
        except ValueError:
            try:
                # Fall back to milliseconds, converted to microseconds.
                event_time = Timestamp(micros=int(raw_value) * 1000)
            except ValueError as e:
                raise ValueError('Bad timestamp value: %s' % e)
        return event_time, parsed_message

    published = message.publish_time
    event_time = Timestamp(published.seconds, published.nanos // 1000)
    return event_time, parsed_message
def _wait_for_messages(self, expected_num, timeout):
    """Pull from ``self.sub_name`` until enough messages arrive or time is up.

    Args:
      expected_num: number of collected messages at which polling stops.
      timeout: polling budget in seconds, measured with ``time.time()``.

    Returns:
      A ``(messages, details)`` tuple. ``messages`` holds ``msg.data`` values
      when ``self.with_attributes`` is false, otherwise the parsed messages
      with ``self.strip_attributes`` keys removed. ``details`` holds one
      tuple of message fields per pulled message.
    """
    collected = []
    collected_details = []
    sub_client = pubsub.SubscriberClient()
    began = time.time()

    while time.time() - began <= timeout:
        response = sub_client.pull(
            subscription=self.sub_name,
            max_messages=self.max_messages_in_one_pull,
            timeout=self.pull_timeout)

        for rm in response.received_messages:
            msg = PubsubMessage._from_message(rm.message)
            # NOTE(review): msg.attributes appears twice in this tuple; kept
            # as is - confirm whether another field was intended.
            details = (
                msg.data,
                msg.attributes,
                msg.attributes,
                msg.publish_time,
                msg.ordering_key)

            if self.with_attributes:
                for attr in self.strip_attributes or ():
                    try:
                        del msg.attributes[attr]
                    except KeyError:
                        # Mark the gap so the later comparison reports it.
                        msg.attributes[attr] = (
                            'PubSubMessageMatcher error: '
                            'expected attribute not found.')
                collected.append(msg)
            else:
                collected.append(msg.data)
            collected_details.append(details)

        ack_ids = [rm.ack_id for rm in response.received_messages]
        if ack_ids:
            sub_client.acknowledge(subscription=self.sub_name, ack_ids=ack_ids)
        if len(collected) >= expected_num:
            break
        time.sleep(self.sleep_time)

    if time.time() - began > timeout:
        _LOGGER.error(
            'Timeout after %d sec. Received %d messages from %s.',
            timeout,
            len(collected),
            self.sub_name)
    return collected, collected_details
def _get_element(message):
    """Return ``(timestamp, parsed_message)`` for a pulled Pub/Sub message.

    Without a configured ``timestamp_attribute`` the message's
    ``service_timestamp`` is parsed as RFC 3339. With one, the attribute must
    exist; its value is parsed as RFC 3339 text, then as integer milliseconds.

    Raises:
      KeyError: if the configured attribute is missing from the message.
      ValueError: if the attribute value parses as neither format.
    """
    parsed_message = PubsubMessage._from_message(message)
    if not timestamp_attribute:
        service_time = Timestamp.from_rfc3339(message.service_timestamp)
        return service_time, parsed_message

    try:
        raw_value = parsed_message.attributes[timestamp_attribute]
    except KeyError as e:
        raise KeyError('Timestamp attribute not found: %s' % e)

    try:
        event_time = Timestamp.from_rfc3339(raw_value)
    except ValueError:
        try:
            # Fall back to milliseconds, converted to microseconds.
            event_time = Timestamp(micros=int(raw_value) * 1000)
        except ValueError as e:
            raise ValueError('Bad timestamp value: %s' % e)
    return event_time, parsed_message
def _get_element(message):
    """Return ``(timestamp, parsed_message)`` for a pulled Pub/Sub message.

    When ``timestamp_attribute`` is set and present on the message, its value
    is parsed first as integer milliseconds and then as RFC 3339 text.
    Otherwise the timestamp comes from ``message.publish_time``.

    Raises:
      ValueError: if the attribute value parses as neither format, or if the
        message has no publish time when one is needed.
    """
    parsed_message = PubsubMessage._from_message(message)
    use_attribute = (
        timestamp_attribute
        and timestamp_attribute in parsed_message.attributes)

    if use_attribute:
        raw_value = parsed_message.attributes[timestamp_attribute]
        try:
            # Milliseconds, converted to microseconds.
            event_time = Timestamp(micros=int(raw_value) * 1000)
        except ValueError:
            try:
                event_time = Timestamp.from_rfc3339(raw_value)
            except ValueError as e:
                raise ValueError('Bad timestamp value: %s' % e)
        return event_time, parsed_message

    if message.publish_time is None:
        raise ValueError('No publish time present in message: %s' % message)
    event_time = Timestamp.from_utc_datetime(message.publish_time)
    return event_time, parsed_message
def test_read_from_pubsub_many(self):
    """Read 100 messages served in pull responses of at most 33 each."""
    response_size = 33
    number_of_elements = 100
    mock_pubsub = mock.Mock()
    subscription_path = "project/fakeproj/subscriptions/fakesub"

    data_list = []
    attributes_list = []
    ack_ids = []
    for i in range(number_of_elements):
        data_list.append('data {}'.format(i).encode("utf-8"))
        attributes_list.append({'key': 'value {}'.format(i)})
        ack_ids.append('ack_id_{}'.format(i))

    messages = []
    response_messages = []
    for data, attributes, ack_id in zip(data_list, attributes_list, ack_ids):
        messages.append(PubsubMessage(data, attributes))
        response_messages.append(
            test_utils.PullResponseMessage(data, attributes, ack_id=ack_id))

    class SequentialPullResponse(object):
        """Callable handing out consecutive slices of the prepared responses."""

        def __init__(self, response_messages, response_size):
            self.response_messages = response_messages
            self.response_size = response_size
            self._index = 0

        def __call__(self, *args, **kwargs):
            begin = self._index
            end = begin + self.response_size
            self._index = end
            batch = self.response_messages[begin:end]
            return test_utils.create_pull_response(batch)

    mock_pubsub.pull.side_effect = SequentialPullResponse(
        response_messages, response_size)

    output = utils.read_from_pubsub(
        mock_pubsub,
        subscription_path,
        with_attributes=True,
        number_of_elements=number_of_elements)

    self.assertEqual(messages, output)
    self._assert_ack_ids_equal(mock_pubsub, ack_ids)
def _test_read_messages_success(self, mock_pubsub):
    """Read one message with attributes and check its windowed value."""
    payload = 'payload'
    message_id = 'message_id'
    publish_time = '2018-03-12T13:37:01.234567Z'
    attributes = {'key': 'value'}

    client_message = create_client_message(
        payload, message_id, attributes, publish_time)
    data = [client_message]

    expected_value = TestWindowedValue(
        PubsubMessage(payload, attributes),
        timestamp.Timestamp(1520861821.234567),
        [window.GlobalWindow()])
    expected_data = [expected_value]

    mock_pubsub.Client = functools.partial(FakePubsubClient, data)
    mock_pubsub.subscription.AutoAck = FakeAutoAck

    p = TestPipeline()
    p.options.view_as(StandardOptions).streaming = True
    read = ReadFromPubSub(
        'projects/fakeprj/topics/a_topic', None, 'a_label',
        with_attributes=True)
    pcoll = p | read
    assert_that(pcoll, equal_to(expected_data), reify_windows=True)
    p.run()
def _wait_for_messages(self, expected_num, timeout):
    """Pull from ``self.sub_name`` until enough messages arrive or time is up.

    Args:
      expected_num: number of collected messages at which polling stops.
      timeout: polling budget in seconds, measured with ``time.time()``.

    Returns:
      The collected items: message data (bytes decoded as UTF-8) when
      ``self.with_attributes`` is false, otherwise the parsed messages with
      ``self.strip_attributes`` keys removed.
    """
    collected = []
    sub_client = pubsub.SubscriberClient()
    began = time.time()

    while time.time() - began <= timeout:
        response = sub_client.pull(
            self.sub_name,
            max_messages=MAX_MESSAGES_IN_ONE_PULL,
            return_immediately=True)

        for rm in response.received_messages:
            msg = PubsubMessage._from_message(rm.message)
            if self.with_attributes:
                for attr in self.strip_attributes or ():
                    try:
                        del msg.attributes[attr]
                    except KeyError:
                        # Mark the gap so the later comparison reports it.
                        msg.attributes[attr] = (
                            'PubSubMessageMatcher error: '
                            'expected attribute not found.')
                collected.append(msg)
            else:
                if isinstance(msg.data, bytes):
                    msg.data = msg.data.decode('utf-8')
                collected.append(msg.data)

        ack_ids = [rm.ack_id for rm in response.received_messages]
        if ack_ids:
            sub_client.acknowledge(self.sub_name, ack_ids)
        if len(collected) >= expected_num:
            break
        time.sleep(1)

    if time.time() - began > timeout:
        logging.error(
            'Timeout after %d sec. Received %d messages from %s.',
            timeout,
            len(collected),
            self.sub_name)
    return collected
def _wait_for_messages(self, expected_num, timeout):
    """Pull from ``self.sub_name`` until enough messages arrive or time is up.

    Args:
      expected_num: number of collected messages at which polling stops.
      timeout: polling budget in seconds, measured with ``time.time()``.

    Returns:
      The collected items: raw message data when ``self.with_attributes`` is
      false, otherwise the parsed messages with ``self.strip_attributes``
      keys removed.
    """
    received_so_far = []
    sub_client = pubsub.SubscriberClient()
    t0 = time.time()

    while time.time() - t0 <= timeout:
        response = sub_client.pull(
            self.sub_name,
            max_messages=MAX_MESSAGES_IN_ONE_PULL,
            return_immediately=True)

        for rm in response.received_messages:
            msg = PubsubMessage._from_message(rm.message)
            if not self.with_attributes:
                received_so_far.append(msg.data)
                continue

            for attr in self.strip_attributes or ():
                try:
                    del msg.attributes[attr]
                except KeyError:
                    # Mark the gap so the later comparison reports it.
                    msg.attributes[attr] = (
                        'PubSubMessageMatcher error: '
                        'expected attribute not found.')
            received_so_far.append(msg)

        ack_ids = [rm.ack_id for rm in response.received_messages]
        if ack_ids:
            sub_client.acknowledge(self.sub_name, ack_ids)
        if len(received_so_far) >= expected_num:
            break
        time.sleep(1)

    if time.time() - t0 > timeout:
        logging.error(
            'Timeout after %d sec. Received %d messages from %s.',
            timeout,
            len(received_so_far),
            self.sub_name)
    return received_so_far
class PubSubIntegrationTest(unittest.TestCase):
    """Integration test that streams messages through a Pub/Sub pipeline."""

    ID_LABEL = 'id'
    TIMESTAMP_ATTRIBUTE = 'timestamp'
    INPUT_MESSAGES = [
        # Three messages share one ID_LABEL value so they can be deduplicated.
        PubsubMessage('data001', {ID_LABEL: 'foo'}),
        PubsubMessage('data001', {ID_LABEL: 'foo'}),
        PubsubMessage('data001', {ID_LABEL: 'foo'}),
        # For elements carrying TIMESTAMP_ATTRIBUTE, the IT pipeline writes
        # the element timestamp (as reported by Beam) back out as a
        # TIMESTAMP_ATTRIBUTE + '_out' attribute.
        PubsubMessage(
            'data002',
            {
                TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
            }),
    ]
    EXPECTED_OUTPUT_MESSAGES = [
        PubsubMessage('data001-seen', {'processed': 'IT'}),
        PubsubMessage(
            'data002-seen',
            {
                TIMESTAMP_ATTRIBUTE + '_out': '2018-07-11T02:02:50.149000Z',
                'processed': 'IT',
            }),
    ]

    def setUp(self):
        """Create uniquely named input/output topics and subscriptions."""
        self.test_pipeline = TestPipeline(is_integration_test=True)
        self.project = self.test_pipeline.get_option('project')
        self.uuid = str(uuid.uuid4())

        # Set up PubSub environment.
        from google.cloud import pubsub
        self.pubsub_client = pubsub.Client(project=self.project)
        self.input_topic = self.pubsub_client.topic(INPUT_TOPIC + self.uuid)
        self.output_topic = self.pubsub_client.topic(OUTPUT_TOPIC + self.uuid)
        self.input_sub = self.input_topic.subscription(INPUT_SUB + self.uuid)
        self.output_sub = self.output_topic.subscription(
            OUTPUT_SUB + self.uuid)

        # Topics are created, and waited for, before their subscriptions.
        topics = [self.input_topic, self.output_topic]
        for topic in topics:
            topic.create()
        test_utils.wait_for_topics_created(topics)
        for subscription in (self.input_sub, self.output_sub):
            subscription.create()

    def tearDown(self):
        """Remove the subscriptions first, then the topics."""
        test_utils.cleanup_subscriptions([self.input_sub, self.output_sub])
        test_utils.cleanup_topics([self.input_topic, self.output_topic])

    def _test_streaming(self, with_attributes):
        """Runs IT pipeline with message verifier.

        Args:
          with_attributes: False - Reads and writes message data only.
            True - Reads and writes message data and attributes. Also
            verifies id_label and timestamp_attribute features.
        """
        # Verifiers: the pipeline state and the messages on the output
        # subscription.
        state_verifier = PipelineStateMatcher(PipelineState.RUNNING)
        if with_attributes:
            expected_messages = self.EXPECTED_OUTPUT_MESSAGES
        else:
            expected_messages = [
                pubsub_msg.data for pubsub_msg in self.EXPECTED_OUTPUT_MESSAGES
            ]
        pubsub_msg_verifier = PubSubMessageMatcher(
            self.project,
            OUTPUT_SUB + self.uuid,
            expected_messages,
            timeout=MESSAGE_MATCHER_TIMEOUT_S,
            with_attributes=with_attributes,
            strip_attributes=[self.ID_LABEL, self.TIMESTAMP_ATTRIBUTE])

        # Extra pipeline options used only by this test.
        extra_opts = {
            'input_subscription': self.input_sub.full_name,
            'output_topic': self.output_topic.full_name,
            'wait_until_finish_duration': TEST_PIPELINE_DURATION_MS,
            'on_success_matcher': all_of(state_verifier, pubsub_msg_verifier),
        }

        # Publish the input data once the input subscription exists.
        test_utils.wait_for_subscriptions_created([self.input_sub])
        for msg in self.INPUT_MESSAGES:
            self.input_topic.publish(msg.data, **msg.attributes)

        # Build the pipeline arguments from --test-pipeline-options plus the
        # extras above, then start the job through the pipeline's main
        # function.
        pipeline_args = self.test_pipeline.get_full_options_as_args(
            **extra_opts)
        pubsub_it_pipeline.run_pipeline(
            argv=pipeline_args,
            with_attributes=with_attributes,
            id_label=self.ID_LABEL,
            timestamp_attribute=self.TIMESTAMP_ATTRIBUTE)

    @attr('IT')
    def test_streaming_data_only(self):
        self._test_streaming(with_attributes=False)

    @attr('IT')
    def test_streaming_with_attributes(self):
        self._test_streaming(with_attributes=True)
def test_payload_valid(self):
    """Constructing a message with data and/or attributes must not raise."""
    valid_payloads = (
        ('', None),
        ('data', None),
        (None, {'k': 'v'}),
    )
    for data, attributes in valid_payloads:
        _ = PubsubMessage(data, attributes)
def _get_element(message):
    """Return the raw data, or the parsed message if attributes are wanted."""
    if not self.source.with_attributes:
        return message.data
    return PubsubMessage._from_message(message)
def test_payload_invalid(self):
    """A message with neither data nor attributes is rejected."""
    error_pattern = r'data.*attributes.*must be set'
    for empty_attributes in (None, {}):
        with self.assertRaisesRegex(ValueError, error_pattern):
            _ = PubsubMessage(None, empty_attributes)
def test_payload_publish_invalid(self):
    """Serializing for publish rejects messages that exceed the size limits.

    Each case builds a message and expects ``_to_proto_str(for_publish=True)``
    to raise ``ValueError`` with the paired pattern.
    """
    def oversized_data():
        return PubsubMessage(b'0' * 1024 * 1024 * 11, None)

    def long_attribute_key():
        return PubsubMessage(b'0', {'0' * 257: '0'})

    def long_attribute_value():
        return PubsubMessage(b'0', {'0' * 100: '0' * 1025})

    def too_many_attributes():
        attributes = {str(i): str(i) for i in range(0, 101)}
        return PubsubMessage(b'0', attributes)

    def long_ordering_key():
        return PubsubMessage(b'0', None, ordering_key='0' * 1301)

    # Messages are built inside the assertion context, one case at a time.
    cases = [
        (r'data field.*10MB', oversized_data),
        ('attribute key', long_attribute_key),
        ('attribute value', long_attribute_value),
        ('100 attributes', too_many_attributes),
        ('ordering key', long_ordering_key),
    ]
    for error_pattern, build_message in cases:
        with self.assertRaisesRegex(ValueError, error_pattern):
            msg = build_message()
            msg._to_proto_str(for_publish=True)
class PubSubIntegrationTest(unittest.TestCase):
    """Integration test streaming messages through a Pub/Sub pipeline.

    Input and expected output data are keyed by the test runner's class name.
    """

    ID_LABEL = 'id'
    TIMESTAMP_ATTRIBUTE = 'timestamp'
    INPUT_MESSAGES = {
        # TODO(BEAM-4275): DirectRunner doesn't support reading or writing
        # label_ids, nor writing timestamp attributes. Once these features
        # exist, TestDirectRunner and TestDataflowRunner should behave
        # identically.
        'TestDirectRunner': [
            PubsubMessage(b'data001', {}),
            # For elements carrying TIMESTAMP_ATTRIBUTE, the IT pipeline
            # writes the element timestamp (as reported by Beam) back out as
            # a TIMESTAMP_ATTRIBUTE + '_out' attribute.
            PubsubMessage(
                b'data002',
                {
                    TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
                }),
            PubsubMessage(b'data003\xab\xac', {}),
            PubsubMessage(
                b'data004\xab\xac',
                {
                    TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
                }),
        ],
        'TestDataflowRunner': [
            # Messages sharing an ID_LABEL value are deduplicated.
            PubsubMessage(b'data001', {ID_LABEL: 'foo'}),
            PubsubMessage(b'data001', {ID_LABEL: 'foo'}),
            PubsubMessage(b'data001', {ID_LABEL: 'foo'}),
            # For elements carrying TIMESTAMP_ATTRIBUTE, the IT pipeline
            # writes the element timestamp (as reported by Beam) back out as
            # a TIMESTAMP_ATTRIBUTE + '_out' attribute.
            PubsubMessage(
                b'data002',
                {
                    TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
                }),
            PubsubMessage(b'data003\xab\xac', {ID_LABEL: 'foo2'}),
            PubsubMessage(b'data003\xab\xac', {ID_LABEL: 'foo2'}),
            PubsubMessage(b'data003\xab\xac', {ID_LABEL: 'foo2'}),
            PubsubMessage(
                b'data004\xab\xac',
                {
                    TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
                }),
        ],
    }
    EXPECTED_OUTPUT_MESSAGES = {
        'TestDirectRunner': [
            PubsubMessage(b'data001-seen', {'processed': 'IT'}),
            PubsubMessage(
                b'data002-seen',
                {
                    TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
                    TIMESTAMP_ATTRIBUTE + '_out': '2018-07-11T02:02:50.149000Z',
                    'processed': 'IT',
                }),
            PubsubMessage(b'data003\xab\xac-seen', {'processed': 'IT'}),
            PubsubMessage(
                b'data004\xab\xac-seen',
                {
                    TIMESTAMP_ATTRIBUTE: '2018-07-11T02:02:50.149000Z',
                    TIMESTAMP_ATTRIBUTE + '_out': '2018-07-11T02:02:50.149000Z',
                    'processed': 'IT',
                }),
        ],
        'TestDataflowRunner': [
            PubsubMessage(b'data001-seen', {'processed': 'IT'}),
            PubsubMessage(
                b'data002-seen',
                {
                    TIMESTAMP_ATTRIBUTE + '_out': '2018-07-11T02:02:50.149000Z',
                    'processed': 'IT',
                }),
            PubsubMessage(b'data003\xab\xac-seen', {'processed': 'IT'}),
            PubsubMessage(
                b'data004\xab\xac-seen',
                {
                    TIMESTAMP_ATTRIBUTE + '_out': '2018-07-11T02:02:50.149000Z',
                    'processed': 'IT',
                }),
        ],
    }

    def setUp(self):
        """Create uniquely named input/output topics and subscriptions."""
        self.test_pipeline = TestPipeline(is_integration_test=True)
        self.runner_name = type(self.test_pipeline.runner).__name__
        self.project = self.test_pipeline.get_option('project')
        self.uuid = str(uuid.uuid4())

        # Set up PubSub environment.
        from google.cloud import pubsub
        self.pub_client = pubsub.PublisherClient()
        input_topic_path = self.pub_client.topic_path(
            self.project, INPUT_TOPIC + self.uuid)
        self.input_topic = self.pub_client.create_topic(input_topic_path)
        output_topic_path = self.pub_client.topic_path(
            self.project, OUTPUT_TOPIC + self.uuid)
        self.output_topic = self.pub_client.create_topic(output_topic_path)

        self.sub_client = pubsub.SubscriberClient()
        input_sub_path = self.sub_client.subscription_path(
            self.project, INPUT_SUB + self.uuid)
        self.input_sub = self.sub_client.create_subscription(
            input_sub_path, self.input_topic.name)
        output_sub_path = self.sub_client.subscription_path(
            self.project, OUTPUT_SUB + self.uuid)
        self.output_sub = self.sub_client.create_subscription(
            output_sub_path, self.output_topic.name)

    def tearDown(self):
        """Remove the subscriptions first, then the topics."""
        test_utils.cleanup_subscriptions(
            self.sub_client, [self.input_sub, self.output_sub])
        test_utils.cleanup_topics(
            self.pub_client, [self.input_topic, self.output_topic])

    def _test_streaming(self, with_attributes):
        """Runs IT pipeline with message verifier.

        Args:
          with_attributes: False - Reads and writes message data only.
            True - Reads and writes message data and attributes. Also
            verifies id_label and timestamp_attribute features.
        """
        # The on_success_matcher verifies pipeline state and pubsub output;
        # these verifications run on a (remote) worker. The expected state is
        # RUNNING because a streaming pipeline is usually never DONE; the
        # test runner cancels the pipeline after verification.
        state_verifier = PipelineStateMatcher(PipelineState.RUNNING)

        expected_messages = self.EXPECTED_OUTPUT_MESSAGES[self.runner_name]
        if not with_attributes:
            expected_messages = [
                pubsub_msg.data for pubsub_msg in expected_messages
            ]

        strip_attributes = (
            None if self.runner_name == 'TestDirectRunner'
            else [self.ID_LABEL, self.TIMESTAMP_ATTRIBUTE])
        pubsub_msg_verifier = PubSubMessageMatcher(
            self.project,
            self.output_sub.name,
            expected_messages,
            timeout=MESSAGE_MATCHER_TIMEOUT_S,
            with_attributes=with_attributes,
            strip_attributes=strip_attributes)

        extra_opts = {
            'input_subscription': self.input_sub.name,
            'output_topic': self.output_topic.name,
            'wait_until_finish_duration': TEST_PIPELINE_DURATION_MS,
            'on_success_matcher': all_of(state_verifier, pubsub_msg_verifier),
        }

        # Publish this runner's input data to the input topic.
        input_topic_name = self.input_topic.name
        for msg in self.INPUT_MESSAGES[self.runner_name]:
            self.pub_client.publish(
                input_topic_name, msg.data, **msg.attributes)

        # Build the pipeline arguments from --test-pipeline-options plus the
        # extras above, then start the job through the pipeline's main
        # function.
        pipeline_args = self.test_pipeline.get_full_options_as_args(
            **extra_opts)
        pubsub_it_pipeline.run_pipeline(
            argv=pipeline_args,
            with_attributes=with_attributes,
            id_label=self.ID_LABEL,
            timestamp_attribute=self.TIMESTAMP_ATTRIBUTE)

    @attr('IT')
    def test_streaming_data_only(self):
        self._test_streaming(with_attributes=False)

    @attr('IT')
    def test_streaming_with_attributes(self):
        self._test_streaming(with_attributes=True)