import argparse

import apache_beam as beam
from apache_beam.io import ReadStringsFromPubSub
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions


def run(argv=None):
  """Main entry point; defines and runs the blob-fetching pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument('--topic',
                      required=True,
                      help='Pub/Sub topic to read GCS change notifications '
                           'from.')
  parser.add_argument('--output',
                      required=True,
                      help='Output file to write results to.')
  args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFns in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  # Consume Pub/Sub events.
  print("Reading from pubsub topic: %s" % args.topic)
  lines = p | 'read_from_pubsub' >> ReadStringsFromPubSub(topic=args.topic)

  # Fetch the blob for each event. Note that the result is never written to
  # --output; the blobs PCollection is currently unused.
  # pylint: disable=unused-variable
  blobs = lines | 'fetch_blob' >> beam.ParDo(BlobFetcher())

  # Run the pipeline.
  result = p.run()
  print("pipeline started, waiting for completion...")
  result.wait_until_finish()
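# BlobFetcher is referenced above but not defined in this snippet. Below is a
# minimal sketch of what such a DoFn could look like, assuming GCS change
# notifications arrive as JSON payloads carrying 'bucket' and 'name' fields;
# it is an illustration, not the original implementation.
import json

import apache_beam as beam


class BlobFetcher(beam.DoFn):
  """Fetches the GCS object named in a change-notification message."""

  def process(self, element):
    # Import inside process() so remote workers resolve the dependency.
    from google.cloud import storage

    event = json.loads(element)
    client = storage.Client()
    bucket = client.bucket(event['bucket'])
    blob = bucket.blob(event['name'])
    # download_as_string() returns the object contents as bytes.
    yield blob.download_as_string()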
def test_expand(self):
  p = TestPipeline()
  pdone = (p
           | ReadStringsFromPubSub('projects/fakeprj/topics/baz')
           | WriteStringsToPubSub('projects/fakeprj/topics/a_topic'))

  # Ensure that the properties passed through correctly
  self.assertEqual('a_topic', pdone.producer.transform.dofn.topic_name)
def test_expand_with_subscription(self):
  p = TestPipeline()
  pcoll = p | ReadStringsFromPubSub(
      None, 'projects/fakeprj/subscriptions/a_subscription', 'a_label')
  # Ensure that the output type is str
  self.assertEqual(unicode, pcoll.element_type)

  # Ensure that the properties passed through correctly
  source = pcoll.producer.transform._source
  self.assertEqual('a_subscription', source.subscription_name)
  self.assertEqual('a_label', source.id_label)
def test_expand(self):
  p = TestPipeline()
  pdone = p | ReadStringsFromPubSub('baz') | WriteStringsToPubSub('a_topic')

  # Ensure that the properties passed through correctly
  sink = pdone.producer.transform.sink
  self.assertEqual('a_topic', sink.topic)

  # Ensure that the type on the intermediate payload transformer output
  # PCollection is bytes
  write_pcoll = pdone.producer.inputs[0]
  self.assertEqual(bytes, write_pcoll.element_type)
def test_expand_with_subscription(self):
  p = TestPipeline()
  pcoll = p | ReadStringsFromPubSub(None, 'a_subscription', 'a_label')
  # Ensure that the output type is str
  self.assertEqual(unicode, pcoll.element_type)

  # Ensure that the type on the intermediate read output PCollection is bytes
  read_pcoll = pcoll.producer.inputs[0]
  self.assertEqual(bytes, read_pcoll.element_type)

  # Ensure that the properties passed through correctly
  source = read_pcoll.producer.transform.source
  self.assertEqual('a_subscription', source.subscription)
  self.assertEqual('a_label', source.id_label)
def test_read_strings_success(self, mock_pubsub):
  payload = u'🤷 ¯\\_(ツ)_/¯'
  payload_encoded = payload.encode('utf-8')
  data = [pubsub.message.Message(payload_encoded, None, None)]
  expected_data = [payload]

  mock_pubsub.Client = functools.partial(FakePubsubClient, data)
  mock_pubsub.subscription.AutoAck = FakeAutoAck

  p = TestPipeline()
  p.options.view_as(StandardOptions).streaming = True
  pcoll = (p
           | ReadStringsFromPubSub('projects/fakeprj/topics/a_topic',
                                   None, 'a_label'))
  assert_that(pcoll, equal_to(expected_data))
  p.run()
def test_read_strings_success(self, mock_pubsub):
  data = u'🤷 ¯\\_(ツ)_/¯'
  data_encoded = data.encode('utf-8')
  publish_time = '2018-03-12T13:37:01.234567Z'
  payloads = [create_client_message(data_encoded, None, None, publish_time)]
  expected_elements = [data]

  mock_pubsub.Client = functools.partial(FakePubsubClient, payloads)
  mock_pubsub.subscription.AutoAck = FakeAutoAck

  p = TestPipeline()
  p.options.view_as(StandardOptions).streaming = True
  pcoll = (p
           | ReadStringsFromPubSub('projects/fakeprj/topics/a_topic',
                                   None, 'a_label'))
  assert_that(pcoll, equal_to(expected_elements))
  p.run()
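# FakePubsubClient and FakeAutoAck are test doubles referenced above (and in
# the previous test) but not defined in this snippet. The sketch below is an
# assumed shape inferred purely from usage: the client is constructed via
# functools.partial with the canned messages, and FakeAutoAck replaces
# pubsub.subscription.AutoAck, a context manager that hands back ack-id to
# message mappings. Illustrative only, not the original definitions.
class FakePubsubClient(object):

  def __init__(self, messages, *unused_args, **unused_kwargs):
    # Canned messages that the reader will "pull" instead of real ones.
    self.messages = messages


class FakeAutoAck(object):

  def __init__(self, sub, **unused_kwargs):
    self.sub = sub

  def __enter__(self):
    # Assumption: the fake subscription exposes its client's canned messages;
    # hand them back keyed by synthetic ack ids.
    return dict(enumerate(self.sub.client.messages))

  def __exit__(self, exc_type, exc_value, traceback):
    pass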
def test_read_strings_success(self, mock_pubsub):
  data = u'🤷 ¯\\_(ツ)_/¯'
  data_encoded = data.encode('utf-8')
  ack_id = 'ack_id'
  pull_response = test_utils.create_pull_response(
      [test_utils.PullResponseMessage(data_encoded, ack_id=ack_id)])
  expected_elements = [data]
  mock_pubsub.return_value.pull.return_value = pull_response

  p = TestPipeline()
  p.options.view_as(StandardOptions).streaming = True
  pcoll = (p
           | ReadStringsFromPubSub('projects/fakeprj/topics/a_topic',
                                   None, None))
  assert_that(pcoll, equal_to(expected_elements))
  p.run()
  mock_pubsub.return_value.acknowledge.assert_has_calls(
      [mock.call(mock.ANY, [ack_id])])
def test_expand(self):
  p = TestPipeline()
  p.options.view_as(StandardOptions).streaming = True
  pcoll = (p
           | ReadStringsFromPubSub('projects/fakeprj/topics/baz')
           | WriteStringsToPubSub('projects/fakeprj/topics/a_topic')
           | beam.Map(lambda x: x))

  # Apply the necessary PTransformOverrides.
  overrides = _get_transform_overrides(p.options)
  p.replace_all(overrides)

  # Note that the direct output of ReadStringsFromPubSub will be replaced
  # by a PTransformOverride, so we use a no-op Map.
  write_transform = pcoll.producer.inputs[0].producer.transform

  # Ensure that the properties passed through correctly
  self.assertEqual('a_topic', write_transform.dofn.topic_name)
def test_expand_with_topic(self):
  p = TestPipeline()
  p.options.view_as(StandardOptions).streaming = True
  pcoll = (p
           | ReadStringsFromPubSub('projects/fakeprj/topics/a_topic',
                                   None, 'a_label')
           | beam.Map(lambda x: x))
  # Ensure that the output type is str.
  self.assertEqual(unicode, pcoll.element_type)

  # Apply the necessary PTransformOverrides.
  overrides = _get_transform_overrides(p.options)
  p.replace_all(overrides)

  # Note that the direct output of ReadStringsFromPubSub will be replaced
  # by a PTransformOverride, so we use a no-op Map.
  read_transform = pcoll.producer.inputs[0].producer.transform

  # Ensure that the properties passed through correctly
  source = read_transform._source
  self.assertEqual('a_topic', source.topic_name)
  self.assertEqual('a_label', source.id_label)
import argparse
import logging

import apache_beam as beam
from apache_beam.io import ReadStringsFromPubSub
from apache_beam.io import WriteToText
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions


def run(argv=None):
  """Main entry point; defines and runs the streaming log-parsing pipeline."""
  print("enter")
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://practice-00001/C2ImportCalEventSample.csv',
      help='Input file to process.')
  parser.add_argument('--output',
                      dest='output',
                      required=True,
                      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # Note: this assumes --input is of the form pubsub://<topic>; the gs://
  # default above would produce a malformed topic name here.
  topic = known_args.input[len('pubsub://'):]
  pipeline_args.append('--streaming')

  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  logger1 = logging.getLogger('testlogger')  # currently unused

  p = beam.Pipeline(options=pipeline_options)
  lines = p | 'read_from_pubsub' >> ReadStringsFromPubSub(topic=topic)
  timest = lines | 'gettimestamp' >> beam.ParDo(extracttimestamp())
  win = timest | 'createwindow' >> beam.WindowInto(
      beam.window.FixedWindows(60),
      trigger=beam.trigger.AfterProcessingTime(10),
      accumulation_mode=beam.trigger.AccumulationMode.DISCARDING)
  par = win | 'parsing' >> beam.ParDo(ParsingFn())
  # par | 'writetobq' >> beam.io.WriteToBigQuery(
  #     table='firewall_data', dataset='cybersecurity',
  #     project='practice-00001')
  par | 'write_to_file' >> WriteToText(known_args.output)

  result = p.run()
  print("waiting for pipeline to complete...")
  result.wait_until_finish()
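# extracttimestamp and ParsingFn are referenced above but not defined in this
# snippet. The sketches below are assumptions inferred from how they are
# applied (one stamps elements for windowing, one parses CSV lines); the
# field names and logic are placeholders, not the original code.
import time

import apache_beam as beam
from apache_beam.transforms.window import TimestampedValue


class extracttimestamp(beam.DoFn):
  """Assumed: assigns an event timestamp so FixedWindows can group elements."""

  def process(self, element):
    # Assumption: no timestamp field is known, so processing time is used.
    yield TimestampedValue(element, time.time())


class ParsingFn(beam.DoFn):
  """Assumed: splits a CSV line into fields for the downstream writes."""

  def process(self, element):
    fields = element.split(',')
    # The real schema is not in the snippet; emit a generic record.
    yield {'raw': element, 'num_fields': len(fields)}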
def test_expand_with_both_topic_and_subscription(self):
  with self.assertRaisesRegexp(
      ValueError, "Only one of topic or subscription should be provided."):
    ReadStringsFromPubSub('a_topic', 'a_subscription', 'a_label')
def test_expand_with_no_topic_or_subscription(self):
  with self.assertRaisesRegexp(
      ValueError, "Either a topic or subscription must be provided."):
    ReadStringsFromPubSub(None, None, 'a_label')
import argparse
import json

import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io import ReadStringsFromPubSub
from apache_beam.io import WriteToText
from apache_beam.io.gcp.bigquery import BigQueryDisposition
from apache_beam.io.gcp.bigquery import WriteToBigQuery
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions


def run(argv=None):
  """Main entry point; defines and runs the track-extraction pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      dest='input',
                      default='gs://dataflow-samples/shakespeare/kinglear.txt',
                      help='Input file to process.')
  parser.add_argument('--output',
                      dest='output',
                      required=True,
                      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  topic = None
  if known_args.input.startswith('pubsub://'):
    # A topic was specified as input, so consume from Pub/Sub in streaming
    # mode.
    topic = known_args.input[len('pubsub://'):]
    print(pipeline_args)
    pipeline_args.append('--streaming')

  # We use the save_main_session option because one or more DoFns in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  if topic:
    print("Reading from pubsub topic: %s" % topic)
    lines = p | 'read_from_pubsub' >> ReadStringsFromPubSub(topic=topic)
  else:
    # Otherwise read the text file[pattern] into a PCollection.
    print("Reading from file: %s" % known_args.input)
    lines = p | 'read_from_file' >> ReadFromText(known_args.input)

  # Extract the individual tracks from each JSON results payload.
  def extract_tracks(result):
    r = json.loads(result)
    print("processing a results payload with %d tracks..." % len(r['Tracks']))
    for track in r['Tracks']:
      yield track

  tracks = lines | 'split' >> beam.FlatMap(extract_tracks)

  # Format the tracks into a PCollection of JSON strings.
  output = tracks | 'format' >> beam.Map(json.dumps)

  # Write the output using a "Write" transform that has side effects.
  # pylint: disable=expression-not-assigned
  output | 'write_to_file' >> WriteToText(known_args.output)

  # Write the tracks to a BigQuery table.
  tracks | 'write_to_table' >> WriteToBigQuery(
      'alex-bigquery:simresults.tracks',
      write_disposition=BigQueryDisposition.WRITE_APPEND)

  result = p.run()
  print("waiting for pipeline to complete...")
  result.wait_until_finish()
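# A minimal local check of the extract_tracks logic above, using Beam's test
# utilities. The payload shape ({"Tracks": [...]}) is inferred from the code;
# the field contents are invented for illustration.
import json

import apache_beam as beam
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that
from apache_beam.testing.util import equal_to


def check_extract_tracks():
  payload = json.dumps({'Tracks': [{'id': 1}, {'id': 2}]})
  with TestPipeline() as p:
    tracks = (p
              | beam.Create([payload])
              | beam.FlatMap(lambda r: json.loads(r)['Tracks']))
    assert_that(tracks, equal_to([{'id': 1}, {'id': 2}]))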