Example #1
def run(argv=None):
    """Main entry point; defines and runs the wordcount pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--topic',
                        required=True,
                        help='Pub/Sub topic to read GCS change notifications from.')
    parser.add_argument('--output',
                        required=True,
                        help='Output file to write results to.')
    args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    # consume pubsub events
    print("Reading from pubsub topic: %s" % args.topic)
    lines = p | 'read_from_pubsub' >> ReadStringsFromPubSub(
        topic=args.topic)

    # fetch the blob for each event
    blobs = (lines | 'fetch_blob' >> beam.ParDo(BlobFetcher()))

    # run the pipeline
    result = p.run()
    print("pipeline started, waiting for completion...")
    result.wait_until_finish()
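
Example #1 relies on a BlobFetcher DoFn that is not shown in the snippet. A minimal sketch of what such a DoFn might look like, assuming each Pub/Sub message is a JSON GCS change notification carrying "bucket" and "name" fields and that the google-cloud-storage client library is available (both are assumptions, not taken from the original):

import json

import apache_beam as beam
from google.cloud import storage


class BlobFetcher(beam.DoFn):
    """Downloads the GCS object named in a change-notification message."""

    def start_bundle(self):
        # Create the storage client once per bundle rather than per element.
        self._client = storage.Client()

    def process(self, element):
        notification = json.loads(element)
        bucket = self._client.bucket(notification['bucket'])
        blob = bucket.blob(notification['name'])
        # Emit the object contents for downstream transforms to parse.
        yield blob.download_as_bytes()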
Example #2
    def test_expand(self):
        p = TestPipeline()
        pdone = (p
                 | ReadStringsFromPubSub('projects/fakeprj/topics/baz')
                 | WriteStringsToPubSub('projects/fakeprj/topics/a_topic'))

        # Ensure that the properties passed through correctly
        self.assertEqual('a_topic', pdone.producer.transform.dofn.topic_name)
Example #3
    def test_expand_with_subscription(self):
        p = TestPipeline()
        pcoll = p | ReadStringsFromPubSub(
            None, 'projects/fakeprj/subscriptions/a_subscription', 'a_label')
        # Ensure that the output type is str
        self.assertEqual(unicode, pcoll.element_type)

        # Ensure that the properties passed through correctly
        source = pcoll.producer.transform._source
        self.assertEqual('a_subscription', source.subscription_name)
        self.assertEqual('a_label', source.id_label)
Example #4
    def test_expand(self):
        p = TestPipeline()
        pdone = p | ReadStringsFromPubSub('baz') | WriteStringsToPubSub(
            'a_topic')

        # Ensure that the properties passed through correctly
        sink = pdone.producer.transform.sink
        self.assertEqual('a_topic', sink.topic)

        # Ensure that the type on the intermediate payload transformer output
        # PCollection is bytes
        write_pcoll = pdone.producer.inputs[0]
        self.assertEqual(bytes, write_pcoll.element_type)
Example #5
    def test_expand_with_subscription(self):
        p = TestPipeline()
        pcoll = p | ReadStringsFromPubSub(None, 'a_subscription', 'a_label')
        # Ensure that the output type is str
        self.assertEqual(unicode, pcoll.element_type)

        # Ensure that the type on the intermediate read output PCollection is bytes
        read_pcoll = pcoll.producer.inputs[0]
        self.assertEqual(bytes, read_pcoll.element_type)

        # Ensure that the properties passed through correctly
        source = read_pcoll.producer.transform.source
        self.assertEqual('a_subscription', source.subscription)
        self.assertEqual('a_label', source.id_label)
Example #6
    def test_read_strings_success(self, mock_pubsub):
        payload = u'🤷 ¯\\_(ツ)_/¯'
        payload_encoded = payload.encode('utf-8')
        data = [pubsub.message.Message(payload_encoded, None, None)]
        expected_data = [payload]

        mock_pubsub.Client = functools.partial(FakePubsubClient, data)
        mock_pubsub.subscription.AutoAck = FakeAutoAck

        p = TestPipeline()
        p.options.view_as(StandardOptions).streaming = True
        pcoll = (p
                 | ReadStringsFromPubSub('projects/fakeprj/topics/a_topic',
                                         None, 'a_label'))
        assert_that(pcoll, equal_to(expected_data))
        p.run()
Example #7
  def test_read_strings_success(self, mock_pubsub):
    data = u'🤷 ¯\\_(ツ)_/¯'
    data_encoded = data.encode('utf-8')
    publish_time = '2018-03-12T13:37:01.234567Z'
    payloads = [create_client_message(data_encoded, None, None, publish_time)]
    expected_elements = [data]

    mock_pubsub.Client = functools.partial(FakePubsubClient, payloads)
    mock_pubsub.subscription.AutoAck = FakeAutoAck

    p = TestPipeline()
    p.options.view_as(StandardOptions).streaming = True
    pcoll = (p
             | ReadStringsFromPubSub('projects/fakeprj/topics/a_topic',
                                     None, 'a_label'))
    assert_that(pcoll, equal_to(expected_elements))
    p.run()
Example #8
    def test_read_strings_success(self, mock_pubsub):
        data = u'🤷 ¯\\_(ツ)_/¯'
        data_encoded = data.encode('utf-8')
        ack_id = 'ack_id'
        pull_response = test_utils.create_pull_response(
            [test_utils.PullResponseMessage(data_encoded, ack_id=ack_id)])
        expected_elements = [data]
        mock_pubsub.return_value.pull.return_value = pull_response

        p = TestPipeline()
        p.options.view_as(StandardOptions).streaming = True
        pcoll = (p
                 | ReadStringsFromPubSub('projects/fakeprj/topics/a_topic',
                                         None, None))
        assert_that(pcoll, equal_to(expected_elements))
        p.run()
        mock_pubsub.return_value.acknowledge.assert_has_calls(
            [mock.call(mock.ANY, [ack_id])])
Example #9
    def test_expand(self):
        p = TestPipeline()
        p.options.view_as(StandardOptions).streaming = True
        pcoll = (p
                 | ReadStringsFromPubSub('projects/fakeprj/topics/baz')
                 | WriteStringsToPubSub('projects/fakeprj/topics/a_topic')
                 | beam.Map(lambda x: x))

        # Apply the necessary PTransformOverrides.
        overrides = _get_transform_overrides(p.options)
        p.replace_all(overrides)

        # Note that the direct output of ReadStringsFromPubSub will be replaced
        # by a PTransformOverride, so we use a no-op Map.
        write_transform = pcoll.producer.inputs[0].producer.transform

        # Ensure that the properties passed through correctly
        self.assertEqual('a_topic', write_transform.dofn.topic_name)
Example #10
    def test_expand_with_topic(self):
        p = TestPipeline()
        p.options.view_as(StandardOptions).streaming = True
        pcoll = (p
                 | ReadStringsFromPubSub('projects/fakeprj/topics/a_topic',
                                         None, 'a_label')
                 | beam.Map(lambda x: x))
        # Ensure that the output type is str.
        self.assertEqual(unicode, pcoll.element_type)

        # Apply the necessary PTransformOverrides.
        overrides = _get_transform_overrides(p.options)
        p.replace_all(overrides)

        # Note that the direct output of ReadStringsFromPubSub will be replaced
        # by a PTransformOverride, so we use a no-op Map.
        read_transform = pcoll.producer.inputs[0].producer.transform

        # Ensure that the properties passed through correctly
        source = read_transform._source
        self.assertEqual('a_topic', source.topic_name)
        self.assertEqual('a_label', source.id_label)
Example #11
def run(argv=None):
    """Main entry point; defines and runs the streaming firewall-log pipeline."""
    print("enter")
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://practice-00001/C2ImportCalEventSample.csv',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # Strip the 'pubsub://' prefix to get the Pub/Sub topic path.
    topic = known_args.input[len('pubsub://'):]
    pipeline_args.append('--streaming')
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    logger1 = logging.getLogger('testlogger')

    p = beam.Pipeline(options=pipeline_options)
    lines = p | 'read_from_pubsub' >> ReadStringsFromPubSub(topic=topic)
    timest = lines | 'gettimestamp' >> beam.ParDo(extracttimestamp())
    win = timest | 'createwindow' >> beam.WindowInto(
        beam.window.FixedWindows(60),
        trigger=beam.trigger.AfterProcessingTime(10),
        accumulation_mode=beam.trigger.AccumulationMode.DISCARDING)
    par = win | 'parsing' >> beam.ParDo(ParsingFn())
    #par | 'writetobq' >> beam.io.WriteToBigQuery(table='firewall_data',dataset='cybersecurity',project='practice-00001')
    par | 'write_to_file' >> WriteToText(known_args.output)

    result = p.run()
    print("waiting for pipeline to complete...")
    result.wait_until_finish()
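
Example #11 depends on extracttimestamp and ParsingFn DoFns defined elsewhere in its source file. A hedged sketch of what they might look like, assuming the messages are comma-separated firewall records whose first field is a UNIX timestamp (the field layout is an assumption, not taken from the original):

import apache_beam as beam
from apache_beam.transforms.window import TimestampedValue


class extracttimestamp(beam.DoFn):
    """Attaches an event-time timestamp parsed from the record itself."""

    def process(self, element):
        # Assumption: the first comma-separated field is a UNIX timestamp.
        event_time = float(element.split(',')[0])
        yield TimestampedValue(element, event_time)


class ParsingFn(beam.DoFn):
    """Splits a raw CSV record into a dict of named fields."""

    def process(self, element):
        fields = element.split(',')
        # Assumption: timestamp, source IP, destination IP, action columns.
        yield {
            'timestamp': fields[0],
            'src_ip': fields[1],
            'dst_ip': fields[2],
            'action': fields[3],
        }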
Example #12
 def test_expand_with_both_topic_and_subscription(self):
     with self.assertRaisesRegexp(
             ValueError,
             "Only one of topic or subscription should be provided."):
         ReadStringsFromPubSub('a_topic', 'a_subscription', 'a_label')
Example #13
 def test_expand_with_no_topic_or_subscription(self):
     with self.assertRaisesRegexp(
             ValueError,
             "Either a topic or subscription must be provided."):
         ReadStringsFromPubSub(None, None, 'a_label')
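
Examples #12 and #13 exercise the argument validation on ReadStringsFromPubSub: exactly one of topic or subscription must be supplied. A sketch of the kind of constructor check these tests imply, illustrative only and mirroring the asserted error messages rather than the actual Beam source:

def _validate_pubsub_args(topic, subscription):
    # Require exactly one of topic or subscription.
    if topic and subscription:
        raise ValueError(
            'Only one of topic or subscription should be provided.')
    if not (topic or subscription):
        raise ValueError('Either a topic or subscription must be provided.')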
Example #14
def run(argv=None):
    """Main entry point; defines and runs the wordcount pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        default='gs://dataflow-samples/shakespeare/kinglear.txt',
                        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    topic = None
    if known_args.input.startswith('pubsub://'):
        # if a topic was specified as input then consume from pubsub
        topic = known_args.input[len('pubsub://'):]
        print("Reading from pubsub topic: %s" % topic)
        print(pipeline_args)
        pipeline_args.append('--streaming')

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    if topic:
        # if a topic was specified as input then consume from pubsub
        print("Reading from pubsub topic: %s" % topic)
        lines = p | 'read_from_pubsub' >> ReadStringsFromPubSub(topic=topic)
    else:
        # otherwise read the text file[pattern] into a PCollection.
        print("Reading from file: %s" % known_args.input)
        lines = p | 'read_from_file' >> ReadFromText(known_args.input)

    # Extract the individual tracks from each JSON results payload.
    def extract_tracks(result):
        r = json.loads(result)
        print("processing a results payload with %d tracks..." %
              len(r['Tracks']))
        for track in r['Tracks']:
            yield track

    tracks = (lines | 'split' >> beam.FlatMap(extract_tracks))

    # Format the tracks into a PCollection of JSON strings.
    output = tracks | 'format' >> beam.Map(json.dumps)

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | 'write_to_file' >> WriteToText(known_args.output)

    # Write the tracks to a BigQuery table.
    tracks | 'write_to_table' >> WriteToBigQuery(
        'alex-bigquery:simresults.tracks',
        write_disposition=BigQueryDisposition.WRITE_APPEND)

    result = p.run()

    print("waiting for pipeline to complete...")
    result.wait_until_finish()
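
The run() entry points above are normally driven by command-line arguments. A hypothetical way to invoke Example #14 directly from Python, with placeholder project, topic, and bucket names (the hardcoded BigQuery table would also need to exist for the pipeline to run to completion):

# Hypothetical direct invocation with explicit arguments; all names below
# are placeholders, not values from the original example.
run(argv=[
    '--input', 'pubsub://projects/my-project/topics/sim-results',
    '--output', 'gs://my-bucket/output/tracks',
])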