Esempio n. 1
0
class FilterTextFn(df.DoFn):
    """A DoFn that emits only elements whose key matches a regular expression."""

    # Aggregators surface running counts in the Dataflow Monitoring UI when
    # the pipeline executes on the Dataflow service.  These two track how many
    # words matched and how many did not.  Learn more at
    # https://cloud.google.com/dataflow/pipelines/dataflow-monitoring-intf
    # NOTE(review): 'umatched_words' (sic) is kept unchanged — the attribute
    # name and the aggregator label are visible to callers and to the
    # monitoring UI, so correcting the typo would change the interface.
    matched_words = df.Aggregator('matched_words')
    umatched_words = df.Aggregator('umatched_words')

    def __init__(self, pattern):
        """Stores the regular expression used to filter keys."""
        super(FilterTextFn, self).__init__()
        self.pattern = pattern

    def process(self, context):
        """Yields context.element only when its key matches the pattern."""
        word, _ = context.element
        is_match = re.match(self.pattern, word) is not None
        if is_match:
            # INFO-level log lines are visible in the Cloud Logging UI when
            # this pipeline runs on the Dataflow service.
            logging.info('Matched %s', word)
            context.aggregate_to(self.matched_words, 1)
            yield context.element
        else:
            # DEBUG-level messages give quieter, filterable output; note that
            # only INFO and higher are forwarded to the Cloud Logger, so this
            # line will not appear there.
            logging.debug('Did not match %s', word)
            context.aggregate_to(self.umatched_words, 1)
Esempio n. 2
0
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""A word-counting workflow."""

from __future__ import absolute_import

import argparse
import logging
import re

import google.cloud.dataflow as df


empty_line_aggregator = df.Aggregator('emptyLines')


class WordExtractingDoFn(df.DoFn):
  """Parse each line of input text into words."""

  def process(self, context):
    """Returns an iterator over the words of this element.

    The element is a line of text.  If the line is blank, note that, too.

    Args:
      context: the call-specific context: data and aggregator.

    Returns:
      The processed element.
Esempio n. 3
0
from __future__ import absolute_import

import argparse
import logging
import re
import google.cloud.dataflow as df

# Counts blank input lines; surfaced as 'emptyLines' in the monitoring UI.
empty_line_aggregator = df.Aggregator('emptyLines')
# Tracks the mean word length across the pipeline, combining per-word
# lengths with a MeanCombineFn and reporting the result as a float.
average_word_size_aggregator = df.Aggregator('averageWordLength',
                                             df.combiners.MeanCombineFn(),
                                             float)


class WordExtractingDoFn(df.DoFn):
    """Splits each line of input text into words, updating the aggregators."""

    def process(self, context):
        """Returns the list of words found in context.element (a text line).

        Blank lines are counted via empty_line_aggregator; every extracted
        word contributes its length to average_word_size_aggregator.
        """
        stripped = context.element.strip()
        if not stripped:
            context.aggregate_to(empty_line_aggregator, 1)
        extracted = re.findall(r'[A-Za-z\']+', stripped)
        for token in extracted:
            context.aggregate_to(average_word_size_aggregator, len(token))
        return extracted


def run(argv=None):

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',