def run(argv=None, comments=None):
    """Run the beam pipeline.

    Args:
        argv: (optional) the command line flags to parse.
        comments_collection: (optional) a list of comment JSON objects to
            process. Used in unit-tests to avoid requiring a BigQuery source.
    """
    args, pipeline_args = _parse_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    if comments is not None:
        comments = p | ("Read in-memory comments") >> beam.Create(comments)
    else:
        comments = p | ("Read " + args.reddit_table) >> Read(
            BigQuerySource(args.reddit_table))

    comments |= (
        "Normalise comments" >> beam.Map(
            partial(normalise_comment, max_length=args.max_length)))

    thread_id_to_comments = comments | (
        "Key by thread id" >> beam.Map(
            lambda comment: (comment.thread_id, comment)))
    threads = thread_id_to_comments | (
        "Group comments by thread ID" >> beam.GroupByKey())
    threads = threads | ("Get threads" >> beam.Map(lambda t: t[1]))

    examples = threads | (
        "Create {} examples".format(args.dataset_format) >> beam.FlatMap(
            partial(create_examples,
                    parent_depth=args.parent_depth,
                    min_length=args.min_length,
                    format=args.dataset_format,
                    )))
    examples = _shuffle(examples)

    # [START dataflow_molecules_split_to_train_and_eval_datasets]
    # Split the dataset into a training set and an evaluation set
    eval_percent = 100 - args.train_split * 100
    assert 0 < eval_percent < 100, 'eval_percent must be in the range (0, 100)'
    train_dataset, eval_dataset = (
        examples
        | 'Split dataset' >> beam.Partition(
            lambda elem, _: int(random.uniform(0, 100) < eval_percent), 2))
    # [END dataflow_molecules_split_to_train_and_eval_datasets]
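    # A deterministic alternative (a sketch, not part of the original script):
    # hash a stable string form of each element so the train/eval assignment is
    # reproducible across runs; it would require `import zlib` at module level.
    # train_dataset, eval_dataset = (
    #     examples
    #     | 'Split dataset deterministically' >> beam.Partition(
    #         lambda elem, _: int(
    #             zlib.crc32(str(elem).encode()) % 100 < eval_percent), 2))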

    if args.dataset_format == _JSON_FORMAT:
        write_sink = WriteToText
        file_name_suffix = ".json"
        serialize_fn = json.dumps
    else:
        # Only the JSON branch is shown in this snippet; fail loudly for any
        # other format rather than leaving serialize_fn unbound.
        raise ValueError(
            "Unsupported dataset format: {}".format(args.dataset_format))

    serialized_train_examples = train_dataset | (
        "serialize {} examples".format('train') >> beam.Map(serialize_fn))
    (
        serialized_train_examples | ("write " + 'train')
        >> write_sink(
            os.path.join(args.output_dir, 'train'),
            file_name_suffix=file_name_suffix,
            num_shards=args.num_shards_train,
        )
    )

    serialized_test_examples = eval_dataset | (
        "serialize {} examples".format('valid') >> beam.Map(serialize_fn))
    (
        serialized_test_examples | ("write " + 'valid')
        >> write_sink(
            os.path.join(args.output_dir, 'valid'),
            file_name_suffix=file_name_suffix,
            num_shards=args.num_shards_train,
        )
    )

    result = p.run()
    result.wait_until_finish()
Example #2
 def test_create_application_client(self):
     pipeline_options = PipelineOptions()
     apiclient.DataflowApplicationClient(pipeline_options)
Example #3
 def test_interpreter_version_check_passes_with_experiment(self):
     pipeline_options = PipelineOptions(
         ["--experiment=use_unsupported_python_version"])
     apiclient._verify_interpreter_version_is_supported(pipeline_options)
Example #4
import re
import argparse
import typing

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

parser = argparse.ArgumentParser()
parser.add_argument('--input_topic',
                    required=True,
                    help=('Input Pub/Sub topic to read from'))

parser.add_argument('--output_path',
                    required=True,
                    help=('Output path to Google Cloud Storage'))

path_args, pipeline_args = parser.parse_known_args()
options = PipelineOptions(pipeline_args)


class ParseApacheServerLog(beam.DoFn):
    def process(self, element):
        HOST = r'^(?P<host>.*?)'
        SPACE = r'\s'
        IDENTITY = r'\S+'
        USER = r'\S+'
        TIME = r'(?P<time>\[.*?\])'
        REQUEST = r'\"(?P<request>.*?)\"'
        STATUS = r'(?P<status>\d{3})'
        SIZE = r'(?P<size>\S+)'
        REGEX = (HOST + SPACE + IDENTITY + SPACE + USER + SPACE + TIME +
                 SPACE + REQUEST + SPACE + STATUS + SPACE + SIZE + SPACE)
        match = re.search(REGEX, element)
        # Emit the named groups of the parsed log line, if the line matched.
        if match:
            yield match.groupdict()
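
# A minimal wiring sketch (an assumption; the original snippet is truncated
# here). ReadFromPubSub needs the pipeline to run in streaming mode, and a
# windowed file sink such as beam.io.fileio.WriteToFiles would be needed to
# write to --output_path, so this sketch only prints the parsed dicts.
#
# with beam.Pipeline(options=options) as p:
#     (p
#      | 'ReadLogs' >> beam.io.ReadFromPubSub(topic=path_args.input_topic)
#      | 'DecodeBytes' >> beam.Map(lambda b: b.decode('utf-8'))
#      | 'ParseLog' >> beam.ParDo(ParseApacheServerLog())
#      | 'PrintParsed' >> beam.Map(print))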
def run(argv=None):
    """
    This funciton parses the command line arguments and runs the Beam Pipeline.

    Args:
        argv: list containing the commandline arguments for this call of the
         script.
    """
    schema_inferred = False
    data_args, pipeline_args = parse_data_generator_args(argv)
    data_args, schema_inferred = fetch_schema(data_args, schema_inferred)
    pipeline_options = PipelineOptions(pipeline_args)

    data_gen = DataGenerator(bq_schema_filename=data_args.schema_file,
                             input_bq_table=data_args.input_bq_table,
                             hist_bq_table=data_args.hist_bq_table,
                             p_null=data_args.p_null,
                             n_keys=data_args.n_keys,
                             min_date=data_args.min_date,
                             max_date=data_args.max_date,
                             only_pos=data_args.only_pos,
                             max_int=data_args.max_int,
                             max_float=data_args.max_float,
                             float_precision=data_args.float_precision,
                             write_disp=data_args.write_disp,
                             key_skew=data_args.key_skew)

    # Initiate the pipeline using the pipeline arguments passed in from the
    # command line. These include information such as where Dataflow should
    # store temp files, what the project ID is, and which runner to use.
    p = beam.Pipeline(options=pipeline_options)
    rows = (
        p
        | 'Read Histogram Table.' >> beam.io.Read(
            beam.io.BigQuerySource(data_gen.hist_bq_table))
        | 'Generate Data' >> beam.ParDo(FakeRowGen(data_gen))
        | 'Parse Json Strings' >> beam.FlatMap(lambda row: [json.loads(row)]))

    if data_args.primary_key_cols:
        rows |= EnforcePrimaryKeys(data_args.primary_key_cols)

    if data_args.csv_schema_order:
        (rows
         | 'Order fields for CSV writing.' >> beam.FlatMap(
             lambda d: [dict_to_csv(d, data_args.csv_schema_order.split(','))])
         | 'Write to GCS' >> beam.io.textio.WriteToText(
             file_path_prefix=data_args.output_prefix, file_name_suffix='.csv')
         )

    if data_args.avro_schema_file:
        fastavro_avsc = fastavro.schema.load_schema(data_args.avro_schema_file)

        (rows
         # Need to convert time stamps from strings to timestamp-micros
         | 'Fix date and time Types for Avro.' >>
         beam.FlatMap(lambda row: fix_record_for_avro(row, fastavro_avsc))
         | 'Write to Avro.' >> beam.io.avroio.WriteToAvro(
             file_path_prefix=data_args.output_prefix,
             codec='null',
             file_name_suffix='.avro',
             use_fastavro=True,
             schema=fastavro_avsc))

    if data_args.output_bq_table:
        (rows
         | 'Write to BigQuery.' >> beam.io.gcp.bigquery.WriteToBigQuery(
             # The table name is a required argument for the BigQuery sink.
             # In this case we use the value passed in from the command line.
             data_args.output_bq_table,
             schema=None if schema_inferred else data_gen.get_bq_schema(),
             # Creates the table in BigQuery if it does not yet exist.
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=data_gen.write_disp,
             # Use the max recommended batch size.
             batch_size=500))

    p.run().wait_until_finish()
Example #6
def del_unwanted_cols(data):
    """Delete the unwanted columns"""
    del data['cast']
    del data['date_added']
    del data['listed_in']
    del data['description']
    return data


if __name__ == '__main__':

    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args()

    p = beam.Pipeline(options=PipelineOptions(pipeline_args))

    (p | 'ReadData' >> beam.io.ReadFromText(
        'gs://nf-bucket-test/batch/netflix_titles.csv', skip_header_lines=1)
     | 'SplitData' >> beam.Map(lambda x: x.split(','))
     | 'FormatToDict' >> beam.Map(
         lambda x: {
             "show_id": x[0],
             "type": x[1],
             "title": x[2],
             "director": x[3],
             "country": x[4],
             "release_year": x[5],
             "rating": x[6],
             "duration": x[7]
         })
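     # Assumed continuation (the original snippet is truncated here): close the
     # chain by writing each record dict as a line of text to a hypothetical
     # output prefix, then run the pipeline. del_unwanted_cols above would only
     # apply to records that still carry the extra columns it deletes.
     | 'ToString' >> beam.Map(str)
     | 'WriteOutput' >> beam.io.WriteToText(
         'gs://nf-bucket-test/batch/output', file_name_suffix='.txt'))

    p.run().wait_until_finish()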
Example #7
        # If a header is not provided, assume the first line in a file
        # to be the header.
        skip_header_lines = 1 if column_names is None else 0

        _ = (p
             | 'ReadData' >> beam.io.textio.ReadFromText(
                 file_pattern=input_path, skip_header_lines=skip_header_lines)
             | 'DecodeData' >> csv_decoder.DecodeCSV(column_names=column_names)
             | 'GenerateStatistics' >> tfdv.GenerateStatistics()
             | 'WriteStatsOutput' >> beam.io.WriteToTFRecord(
                 output_path,
                 shard_name_template='',
                 coder=beam.coders.ProtoCoder(
                     statistics_pb2.DatasetFeatureStatisticsList)))


if __name__ == '__main__':
    # Build and run the pipeline.
    logging.getLogger().setLevel(logging.INFO)

    parser = argparse.ArgumentParser()
    parser.add_argument('--input_path', required=True)
    parser.add_argument('--output_path', required=True)

    flags, pipeline_args = parser.parse_known_args()

    pipeline_option = PipelineOptions(flags=pipeline_args)
    pipeline_option.view_as(SetupOptions).save_main_session = True

    run_pipeline(flags, pipeline_option)
Example #8
 def test_no_staging_location(self):
     with self.assertRaises(RuntimeError) as cm:
         self.stager.stage_job_resources(PipelineOptions(),
                                         staging_location=None)
     self.assertEqual('The staging_location must be specified.',
                      cm.exception.args[0])
Example #9
    def test_with_extra_packages(self):
        staging_dir = self.make_temp_dir()
        source_dir = self.make_temp_dir()
        self.create_temp_file(os.path.join(source_dir, 'abc.tar.gz'),
                              'nothing')
        self.create_temp_file(os.path.join(source_dir, 'xyz.tar.gz'),
                              'nothing')
        self.create_temp_file(os.path.join(source_dir, 'xyz2.tar'), 'nothing')
        self.create_temp_file(os.path.join(source_dir, 'whl.whl'), 'nothing')
        self.create_temp_file(
            os.path.join(source_dir, stager.EXTRA_PACKAGES_FILE), 'nothing')

        options = PipelineOptions()
        self.update_options(options)
        options.view_as(SetupOptions).extra_packages = [
            os.path.join(source_dir, 'abc.tar.gz'),
            os.path.join(source_dir, 'xyz.tar.gz'),
            os.path.join(source_dir, 'xyz2.tar'),
            os.path.join(source_dir, 'whl.whl'),
            '/tmp/remote/remote_file.tar.gz'
        ]

        remote_copied_files = []

        # We cannot rely on actual remote file system paths, hence we treat
        # '/tmp/remote/' as a new remote path.
        def is_remote_path(path):
            return path.startswith('/tmp/remote/')

        def file_copy(from_path, to_path):
            if is_remote_path(from_path):
                remote_copied_files.append(from_path)
                _, from_name = os.path.split(from_path)
                if os.path.isdir(to_path):
                    to_path = os.path.join(to_path, from_name)
                self.create_temp_file(to_path, 'nothing')
                logging.info('Fake copied remote file: %s to %s', from_path,
                             to_path)
            elif is_remote_path(to_path):
                logging.info('Faking upload_file(%s, %s)', from_path, to_path)
            else:
                shutil.copyfile(from_path, to_path)

        with mock.patch(
                'apache_beam.runners.portability.stager_test'
                '.stager.Stager._download_file', staticmethod(file_copy)):
            with mock.patch(
                    'apache_beam.runners.portability.stager_test'
                    '.stager.Stager._is_remote_path',
                    staticmethod(is_remote_path)):
                self.assertEqual([
                    'abc.tar.gz', 'xyz.tar.gz', 'xyz2.tar', 'whl.whl',
                    'remote_file.tar.gz', stager.EXTRA_PACKAGES_FILE
                ],
                                 self.stager.stage_job_resources(
                                     options, staging_location=staging_dir))
        with open(os.path.join(staging_dir, stager.EXTRA_PACKAGES_FILE)) as f:
            self.assertEqual([
                'abc.tar.gz\n', 'xyz.tar.gz\n', 'xyz2.tar\n', 'whl.whl\n',
                'remote_file.tar.gz\n'
            ], f.readlines())
        self.assertEqual(['/tmp/remote/remote_file.tar.gz'],
                         remote_copied_files)
Example #10
def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the hourly_team_score pipeline."""
    parser = argparse.ArgumentParser()

    parser.add_argument('--topic', type=str, help='Pub/Sub topic to read from')
    parser.add_argument('--subscription',
                        type=str,
                        help='Pub/Sub subscription to read from')
    parser.add_argument('--dataset',
                        type=str,
                        required=True,
                        help='BigQuery Dataset to write tables to. '
                        'Must already exist.')
    parser.add_argument(
        '--table_name',
        type=str,
        default='game_stats',
        help='The BigQuery table name. Should not already exist.')
    parser.add_argument('--fixed_window_duration',
                        type=int,
                        default=60,
                        help='Numeric value of fixed window duration for user '
                        'analysis, in minutes')
    parser.add_argument('--session_gap',
                        type=int,
                        default=5,
                        help='Numeric value of gap between user sessions, '
                        'in minutes')
    parser.add_argument(
        '--user_activity_window_duration',
        type=int,
        default=30,
        help='Numeric value of fixed window for finding mean of '
        'user session duration, in minutes')

    args, pipeline_args = parser.parse_known_args(argv)

    if args.topic is None and args.subscription is None:
        parser.print_usage()
        print(sys.argv[0] +
              ': error: one of --topic or --subscription is required')
        sys.exit(1)

    options = PipelineOptions(pipeline_args)

    # We also require the --project option to access --dataset
    if options.view_as(GoogleCloudOptions).project is None:
        parser.print_usage()
        print(sys.argv[0] + ': error: argument --project is required')
        sys.exit(1)

    fixed_window_duration = args.fixed_window_duration * 60
    session_gap = args.session_gap * 60
    user_activity_window_duration = args.user_activity_window_duration * 60

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    options.view_as(SetupOptions).save_main_session = save_main_session

    # Enforce that this pipeline is always run in streaming mode
    options.view_as(StandardOptions).streaming = True

    with beam.Pipeline(options=options) as p:
        # Read game events from Pub/Sub using custom timestamps, which
        # are extracted from the data elements, and parse the data.
        if args.subscription:
            scores = p | 'ReadPubSub' >> beam.io.ReadFromPubSub(
                subscription=args.subscription)
        else:
            scores = p | 'ReadPubSub' >> beam.io.ReadFromPubSub(
                topic=args.topic)
        raw_events = (scores
                      | 'DecodeString' >> beam.Map(lambda b: b.decode('utf-8'))
                      | 'ParseGameEventFn' >> beam.ParDo(ParseGameEventFn())
                      | 'AddEventTimestamps' >>
                      beam.Map(lambda elem: beam.window.TimestampedValue(
                          elem, elem['timestamp'])))

        # Extract username/score pairs from the event stream
        user_events = (raw_events
                       | 'ExtractUserScores' >>
                       beam.Map(lambda elem: (elem['user'], elem['score'])))

        # Calculate the total score per user over fixed windows, and cumulative
        # updates for late data
        spammers_view = (
            user_events
            | 'UserFixedWindows' >> beam.WindowInto(
                beam.window.FixedWindows(fixed_window_duration))

            # Filter out everyone but those with (SCORE_WEIGHT * avg) clickrate.
            # These might be robots/spammers.
            | 'CalculateSpammyUsers' >> CalculateSpammyUsers()

            # Derive a view from the collection of spammer users. It will be used as
            # a side input in calculating the team score sums, below
            | 'CreateSpammersView' >> beam.CombineGlobally(
                beam.combiners.ToDictCombineFn()).as_singleton_view())

        # [START filter_and_calc]
        # Calculate the total score per team over fixed windows, and emit cumulative
        # updates for late data. Uses the side input derived above --the set of
        # suspected robots-- to filter out scores from those users from the sum.
        # Write the results to BigQuery.
        (  # pylint: disable=expression-not-assigned
            raw_events
            | 'WindowIntoFixedWindows' >> beam.WindowInto(
                beam.window.FixedWindows(fixed_window_duration))

            # Filter out the detected spammer users, using the side input derived
            # above
            | 'FilterOutSpammers' >> beam.Filter(
                lambda elem, spammers: elem['user'] not in spammers,
                spammers_view)
            # Extract and sum teamname/score pairs from the event data.
            | 'ExtractAndSumScore' >> ExtractAndSumScore('team')
            # [END filter_and_calc]
            | 'TeamScoresDict' >> beam.ParDo(TeamScoresDict())
            | 'WriteTeamScoreSums' >> WriteToBigQuery(
                args.table_name + '_teams', args.dataset, {
                    'team': 'STRING',
                    'total_score': 'INTEGER',
                    'window_start': 'STRING',
                    'processing_time': 'STRING',
                },
                options.view_as(GoogleCloudOptions).project))

        # [START session_calc]
        # Detect user sessions-- that is, a burst of activity separated by a gap
        # from further activity. Find and record the mean session lengths.
        # This information could help the game designers track the changing user
        # engagement as their set of games changes.
        (  # pylint: disable=expression-not-assigned
            user_events
            | 'WindowIntoSessions' >> beam.WindowInto(
                beam.window.Sessions(session_gap),
                timestamp_combiner=beam.window.TimestampCombiner.OUTPUT_AT_EOW)

            # For this use, we care only about the existence of the session, not any
            # particular information aggregated over it, so we can just group by key
            # and assign a "dummy value" of None.
            | beam.CombinePerKey(lambda _: None)

            # Get the duration of the session
            | 'UserSessionActivity' >> beam.ParDo(UserSessionActivity())
            # [END session_calc]

            # [START rewindow]
            # Re-window to process groups of session sums according to when the
            # sessions complete
            | 'WindowToExtractSessionMean' >> beam.WindowInto(
                beam.window.FixedWindows(user_activity_window_duration))

            # Find the mean session duration in each window
            | beam.CombineGlobally(
                beam.combiners.MeanCombineFn()).without_defaults()
            | 'FormatAvgSessionLength' >>
            beam.Map(lambda elem: {'mean_duration': float(elem)})
            | 'WriteAvgSessionLength' >> WriteToBigQuery(
                args.table_name + '_sessions', args.dataset, {
                    'mean_duration': 'FLOAT',
                },
                options.view_as(GoogleCloudOptions).project))
def run(argv=None, save_main_session=True):
    """Build and run the pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--output_topic',
                        required=True,
                        help=('Output PubSub topic of the form '
                              '"projects/<PROJECT>/topic/<TOPIC>".'))
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--input_topic',
                       help=('Input PubSub topic of the form '
                             '"projects/<PROJECT>/topics/<TOPIC>".'))
    group.add_argument(
        '--input_subscription',
        help=('Input PubSub subscription of the form '
              '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."'))
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session
    pipeline_options.view_as(StandardOptions).streaming = True
    with beam.Pipeline(options=pipeline_options) as p:

        # Read from PubSub into a PCollection.
        if known_args.input_subscription:
            messages = p | beam.io.ReadFromPubSub(
                subscription=known_args.input_subscription)
        else:
            messages = p | beam.io.ReadFromPubSub(topic=known_args.input_topic)

        lines = messages | 'decode' >> beam.Map(lambda x: x.decode('utf-8'))

        # Count the occurrences of each word.
        def count_ones(word_ones):
            (word, ones) = word_ones
            return (word, sum(ones))

        counts = (
            lines
            | 'Split' >>
            (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
            | 'AddTimestampFn' >> beam.ParDo(AddTimestampFn())
            | 'After AddTimestampFn' >> ParDo(PrintFn('After AddTimestampFn'))
            | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
            | beam.WindowInto(window.FixedWindows(5, 0))
            | 'GroupByKey' >> beam.GroupByKey()
            | 'CountOnes' >> beam.Map(count_ones))

        # Format the counts into a PCollection of strings.
        def format_result(word_count):
            (word, count) = word_count
            return '%s: %d' % (word, count)

        output = (
            counts
            | 'format' >> beam.Map(format_result)
            | 'encode' >>
            beam.Map(lambda x: x.encode('utf-8')).with_output_types(bytes))

        # Write to PubSub.
        # pylint: disable=expression-not-assigned
        output | beam.io.WriteToPubSub(known_args.output_topic)

        def check_gbk_format():
            # A matcher that checks that the output of GBK is of the form word: count.
            def matcher(elements):
                # pylint: disable=unused-variable
                actual_elements_in_window, window = elements
                for elm in actual_elements_in_window:
                    assert re.match(r'\S+:\s+\d+',
                                    elm.decode('utf-8')) is not None

            return matcher

        # Check that the format of the output is correct.
        assert_that(output,
                    check_gbk_format(),
                    use_global_window=False,
                    label='Assert word:count format.')

        # Check also that elements are output in the right window.
        # This expects exactly 1 occurrence of any subset of the elements
        # 150, 151, 152, 153, 154 in the window [150, 155)
        # or exactly 1 occurrence of any subset of the elements
        # 210, 211, 212, 213, 214 in the window [210, 215).
        first_window_val = [
            '150: 1',
            '151: 1',
            '152: 1',
            '153: 1',
            '154: 1',
        ]
        second_window_val = [
            '210: 1',
            '211: 1',
            '212: 1',
            '213: 1',
            '214: 1',
        ]
        expected_window_to_elements = {
            window.IntervalWindow(150, 155):
            [x.encode('utf-8') for x in first_window_val],
            window.IntervalWindow(210, 215):
            [x.encode('utf-8') for x in second_window_val],
        }

        # To pass, publish numbers in [150-155) or [210-215) with no repeats.
        # To fail, publish a repeated number in the ranges above.
        # For example: '210 213 151 213'
        assert_that(output,
                    equal_to_per_window(expected_window_to_elements),
                    use_global_window=False,
                    label='Assert correct streaming windowing.')
def run(argv=None, save_main_session=True):
    """Build and run the pipeline."""
    parser = argparse.ArgumentParser()
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        '--input_topic',
        help=('Input PubSub topic of the form '
              '"projects/<PROJECT>/topics/<TOPIC>".'))
    group.add_argument(
        '--input_subscription',
        help=('Input PubSub subscription of the form '
              '"projects/<PROJECT>/subscriptions/<SUBSCRIPTION>."'))
    parser.add_argument('--output_bigquery', required=True,
                        help='Output BQ table to write results to '
                             '"PROJECT_ID:DATASET.TABLE"')
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
    pipeline_options.view_as(StandardOptions).streaming = True
    p = beam.Pipeline(options=pipeline_options)

    # Read from PubSub into a PCollection.
    if known_args.input_subscription:
        messages = (p
                    | beam.io.ReadFromPubSub(
                    subscription=known_args.input_subscription)
                    .with_output_types(bytes))
    else:
        messages = (p
                    | beam.io.ReadFromPubSub(topic=known_args.input_topic)
                    .with_output_types(bytes))

    decode_messages = messages | 'DecodePubSubMessages' >> beam.Map(lambda x: x.decode('utf-8'))

    # Get STT data from function for long audio file using asynchronous speech recognition
    stt_output = decode_messages | 'SpeechToTextOutput' >> beam.Map(stt_output_response)

    # Parse and enrich stt_output response
    parse_stt_output = stt_output | 'ParseSpeechToText' >> beam.Map(stt_parse_response)

    # Get NLP Sentiment and Entity response
    nlp_output = parse_stt_output | 'NaturalLanguageOutput' >> beam.Map(get_nlp_output)

    # Write to BigQuery
    bigquery_table_schema = {
        "fields": [
        {
            "mode": "NULLABLE", 
            "name": "fileid", 
            "type": "STRING"
        }, 
        {
            "mode": "NULLABLE", 
            "name": "filename", 
            "type": "STRING"
        }, 
        {
            "mode": "NULLABLE", 
            "name": "callid", 
            "type": "STRING"
        }, 
        {
            "mode": "NULLABLE", 
            "name": "date", 
            "type": "TIMESTAMP"
        }, 
        {
            "mode": "NULLABLE", 
            "name": "year", 
            "type": "INTEGER"
        }, 
        {
            "mode": "NULLABLE", 
            "name": "month", 
            "type": "INTEGER"
        }, 
        {
            "mode": "NULLABLE", 
            "name": "day", 
            "type": "INTEGER"
        }, 
        {
            "mode": "NULLABLE", 
            "name": "starttime", 
            "type": "STRING"
        }, 
        {
            "mode": "NULLABLE", 
            "name": "duration", 
            "type": "FLOAT"
        }, 
        {
            "mode": "NULLABLE", 
            "name": "silencesecs", 
            "type": "FLOAT"
        },
        {
            "mode": "NULLABLE", 
            "name": "sentimentscore", 
            "type": "FLOAT"
        },
        {
            "mode": "NULLABLE", 
            "name": "magnitude", 
            "type": "FLOAT"
        }, 
        {
            "mode": "NULLABLE", 
            "name": "silencepercentage", 
            "type": "INTEGER"
        }, 
        {
            "mode": "NULLABLE", 
            "name": "speakeronespeaking", 
            "type": "FLOAT"
        }, 
        {
            "mode": "NULLABLE", 
            "name": "speakertwospeaking", 
            "type": "FLOAT"
        }, 
        {
            "mode": "NULLABLE", 
            "name": "nlcategory", 
            "type": "STRING"
        }, 
        {
            "mode": "NULLABLE", 
            "name": "transcript", 
            "type": "STRING"
        }, 
        {
            "fields": [
            {
                "mode": "NULLABLE", 
                "name": "name", 
                "type": "STRING"
            }, 
            {
                "mode": "NULLABLE", 
                "name": "type", 
                "type": "STRING"
            }, 
            {
                "mode": "NULLABLE", 
                "name": "sentiment", 
                "type": "FLOAT"
            }
            ], 
            "mode": "REPEATED", 
            "name": "entities", 
            "type": "RECORD"
        }, 
        {
            "fields": [
            {
                "mode": "NULLABLE", 
                "name": "word", 
                "type": "STRING"
            }, 
            {
                "mode": "NULLABLE", 
                "name": "startSecs", 
                "type": "FLOAT"
            }, 
            {
                "mode": "NULLABLE", 
                "name": "endSecs", 
                "type": "FLOAT"
            }, 
            {
                "mode": "NULLABLE", 
                "name": "speakertag", 
                "type": "INTEGER"
            }, 
            {
                "mode": "NULLABLE", 
                "name": "confidence", 
                "type": "FLOAT"
            }
            ], 
            "mode": "REPEATED", 
            "name": "words", 
            "type": "RECORD"
        }, 
        {
            "fields": [
            {
                "mode": "NULLABLE", 
                "name": "sentence", 
                "type": "STRING"
            }, 
            {
                "mode": "NULLABLE", 
                "name": "sentiment", 
                "type": "FLOAT"
            }, 
            {
                "mode": "NULLABLE", 
                "name": "magnitude", 
                "type": "FLOAT"
            }
            ], 
            "mode": "REPEATED", 
            "name": "sentences", 
            "type": "RECORD"
            }
        ]
    }
    nlp_output | 'WriteToBigQuery' >> beam.io.WriteToBigQuery(
            known_args.output_bigquery,
            schema=bigquery_table_schema,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)

    p.run()
Example #13
def run(argv=None):
    """Main entry point; defines and runs the hourly_team_score pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--subdomain',
                        type=str,
                        default=config.SUBDOMAIN,
                        help='Sub-domain for Stat API.')
    parser.add_argument('--api_key',
                        type=str,
                        default=config.API_KEY,
                        help='API key for Stat API.')
    parser.add_argument(
        '--date',
        type=str,
        default=(dt.date.today() -
                 dt.timedelta(days=config.PRIOR_DAYS)).strftime("%Y-%m-%d"),
        help='Run date in YYYY-MM-DD format.')
    parser.add_argument(
        '--dataset',
        type=str,
        default=config.DATASET,
        help='BigQuery Dataset to write tables to. Must already exist.')
    parser.add_argument(
        '--table_name',
        type=str,
        default=config.TABLE_NAME,
        help='The BigQuery table name. Should not already exist.')
    parser.add_argument('--project',
                        type=str,
                        default=config.PROJECT,
                        help='Your GCP project.')
    parser.add_argument('--runner',
                        type=str,
                        default="DataflowRunner",
                        help='Type of Dataflow runner.')

    args, pipeline_args = parser.parse_known_args(argv)

    # Create and set your PipelineOptions.
    options = PipelineOptions(pipeline_args)

    # For Cloud execution, set the Cloud Platform project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = args.project
    google_cloud_options.job_name = ("{0}-{1}".format(
        args.project, str(dt.datetime.today().strftime("%m%dt%H%M"))))
    google_cloud_options.staging_location = "gs://{0}/binaries".format(
        args.project)
    google_cloud_options.temp_location = "gs://{0}/temp".format(args.project)
    options.view_as(StandardOptions).runner = args.runner

    ## Comment this out if running this file directly.
    options.view_as(SetupOptions).setup_file = "./setup.py"

    pipeline = beam.Pipeline(options=options)

    # Read projects from Stat API
    api = (pipeline
           | 'create' >> beam.Create(StatAPI(data=args).get_job())
           | 'IterProjects' >> beam.ParDo(IterProjects(data=args)))

    # Iterates over Sites in Projects
    keywords = (api
                | 'IterSites' >> beam.ParDo(IterSites())
                | 'IterKeywords' >> beam.ParDo(IterKeywords()))

    # Write to bigquery based on specified schema
    BQ = (keywords | "WriteToBigQuery" >> WriteToBigQuery(
        args.table_name, args.dataset, STAT_API_SCHEMA))

    pipeline.run()
Example #14
logger = logging.getLogger(__name__)

table_spec = bigquery.TableReference(projectId='query-11',
                                     datasetId='rpm',
                                     tableId='account_id_schema_new')

output_spec = bigquery.TableReference(projectId='query-11',
                                      datasetId='rpm',
                                      tableId='yesyes')

dataflow_options = [
    '--project=query-11', '--job_name=amaz',
    '--temp_location=gs://dataflow_s/tmp', '--region=us-central1'
]
dataflow_options.append('--staging_location=gs://dataflow_s/stage')
options = PipelineOptions(dataflow_options)
gcloud_options = options.view_as(GoogleCloudOptions)

options.view_as(StandardOptions).runner = "dataflow"

table_schema = {
    'fields': [
        {
            'name': 'ACNO',
            'type': 'STRING',
            'mode': 'NULLABLE'
        },
        {
            'name': 'FIELD_1',
            'type': 'FLOAT',
            'mode': 'NULLABLE'
Example #15
def run():
    t0 = now()

    # parse command line options
    known_args, beam_args = runtime_args()

    # BigQuery utility
    bq_utils = BigQueryUtils()

    options = PipelineOptions(beam_args)
    options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=options) as p:
        rows = (p
                | beam.io.ReadFromText(known_args.input, skip_header_lines=1)
                | beam.ParDo(BeamReadCSV(header_cols=FLIGHTS_CSV_COLUMNS))
                | beam.ParDo(BeamTransformRecords(),
                             date_fmt='%Y-%m-%d',
                             time_fmt='%H%M'))

        # load the routes table into a lookup dict
        sql = f"""select airline, src, dest from {known_args.routes_table}"""
        routes = bq_utils.execute_as_dict(sql,
                                          keycols=['airline', 'src', 'dest'])

        # lookup routes
        rows, routes_rejects, missing_routes = (
            rows
            | beam.ParDo(BeamLookupRoute(), routes=routes).with_outputs(
                'rejects', 'missing_routes', main='main'))

        # write parquet output files
        output = (rows
                  | beam.io.WriteToParquet(
                      os.path.join(known_args.output, 'flights'),
                      schema=datamodel_flights_parquet_schema(),
                      file_name_suffix='.parquet'))

        # write missing routes to another output as CSV
        output_routes = (
            missing_routes
            | "gbr" >> beam.GroupByKey()  # calculate distinct missing routes
            | "missing_routes_csv" >> beam.Map(
                lambda e: ','.join(list(e[0]))
            )  # csv output the key (e[0] of key value tuple) which is (airline,src,dest)
            | "missing_routes_out" >> beam.io.WriteToText(
                os.path.join(known_args.output, 'rejects/missing-routes'),
                file_name_suffix='.csv',
                header='airline,src,dest'))

        # alternative: write (simple) newline delimited json output files
        #              a very flexible output file format for bigquery and other big data tools
        #              much slower to write and larger in size than binary formats such as Parquet, ORC, or Avro
        #              but provides flexibility over schema for smaller data files
        #              larger file sizes should use Avro, Parquet, ORC. Avro provides fastest write speeds where
        #              parquet and orc provide faster read performance for analytical queries

        # output = (rows
        #           | beam.Map(lambda e: {k: v if k != 'flight_date' else v.strftime('%Y-%m-%d') for k, v in e.items()})  # convert flight_date back to string type for json conversion
        #           | beam.Map(lambda e: json.dumps(e))  # json dump row
        #           | beam.io.WriteToText(os.path.join(known_args.output, 'flights'),
        #                                 file_name_suffix='.json')
        #           )

    logger.info("beam pipiline completed.")

    # create bigquery external table and insert into bq flights table
    bq_utils.create_external_table(known_args.flights_ext_table,
                                   source_uris=os.path.join(
                                       known_args.output, "flights*.parquet"),
                                   source_format='PARQUET',
                                   delete_if_exists=True)

    # create and replace existing bigquery flights table
    bq_utils.create_table(known_args.flights_table,
                          schema=datamodel_flights_bigquery_schema(),
                          delete_if_exists=True)

    # insert into table as select (itas) statement
    sql = f"""
        INSERT INTO `{known_args.flights_table}`
        SELECT
          a.day_of_week,
          a.flight_date,
          a.airline,
          a.tailnumber,
          a.flight_number,
          a.src,
          a.src_city,
          a.src_state,
          a.dest,
          a.dest_city,
          a.dest_state,
          PARSE_TIME('%H:%M:%S', a.departure_time) as departure_time,
          PARSE_TIME('%H:%M:%S', a.actual_departure_time) as actual_departure_time,
          a.departure_delay,
          a.taxi_out,
          PARSE_TIME('%H:%M:%S', a.wheels_off) as wheels_off,
          PARSE_TIME('%H:%M:%S', a.wheels_on) as wheels_on,
          a.taxi_in,
          PARSE_TIME('%H:%M:%S', a.arrival_time) as arrival_time,
          PARSE_TIME('%H:%M:%S', a.actual_arrival_time) as actual_arrival_time,
          a.arrival_delay,
          a.cancelled,
          a.cancellation_code,
          a.flight_time,
          a.actual_flight_time,
          a.air_time,
          a.flights,
          a.distance,
          a.airline_delay,
          a.weather_delay,
          a.nas_delay,
          a.security_delay,
          a.late_aircraft_delay
          -- , CONCAT(a.flight_date, '_', a.airline, '_', a.flight_number) AS flightDate_airline_flightNumber
        FROM
          `{known_args.flights_ext_table}`  a
        """
    # insert records from the parquet external table into the final bq managed flights table
    r = bq_utils.execute(sql)

    logger.info(f"total time: {(now() - t0):,.6f} secs")
Example #16
def pipeline_monitoring(renames):
  """Using monitoring interface snippets."""

  import re
  import apache_beam as beam
  from apache_beam.options.pipeline_options import PipelineOptions

  class WordCountOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
      parser.add_argument(
          '--input',
          help='Input for the pipeline',
          default='gs://my-bucket/input')
      parser.add_argument(
          '--output',
          help='output for the pipeline',
          default='gs://my-bucket/output')

  class ExtractWordsFn(beam.DoFn):
    def process(self, element):
      words = re.findall(r'[A-Za-z\']+', element)
      for word in words:
        yield word

  class FormatCountsFn(beam.DoFn):
    def process(self, element):
      word, count = element
      yield '%s: %s' % (word, count)

  # [START pipeline_monitoring_composite]
  # The CountWords Composite Transform inside the WordCount pipeline.
  class CountWords(beam.PTransform):
    def expand(self, pcoll):
      return (
          pcoll
          # Convert lines of text into individual words.
          | 'ExtractWords' >> beam.ParDo(ExtractWordsFn())
          # Count the number of times each word occurs.
          | beam.combiners.Count.PerElement()
          # Format each word and count into a printable string.
          | 'FormatCounts' >> beam.ParDo(FormatCountsFn()))

  # [END pipeline_monitoring_composite]

  pipeline_options = PipelineOptions()
  options = pipeline_options.view_as(WordCountOptions)
  with TestPipeline() as p:  # Use TestPipeline for testing.

    # [START pipeline_monitoring_execution]
    (
        p
        # Read the lines of the input text.
        | 'ReadLines' >> beam.io.ReadFromText(options.input)
        # Count the words.
        | CountWords()
        # Write the formatted word counts to output.
        | 'WriteCounts' >> beam.io.WriteToText(options.output))
    # [END pipeline_monitoring_execution]

    p.visit(SnippetUtils.RenameFiles(renames))
Example #17
def run(argv=None):
    """Main entry point; defines and runs the wordcount pipeline."""

    pubsubTopicName = "projects/data-qe-da7e1252/topics/cfw-data-topic"

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument(
        '--output',
        dest='output',
        # CHANGE 1/5: The Google Cloud Storage path is required
        # for outputting the results.
        #default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX',
        #default="/Users/skanabargi/python/stream/output",
        default='gs://data-qe-da7e1252/tmp/sk_out',
        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_args.extend([
        # CHANGE 2/5: (OPTIONAL) Change this to DataflowRunner to
        # run your pipeline on the Google Cloud Dataflow Service.
        '--runner=DataflowRunner',
        # CHANGE 3/5: Your project ID is required in order to run your pipeline on
        # the Google Cloud Dataflow Service.
        '--project=data-qe-da7e1252',
        # CHANGE 4/5: Your Google Cloud Storage path is required for staging local
        # files.
        #'--staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY',
        '--staging_location=gs://data-qe-da7e1252/tmp/stage/',
        # CHANGE 5/5: Your Google Cloud Storage path is required for temporary
        # files.
        #'--temp_location=gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY',
        '--temp_location=gs://data-qe-da7e1252/tmp/local',
        '--experiments=allow_non_updatable_job',
        '--job_name=sk-wordcount-job',
    ])

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    pipeline_options.view_as(StandardOptions).streaming = True
    with beam.Pipeline(options=pipeline_options) as p:

        # Read the text file[pattern] into a PCollection.
        #lines = p | ReadFromText(known_args.input)
        lines = p | beam.io.ReadFromPubSub(topic=pubsubTopicName)

        print "SK_logs : " + str(lines)

        # Count the occurrences of each word.
        counts = (lines
                  | 'Split' >> (beam.FlatMap(lambda x: re.findall(
                      r'[A-Za-z\']+', x)).with_output_types(unicode))
                  | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
                  | 'GroupAndSum' >> beam.CombinePerKey(sum))

        # Format the counts into a PCollection of strings.
        def format_result(word_count):
            (word, count) = word_count
            return '%s: %s' % (word, count)

        output = counts | 'Format' >> beam.Map(format_result)

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        #output | WriteToText(known_args.output)
        print "SK_logs : output" + str(output)
Example #18
def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the wordcount pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session
    p = beam.Pipeline(options=pipeline_options)

    # Read the text file[pattern] into a PCollection.
    lines = p | 'read' >> ReadFromText(known_args.input)

    # Count the occurrences of each word.
    def count_ones(word_ones):
        (word, ones) = word_ones
        return (word, sum(ones))

    counts = (lines
              | 'split' >>
              (beam.ParDo(WordExtractingDoFn()).with_output_types(unicode))
              | 'lowercase' >> beam.Map(unicode.lower)
              | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
              | 'group' >> beam.GroupByKey()
              | 'count' >> beam.Map(count_ones))

    # Format the counts into a PCollection of strings.
    def format_result(word_count):
        (word, count) = word_count
        return '%s: %d' % (word, count)

    output = counts | 'format' >> beam.Map(format_result)

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | 'write' >> WriteToText(known_args.output)

    result = p.run()
    result.wait_until_finish()

    # Do not query metrics when creating a template which doesn't run
    if (not hasattr(result, 'has_job')  # direct runner
            or result.has_job):  # not just a template creation
        empty_lines_filter = MetricsFilter().with_name('empty_lines')
        query_result = result.metrics().query(empty_lines_filter)
        if query_result['counters']:
            empty_lines_counter = query_result['counters'][0]
            logging.info('number of empty lines: %d',
                         empty_lines_counter.result)

        word_lengths_filter = MetricsFilter().with_name('word_len_dist')
        query_result = result.metrics().query(word_lengths_filter)
        if query_result['distributions']:
            word_lengths_dist = query_result['distributions'][0]
            logging.info('average word length: %d',
                         word_lengths_dist.result.mean)
Example #19
 def test_create_application_client(self):
   pipeline_options = PipelineOptions()
   apiclient.DataflowApplicationClient(
       pipeline_options,
       DataflowRunner.BATCH_ENVIRONMENT_MAJOR_VERSION)
Example #20
def run(flags, pipeline_args):
    """Run Apache Beam pipeline to generate TFRecords for Survival Analysis"""
    options = PipelineOptions(flags=[], **pipeline_args)
    options.view_as(WorkerOptions).machine_type = flags.machine_type
    temp_dir = os.path.join(flags.output_dir, 'tmp')
    runner = 'DataflowRunner' if flags.cloud else 'DirectRunner'

    files = tf.gfile.Glob(flags.input_dir + "*")
    if not flags.cloud:
        # if running locally for testing, process fewer files
        files = files[:20]

    logging.warning("Number of files: " + str(len(files)))
    labels = get_labels_array(
        "gs://columbia-dl-storage-bucket/ADNI_t1_list_with_fsstatus_20190111.csv"
    )

    with beam.Pipeline(runner, options=options) as p:
        with tft_beam.Context(temp_dir=temp_dir):

            input_metadata = dataset_metadata.DatasetMetadata(
                dataset_schema.from_feature_spec(features.RAW_FEATURE_SPEC))

            filenames = (p | 'Create filenames' >> beam.Create(files))
            nii = (filenames | 'Read NII' >> beam.Map(read_nii))
            nii_with_labels = (
                nii
                | 'Get Label' >> beam.FlatMap(lambda x: read_label(x, labels)))

            raw_train, raw_eval, raw_test = (
                nii_with_labels | 'RandomlySplitData' >> randomly_split(
                    train_size=.7, validation_size=.15, test_size=.15))

            raw_train = raw_train | 'FlattenTrain' >> beam.FlatMap(
                lambda x: x[1])
            raw_eval = (raw_eval
                        | 'FlattenEval' >> beam.FlatMap(lambda x: x[1]))
            raw_test = (raw_test
                        | 'FlattenTest' >> beam.FlatMap(lambda x: x[1]))

            raw_train | 'CountLabelFreq' >> extractAndCount(flags.output_dir)

            dataset_and_metadata, transform_fn = (
                (raw_train, input_metadata)
                | 'TransformData' >> tft_beam.AnalyzeAndTransformDataset(
                    features.preprocess))
            transform_fn = (
                (raw_train, input_metadata)
                |
                'AnalyzeTrain' >> tft_beam.AnalyzeDataset(features.preprocess))
            _ = (transform_fn
                 | 'WriteTransformFn' >> tft_beam_io.WriteTransformFn(
                     flags.output_dir))
            for dataset_type, dataset in [('Train', raw_train),
                                          ('Eval', raw_eval),
                                          ('Predict', raw_test)]:

                transform_label = 'Transform{}'.format(dataset_type)
                t, metadata = (((dataset, input_metadata), transform_fn)
                               |
                               transform_label >> tft_beam.TransformDataset())
                if dataset_type == 'Train':
                    _ = (metadata
                         | 'WriteMetadata' >>
                         tft_beam_io.WriteMetadata(os.path.join(
                             flags.output_dir, 'transformed_metadata'),
                                                   pipeline=p))
                write_label = 'Write{}TFRecord'.format(dataset_type)
                _ = t | write_label >> WriteTFRecord(
                    dataset_type, flags.output_dir, metadata)
Example #21

parser = argparse.ArgumentParser()
parser.add_argument("--input", dest="input", required=True)
parser.add_argument(
    "--output",
    required=True,
    help=("Output BigQuery table for results specified as: "
          "PROJECT:DATASET.TABLE or DATASET.TABLE."))
app_args, pipeline_args = parser.parse_known_args()

input_files = app_args.input
output_filename = 'output.txt'

options = PipelineOptions()
gcloud_options = options.view_as(GoogleCloudOptions)
# gcloud_options.project = project_id
gcloud_options.job_name = 'import-citybikes'

# Dataflow runner
runner = os.environ['DATAFLOW_RUNNER']
options.view_as(StandardOptions).runner = runner

with apache_beam.Pipeline(options=options) as p:

    inputs = []
    for match in FileSystems.match([input_files]):
        for file in match.metadata_list:
            inputs.append(file.path)
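
    # Assumed continuation (the original snippet is truncated here): read every
    # matched file and write the raw lines to the output_filename defined
    # above; the real sink (the BigQuery table from --output) is not shown.
    (p
     | 'CreateFileList' >> apache_beam.Create(inputs)
     | 'ReadAllFiles' >> apache_beam.io.ReadAllFromText()
     | 'WriteLines' >> apache_beam.io.WriteToText(output_filename))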
Example #22
 def test_display_data(self):
     for case in PipelineOptionsTest.TEST_CASES:
         options = PipelineOptions(flags=case['flags'])
         dd = DisplayData.create_from(options)
         hc.assert_that(dd.items,
                        hc.contains_inanyorder(*case['display_data']))
def run(argv=None):
  """Test Avro IO (backed by fastavro or Apache Avro) on a simple pipeline
  that transforms bitcoin transactions"""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://beam-avro-test/bitcoin/txns/*',
      help='Input file(s) to process.')
  parser.add_argument(
      '--output',
      dest='output',
      required=True,
      help='Output file to write results to.')
  parser.add_argument(
      '--compress',
      dest='compress',
      required=False,
      action='store_true',
      help='When set, compress the output data')
  parser.add_argument(
      '--fastavro',
      dest='use_fastavro',
      required=False,
      action='store_true',
      help='When set, use fastavro for Avro I/O')

  opts, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  # Read the avro file[pattern] into a PCollection.
  records = \
      p | 'read' >> ReadFromAvro(opts.input, use_fastavro=opts.use_fastavro)

  measured = records | 'scan' >> beam.ParDo(BitcoinTxnCountDoFn())

  # pylint: disable=expression-not-assigned
  measured | 'write' >> \
      WriteToAvro(
          opts.output,
          schema=SCHEMA,
          codec=('deflate' if opts.compress else 'null'),
          use_fastavro=opts.use_fastavro
      )

  result = p.run()
  result.wait_until_finish()

  # Do not query metrics when creating a template which doesn't run
  if (not hasattr(result, 'has_job')  # direct runner
      or result.has_job):  # not just a template creation
    metrics = result.metrics().query()

    for counter in metrics['counters']:
      logging.info("Counter: %s", counter)

    for dist in metrics['distributions']:
      logging.info("Distribution: %s", dist)
Example #24
    def test_dataflow_job_file(self):
        options = PipelineOptions(['--dataflow_job_file', 'abc'])
        self.assertEqual(options.get_all_options()['dataflow_job_file'], 'abc')

        options = PipelineOptions(flags=[''])
        self.assertEqual(options.get_all_options()['dataflow_job_file'], None)
Example #25
 def test_default_ip_configuration(self):
     pipeline_options = PipelineOptions(
         ['--temp_location', 'gs://any-location/temp'])
     env = apiclient.Environment([], pipeline_options, '2.0.0',
                                 FAKE_PIPELINE_URL)
     self.assertEqual(env.proto.workerPools[0].ipConfiguration, None)
Example #26
    def test_template_location(self):
        options = PipelineOptions(['--template_location', 'abc'])
        self.assertEqual(options.get_all_options()['template_location'], 'abc')

        options = PipelineOptions(flags=[''])
        self.assertEqual(options.get_all_options()['template_location'], None)
Example #27
 def test_interpreter_version_check_fails_py27(self):
     pipeline_options = PipelineOptions([])
     self.assertRaises(Exception,
                       apiclient._verify_interpreter_version_is_supported,
                       pipeline_options)
Example #28
def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the wordcount pipeline."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='/tmp/logs/2020-10-02-11-34-19-EA6C5E314B70B157',
        #default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument(
        '--output',
        dest='output',
        # CHANGE 1/6: The Google Cloud Storage path is required
        # for outputting the results.
        default='/tmp/logs/output.txt',
        #default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX',
        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        # CHANGE 2/6: (OPTIONAL) Change this to DataflowRunner to
        # run your pipeline on the Google Cloud Dataflow Service.
        '--runner=DirectRunner',
        # CHANGE 3/6: (OPTIONAL) Your project ID is required in order to
        # run your pipeline on the Google Cloud Dataflow Service.
        '--project=SET_YOUR_PROJECT_ID_HERE',
        # CHANGE 4/6: (OPTIONAL) The Google Cloud region (e.g. us-central1)
        # is required in order to run your pipeline on the Google Cloud
        # Dataflow Service.
        '--region=SET_REGION_HERE',
        # CHANGE 5/6: Your Google Cloud Storage path is required for staging local
        # files.
        '--staging_location=/tmp/logs/2020-10-02-11-34-19-EA6C5E314B70B157',  #gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY',
        # CHANGE 6/6: Your Google Cloud Storage path is required for temporary
        # files.
        '--temp_location=/tmp',  #gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY',
        '--job_name=your-wordcount-job',
    ])

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session
    with beam.Pipeline(options=pipeline_options) as p:

        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromText(known_args.input)

        # Count the occurrences of each word.

        def split_me(x):
            print("--->", x)
            x = x.replace('"', '')
            data = x.split(' ')
            print("--->", data)
            print("***", data[2])
            date = data[2].split('[')[-1]
            offset = data[3].split(']')[0]
            valid_data = [
                data[1],  # bucket_name
                f"{date} {offset}",
                data[7],  # operation
                data[8],  # Key
                data[9],  # request_uri
                data[10],  # http status
                data[11],  # error_code
                data[12],  # bytes_sent
                data[13],  # object_size
                data[14],  # total_time
                data[15],  # turn_aroundtime
                data[16],  # referrer
                data[17],  # user_agent
                data[26],  # request_header
            ]
            return valid_data  #x.split(' ')

        splits = (lines | 'Split' >> beam.Map(split_me))

        # Format the counts into a PCollection of strings.
        def format_result(word_count):
            (word, count) = word_count
            return '%s: %s' % (word, count)

        #output = counts | 'Format' >> beam.Map(format_result)

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        splits | WriteToText(known_args.output)
Example #29
 def test_interpreter_version_check_passes_py38(self):
     pipeline_options = PipelineOptions([])
     apiclient._verify_interpreter_version_is_supported(pipeline_options)
def run(argv=None, save_main_session=True):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()
  
  #1 Replace your hackathon-edem with your project id 
  parser.add_argument('--input_topic',
                      dest='input_topic',
                      #1 Add your project Id and topic name you created
                      # Example projects/versatile-gist-251107/topics/iexCloud',
                      default='projects/hackathon2-luis1201/topics/valenbisi',
                      help='Input file to process.')
  #2 Replace your hackathon-edem with your project id 
  parser.add_argument('--input_subscription',
                      dest='input_subscription',
                      #3 Add your project Id and the Subscription you created
                      # Example projects/versatile-gist-251107/subscriptions/quotesConsumer',
                      default='projects/hackathon2-luis1201/subscriptions/streaming',
                      help='Input Subscription')

  known_args, pipeline_args = parser.parse_known_args(argv)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
   
  google_cloud_options = pipeline_options.view_as(GoogleCloudOptions)
  #3 Replace your hackathon-edem with your project id 
  google_cloud_options.project = 'hackathon2-luis1201'
  google_cloud_options.job_name = 'myjob'
 
  # Uncomment below and add your bucket if you want to execute on Dataflow
  #google_cloud_options.staging_location = 'gs://edem-bucket-roberto/binaries'
  #google_cloud_options.temp_location = 'gs://edem-bucket-roberto/temp'

  pipeline_options.view_as(StandardOptions).runner = 'DirectRunner'
  #pipeline_options.view_as(StandardOptions).runner = 'DataflowRunner'
  pipeline_options.view_as(StandardOptions).streaming = True

  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session

  p = beam.Pipeline(options=pipeline_options)

  # Read the pubsub messages into a PCollection.
  biciStations = p | beam.io.ReadFromPubSub(subscription=known_args.input_subscription)

  biciStations = (biciStations | beam.ParDo(LocationConcat()))

  # Print messages received
  biciStations | 'Print Quote' >> beam.Map(print)

  # Store messages on elastic
  biciStations | 'Bici Stations Stored' >> beam.ParDo(IndexDocument())

  result = p.run()
  result.wait_until_finish()