    # [START combine_custom_average]
    class AverageFn(beam.CombineFn):
      def create_accumulator(self):
        return (0.0, 0)

      def add_input(self, sum_count, input):
        (sum, count) = sum_count
        return sum + input, count + 1

      def merge_accumulators(self, accumulators):
        sums, counts = zip(*accumulators)
        return sum(sums), sum(counts)

      def extract_output(self, sum_count):
        (sum, count) = sum_count
        return sum / count if count else float('NaN')

    average = pc | beam.CombineGlobally(AverageFn())
    # [END combine_custom_average]
    self.assertEqual([4.25], average)

  def test_keys(self):
    occurrences = [('cat', 1), ('cat', 5), ('dog', 5), ('cat', 9), ('dog', 2)]
    unique_keys = occurrences | snippets.Keys()
    self.assertEqual({'cat', 'dog'}, set(unique_keys))

  def test_count(self):
    occurrences = ['cat', 'dog', 'cat', 'cat', 'dog']
    perkey_counts = occurrences | snippets.Count()
    self.assertEqual({('cat', 3), ('dog', 2)}, set(perkey_counts))

  def test_setting_fixed_windows(self):
    p = TestPipeline()
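# --- Illustrative usage (not part of the test above) ---
# A minimal, self-contained sketch showing how a custom CombineFn such as
# AverageFn is driven by CombineGlobally. The input values are assumptions
# chosen so the global average is 4.25, matching the assertion in the test;
# AverageFn is repeated here only so the sketch runs on its own.
import apache_beam as beam
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to


class AverageFn(beam.CombineFn):
  def create_accumulator(self):
    return (0.0, 0)

  def add_input(self, sum_count, input):
    (sum, count) = sum_count
    return sum + input, count + 1

  def merge_accumulators(self, accumulators):
    sums, counts = zip(*accumulators)
    return sum(sums), sum(counts)

  def extract_output(self, sum_count):
    (sum, count) = sum_count
    return sum / count if count else float('NaN')


with TestPipeline() as p:
  numbers = p | beam.Create([1, 2, 3, 11])

  # Global combine: (1 + 2 + 3 + 11) / 4 == 4.25.
  global_avg = numbers | beam.CombineGlobally(AverageFn())
  assert_that(global_avg, equal_to([4.25]), label='CheckGlobalAverage')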
"""平均以上の文字数を持つ文字列をフィルタリングする.""" def __init__(self): super(FilterAboveMeanLengthFn, self).__init__() def process(self, element, mean_word_length): if element >= mean_word_length: yield element if __name__ == '__main__': p = beam.Pipeline(options=PipelineOptions()) inputs = ["good morning.", "good afternoon.", "good evening."] # 主入力 word_lengths = ( p | 'create inputs' >> beam.Create(inputs) | 'compute word length' >> beam.Map(lambda element: len(element))) # 副入力 mean_word_length = word_lengths | 'compute mean word length' >> beam.CombineGlobally( beam.combiners.MeanCombineFn()) (word_lengths | 'filter above mean length' >> beam.ParDo( FilterAboveMeanLengthFn(), pvalue.AsSingleton(mean_word_length)) | 'write to text' >> beam.io.WriteToText("./output.txt")) p.run().wait_until_finish()
def process(self, lines):
    with self.gcsio().open(f'{self.gcs_path}/index.csv', 'w',
                           mime_type='text/csv') as fp:
        fp.write(lines.encode())


# %M (minutes), not %I (12-hour clock), so job names stay unique per minute.
job_name = f"reviewr-automl--{datetime.utcnow().strftime('%Y%m%d-%H%M%S')}"
gcs_path = f'{GCS_DESTINATION}/{job_name}'

pipeline_options = PipelineOptions(project=PROJECT_ID,
                                   region=DATAFLOW_REGION,
                                   job_name=job_name,
                                   temp_location=f'{gcs_path}/temp')
p = beam.Pipeline(runner=RUNNER, options=pipeline_options)

bq_row = p | 'ReadFromBigQuery' >> ReadFromBigQuery(
    query=f"SELECT * FROM `{BQ_SOURCE}`"
          f"{' LIMIT 10' if RUNNER == 'DirectRunner' else ''}",
    project=PROJECT_ID,
    use_standard_sql=True,
    gcs_location=f'{gcs_path}/temp')

bq_row | 'WriteExampleFile' >> beam.ParDo(WriteExampleFile(gcs_path))

(bq_row
 | 'CreateLine' >> beam.ParDo(CreateLine(gcs_path))
 | 'CombineLines' >> beam.CombineGlobally(lambda lines: '\n'.join(lines))
 | 'WriteIndexFile' >> beam.ParDo(WriteIndexFile(gcs_path)))

p.run()
def _WriteMetricsPlotsAndValidations(  # pylint: disable=invalid-name
    evaluation: evaluator.Evaluation,
    output_paths: Dict[str, str],
    eval_config: config_pb2.EvalConfig,
    add_metrics_callbacks: List[types.AddMetricsCallbackType],
    metrics_key: str,
    plots_key: str,
    attributions_key: str,
    validations_key: str,
    output_file_format: str,
    rubber_stamp: bool = False) -> beam.pvalue.PDone:
  """PTransform to write metrics and plots."""
  if output_file_format not in _SUPPORTED_FORMATS:
    raise ValueError('only "{}" formats are currently supported but got '
                     'output_file_format={}'.format(_SUPPORTED_FORMATS,
                                                    output_file_format))

  def convert_slice_key_to_parquet_dict(
      slice_key: metrics_for_slice_pb2.SliceKey) -> _SliceKeyDictPythonType:
    single_slice_key_dicts = []
    for single_slice_key in slice_key.single_slice_keys:
      kind = single_slice_key.WhichOneof('kind')
      if not kind:
        continue
      single_slice_key_dicts.append({kind: getattr(single_slice_key, kind)})
    return {_SINGLE_SLICE_KEYS_PARQUET_FIELD_NAME: single_slice_key_dicts}

  def convert_to_parquet_columns(
      value: Union[metrics_for_slice_pb2.MetricsForSlice,
                   metrics_for_slice_pb2.PlotsForSlice,
                   metrics_for_slice_pb2.AttributionsForSlice]
  ) -> Dict[str, Union[_SliceKeyDictPythonType, bytes]]:
    return {
        _SLICE_KEY_PARQUET_COLUMN_NAME:
            convert_slice_key_to_parquet_dict(value.slice_key),
        _SERIALIZED_VALUE_PARQUET_COLUMN_NAME:
            value.SerializeToString()
    }

  if metrics_key in evaluation and constants.METRICS_KEY in output_paths:
    metrics = (
        evaluation[metrics_key]
        | 'ConvertSliceMetricsToProto' >> beam.Map(
            convert_slice_metrics_to_proto,
            add_metrics_callbacks=add_metrics_callbacks))
    file_path_prefix = output_paths[constants.METRICS_KEY]
    if output_file_format == _PARQUET_FORMAT:
      _ = (
          metrics
          | 'ConvertToParquetColumns' >> beam.Map(convert_to_parquet_columns)
          | 'WriteMetricsToParquet' >> beam.io.WriteToParquet(
              file_path_prefix=file_path_prefix,
              schema=_SLICED_PARQUET_SCHEMA,
              file_name_suffix='.' + output_file_format))
    elif output_file_format == _TFRECORD_FORMAT:
      _ = metrics | 'WriteMetrics' >> beam.io.WriteToTFRecord(
          file_path_prefix=file_path_prefix,
          shard_name_template=None if output_file_format else '',
          file_name_suffix=('.' + output_file_format
                            if output_file_format else ''),
          coder=beam.coders.ProtoCoder(metrics_for_slice_pb2.MetricsForSlice))
    else:
      raise ValueError(f'Unsupported output file format: {output_file_format}.')

  if plots_key in evaluation and constants.PLOTS_KEY in output_paths:
    plots = (
        evaluation[plots_key]
        | 'ConvertSlicePlotsToProto' >> beam.Map(
            convert_slice_plots_to_proto,
            add_metrics_callbacks=add_metrics_callbacks))
    file_path_prefix = output_paths[constants.PLOTS_KEY]
    if output_file_format == _PARQUET_FORMAT:
      _ = (
          plots
          | 'ConvertPlotsToParquetColumns' >> beam.Map(
              convert_to_parquet_columns)
          | 'WritePlotsToParquet' >> beam.io.WriteToParquet(
              file_path_prefix=file_path_prefix,
              schema=_SLICED_PARQUET_SCHEMA,
              file_name_suffix='.' + output_file_format))
    elif output_file_format == _TFRECORD_FORMAT:
      _ = plots | 'WritePlotsToTFRecord' >> beam.io.WriteToTFRecord(
          file_path_prefix=file_path_prefix,
          shard_name_template=None if output_file_format else '',
          file_name_suffix=('.' + output_file_format
                            if output_file_format else ''),
          coder=beam.coders.ProtoCoder(metrics_for_slice_pb2.PlotsForSlice))
    else:
      raise ValueError(f'Unsupported output file format: {output_file_format}.')

  if (attributions_key in evaluation and
      constants.ATTRIBUTIONS_KEY in output_paths):
    attributions = (
        evaluation[attributions_key]
        | 'ConvertSliceAttributionsToProto' >> beam.Map(
            convert_slice_attributions_to_proto))
    file_path_prefix = output_paths[constants.ATTRIBUTIONS_KEY]
    if output_file_format == _PARQUET_FORMAT:
      _ = (
          attributions
          | 'ConvertAttributionsToParquetColumns' >> beam.Map(
              convert_to_parquet_columns)
          | 'WriteAttributionsToParquet' >> beam.io.WriteToParquet(
              file_path_prefix=file_path_prefix,
              schema=_SLICED_PARQUET_SCHEMA,
              file_name_suffix='.' + output_file_format))
    elif output_file_format == _TFRECORD_FORMAT:
      _ = attributions | 'WriteAttributionsToTFRecord' >> beam.io.WriteToTFRecord(
          file_path_prefix=file_path_prefix,
          shard_name_template=None if output_file_format else '',
          file_name_suffix=('.' + output_file_format
                            if output_file_format else ''),
          coder=beam.coders.ProtoCoder(
              metrics_for_slice_pb2.AttributionsForSlice))
    else:
      raise ValueError(f'Unsupported output file format: {output_file_format}.')

  if (validations_key in evaluation and
      constants.VALIDATIONS_KEY in output_paths):
    validations = (
        evaluation[validations_key]
        | 'MergeValidationResults' >> beam.CombineGlobally(
            CombineValidations(eval_config, rubber_stamp=rubber_stamp)))
    file_path_prefix = output_paths[constants.VALIDATIONS_KEY]
    # We only use a single shard here because validations are usually single
    # values. Setting the shard_name_template to the empty string forces this.
    shard_name_template = ''
    if output_file_format == _PARQUET_FORMAT:
      _ = (
          validations
          | 'ConvertValidationsToParquetColumns' >> beam.Map(
              lambda v:  # pylint: disable=g-long-lambda
              {_SERIALIZED_VALUE_PARQUET_COLUMN_NAME: v.SerializeToString()})
          | 'WriteValidationsToParquet' >> beam.io.WriteToParquet(
              file_path_prefix=file_path_prefix,
              shard_name_template=shard_name_template,
              schema=_UNSLICED_PARQUET_SCHEMA,
              file_name_suffix='.' + output_file_format))
    elif output_file_format == _TFRECORD_FORMAT:
      _ = (
          validations
          | 'WriteValidationsToTFRecord' >> beam.io.WriteToTFRecord(
              file_path_prefix=file_path_prefix,
              shard_name_template=shard_name_template,
              file_name_suffix=('.' + output_file_format
                                if output_file_format else ''),
              coder=beam.coders.ProtoCoder(
                  validation_result_pb2.ValidationResult)))
    else:
      raise ValueError(f'Unsupported output file format: {output_file_format}.')

  return beam.pvalue.PDone(list(evaluation.values())[0].pipeline)
# # This can be used directly if print to screen not needed as
# # | >> beam.CombinePerKey(sum)
# | 'Get the total for each item' >> beam.CombinePerKey(sum)
# # To print to screen
# | 'Prints data to screen' >> beam.ParDo(Printer())
# )

# Option 3
number_of_transactions_per_item = (
    data_from_source
    | 'Clean the item for items count' >> beam.ParDo(Transaction())
    | 'Map record item to 1 for items count' >> beam.Map(
        lambda record: (record['item'], 1))
    | 'Get the total for each item' >> beam.CombinePerKey(sum)
    | 'Convert data into List' >> (
        beam.CombineGlobally(
            beam.combiners.ToListCombineFn())  # ToDictCombineFn())
    ))  # Uses the Dictionary

# # Maximum
# most_popular_item = (
#     number_of_transactions_per_item
#     | 'Get the item with maximum count' >> beam.Map(lambda item: max(item, key=item.get))
#     # To print to screen
#     | 'Prints max. data to screen' >> beam.ParDo(Printer())
# )

# # Minimum
# less_popular_item = (
#     number_of_transactions_per_item
#     | 'Get the item with minimum count' >> beam.Map(lambda item: min(item, key=item.get))
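# --- Illustrative alternative (not from the original script) ---
# A small sketch for the commented-out max/min logic above: beam.combiners.Top
# can pick the most and least frequent items directly from the per-key counts,
# without first collapsing everything into one list or dict. The item names
# and counts below are made up for the example.
import apache_beam as beam

with beam.Pipeline() as p:
  counts = p | beam.Create([('apple', 3), ('banana', 7), ('carrot', 1)])

  # Most popular item: emits a one-element list, e.g. [('banana', 7)].
  most_popular_item = (
      counts
      | 'Top by count' >> beam.combiners.Top.Of(1, key=lambda kv: kv[1])
      | 'Print max' >> beam.Map(print))

  # Least popular item: same combiner with reversed ordering, e.g. [('carrot', 1)].
  least_popular_item = (
      counts
      | 'Bottom by count' >> beam.combiners.Top.Of(
          1, key=lambda kv: kv[1], reverse=True)
      | 'Print min' >> beam.Map(print))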
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import apache_beam as beam

from log_elements import LogElements

p = beam.Pipeline()

(p | beam.Create(range(1, 11))
   | beam.CombineGlobally(sum)
   | LogElements())

p.run()
def expand(self, inputs):
  pcoll, = inputs
  if self._top_k is not None and self._top_k < 0:
    raise ValueError(
        'top_k for VocabularyImpl should be >= 0 or None, got '
        '{}.'.format(self._top_k))
  if self._frequency_threshold is not None and self._frequency_threshold < 0:
    raise ValueError(
        'frequency_threshold for VocabularyImpl should be >= 0 or None, '
        'got {}.'.format(self._frequency_threshold))

  # Create a PCollection of (count, element) pairs, then iterates over
  # this to create a single element PCollection containing this list of
  # pairs in sorted order by decreasing counts (and by values for equal
  # counts).
  def is_problematic_string(kv):
    string, _ = kv  # Ignore counts.
    return string and b'\n' not in string and b'\r' not in string

  if (self._vocab_ordering_type ==
      tf_utils.VocabOrderingType.WEIGHTED_MUTUAL_INFORMATION):
    flatten_map_fn = (
        _flatten_positive_label_weights_total_weights_and_counts)

    # count_and_means is a pcollection that contains a
    # _CountAndWeightsMeansAccumulator where:
    #   `weighted_mean` is the weighted mean of positive labels
    #       for all features.
    #   `count` is the count for all features.
    #   `weights_mean` is the mean of the weights for all features.
    count_and_means = (
        pcoll
        | 'SumBatchCountAndWeightsMeans' >> beam.Map(_count_and_means)
        | 'ComputeCountAndWeightsMeansGlobally' >> beam.CombineGlobally(
            CountAndWeightsMeansCombineFn()))

    # CountAndWeightsMeansCombineFn returns a tuple of the form:
    # (feature, _CountAndWeightsMeansAccumulator) where:
    #   `feature` is a single string, which is the word in the vocabulary
    #       whose mutual information with the label is being computed.
    #   `weighted_mean` is the weighted mean of y positive given x.
    #   `count` is the count of weights for a feature.
    #   `weights_mean` is the mean of the weights for a feature.
    combine_transform = (
        'ComputeCountAndWeightsMeansPerUniqueWord' >>
        beam.CombinePerKey(CountAndWeightsMeansCombineFn())
        | 'CalculateMutualInformationPerUniqueWord' >> beam.Map(
            _calculate_mutual_information,
            global_accumulator=beam.pvalue.AsSingleton(count_and_means),
            use_adjusted_mutual_info=self._use_adjusted_mutual_info))
  elif (self._vocab_ordering_type ==
        tf_utils.VocabOrderingType.WEIGHTED_FREQUENCY):
    flatten_map_fn = _flatten_value_and_weights_to_list_of_tuples
    combine_transform = beam.CombinePerKey(sum)
  else:
    flatten_map_fn = _flatten_value_to_list
    combine_transform = beam.combiners.Count.PerElement()

  raw_counts = (
      pcoll
      | 'FlattenStringsAndMaybeWeightsLabels' >> beam.FlatMap(flatten_map_fn)
      | 'CountPerString' >> combine_transform
      | 'FilterProblematicStrings' >> beam.Filter(is_problematic_string)
      | 'SwapStringsAndCounts' >> beam.KvSwap())

  counts = (
      raw_counts | 'ApplyFrequencyThresholdAndTopK' >> (
          _ApplyFrequencyThresholdAndTopK(  # pylint: disable=no-value-for-parameter
              self._frequency_threshold, self._top_k)))

  return counts | 'WriteVocabFile' >> (
      _WriteVocabFile(  # pylint: disable=no-value-for-parameter
          self._base_temp_dir, self._vocab_filename, self._store_frequency))
def expand(self, stats: beam.PCollection):
  return (
      stats
      | 'MergeDatasetFeatureStatisticsProtos' >> beam.CombineGlobally(
          merge_util.merge_dataset_feature_statistics_list))
def run(argv=None):
  """Main entry point; defines and runs the game_stats pipeline."""
  parser = argparse.ArgumentParser()

  parser.add_argument('--topic', type=str, help='Pub/Sub topic to read from')
  parser.add_argument(
      '--subscription', type=str, help='Pub/Sub subscription to read from')
  parser.add_argument(
      '--dataset',
      type=str,
      required=True,
      help='BigQuery Dataset to write tables to. '
      'Must already exist.')
  parser.add_argument(
      '--table_name',
      type=str,
      default='game_stats',
      help='The BigQuery table name. Should not already exist.')
  parser.add_argument(
      '--fixed_window_duration',
      type=int,
      default=60,
      help='Numeric value of fixed window duration for user '
      'analysis, in minutes')
  parser.add_argument(
      '--session_gap',
      type=int,
      default=5,
      help='Numeric value of gap between user sessions, '
      'in minutes')
  parser.add_argument(
      '--user_activity_window_duration',
      type=int,
      default=30,
      help='Numeric value of fixed window for finding mean of '
      'user session duration, in minutes')

  args, pipeline_args = parser.parse_known_args(argv)

  if args.topic is None and args.subscription is None:
    parser.print_usage()
    print(sys.argv[0] + ': error: one of --topic or --subscription is required')
    sys.exit(1)

  options = PipelineOptions(pipeline_args)

  # We also require the --project option to access --dataset
  if options.view_as(GoogleCloudOptions).project is None:
    parser.print_usage()
    print(sys.argv[0] + ': error: argument --project is required')
    sys.exit(1)

  fixed_window_duration = args.fixed_window_duration * 60
  session_gap = args.session_gap * 60
  user_activity_window_duration = args.user_activity_window_duration * 60

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  options.view_as(SetupOptions).save_main_session = True

  # Enforce that this pipeline is always run in streaming mode
  options.view_as(StandardOptions).streaming = True

  with beam.Pipeline(options=options) as p:
    # Read game events from Pub/Sub using custom timestamps, which
    # are extracted from the data elements, and parse the data.
    if args.subscription:
      scores = p | 'ReadPubSub' >> beam.io.ReadFromPubSub(
          subscription=args.subscription)
    else:
      scores = p | 'ReadPubSub' >> beam.io.ReadFromPubSub(topic=args.topic)

    raw_events = (
        scores
        | 'DecodeString' >> beam.Map(lambda b: b.decode('utf-8'))
        | 'ParseGameEventFn' >> beam.ParDo(ParseGameEventFn())
        | 'AddEventTimestamps' >> beam.Map(
            lambda elem: beam.window.TimestampedValue(elem, elem['timestamp'])))

    # Extract username/score pairs from the event stream
    user_events = (
        raw_events
        | 'ExtractUserScores' >> beam.Map(
            lambda elem: (elem['user'], elem['score'])))

    # Calculate the total score per user over fixed windows, and cumulative
    # updates for late data
    spammers_view = (
        user_events
        | 'UserFixedWindows' >> beam.WindowInto(
            beam.window.FixedWindows(fixed_window_duration))

        # Filter out everyone but those with (SCORE_WEIGHT * avg) clickrate.
        # These might be robots/spammers.
        | 'CalculateSpammyUsers' >> CalculateSpammyUsers()

        # Derive a view from the collection of spammer users. It will be used
        # as a side input in calculating the team score sums, below
        | 'CreateSpammersView' >> beam.CombineGlobally(
            beam.combiners.ToDictCombineFn()).as_singleton_view())

    # [START filter_and_calc]
    # Calculate the total score per team over fixed windows, and emit cumulative
    # updates for late data. Uses the side input derived above --the set of
    # suspected robots-- to filter out scores from those users from the sum.
    # Write the results to BigQuery.
    (raw_events  # pylint: disable=expression-not-assigned
     | 'WindowIntoFixedWindows' >> beam.WindowInto(
         beam.window.FixedWindows(fixed_window_duration))

     # Filter out the detected spammer users, using the side input derived above
     | 'FilterOutSpammers' >> beam.Filter(
         lambda elem, spammers: elem['user'] not in spammers, spammers_view)

     # Extract and sum teamname/score pairs from the event data.
     | 'ExtractAndSumScore' >> ExtractAndSumScore('team')
     # [END filter_and_calc]

     | 'TeamScoresDict' >> beam.ParDo(TeamScoresDict())
     | 'WriteTeamScoreSums' >> WriteToBigQuery(
         args.table_name + '_teams', args.dataset, {
             'team': 'STRING',
             'total_score': 'INTEGER',
             'window_start': 'STRING',
             'processing_time': 'STRING',
         },
         options.view_as(GoogleCloudOptions).project))

    # [START session_calc]
    # Detect user sessions-- that is, a burst of activity separated by a gap
    # from further activity. Find and record the mean session lengths.
    # This information could help the game designers track the changing user
    # engagement as their set of game changes.
    (user_events  # pylint: disable=expression-not-assigned
     | 'WindowIntoSessions' >> beam.WindowInto(
         beam.window.Sessions(session_gap),
         timestamp_combiner=beam.window.TimestampCombiner.OUTPUT_AT_EOW)

     # For this use, we care only about the existence of the session, not any
     # particular information aggregated over it, so we can just group by key
     # and assign a "dummy value" of None.
     | beam.CombinePerKey(lambda _: None)

     # Get the duration of the session
     | 'UserSessionActivity' >> beam.ParDo(UserSessionActivity())
     # [END session_calc]

     # [START rewindow]
     # Re-window to process groups of session sums according to when the
     # sessions complete
     | 'WindowToExtractSessionMean' >> beam.WindowInto(
         beam.window.FixedWindows(user_activity_window_duration))

     # Find the mean session duration in each window
     | beam.CombineGlobally(
         beam.combiners.MeanCombineFn()).without_defaults()
     | 'FormatAvgSessionLength' >> beam.Map(
         lambda elem: {'mean_duration': float(elem)})
     | 'WriteAvgSessionLength' >> WriteToBigQuery(
         args.table_name + '_sessions', args.dataset, {
             'mean_duration': 'FLOAT',
         },
         options.view_as(GoogleCloudOptions).project))
def _compute_sum(examples):
  return (examples
          | beam.Map(lambda x: x[1]['label'])
          | beam.CombineGlobally(sum))
def _compute_mean(examples):
  return (examples
          | beam.Map(lambda x: x[1]['id'])
          | beam.CombineGlobally(beam.combiners.MeanCombineFn()))
def expand(self, pcoll):
  return pcoll | beam.CombineGlobally(
      _MergeDefinitionsFn(self._definitions_merger)).without_defaults()
def expand(self, pcoll):
  return pcoll \
      | beam.CombineGlobally(sum).with_output_types(int)
def expand(self, pcoll):
  return pcoll | 'MergeHeaders' >> beam.CombineGlobally(
      _MergeHeadersFn(self._header_merger)).without_defaults()
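# --- Illustrative note on without_defaults() (not from the transforms above) ---
# The expand() methods above chain .without_defaults() onto CombineGlobally.
# A small self-contained sketch of what that changes: with defaults, an empty
# globally-windowed input still emits the combiner's default value; with
# .without_defaults() it stays empty, which is also the form needed when the
# input uses non-global windowing. The values and labels here are made up.
import apache_beam as beam
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to

with TestPipeline() as p:
  nothing = (
      p
      | beam.Create([1, 2, 3])
      | 'DropEverything' >> beam.Filter(lambda x: x > 100))

  # With defaults: an empty input still emits sum() of nothing, i.e. 0.
  with_default = nothing | 'SumWithDefault' >> beam.CombineGlobally(sum)
  assert_that(with_default, equal_to([0]), label='CheckDefault')

  # without_defaults(): an empty input produces an empty output.
  no_default = (
      nothing
      | 'SumWithoutDefault' >> beam.CombineGlobally(sum).without_defaults())
  assert_that(no_default, equal_to([]), label='CheckNoDefault')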
def test_global_sum(self):
  pc = [1, 2, 3]
  # [START global_sum]
  result = pc | beam.CombineGlobally(sum)
  # [END global_sum]
  self.assertEqual([6], result)
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# beam-playground:
#   name: AggregationSum
#   description: Task from katas to compute the sum of all elements.
#   multifile: false
#   pipeline_options:
#   categories:
#     - Combiners

import apache_beam as beam

from log_elements import LogElements

with beam.Pipeline() as p:
  (p | beam.Create(range(1, 11))
     | beam.CombineGlobally(sum)
     | LogElements())
import apache_beam as beam


# Advanced Combiner - CombineFn
class Combiner(beam.CombineFn):

    def create_accumulator(self, *args, **kwargs):
        return 0.0, 0

    def add_input(self, mutable_accumulator, element, *args, **kwargs):
        num, count = mutable_accumulator
        return num + element, count + 1

    def merge_accumulators(self, accumulators, *args, **kwargs):
        num_values, count_values = zip(*accumulators)
        return sum(num_values), sum(count_values)

    def extract_output(self, accumulator, *args, **kwargs):
        num_values, count_values = accumulator
        return num_values / count_values if count_values else float('NaN')


with beam.Pipeline() as combiner_pipeline:
    (combiner_pipeline
     | "Create list" >> beam.Create([1, 2, 3, 4, 5, 6])
     | "Using combiner" >> beam.CombineGlobally(Combiner())
     | "Writing output" >> beam.io.WriteToText("combiner_output/output"))
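# --- Illustrative follow-up (not from the script above) ---
# The same Combiner class can be reused per key with beam.CombinePerKey; the
# keyed input values below are made up for the example, and the sketch assumes
# the Combiner class defined immediately above is in scope.
with beam.Pipeline() as per_key_pipeline:
    (per_key_pipeline
     | "Create keyed list" >> beam.Create(
         [("a", 1), ("a", 3), ("b", 10), ("b", 20), ("b", 30)])
     | "Mean per key" >> beam.CombinePerKey(Combiner())
     | "Print result" >> beam.Map(print))  # e.g. ('a', 2.0) and ('b', 20.0)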
def expand(self, estimates):
  return (estimates
          | 'ExtractFileSize' >> beam.Map(
              lambda estimate: estimate.size_in_bytes)
          | 'SumFileSizes' >> beam.CombineGlobally(sum))
# "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import apache_beam as beam from log_elements import LogElements def sum(numbers): total = 0 for num in numbers: total += num return total p = beam.Pipeline() (p | beam.Create([1, 2, 3, 4, 5]) | beam.CombineGlobally(sum) | LogElements()) p.run()
def expand(self, estimates):
  return (estimates
          | 'ExtractVariantCount' >> beam.Map(
              lambda estimate: estimate.estimated_variant_count)
          | 'SumVariantCounts' >> beam.CombineGlobally(sum))
def CSVToRecordBatch(lines: beam.pvalue.PCollection,
                     column_names: List[Text],
                     desired_batch_size: Optional[int],
                     delimiter: Text = ",",
                     skip_blank_lines: bool = True,
                     schema: Optional[schema_pb2.Schema] = None,
                     multivalent_columns: Optional[List[Text]] = None,
                     secondary_delimiter: Optional[Text] = None,
                     raw_record_column_name: Optional[Text] = None):
  """Decodes CSV records into Arrow RecordBatches.

  Args:
    lines: The pcollection of raw records (csv lines).
    column_names: List of feature names. Order must match the order in the
      CSV file.
    desired_batch_size: Batch size. The output Arrow RecordBatches will have
      as many rows as the `desired_batch_size`. If None, the batch size is
      auto tuned by beam.
    delimiter: A one-character string used to separate fields.
    skip_blank_lines: A boolean to indicate whether to skip over blank lines
      rather than interpreting them as missing values.
    schema: An optional schema of the input data. If this is provided, it must
      contain a subset of columns in `column_names`. If a feature is in
      `column_names` but not in the schema, it won't be in the result
      RecordBatch.
    multivalent_columns: Columns that can contain multiple values. If
      secondary_delimiter is provided, this must also be provided.
    secondary_delimiter: Delimiter used for parsing multivalent columns. If
      multivalent_columns is provided, this must also be provided.
    raw_record_column_name: Optional name for a column containing the raw csv
      lines. If this is None, then this column will not be produced. This will
      always be the last column in the record batch.

  Returns:
    RecordBatches of the CSV lines.

  Raises:
    ValueError:
      * If the columns do not match the specified csv headers.
      * If the schema has invalid feature types.
      * If the schema does not contain all columns.
      * If raw_record_column_name exists in column_names
  """
  if (raw_record_column_name is not None and
      raw_record_column_name in column_names):
    raise ValueError(
        "raw_record_column_name: {} is already an existing column name. "
        "Please choose a different name.".format(raw_record_column_name))

  csv_lines_and_raw_records = (
      lines | "ParseCSVLines" >> beam.ParDo(ParseCSVLine(delimiter)))

  if schema is not None:
    column_infos = _GetColumnInfosFromSchema(schema, column_names)
  else:
    # TODO(b/72746442): Consider using a DeepCopy optimization similar to TFT.
    # Do first pass to infer the feature types.
    column_infos = beam.pvalue.AsSingleton(
        csv_lines_and_raw_records
        | "ExtractParsedCSVLines" >> beam.Keys()
        | "InferColumnTypes" >> beam.CombineGlobally(
            ColumnTypeInferrer(
                column_names=column_names,
                skip_blank_lines=skip_blank_lines,
                multivalent_columns=multivalent_columns,
                secondary_delimiter=secondary_delimiter)))

  # Do second pass to generate the RecordBatches.
  return (
      csv_lines_and_raw_records
      | "BatchCSVLines" >> beam.BatchElements(
          **batch_util.GetBatchElementsKwargs(desired_batch_size))
      | "BatchedCSVRowsToArrow" >> beam.ParDo(
          BatchedCSVRowsToRecordBatch(
              skip_blank_lines=skip_blank_lines,
              multivalent_columns=multivalent_columns,
              secondary_delimiter=secondary_delimiter,
              raw_record_column_name=raw_record_column_name), column_infos))
def expand(self, sample_map):
  return (sample_map
          | 'GetListsOfValueCounts' >> beam.Values()
          | 'SumValueCountsPerSample' >> beam.Map(sum)
          | 'SumTotalValueCounts' >> beam.CombineGlobally(sum))
def expand(self, pcolls):
  scalar_inputs = [expr for expr in self.stage.inputs if is_scalar(expr)]
  tabular_inputs = [
      expr for expr in self.stage.inputs if not is_scalar(expr)
  ]

  if len(tabular_inputs) == 0:
    partitioned_pcoll = (
        next(iter(pcolls.values())).pipeline | beam.Create([{}]))

  elif self.stage.partitioning != partitionings.Nothing():
    # Partitioning required for these operations.
    # Compute the number of partitions to use for the inputs based on
    # the estimated size of the inputs.
    if self.stage.partitioning == partitionings.Singleton():
      # Always a single partition, don't waste time computing sizes.
      num_partitions = 1
    else:
      # Estimate the sizes from the outputs of a *previous* stage such
      # that using these estimates will not cause a fusion break.
      input_sizes = [
          estimate_size(input, same_stage_ok=False)
          for input in tabular_inputs
      ]
      if None in input_sizes:
        # We were unable to (cheaply) compute the size of one or more
        # inputs.
        num_partitions = DEFAULT_PARTITIONS
      else:
        num_partitions = beam.pvalue.AsSingleton(
            input_sizes
            | 'FlattenSizes' >> beam.Flatten()
            | 'SumSizes' >> beam.CombineGlobally(sum)
            | 'NumPartitions' >> beam.Map(
                lambda size: max(
                    MIN_PARTITIONS,
                    min(MAX_PARTITIONS, size // TARGET_PARTITION_SIZE))))

    # Arrange such that partitioned_pcoll is properly partitioned.
    main_pcolls = {
        expr._id: pcolls[expr._id]
        | 'Partition_%s_%s' % (self.stage.partitioning, expr._id) >>
        beam.FlatMap(self.stage.partitioning.partition_fn, num_partitions)
        for expr in tabular_inputs
    } | beam.CoGroupByKey()
    partitioned_pcoll = main_pcolls | beam.ParDo(_ReBatch())

  else:
    # Already partitioned, or no partitioning needed.
    assert len(tabular_inputs) == 1
    tag = tabular_inputs[0]._id
    partitioned_pcoll = pcolls[tag] | beam.Map(lambda df: {tag: df})

  side_pcolls = {
      expr._id: beam.pvalue.AsSingleton(pcolls[expr._id])
      for expr in scalar_inputs
  }

  # Actually evaluate the expressions.
  def evaluate(partition, stage=self.stage, **side_inputs):
    session = expressions.Session(
        dict([(expr, partition[expr._id]) for expr in tabular_inputs] +
             [(expr, side_inputs[expr._id]) for expr in scalar_inputs]))
    for expr in stage.outputs:
      yield beam.pvalue.TaggedOutput(expr._id, expr.evaluate_at(session))

  return partitioned_pcoll | beam.FlatMap(evaluate, **side_pcolls).with_outputs()
def run(argv=None):
    pipeline_args = [
        '--project={0}'.format(PROJECT),
        '--job_name=majesticmillion1',
        '--save_main_session',
        '--staging_location=gs://{0}/staging/'.format(BUCKET),
        '--temp_location=gs://{0}/temp/'.format(BUCKET),
        '--num_workers=4',
        '--runner=DataflowRunner',
        '--inputFile=gs://{0}/Sample_Data/majestic_million.csv'.format(BUCKET),
        '--template_location=gs://{0}/templates/majestic_million_template'.format(BUCKET),
        '--zone=australia-southeast1-a'
        # '--region=australia-southeast1',
    ]

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    inbound_options = pipeline_options.view_as(FileLoader)
    input = inbound_options.inputFile

    # The with-block runs the pipeline on exit, so no separate p.run() is needed.
    with beam.Pipeline(options=pipeline_options) as p:
        TLD_Desc = (
            p
            | 'Read TLD Description File' >> beam.io.ReadFromText(TLDFile)
            | 'Parse Descriptions' >> beam.ParDo(combine_TLD())
            | 'Combine Descriptions to Dictionary' >> beam.CombineGlobally(combine_pdict))

        excludedTLDs = (
            p
            | 'Read excluded TLD file' >> beam.io.ReadFromText(excludedTLDFile)
            | 'Get list of excluded TLD' >> beam.ParDo(lambda x: x.split(',')))

        # Extract records as dictionaries
        records = (
            p
            | 'Read File' >> beam.io.ReadFromText(input, skip_header_lines=1)
            | 'Parse CSV' >> beam.ParDo(Split(), SCHEMA)
            | 'Add Descriptions' >> beam.ParDo(
                AddDTLDDesc(), beam.pvalue.AsSingleton(TLD_Desc)))

        # Write TLD aggregations to BigQuery
        (records
         | 'Aggregate TLDS' >> CountTLDs(excludedTLDs)
         | 'Write TLDs to BigQuery' >> beam.io.WriteToBigQuery(
             '{0}:{1}.TLDCounts'.format(PROJECT, DATASET),  # Enter your table name
             schema=TLD_SCHEMA,
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

        # Write all records to BigQuery
        (records
         | 'Write Items BQ' >> beam.io.WriteToBigQuery(
             '{0}:{1}.TopSites'.format(PROJECT, DATASET),  # Enter your table name
             schema=SCHEMA + "," + DESCRIPTIONSCHEMA,
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

        # Write metadata to Datastore
        (records
         | 'Get Record Count' >> beam.combiners.Count.Globally()
         | 'Create Metadata' >> beam.ParDo(GetMetaData(inbound_options.inputFile))
         | 'Create DS Entity' >> beam.Map(lambda x: create_ds_entity(x))
         | 'Write To DS' >> WriteToDatastore(PROJECT))