import apache_beam as beam
from apache_beam import pvalue
from apache_beam.options.pipeline_options import PipelineOptions


def run():
  p = beam.Pipeline(options=PipelineOptions())
  words = p | beam.Create(['a', 'bb', 'ccc', 'dddd'])

  # Callable takes additional arguments.
  def filter_using_length(word, lower_bound, upper_bound=float('inf')):
    if lower_bound <= len(word) <= upper_bound:
      yield word

  # Construct a deferred side input. Note that avg_word_len is a deferred
  # PCollection: it cannot be printed or passed to beam.Create; it can only
  # be consumed through a side-input wrapper such as pvalue.AsSingleton.
  avg_word_len = (
      words
      | beam.Map(len)
      | beam.CombineGlobally(beam.combiners.MeanCombineFn()))

  # Call with explicit side inputs.
  small_words = words | 'small' >> beam.FlatMap(filter_using_length, 0, 3)

  # A single deferred side input.
  larger_than_average = (
      words
      | 'large' >> beam.FlatMap(
          filter_using_length,
          lower_bound=pvalue.AsSingleton(avg_word_len)))

  # Mix and match.
  small_but_nontrivial = words | beam.FlatMap(
      filter_using_length,
      lower_bound=2,
      upper_bound=pvalue.AsSingleton(avg_word_len))
def test_deferred_side_inputs(self):
  @typehints.with_input_types(str, int)
  def repeat(s, times):
    return s * times

  with TestPipeline() as p:
    main_input = p | beam.Create(['a', 'bb', 'c'])
    side_input = p | 'side' >> beam.Create([3])
    result = main_input | beam.Map(repeat, pvalue.AsSingleton(side_input))
    assert_that(result, equal_to(['aaa', 'bbbbbb', 'ccc']))

    bad_side_input = p | 'bad_side' >> beam.Create(['z'])
    with self.assertRaises(typehints.TypeCheckError):
      main_input | 'bis' >> beam.Map(
          repeat, pvalue.AsSingleton(bad_side_input))
def _add_inferred_headers(all_patterns,  # type: List[str]
                          pipeline,  # type: beam.Pipeline
                          known_args,  # type: argparse.Namespace
                          merged_header,  # type: pvalue.PCollection
                          pipeline_mode  # type: int
                         ):
  # type: (...) -> pvalue.PCollection
  annotation_fields_to_infer = (known_args.annotation_fields
                                if known_args.infer_annotation_types else [])
  inferred_headers = (
      _read_variants(all_patterns, pipeline, known_args, pipeline_mode)
      | 'FilterVariants' >> filter_variants.FilterVariants(
          reference_names=known_args.reference_names)
      | 'InferHeaderFields' >> infer_headers.InferHeaderFields(
          pvalue.AsSingleton(merged_header),
          known_args.allow_incompatible_records,
          known_args.infer_headers,
          annotation_fields_to_infer))
  merged_header = (
      (inferred_headers, merged_header)
      | 'FlattenHeaders' >> beam.Flatten()
      | 'MergeHeadersFromVcfAndVariants' >> merge_headers.MergeHeaders(
          known_args.split_alternate_allele_info_fields,
          known_args.allow_incompatible_records))
  return merged_header
def test_pardo_side_input(self):
  # pylint: disable=line-too-long
  with TestPipeline() as p:
    words = p | 'start' >> beam.Create(['a', 'bb', 'ccc', 'dddd'])

    # [START model_pardo_side_input]
    # Callable takes additional arguments.
    def filter_using_length(word, lower_bound, upper_bound=float('inf')):
      if lower_bound <= len(word) <= upper_bound:
        yield word

    # Construct a deferred side input.
    avg_word_len = (
        words
        | beam.Map(len)
        | beam.CombineGlobally(beam.combiners.MeanCombineFn()))

    # Call with explicit side inputs.
    small_words = words | 'small' >> beam.FlatMap(filter_using_length, 0, 3)

    # A single deferred side input.
    larger_than_average = (
        words
        | 'large' >> beam.FlatMap(
            filter_using_length,
            lower_bound=pvalue.AsSingleton(avg_word_len)))

    # Mix and match.
    small_but_nontrivial = words | beam.FlatMap(
        filter_using_length,
        lower_bound=2,
        upper_bound=pvalue.AsSingleton(avg_word_len))
    # [END model_pardo_side_input]

    assert_that(small_words, equal_to(['a', 'bb', 'ccc']))
    assert_that(larger_than_average, equal_to(['ccc', 'dddd']),
                label='larger_than_average')
    assert_that(small_but_nontrivial, equal_to(['bb']),
                label='small_but_not_trivial')
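# A minimal, hedged sketch (not taken from the snippets above) of
# AsSingleton's contract: the wrapped PCollection must hold exactly one
# element at runtime, and an optional default_value covers the empty case.
# All names in this sketch are illustrative assumptions.
import apache_beam as beam
from apache_beam import pvalue

with beam.Pipeline() as p:
    words = p | 'Words' >> beam.Create(['a', 'bb', 'ccc'])
    # Deliberately empty: nothing is longer than 100 characters, so without
    # default_value the AsSingleton below would fail at runtime.
    threshold = (
        words
        | 'KeepVeryLong' >> beam.Filter(lambda w: len(w) > 100)
        | 'Lengths' >> beam.Map(len))
    kept = words | 'FilterByThreshold' >> beam.Filter(
        lambda w, t: len(w) >= t,
        t=pvalue.AsSingleton(threshold, default_value=2))
    _ = kept | 'Print' >> beam.Map(print)  # prints: bb, ccc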
def _get_inferred_headers(variants,  # type: pvalue.PCollection
                          merged_header  # type: pvalue.PCollection
                         ):
  # type: (...) -> (pvalue.PCollection, pvalue.PCollection)
  inferred_headers = (
      variants
      | 'FilterVariants' >> filter_variants.FilterVariants()
      | 'InferUndefinedHeaderFields' >>
      infer_undefined_headers.InferUndefinedHeaderFields(
          pvalue.AsSingleton(merged_header)))
  merged_header = (
      (inferred_headers, merged_header)
      | beam.Flatten()
      | 'MergeHeadersFromVcfAndVariants' >> merge_headers.MergeHeaders(
          allow_incompatible_records=True))
  return inferred_headers, merged_header
def expand(self, records):
  o = (records
       | 'pair one' >> beam.Map(lambda x: (1, x))
       | 'group all records' >> beam.GroupByKey()
       | 'split one of' >> beam.ParDo(
           self.PreGenerateMappings()).with_outputs('splitted', 'combine'))

  # Create mappings, and prevent fusion (this limits the parallelization
  # in the optimization step).
  mappings = (o.splitted
              | 'create mappings' >> beam.ParDo(
                  self.GenerateMappings(), pvalue.AsSingleton(o.combine))
              | 'prevent fusion' >> beam.Reshuffle())
  return mappings
def test_defined_fields_filtered_one_variant(self):
  # All FORMATs and INFOs are already defined in the header section of VCF
  # files.
  with TestPipeline() as p:
    vcf_headers = self._get_sample_header_fields()
    vcf_headers_side_input = p | 'vcf_headers' >> Create([vcf_headers])
    variant = self._get_sample_variant_1()
    inferred_headers = (
        p
        | Create([variant])
        | 'InferUndefinedHeaderFields' >>
        infer_undefined_headers.InferUndefinedHeaderFields(
            pvalue.AsSingleton(vcf_headers_side_input)))
    expected = vcf_header_io.VcfHeader()
    assert_that(inferred_headers, equal_to([expected]))
    p.run()
def _add_inferred_headers(
    pipeline,  # type: beam.Pipeline
    known_args,  # type: argparse.Namespace
    merged_header  # type: pvalue.PCollection
    ):
  # type: (...) -> pvalue.PCollection
  inferred_headers = (
      _read_variants(pipeline, known_args)
      | 'FilterVariants' >> filter_variants.FilterVariants(
          reference_names=known_args.reference_names)
      | 'InferUndefinedHeaderFields' >>
      infer_undefined_headers.InferUndefinedHeaderFields(
          pvalue.AsSingleton(merged_header)))
  merged_header = (
      (inferred_headers, merged_header)
      | beam.Flatten()
      | 'MergeHeadersFromVcfAndVariants' >> merge_headers.MergeHeaders(
          known_args.split_alternate_allele_info_fields,
          known_args.allow_incompatible_records))
  return merged_header
def test_defined_fields_filtered_two_variants(self):
  # Only the INFO and FORMAT fields of the first variant are already defined
  # in the header section of the VCF files.
  with TestPipeline() as p:
    vcf_headers = self._get_sample_header_fields()
    vcf_headers_side_input = p | 'vcf_header' >> Create([vcf_headers])
    variant_1 = self._get_sample_variant_1()
    variant_2 = self._get_sample_variant_2()
    inferred_headers = (
        p
        | Create([variant_1, variant_2])
        | 'InferUndefinedHeaderFields' >>
        infer_undefined_headers.InferUndefinedHeaderFields(
            pvalue.AsSingleton(vcf_headers_side_input)))
    expected_infos = {'IS_2': Info('IS_2', 1, 'String', '', '', '')}
    expected_formats = {'FI_2': Format('FI_2', 1, 'String', '')}
    expected = vcf_header_io.VcfHeader(
        infos=expected_infos, formats=expected_formats)
    assert_that(inferred_headers, equal_to([expected]))
    p.run()
import apache_beam as beam
from apache_beam import pvalue
from apache_beam.options.pipeline_options import PipelineOptions


def run():
    # First, build the pipeline.
    p = beam.Pipeline(options=PipelineOptions())

    # 1. Use a Python list as the pipeline's input.
    inputs = ['good morning.', 'good afternoon.', 'good evening.']
    words = p | 'read' >> beam.Create(inputs)

    # The mean word length is a deferred PCollection; it cannot be printed
    # or passed to beam.Create, only consumed as a side input.
    mean_word_length = (
        words
        | beam.Map(len)
        | beam.CombineGlobally(beam.combiners.MeanCombineFn()))

    output = (
        words
        | 'FilterMeanLength' >> beam.ParDo(
            FilterMeanLengthFn(), pvalue.AsSingleton(mean_word_length))
        | 'write to text' >> beam.io.WriteToText('./output.txt'))

    p.run().wait_until_finish()
import apache_beam as beam
from apache_beam import pvalue
from apache_beam.options.pipeline_options import PipelineOptions


def run():
    p = beam.Pipeline(options=PipelineOptions())

    inputs = ['good morning.', 'good afternoon.', 'good evening.']

    # Side input
    mean_word_length = (
        p
        | 'CreateWordLength' >> beam.Create([len(s) for s in inputs])
        | 'ComputeMeanWordLength' >> beam.CombineGlobally(
            beam.combiners.MeanCombineFn()))

    # Main input
    output = (
        p
        | 'CreateWord' >> beam.Create(inputs)
        # Pass the side input as the second argument of ParDo.
        | 'FilterMeanLength' >> beam.ParDo(
            FilterMeanLengthFn(), pvalue.AsSingleton(mean_word_length))
        | 'WriteToText' >> beam.io.WriteToText('path/to/output'))

    p.run().wait_until_finish()
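# The two run() snippets above call FilterMeanLengthFn without defining it.
# A hedged sketch of a plausible implementation (an assumption, not the
# original author's code): the DoFn receives the mean word length as a
# singleton side input and keeps the strings that are at least that long.
import apache_beam as beam


class FilterMeanLengthFn(beam.DoFn):
    """Keeps strings whose length is at or above the mean word length."""

    def process(self, element, mean_word_length):
        if len(element) >= mean_word_length:
            yield element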
import argparse

import apache_beam as beam
from apache_beam import pvalue
from apache_beam.io import ReadFromText
from apache_beam.options.pipeline_options import PipelineOptions, SetupOptions


def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    BUCKET = 'BUCKET_NAME'

    data = [('NC', 'F', 2020, 'Hello', 3200),
            ('NC', 'F', 2020, 'World', 3180)]

    schema = (p
              | 'Read Schema from GCS' >> ReadFromText(
                  'gs://{}/schema.json'.format(BUCKET)))

    (p
     | 'Create Events' >> beam.Create(data)
     | 'Enrich with side input' >> beam.ParDo(
         EnrichElementsFn(), pvalue.AsSingleton(schema))
     | 'Log elements' >> beam.ParDo(LogElementsFn()))

    result = p.run()
    result.wait_until_finish()
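# EnrichElementsFn and LogElementsFn are referenced above but not defined in
# the snippet. A hedged sketch of plausible implementations (assumptions,
# not the original code): one joins the schema side input onto each element,
# the other logs elements as they pass through.
import logging

import apache_beam as beam


class EnrichElementsFn(beam.DoFn):
    """Pairs each element with the schema text read from GCS."""

    def process(self, element, schema):
        yield (element, schema)


class LogElementsFn(beam.DoFn):
    """Logs each element and passes it through unchanged."""

    def process(self, element):
        logging.info('element: %s', element)
        yield element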
def expand(self, pcoll):
  p = pcoll.pipeline
  temp_location = p.options.view_as(GoogleCloudOptions).temp_location

  empty_pc = p | "ImpulseEmptyPC" >> beam.Create([])
  singleton_pc = p | "ImpulseSingleElementPC" >> beam.Create([None])

  load_job_name_pcv = pvalue.AsSingleton(
      singleton_pc
      | beam.Map(lambda _: _generate_load_job_name()))

  file_prefix_pcv = pvalue.AsSingleton(
      singleton_pc
      | "GenerateFilePrefix" >> beam.Map(
          file_prefix_generator(self._validate,
                                self._custom_gcs_temp_location,
                                temp_location)))

  destination_data_kv_pc = (
      pcoll
      | "RewindowIntoGlobal" >> self._window_fn()
      | "AppendDestination" >> beam.ParDo(
          bigquery_tools.AppendDestinationsFn(self.destination),
          *self.table_side_inputs))

  all_destination_file_pairs_pc = self._write_files(
      destination_data_kv_pc, file_prefix_pcv)

  grouped_files_pc = (
      all_destination_file_pairs_pc
      | "GroupFilesByTableDestinations" >> beam.GroupByKey())

  partitions = (
      grouped_files_pc
      | beam.ParDo(
          PartitionFiles(self.max_partition_size,
                         self.max_files_per_partition)).with_outputs(
                             PartitionFiles.MULTIPLE_PARTITIONS_TAG,
                             PartitionFiles.SINGLE_PARTITION_TAG))

  multiple_partitions_per_destination_pc = partitions[
      PartitionFiles.MULTIPLE_PARTITIONS_TAG]
  single_partition_per_destination_pc = partitions[
      PartitionFiles.SINGLE_PARTITION_TAG]

  # When using dynamic destinations, elements with both single as well as
  # multiple partitions are loaded into BigQuery using temporary tables to
  # ensure atomicity.
  if self.dynamic_destinations:
    all_partitions = ((multiple_partitions_per_destination_pc,
                       single_partition_per_destination_pc)
                      | "FlattenPartitions" >> beam.Flatten())
    destination_load_job_ids_pc, destination_copy_job_ids_pc = (
        self._load_data(all_partitions, empty_pc, load_job_name_pcv,
                        singleton_pc))
  else:
    destination_load_job_ids_pc, destination_copy_job_ids_pc = (
        self._load_data(multiple_partitions_per_destination_pc,
                        single_partition_per_destination_pc,
                        load_job_name_pcv, singleton_pc))

  return {
      self.DESTINATION_JOBID_PAIRS: destination_load_job_ids_pc,
      self.DESTINATION_FILE_PAIRS: all_destination_file_pairs_pc,
      self.DESTINATION_COPY_JOBID_PAIRS: destination_copy_job_ids_pc,
  }
def run(argv=None):
  """Main function.

  Main function containing the Apache Beam pipeline describing how to
  process the input CSV file to generate the LTV predictions.
  """
  parser = argparse.ArgumentParser()
  _, pipeline_args = parser.parse_known_args(argv)
  options = pipeline_options.PipelineOptions(pipeline_args)
  runtime_options = options.view_as(RuntimeOptions)

  with beam.Pipeline(options=options) as pipeline:
    options = (
        pipeline
        | 'Create single element Stream containing options dict' >>
        beam.Create([options.get_all_options()])
        | beam.Map(
            lambda x: {
                k: v.get() if isinstance(v, value_provider.ValueProvider)
                else v
                for (k, v) in x.items()
            })
        | beam.Map(c.set_extra_options))

    full_elog = (
        pipeline
        | beam.io.ReadFromText(
            getattr(runtime_options, c._OPTION_INPUT_CSV),
            skip_header_lines=1)
        | beam.Map(lambda x: list(csv.reader([x]))[0])
        | beam.FlatMap(
            c.csv_line_to_list,
            pvalue.AsSingleton(options))
        # (customer_id, date_str, date, sales, extra_dimension?)
    )

    full_elog_merged = (
        full_elog
        | beam.Filter(lambda x: x[3] > 0)  # sales > 0
        | beam.Map(lambda x: ((x[0], x[1]), x))  # key: (customer_id, date)
        | 'Group full elog by customer and date' >> beam.GroupByKey()
        | beam.Map(c.merge_full_elog_by_customer_and_date)
        # (customer_id, date_str, date, sales)
    )

    min_max_dates = (
        full_elog_merged
        | beam.Map(lambda x: x[2])  # date
        | beam.CombineGlobally(c.MinMaxDatesFn())
        | beam.Map(c.min_max_dates_dict))

    limits_dates = (
        min_max_dates
        | beam.FlatMap(c.limit_dates_boundaries,
                       pvalue.AsSingleton(options)))

    cohort = (
        full_elog_merged
        | beam.FlatMap(c.filter_customers_in_cohort,
                       pvalue.AsSingleton(limits_dates))
        | 'Distinct Customer IDs in Cohort' >> util.Distinct())

    cohort_count = (
        cohort
        | 'Count cohort entries' >> beam.combiners.Count.Globally())

    cohort_set = cohort | beam.Map(lambda x: (x, 1))

    all_customer_ids = (
        full_elog_merged
        | beam.Map(lambda x: x[0])  # key: customer_id
        | 'Distinct all Customer IDs' >> util.Distinct())

    all_customer_ids_count = (
        all_customer_ids
        | 'Count all customers' >> beam.combiners.Count.Globally())

    num_customers = (
        pipeline
        | 'Create single elem Stream I' >> beam.Create([1])
        | beam.FlatMap(c.count_customers,
                       pvalue.AsSingleton(cohort_count),
                       pvalue.AsSingleton(all_customer_ids_count),
                       pvalue.AsSingleton(options)))

    cal_hol_elog = (
        full_elog_merged
        | beam.FlatMap(c.filter_cohort_records_in_cal_hol,
                       pvalue.AsDict(cohort_set),
                       pvalue.AsSingleton(limits_dates)))

    cal_hol_elog_count = (
        cal_hol_elog
        | 'Count cal hol elog entries' >> beam.combiners.Count.Globally())

    calibration = (
        cal_hol_elog
        | beam.FlatMap(c.filter_records_in_calibration,
                       pvalue.AsSingleton(limits_dates)))

    num_txns_total = (
        full_elog_merged
        | beam.FlatMap(c.filter_records_in_cal_hol,
                       pvalue.AsSingleton(limits_dates))
        | 'Count num txns total' >> beam.combiners.Count.Globally())

    num_txns = (
        pipeline
        | 'Create single elem Stream II' >> beam.Create([1])
        | beam.FlatMap(c.count_txns,
                       pvalue.AsSingleton(cal_hol_elog_count),
                       pvalue.AsSingleton(num_txns_total),
                       pvalue.AsSingleton(options)))

    calcbs = (
        calibration
        | beam.Map(lambda x: (x[0], x))
        | 'Group calibration elog by customer id' >> beam.GroupByKey()
        | beam.FlatMap(
            c.create_cal_cbs,
            pvalue.AsSingleton(options),
            pvalue.AsSingleton(limits_dates))
        # (customer_id, number_of_transactions, average_order_value,
        #  frequency, recency, total_time_observed)
    )

    first_transaction_dates_by_customer = (
        cal_hol_elog
        | beam.Map(lambda x: (x[0], x))  # customer_id
        | 'Group cal hol elog by customer id' >> beam.GroupByKey()
        | beam.Map(lambda x: (x[0],
                              min(map(operator.itemgetter(2), x[1]))))
        # item 2 -> date
    )

    cal_hol_elog_repeat = (
        cal_hol_elog
        | beam.FlatMap(c.filter_first_transaction_date_records,
                       pvalue.AsDict(first_transaction_dates_by_customer))
        | beam.FlatMap(
            c.calculate_time_unit_numbers,
            # (customer_id, date, time_unit_number)
            pvalue.AsSingleton(options),
            pvalue.AsSingleton(limits_dates))
        | beam.Map(lambda x: (x[2], 1))  # key: time_unit_number
        | 'Group cal hol elog repeat by time unit number' >>
        beam.GroupByKey()
        | beam.Map(lambda x: (x[0], sum(x[1])))
        # (time_unit_number, occurrences)
    )

    repeat_tx = (
        pipeline
        | 'Create single elem Stream III' >> beam.Create([1])
        | beam.FlatMap(c.calculate_cumulative_repeat_transactions,
                       pvalue.AsIter(cal_hol_elog_repeat))
        # (time_unit_number, repeat_transactions,
        #  repeat_transactions_cumulative)
    )

    model_validation = (
        pipeline
        | 'Create single elem Stream IV' >> beam.Create([1])
        | beam.FlatMap(
            c.calculate_model_fit_validation,
            pvalue.AsSingleton(options),
            pvalue.AsSingleton(limits_dates),
            pvalue.AsIter(calcbs),
            pvalue.AsIter(repeat_tx),
            pvalue.AsSingleton(num_customers),
            pvalue.AsSingleton(num_txns)))

    _ = model_validation | beam.Map(c.raise_error_if_invalid_mape)

    _ = (
        model_validation
        | beam.Map(lambda x: x[0])
        | beam.FlatMap(c.calculate_model_fit_validation_to_text,
                       pvalue.AsSingleton(options)))

    fullcbs_without_extra_dimension = (
        full_elog_merged
        | beam.Map(lambda x: (x[0], x))  # key: customer_id
        | 'Group full merged elog by customer id' >> beam.GroupByKey()
        | beam.FlatMap(
            c.create_fullcbs,
            pvalue.AsSingleton(options),
            pvalue.AsSingleton(min_max_dates))
        # (customer_id, number_of_transactions, historical_aov,
        #  frequency, recency, total_time_observed)
    )

    full_elog_if_extra_dimension = (
        full_elog
        | 'Discard records if no extra dimension' >> beam.FlatMap(
            c.discard_if_no_extra_dimension, pvalue.AsSingleton(options)))

    extra_dimensions_stats = (
        full_elog_if_extra_dimension
        | beam.Map(lambda x: ((x[0], x[4]), x))
        # key: (customer_id, extra_dimension)
        | 'Group full elog by customer id and extra dimension' >>
        beam.GroupByKey()
        | beam.Map(c.create_extra_dimensions_stats)
        # (customer_id, extra_dimension, dimension_count, tot_sales,
        #  max_dimension_date)
    )

    top_dimension_per_customer = (
        extra_dimensions_stats
        | beam.Map(lambda x: (x[0], x))  # customer_id
        | 'Group extra dimension stats by customer id' >> beam.GroupByKey()
        | beam.Map(c.extract_top_extra_dimension)
        # (customer_id, extra_dimension, dimension_count, tot_sales,
        #  max_dimension_date)
    )

    customer_dimension_map = (
        top_dimension_per_customer
        | beam.Map(lambda x: (x[0], x[1])))  # (customer_id, extra_dimension)

    fullcbs = (
        fullcbs_without_extra_dimension
        | beam.FlatMap(
            c.add_top_extra_dimension_to_fullcbs,
            pvalue.AsSingleton(options),
            pvalue.AsDict(customer_dimension_map))
        # (customer_id, number_of_transactions, historical_aov,
        #  frequency, recency, total_time_observed, extra_dimension?)
    )

    prediction = (
        pipeline
        | 'Create single elem Stream V' >> beam.Create([1])
        | beam.FlatMap(
            c.calculate_prediction,
            pvalue.AsSingleton(options),
            pvalue.AsIter(fullcbs),
            pvalue.AsSingleton(num_customers),
            pvalue.AsSingleton(num_txns))
        # [customer_id, p_alive, predicted_purchases, future_aov,
        #  historical_aov, expected_value, frequency, recency,
        #  total_time_observed, extra_dimension?], prediction_params
    )

    prediction_by_customer_no_segments = (
        prediction
        | beam.FlatMap(lambda x: x[0]))  # Extract predictions by customer

    _ = (
        prediction
        | beam.Map(lambda x: x[1])  # Extract prediction params
        | beam.FlatMap(c.calculate_prediction_to_text,
                       pvalue.AsSingleton(options)))

    num_rows = (
        full_elog_merged
        | 'Count num rows in full elog merged' >>
        beam.combiners.Count.Globally())

    segment_predictions_exact = (
        pipeline
        | 'Create single elem Stream VII' >> beam.Create([1])
        | beam.FlatMap(
            lambda _, rows_count:
            [rows_count <= c._SEGMENT_PREDICTION_THRESHOLD],
            pvalue.AsSingleton(num_rows)))

    (sharded_cust_predictions_no_segments_exact,
     sharded_cust_predictions_no_segments_hash) = (
         prediction_by_customer_no_segments
         | beam.FlatMap(
             c.prediction_sharded,
             pvalue.AsSingleton(options),
             pvalue.AsSingleton(segment_predictions_exact))
         # [customer_id, p_alive, predicted_purchases, future_aov,
         #  historical_aov, expected_value, frequency, recency,
         #  total_time_observed, extra_dimension?]
         | beam.Partition(lambda x, _: 0 if x[1] else 1, 2))

    # BEGIN of "exact" branch
    prediction_by_customer_exact = (
        pipeline
        | 'Create single elem Stream VIII' >> beam.Create([1])
        | beam.FlatMap(
            c.split_in_ntiles_exact,
            pvalue.AsSingleton(options),
            pvalue.AsIter(sharded_cust_predictions_no_segments_exact))
        # [customer_id, p_alive, predicted_purchases, future_aov,
        #  historical_aov, expected_value, frequency, recency,
        #  total_time_observed, segment, extra_dimension?]
    )
    # END of "exact" branch

    # BEGIN of "hash" branch
    customer_count_by_expected_value = (
        sharded_cust_predictions_no_segments_hash
        | beam.Map(lambda x: (x[0][5], 1))  # (expected_value, 1)
        | 'Group customer predictions by expected value' >>
        beam.GroupByKey()
        | beam.Map(lambda x: (x[0], sum(x[1])))
        # (expected_value, customers_count)
    )

    hash_segment_limits = (
        pipeline
        | 'Create single elem Stream IX' >> beam.Create([1])
        | beam.FlatMap(c.expected_values_segment_limits,
                       pvalue.AsSingleton(options),
                       pvalue.AsIter(customer_count_by_expected_value),
                       pvalue.AsSingleton(all_customer_ids_count)))

    prediction_by_customer_hash = (
        sharded_cust_predictions_no_segments_hash
        | beam.Map(lambda x: x[0])
        | beam.FlatMap(c.split_in_ntiles_hash,
                       pvalue.AsSingleton(hash_segment_limits))
        # [customer_id, p_alive, predicted_purchases, future_aov,
        #  historical_aov, expected_value, frequency, recency,
        #  total_time_observed, segment, extra_dimension?]
    )
    # END of "hash" branch

    prediction_by_customer = (
        # Only one of these two streams will contain values.
        (prediction_by_customer_exact, prediction_by_customer_hash)
        | beam.Flatten())

    _ = (
        prediction_by_customer
        | beam.FlatMap(
            lambda x, opts: [x + ['']]
            if not opts[c._OPTION_EXTRA_DIMENSION_EXISTS] else [x],
            pvalue.AsSingleton(options))
        | 'prediction_by_customer to CSV line' >> beam.Map(
            c.list_to_csv_line)
        | 'Write prediction_by_customer' >> beam.io.WriteToText(
            getattr(runtime_options, c._OPTION_OUTPUT_FOLDER),
            header='customer_id,p_alive'
            ',predicted_purchases'
            ',future_aov,historical_aov'
            ',expected_value,frequency,recency'
            ',total_time_observed,segment'
            ',extra_dimension',
            shard_name_template='',
            num_shards=1,
            file_name_suffix='prediction_by_customer.csv'))

    prediction_summary_temp = (
        prediction_by_customer
        | beam.Map(lambda x: (x[9], x))  # key: segment
        | 'Group customer predictions by segment' >> beam.GroupByKey()
        | beam.FlatMap(c.generate_prediction_summary,
                       pvalue.AsSingleton(options))
        # (segment, average_retention_probability,
        #  average_predicted_customer_value, average_predicted_order_value,
        #  average_predicted_purchases, total_customer_value,
        #  number_of_customers)
    )

    tot_equity = (
        prediction_summary_temp
        | beam.Map(lambda x: x[5])  # total_customer_value
        | beam.CombineGlobally(sum))

    prediction_summary = (
        prediction_summary_temp
        | beam.FlatMap(c.calculate_perc_of_total_customer_value,
                       pvalue.AsSingleton(tot_equity),
                       pvalue.AsSingleton(options))
        # (segment, average_retention_probability,
        #  average_predicted_customer_value, average_predicted_order_value,
        #  average_predicted_purchases, total_customer_value,
        #  number_of_customers, perc_of_total_customer_value)
    )

    _ = (
        prediction_summary
        | 'prediction_summary to CSV line' >> beam.Map(c.list_to_csv_line)
        | 'Write prediction_summary' >> beam.io.WriteToText(
            getattr(runtime_options, c._OPTION_OUTPUT_FOLDER),
            header='segment,average_retention_probability'
            ',average_predicted_customer_value'
            ',average_predicted_order_value,average_predicted_purchases'
            ',total_customer_value,number_of_customers'
            ',perc_of_total_customer_value',
            shard_name_template='',
            num_shards=1,
            file_name_suffix='prediction_summary.csv'))

    prediction_summary_extra_dimension = (
        prediction_by_customer
        | 'Discard prediction if there is not extra dimension' >>
        beam.FlatMap(c.discard_if_no_extra_dimension,
                     pvalue.AsSingleton(options))
        | beam.Map(lambda x: (x[10], x))  # extra dimension
        | 'Group customer predictions by extra dimension' >>
        beam.GroupByKey()
        | beam.FlatMap(c.generate_prediction_summary_extra_dimension,
                       pvalue.AsSingleton(tot_equity),
                       pvalue.AsSingleton(options)))

    _ = (
        prediction_summary_extra_dimension
        | 'prediction_summary_extra_dimension to CSV line' >> beam.Map(
            c.list_to_csv_line)
        | 'Write prediction_summary_extra_dimension' >> beam.io.WriteToText(
            getattr(runtime_options, c._OPTION_OUTPUT_FOLDER),
            header='extra_dimension,average_retention_probability'
            ',average_predicted_customer_value'
            ',average_predicted_order_value'
            ',average_predicted_purchases,total_customer_value'
            ',number_of_customers,perc_of_total_customer_value',
            shard_name_template='',
            num_shards=1,
            file_name_suffix='prediction_summary_extra_dimension.csv'))
def expand(self, pcoll):
  p = pcoll.pipeline
  temp_location = p.options.view_as(GoogleCloudOptions).temp_location

  load_job_name_pcv = pvalue.AsSingleton(
      p
      | "ImpulseJobName" >> beam.Create([None])
      | beam.Map(lambda _: _generate_load_job_name()))

  file_prefix_pcv = pvalue.AsSingleton(
      p
      | "CreateFilePrefixView" >> beam.Create([''])
      | "GenerateFilePrefix" >> beam.Map(
          file_prefix_generator(self._validate,
                                self._custom_gcs_temp_location,
                                temp_location)))

  destination_data_kv_pc = (
      pcoll
      | "RewindowIntoGlobal" >> self._window_fn()
      | "AppendDestination" >> beam.ParDo(
          bigquery_tools.AppendDestinationsFn(self.destination),
          *self.table_side_inputs))

  all_destination_file_pairs_pc = self._write_files(
      destination_data_kv_pc, file_prefix_pcv)

  grouped_files_pc = (
      all_destination_file_pairs_pc
      | "GroupFilesByTableDestinations" >> beam.GroupByKey())

  # Load jobs are triggered to temporary tables, and those are later copied
  # to the actual destination table. This ensures atomicity when only some
  # of the load jobs fail but not others. If any of them fails, then copy
  # jobs are not triggered.
  trigger_loads_outputs = (
      grouped_files_pc
      | beam.ParDo(
          TriggerLoadJobs(
              schema=self.schema,
              write_disposition=self.write_disposition,
              create_disposition=self.create_disposition,
              test_client=self.test_client,
              temporary_tables=self.temp_tables,
              additional_bq_parameters=self.additional_bq_parameters),
          load_job_name_pcv,
          *self.schema_side_inputs).with_outputs(
              TriggerLoadJobs.TEMP_TABLES, main='main'))

  destination_job_ids_pc = trigger_loads_outputs['main']
  temp_tables_pc = trigger_loads_outputs[TriggerLoadJobs.TEMP_TABLES]

  destination_copy_job_ids_pc = (
      p
      | "ImpulseMonitorLoadJobs" >> beam.Create([None])
      | "WaitForLoadJobs" >> beam.ParDo(
          WaitForBQJobs(self.test_client),
          beam.pvalue.AsList(destination_job_ids_pc))
      | beam.ParDo(
          TriggerCopyJobs(
              create_disposition=self.create_disposition,
              write_disposition=self.write_disposition,
              temporary_tables=self.temp_tables,
              test_client=self.test_client),
          load_job_name_pcv))

  finished_copy_jobs_pc = (
      p
      | "ImpulseMonitorCopyJobs" >> beam.Create([None])
      | "WaitForCopyJobs" >> beam.ParDo(
          WaitForBQJobs(self.test_client),
          beam.pvalue.AsList(destination_copy_job_ids_pc)))

  _ = (
      finished_copy_jobs_pc
      | "RemoveTempTables/PassTables" >> beam.FlatMap(
          lambda x, deleting_tables: deleting_tables,
          pvalue.AsIter(temp_tables_pc))
      | "RemoveTempTables/AddUselessValue" >> beam.Map(lambda x: (x, None))
      | "RemoveTempTables/DeduplicateTables" >> beam.GroupByKey()
      | "RemoveTempTables/GetTableNames" >> beam.Map(lambda elm: elm[0])
      | "RemoveTempTables/Delete" >> beam.ParDo(DeleteTablesFn()))

  return {
      self.DESTINATION_JOBID_PAIRS: destination_job_ids_pc,
      self.DESTINATION_FILE_PAIRS: all_destination_file_pairs_pc,
      self.DESTINATION_COPY_JOBID_PAIRS: destination_copy_job_ids_pc,
  }
def expand(self, pcoll):
  p = pcoll.pipeline
  try:
    step_name = self.label
  except AttributeError:
    step_name = 'BigQueryBatchFileLoads_%d' % BigQueryBatchFileLoads.COUNT
    BigQueryBatchFileLoads.COUNT += 1

  temp_location = p.options.view_as(GoogleCloudOptions).temp_location
  job_name = (
      p.options.view_as(GoogleCloudOptions).job_name or 'AUTOMATIC_JOB_NAME')

  empty_pc = p | "ImpulseEmptyPC" >> beam.Create([])
  singleton_pc = p | "ImpulseSingleElementPC" >> beam.Create([None])

  load_job_name_pcv = pvalue.AsSingleton(
      singleton_pc
      | "LoadJobNamePrefix" >> beam.Map(
          lambda _: _generate_job_name(
              job_name, bigquery_tools.BigQueryJobTypes.LOAD, 'LOAD_STEP')))

  schema_mod_job_name_pcv = pvalue.AsSingleton(
      singleton_pc
      | "SchemaModJobNamePrefix" >> beam.Map(
          lambda _: _generate_job_name(
              job_name, bigquery_tools.BigQueryJobTypes.LOAD,
              'SCHEMA_MOD_STEP')))

  copy_job_name_pcv = pvalue.AsSingleton(
      singleton_pc
      | "CopyJobNamePrefix" >> beam.Map(
          lambda _: _generate_job_name(
              job_name, bigquery_tools.BigQueryJobTypes.COPY, 'COPY_STEP')))

  file_prefix_pcv = pvalue.AsSingleton(
      singleton_pc
      | "GenerateFilePrefix" >> beam.Map(
          file_prefix_generator(
              self._validate, self._custom_gcs_temp_location,
              temp_location)))

  destination_data_kv_pc = (
      pcoll
      | "RewindowIntoGlobal" >> self._window_fn()
      | "AppendDestination" >> beam.ParDo(
          bigquery_tools.AppendDestinationsFn(self.destination),
          *self.table_side_inputs))

  if not self.with_auto_sharding:
    all_destination_file_pairs_pc = self._write_files(
        destination_data_kv_pc, file_prefix_pcv)
  else:
    all_destination_file_pairs_pc = self._write_files_with_auto_sharding(
        destination_data_kv_pc, file_prefix_pcv)

  grouped_files_pc = (
      all_destination_file_pairs_pc
      | "GroupFilesByTableDestinations" >> beam.GroupByKey())

  partitions = (
      grouped_files_pc
      | beam.ParDo(
          PartitionFiles(self.max_partition_size,
                         self.max_files_per_partition)).with_outputs(
                             PartitionFiles.MULTIPLE_PARTITIONS_TAG,
                             PartitionFiles.SINGLE_PARTITION_TAG))

  multiple_partitions_per_destination_pc = partitions[
      PartitionFiles.MULTIPLE_PARTITIONS_TAG]
  single_partition_per_destination_pc = partitions[
      PartitionFiles.SINGLE_PARTITION_TAG]

  # When using dynamic destinations, elements with both single as well as
  # multiple partitions are loaded into BigQuery using temporary tables to
  # ensure atomicity.
  if self.dynamic_destinations:
    all_partitions = ((multiple_partitions_per_destination_pc,
                       single_partition_per_destination_pc)
                      | "FlattenPartitions" >> beam.Flatten())
    destination_load_job_ids_pc, destination_copy_job_ids_pc = (
        self._load_data(all_partitions, empty_pc, load_job_name_pcv,
                        schema_mod_job_name_pcv, copy_job_name_pcv,
                        p, step_name))
  else:
    destination_load_job_ids_pc, destination_copy_job_ids_pc = (
        self._load_data(multiple_partitions_per_destination_pc,
                        single_partition_per_destination_pc,
                        load_job_name_pcv, schema_mod_job_name_pcv,
                        copy_job_name_pcv, p, step_name))

  return {
      self.DESTINATION_JOBID_PAIRS: destination_load_job_ids_pc,
      self.DESTINATION_FILE_PAIRS: all_destination_file_pairs_pc,
      self.DESTINATION_COPY_JOBID_PAIRS: destination_copy_job_ids_pc,
  }
def run(argv=None):
  """Main function.

  Main function containing the Apache Beam pipeline describing how to
  process the input BigQuery data to generate the LTV predictions.
  """
  parser = argparse.ArgumentParser()
  _, pipeline_args = parser.parse_known_args(argv)
  options = pipeline_options.PipelineOptions(pipeline_args)
  runtime_options = options.view_as(RuntimeOptions)

  with beam.Pipeline(options=options) as pipeline:
    options = (
        pipeline
        | 'Create single element Stream containing options dict' >>
        beam.Create([options.get_all_options()])
        | beam.Map(
            lambda x: {
                k: v.get() if isinstance(v, value_provider.ValueProvider)
                else v
                for (k, v) in x.items()
            })
        | beam.Map(c.set_extra_options))

    full_elog = (
        pipeline
        | bq_mod.ReadFromBigQuery(
            project=getattr(runtime_options, c._OPTION_INPUT_BQ_PROJECT),
            query=getattr(runtime_options, c._OPTION_INPUT_BQ_QUERY),
            gcs_location=getattr(runtime_options,
                                 c._OPTION_TEMP_GCS_LOCATION),
            use_standard_sql=True)
        | beam.FlatMap(
            c.bq_row_to_list,
            pvalue.AsSingleton(options))
        # (customer_id, date_str, date, sales, extra_dimension?)
    )

    full_elog_merged = (
        full_elog
        | beam.Filter(lambda x: x[3] > 0)  # sales > 0
        | beam.Map(lambda x: ((x[0], x[1]), x))  # key: (customer_id, date)
        | 'Group full elog by customer and date' >> beam.GroupByKey()
        | beam.Map(c.merge_full_elog_by_customer_and_date)
        # (customer_id, date_str, date, sales)
    )

    min_max_dates = (
        full_elog_merged
        | beam.Map(lambda x: x[2])  # date
        | beam.CombineGlobally(c.MinMaxDatesFn())
        | beam.Map(c.min_max_dates_dict))

    limits_dates = (
        min_max_dates
        | beam.FlatMap(c.limit_dates_boundaries,
                       pvalue.AsSingleton(options)))

    cohort = (
        full_elog_merged
        | beam.FlatMap(c.filter_customers_in_cohort,
                       pvalue.AsSingleton(limits_dates))
        | 'Distinct Customer IDs in Cohort' >> util.Distinct())

    cohort_count = (
        cohort
        | 'Count cohort entries' >> beam.combiners.Count.Globally())

    cohort_set = cohort | beam.Map(lambda x: (x, 1))

    all_customer_ids = (
        full_elog_merged
        | beam.Map(lambda x: x[0])  # key: customer_id
        | 'Distinct all Customer IDs' >> util.Distinct())

    all_customer_ids_count = (
        all_customer_ids
        | 'Count all customers' >> beam.combiners.Count.Globally())

    num_customers = (
        pipeline
        | 'Create single elem Stream I' >> beam.Create([1])
        | beam.FlatMap(c.count_customers,
                       pvalue.AsSingleton(cohort_count),
                       pvalue.AsSingleton(all_customer_ids_count),
                       pvalue.AsSingleton(options)))

    cal_hol_elog = (
        full_elog_merged
        | beam.FlatMap(c.filter_cohort_records_in_cal_hol,
                       pvalue.AsDict(cohort_set),
                       pvalue.AsSingleton(limits_dates)))

    cal_hol_elog_count = (
        cal_hol_elog
        | 'Count cal hol elog entries' >> beam.combiners.Count.Globally())

    calibration = (
        cal_hol_elog
        | beam.FlatMap(c.filter_records_in_calibration,
                       pvalue.AsSingleton(limits_dates)))

    num_txns_total = (
        full_elog_merged
        | beam.FlatMap(c.filter_records_in_cal_hol,
                       pvalue.AsSingleton(limits_dates))
        | 'Count num txns total' >> beam.combiners.Count.Globally())

    num_txns = (
        pipeline
        | 'Create single elem Stream II' >> beam.Create([1])
        | beam.FlatMap(c.count_txns,
                       pvalue.AsSingleton(cal_hol_elog_count),
                       pvalue.AsSingleton(num_txns_total),
                       pvalue.AsSingleton(options)))

    calcbs = (
        calibration
        | beam.Map(lambda x: (x[0], x))
        | 'Group calibration elog by customer id' >> beam.GroupByKey()
        | beam.FlatMap(
            c.create_cal_cbs,
            pvalue.AsSingleton(options),
            pvalue.AsSingleton(limits_dates))
        # (customer_id, number_of_transactions, average_order_value,
        #  frequency, recency, total_time_observed)
    )

    first_transaction_dates_by_customer = (
        cal_hol_elog
        | beam.Map(lambda x: (x[0], x))  # customer_id
        | 'Group cal hol elog by customer id' >> beam.GroupByKey()
        | beam.Map(lambda x: (x[0],
                              min(map(operator.itemgetter(2), x[1]))))
        # item 2 -> date
    )

    cal_hol_elog_repeat = (
        cal_hol_elog
        | beam.FlatMap(c.filter_first_transaction_date_records,
                       pvalue.AsDict(first_transaction_dates_by_customer))
        | beam.FlatMap(
            c.calculate_time_unit_numbers,
            # (customer_id, date, time_unit_number)
            pvalue.AsSingleton(options),
            pvalue.AsSingleton(limits_dates))
        | beam.Map(lambda x: (x[2], 1))  # key: time_unit_number
        | 'Group cal hol elog repeat by time unit number' >>
        beam.GroupByKey()
        | beam.Map(lambda x: (x[0], sum(x[1])))
        # (time_unit_number, occurrences)
    )

    repeat_tx = (
        pipeline
        | 'Create single elem Stream III' >> beam.Create([1])
        | beam.FlatMap(c.calculate_cumulative_repeat_transactions,
                       pvalue.AsIter(cal_hol_elog_repeat))
        # (time_unit_number, repeat_transactions,
        #  repeat_transactions_cumulative)
    )

    model_validation = (
        pipeline
        | 'Create single elem Stream IV' >> beam.Create([1])
        | beam.FlatMap(
            c.calculate_model_fit_validation,
            pvalue.AsSingleton(options),
            pvalue.AsSingleton(limits_dates),
            pvalue.AsIter(calcbs),
            pvalue.AsIter(repeat_tx),
            pvalue.AsSingleton(num_customers),
            pvalue.AsSingleton(num_txns)))

    _ = model_validation | beam.Map(c.raise_error_if_invalid_mape)

    _ = (
        model_validation
        | beam.Map(lambda x: x[0])
        | 'Write to validation_params table' >> io.WriteToBigQuery(
            table=c.TableValueProvider(
                getattr(runtime_options, c._OPTION_OUTPUT_BQ_PROJECT),
                getattr(runtime_options, c._OPTION_OUTPUT_BQ_DATASET),
                'validation_params'),
            custom_gcs_temp_location=getattr(runtime_options,
                                             c._OPTION_TEMP_GCS_LOCATION),
            validate=False,
            schema={
                'fields': [{
                    'name': 'calibration_start_date', 'type': 'STRING'
                }, {
                    'name': 'calibration_end_date', 'type': 'STRING'
                }, {
                    'name': 'cohort_end_date', 'type': 'STRING'
                }, {
                    'name': 'holdout_end_date', 'type': 'STRING'
                }, {
                    'name': 'model_time_granularity', 'type': 'STRING'
                }, {
                    'name': 'model',
                    'type': 'RECORD',
                    'fields': [{
                        'name': 'frequency_model', 'type': 'STRING'
                    }, {
                        'name': 'num_customers_cohort', 'type': 'INTEGER'
                    }, {
                        'name': 'perc_customers_cohort', 'type': 'FLOAT'
                    }, {
                        'name': 'num_transactions_validation',
                        'type': 'INTEGER'
                    }, {
                        'name': 'perc_transactions_validation',
                        'type': 'FLOAT'
                    }, {
                        'name': 'mape', 'type': 'FLOAT'
                    }]
                }]
            },
            write_disposition=io.BigQueryDisposition.WRITE_TRUNCATE,
            create_disposition=io.BigQueryDisposition.CREATE_IF_NEEDED))

    fullcbs_without_extra_dimension = (
        full_elog_merged
        | beam.Map(lambda x: (x[0], x))  # key: customer_id
        | 'Group full merged elog by customer id' >> beam.GroupByKey()
        | beam.FlatMap(
            c.create_fullcbs,
            pvalue.AsSingleton(options),
            pvalue.AsSingleton(min_max_dates))
        # (customer_id, number_of_transactions, historical_aov,
        #  frequency, recency, total_time_observed)
    )

    full_elog_if_extra_dimension = (
        full_elog
        | 'Discard records if no extra dimension' >> beam.FlatMap(
            c.discard_if_no_extra_dimension, pvalue.AsSingleton(options)))

    extra_dimensions_stats = (
        full_elog_if_extra_dimension
        | beam.Map(lambda x: ((x[0], x[4]), x))
        # key: (customer_id, extra_dimension)
        | 'Group full elog by customer id and extra dimension' >>
        beam.GroupByKey()
        | beam.Map(c.create_extra_dimensions_stats)
        # (customer_id, extra_dimension, dimension_count, tot_sales,
        #  max_dimension_date)
    )

    top_dimension_per_customer = (
        extra_dimensions_stats
        | beam.Map(lambda x: (x[0], x))  # customer_id
        | 'Group extra dimension stats by customer id' >> beam.GroupByKey()
        | beam.Map(c.extract_top_extra_dimension)
        # (customer_id, extra_dimension, dimension_count, tot_sales,
        #  max_dimension_date)
    )

    customer_dimension_map = (
        top_dimension_per_customer
        | beam.Map(lambda x: (x[0], x[1])))  # (customer_id, extra_dimension)

    fullcbs = (
        fullcbs_without_extra_dimension
        | beam.FlatMap(
            c.add_top_extra_dimension_to_fullcbs,
            pvalue.AsSingleton(options),
            pvalue.AsDict(customer_dimension_map))
        # (customer_id, number_of_transactions, historical_aov,
        #  frequency, recency, total_time_observed, extra_dimension?)
    )

    prediction = (
        pipeline
        | 'Create single elem Stream V' >> beam.Create([1])
        | beam.FlatMap(
            c.calculate_prediction,
            pvalue.AsSingleton(options),
            pvalue.AsIter(fullcbs),
            pvalue.AsSingleton(num_customers),
            pvalue.AsSingleton(num_txns))
        # [customer_id, p_alive, predicted_purchases, future_aov,
        #  historical_aov, expected_value, frequency, recency,
        #  total_time_observed, extra_dimension?], prediction_params
    )

    prediction_by_customer_no_segments = (
        prediction
        | beam.FlatMap(lambda x: x[0]))  # Extract predictions by customer

    _ = (
        prediction
        | beam.Map(lambda x: x[1])  # Extract prediction params
        | 'Write to prediction_params table' >> io.WriteToBigQuery(
            table=c.TableValueProvider(
                getattr(runtime_options, c._OPTION_OUTPUT_BQ_PROJECT),
                getattr(runtime_options, c._OPTION_OUTPUT_BQ_DATASET),
                'prediction_params'),
            custom_gcs_temp_location=getattr(runtime_options,
                                             c._OPTION_TEMP_GCS_LOCATION),
            validate=False,
            schema={
                'fields': [{
                    'name': 'prediction_period', 'type': 'INTEGER'
                }, {
                    'name': 'prediction_period_unit', 'type': 'STRING'
                }, {
                    'name': 'model_time_granularity', 'type': 'STRING'
                }, {
                    'name': 'customers_modeled', 'type': 'INTEGER'
                }, {
                    'name': 'transactions_observed', 'type': 'INTEGER'
                }, {
                    'name': 'frequency_model', 'type': 'STRING'
                }, {
                    'name': 'bgnbd_model_params',
                    'type': 'RECORD',
                    'fields': [{
                        'name': 'a', 'type': 'FLOAT'
                    }, {
                        'name': 'b', 'type': 'FLOAT'
                    }, {
                        'name': 'r', 'type': 'FLOAT'
                    }, {
                        'name': 'alpha', 'type': 'FLOAT'
                    }]
                }, {
                    'name': 'paretonbd_model_params',
                    'type': 'RECORD',
                    'fields': [{
                        'name': 'r', 'type': 'FLOAT'
                    }, {
                        'name': 's', 'type': 'FLOAT'
                    }, {
                        'name': 'alpha', 'type': 'FLOAT'
                    }, {
                        'name': 'beta', 'type': 'FLOAT'
                    }]
                }, {
                    'name': 'gamma_gamma_params',
                    'type': 'RECORD',
                    'fields': [{
                        'name': 'p', 'type': 'FLOAT'
                    }, {
                        'name': 'q', 'type': 'FLOAT'
                    }, {
                        'name': 'v', 'type': 'FLOAT'
                    }]
                }]
            },
            write_disposition=io.BigQueryDisposition.WRITE_TRUNCATE,
            create_disposition=io.BigQueryDisposition.CREATE_IF_NEEDED))

    num_rows = (
        full_elog_merged
        | 'Count num rows in full elog merged' >>
        beam.combiners.Count.Globally())

    segment_predictions_exact = (
        pipeline
        | 'Create single elem Stream VII' >> beam.Create([1])
        | beam.FlatMap(
            lambda _, rows_count:
            [rows_count <= c._SEGMENT_PREDICTION_THRESHOLD],
            pvalue.AsSingleton(num_rows)))

    (sharded_cust_predictions_no_segments_exact,
     sharded_cust_predictions_no_segments_hash) = (
         prediction_by_customer_no_segments
         | beam.FlatMap(
             c.prediction_sharded,
             pvalue.AsSingleton(options),
             pvalue.AsSingleton(segment_predictions_exact))
         # [customer_id, p_alive, predicted_purchases, future_aov,
         #  historical_aov, expected_value, frequency, recency,
         #  total_time_observed, extra_dimension?]
         | beam.Partition(lambda x, _: 0 if x[1] else 1, 2))

    # BEGIN of "exact" branch
    prediction_by_customer_exact = (
        pipeline
        | 'Create single elem Stream VIII' >> beam.Create([1])
        | beam.FlatMap(
            c.split_in_ntiles_exact,
            pvalue.AsSingleton(options),
            pvalue.AsIter(sharded_cust_predictions_no_segments_exact))
        # [customer_id, p_alive, predicted_purchases, future_aov,
        #  historical_aov, expected_value, frequency, recency,
        #  total_time_observed, segment, extra_dimension?]
    )
    # END of "exact" branch

    # BEGIN of "hash" branch
    customer_count_by_expected_value = (
        sharded_cust_predictions_no_segments_hash
        | beam.Map(lambda x: (x[0][5], 1))  # (expected_value, 1)
        | 'Group customer predictions by expected value' >>
        beam.GroupByKey()
        | beam.Map(lambda x: (x[0], sum(x[1])))
        # (expected_value, customers_count)
    )

    hash_segment_limits = (
        pipeline
        | 'Create single elem Stream IX' >> beam.Create([1])
        | beam.FlatMap(c.expected_values_segment_limits,
                       pvalue.AsSingleton(options),
                       pvalue.AsIter(customer_count_by_expected_value),
                       pvalue.AsSingleton(all_customer_ids_count)))

    prediction_by_customer_hash = (
        sharded_cust_predictions_no_segments_hash
        | beam.Map(lambda x: x[0])
        | beam.FlatMap(c.split_in_ntiles_hash,
                       pvalue.AsSingleton(hash_segment_limits))
        # [customer_id, p_alive, predicted_purchases, future_aov,
        #  historical_aov, expected_value, frequency, recency,
        #  total_time_observed, segment, extra_dimension?]
    )
    # END of "hash" branch

    prediction_by_customer = (
        # Only one of these two streams will contain values.
        (prediction_by_customer_exact, prediction_by_customer_hash)
        | beam.Flatten()
        | beam.Map(c.clean_nan_and_inf))

    _ = (
        prediction_by_customer
        | beam.FlatMap(
            lambda x, opts: [x + ['']]
            if not opts[c._OPTION_EXTRA_DIMENSION_EXISTS] else [x],
            pvalue.AsSingleton(options))
        | 'prediction_by_customer to Dict' >> beam.Map(c.list_to_dict, [
            'customer_id', 'p_alive', 'predicted_purchases', 'future_aov',
            'historical_aov', 'expected_value', 'frequency', 'recency',
            'total_time_observed', 'segment', 'extra_dimension'
        ])
        | 'Write to prediction_by_customer table' >> io.WriteToBigQuery(
            table=c.TableValueProvider(
                getattr(runtime_options, c._OPTION_OUTPUT_BQ_PROJECT),
                getattr(runtime_options, c._OPTION_OUTPUT_BQ_DATASET),
                'prediction_by_customer'),
            custom_gcs_temp_location=getattr(runtime_options,
                                             c._OPTION_TEMP_GCS_LOCATION),
            validate=False,
            schema='customer_id:STRING, p_alive:FLOAT64'
            ', predicted_purchases:FLOAT64'
            ', future_aov:FLOAT64, historical_aov:FLOAT64'
            ', expected_value:FLOAT64, frequency:INT64'
            ', recency:FLOAT64'
            ', total_time_observed:FLOAT64, segment:INT64'
            ', extra_dimension:STRING',
            write_disposition=io.BigQueryDisposition.WRITE_TRUNCATE,
            create_disposition=io.BigQueryDisposition.CREATE_IF_NEEDED))

    prediction_summary_temp = (
        prediction_by_customer
        | beam.Map(lambda x: (x[9], x))  # key: segment
        | 'Group customer predictions by segment' >> beam.GroupByKey()
        | beam.FlatMap(c.generate_prediction_summary,
                       pvalue.AsSingleton(options))
        # (segment, average_retention_probability,
        #  average_predicted_customer_value, average_predicted_order_value,
        #  average_predicted_purchases, total_customer_value,
        #  number_of_customers)
    )

    tot_equity = (
        prediction_summary_temp
        | beam.Map(lambda x: x[5])  # total_customer_value
        | beam.CombineGlobally(sum))

    prediction_summary = (
        prediction_summary_temp
        | beam.FlatMap(c.calculate_perc_of_total_customer_value,
                       pvalue.AsSingleton(tot_equity),
                       pvalue.AsSingleton(options))
        # (segment, average_retention_probability,
        #  average_predicted_customer_value, average_predicted_order_value,
        #  average_predicted_purchases, total_customer_value,
        #  number_of_customers, perc_of_total_customer_value)
    )

    _ = (
        prediction_summary
        | 'prediction_summary to Dict' >> beam.Map(c.list_to_dict, [
            'segment', 'average_retention_probability',
            'average_predicted_customer_value',
            'average_predicted_order_value', 'average_predicted_purchases',
            'total_customer_value', 'number_of_customers',
            'perc_of_total_customer_value'
        ])
        | 'Write to prediction_summary table' >> io.WriteToBigQuery(
            table=c.TableValueProvider(
                getattr(runtime_options, c._OPTION_OUTPUT_BQ_PROJECT),
                getattr(runtime_options, c._OPTION_OUTPUT_BQ_DATASET),
                'prediction_summary'),
            custom_gcs_temp_location=getattr(runtime_options,
                                             c._OPTION_TEMP_GCS_LOCATION),
            validate=False,
            schema='segment:INT64, average_retention_probability:FLOAT64'
            ', average_predicted_customer_value:FLOAT64'
            ', average_predicted_order_value:FLOAT64'
            ', average_predicted_purchases:FLOAT64'
            ', total_customer_value:FLOAT64'
            ', number_of_customers:FLOAT64'
            ', perc_of_total_customer_value:FLOAT64',
            write_disposition=io.BigQueryDisposition.WRITE_TRUNCATE,
            create_disposition=io.BigQueryDisposition.CREATE_IF_NEEDED))

    prediction_summary_extra_dimension = (
        prediction_by_customer
        | 'Discard prediction if there is not extra dimension' >>
        beam.FlatMap(c.discard_if_no_extra_dimension,
                     pvalue.AsSingleton(options))
        | beam.Map(lambda x: (x[10], x))  # extra dimension
        | 'Group customer predictions by extra dimension' >>
        beam.GroupByKey()
        | beam.FlatMap(c.generate_prediction_summary_extra_dimension,
                       pvalue.AsSingleton(tot_equity),
                       pvalue.AsSingleton(options)))

    _ = (
        prediction_summary_extra_dimension
        | 'prediction_summary_extra_dimension to Dict' >> beam.Map(
            c.list_to_dict, [
                'extra_dimension', 'average_retention_probability',
                'average_predicted_customer_value',
                'average_predicted_order_value',
                'average_predicted_purchases', 'total_customer_value',
                'number_of_customers', 'perc_of_total_customer_value'
            ])
        | 'Write to prediction_summary_extra_dimension table' >>
        io.WriteToBigQuery(
            table=c.TableValueProvider(
                getattr(runtime_options, c._OPTION_OUTPUT_BQ_PROJECT),
                getattr(runtime_options, c._OPTION_OUTPUT_BQ_DATASET),
                'prediction_summary_extra_dimension'),
            custom_gcs_temp_location=getattr(runtime_options,
                                             c._OPTION_TEMP_GCS_LOCATION),
            validate=False,
            schema='extra_dimension:STRING'
            ', average_retention_probability:FLOAT64'
            ', average_predicted_customer_value:FLOAT64'
            ', average_predicted_order_value:FLOAT64'
            ', average_predicted_purchases:FLOAT64'
            ', total_customer_value:FLOAT64'
            ', number_of_customers:INT64'
            ', perc_of_total_customer_value:FLOAT64',
            write_disposition=io.BigQueryDisposition.WRITE_TRUNCATE,
            create_disposition=io.BigQueryDisposition.CREATE_IF_NEEDED))
def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--from-bigquery',
                        dest='from_bigquery',
                        const=True,
                        default=False,
                        nargs='?',
                        help='Whether to load from BigQuery')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        # CHANGE 2/5: (OPTIONAL) Change this to DataflowRunner to
        # run your pipeline on the Google Cloud Dataflow Service.
        '--runner=DirectRunner',
        # CHANGE 3/5: Your project ID is required in order to run your
        # pipeline on the Google Cloud Dataflow Service.
        '--project=aou-res-curation-test',
        # CHANGE 4/5: Your Google Cloud Storage path is required for staging
        # local files.
        '--staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY',
        # CHANGE 5/5: Your Google Cloud Storage path is required for
        # temporary files.
        '--temp_location=gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY',
        '--job_name=your-wordcount-job',
    ])

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module
    # level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session

    with beam.Pipeline(options=pipeline_options) as p:
        # Read all of the EHR inputs, into a dictionary of:
        #   table -> site_name -> PCollection of table rows
        ehr_inputs = {}
        for tbl in ['person', 'measurement', 'condition_occurrence']:
            ehr_inputs[tbl] = {}
            for site in ['nyc', 'pitt']:
                if known_args.from_bigquery:
                    ehr_inputs[tbl][site] = (
                        p
                        | f"{site}_{tbl}" >> beam.io.Read(
                            beam.io.BigQuerySource(
                                query=(
                                    "SELECT * FROM `aou-res-curation-test"
                                    f".calbach_prototype.{site}_{tbl}`"),
                                use_standard_sql=True)))
                else:
                    ehr_inputs[tbl][site] = (
                        p
                        | f"read {site}_{tbl}" >> ReadFromText(
                            f"../test_data/{site}/{tbl}.json")
                        | f"{site}_{tbl} from JSON" >> beam.Map(json.loads))

        # Merge tables across all sites, resulting in:
        #   table -> PCollection of table rows
        # Question: How should these ID spaces be reconciled?
        combined_by_domain = {}
        for tbl, data_by_site in ehr_inputs.items():
            combined_by_domain[tbl] = (
                data_by_site.values()
                | f"ehr merge for {tbl}" >> beam.Flatten())

        # 1. Move data from the person table elsewhere.
        # Transform person rows, generate new measurement rows.
        combined_by_domain["person"], extracted_meas_rows = (
            combined_by_domain["person"]
            | beam.ParDo(ExtractDobAsMeasurement()).with_outputs(
                ExtractDobAsMeasurement.OUTPUT_TAG_MEASUREMENT,
                main='person'))

        # Merge the new measurement rows into the larger collection.
        combined_by_domain["measurement"] = (
            (combined_by_domain["measurement"], extracted_meas_rows)
            | beam.Flatten())

        # 2. Perform a row-level table transform.
        combined_by_domain["condition_occurrence"] = (
            combined_by_domain["condition_occurrence"]
            | beam.Map(clamp_condition_start_datetime))

        # 3. Retract participants by ID.
        person_id_blacklist = (
            combined_by_domain['person']
            | beam.Map(lambda p: p['person_id'])
            # Simulates more complex criteria here, likely involving other
            # tables.
            | "generate the person ID blacklist" >>
            beam.Filter(lambda pid: int(pid) % 2 == 0)
            | beam.Map(lambda pid: (pid, True))
            | beam.combiners.ToDict())

        # Drop all data for blacklisted participants from all tables.
        for (domain, data) in combined_by_domain.items():
            combined_by_domain[domain] = (
                data
                | beam.Filter(
                    filter_by_id,
                    blacklist=pvalue.AsSingleton(person_id_blacklist)))

        # 4. Group-by-participant transforms, e.g. remove duplicate
        # measurements.
        combined_by_domain['measurement'] = (
            combined_by_domain['measurement']
            # Define unique rows as person+measurement concept ID.
            | beam.Map(lambda m: ((m["person_id"],
                                   m["measurement_concept_id"]), m))
            # We don't care which one; just compare the row JSON for a
            # deterministic result.
            | beam.combiners.Top.PerKey(1, key=lambda a: str(a))
            | beam.Values())

        # XXX: Need to figure out how ID generation is meant to work here.
        # That will impact how we go about creating the mapping tables.
        # Initial idea is that we likely attach some payload to the
        # in-flight representation of a row.
        for domain, data in combined_by_domain.items():
            data | f"output for {domain}" >> beam.io.WriteToText(
                f"out/{domain}.txt")
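# filter_by_id (and clamp_condition_start_datetime) are used above but not
# defined in the snippet. A hedged sketch of what the filter might look like
# (an assumption, not the original code): it drops rows whose person_id
# appears in the blacklist dict passed as a singleton side input.
def filter_by_id(row, blacklist):
    """Keeps a row only if its person_id is not in the blacklist."""
    return row['person_id'] not in blacklist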
def expand(self, pcoll):
  p = pcoll.pipeline

  load_job_name_pcv = pvalue.AsSingleton(
      p
      | "ImpulseJobName" >> beam.Create([None])
      | beam.Map(lambda _: _generate_load_job_name()))

  file_prefix_pcv = pvalue.AsSingleton(
      p
      | "CreateFilePrefixView" >> beam.Create([self._input_gs_location])
      | "GenerateFilePrefix" >> beam.Map(_generate_file_prefix))

  outputs = (
      pcoll
      | "ApplyGlobalWindow" >> beam.WindowInto(beam.window.GlobalWindows())
      | "AppendDestination" >> beam.ParDo(
          _AppendDestinationsFn(self.destination))
      | beam.ParDo(
          WriteRecordsToFile(
              max_files_per_bundle=self.max_files_per_bundle,
              max_file_size=self.max_file_size,
              coder=self.coder),
          file_prefix=file_prefix_pcv).with_outputs(
              WriteRecordsToFile.UNWRITTEN_RECORD_TAG,
              WriteRecordsToFile.WRITTEN_FILE_TAG))

  # A PCollection of (destination, file) tuples. It lists files with
  # records, and the destination each file is meant to be imported into.
  destination_files_kv_pc = outputs[WriteRecordsToFile.WRITTEN_FILE_TAG]

  # A PCollection of (destination, record) tuples. These are later sharded,
  # grouped, and all records for each destination-shard are written to
  # files. This PCollection is necessary because not all records can be
  # written into files in ``WriteRecordsToFile``.
  unwritten_records_pc = outputs[WriteRecordsToFile.UNWRITTEN_RECORD_TAG]

  more_destination_files_kv_pc = (
      unwritten_records_pc
      | beam.ParDo(_ShardDestinations())
      | "GroupShardedRows" >> beam.GroupByKey()
      | "DropShardNumber" >> beam.Map(lambda x: (x[0][0], x[1]))
      | "WriteGroupedRecordsToFile" >> beam.ParDo(
          WriteGroupedRecordsToFile(coder=self.coder),
          file_prefix=file_prefix_pcv))

  all_destination_file_pairs_pc = (
      (destination_files_kv_pc, more_destination_files_kv_pc)
      | "DestinationFilesUnion" >> beam.Flatten())

  grouped_files_pc = (
      all_destination_file_pairs_pc
      | "GroupFilesByTableDestinations" >> beam.GroupByKey())

  # Load jobs are triggered to temporary tables, and those are later copied
  # to the actual destination table. This ensures atomicity when only some
  # of the load jobs fail but not others. If any of them fails, then copy
  # jobs are not triggered.
  trigger_loads_outputs = (
      grouped_files_pc
      | beam.ParDo(
          TriggerLoadJobs(
              schema=self.schema,
              write_disposition=self.write_disposition,
              create_disposition=self.create_disposition,
              test_client=self.test_client,
              temporary_tables=self.temp_tables),
          load_job_name_pcv).with_outputs(
              TriggerLoadJobs.TEMP_TABLES, main='main'))

  destination_job_ids_pc = trigger_loads_outputs['main']
  temp_tables_pc = trigger_loads_outputs[TriggerLoadJobs.TEMP_TABLES]

  destination_copy_job_ids_pc = (
      p
      | "ImpulseMonitorLoadJobs" >> beam.Create([None])
      | "WaitForLoadJobs" >> beam.ParDo(
          WaitForBQJobs(self.test_client),
          beam.pvalue.AsList(destination_job_ids_pc))
      | beam.ParDo(
          TriggerCopyJobs(
              create_disposition=self.create_disposition,
              write_disposition=self.write_disposition,
              temporary_tables=self.temp_tables,
              test_client=self.test_client),
          load_job_name_pcv))

  finished_copy_jobs_pc = (
      p
      | "ImpulseMonitorCopyJobs" >> beam.Create([None])
      | "WaitForCopyJobs" >> beam.ParDo(
          WaitForBQJobs(self.test_client),
          beam.pvalue.AsList(destination_copy_job_ids_pc)))

  _ = (
      finished_copy_jobs_pc
      | "RemoveTempTables/PassTables" >> beam.FlatMap(
          lambda x, deleting_tables: deleting_tables,
          pvalue.AsIter(temp_tables_pc))
      | "RemoveTempTables/DeduplicateTables" >> Count.PerElement()
      | "RemoveTempTables/GetTableNames" >> beam.Map(lambda elm: elm[0])
      | "RemoveTempTables/Delete" >> beam.ParDo(DeleteTablesFn()))

  return {
      self.DESTINATION_JOBID_PAIRS: destination_job_ids_pc,
      self.DESTINATION_FILE_PAIRS: all_destination_file_pairs_pc,
      self.DESTINATION_COPY_JOBID_PAIRS: destination_copy_job_ids_pc,
  }
def run(argv=None):
    """Main entry point; defines and runs the pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module
    # level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    # Read the text file[pattern] into a PCollection.
    lines = p | 'read' >> ReadFromText(known_args.input, coder=JsonCoder())

    results = (
        lines
        | 'Populate' >> beam.ParDo(Populate()).with_outputs(
            'monitorsPC', 'infoPC', 'testsuitesPC', 'useractionsPC',
            'campaignPC', main='eventsPC'))

    useractionsPColl = results.useractionsPC
    useractionsKPI = (
        useractionsPColl
        | 'KV UA uniqTcId' >> beam.Map(lambda x: (x['uniqTcId'], x))
        | 'group UA uniqTcId' >> beam.GroupByKey()
        | 'KV UA name' >> beam.ParDo(KVForUAByName("name"))
        | 'group UA name' >> beam.GroupByKey()
        | 'Filter TestFileTransfer' >> beam.Filter(
            lambda kv: kv[0] == 'TestFileTransfer')
        | 'LOG ua' >> beam.ParDo(LogLen()))

    eventsPColl = results.eventsPC
    eventGroup = (
        eventsPColl
        | 'KV Event' >> beam.Map(lambda x: (x['name'], x))
        | 'group Event' >> beam.GroupByKey())

    eventKPI = (
        eventGroup
        | 'Filter batt' >> beam.Filter(
            lambda kv: kv[0] == 'batteryLevelRemaining')
        # | 'LOG event' >> beam.ParDo(Log())
    )

    eventSort = (
        eventGroup
        | 'SortAndComplete' >> beam.ParDo(SortAndComplete())
        | 'Filter RAT' >> beam.Filter(lambda kv: kv[0] == 'psRatChd')
        | 'KpiEventUserAction' >> beam.FlatMap(
            KpiEventUserActionbeam, ua=pvalue.AsSingleton(useractionsKPI)))

    '''
    monitorsPColl = results.monitorsPC
    monitorsKPI = (monitorsPColl | 'LOG monitors' >> beam.ParDo(Log()))

    infoPColl = results.infoPC
    infoKPI = (infoPColl | 'LOG info' >> beam.ParDo(Log()))

    testsuitesPColl = results.testsuitesPC
    testsuitesKPI = (testsuitesPColl | 'LOG testsuite' >> beam.ParDo(Log()))

    campaignPColl = results.campaignPC
    campaignKPI = (campaignPColl | 'LOG campaign' >> beam.ParDo(Log()))
    '''

    result = p.run()
    result.wait_until_finish()
"""平均以上の文字数を持つ文字列をフィルタリングする.""" def __init__(self): super(FilterAboveMeanLengthFn, self).__init__() def process(self, element, mean_word_length): if element >= mean_word_length: yield element if __name__ == '__main__': p = beam.Pipeline(options=PipelineOptions()) inputs = ["good morning.", "good afternoon.", "good evening."] # 主入力 word_lengths = ( p | 'create inputs' >> beam.Create(inputs) | 'compute word length' >> beam.Map(lambda element: len(element))) # 副入力 mean_word_length = word_lengths | 'compute mean word length' >> beam.CombineGlobally( beam.combiners.MeanCombineFn()) (word_lengths | 'filter above mean length' >> beam.ParDo( FilterAboveMeanLengthFn(), pvalue.AsSingleton(mean_word_length)) | 'write to text' >> beam.io.WriteToText("./output.txt")) p.run().wait_until_finish()