def test_deferred_side_input_iterable(self):
  @typehints.with_input_types(str, typing.Iterable[str])
  def concat(glue, items):
    return glue.join(sorted(items))

  with TestPipeline() as p:
    main_input = p | beam.Create(['a', 'bb', 'c'])
    side_input = p | 'side' >> beam.Create(['x', 'y', 'z'])
    result = main_input | beam.Map(concat, pvalue.AsIter(side_input))
    assert_that(result, equal_to(['xayaz', 'xbbybbz', 'xcycz']))

    bad_side_input = p | 'bad_side' >> beam.Create([1, 2, 3])
    with self.assertRaises(typehints.TypeCheckError):
      main_input | 'fail' >> beam.Map(concat, pvalue.AsIter(bad_side_input))
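
# Illustrative sketch, not part of the original test: a callable with the same
# type hints also accepts an ordinary in-memory iterable as its side argument
# (standard Beam behaviour for non-PCollection extra arguments), which still
# satisfies the typing.Iterable[str] hint. The test name below is hypothetical.
def test_side_input_as_plain_iterable(self):
  @typehints.with_input_types(str, typing.Iterable[str])
  def concat(glue, items):
    return glue.join(sorted(items))

  with TestPipeline() as p:
    main_input = p | beam.Create(['a', 'bb', 'c'])
    result = main_input | beam.Map(concat, ['x', 'y', 'z'])
    assert_that(result, equal_to(['xayaz', 'xbbybbz', 'xcycz']))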
def expand(self, pcoll):
  do_once = pcoll.pipeline | 'DoOnceSuccess' >> core.Create([None])
  main_write_result = pcoll | 'MainWrite' >> Write(self.sink)
  return (do_once
          | 'SuccessWrite' >> core.FlatMap(
              self._success_write, pvalue.AsIter(main_write_result)))
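
# Illustrative sketch of the pattern the expand() above relies on (the names
# below are placeholders, not part of the original module): a single-element
# PCollection combined with an AsIter side input makes the callback run
# exactly once, and only after the side-input PCollection has been fully
# computed. The callback must return an iterable, since it is used in FlatMap.
def run_once_after(pcoll_to_wait_for, callback):
  return (
      pcoll_to_wait_for.pipeline
      | 'DoOnce' >> beam.Create([None])
      | 'RunCallback' >> beam.FlatMap(
          lambda _, results: callback(results),
          pvalue.AsIter(pcoll_to_wait_for)))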
def _load_data(self, partitions_using_temp_tables,
               partitions_direct_to_destination, load_job_name_pcv,
               singleton_pc):
  """Load data to BigQuery.

  Data is loaded into BigQuery in the following two ways:
    1. Single partition:
       When there is a single partition of files destined to a single
       destination, a single load job is triggered.
    2. Multiple partitions and/or Dynamic Destinations:
       When there are multiple partitions of files destined for a single
       destination or when Dynamic Destinations are used, multiple load jobs
       need to be triggered for each partition/destination. Load jobs are
       triggered to temporary tables, and those are later copied to the actual
       appropriate destination table. This ensures atomicity when only some of
       the load jobs would fail but not others. If any of them fails, then
       copy jobs are not triggered.
  """
  # Load data using temp tables
  trigger_loads_outputs = (
      partitions_using_temp_tables
      | "TriggerLoadJobsWithTempTables" >> beam.ParDo(
          TriggerLoadJobs(
              schema=self.schema,
              write_disposition=self.write_disposition,
              create_disposition=self.create_disposition,
              test_client=self.test_client,
              temporary_tables=True,
              additional_bq_parameters=self.additional_bq_parameters),
          load_job_name_pcv, *self.schema_side_inputs).with_outputs(
              TriggerLoadJobs.TEMP_TABLES, main='main'))

  temp_tables_load_job_ids_pc = trigger_loads_outputs['main']
  temp_tables_pc = trigger_loads_outputs[TriggerLoadJobs.TEMP_TABLES]

  destination_copy_job_ids_pc = (
      singleton_pc
      | "WaitForTempTableLoadJobs" >> beam.ParDo(
          WaitForBQJobs(self.test_client),
          beam.pvalue.AsList(temp_tables_load_job_ids_pc))
      | beam.ParDo(
          TriggerCopyJobs(create_disposition=self.create_disposition,
                          write_disposition=self.write_disposition,
                          test_client=self.test_client),
          load_job_name_pcv))

  finished_copy_jobs_pc = (
      singleton_pc
      | "WaitForCopyJobs" >> beam.ParDo(
          WaitForBQJobs(self.test_client),
          beam.pvalue.AsList(destination_copy_job_ids_pc)))

  _ = (
      finished_copy_jobs_pc
      | "RemoveTempTables/PassTables" >> beam.FlatMap(
          lambda x, deleting_tables: deleting_tables,
          pvalue.AsIter(temp_tables_pc))
      | "RemoveTempTables/AddUselessValue" >> beam.Map(lambda x: (x, None))
      | "RemoveTempTables/DeduplicateTables" >> beam.GroupByKey()
      | "RemoveTempTables/GetTableNames" >> beam.Map(lambda elm: elm[0])
      | "RemoveTempTables/Delete" >> beam.ParDo(
          DeleteTablesFn(self.test_client)))

  # Load data directly to destination table
  destination_load_job_ids_pc = (
      partitions_direct_to_destination
      | "TriggerLoadJobsWithoutTempTables" >> beam.ParDo(
          TriggerLoadJobs(
              schema=self.schema,
              write_disposition=self.write_disposition,
              create_disposition=self.create_disposition,
              test_client=self.test_client,
              temporary_tables=False,
              additional_bq_parameters=self.additional_bq_parameters),
          load_job_name_pcv, *self.schema_side_inputs))

  _ = (singleton_pc
       | "WaitForDestinationLoadJobs" >> beam.ParDo(
           WaitForBQJobs(self.test_client),
           beam.pvalue.AsList(destination_load_job_ids_pc)))

  destination_load_job_ids_pc = (
      (temp_tables_load_job_ids_pc, destination_load_job_ids_pc)
      | beam.Flatten())

  return destination_load_job_ids_pc, destination_copy_job_ids_pc
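
# Side note (a sketch, not the module's actual code): the AddUselessValue /
# DeduplicateTables / GetTableNames chain above is a manual distinct over the
# temporary table references. Assuming the references are usable as keys
# (which the GroupByKey already requires), the same cleanup step could be
# written with apache_beam.transforms.util.Distinct:
#
#   _ = (
#       finished_copy_jobs_pc
#       | "RemoveTempTables/PassTables" >> beam.FlatMap(
#           lambda x, deleting_tables: deleting_tables,
#           pvalue.AsIter(temp_tables_pc))
#       | "RemoveTempTables/DeduplicateTables" >> util.Distinct()
#       | "RemoveTempTables/Delete" >> beam.ParDo(
#           DeleteTablesFn(self.test_client)))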
def run(argv=None):
  """Main function.

  Main function containing the Apache Beam pipeline describing how to process
  the input CSV file to generate the LTV predictions.
  """
  parser = argparse.ArgumentParser()
  _, pipeline_args = parser.parse_known_args(argv)
  options = pipeline_options.PipelineOptions(pipeline_args)
  runtime_options = options.view_as(RuntimeOptions)

  with beam.Pipeline(options=options) as pipeline:
    options = (
        pipeline
        | 'Create single element Stream containing options dict' >>
        beam.Create([options.get_all_options()])
        | beam.Map(
            lambda x: {
                k: v.get() if isinstance(v, value_provider.ValueProvider)
                else v
                for (k, v) in x.items()
            })
        | beam.Map(c.set_extra_options))

    full_elog = (
        pipeline
        | beam.io.ReadFromText(
            getattr(runtime_options, c._OPTION_INPUT_CSV),
            skip_header_lines=1)
        | beam.Map(lambda x: list(csv.reader([x]))[0])
        | beam.FlatMap(
            c.csv_line_to_list,
            pvalue.AsSingleton(options))
        # (customer_id, date_str, date, sales, extra_dimension?)
    )

    full_elog_merged = (
        full_elog
        | beam.Filter(lambda x: x[3] > 0)  # sales > 0
        | beam.Map(lambda x: ((x[0], x[1]), x))  # key: (customer_id, date)
        | 'Group full elog by customer and date' >> beam.GroupByKey()
        | beam.Map(c.merge_full_elog_by_customer_and_date)
        # (customer_id, date_str, date, sales)
    )

    min_max_dates = (
        full_elog_merged
        | beam.Map(lambda x: x[2])  # date
        | beam.CombineGlobally(c.MinMaxDatesFn())
        | beam.Map(c.min_max_dates_dict))

    limits_dates = (
        min_max_dates
        | beam.FlatMap(c.limit_dates_boundaries, pvalue.AsSingleton(options)))

    cohort = (
        full_elog_merged
        | beam.FlatMap(c.filter_customers_in_cohort,
                       pvalue.AsSingleton(limits_dates))
        | 'Distinct Customer IDs in Cohort' >> util.Distinct())

    cohort_count = (
        cohort
        | 'Count cohort entries' >> beam.combiners.Count.Globally())

    cohort_set = (cohort | beam.Map(lambda x: (x, 1)))

    all_customer_ids = (
        full_elog_merged
        | beam.Map(lambda x: x[0])  # key: customer_id
        | 'Distinct all Customer IDs' >> util.Distinct())

    all_customer_ids_count = (
        all_customer_ids
        | 'Count all customers' >> beam.combiners.Count.Globally())

    num_customers = (
        pipeline
        | 'Create single elem Stream I' >> beam.Create([1])
        | beam.FlatMap(c.count_customers,
                       pvalue.AsSingleton(cohort_count),
                       pvalue.AsSingleton(all_customer_ids_count),
                       pvalue.AsSingleton(options)))

    cal_hol_elog = (
        full_elog_merged
        | beam.FlatMap(c.filter_cohort_records_in_cal_hol,
                       pvalue.AsDict(cohort_set),
                       pvalue.AsSingleton(limits_dates)))

    cal_hol_elog_count = (
        cal_hol_elog
        | 'Count cal hol elog entries' >> beam.combiners.Count.Globally())

    calibration = (
        cal_hol_elog
        | beam.FlatMap(c.filter_records_in_calibration,
                       pvalue.AsSingleton(limits_dates)))

    num_txns_total = (
        full_elog_merged
        | beam.FlatMap(c.filter_records_in_cal_hol,
                       pvalue.AsSingleton(limits_dates))
        | 'Count num txns total' >> beam.combiners.Count.Globally())

    num_txns = (
        pipeline
        | 'Create single elem Stream II' >> beam.Create([1])
        | beam.FlatMap(c.count_txns,
                       pvalue.AsSingleton(cal_hol_elog_count),
                       pvalue.AsSingleton(num_txns_total),
                       pvalue.AsSingleton(options)))

    calcbs = (
        calibration
        | beam.Map(lambda x: (x[0], x))
        | 'Group calibration elog by customer id' >> beam.GroupByKey()
        | beam.FlatMap(
            c.create_cal_cbs,
            pvalue.AsSingleton(options),
            pvalue.AsSingleton(limits_dates))
        # (customer_id, number_of_transactions, average_order_value,
        #  frequency, recency, total_time_observed)
    )

    first_transaction_dates_by_customer = (
        cal_hol_elog
        | beam.Map(lambda x: (x[0], x))  # customer_id
        | 'Group cal hol elog by customer id' >> beam.GroupByKey()
        | beam.Map(
            lambda x: (x[0], min(map(operator.itemgetter(2), x[1]))))
        # item 2 -> date
    )

    cal_hol_elog_repeat = (
        cal_hol_elog
        | beam.FlatMap(c.filter_first_transaction_date_records,
                       pvalue.AsDict(first_transaction_dates_by_customer))
        | beam.FlatMap(
            c.calculate_time_unit_numbers,
            pvalue.AsSingleton(options),
            pvalue.AsSingleton(limits_dates))
        # (customer_id, date, time_unit_number)
        | beam.Map(lambda x: (x[2], 1))  # key: time_unit_number
        | 'Group cal hol elog repeat by time unit number' >> beam.GroupByKey()
        | beam.Map(lambda x: (x[0], sum(x[1])))
        # (time_unit_number, occurrences)
    )

    repeat_tx = (
        pipeline
        | 'Create single elem Stream III' >> beam.Create([1])
        | beam.FlatMap(c.calculate_cumulative_repeat_transactions,
                       pvalue.AsIter(cal_hol_elog_repeat))
        # (time_unit_number, repeat_transactions,
        #  repeat_transactions_cumulative)
    )

    model_validation = (
        pipeline
        | 'Create single elem Stream IV' >> beam.Create([1])
        | beam.FlatMap(
            c.calculate_model_fit_validation,
            pvalue.AsSingleton(options),
            pvalue.AsSingleton(limits_dates),
            pvalue.AsIter(calcbs),
            pvalue.AsIter(repeat_tx),
            pvalue.AsSingleton(num_customers),
            pvalue.AsSingleton(num_txns)))

    _ = (model_validation | beam.Map(c.raise_error_if_invalid_mape))

    _ = (
        model_validation
        | beam.Map(lambda x: x[0])
        | beam.FlatMap(c.calculate_model_fit_validation_to_text,
                       pvalue.AsSingleton(options)))

    fullcbs_without_extra_dimension = (
        full_elog_merged
        | beam.Map(lambda x: (x[0], x))  # key: customer_id
        | 'Group full merged elog by customer id' >> beam.GroupByKey()
        | beam.FlatMap(
            c.create_fullcbs,
            pvalue.AsSingleton(options),
            pvalue.AsSingleton(min_max_dates))
        # (customer_id, number_of_transactions, historical_aov,
        #  frequency, recency, total_time_observed)
    )

    full_elog_if_extra_dimension = (
        full_elog
        | 'Discard records if no extra dimension' >> beam.FlatMap(
            c.discard_if_no_extra_dimension, pvalue.AsSingleton(options)))

    extra_dimensions_stats = (
        full_elog_if_extra_dimension
        | beam.Map(lambda x: ((x[0], x[4]), x))
        # key: (customer_id, extra_dimension)
        | 'Group full elog by customer id and extra dimension' >>
        beam.GroupByKey()
        | beam.Map(c.create_extra_dimensions_stats)
        # (customer_id, extra_dimension, dimension_count, tot_sales,
        #  max_dimension_date)
    )

    top_dimension_per_customer = (
        extra_dimensions_stats
        | beam.Map(lambda x: (x[0], x))  # customer_id
        | 'Group extra dimension stats by customer id' >> beam.GroupByKey()
        | beam.Map(c.extract_top_extra_dimension)
        # (customer_id, extra_dimension, dimension_count, tot_sales,
        #  max_dimension_date)
    )

    customer_dimension_map = (
        top_dimension_per_customer
        | beam.Map(lambda x: (x[0], x[1])))  # (customer_id, extra_dimension)

    fullcbs = (
        fullcbs_without_extra_dimension
        | beam.FlatMap(
            c.add_top_extra_dimension_to_fullcbs,
            pvalue.AsSingleton(options),
            pvalue.AsDict(customer_dimension_map))
        # (customer_id, number_of_transactions, historical_aov,
        #  frequency, recency, total_time_observed, extra_dimension?)
    )

    prediction = (
        pipeline
        | 'Create single elem Stream V' >> beam.Create([1])
        | beam.FlatMap(
            c.calculate_prediction,
            pvalue.AsSingleton(options),
            pvalue.AsIter(fullcbs),
            pvalue.AsSingleton(num_customers),
            pvalue.AsSingleton(num_txns))
        # [customer_id, p_alive, predicted_purchases, future_aov,
        #  historical_aov, expected_value, frequency, recency,
        #  total_time_observed, extra_dimension?], prediction_params
    )

    prediction_by_customer_no_segments = (
        prediction
        | beam.FlatMap(lambda x: x[0]))  # Extract predictions by customer

    _ = (
        prediction
        | beam.Map(lambda x: x[1])  # Extract prediction params
        | beam.FlatMap(c.calculate_prediction_to_text,
                       pvalue.AsSingleton(options)))

    num_rows = (
        full_elog_merged
        | 'Count num rows in full elog merged' >>
        beam.combiners.Count.Globally())

    segment_predictions_exact = (
        pipeline
        | 'Create single elem Stream VII' >> beam.Create([1])
        | beam.FlatMap(
            lambda _, rows_count:
            [rows_count <= c._SEGMENT_PREDICTION_THRESHOLD],
            pvalue.AsSingleton(num_rows)))

    sharded_cust_predictions_no_segments_exact, \
        sharded_cust_predictions_no_segments_hash = (
            prediction_by_customer_no_segments
            | beam.FlatMap(
                c.prediction_sharded,
                pvalue.AsSingleton(options),
                pvalue.AsSingleton(segment_predictions_exact))
            # [customer_id, p_alive, predicted_purchases, future_aov,
            #  historical_aov, expected_value, frequency, recency,
            #  total_time_observed, extra_dimension?]
            | beam.Partition(lambda x, _: 0 if x[1] else 1, 2))

    # BEGIN of "exact" branch
    prediction_by_customer_exact = (
        pipeline
        | 'Create single elem Stream VIII' >> beam.Create([1])
        | beam.FlatMap(
            c.split_in_ntiles_exact,
            pvalue.AsSingleton(options),
            pvalue.AsIter(sharded_cust_predictions_no_segments_exact))
        # [customer_id, p_alive, predicted_purchases, future_aov,
        #  historical_aov, expected_value, frequency, recency,
        #  total_time_observed, segment, extra_dimension?]
    )
    # END of "exact" branch

    # BEGIN of "hash" branch
    customer_count_by_expected_value = (
        sharded_cust_predictions_no_segments_hash
        | beam.Map(lambda x: (x[0][5], 1))  # (expected_value, 1)
        | 'Group customer predictions by expected value' >> beam.GroupByKey()
        | beam.Map(lambda x: (x[0], sum(x[1]))))
        # (expected_value, customers_count)

    hash_segment_limits = (
        pipeline
        | 'Create single elem Stream IX' >> beam.Create([1])
        | beam.FlatMap(c.expected_values_segment_limits,
                       pvalue.AsSingleton(options),
                       pvalue.AsIter(customer_count_by_expected_value),
                       pvalue.AsSingleton(all_customer_ids_count)))

    prediction_by_customer_hash = (
        sharded_cust_predictions_no_segments_hash
        | beam.Map(lambda x: x[0])
        | beam.FlatMap(c.split_in_ntiles_hash,
                       pvalue.AsSingleton(hash_segment_limits))
        # [customer_id, p_alive, predicted_purchases, future_aov,
        #  historical_aov, expected_value, frequency, recency,
        #  total_time_observed, segment, extra_dimension?]
    )
    # END of "hash" branch

    prediction_by_customer = (
        # Only one of these two streams will contain values.
        (prediction_by_customer_exact, prediction_by_customer_hash)
        | beam.Flatten())

    _ = (
        prediction_by_customer
        | beam.FlatMap(
            lambda x, opts: [x + ['']]
            if not opts[c._OPTION_EXTRA_DIMENSION_EXISTS] else [x],
            pvalue.AsSingleton(options))
        | 'prediction_by_customer to CSV line' >> beam.Map(c.list_to_csv_line)
        | 'Write prediction_by_customer' >> beam.io.WriteToText(
            getattr(runtime_options, c._OPTION_OUTPUT_FOLDER),
            header='customer_id,p_alive'
                   ',predicted_purchases'
                   ',future_aov,historical_aov'
                   ',expected_value,frequency,recency'
                   ',total_time_observed,segment'
                   ',extra_dimension',
            shard_name_template='',
            num_shards=1,
            file_name_suffix='prediction_by_customer.csv'))

    prediction_summary_temp = (
        prediction_by_customer
        | beam.Map(lambda x: (x[9], x))  # key: segment
        | 'Group customer predictions by segment' >> beam.GroupByKey()
        | beam.FlatMap(
            c.generate_prediction_summary,
            pvalue.AsSingleton(options))
        # (segment, average_retention_probability,
        #  average_predicted_customer_value, average_predicted_order_value,
        #  average_predicted_purchases, total_customer_value,
        #  number_of_customers)
    )

    tot_equity = (
        prediction_summary_temp
        | beam.Map(lambda x: x[5])  # total_customer_value
        | beam.CombineGlobally(sum))

    prediction_summary = (
        prediction_summary_temp
        | beam.FlatMap(
            c.calculate_perc_of_total_customer_value,
            pvalue.AsSingleton(tot_equity),
            pvalue.AsSingleton(options))
        # (segment, average_retention_probability,
        #  average_predicted_customer_value, average_predicted_order_value,
        #  average_predicted_purchases, total_customer_value,
        #  number_of_customers, perc_of_total_customer_value)
    )

    _ = (
        prediction_summary
        | 'prediction_summary to CSV line' >> beam.Map(c.list_to_csv_line)
        | 'Write prediction_summary' >> beam.io.WriteToText(
            getattr(runtime_options, c._OPTION_OUTPUT_FOLDER),
            header='segment,average_retention_probability'
                   ',average_predicted_customer_value'
                   ',average_predicted_order_value,average_predicted_purchases'
                   ',total_customer_value,number_of_customers'
                   ',perc_of_total_customer_value',
            shard_name_template='',
            num_shards=1,
            file_name_suffix='prediction_summary.csv'))

    prediction_summary_extra_dimension = (
        prediction_by_customer
        | 'Discard prediction if there is not extra dimension' >>
        beam.FlatMap(c.discard_if_no_extra_dimension,
                     pvalue.AsSingleton(options))
        | beam.Map(lambda x: (x[10], x))  # extra dimension
        | 'Group customer predictions by extra dimension' >> beam.GroupByKey()
        | beam.FlatMap(c.generate_prediction_summary_extra_dimension,
                       pvalue.AsSingleton(tot_equity),
                       pvalue.AsSingleton(options)))

    _ = (
        prediction_summary_extra_dimension
        | 'prediction_summary_extra_dimension to CSV line' >> beam.Map(
            c.list_to_csv_line)
        | 'Write prediction_summary_extra_dimension' >> beam.io.WriteToText(
            getattr(runtime_options, c._OPTION_OUTPUT_FOLDER),
            header='extra_dimension,average_retention_probability'
                   ',average_predicted_customer_value'
                   ',average_predicted_order_value'
                   ',average_predicted_purchases,total_customer_value'
                   ',number_of_customers,perc_of_total_customer_value',
            shard_name_template='',
            num_shards=1,
            file_name_suffix='prediction_summary_extra_dimension.csv'))
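
# Illustrative entry point (an assumption, not shown in the original source):
# a Beam pipeline defined in a run() function like the one above is typically
# launched from a __main__ guard so local runs and Dataflow launches share the
# same module.
if __name__ == '__main__':
  run()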
def run(argv=None):
  """Main function.

  Main function containing the Apache Beam pipeline describing how to process
  the input BigQuery data to generate the LTV predictions.
  """
  parser = argparse.ArgumentParser()
  _, pipeline_args = parser.parse_known_args(argv)
  options = pipeline_options.PipelineOptions(pipeline_args)
  runtime_options = options.view_as(RuntimeOptions)

  with beam.Pipeline(options=options) as pipeline:
    options = (
        pipeline
        | 'Create single element Stream containing options dict' >>
        beam.Create([options.get_all_options()])
        | beam.Map(
            lambda x: {
                k: v.get() if isinstance(v, value_provider.ValueProvider)
                else v
                for (k, v) in x.items()
            })
        | beam.Map(c.set_extra_options))

    full_elog = (
        pipeline
        | bq_mod.ReadFromBigQuery(
            project=getattr(runtime_options, c._OPTION_INPUT_BQ_PROJECT),
            query=getattr(runtime_options, c._OPTION_INPUT_BQ_QUERY),
            gcs_location=getattr(runtime_options,
                                 c._OPTION_TEMP_GCS_LOCATION),
            use_standard_sql=True)
        | beam.FlatMap(
            c.bq_row_to_list,
            pvalue.AsSingleton(options))
        # (customer_id, date_str, date, sales, extra_dimension?)
    )

    full_elog_merged = (
        full_elog
        | beam.Filter(lambda x: x[3] > 0)  # sales > 0
        | beam.Map(lambda x: ((x[0], x[1]), x))  # key: (customer_id, date)
        | 'Group full elog by customer and date' >> beam.GroupByKey()
        | beam.Map(c.merge_full_elog_by_customer_and_date)
        # (customer_id, date_str, date, sales)
    )

    min_max_dates = (
        full_elog_merged
        | beam.Map(lambda x: x[2])  # date
        | beam.CombineGlobally(c.MinMaxDatesFn())
        | beam.Map(c.min_max_dates_dict))

    limits_dates = (
        min_max_dates
        | beam.FlatMap(c.limit_dates_boundaries, pvalue.AsSingleton(options)))

    cohort = (
        full_elog_merged
        | beam.FlatMap(c.filter_customers_in_cohort,
                       pvalue.AsSingleton(limits_dates))
        | 'Distinct Customer IDs in Cohort' >> util.Distinct())

    cohort_count = (
        cohort
        | 'Count cohort entries' >> beam.combiners.Count.Globally())

    cohort_set = (cohort | beam.Map(lambda x: (x, 1)))

    all_customer_ids = (
        full_elog_merged
        | beam.Map(lambda x: x[0])  # key: customer_id
        | 'Distinct all Customer IDs' >> util.Distinct())

    all_customer_ids_count = (
        all_customer_ids
        | 'Count all customers' >> beam.combiners.Count.Globally())

    num_customers = (
        pipeline
        | 'Create single elem Stream I' >> beam.Create([1])
        | beam.FlatMap(c.count_customers,
                       pvalue.AsSingleton(cohort_count),
                       pvalue.AsSingleton(all_customer_ids_count),
                       pvalue.AsSingleton(options)))

    cal_hol_elog = (
        full_elog_merged
        | beam.FlatMap(c.filter_cohort_records_in_cal_hol,
                       pvalue.AsDict(cohort_set),
                       pvalue.AsSingleton(limits_dates)))

    cal_hol_elog_count = (
        cal_hol_elog
        | 'Count cal hol elog entries' >> beam.combiners.Count.Globally())

    calibration = (
        cal_hol_elog
        | beam.FlatMap(c.filter_records_in_calibration,
                       pvalue.AsSingleton(limits_dates)))

    num_txns_total = (
        full_elog_merged
        | beam.FlatMap(c.filter_records_in_cal_hol,
                       pvalue.AsSingleton(limits_dates))
        | 'Count num txns total' >> beam.combiners.Count.Globally())

    num_txns = (
        pipeline
        | 'Create single elem Stream II' >> beam.Create([1])
        | beam.FlatMap(c.count_txns,
                       pvalue.AsSingleton(cal_hol_elog_count),
                       pvalue.AsSingleton(num_txns_total),
                       pvalue.AsSingleton(options)))

    calcbs = (
        calibration
        | beam.Map(lambda x: (x[0], x))
        | 'Group calibration elog by customer id' >> beam.GroupByKey()
        | beam.FlatMap(
            c.create_cal_cbs,
            pvalue.AsSingleton(options),
            pvalue.AsSingleton(limits_dates))
        # (customer_id, number_of_transactions, average_order_value,
        #  frequency, recency, total_time_observed)
    )

    first_transaction_dates_by_customer = (
        cal_hol_elog
        | beam.Map(lambda x: (x[0], x))  # customer_id
        | 'Group cal hol elog by customer id' >> beam.GroupByKey()
        | beam.Map(
            lambda x: (x[0], min(map(operator.itemgetter(2), x[1]))))
        # item 2 -> date
    )

    cal_hol_elog_repeat = (
        cal_hol_elog
        | beam.FlatMap(c.filter_first_transaction_date_records,
                       pvalue.AsDict(first_transaction_dates_by_customer))
        | beam.FlatMap(
            c.calculate_time_unit_numbers,
            pvalue.AsSingleton(options),
            pvalue.AsSingleton(limits_dates))
        # (customer_id, date, time_unit_number)
        | beam.Map(lambda x: (x[2], 1))  # key: time_unit_number
        | 'Group cal hol elog repeat by time unit number' >> beam.GroupByKey()
        | beam.Map(lambda x: (x[0], sum(x[1])))
        # (time_unit_number, occurrences)
    )

    repeat_tx = (
        pipeline
        | 'Create single elem Stream III' >> beam.Create([1])
        | beam.FlatMap(c.calculate_cumulative_repeat_transactions,
                       pvalue.AsIter(cal_hol_elog_repeat))
        # (time_unit_number, repeat_transactions,
        #  repeat_transactions_cumulative)
    )

    model_validation = (
        pipeline
        | 'Create single elem Stream IV' >> beam.Create([1])
        | beam.FlatMap(
            c.calculate_model_fit_validation,
            pvalue.AsSingleton(options),
            pvalue.AsSingleton(limits_dates),
            pvalue.AsIter(calcbs),
            pvalue.AsIter(repeat_tx),
            pvalue.AsSingleton(num_customers),
            pvalue.AsSingleton(num_txns)))

    _ = (model_validation | beam.Map(c.raise_error_if_invalid_mape))

    _ = (
        model_validation
        | beam.Map(lambda x: x[0])
        | 'Write to validation_params table' >> io.WriteToBigQuery(
            table=c.TableValueProvider(
                getattr(runtime_options, c._OPTION_OUTPUT_BQ_PROJECT),
                getattr(runtime_options, c._OPTION_OUTPUT_BQ_DATASET),
                'validation_params'),
            custom_gcs_temp_location=getattr(runtime_options,
                                             c._OPTION_TEMP_GCS_LOCATION),
            validate=False,
            schema={
                'fields': [
                    {'name': 'calibration_start_date', 'type': 'STRING'},
                    {'name': 'calibration_end_date', 'type': 'STRING'},
                    {'name': 'cohort_end_date', 'type': 'STRING'},
                    {'name': 'holdout_end_date', 'type': 'STRING'},
                    {'name': 'model_time_granularity', 'type': 'STRING'},
                    {
                        'name': 'model',
                        'type': 'RECORD',
                        'fields': [
                            {'name': 'frequency_model', 'type': 'STRING'},
                            {'name': 'num_customers_cohort',
                             'type': 'INTEGER'},
                            {'name': 'perc_customers_cohort',
                             'type': 'FLOAT'},
                            {'name': 'num_transactions_validation',
                             'type': 'INTEGER'},
                            {'name': 'perc_transactions_validation',
                             'type': 'FLOAT'},
                            {'name': 'mape', 'type': 'FLOAT'},
                        ]
                    },
                ]
            },
            write_disposition=io.BigQueryDisposition.WRITE_TRUNCATE,
            create_disposition=io.BigQueryDisposition.CREATE_IF_NEEDED))

    fullcbs_without_extra_dimension = (
        full_elog_merged
        | beam.Map(lambda x: (x[0], x))  # key: customer_id
        | 'Group full merged elog by customer id' >> beam.GroupByKey()
        | beam.FlatMap(
            c.create_fullcbs,
            pvalue.AsSingleton(options),
            pvalue.AsSingleton(min_max_dates))
        # (customer_id, number_of_transactions, historical_aov,
        #  frequency, recency, total_time_observed)
    )

    full_elog_if_extra_dimension = (
        full_elog
        | 'Discard records if no extra dimension' >> beam.FlatMap(
            c.discard_if_no_extra_dimension, pvalue.AsSingleton(options)))

    extra_dimensions_stats = (
        full_elog_if_extra_dimension
        | beam.Map(lambda x: ((x[0], x[4]), x))
        # key: (customer_id, extra_dimension)
        | 'Group full elog by customer id and extra dimension' >>
        beam.GroupByKey()
        | beam.Map(c.create_extra_dimensions_stats)
        # (customer_id, extra_dimension, dimension_count, tot_sales,
        #  max_dimension_date)
    )

    top_dimension_per_customer = (
        extra_dimensions_stats
        | beam.Map(lambda x: (x[0], x))  # customer_id
        | 'Group extra dimension stats by customer id' >> beam.GroupByKey()
        | beam.Map(c.extract_top_extra_dimension)
        # (customer_id, extra_dimension, dimension_count, tot_sales,
        #  max_dimension_date)
    )

    customer_dimension_map = (
        top_dimension_per_customer
        | beam.Map(lambda x: (x[0], x[1])))  # (customer_id, extra_dimension)

    fullcbs = (
        fullcbs_without_extra_dimension
        | beam.FlatMap(
            c.add_top_extra_dimension_to_fullcbs,
            pvalue.AsSingleton(options),
            pvalue.AsDict(customer_dimension_map))
        # (customer_id, number_of_transactions, historical_aov,
        #  frequency, recency, total_time_observed, extra_dimension?)
    )

    prediction = (
        pipeline
        | 'Create single elem Stream V' >> beam.Create([1])
        | beam.FlatMap(
            c.calculate_prediction,
            pvalue.AsSingleton(options),
            pvalue.AsIter(fullcbs),
            pvalue.AsSingleton(num_customers),
            pvalue.AsSingleton(num_txns))
        # [customer_id, p_alive, predicted_purchases, future_aov,
        #  historical_aov, expected_value, frequency, recency,
        #  total_time_observed, extra_dimension?], prediction_params
    )

    prediction_by_customer_no_segments = (
        prediction
        | beam.FlatMap(lambda x: x[0]))  # Extract predictions by customer

    _ = (
        prediction
        | beam.Map(lambda x: x[1])  # Extract prediction params
        | 'Write to prediction_params table' >> io.WriteToBigQuery(
            table=c.TableValueProvider(
                getattr(runtime_options, c._OPTION_OUTPUT_BQ_PROJECT),
                getattr(runtime_options, c._OPTION_OUTPUT_BQ_DATASET),
                'prediction_params'),
            custom_gcs_temp_location=getattr(runtime_options,
                                             c._OPTION_TEMP_GCS_LOCATION),
            validate=False,
            schema={
                'fields': [
                    {'name': 'prediction_period', 'type': 'INTEGER'},
                    {'name': 'prediction_period_unit', 'type': 'STRING'},
                    {'name': 'model_time_granularity', 'type': 'STRING'},
                    {'name': 'customers_modeled', 'type': 'INTEGER'},
                    {'name': 'transactions_observed', 'type': 'INTEGER'},
                    {'name': 'frequency_model', 'type': 'STRING'},
                    {
                        'name': 'bgnbd_model_params',
                        'type': 'RECORD',
                        'fields': [
                            {'name': 'a', 'type': 'FLOAT'},
                            {'name': 'b', 'type': 'FLOAT'},
                            {'name': 'r', 'type': 'FLOAT'},
                            {'name': 'alpha', 'type': 'FLOAT'},
                        ]
                    },
                    {
                        'name': 'paretonbd_model_params',
                        'type': 'RECORD',
                        'fields': [
                            {'name': 'r', 'type': 'FLOAT'},
                            {'name': 's', 'type': 'FLOAT'},
                            {'name': 'alpha', 'type': 'FLOAT'},
                            {'name': 'beta', 'type': 'FLOAT'},
                        ]
                    },
                    {
                        'name': 'gamma_gamma_params',
                        'type': 'RECORD',
                        'fields': [
                            {'name': 'p', 'type': 'FLOAT'},
                            {'name': 'q', 'type': 'FLOAT'},
                            {'name': 'v', 'type': 'FLOAT'},
                        ]
                    },
                ]
            },
            write_disposition=io.BigQueryDisposition.WRITE_TRUNCATE,
            create_disposition=io.BigQueryDisposition.CREATE_IF_NEEDED))

    num_rows = (
        full_elog_merged
        | 'Count num rows in full elog merged' >>
        beam.combiners.Count.Globally())

    segment_predictions_exact = (
        pipeline
        | 'Create single elem Stream VII' >> beam.Create([1])
        | beam.FlatMap(
            lambda _, rows_count:
            [rows_count <= c._SEGMENT_PREDICTION_THRESHOLD],
            pvalue.AsSingleton(num_rows)))

    sharded_cust_predictions_no_segments_exact, \
        sharded_cust_predictions_no_segments_hash = (
            prediction_by_customer_no_segments
            | beam.FlatMap(
                c.prediction_sharded,
                pvalue.AsSingleton(options),
                pvalue.AsSingleton(segment_predictions_exact))
            # [customer_id, p_alive, predicted_purchases, future_aov,
            #  historical_aov, expected_value, frequency, recency,
            #  total_time_observed, extra_dimension?]
            | beam.Partition(lambda x, _: 0 if x[1] else 1, 2))

    # BEGIN of "exact" branch
    prediction_by_customer_exact = (
        pipeline
        | 'Create single elem Stream VIII' >> beam.Create([1])
        | beam.FlatMap(
            c.split_in_ntiles_exact,
            pvalue.AsSingleton(options),
            pvalue.AsIter(sharded_cust_predictions_no_segments_exact))
        # [customer_id, p_alive, predicted_purchases, future_aov,
        #  historical_aov, expected_value, frequency, recency,
        #  total_time_observed, segment, extra_dimension?]
    )
    # END of "exact" branch

    # BEGIN of "hash" branch
    customer_count_by_expected_value = (
        sharded_cust_predictions_no_segments_hash
        | beam.Map(lambda x: (x[0][5], 1))  # (expected_value, 1)
        | 'Group customer predictions by expected value' >> beam.GroupByKey()
        | beam.Map(lambda x: (x[0], sum(x[1]))))
        # (expected_value, customers_count)

    hash_segment_limits = (
        pipeline
        | 'Create single elem Stream IX' >> beam.Create([1])
        | beam.FlatMap(c.expected_values_segment_limits,
                       pvalue.AsSingleton(options),
                       pvalue.AsIter(customer_count_by_expected_value),
                       pvalue.AsSingleton(all_customer_ids_count)))

    prediction_by_customer_hash = (
        sharded_cust_predictions_no_segments_hash
        | beam.Map(lambda x: x[0])
        | beam.FlatMap(c.split_in_ntiles_hash,
                       pvalue.AsSingleton(hash_segment_limits))
        # [customer_id, p_alive, predicted_purchases, future_aov,
        #  historical_aov, expected_value, frequency, recency,
        #  total_time_observed, segment, extra_dimension?]
    )
    # END of "hash" branch

    prediction_by_customer = (
        # Only one of these two streams will contain values.
        (prediction_by_customer_exact, prediction_by_customer_hash)
        | beam.Flatten()
        | beam.Map(c.clean_nan_and_inf))

    _ = (
        prediction_by_customer
        | beam.FlatMap(
            lambda x, opts: [x + ['']]
            if not opts[c._OPTION_EXTRA_DIMENSION_EXISTS] else [x],
            pvalue.AsSingleton(options))
        | 'prediction_by_customer to Dict' >> beam.Map(
            c.list_to_dict, [
                'customer_id', 'p_alive', 'predicted_purchases', 'future_aov',
                'historical_aov', 'expected_value', 'frequency', 'recency',
                'total_time_observed', 'segment', 'extra_dimension'
            ])
        | 'Write to prediction_by_customer table' >> io.WriteToBigQuery(
            table=c.TableValueProvider(
                getattr(runtime_options, c._OPTION_OUTPUT_BQ_PROJECT),
                getattr(runtime_options, c._OPTION_OUTPUT_BQ_DATASET),
                'prediction_by_customer'),
            custom_gcs_temp_location=getattr(runtime_options,
                                             c._OPTION_TEMP_GCS_LOCATION),
            validate=False,
            schema='customer_id:STRING, p_alive:FLOAT64'
                   ', predicted_purchases:FLOAT64'
                   ', future_aov:FLOAT64, historical_aov:FLOAT64'
                   ', expected_value:FLOAT64, frequency:INT64'
                   ', recency:FLOAT64'
                   ', total_time_observed:FLOAT64, segment:INT64'
                   ', extra_dimension:STRING',
            write_disposition=io.BigQueryDisposition.WRITE_TRUNCATE,
            create_disposition=io.BigQueryDisposition.CREATE_IF_NEEDED))

    prediction_summary_temp = (
        prediction_by_customer
        | beam.Map(lambda x: (x[9], x))  # key: segment
        | 'Group customer predictions by segment' >> beam.GroupByKey()
        | beam.FlatMap(
            c.generate_prediction_summary,
            pvalue.AsSingleton(options))
        # (segment, average_retention_probability,
        #  average_predicted_customer_value, average_predicted_order_value,
        #  average_predicted_purchases, total_customer_value,
        #  number_of_customers)
    )

    tot_equity = (
        prediction_summary_temp
        | beam.Map(lambda x: x[5])  # total_customer_value
        | beam.CombineGlobally(sum))

    prediction_summary = (
        prediction_summary_temp
        | beam.FlatMap(
            c.calculate_perc_of_total_customer_value,
            pvalue.AsSingleton(tot_equity),
            pvalue.AsSingleton(options))
        # (segment, average_retention_probability,
        #  average_predicted_customer_value, average_predicted_order_value,
        #  average_predicted_purchases, total_customer_value,
        #  number_of_customers, perc_of_total_customer_value)
    )

    _ = (
        prediction_summary
        | 'prediction_summary to Dict' >> beam.Map(
            c.list_to_dict, [
                'segment', 'average_retention_probability',
                'average_predicted_customer_value',
                'average_predicted_order_value', 'average_predicted_purchases',
                'total_customer_value', 'number_of_customers',
                'perc_of_total_customer_value'
            ])
        | 'Write to prediction_summary table' >> io.WriteToBigQuery(
            table=c.TableValueProvider(
                getattr(runtime_options, c._OPTION_OUTPUT_BQ_PROJECT),
                getattr(runtime_options, c._OPTION_OUTPUT_BQ_DATASET),
                'prediction_summary'),
            custom_gcs_temp_location=getattr(runtime_options,
                                             c._OPTION_TEMP_GCS_LOCATION),
            validate=False,
            schema='segment:INT64 ,average_retention_probability:FLOAT64'
                   ', average_predicted_customer_value:FLOAT64'
                   ', average_predicted_order_value:FLOAT64'
                   ', average_predicted_purchases:FLOAT64'
                   ', total_customer_value:FLOAT64'
                   ', number_of_customers:FLOAT64'
                   ', perc_of_total_customer_value:FLOAT64',
            write_disposition=io.BigQueryDisposition.WRITE_TRUNCATE,
            create_disposition=io.BigQueryDisposition.CREATE_IF_NEEDED))

    prediction_summary_extra_dimension = (
        prediction_by_customer
        | 'Discard prediction if there is not extra dimension' >>
        beam.FlatMap(c.discard_if_no_extra_dimension,
                     pvalue.AsSingleton(options))
        | beam.Map(lambda x: (x[10], x))  # extra dimension
        | 'Group customer predictions by extra dimension' >> beam.GroupByKey()
        | beam.FlatMap(c.generate_prediction_summary_extra_dimension,
                       pvalue.AsSingleton(tot_equity),
                       pvalue.AsSingleton(options)))

    _ = (
        prediction_summary_extra_dimension
        | 'prediction_summary_extra_dimension to Dict' >> beam.Map(
            c.list_to_dict, [
                'extra_dimension', 'average_retention_probability',
                'average_predicted_customer_value',
                'average_predicted_order_value', 'average_predicted_purchases',
                'total_customer_value', 'number_of_customers',
                'perc_of_total_customer_value'
            ])
        | 'Write to prediction_summary_extra_dimension table' >>
        io.WriteToBigQuery(
            table=c.TableValueProvider(
                getattr(runtime_options, c._OPTION_OUTPUT_BQ_PROJECT),
                getattr(runtime_options, c._OPTION_OUTPUT_BQ_DATASET),
                'prediction_summary_extra_dimension'),
            custom_gcs_temp_location=getattr(runtime_options,
                                             c._OPTION_TEMP_GCS_LOCATION),
            validate=False,
            schema='extra_dimension:STRING'
                   ', average_retention_probability:FLOAT64'
                   ', average_predicted_customer_value:FLOAT64'
                   ', average_predicted_order_value:FLOAT64'
                   ', average_predicted_purchases:FLOAT64'
                   ', total_customer_value:FLOAT64'
                   ', number_of_customers:INT64'
                   ', perc_of_total_customer_value:FLOAT64',
            write_disposition=io.BigQueryDisposition.WRITE_TRUNCATE,
            create_disposition=io.BigQueryDisposition.CREATE_IF_NEEDED))
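
# Hypothetical helper illustrating the behaviour assumed of c.list_to_dict in
# the BigQuery writes above: pair each field name with the corresponding row
# value so WriteToBigQuery receives dict-shaped rows. The actual helper lives
# in the module imported as `c` and may differ.
def list_to_dict(row, field_names):
  return dict(zip(field_names, row))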
def expand(self, pcoll):
  p = pcoll.pipeline

  load_job_name_pcv = pvalue.AsSingleton(
      p
      | "ImpulseJobName" >> beam.Create([None])
      | beam.Map(lambda _: _generate_load_job_name()))

  file_prefix_pcv = pvalue.AsSingleton(
      p
      | "CreateFilePrefixView" >> beam.Create([self._input_gs_location])
      | "GenerateFilePrefix" >> beam.Map(_generate_file_prefix))

  outputs = (
      pcoll
      | "ApplyGlobalWindow" >> beam.WindowInto(beam.window.GlobalWindows())
      | "AppendDestination" >> beam.ParDo(
          _AppendDestinationsFn(self.destination))
      | beam.ParDo(
          WriteRecordsToFile(
              max_files_per_bundle=self.max_files_per_bundle,
              max_file_size=self.max_file_size,
              coder=self.coder),
          file_prefix=file_prefix_pcv).with_outputs(
              WriteRecordsToFile.UNWRITTEN_RECORD_TAG,
              WriteRecordsToFile.WRITTEN_FILE_TAG))

  # A PCollection of (destination, file) tuples. It lists files with records,
  # and the destination each file is meant to be imported into.
  destination_files_kv_pc = outputs[WriteRecordsToFile.WRITTEN_FILE_TAG]

  # A PCollection of (destination, record) tuples. These are later sharded,
  # grouped, and all records for each destination-shard are written to files.
  # This PCollection is necessary because not all records can be written into
  # files in ``WriteRecordsToFile``.
  unwritten_records_pc = outputs[WriteRecordsToFile.UNWRITTEN_RECORD_TAG]

  more_destination_files_kv_pc = (
      unwritten_records_pc
      | beam.ParDo(_ShardDestinations())
      | "GroupShardedRows" >> beam.GroupByKey()
      | "DropShardNumber" >> beam.Map(lambda x: (x[0][0], x[1]))
      | "WriteGroupedRecordsToFile" >> beam.ParDo(
          WriteGroupedRecordsToFile(coder=self.coder),
          file_prefix=file_prefix_pcv))

  all_destination_file_pairs_pc = (
      (destination_files_kv_pc, more_destination_files_kv_pc)
      | "DestinationFilesUnion" >> beam.Flatten())

  grouped_files_pc = (
      all_destination_file_pairs_pc
      | "GroupFilesByTableDestinations" >> beam.GroupByKey())

  # Load jobs are triggered to temporary tables, and those are later copied to
  # the actual appropriate destination table. This ensures atomicity when only
  # some of the load jobs would fail but not others.
  # If any of them fails, then copy jobs are not triggered.
  trigger_loads_outputs = (
      grouped_files_pc
      | beam.ParDo(
          TriggerLoadJobs(
              schema=self.schema,
              write_disposition=self.write_disposition,
              create_disposition=self.create_disposition,
              test_client=self.test_client,
              temporary_tables=self.temp_tables),
          load_job_name_pcv).with_outputs(
              TriggerLoadJobs.TEMP_TABLES, main='main'))

  destination_job_ids_pc = trigger_loads_outputs['main']
  temp_tables_pc = trigger_loads_outputs[TriggerLoadJobs.TEMP_TABLES]

  destination_copy_job_ids_pc = (
      p
      | "ImpulseMonitorLoadJobs" >> beam.Create([None])
      | "WaitForLoadJobs" >> beam.ParDo(
          WaitForBQJobs(self.test_client),
          beam.pvalue.AsList(destination_job_ids_pc))
      | beam.ParDo(
          TriggerCopyJobs(create_disposition=self.create_disposition,
                          write_disposition=self.write_disposition,
                          temporary_tables=self.temp_tables,
                          test_client=self.test_client),
          load_job_name_pcv))

  finished_copy_jobs_pc = (
      p
      | "ImpulseMonitorCopyJobs" >> beam.Create([None])
      | "WaitForCopyJobs" >> beam.ParDo(
          WaitForBQJobs(self.test_client),
          beam.pvalue.AsList(destination_copy_job_ids_pc)))

  _ = (
      finished_copy_jobs_pc
      | "RemoveTempTables/PassTables" >> beam.FlatMap(
          lambda x, deleting_tables: deleting_tables,
          pvalue.AsIter(temp_tables_pc))
      | "RemoveTempTables/DeduplicateTables" >> Count.PerElement()
      | "RemoveTempTables/GetTableNames" >> beam.Map(lambda elm: elm[0])
      | "RemoveTempTables/Delete" >> beam.ParDo(DeleteTablesFn()))

  return {
      self.DESTINATION_JOBID_PAIRS: destination_job_ids_pc,
      self.DESTINATION_FILE_PAIRS: all_destination_file_pairs_pc,
      self.DESTINATION_COPY_JOBID_PAIRS: destination_copy_job_ids_pc,
  }
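
# Minimal illustration (not from the original module) of the tagged-output
# pattern used by TriggerLoadJobs and WriteRecordsToFile above: a DoFn emits
# extra values with pvalue.TaggedOutput, and .with_outputs(...) exposes them
# alongside the main output so callers can index the result by tag.
class _EmitTempTableExample(beam.DoFn):
  TEMP_TABLES = 'temp_tables'

  def process(self, element):
    yield pvalue.TaggedOutput(self.TEMP_TABLES, 'temp_table_for_%s' % element)
    yield element  # goes to the main output

# results = pcoll | beam.ParDo(_EmitTempTableExample()).with_outputs(
#     _EmitTempTableExample.TEMP_TABLES, main='main')
# main_pc = results['main']
# temp_tables_pc = results[_EmitTempTableExample.TEMP_TABLES]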
def expand(self, pcoll):
  p = pcoll.pipeline
  temp_location = p.options.view_as(GoogleCloudOptions).temp_location

  load_job_name_pcv = pvalue.AsSingleton(
      p
      | "ImpulseJobName" >> beam.Create([None])
      | beam.Map(lambda _: _generate_load_job_name()))

  file_prefix_pcv = pvalue.AsSingleton(
      p
      | "CreateFilePrefixView" >> beam.Create([''])
      | "GenerateFilePrefix" >> beam.Map(
          file_prefix_generator(self._validate,
                                self._custom_gcs_temp_location,
                                temp_location)))

  destination_data_kv_pc = (
      pcoll
      | "RewindowIntoGlobal" >> self._window_fn()
      | "AppendDestination" >> beam.ParDo(
          bigquery_tools.AppendDestinationsFn(self.destination),
          *self.table_side_inputs))

  all_destination_file_pairs_pc = self._write_files(
      destination_data_kv_pc, file_prefix_pcv)

  grouped_files_pc = (
      all_destination_file_pairs_pc
      | "GroupFilesByTableDestinations" >> beam.GroupByKey())

  # Load jobs are triggered to temporary tables, and those are later copied to
  # the actual appropriate destination table. This ensures atomicity when only
  # some of the load jobs would fail but not others.
  # If any of them fails, then copy jobs are not triggered.
  trigger_loads_outputs = (
      grouped_files_pc
      | beam.ParDo(
          TriggerLoadJobs(
              schema=self.schema,
              write_disposition=self.write_disposition,
              create_disposition=self.create_disposition,
              test_client=self.test_client,
              temporary_tables=self.temp_tables,
              additional_bq_parameters=self.additional_bq_parameters),
          load_job_name_pcv, *self.schema_side_inputs).with_outputs(
              TriggerLoadJobs.TEMP_TABLES, main='main'))

  destination_job_ids_pc = trigger_loads_outputs['main']
  temp_tables_pc = trigger_loads_outputs[TriggerLoadJobs.TEMP_TABLES]

  destination_copy_job_ids_pc = (
      p
      | "ImpulseMonitorLoadJobs" >> beam.Create([None])
      | "WaitForLoadJobs" >> beam.ParDo(
          WaitForBQJobs(self.test_client),
          beam.pvalue.AsList(destination_job_ids_pc))
      | beam.ParDo(
          TriggerCopyJobs(create_disposition=self.create_disposition,
                          write_disposition=self.write_disposition,
                          temporary_tables=self.temp_tables,
                          test_client=self.test_client),
          load_job_name_pcv))

  finished_copy_jobs_pc = (
      p
      | "ImpulseMonitorCopyJobs" >> beam.Create([None])
      | "WaitForCopyJobs" >> beam.ParDo(
          WaitForBQJobs(self.test_client),
          beam.pvalue.AsList(destination_copy_job_ids_pc)))

  _ = (
      finished_copy_jobs_pc
      | "RemoveTempTables/PassTables" >> beam.FlatMap(
          lambda x, deleting_tables: deleting_tables,
          pvalue.AsIter(temp_tables_pc))
      | "RemoveTempTables/AddUselessValue" >> beam.Map(lambda x: (x, None))
      | "RemoveTempTables/DeduplicateTables" >> beam.GroupByKey()
      | "RemoveTempTables/GetTableNames" >> beam.Map(lambda elm: elm[0])
      | "RemoveTempTables/Delete" >> beam.ParDo(DeleteTablesFn()))

  return {
      self.DESTINATION_JOBID_PAIRS: destination_job_ids_pc,
      self.DESTINATION_FILE_PAIRS: all_destination_file_pairs_pc,
      self.DESTINATION_COPY_JOBID_PAIRS: destination_copy_job_ids_pc,
  }
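
# Sketch of how a caller might consume the dictionary returned by the
# expand() above (the transform name BigQueryBatchFileLoads is an assumption
# based on the output constants; adapt to the actual enclosing class):
#
#   results = destination_row_kvs | BigQueryBatchFileLoads(destination, ...)
#   load_jobid_pairs = results[BigQueryBatchFileLoads.DESTINATION_JOBID_PAIRS]
#   copy_jobid_pairs = results[
#       BigQueryBatchFileLoads.DESTINATION_COPY_JOBID_PAIRS]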