Example #1
  def test_deferred_side_input_iterable(self):
    @typehints.with_input_types(str, typing.Iterable[str])
    def concat(glue, items):
      return glue.join(sorted(items))
    with TestPipeline() as p:
      main_input = p | beam.Create(['a', 'bb', 'c'])
      side_input = p | 'side' >> beam.Create(['x', 'y', 'z'])
      result = main_input | beam.Map(concat, pvalue.AsIter(side_input))
      assert_that(result, equal_to(['xayaz', 'xbbybbz', 'xcycz']))

    bad_side_input = p | 'bad_side' >> beam.Create([1, 2, 3])
    with self.assertRaises(typehints.TypeCheckError):
      main_input | 'fail' >> beam.Map(concat, pvalue.AsIter(bad_side_input))
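The test above passes a side input with pvalue.AsIter and declares it as typing.Iterable[str] via typehints.with_input_types, so building the 'fail' step with an integer side input raises a TypeCheckError at construction time. For reference, here is a minimal sketch (not part of the example; names such as describe are illustrative) of the common side-input views, AsSingleton, AsIter and AsDict, passed as extra arguments to beam.Map:

import apache_beam as beam
from apache_beam import pvalue

with beam.Pipeline() as p:
  words = p | 'Words' >> beam.Create(['a', 'bb', 'c'])
  glue = p | 'Glue' >> beam.Create(['-'])  # exactly one element
  suffixes = p | 'Suffixes' >> beam.Create(['x', 'y', 'z'])
  lookup = p | 'Lookup' >> beam.Create([('a', 1), ('bb', 2), ('c', 3)])

  def describe(word, glue_char, all_suffixes, counts):
    # glue_char: the single element (AsSingleton)
    # all_suffixes: a lazy iterable over the side PCollection (AsIter)
    # counts: a dict built from (key, value) pairs (AsDict)
    return '%s%s%s (count=%d)' % (word, glue_char,
                                  glue_char.join(sorted(all_suffixes)),
                                  counts.get(word, 0))

  _ = (words
       | beam.Map(describe,
                  pvalue.AsSingleton(glue),
                  pvalue.AsIter(suffixes),
                  pvalue.AsDict(lookup))
       | beam.Map(print))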
Example #2
    def expand(self, pcoll):
        # A one-element PCollection, so _success_write runs exactly once.
        do_once = pcoll.pipeline | 'DoOnceSuccess' >> core.Create([None])
        main_write_result = pcoll | 'MainWrite' >> Write(self.sink)

        # Passing the write result as an AsIter side input defers the success
        # write until the main write has finished.
        return (do_once
                | 'SuccessWrite' >> core.FlatMap(
                    self._success_write, pvalue.AsIter(main_write_result)))
Example #3
    def _load_data(self, partitions_using_temp_tables,
                   partitions_direct_to_destination, load_job_name_pcv,
                   singleton_pc):
        """Load data to BigQuery

    Data is loaded into BigQuery in the following two ways:
      1. Single partition:
         When there is a single partition of files destined to a single
         destination, a single load job is triggered.
      2. Multiple partitions and/or Dynamic Destinations:
         When there are multiple partitions of files destined for a single
         destination or when Dynamic Destinations are used, multiple load jobs
         need to be triggered for each partition/destination. Load Jobs are
         triggered to temporary tables, and those are later copied to the actual
         appropriate destination table. This ensures atomicity when only some
         of the load jobs would fail but not other. If any of them fails, then
         copy jobs are not triggered.
    """
        # Load data using temp tables
        trigger_loads_outputs = (
            partitions_using_temp_tables
            | "TriggerLoadJobsWithTempTables" >> beam.ParDo(
                TriggerLoadJobs(
                    schema=self.schema,
                    write_disposition=self.write_disposition,
                    create_disposition=self.create_disposition,
                    test_client=self.test_client,
                    temporary_tables=True,
                    additional_bq_parameters=self.additional_bq_parameters),
                load_job_name_pcv, *self.schema_side_inputs).with_outputs(
                    TriggerLoadJobs.TEMP_TABLES, main='main'))

        temp_tables_load_job_ids_pc = trigger_loads_outputs['main']
        temp_tables_pc = trigger_loads_outputs[TriggerLoadJobs.TEMP_TABLES]

        destination_copy_job_ids_pc = (
            singleton_pc
            | "WaitForTempTableLoadJobs" >> beam.ParDo(
                WaitForBQJobs(self.test_client),
                beam.pvalue.AsList(temp_tables_load_job_ids_pc))
            | beam.ParDo(
                TriggerCopyJobs(create_disposition=self.create_disposition,
                                write_disposition=self.write_disposition,
                                test_client=self.test_client),
                load_job_name_pcv))

        finished_copy_jobs_pc = (
            singleton_pc
            | "WaitForCopyJobs" >> beam.ParDo(
                WaitForBQJobs(self.test_client),
                beam.pvalue.AsList(destination_copy_job_ids_pc)))

        _ = (
            finished_copy_jobs_pc
            | "RemoveTempTables/PassTables" >> beam.FlatMap(
                lambda x, deleting_tables: deleting_tables,
                pvalue.AsIter(temp_tables_pc))
            |
            "RemoveTempTables/AddUselessValue" >> beam.Map(lambda x: (x, None))
            | "RemoveTempTables/DeduplicateTables" >> beam.GroupByKey()
            | "RemoveTempTables/GetTableNames" >> beam.Map(lambda elm: elm[0])
            | "RemoveTempTables/Delete" >> beam.ParDo(
                DeleteTablesFn(self.test_client)))

        # Load data directly to destination table
        destination_load_job_ids_pc = (
            partitions_direct_to_destination
            | "TriggerLoadJobsWithoutTempTables" >> beam.ParDo(
                TriggerLoadJobs(
                    schema=self.schema,
                    write_disposition=self.write_disposition,
                    create_disposition=self.create_disposition,
                    test_client=self.test_client,
                    temporary_tables=False,
                    additional_bq_parameters=self.additional_bq_parameters),
                load_job_name_pcv, *self.schema_side_inputs))

        _ = (singleton_pc
             | "WaitForDestinationLoadJobs" >> beam.ParDo(
                 WaitForBQJobs(self.test_client),
                 beam.pvalue.AsList(destination_load_job_ids_pc)))

        destination_load_job_ids_pc = (
            (temp_tables_load_job_ids_pc, destination_load_job_ids_pc)
            | beam.Flatten())

        return destination_load_job_ids_pc, destination_copy_job_ids_pc
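The RemoveTempTables branch above illustrates a second use of AsIter: because a side input is only ready once the PCollection it views is fully computed, re-emitting the temp table names from a FlatMap over finished_copy_jobs_pc guarantees that deletion starts only after the copy jobs are done. A stripped-down sketch of that gating pattern, with hypothetical names (done_signal, cleanup_targets, delete_one), assuming a bounded pipeline in the global window:

import apache_beam as beam
from apache_beam import pvalue


def delete_one(name):
    # Hypothetical cleanup action; the real pipeline uses DeleteTablesFn.
    print('deleting %s' % name)


with beam.Pipeline() as p:
    done_signal = p | 'Done' >> beam.Create(['copy-jobs-finished'])
    cleanup_targets = p | 'Targets' >> beam.Create(
        ['tmp_table_1', 'tmp_table_2', 'tmp_table_1'])

    _ = (done_signal
         # The lambda ignores the "done" element and re-emits the side input;
         # it cannot run before cleanup_targets (and the main input) are ready.
         | 'PassTables' >> beam.FlatMap(lambda _, tables: tables,
                                        pvalue.AsIter(cleanup_targets))
         # Deduplicate, mirroring AddUselessValue + DeduplicateTables above.
         | 'AddUselessValue' >> beam.Map(lambda x: (x, None))
         | 'DeduplicateTables' >> beam.GroupByKey()
         | 'GetTableNames' >> beam.Map(lambda kv: kv[0])
         | 'Delete' >> beam.Map(delete_one))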
Example #4
def run(argv=None):
    """Main function.

    Main function containing the Apache Beam pipeline describing how to process
    the input CSV file to generate the LTV predictions.
    """
    parser = argparse.ArgumentParser()
    _, pipeline_args = parser.parse_known_args(argv)
    options = pipeline_options.PipelineOptions(pipeline_args)
    runtime_options = options.view_as(RuntimeOptions)

    with beam.Pipeline(options=options) as pipeline:
        options = (pipeline
                   | 'Create single element Stream containing options dict' >>
                   beam.Create([options.get_all_options()])
                   | beam.Map(
                       lambda x: {
                           k: v.get() if isinstance(
                               v, value_provider.ValueProvider) else v
                           for (k, v) in x.items()
                       })
                   | beam.Map(c.set_extra_options))

        full_elog = (
            pipeline
            | beam.io.ReadFromText(getattr(runtime_options,
                                           c._OPTION_INPUT_CSV),
                                   skip_header_lines=1)
            | beam.Map(lambda x: list(csv.reader([x]))[0])
            | beam.FlatMap(
                c.csv_line_to_list,
                pvalue.AsSingleton(options))  # (customer_id, date_str, date,
            #  sales, extra_dimension?)
        )

        full_elog_merged = (
            full_elog
            | beam.Filter(lambda x: x[3] > 0)  # sales > 0
            | beam.Map(lambda x: ((x[0], x[1]), x))  # key: (customer_id, date)
            | 'Group full elog by customer and date' >> beam.GroupByKey()
            | beam.Map(c.merge_full_elog_by_customer_and_date)  # (customer_id,
            #  date_str, date,
            #  sales)
        )

        min_max_dates = (
            full_elog_merged
            | beam.Map(lambda x: x[2])  # date
            | beam.CombineGlobally(c.MinMaxDatesFn())
            | beam.Map(c.min_max_dates_dict))

        limits_dates = (min_max_dates
                        | beam.FlatMap(c.limit_dates_boundaries,
                                       pvalue.AsSingleton(options)))

        cohort = (full_elog_merged
                  | beam.FlatMap(c.filter_customers_in_cohort,
                                 pvalue.AsSingleton(limits_dates))
                  | 'Distinct Customer IDs in Cohort' >> util.Distinct())

        cohort_count = (
            cohort
            | 'Count cohort entries' >> beam.combiners.Count.Globally())

        cohort_set = (cohort | beam.Map(lambda x: (x, 1)))

        all_customer_ids = (
            full_elog_merged
            | beam.Map(lambda x: x[0])  # key: customer_id
            | 'Distinct all Customer IDs' >> util.Distinct())

        all_customer_ids_count = (
            all_customer_ids
            | 'Count all customers' >> beam.combiners.Count.Globally())

        num_customers = (
            pipeline
            | 'Create single elem Stream I' >> beam.Create([1])
            | beam.FlatMap(c.count_customers, pvalue.AsSingleton(cohort_count),
                           pvalue.AsSingleton(all_customer_ids_count),
                           pvalue.AsSingleton(options)))

        cal_hol_elog = (full_elog_merged
                        | beam.FlatMap(c.filter_cohort_records_in_cal_hol,
                                       pvalue.AsDict(cohort_set),
                                       pvalue.AsSingleton(limits_dates)))

        cal_hol_elog_count = (
            cal_hol_elog
            | 'Count cal hol elog entries' >> beam.combiners.Count.Globally())

        calibration = (cal_hol_elog
                       | beam.FlatMap(c.filter_records_in_calibration,
                                      pvalue.AsSingleton(limits_dates)))

        num_txns_total = (
            full_elog_merged
            | beam.FlatMap(c.filter_records_in_cal_hol,
                           pvalue.AsSingleton(limits_dates))
            | 'Count num txns total' >> beam.combiners.Count.Globally())

        num_txns = (pipeline
                    | 'Create single elem Stream II' >> beam.Create([1])
                    | beam.FlatMap(c.count_txns,
                                   pvalue.AsSingleton(cal_hol_elog_count),
                                   pvalue.AsSingleton(num_txns_total),
                                   pvalue.AsSingleton(options)))

        calcbs = (
            calibration
            | beam.Map(lambda x: (x[0], x))
            | 'Group calibration elog by customer id' >> beam.GroupByKey()
            | beam.FlatMap(
                c.create_cal_cbs, pvalue.AsSingleton(options),
                pvalue.AsSingleton(limits_dates)
            )  # (customer_id, number_of_transactions, average_order_value,
            #  frequency, recency, total_time_observed)
        )

        first_transaction_dates_by_customer = (
            cal_hol_elog
            | beam.Map(lambda x: (x[0], x))  # customer_id
            | 'Group cal hol elog by customer id' >> beam.GroupByKey()
            | beam.Map(lambda x: (x[0], min(map(operator.itemgetter(2), x[1])))
                       )  # item 2 -> date
        )

        cal_hol_elog_repeat = (
            cal_hol_elog
            | beam.FlatMap(c.filter_first_transaction_date_records,
                           pvalue.AsDict(first_transaction_dates_by_customer))
            | beam.FlatMap(
                c.calculate_time_unit_numbers,  # (customer_id, date,
                #  time_unit_number)
                pvalue.AsSingleton(options),
                pvalue.AsSingleton(limits_dates))
            | beam.Map(lambda x: (x[2], 1))  # key: time_unit_number
            | 'Group cal hol elog repeat by time unit number' >>
            beam.GroupByKey()
            | beam.Map(lambda x:
                       (x[0], sum(x[1])))  # (time_unit_number, occurrences)
        )

        repeat_tx = (
            pipeline
            | 'Create single elem Stream III' >> beam.Create([1])
            | beam.FlatMap(c.calculate_cumulative_repeat_transactions,
                           pvalue.AsIter(cal_hol_elog_repeat)
                           )  # (time_unit_number, repeat_transactions,
            #  repeat_transactions_cumulative)
        )

        model_validation = (
            pipeline
            | 'Create single elem Stream IV' >> beam.Create([1])
            | beam.FlatMap(
                c.calculate_model_fit_validation, pvalue.AsSingleton(options),
                pvalue.AsSingleton(limits_dates), pvalue.AsIter(calcbs),
                pvalue.AsIter(repeat_tx), pvalue.AsSingleton(num_customers),
                pvalue.AsSingleton(num_txns)))

        _ = (model_validation | beam.Map(c.raise_error_if_invalid_mape))

        _ = (model_validation
             | beam.Map(lambda x: x[0])
             | beam.FlatMap(c.calculate_model_fit_validation_to_text,
                            pvalue.AsSingleton(options)))

        fullcbs_without_extra_dimension = (
            full_elog_merged
            | beam.Map(lambda x: (x[0], x))  # key: customer_id
            | 'Group full merged elog by customer id' >> beam.GroupByKey()
            | beam.FlatMap(
                c.create_fullcbs, pvalue.AsSingleton(options),
                pvalue.AsSingleton(min_max_dates)
            )  # (customer_id, number_of_transactions, historical_aov,
            #  frequency, recency, total_time_observed)
        )

        full_elog_if_extra_dimension = (
            full_elog
            | 'Discard records if no extra dimension' >> beam.FlatMap(
                c.discard_if_no_extra_dimension, pvalue.AsSingleton(options)))

        extra_dimensions_stats = (
            full_elog_if_extra_dimension
            | beam.Map(lambda x: (
                (x[0], x[4]), x))  # key: (customer_id, extra_dimension)
            | 'Group full elog by customer id and extra dimension' >>
            beam.GroupByKey()
            | beam.Map(
                c.create_extra_dimensions_stats
            )  # (customer_id, extra_dimension, dimension_count, tot_sales,
            #  max_dimension_date)
        )

        top_dimension_per_customer = (
            extra_dimensions_stats
            | beam.Map(lambda x: (x[0], x))  # customer_id
            |
            'Group extra dimension stats by customer id' >> beam.GroupByKey()
            | beam.Map(
                c.extract_top_extra_dimension
            )  # (customer_id, extra_dimension, dimension_count, tot_sales,
            #  max_dimension_date)
        )

        customer_dimension_map = (
            top_dimension_per_customer
            | beam.Map(lambda x:
                       (x[0], x[1]))  # (customer_id, extra_dimension)
        )

        fullcbs = (
            fullcbs_without_extra_dimension
            | beam.FlatMap(
                c.add_top_extra_dimension_to_fullcbs,
                pvalue.AsSingleton(options),
                pvalue.AsDict(customer_dimension_map)
            )  # (customer_id, number_of_transactions, historical_aov,
            #  frequency, recency, total_time_observed,
            #  extra_dimension?)
        )

        prediction = (
            pipeline
            | 'Create single elem Stream V' >> beam.Create([1])
            | beam.FlatMap(
                c.calculate_prediction, pvalue.AsSingleton(options),
                pvalue.AsIter(fullcbs), pvalue.AsSingleton(num_customers),
                pvalue.AsSingleton(num_txns)
            )  # [customer_id, p_alive, predicted_purchases, future_aov,
            #  historical_aov, expected_value, frequency, recency,
            #  total_time_observed, extra_dimension?], prediction_params
        )

        prediction_by_customer_no_segments = (
            prediction
            | beam.FlatMap(lambda x: x[0])  # Extract predictions by customer
        )

        _ = (
            prediction
            | beam.Map(lambda x: x[1])  # Extract prediction params
            | beam.FlatMap(c.calculate_prediction_to_text,
                           pvalue.AsSingleton(options)))

        num_rows = (full_elog_merged
                    | 'Count num rows in full elog merged' >>
                    beam.combiners.Count.Globally())

        segment_predictions_exact = (
            pipeline
            | 'Create single elem Stream VII' >> beam.Create([1])
            | beam.FlatMap(
                lambda _, rows_count:
                [rows_count <= c._SEGMENT_PREDICTION_THRESHOLD],
                pvalue.AsSingleton(num_rows)))

        sharded_cust_predictions_no_segments_exact, \
            sharded_cust_predictions_no_segments_hash = (
                prediction_by_customer_no_segments
                | beam.FlatMap(
                    c.prediction_sharded,
                    pvalue.AsSingleton(options),
                    pvalue.AsSingleton(segment_predictions_exact)
                )  # [customer_id, p_alive, predicted_purchases, future_aov,
                   #  historical_aov, expected_value, frequency, recency,
                   #  total_time_observed, extra_dimension?]
                | beam.Partition(lambda x, _: 0 if x[1] else 1, 2)
            )

        # BEGIN of "exact" branch
        prediction_by_customer_exact = (
            pipeline
            | 'Create single elem Stream VIII' >> beam.Create([1])
            | beam.FlatMap(
                c.split_in_ntiles_exact, pvalue.AsSingleton(options),
                pvalue.AsIter(sharded_cust_predictions_no_segments_exact
                              ))  # [customer_id, p_alive, predicted_purchases,
            #  future_aov, historical_aov, expected_value,
            #  frequency, recency, total_time_observed,
            #  segment, extra_dimension?]
        )
        # END of "exact" branch

        # BEGIN of "hash" branch
        customer_count_by_expected_value = (
            sharded_cust_predictions_no_segments_hash
            | beam.Map(lambda x: (x[0][5], 1))  # (expected_value, 1)
            | 'Group customer predictions by expected value' >>
            beam.GroupByKey()
            | beam.Map(lambda x:
                       (x[0], sum(x[1])))  # expected_value, customers_count
        )

        hash_segment_limits = (
            pipeline
            | 'Create single elem Stream IX' >> beam.Create([1])
            | beam.FlatMap(c.expected_values_segment_limits,
                           pvalue.AsSingleton(options),
                           pvalue.AsIter(customer_count_by_expected_value),
                           pvalue.AsSingleton(all_customer_ids_count)))

        prediction_by_customer_hash = (
            sharded_cust_predictions_no_segments_hash
            | beam.Map(lambda x: x[0])
            | beam.FlatMap(c.split_in_ntiles_hash,
                           pvalue.AsSingleton(hash_segment_limits)
                           )  # [customer_id, p_alive, predicted_purchases,
            #  future_aov, historical_aov, expected_value,
            #  frequency, recency, total_time_observed,
            #  segment, extra_dimension?]
        )
        # END of "hash" branch

        prediction_by_customer = (
            # only one of these two streams will contain values
            (prediction_by_customer_exact, prediction_by_customer_hash)
            | beam.Flatten())

        _ = (prediction_by_customer
             | beam.FlatMap(
                 lambda x, opts: [x + ['']]
                 if not opts[c._OPTION_EXTRA_DIMENSION_EXISTS] else [x],
                 pvalue.AsSingleton(options))
             | 'prediction_by_customer to CSV line' >> beam.Map(
                 c.list_to_csv_line)
             | 'Write prediction_by_customer' >> beam.io.WriteToText(
                 getattr(runtime_options, c._OPTION_OUTPUT_FOLDER),
                 header='customer_id,p_alive'
                 ',predicted_purchases'
                 ',future_aov,historical_aov'
                 ',expected_value,frequency,recency'
                 ',total_time_observed,segment'
                 ',extra_dimension',
                 shard_name_template='',
                 num_shards=1,
                 file_name_suffix='prediction_by_customer.csv'))

        prediction_summary_temp = (
            prediction_by_customer
            | beam.Map(lambda x: (x[9], x))  # key: segment
            | 'Group customer predictions by segment' >> beam.GroupByKey()
            | beam.FlatMap(
                c.generate_prediction_summary, pvalue.AsSingleton(
                    options))  # (segment, average_retention_probability,
            #  average_predicted_customer_value,
            #  average_predicted_order_value,
            #  average_predicted_purchases, total_customer_value,
            #  number_of_customers)
        )

        tot_equity = (
            prediction_summary_temp
            | beam.Map(lambda x: x[5])  # total_customer_value
            | beam.CombineGlobally(sum))

        prediction_summary = (
            prediction_summary_temp
            | beam.FlatMap(
                c.calculate_perc_of_total_customer_value,
                pvalue.AsSingleton(tot_equity), pvalue.AsSingleton(
                    options))  # (segment, average_retention_probability,
            #  average_predicted_customer_value,
            #  average_predicted_order_value,
            #  average_predicted_purchases,
            #  total_customer_value, number_of_customers,
            #  perc_of_total_customer_value)
        )

        _ = (prediction_summary
             | 'prediction_summary to CSV line' >> beam.Map(c.list_to_csv_line)
             | 'Write prediction_summary' >> beam.io.WriteToText(
                 getattr(runtime_options, c._OPTION_OUTPUT_FOLDER),
                 header='segment,average_retention_probability'
                 ',average_predicted_customer_value'
                 ',average_predicted_order_value,average_predicted_purchases'
                 ',total_customer_value,number_of_customers'
                 ',perc_of_total_customer_value',
                 shard_name_template='',
                 num_shards=1,
                 file_name_suffix='prediction_summary.csv'))

        prediction_summary_extra_dimension = (
            prediction_by_customer
            | 'Discard prediction if there is not extra dimension' >>
            beam.FlatMap(c.discard_if_no_extra_dimension,
                         pvalue.AsSingleton(options))
            | beam.Map(lambda x: (x[10], x))  # extra dimension
            | 'Group customer predictions by extra dimension' >>
            beam.GroupByKey()
            | beam.FlatMap(c.generate_prediction_summary_extra_dimension,
                           pvalue.AsSingleton(tot_equity),
                           pvalue.AsSingleton(options)))

        _ = (prediction_summary_extra_dimension
             | 'prediction_summary_extra_dimension to CSV line' >> beam.Map(
                 c.list_to_csv_line)
             |
             'Write prediction_summary_extra_dimension' >> beam.io.WriteToText(
                 getattr(runtime_options, c._OPTION_OUTPUT_FOLDER),
                 header='extra_dimension,average_retention_probability'
                 ',average_predicted_customer_value'
                 ',average_predicted_order_value'
                 ',average_predicted_purchases,total_customer_value'
                 ',number_of_customers,perc_of_total_customer_value',
                 shard_name_template='',
                 num_shards=1,
                 file_name_suffix='prediction_summary_extra_dimension.csv'))
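Both run() variants (this one and the BigQuery variant that follows) begin by turning the pipeline options into a one-element PCollection, resolving any ValueProvider at execution time, and then threading the resulting dict through the rest of the pipeline as pvalue.AsSingleton(options). A reduced sketch of that configuration-as-side-input pattern; the 'runner' key is just an arbitrary entry of get_all_options(), and the real code additionally applies c.set_extra_options:

import apache_beam as beam
from apache_beam import pvalue
from apache_beam.options import pipeline_options, value_provider

opts = pipeline_options.PipelineOptions(['--runner=DirectRunner'])

with beam.Pipeline(options=opts) as p:
    # One-element PCollection holding the options dict, with ValueProviders
    # resolved to plain values at pipeline execution time.
    options_pc = (
        p
        | 'OptionsDict' >> beam.Create([opts.get_all_options()])
        | 'ResolveValueProviders' >> beam.Map(
            lambda d: {
                k: v.get() if isinstance(v, value_provider.ValueProvider) else v
                for k, v in d.items()
            }))

    # Any downstream step can read configuration via a singleton side input.
    _ = (p
         | 'Data' >> beam.Create([1, 2, 3])
         | beam.Map(lambda x, o: (x, o['runner']),
                    pvalue.AsSingleton(options_pc))
         | beam.Map(print))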
Example #5
def run(argv=None):
    """Main function.

    Main function containing the Apache Beam pipeline describing how to process
    the input CSV file to generate the LTV predictions.
    """
    parser = argparse.ArgumentParser()
    _, pipeline_args = parser.parse_known_args(argv)
    options = pipeline_options.PipelineOptions(pipeline_args)
    runtime_options = options.view_as(RuntimeOptions)

    with beam.Pipeline(options=options) as pipeline:
        options = (pipeline
                   | 'Create single element Stream containing options dict' >>
                   beam.Create([options.get_all_options()])
                   | beam.Map(
                       lambda x: {
                           k: v.get() if isinstance(
                               v, value_provider.ValueProvider) else v
                           for (k, v) in x.items()
                       })
                   | beam.Map(c.set_extra_options))

        full_elog = (
            pipeline
            | bq_mod.ReadFromBigQuery(
                project=getattr(runtime_options, c._OPTION_INPUT_BQ_PROJECT),
                query=getattr(runtime_options, c._OPTION_INPUT_BQ_QUERY),
                gcs_location=getattr(runtime_options,
                                     c._OPTION_TEMP_GCS_LOCATION),
                use_standard_sql=True)
            | beam.FlatMap(
                c.bq_row_to_list,
                pvalue.AsSingleton(options))  # (customer_id, date_str, date,
            #  sales, extra_dimension?)
        )

        full_elog_merged = (
            full_elog
            | beam.Filter(lambda x: x[3] > 0)  # sales > 0
            | beam.Map(lambda x: ((x[0], x[1]), x))  # key: (customer_id, date)
            | 'Group full elog by customer and date' >> beam.GroupByKey()
            | beam.Map(c.merge_full_elog_by_customer_and_date)  # (customer_id,
            #  date_str, date,
            #  sales)
        )

        min_max_dates = (
            full_elog_merged
            | beam.Map(lambda x: x[2])  # date
            | beam.CombineGlobally(c.MinMaxDatesFn())
            | beam.Map(c.min_max_dates_dict))

        limits_dates = (min_max_dates
                        | beam.FlatMap(c.limit_dates_boundaries,
                                       pvalue.AsSingleton(options)))

        cohort = (full_elog_merged
                  | beam.FlatMap(c.filter_customers_in_cohort,
                                 pvalue.AsSingleton(limits_dates))
                  | 'Distinct Customer IDs in Cohort' >> util.Distinct())

        cohort_count = (
            cohort
            | 'Count cohort entries' >> beam.combiners.Count.Globally())

        cohort_set = (cohort | beam.Map(lambda x: (x, 1)))

        all_customer_ids = (
            full_elog_merged
            | beam.Map(lambda x: x[0])  # key: customer_id
            | 'Distinct all Customer IDs' >> util.Distinct())

        all_customer_ids_count = (
            all_customer_ids
            | 'Count all customers' >> beam.combiners.Count.Globally())

        num_customers = (
            pipeline
            | 'Create single elem Stream I' >> beam.Create([1])
            | beam.FlatMap(c.count_customers, pvalue.AsSingleton(cohort_count),
                           pvalue.AsSingleton(all_customer_ids_count),
                           pvalue.AsSingleton(options)))

        cal_hol_elog = (full_elog_merged
                        | beam.FlatMap(c.filter_cohort_records_in_cal_hol,
                                       pvalue.AsDict(cohort_set),
                                       pvalue.AsSingleton(limits_dates)))

        cal_hol_elog_count = (
            cal_hol_elog
            | 'Count cal hol elog entries' >> beam.combiners.Count.Globally())

        calibration = (cal_hol_elog
                       | beam.FlatMap(c.filter_records_in_calibration,
                                      pvalue.AsSingleton(limits_dates)))

        num_txns_total = (
            full_elog_merged
            | beam.FlatMap(c.filter_records_in_cal_hol,
                           pvalue.AsSingleton(limits_dates))
            | 'Count num txns total' >> beam.combiners.Count.Globally())

        num_txns = (pipeline
                    | 'Create single elem Stream II' >> beam.Create([1])
                    | beam.FlatMap(c.count_txns,
                                   pvalue.AsSingleton(cal_hol_elog_count),
                                   pvalue.AsSingleton(num_txns_total),
                                   pvalue.AsSingleton(options)))

        calcbs = (
            calibration
            | beam.Map(lambda x: (x[0], x))
            | 'Group calibration elog by customer id' >> beam.GroupByKey()
            | beam.FlatMap(
                c.create_cal_cbs, pvalue.AsSingleton(options),
                pvalue.AsSingleton(limits_dates)
            )  # (customer_id, number_of_transactions, average_order_value,
            #  frequency, recency, total_time_observed)
        )

        first_transaction_dates_by_customer = (
            cal_hol_elog
            | beam.Map(lambda x: (x[0], x))  # customer_id
            | 'Group cal hol elog by customer id' >> beam.GroupByKey()
            | beam.Map(lambda x: (x[0], min(map(operator.itemgetter(2), x[1])))
                       )  # item 2 -> date
        )

        cal_hol_elog_repeat = (
            cal_hol_elog
            | beam.FlatMap(c.filter_first_transaction_date_records,
                           pvalue.AsDict(first_transaction_dates_by_customer))
            | beam.FlatMap(
                c.calculate_time_unit_numbers,  # (customer_id, date,
                #  time_unit_number)
                pvalue.AsSingleton(options),
                pvalue.AsSingleton(limits_dates))
            | beam.Map(lambda x: (x[2], 1))  # key: time_unit_number
            | 'Group cal hol elog repeat by time unit number' >>
            beam.GroupByKey()
            | beam.Map(lambda x:
                       (x[0], sum(x[1])))  # (time_unit_number, occurrences)
        )

        repeat_tx = (
            pipeline
            | 'Create single elem Stream III' >> beam.Create([1])
            | beam.FlatMap(c.calculate_cumulative_repeat_transactions,
                           pvalue.AsIter(cal_hol_elog_repeat)
                           )  # (time_unit_number, repeat_transactions,
            #  repeat_transactions_cumulative)
        )

        model_validation = (
            pipeline
            | 'Create single elem Stream IV' >> beam.Create([1])
            | beam.FlatMap(
                c.calculate_model_fit_validation, pvalue.AsSingleton(options),
                pvalue.AsSingleton(limits_dates), pvalue.AsIter(calcbs),
                pvalue.AsIter(repeat_tx), pvalue.AsSingleton(num_customers),
                pvalue.AsSingleton(num_txns)))

        _ = (model_validation | beam.Map(c.raise_error_if_invalid_mape))

        _ = (model_validation
             | beam.Map(lambda x: x[0])
             | 'Write to validation_params table' >> io.WriteToBigQuery(
                 table=c.TableValueProvider(
                     getattr(runtime_options, c._OPTION_OUTPUT_BQ_PROJECT),
                     getattr(runtime_options, c._OPTION_OUTPUT_BQ_DATASET),
                     'validation_params'),
                 custom_gcs_temp_location=getattr(runtime_options,
                                                  c._OPTION_TEMP_GCS_LOCATION),
                 validate=False,
                 schema={
                     'fields': [{
                         'name': 'calibration_start_date',
                         'type': 'STRING'
                     }, {
                         'name': 'calibration_end_date',
                         'type': 'STRING'
                     }, {
                         'name': 'cohort_end_date',
                         'type': 'STRING'
                     }, {
                         'name': 'holdout_end_date',
                         'type': 'STRING'
                     }, {
                         'name': 'model_time_granularity',
                         'type': 'STRING'
                     }, {
                         'name':
                         'model',
                         'type':
                         'RECORD',
                         'fields': [
                             {
                                 'name': 'frequency_model',
                                 'type': 'STRING'
                             },
                             {
                                 'name': 'num_customers_cohort',
                                 'type': 'INTEGER'
                             },
                             {
                                 'name': 'perc_customers_cohort',
                                 'type': 'FLOAT'
                             },
                             {
                                 'name': 'num_transactions_validation',
                                 'type': 'INTEGER'
                             },
                             {
                                 'name': 'perc_transactions_validation',
                                 'type': 'FLOAT'
                             },
                             {
                                 'name': 'mape',
                                 'type': 'FLOAT'
                             },
                         ]
                     }]
                 },
                 write_disposition=io.BigQueryDisposition.WRITE_TRUNCATE,
                 create_disposition=io.BigQueryDisposition.CREATE_IF_NEEDED))

        fullcbs_without_extra_dimension = (
            full_elog_merged
            | beam.Map(lambda x: (x[0], x))  # key: customer_id
            | 'Group full merged elog by customer id' >> beam.GroupByKey()
            | beam.FlatMap(
                c.create_fullcbs, pvalue.AsSingleton(options),
                pvalue.AsSingleton(min_max_dates)
            )  # (customer_id, number_of_transactions, historical_aov,
            #  frequency, recency, total_time_observed)
        )

        full_elog_if_extra_dimension = (
            full_elog
            | 'Discard records if no extra dimension' >> beam.FlatMap(
                c.discard_if_no_extra_dimension, pvalue.AsSingleton(options)))

        extra_dimensions_stats = (
            full_elog_if_extra_dimension
            | beam.Map(lambda x: (
                (x[0], x[4]), x))  # key: (customer_id, extra_dimension)
            | 'Group full elog by customer id and extra dimension' >>
            beam.GroupByKey()
            | beam.Map(
                c.create_extra_dimensions_stats
            )  # (customer_id, extra_dimension, dimension_count, tot_sales,
            #  max_dimension_date)
        )

        top_dimension_per_customer = (
            extra_dimensions_stats
            | beam.Map(lambda x: (x[0], x))  # customer_id
            |
            'Group extra dimension stats by customer id' >> beam.GroupByKey()
            | beam.Map(
                c.extract_top_extra_dimension
            )  # (customer_id, extra_dimension, dimension_count, tot_sales,
            #  max_dimension_date)
        )

        customer_dimension_map = (
            top_dimension_per_customer
            | beam.Map(lambda x:
                       (x[0], x[1]))  # (customer_id, extra_dimension)
        )

        fullcbs = (
            fullcbs_without_extra_dimension
            | beam.FlatMap(
                c.add_top_extra_dimension_to_fullcbs,
                pvalue.AsSingleton(options),
                pvalue.AsDict(customer_dimension_map)
            )  # (customer_id, number_of_transactions, historical_aov,
            #  frequency, recency, total_time_observed,
            #  extra_dimension?)
        )

        prediction = (
            pipeline
            | 'Create single elem Stream V' >> beam.Create([1])
            | beam.FlatMap(
                c.calculate_prediction, pvalue.AsSingleton(options),
                pvalue.AsIter(fullcbs), pvalue.AsSingleton(num_customers),
                pvalue.AsSingleton(num_txns)
            )  # [customer_id, p_alive, predicted_purchases, future_aov,
            #  historical_aov, expected_value, frequency, recency,
            #  total_time_observed, extra_dimension?], prediction_params
        )

        prediction_by_customer_no_segments = (
            prediction
            | beam.FlatMap(lambda x: x[0])  # Extract predictions by customer
        )

        _ = (
            prediction
            | beam.Map(lambda x: x[1])  # Extract prediction params
            | 'Write to prediction_params table' >> io.WriteToBigQuery(
                table=c.TableValueProvider(
                    getattr(runtime_options, c._OPTION_OUTPUT_BQ_PROJECT),
                    getattr(runtime_options, c._OPTION_OUTPUT_BQ_DATASET),
                    'prediction_params'),
                custom_gcs_temp_location=getattr(runtime_options,
                                                 c._OPTION_TEMP_GCS_LOCATION),
                validate=False,
                schema={
                    'fields': [{
                        'name': 'prediction_period',
                        'type': 'INTEGER'
                    }, {
                        'name': 'prediction_period_unit',
                        'type': 'STRING'
                    }, {
                        'name': 'model_time_granularity',
                        'type': 'STRING'
                    }, {
                        'name': 'customers_modeled',
                        'type': 'INTEGER'
                    }, {
                        'name': 'transactions_observed',
                        'type': 'INTEGER'
                    }, {
                        'name': 'frequency_model',
                        'type': 'STRING'
                    }, {
                        'name':
                        'bgnbd_model_params',
                        'type':
                        'RECORD',
                        'fields': [{
                            'name': 'a',
                            'type': 'FLOAT'
                        }, {
                            'name': 'b',
                            'type': 'FLOAT'
                        }, {
                            'name': 'r',
                            'type': 'FLOAT'
                        }, {
                            'name': 'alpha',
                            'type': 'FLOAT'
                        }]
                    }, {
                        'name':
                        'paretonbd_model_params',
                        'type':
                        'RECORD',
                        'fields': [{
                            'name': 'r',
                            'type': 'FLOAT'
                        }, {
                            'name': 's',
                            'type': 'FLOAT'
                        }, {
                            'name': 'alpha',
                            'type': 'FLOAT'
                        }, {
                            'name': 'beta',
                            'type': 'FLOAT'
                        }]
                    }, {
                        'name':
                        'gamma_gamma_params',
                        'type':
                        'RECORD',
                        'fields': [{
                            'name': 'p',
                            'type': 'FLOAT'
                        }, {
                            'name': 'q',
                            'type': 'FLOAT'
                        }, {
                            'name': 'v',
                            'type': 'FLOAT'
                        }]
                    }]
                },
                write_disposition=io.BigQueryDisposition.WRITE_TRUNCATE,
                create_disposition=io.BigQueryDisposition.CREATE_IF_NEEDED))

        num_rows = (full_elog_merged
                    | 'Count num rows in full elog merged' >>
                    beam.combiners.Count.Globally())

        segment_predictions_exact = (
            pipeline
            | 'Create single elem Stream VII' >> beam.Create([1])
            | beam.FlatMap(
                lambda _, rows_count:
                [rows_count <= c._SEGMENT_PREDICTION_THRESHOLD],
                pvalue.AsSingleton(num_rows)))

        sharded_cust_predictions_no_segments_exact, \
            sharded_cust_predictions_no_segments_hash = (
                prediction_by_customer_no_segments
                | beam.FlatMap(
                    c.prediction_sharded,
                    pvalue.AsSingleton(options),
                    pvalue.AsSingleton(segment_predictions_exact)
                )  # [customer_id, p_alive, predicted_purchases, future_aov,
                   #  historical_aov, expected_value, frequency, recency,
                   #  total_time_observed, extra_dimension?]
                | beam.Partition(lambda x, _: 0 if x[1] else 1, 2)
            )

        # BEGIN of "exact" branch
        prediction_by_customer_exact = (
            pipeline
            | 'Create single elem Stream VIII' >> beam.Create([1])
            | beam.FlatMap(
                c.split_in_ntiles_exact, pvalue.AsSingleton(options),
                pvalue.AsIter(sharded_cust_predictions_no_segments_exact
                              ))  # [customer_id, p_alive, predicted_purchases,
            #  future_aov, historical_aov, expected_value,
            #  frequency, recency, total_time_observed,
            #  segment, extra_dimension?]
        )
        # END of "exact" branch

        # BEGIN of "hash" branch
        customer_count_by_expected_value = (
            sharded_cust_predictions_no_segments_hash
            | beam.Map(lambda x: (x[0][5], 1))  # (expected_value, 1)
            | 'Group customer predictions by expected value' >>
            beam.GroupByKey()
            | beam.Map(lambda x:
                       (x[0], sum(x[1])))  # expected_value, customers_count
        )

        hash_segment_limits = (
            pipeline
            | 'Create single elem Stream IX' >> beam.Create([1])
            | beam.FlatMap(c.expected_values_segment_limits,
                           pvalue.AsSingleton(options),
                           pvalue.AsIter(customer_count_by_expected_value),
                           pvalue.AsSingleton(all_customer_ids_count)))

        prediction_by_customer_hash = (
            sharded_cust_predictions_no_segments_hash
            | beam.Map(lambda x: x[0])
            | beam.FlatMap(c.split_in_ntiles_hash,
                           pvalue.AsSingleton(hash_segment_limits)
                           )  # [customer_id, p_alive, predicted_purchases,
            #  future_aov, historical_aov, expected_value,
            #  frequency, recency, total_time_observed,
            #  segment, extra_dimension?]
        )
        # END of "hash" branch

        prediction_by_customer = (
            # only one of these two streams will contain values
            (prediction_by_customer_exact, prediction_by_customer_hash)
            | beam.Flatten()
            | beam.Map(c.clean_nan_and_inf))

        _ = (prediction_by_customer
             | beam.FlatMap(
                 lambda x, opts: [x + ['']]
                 if not opts[c._OPTION_EXTRA_DIMENSION_EXISTS] else [x],
                 pvalue.AsSingleton(options))
             | 'prediction_by_customer to Dict' >>
             beam.Map(c.list_to_dict, [
                 'customer_id', 'p_alive', 'predicted_purchases', 'future_aov',
                 'historical_aov', 'expected_value', 'frequency', 'recency',
                 'total_time_observed', 'segment', 'extra_dimension'
             ])
             | 'Write to prediction_by_customer table' >> io.WriteToBigQuery(
                 table=c.TableValueProvider(
                     getattr(runtime_options, c._OPTION_OUTPUT_BQ_PROJECT),
                     getattr(runtime_options, c._OPTION_OUTPUT_BQ_DATASET),
                     'prediction_by_customer'),
                 custom_gcs_temp_location=getattr(runtime_options,
                                                  c._OPTION_TEMP_GCS_LOCATION),
                 validate=False,
                 schema='customer_id:STRING, p_alive:FLOAT64'
                 ', predicted_purchases:FLOAT64'
                 ', future_aov:FLOAT64, historical_aov:FLOAT64'
                 ', expected_value:FLOAT64, frequency:INT64'
                 ', recency:FLOAT64'
                 ', total_time_observed:FLOAT64, segment:INT64'
                 ', extra_dimension:STRING',
                 write_disposition=io.BigQueryDisposition.WRITE_TRUNCATE,
                 create_disposition=io.BigQueryDisposition.CREATE_IF_NEEDED))

        prediction_summary_temp = (
            prediction_by_customer
            | beam.Map(lambda x: (x[9], x))  # key: segment
            | 'Group customer predictions by segment' >> beam.GroupByKey()
            | beam.FlatMap(
                c.generate_prediction_summary, pvalue.AsSingleton(
                    options))  # (segment, average_retention_probability,
            #  average_predicted_customer_value,
            #  average_predicted_order_value,
            #  average_predicted_purchases, total_customer_value,
            #  number_of_customers)
        )

        tot_equity = (
            prediction_summary_temp
            | beam.Map(lambda x: x[5])  # total_customer_value
            | beam.CombineGlobally(sum))

        prediction_summary = (
            prediction_summary_temp
            | beam.FlatMap(
                c.calculate_perc_of_total_customer_value,
                pvalue.AsSingleton(tot_equity), pvalue.AsSingleton(
                    options))  # (segment, average_retention_probability,
            #  average_predicted_customer_value,
            #  average_predicted_order_value,
            #  average_predicted_purchases,
            #  total_customer_value, number_of_customers,
            #  perc_of_total_customer_value)
        )

        _ = (
            prediction_summary
            | 'prediction_summary to Dict' >> beam.Map(c.list_to_dict, [
                'segment', 'average_retention_probability',
                'average_predicted_customer_value',
                'average_predicted_order_value', 'average_predicted_purchases',
                'total_customer_value', 'number_of_customers',
                'perc_of_total_customer_value'
            ])
            | 'Write to prediction_summary table' >> io.WriteToBigQuery(
                table=c.TableValueProvider(
                    getattr(runtime_options, c._OPTION_OUTPUT_BQ_PROJECT),
                    getattr(runtime_options, c._OPTION_OUTPUT_BQ_DATASET),
                    'prediction_summary'),
                custom_gcs_temp_location=getattr(runtime_options,
                                                 c._OPTION_TEMP_GCS_LOCATION),
                validate=False,
                schema='segment:INT64 ,average_retention_probability:FLOAT64'
                ', average_predicted_customer_value:FLOAT64'
                ', average_predicted_order_value:FLOAT64'
                ', average_predicted_purchases:FLOAT64'
                ', total_customer_value:FLOAT64'
                ', number_of_customers:FLOAT64'
                ', perc_of_total_customer_value:FLOAT64',
                write_disposition=io.BigQueryDisposition.WRITE_TRUNCATE,
                create_disposition=io.BigQueryDisposition.CREATE_IF_NEEDED))

        prediction_summary_extra_dimension = (
            prediction_by_customer
            | 'Discard prediction if there is not extra dimension' >>
            beam.FlatMap(c.discard_if_no_extra_dimension,
                         pvalue.AsSingleton(options))
            | beam.Map(lambda x: (x[10], x))  # extra dimension
            | 'Group customer predictions by extra dimension' >>
            beam.GroupByKey()
            | beam.FlatMap(c.generate_prediction_summary_extra_dimension,
                           pvalue.AsSingleton(tot_equity),
                           pvalue.AsSingleton(options)))

        _ = (prediction_summary_extra_dimension
             | 'prediction_summary_extra_dimension to Dict' >> beam.Map(
                 c.list_to_dict, [
                     'extra_dimension', 'average_retention_probability',
                     'average_predicted_customer_value',
                     'average_predicted_order_value',
                     'average_predicted_purchases', 'total_customer_value',
                     'number_of_customers', 'perc_of_total_customer_value'
                 ])
             | 'Write to prediction_summary_extra_dimension table' >>
             io.WriteToBigQuery(
                 table=c.TableValueProvider(
                     getattr(runtime_options, c._OPTION_OUTPUT_BQ_PROJECT),
                     getattr(runtime_options, c._OPTION_OUTPUT_BQ_DATASET),
                     'prediction_summary_extra_dimension'),
                 custom_gcs_temp_location=getattr(runtime_options,
                                                  c._OPTION_TEMP_GCS_LOCATION),
                 validate=False,
                 schema='extra_dimension:STRING'
                 ', average_retention_probability:FLOAT64'
                 ', average_predicted_customer_value:FLOAT64'
                 ', average_predicted_order_value:FLOAT64'
                 ', average_predicted_purchases:FLOAT64'
                 ', total_customer_value:FLOAT64'
                 ', number_of_customers:INT64'
                 ', perc_of_total_customer_value:FLOAT64',
                 write_disposition=io.BigQueryDisposition.WRITE_TRUNCATE,
                 create_disposition=io.BigQueryDisposition.CREATE_IF_NEEDED))
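Worth noting in the BigQuery variant above: io.WriteToBigQuery accepts the table schema either as a comma-separated 'name:TYPE' string (used for the flat prediction_by_customer and prediction_summary tables) or as a {'fields': [...]} dict, which the example uses when nested RECORD fields are needed (validation_params, prediction_params). A minimal, hypothetical illustration of the two forms; the project, dataset and GCS paths below are placeholders:

import apache_beam as beam
from apache_beam import io

# Flat table: schema as a comma-separated 'name:TYPE' string.
flat_schema = 'customer_id:STRING, expected_value:FLOAT64, segment:INT64'

# Nested table: schema as a dict with a RECORD field.
nested_schema = {
    'fields': [{
        'name': 'model_time_granularity',
        'type': 'STRING'
    }, {
        'name': 'model',
        'type': 'RECORD',
        'fields': [{
            'name': 'frequency_model',
            'type': 'STRING'
        }, {
            'name': 'mape',
            'type': 'FLOAT'
        }]
    }]
}

with beam.Pipeline() as p:
    rows = p | beam.Create([{
        'customer_id': 'c1',
        'expected_value': 12.5,
        'segment': 3
    }])
    _ = rows | 'WriteFlat' >> io.WriteToBigQuery(
        table='my-project:my_dataset.prediction_by_customer',  # placeholder
        schema=flat_schema,  # nested_schema would be passed the same way
        custom_gcs_temp_location='gs://my-bucket/tmp',  # placeholder
        write_disposition=io.BigQueryDisposition.WRITE_TRUNCATE,
        create_disposition=io.BigQueryDisposition.CREATE_IF_NEEDED)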
Example #6
    def expand(self, pcoll):
        p = pcoll.pipeline

        load_job_name_pcv = pvalue.AsSingleton(
            p
            | "ImpulseJobName" >> beam.Create([None])
            | beam.Map(lambda _: _generate_load_job_name()))

        file_prefix_pcv = pvalue.AsSingleton(
            p
            | "CreateFilePrefixView" >> beam.Create([self._input_gs_location])
            | "GenerateFilePrefix" >> beam.Map(_generate_file_prefix))

        outputs = (
            pcoll
            |
            "ApplyGlobalWindow" >> beam.WindowInto(beam.window.GlobalWindows())
            | "AppendDestination" >> beam.ParDo(
                _AppendDestinationsFn(self.destination))
            | beam.ParDo(WriteRecordsToFile(
                max_files_per_bundle=self.max_files_per_bundle,
                max_file_size=self.max_file_size,
                coder=self.coder),
                         file_prefix=file_prefix_pcv).with_outputs(
                             WriteRecordsToFile.UNWRITTEN_RECORD_TAG,
                             WriteRecordsToFile.WRITTEN_FILE_TAG))

        # A PCollection of (destination, file) tuples. It lists files with records,
        # and the destination each file is meant to be imported into.
        destination_files_kv_pc = outputs[WriteRecordsToFile.WRITTEN_FILE_TAG]

        # A PCollection of (destination, record) tuples. These are later sharded,
        # grouped, and all records for each destination-shard are written to files.
        # This PCollection is necessary because not all records can be written into
        # files in ``WriteRecordsToFile``.
        unwritten_records_pc = outputs[WriteRecordsToFile.UNWRITTEN_RECORD_TAG]

        more_destination_files_kv_pc = (
            unwritten_records_pc
            | beam.ParDo(_ShardDestinations())
            | "GroupShardedRows" >> beam.GroupByKey()
            | "DropShardNumber" >> beam.Map(lambda x: (x[0][0], x[1]))
            | "WriteGroupedRecordsToFile" >> beam.ParDo(
                WriteGroupedRecordsToFile(coder=self.coder),
                file_prefix=file_prefix_pcv))

        all_destination_file_pairs_pc = (
            (destination_files_kv_pc, more_destination_files_kv_pc)
            | "DestinationFilesUnion" >> beam.Flatten())

        grouped_files_pc = (
            all_destination_file_pairs_pc
            | "GroupFilesByTableDestinations" >> beam.GroupByKey())

        # Load Jobs are triggered to temporary tables, and those are later copied to
        # the actual appropriate destination table. This ensures atomicity when only
        # some of the load jobs would fail but not others.
        # If any of them fails, then copy jobs are not triggered.
        trigger_loads_outputs = (grouped_files_pc | beam.ParDo(
            TriggerLoadJobs(schema=self.schema,
                            write_disposition=self.write_disposition,
                            create_disposition=self.create_disposition,
                            test_client=self.test_client,
                            temporary_tables=self.temp_tables),
            load_job_name_pcv).with_outputs(TriggerLoadJobs.TEMP_TABLES,
                                            main='main'))

        destination_job_ids_pc = trigger_loads_outputs['main']
        temp_tables_pc = trigger_loads_outputs[TriggerLoadJobs.TEMP_TABLES]

        destination_copy_job_ids_pc = (
            p
            | "ImpulseMonitorLoadJobs" >> beam.Create([None])
            | "WaitForLoadJobs" >> beam.ParDo(
                WaitForBQJobs(self.test_client),
                beam.pvalue.AsList(destination_job_ids_pc))
            | beam.ParDo(
                TriggerCopyJobs(create_disposition=self.create_disposition,
                                write_disposition=self.write_disposition,
                                temporary_tables=self.temp_tables,
                                test_client=self.test_client),
                load_job_name_pcv))

        finished_copy_jobs_pc = (
            p
            | "ImpulseMonitorCopyJobs" >> beam.Create([None])
            | "WaitForCopyJobs" >> beam.ParDo(
                WaitForBQJobs(self.test_client),
                beam.pvalue.AsList(destination_copy_job_ids_pc)))

        _ = (finished_copy_jobs_pc
             | "RemoveTempTables/PassTables" >> beam.FlatMap(
                 lambda x, deleting_tables: deleting_tables,
                 pvalue.AsIter(temp_tables_pc))
             | "RemoveTempTables/DeduplicateTables" >> Count.PerElement()
             | "RemoveTempTables/GetTableNames" >> beam.Map(lambda elm: elm[0])
             | "RemoveTempTables/Delete" >> beam.ParDo(DeleteTablesFn()))

        return {
            self.DESTINATION_JOBID_PAIRS: destination_job_ids_pc,
            self.DESTINATION_FILE_PAIRS: all_destination_file_pairs_pc,
            self.DESTINATION_COPY_JOBID_PAIRS: destination_copy_job_ids_pc,
        }
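WriteRecordsToFile and TriggerLoadJobs above are multi-output DoFns: .with_outputs(tag, main='main') turns the ParDo result into a tuple-like object from which each tagged PCollection can be fetched by name. A self-contained sketch of the tagged-output mechanism; SplitBySize and its tag are invented for illustration:

import apache_beam as beam
from apache_beam import pvalue


class SplitBySize(beam.DoFn):
    """Illustrative multi-output DoFn; the tag name is arbitrary."""
    BIG = 'big'

    def process(self, element, limit):
        # `limit` arrives as an extra argument via the AsSingleton side input.
        if len(element) > limit:
            yield pvalue.TaggedOutput(self.BIG, element)
        else:
            yield element  # goes to the main output


with beam.Pipeline() as p:
    limit = p | 'Limit' >> beam.Create([2])
    outputs = (p
               | beam.Create(['a', 'bb', 'cccc'])
               | beam.ParDo(SplitBySize(),
                            pvalue.AsSingleton(limit)).with_outputs(
                                SplitBySize.BIG, main='small'))

    small_pc = outputs['small']        # main output, accessed by its name
    big_pc = outputs[SplitBySize.BIG]  # tagged output
    _ = small_pc | 'PrintSmall' >> beam.Map(lambda x: print('small', x))
    _ = big_pc | 'PrintBig' >> beam.Map(lambda x: print('big', x))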
Example #7
    def expand(self, pcoll):
        p = pcoll.pipeline

        temp_location = p.options.view_as(GoogleCloudOptions).temp_location

        load_job_name_pcv = pvalue.AsSingleton(
            p
            | "ImpulseJobName" >> beam.Create([None])
            | beam.Map(lambda _: _generate_load_job_name()))

        file_prefix_pcv = pvalue.AsSingleton(
            p
            | "CreateFilePrefixView" >> beam.Create([''])
            | "GenerateFilePrefix" >> beam.Map(
                file_prefix_generator(self._validate,
                                      self._custom_gcs_temp_location,
                                      temp_location)))

        destination_data_kv_pc = (
            pcoll
            | "RewindowIntoGlobal" >> self._window_fn()
            | "AppendDestination" >> beam.ParDo(
                bigquery_tools.AppendDestinationsFn(self.destination), *
                self.table_side_inputs))

        all_destination_file_pairs_pc = self._write_files(
            destination_data_kv_pc, file_prefix_pcv)

        grouped_files_pc = (
            all_destination_file_pairs_pc
            | "GroupFilesByTableDestinations" >> beam.GroupByKey())

        # Load Jobs are triggered to temporary tables, and those are later copied to
        # the actual appropriate destination table. This ensures atomicity when only
        # some of the load jobs would fail but not others.
        # If any of them fails, then copy jobs are not triggered.
        trigger_loads_outputs = (grouped_files_pc | beam.ParDo(
            TriggerLoadJobs(
                schema=self.schema,
                write_disposition=self.write_disposition,
                create_disposition=self.create_disposition,
                test_client=self.test_client,
                temporary_tables=self.temp_tables,
                additional_bq_parameters=self.additional_bq_parameters),
            load_job_name_pcv, *self.schema_side_inputs).with_outputs(
                TriggerLoadJobs.TEMP_TABLES, main='main'))

        destination_job_ids_pc = trigger_loads_outputs['main']
        temp_tables_pc = trigger_loads_outputs[TriggerLoadJobs.TEMP_TABLES]

        destination_copy_job_ids_pc = (
            p
            | "ImpulseMonitorLoadJobs" >> beam.Create([None])
            | "WaitForLoadJobs" >> beam.ParDo(
                WaitForBQJobs(self.test_client),
                beam.pvalue.AsList(destination_job_ids_pc))
            | beam.ParDo(
                TriggerCopyJobs(create_disposition=self.create_disposition,
                                write_disposition=self.write_disposition,
                                temporary_tables=self.temp_tables,
                                test_client=self.test_client),
                load_job_name_pcv))

        finished_copy_jobs_pc = (
            p
            | "ImpulseMonitorCopyJobs" >> beam.Create([None])
            | "WaitForCopyJobs" >> beam.ParDo(
                WaitForBQJobs(self.test_client),
                beam.pvalue.AsList(destination_copy_job_ids_pc)))

        _ = (
            finished_copy_jobs_pc
            | "RemoveTempTables/PassTables" >> beam.FlatMap(
                lambda x, deleting_tables: deleting_tables,
                pvalue.AsIter(temp_tables_pc))
            |
            "RemoveTempTables/AddUselessValue" >> beam.Map(lambda x: (x, None))
            | "RemoveTempTables/DeduplicateTables" >> beam.GroupByKey()
            | "RemoveTempTables/GetTableNames" >> beam.Map(lambda elm: elm[0])
            | "RemoveTempTables/Delete" >> beam.ParDo(DeleteTablesFn()))
        return {
            self.DESTINATION_JOBID_PAIRS: destination_job_ids_pc,
            self.DESTINATION_FILE_PAIRS: all_destination_file_pairs_pc,
            self.DESTINATION_COPY_JOBID_PAIRS: destination_copy_job_ids_pc,
        }