Example #1
def run():
  p = beam.Pipeline(options=PipelineOptions())

  # Sample input data, defined here so the snippet is self-contained.
  words = p | 'create words' >> beam.Create(['a', 'bb', 'ccc', 'dddd'])

  # Callable takes additional arguments.
  def filter_using_length(word, lower_bound, upper_bound=float('inf')):
    if lower_bound <= len(word) <= upper_bound:
      yield word

  # Construct a deferred side input: CombineGlobally yields a one-element
  # PCollection holding the mean word length.
  avg_word_len = (
      words
      | beam.Map(len)
      | beam.CombineGlobally(beam.combiners.MeanCombineFn()))

  # Call with explicit side inputs.
  small_words = words | 'small' >> beam.FlatMap(filter_using_length, 0, 3)

  # A single deferred side input: the PCollection is passed directly to
  # pvalue.AsSingleton.
  larger_than_average = (
      words | 'large' >> beam.FlatMap(
          filter_using_length, lower_bound=pvalue.AsSingleton(avg_word_len))
  )

  # Mix and match.
  small_but_nontrivial = words | beam.FlatMap(
      filter_using_length,
      lower_bound=2,
      upper_bound=pvalue.AsSingleton(avg_word_len))

  p.run().wait_until_finish()
Example #2
  def test_deferred_side_inputs(self):
    @typehints.with_input_types(str, int)
    def repeat(s, times):
      return s * times
    with TestPipeline() as p:
      main_input = p | beam.Create(['a', 'bb', 'c'])
      side_input = p | 'side' >> beam.Create([3])
      result = main_input | beam.Map(repeat, pvalue.AsSingleton(side_input))
      assert_that(result, equal_to(['aaa', 'bbbbbb', 'ccc']))

    bad_side_input = p | 'bad_side' >> beam.Create(['z'])
    with self.assertRaises(typehints.TypeCheckError):
      main_input | 'bis' >> beam.Map(repeat, pvalue.AsSingleton(bad_side_input))
Example #3
def _add_inferred_headers(all_patterns,  # type: List[str]
                          pipeline,  # type: beam.Pipeline
                          known_args,  # type: argparse.Namespace
                          merged_header,  # type: pvalue.PCollection
                          pipeline_mode  # type: int
                         ):
  # type: (...) -> pvalue.PCollection
  annotation_fields_to_infer = (known_args.annotation_fields if
                                known_args.infer_annotation_types else [])
  inferred_headers = (
      _read_variants(all_patterns,
                     pipeline,
                     known_args,
                     pipeline_mode)
      | 'FilterVariants' >> filter_variants.FilterVariants(
          reference_names=known_args.reference_names)
      | 'InferHeaderFields' >> infer_headers.InferHeaderFields(
          pvalue.AsSingleton(merged_header),
          known_args.allow_incompatible_records,
          known_args.infer_headers,
          annotation_fields_to_infer))
  merged_header = (
      (inferred_headers, merged_header)
      | 'FlattenHeaders' >> beam.Flatten()
      | 'MergeHeadersFromVcfAndVariants' >> merge_headers.MergeHeaders(
          known_args.split_alternate_allele_info_fields,
          known_args.allow_incompatible_records))
  return merged_header
Example #4
    def test_pardo_side_input(self):
        # pylint: disable=line-too-long
        with TestPipeline() as p:
            words = p | 'start' >> beam.Create(['a', 'bb', 'ccc', 'dddd'])

            # [START model_pardo_side_input]
            # Callable takes additional arguments.
            def filter_using_length(word,
                                    lower_bound,
                                    upper_bound=float('inf')):
                if lower_bound <= len(word) <= upper_bound:
                    yield word

            # Construct a deferred side input.
            avg_word_len = (words
                            | beam.Map(len)
                            | beam.CombineGlobally(
                                beam.combiners.MeanCombineFn()))

            # Call with explicit side inputs.
            small_words = words | 'small' >> beam.FlatMap(
                filter_using_length, 0, 3)

            # A single deferred side input.
            larger_than_average = (words | 'large' >> beam.FlatMap(
                filter_using_length,
                lower_bound=pvalue.AsSingleton(avg_word_len)))

            # Mix and match.
            small_but_nontrivial = words | beam.FlatMap(
                filter_using_length,
                lower_bound=2,
                upper_bound=pvalue.AsSingleton(avg_word_len))
            # [END model_pardo_side_input]

            assert_that(small_words, equal_to(['a', 'bb', 'ccc']))
            assert_that(larger_than_average,
                        equal_to(['ccc', 'dddd']),
                        label='larger_than_average')
            assert_that(small_but_nontrivial,
                        equal_to(['bb']),
                        label='small_but_not_trivial')
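Note on the pattern above: pvalue.AsSingleton expects the side-input PCollection to hold exactly one element per window, which CombineGlobally guarantees here. AsSingleton also accepts a default_value that is used when the view is empty. A minimal, self-contained sketch of that option (the pipeline and names below are illustrative, not part of the test above):

import apache_beam as beam
from apache_beam import pvalue

with beam.Pipeline() as p:
    words = p | 'Words' >> beam.Create(['a', 'bb', 'ccc'])
    # An intentionally empty PCollection to use as the side input.
    empty = p | 'One' >> beam.Create([0]) | 'DropAll' >> beam.Filter(lambda _: False)
    # default_value=2 is handed to the callable because the view has no element.
    at_least_default = words | beam.FlatMap(
        lambda w, lower: [w] if len(w) >= lower else [],
        pvalue.AsSingleton(empty, default_value=2))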
Example #5
def _get_inferred_headers(variants,  # type: pvalue.PCollection
                          merged_header  # type: pvalue.PCollection
                         ):
  # type: (...) -> (pvalue.PCollection, pvalue.PCollection)
  inferred_headers = (variants
                      | 'FilterVariants' >> filter_variants.FilterVariants()
                      | ' InferUndefinedHeaderFields' >>
                      infer_undefined_headers.InferUndefinedHeaderFields(
                          pvalue.AsSingleton(merged_header)))
  merged_header = (
      (inferred_headers, merged_header)
      | beam.Flatten()
      | 'MergeHeadersFromVcfAndVariants' >> merge_headers.MergeHeaders(
          allow_incompatible_records=True))
  return inferred_headers, merged_header
Example #6
    def expand(self, records):
        o = (records
             | 'pair one' >> beam.Map(lambda x: (1, x))
             | 'group all records' >> beam.GroupByKey()
             | 'split one of' >> beam.ParDo(
                 self.PreGenerateMappings()).with_outputs(
                     'splitted', 'combine'))

        # Create mappings, and prevent fusion (this limits the parallelization
        # in the optimization step)
        mappings = (o.splitted
                    | 'create mappings' >> beam.ParDo(
                        self.GenerateMappings(), pvalue.AsSingleton(o.combine))
                    | 'prevent fusion' >> beam.Reshuffle())

        return mappings
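The example above combines a multi-output ParDo (with_outputs) with a side input: one tagged output is fed back into the processing of the other via pvalue.AsSingleton. The PreGenerateMappings and GenerateMappings DoFns are not shown; a minimal standalone sketch of the same shape, with made-up DoFn and tag names, looks like this:

import apache_beam as beam
from apache_beam import pvalue

class SplitEvenOdd(beam.DoFn):
    """Routes odd numbers to a tagged output; even numbers go to the main output."""
    def process(self, element):
        if element % 2:
            yield pvalue.TaggedOutput('odd', element)
        else:
            yield element

with beam.Pipeline() as p:
    nums = p | beam.Create([1, 2, 3, 4])
    outputs = nums | beam.ParDo(SplitEvenOdd()).with_outputs('odd', main='even')
    odd_count = outputs.odd | beam.combiners.Count.Globally()
    # The count of odd elements is attached to every even element as a singleton.
    paired = outputs.even | beam.Map(
        lambda x, n_odd: (x, n_odd), pvalue.AsSingleton(odd_count))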
Example #7
  def test_defined_fields_filtered_one_variant(self):
    # All FORMATs and INFOs are already defined in the header section of VCF
    # files.
    with TestPipeline() as p:
      vcf_headers = self._get_sample_header_fields()
      vcf_headers_side_input = p | 'vcf_headers' >> Create([vcf_headers])
      variant = self._get_sample_variant_1()
      inferred_headers = (
          p
          | Create([variant])
          | 'InferUndefinedHeaderFields' >>
          infer_undefined_headers.InferUndefinedHeaderFields(
              pvalue.AsSingleton(vcf_headers_side_input)))
      expected = vcf_header_io.VcfHeader()
      assert_that(inferred_headers, equal_to([expected]))
      p.run()
Example #8
def _add_inferred_headers(
        pipeline,  # type: beam.Pipeline
        known_args,  # type: argparse.Namespace
        merged_header  # type: pvalue.PCollection
):
    # type: (...) -> pvalue.PCollection
    inferred_headers = (_read_variants(pipeline, known_args)
                        | 'FilterVariants' >> filter_variants.FilterVariants(
                            reference_names=known_args.reference_names)
                        | ' InferUndefinedHeaderFields' >>
                        infer_undefined_headers.InferUndefinedHeaderFields(
                            pvalue.AsSingleton(merged_header)))
    merged_header = (
        (inferred_headers, merged_header)
        | beam.Flatten()
        | 'MergeHeadersFromVcfAndVariants' >> merge_headers.MergeHeaders(
            known_args.split_alternate_allele_info_fields,
            known_args.allow_incompatible_records))
    return merged_header
Example #9
  def test_defined_fields_filtered_two_variants(self):
    # Only INFO and FORMAT in the first variants are already defined in the
    # header section of the VCF files.
    with TestPipeline() as p:
      vcf_headers = self._get_sample_header_fields()
      vcf_headers_side_input = p | 'vcf_header' >> Create([vcf_headers])
      variant_1 = self._get_sample_variant_1()
      variant_2 = self._get_sample_variant_2()
      inferred_headers = (
          p
          | Create([variant_1, variant_2])
          | 'InferUndefinedHeaderFields' >>
          infer_undefined_headers.InferUndefinedHeaderFields(
              pvalue.AsSingleton(vcf_headers_side_input)))

      expected_infos = {'IS_2': Info('IS_2', 1, 'String', '', '', '')}
      expected_formats = {'FI_2': Format('FI_2', 1, 'String', '')}
      expected = vcf_header_io.VcfHeader(
          infos=expected_infos, formats=expected_formats)
      assert_that(inferred_headers, equal_to([expected]))
      p.run()
Example #10
def run():
    # First, create the pipeline.
    p = beam.Pipeline(options=PipelineOptions())

    # 1. Set a plain Python list as the pipeline input.
    inputs = ["good morning.", "good afternoon.", "good evening."]

    mean_word_length = (inputs
                        | beam.Map(len)
                        | beam.CombineGlobally(beam.combiners.MeanCombineFn()))
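    # Note: because `inputs` is a plain Python list rather than a PCollection,
    # the transforms above run eagerly in an ephemeral pipeline, so
    # mean_word_length here is an ordinary one-element Python list, not a
    # deferred PCollection.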

    print(mean_word_length)

    p_mean_word_length = (p | 'avg word len' >> beam.Create(mean_word_length))

    output = (p
              | 'read' >> beam.Create(inputs)
              | 'FilterMeanLength' >> beam.ParDo(
                  FilterMeanLengthFn(), pvalue.AsSingleton(p_mean_word_length))
              | 'write to text' >> beam.io.WriteToText('./output.txt'))
    print(output)
    p.run().wait_until_finish()
Example #11
def run():
    p = beam.Pipeline(options=PipelineOptions())

    inputs = ['good morning.', 'good afternoon.', 'good evening.']

    # Side input
    mean_word_length = (
        p
        | 'CreateWordLength' >> beam.Create([len(s) for s in inputs])
        | 'ComputeMeanWordLength' >> beam.CombineGlobally(
            beam.combiners.MeanCombineFn()))

    # Main input
    output = (
        p
        | 'CreateWord' >> beam.Create(inputs)
        | 'FilterMeanLength' >> beam.ParDo(
            FilterMeanLengthFn(),
            pvalue.AsSingleton(mean_word_length))  # Pass the side input as the second argument to ParDo
        | 'WriteToText' >> beam.io.WriteToText('path/to/output'))

    p.run().wait_until_finish()
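Examples #10 and #11 both apply a FilterMeanLengthFn DoFn that is not included in the snippets. A minimal sketch of what such a DoFn might look like, assuming it keeps sentences at least as long as the mean length received through pvalue.AsSingleton (the class below is an illustrative guess, not the original implementation):

import apache_beam as beam

class FilterMeanLengthFn(beam.DoFn):
    """Keeps elements whose length is at least the mean length side input."""
    def process(self, element, mean_word_length):
        # mean_word_length arrives as a plain float via pvalue.AsSingleton.
        if len(element) >= mean_word_length:
            yield element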
Example #12
def run(argv=None):
  parser = argparse.ArgumentParser()
  known_args, pipeline_args = parser.parse_known_args(argv)

  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  BUCKET='BUCKET_NAME'

  data = [('NC', 'F', 2020, 'Hello', 3200),
          ('NC', 'F', 2020, 'World', 3180)]

  schema = (p
    | 'Read Schema from GCS' >> ReadFromText('gs://{}/schema.json'.format(BUCKET)))

  (p
    | 'Create Events' >> beam.Create(data)
    | 'Enrich with side input' >> beam.ParDo(EnrichElementsFn(), pvalue.AsSingleton(schema))
    | 'Log elements' >> beam.ParDo(LogElementsFn()))

  result = p.run()
  result.wait_until_finish()
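Example #12's EnrichElementsFn and LogElementsFn are also referenced without their definitions. A rough sketch, under the assumption that the schema side input is the single-line JSON text read from GCS and that enrichment simply attaches the parsed schema to each element (both classes are illustrative, not the original code):

import json
import logging

import apache_beam as beam

class EnrichElementsFn(beam.DoFn):
    """Pairs each element with the schema passed in as a singleton side input."""
    def process(self, element, schema_text):
        schema = json.loads(schema_text)
        yield (element, schema)

class LogElementsFn(beam.DoFn):
    """Logs each element and passes it through unchanged."""
    def process(self, element):
        logging.info('element: %s', element)
        yield element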
Example #13
    def expand(self, pcoll):
        p = pcoll.pipeline

        temp_location = p.options.view_as(GoogleCloudOptions).temp_location

        empty_pc = p | "ImpulseEmptyPC" >> beam.Create([])
        singleton_pc = p | "ImpulseSingleElementPC" >> beam.Create([None])

        load_job_name_pcv = pvalue.AsSingleton(
            singleton_pc
            | beam.Map(lambda _: _generate_load_job_name()))

        file_prefix_pcv = pvalue.AsSingleton(
            singleton_pc
            | "GenerateFilePrefix" >> beam.Map(
                file_prefix_generator(self._validate,
                                      self._custom_gcs_temp_location,
                                      temp_location)))

        destination_data_kv_pc = (
            pcoll
            | "RewindowIntoGlobal" >> self._window_fn()
            | "AppendDestination" >> beam.ParDo(
                bigquery_tools.AppendDestinationsFn(self.destination), *
                self.table_side_inputs))

        all_destination_file_pairs_pc = self._write_files(
            destination_data_kv_pc, file_prefix_pcv)

        grouped_files_pc = (
            all_destination_file_pairs_pc
            | "GroupFilesByTableDestinations" >> beam.GroupByKey())

        partitions = (
            grouped_files_pc
            | beam.ParDo(
                PartitionFiles(self.max_partition_size,
                               self.max_files_per_partition)).with_outputs(
                                   PartitionFiles.MULTIPLE_PARTITIONS_TAG,
                                   PartitionFiles.SINGLE_PARTITION_TAG))

        multiple_partitions_per_destination_pc = partitions[
            PartitionFiles.MULTIPLE_PARTITIONS_TAG]
        single_partition_per_destination_pc = partitions[
            PartitionFiles.SINGLE_PARTITION_TAG]

        # When using dynamic destinations, elements with a single partition as
        # well as those with multiple partitions are loaded into BigQuery using
        # temporary tables to ensure atomicity.
        if self.dynamic_destinations:
            all_partitions = ((multiple_partitions_per_destination_pc,
                               single_partition_per_destination_pc)
                              | "FlattenPartitions" >> beam.Flatten())
            destination_load_job_ids_pc, destination_copy_job_ids_pc = self.\
              _load_data(all_partitions, empty_pc, load_job_name_pcv,
                         singleton_pc)
        else:
            destination_load_job_ids_pc, destination_copy_job_ids_pc = self.\
              _load_data(multiple_partitions_per_destination_pc,
                         single_partition_per_destination_pc,
                         load_job_name_pcv, singleton_pc)

        return {
            self.DESTINATION_JOBID_PAIRS: destination_load_job_ids_pc,
            self.DESTINATION_FILE_PAIRS: all_destination_file_pairs_pc,
            self.DESTINATION_COPY_JOBID_PAIRS: destination_copy_job_ids_pc,
        }
Example #14
def run(argv=None):
    """Main function.

    Main function containing the Apache Beam pipeline describing how to process
    the input CSV file to generate the LTV predictions.
    """
    parser = argparse.ArgumentParser()
    _, pipeline_args = parser.parse_known_args(argv)
    options = pipeline_options.PipelineOptions(pipeline_args)
    runtime_options = options.view_as(RuntimeOptions)

    with beam.Pipeline(options=options) as pipeline:
        options = (pipeline
                   | 'Create single element Stream containing options dict' >>
                   beam.Create([options.get_all_options()])
                   | beam.Map(
                       lambda x: {
                           k: v.get() if isinstance(
                               v, value_provider.ValueProvider) else v
                           for (k, v) in x.items()
                       })
                   | beam.Map(c.set_extra_options))

        full_elog = (
            pipeline
            | beam.io.ReadFromText(getattr(runtime_options,
                                           c._OPTION_INPUT_CSV),
                                   skip_header_lines=1)
            | beam.Map(lambda x: list(csv.reader([x]))[0])
            | beam.FlatMap(
                c.csv_line_to_list,
                pvalue.AsSingleton(options))  # (customer_id, date_str, date,
            #  sales, extra_dimension?)
        )

        full_elog_merged = (
            full_elog
            | beam.Filter(lambda x: x[3] > 0)  # sales > 0
            | beam.Map(lambda x: ((x[0], x[1]), x))  # key: (customer_id, date)
            | 'Group full elog by customer and date' >> beam.GroupByKey()
            | beam.Map(c.merge_full_elog_by_customer_and_date)  # (customer_id,
            #  date_str, date,
            #  sales)
        )

        min_max_dates = (
            full_elog_merged
            | beam.Map(lambda x: x[2])  # date
            | beam.CombineGlobally(c.MinMaxDatesFn())
            | beam.Map(c.min_max_dates_dict))

        limits_dates = (min_max_dates
                        | beam.FlatMap(c.limit_dates_boundaries,
                                       pvalue.AsSingleton(options)))

        cohort = (full_elog_merged
                  | beam.FlatMap(c.filter_customers_in_cohort,
                                 pvalue.AsSingleton(limits_dates))
                  | 'Distinct Customer IDs in Cohort' >> util.Distinct())

        cohort_count = (
            cohort
            | 'Count cohort entries' >> beam.combiners.Count.Globally())

        cohort_set = (cohort | beam.Map(lambda x: (x, 1)))

        all_customer_ids = (
            full_elog_merged
            | beam.Map(lambda x: x[0])  # key: customer_id
            | 'Distinct all Customer IDs' >> util.Distinct())

        all_customer_ids_count = (
            all_customer_ids
            | 'Count all customers' >> beam.combiners.Count.Globally())

        num_customers = (
            pipeline
            | 'Create single elem Stream I' >> beam.Create([1])
            | beam.FlatMap(c.count_customers, pvalue.AsSingleton(cohort_count),
                           pvalue.AsSingleton(all_customer_ids_count),
                           pvalue.AsSingleton(options)))

        cal_hol_elog = (full_elog_merged
                        | beam.FlatMap(c.filter_cohort_records_in_cal_hol,
                                       pvalue.AsDict(cohort_set),
                                       pvalue.AsSingleton(limits_dates)))

        cal_hol_elog_count = (
            cal_hol_elog
            | 'Count cal hol elog entries' >> beam.combiners.Count.Globally())

        calibration = (cal_hol_elog
                       | beam.FlatMap(c.filter_records_in_calibration,
                                      pvalue.AsSingleton(limits_dates)))

        num_txns_total = (
            full_elog_merged
            | beam.FlatMap(c.filter_records_in_cal_hol,
                           pvalue.AsSingleton(limits_dates))
            | 'Count num txns total' >> beam.combiners.Count.Globally())

        num_txns = (pipeline
                    | 'Create single elem Stream II' >> beam.Create([1])
                    | beam.FlatMap(c.count_txns,
                                   pvalue.AsSingleton(cal_hol_elog_count),
                                   pvalue.AsSingleton(num_txns_total),
                                   pvalue.AsSingleton(options)))

        calcbs = (
            calibration
            | beam.Map(lambda x: (x[0], x))
            | 'Group calibration elog by customer id' >> beam.GroupByKey()
            | beam.FlatMap(
                c.create_cal_cbs, pvalue.AsSingleton(options),
                pvalue.AsSingleton(limits_dates)
            )  # (customer_id, number_of_transactions, average_order_value,
            #  frequency, recency, total_time_observed)
        )

        first_transaction_dates_by_customer = (
            cal_hol_elog
            | beam.Map(lambda x: (x[0], x))  # customer_id
            | 'Group cal hol elog by customer id' >> beam.GroupByKey()
            | beam.Map(lambda x: (x[0], min(map(operator.itemgetter(2), x[1])))
                       )  # item 2 -> date
        )

        cal_hol_elog_repeat = (
            cal_hol_elog
            | beam.FlatMap(c.filter_first_transaction_date_records,
                           pvalue.AsDict(first_transaction_dates_by_customer))
            | beam.FlatMap(
                c.calculate_time_unit_numbers,  # (customer_id, date,
                #  time_unit_number)
                pvalue.AsSingleton(options),
                pvalue.AsSingleton(limits_dates))
            | beam.Map(lambda x: (x[2], 1))  # key: time_unit_number
            | 'Group cal hol elog repeat by time unit number' >>
            beam.GroupByKey()
            | beam.Map(lambda x:
                       (x[0], sum(x[1])))  # (time_unit_number, occurrences)
        )

        repeat_tx = (
            pipeline
            | 'Create single elem Stream III' >> beam.Create([1])
            | beam.FlatMap(c.calculate_cumulative_repeat_transactions,
                           pvalue.AsIter(cal_hol_elog_repeat)
                           )  # (time_unit_number, repeat_transactions,
            #  repeat_transactions_cumulative)
        )

        model_validation = (
            pipeline
            | 'Create single elem Stream IV' >> beam.Create([1])
            | beam.FlatMap(
                c.calculate_model_fit_validation, pvalue.AsSingleton(options),
                pvalue.AsSingleton(limits_dates), pvalue.AsIter(calcbs),
                pvalue.AsIter(repeat_tx), pvalue.AsSingleton(num_customers),
                pvalue.AsSingleton(num_txns)))

        _ = (model_validation | beam.Map(c.raise_error_if_invalid_mape))

        _ = (model_validation
             | beam.Map(lambda x: x[0])
             | beam.FlatMap(c.calculate_model_fit_validation_to_text,
                            pvalue.AsSingleton(options)))

        fullcbs_without_extra_dimension = (
            full_elog_merged
            | beam.Map(lambda x: (x[0], x))  # key: customer_id
            | 'Group full merged elog by customer id' >> beam.GroupByKey()
            | beam.FlatMap(
                c.create_fullcbs, pvalue.AsSingleton(options),
                pvalue.AsSingleton(min_max_dates)
            )  # (customer_id, number_of_transactions, historical_aov,
            #  frequency, recency, total_time_observed)
        )

        full_elog_if_extra_dimension = (
            full_elog
            | 'Discard records if no extra dimension' >> beam.FlatMap(
                c.discard_if_no_extra_dimension, pvalue.AsSingleton(options)))

        extra_dimensions_stats = (
            full_elog_if_extra_dimension
            | beam.Map(lambda x: (
                (x[0], x[4]), x))  # key: (customer_id, extra_dimension)
            | 'Group full elog by customer id and extra dimension' >>
            beam.GroupByKey()
            | beam.Map(
                c.create_extra_dimensions_stats
            )  # (customer_id, extra_dimension, dimension_count, tot_sales,
            #  max_dimension_date)
        )

        top_dimension_per_customer = (
            extra_dimensions_stats
            | beam.Map(lambda x: (x[0], x))  # customer_id
            |
            'Group extra dimension stats by customer id' >> beam.GroupByKey()
            | beam.Map(
                c.extract_top_extra_dimension
            )  # (customer_id, extra_dimension, dimension_count, tot_sales,
            #  max_dimension_date)
        )

        customer_dimension_map = (
            top_dimension_per_customer
            | beam.Map(lambda x:
                       (x[0], x[1]))  # (customer_id, extra_dimension)
        )

        fullcbs = (
            fullcbs_without_extra_dimension
            | beam.FlatMap(
                c.add_top_extra_dimension_to_fullcbs,
                pvalue.AsSingleton(options),
                pvalue.AsDict(customer_dimension_map)
            )  # (customer_id, number_of_transactions, historical_aov,
            #  frequency, recency, total_time_observed,
            #  extra_dimension?)
        )

        prediction = (
            pipeline
            | 'Create single elem Stream V' >> beam.Create([1])
            | beam.FlatMap(
                c.calculate_prediction, pvalue.AsSingleton(options),
                pvalue.AsIter(fullcbs), pvalue.AsSingleton(num_customers),
                pvalue.AsSingleton(num_txns)
            )  # [customer_id, p_alive, predicted_purchases, future_aov,
            #  historical_aov, expected_value, frequency, recency,
            #  total_time_observed, extra_dimension?], prediction_params
        )

        prediction_by_customer_no_segments = (
            prediction
            | beam.FlatMap(lambda x: x[0])  # Extract predictions by customer
        )

        _ = (
            prediction
            | beam.Map(lambda x: x[1])  # Extract prediction params
            | beam.FlatMap(c.calculate_prediction_to_text,
                           pvalue.AsSingleton(options)))

        num_rows = (full_elog_merged
                    | 'Count num rows in full elog merged' >>
                    beam.combiners.Count.Globally())

        segment_predictions_exact = (
            pipeline
            | 'Create single elem Stream VII' >> beam.Create([1])
            | beam.FlatMap(
                lambda _, rows_count:
                [rows_count <= c._SEGMENT_PREDICTION_THRESHOLD],
                pvalue.AsSingleton(num_rows)))

        sharded_cust_predictions_no_segments_exact, \
            sharded_cust_predictions_no_segments_hash = (
                prediction_by_customer_no_segments
                | beam.FlatMap(
                    c.prediction_sharded,
                    pvalue.AsSingleton(options),
                    pvalue.AsSingleton(segment_predictions_exact)
                )  # [customer_id, p_alive, predicted_purchases, future_aov,
                   #  historical_aov, expected_value, frequency, recency,
                   #  total_time_observed, extra_dimension?]
                | beam.Partition(lambda x, _: 0 if x[1] else 1, 2)
            )

        # BEGIN of "exact" branch
        prediction_by_customer_exact = (
            pipeline
            | 'Create single elem Stream VIII' >> beam.Create([1])
            | beam.FlatMap(
                c.split_in_ntiles_exact, pvalue.AsSingleton(options),
                pvalue.AsIter(sharded_cust_predictions_no_segments_exact
                              ))  # [customer_id, p_alive, predicted_purchases,
            #  future_aov, historical_aov, expected_value,
            #  frequency, recency, total_time_observed,
            #  segment, extra_dimension?]
        )
        # END of "exact" branch

        # BEGIN of "hash" branch
        customer_count_by_expected_value = (
            sharded_cust_predictions_no_segments_hash
            | beam.Map(lambda x: (x[0][5], 1))  # (expected_value, 1)
            | 'Group customer predictions by expected value' >>
            beam.GroupByKey()
            | beam.Map(lambda x:
                       (x[0], sum(x[1])))  # expected_value, customers_count
        )

        hash_segment_limits = (
            pipeline
            | 'Create single elem Stream IX' >> beam.Create([1])
            | beam.FlatMap(c.expected_values_segment_limits,
                           pvalue.AsSingleton(options),
                           pvalue.AsIter(customer_count_by_expected_value),
                           pvalue.AsSingleton(all_customer_ids_count)))

        prediction_by_customer_hash = (
            sharded_cust_predictions_no_segments_hash
            | beam.Map(lambda x: x[0])
            | beam.FlatMap(c.split_in_ntiles_hash,
                           pvalue.AsSingleton(hash_segment_limits)
                           )  # [customer_id, p_alive, predicted_purchases,
            #  future_aov, historical_aov, expected_value,
            #  frequency, recency, total_time_observed,
            #  segment, extra_dimension?]
        )
        # END of "hash" branch

        prediction_by_customer = (
            # Only one of these two streams will contain values.
            (prediction_by_customer_exact, prediction_by_customer_hash)
            | beam.Flatten())

        _ = (prediction_by_customer
             | beam.FlatMap(
                 lambda x, opts: [x + ['']]
                 if not opts[c._OPTION_EXTRA_DIMENSION_EXISTS] else [x],
                 pvalue.AsSingleton(options))
             | 'prediction_by_customer to CSV line' >> beam.Map(
                 c.list_to_csv_line)
             | 'Write prediction_by_customer' >> beam.io.WriteToText(
                 getattr(runtime_options, c._OPTION_OUTPUT_FOLDER),
                 header='customer_id,p_alive'
                 ',predicted_purchases'
                 ',future_aov,historical_aov'
                 ',expected_value,frequency,recency'
                 ',total_time_observed,segment'
                 ',extra_dimension',
                 shard_name_template='',
                 num_shards=1,
                 file_name_suffix='prediction_by_customer.csv'))

        prediction_summary_temp = (
            prediction_by_customer
            | beam.Map(lambda x: (x[9], x))  # key: segment
            | 'Group customer predictions by segment' >> beam.GroupByKey()
            | beam.FlatMap(
                c.generate_prediction_summary, pvalue.AsSingleton(
                    options))  # (segment, average_retention_probability,
            #  average_predicted_customer_value,
            #  average_predicted_order_value,
            #  average_predicted_purchases, total_customer_value,
            #  number_of_customers)
        )

        tot_equity = (
            prediction_summary_temp
            | beam.Map(lambda x: x[5])  # total_customer_value
            | beam.CombineGlobally(sum))

        prediction_summary = (
            prediction_summary_temp
            | beam.FlatMap(
                c.calculate_perc_of_total_customer_value,
                pvalue.AsSingleton(tot_equity), pvalue.AsSingleton(
                    options))  # (segment, average_retention_probability,
            #  average_predicted_customer_value,
            #  average_predicted_order_value,
            #  average_predicted_purchases,
            #  total_customer_value, number_of_customers,
            #  perc_of_total_customer_value)
        )

        _ = (prediction_summary
             | 'prediction_summary to CSV line' >> beam.Map(c.list_to_csv_line)
             | 'Write prediction_summary' >> beam.io.WriteToText(
                 getattr(runtime_options, c._OPTION_OUTPUT_FOLDER),
                 header='segment,average_retention_probability'
                 ',average_predicted_customer_value'
                 ',average_predicted_order_value,average_predicted_purchases'
                 ',total_customer_value,number_of_customers'
                 ',perc_of_total_customer_value',
                 shard_name_template='',
                 num_shards=1,
                 file_name_suffix='prediction_summary.csv'))

        prediction_summary_extra_dimension = (
            prediction_by_customer
            | 'Discard prediction if there is no extra dimension' >>
            beam.FlatMap(c.discard_if_no_extra_dimension,
                         pvalue.AsSingleton(options))
            | beam.Map(lambda x: (x[10], x))  # extra dimension
            | 'Group customer predictions by extra dimension' >>
            beam.GroupByKey()
            | beam.FlatMap(c.generate_prediction_summary_extra_dimension,
                           pvalue.AsSingleton(tot_equity),
                           pvalue.AsSingleton(options)))

        _ = (prediction_summary_extra_dimension
             | 'prediction_summary_extra_dimension to CSV line' >> beam.Map(
                 c.list_to_csv_line)
             |
             'Write prediction_summary_extra_dimension' >> beam.io.WriteToText(
                 getattr(runtime_options, c._OPTION_OUTPUT_FOLDER),
                 header='extra_dimension,average_retention_probability'
                 ',average_predicted_customer_value'
                 ',average_predicted_order_value'
                 ',average_predicted_purchases,total_customer_value'
                 ',number_of_customers,perc_of_total_customer_value',
                 shard_name_template='',
                 num_shards=1,
                 file_name_suffix='prediction_summary_extra_dimension.csv'))
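A recurring idiom in the pipeline above is a one-element beam.Create([1]) followed by beam.FlatMap whose real inputs are AsSingleton/AsIter/AsDict side inputs, which makes the wrapped function run exactly once against materialized views of other PCollections. A self-contained sketch of that idiom (names are illustrative, not taken from the pipeline above):

import apache_beam as beam
from apache_beam import pvalue

def summarize(_, total, items):
    # Runs once; `total` arrives as a plain number, `items` as an iterable.
    yield {'total': total, 'count': len(list(items))}

with beam.Pipeline() as p:
    values = p | 'Values' >> beam.Create([3, 1, 4, 1, 5])
    total = values | beam.CombineGlobally(sum)
    summary = (
        p
        | 'Trigger once' >> beam.Create([1])
        | beam.FlatMap(summarize,
                       pvalue.AsSingleton(total),
                       pvalue.AsIter(values)))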
Example #15
    def expand(self, pcoll):
        p = pcoll.pipeline

        temp_location = p.options.view_as(GoogleCloudOptions).temp_location

        load_job_name_pcv = pvalue.AsSingleton(
            p
            | "ImpulseJobName" >> beam.Create([None])
            | beam.Map(lambda _: _generate_load_job_name()))

        file_prefix_pcv = pvalue.AsSingleton(
            p
            | "CreateFilePrefixView" >> beam.Create([''])
            | "GenerateFilePrefix" >> beam.Map(
                file_prefix_generator(self._validate,
                                      self._custom_gcs_temp_location,
                                      temp_location)))

        destination_data_kv_pc = (
            pcoll
            | "RewindowIntoGlobal" >> self._window_fn()
            | "AppendDestination" >> beam.ParDo(
                bigquery_tools.AppendDestinationsFn(self.destination), *
                self.table_side_inputs))

        all_destination_file_pairs_pc = self._write_files(
            destination_data_kv_pc, file_prefix_pcv)

        grouped_files_pc = (
            all_destination_file_pairs_pc
            | "GroupFilesByTableDestinations" >> beam.GroupByKey())

        # Load jobs are triggered against temporary tables, and those are later
        # copied to the actual destination tables. This ensures atomicity when
        # only some of the load jobs would fail but not others.
        # If any of them fails, then copy jobs are not triggered.
        trigger_loads_outputs = (grouped_files_pc | beam.ParDo(
            TriggerLoadJobs(
                schema=self.schema,
                write_disposition=self.write_disposition,
                create_disposition=self.create_disposition,
                test_client=self.test_client,
                temporary_tables=self.temp_tables,
                additional_bq_parameters=self.additional_bq_parameters),
            load_job_name_pcv, *self.schema_side_inputs).with_outputs(
                TriggerLoadJobs.TEMP_TABLES, main='main'))

        destination_job_ids_pc = trigger_loads_outputs['main']
        temp_tables_pc = trigger_loads_outputs[TriggerLoadJobs.TEMP_TABLES]

        destination_copy_job_ids_pc = (
            p
            | "ImpulseMonitorLoadJobs" >> beam.Create([None])
            | "WaitForLoadJobs" >> beam.ParDo(
                WaitForBQJobs(self.test_client),
                beam.pvalue.AsList(destination_job_ids_pc))
            | beam.ParDo(
                TriggerCopyJobs(create_disposition=self.create_disposition,
                                write_disposition=self.write_disposition,
                                temporary_tables=self.temp_tables,
                                test_client=self.test_client),
                load_job_name_pcv))

        finished_copy_jobs_pc = (
            p
            | "ImpulseMonitorCopyJobs" >> beam.Create([None])
            | "WaitForCopyJobs" >> beam.ParDo(
                WaitForBQJobs(self.test_client),
                beam.pvalue.AsList(destination_copy_job_ids_pc)))

        _ = (
            finished_copy_jobs_pc
            | "RemoveTempTables/PassTables" >> beam.FlatMap(
                lambda x, deleting_tables: deleting_tables,
                pvalue.AsIter(temp_tables_pc))
            |
            "RemoveTempTables/AddUselessValue" >> beam.Map(lambda x: (x, None))
            | "RemoveTempTables/DeduplicateTables" >> beam.GroupByKey()
            | "RemoveTempTables/GetTableNames" >> beam.Map(lambda elm: elm[0])
            | "RemoveTempTables/Delete" >> beam.ParDo(DeleteTablesFn()))
        return {
            self.DESTINATION_JOBID_PAIRS: destination_job_ids_pc,
            self.DESTINATION_FILE_PAIRS: all_destination_file_pairs_pc,
            self.DESTINATION_COPY_JOBID_PAIRS: destination_copy_job_ids_pc,
        }
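The expand method above builds its AsSingleton views from one-element "impulse" PCollections that are mapped to the value needed at run time (a job name or file prefix). Stripped of the BigQuery specifics, the pattern looks like this (sketch with made-up names, not Beam's internal helpers):

import uuid

import apache_beam as beam
from apache_beam import pvalue

def build_job_name_view(pipeline, prefix):
    """Returns an AsSingleton view holding a job name generated when the pipeline runs."""
    return pvalue.AsSingleton(
        pipeline
        | 'ImpulseJobName' >> beam.Create([None])
        | 'GenerateJobName' >> beam.Map(
            lambda _: '%s_%s' % (prefix, uuid.uuid4().hex)))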
    def expand(self, pcoll):
        p = pcoll.pipeline
        try:
            step_name = self.label
        except AttributeError:
            step_name = 'BigQueryBatchFileLoads_%d' % BigQueryBatchFileLoads.COUNT
            BigQueryBatchFileLoads.COUNT += 1

        temp_location = p.options.view_as(GoogleCloudOptions).temp_location
        job_name = (p.options.view_as(GoogleCloudOptions).job_name
                    or 'AUTOMATIC_JOB_NAME')

        empty_pc = p | "ImpulseEmptyPC" >> beam.Create([])
        singleton_pc = p | "ImpulseSingleElementPC" >> beam.Create([None])

        load_job_name_pcv = pvalue.AsSingleton(
            singleton_pc
            | "LoadJobNamePrefix" >> beam.Map(lambda _: _generate_job_name(
                job_name, bigquery_tools.BigQueryJobTypes.LOAD, 'LOAD_STEP')))

        schema_mod_job_name_pcv = pvalue.AsSingleton(
            singleton_pc
            |
            "SchemaModJobNamePrefix" >> beam.Map(lambda _: _generate_job_name(
                job_name, bigquery_tools.BigQueryJobTypes.LOAD,
                'SCHEMA_MOD_STEP')))

        copy_job_name_pcv = pvalue.AsSingleton(
            singleton_pc
            | "CopyJobNamePrefix" >> beam.Map(lambda _: _generate_job_name(
                job_name, bigquery_tools.BigQueryJobTypes.COPY, 'COPY_STEP')))

        file_prefix_pcv = pvalue.AsSingleton(
            singleton_pc
            | "GenerateFilePrefix" >> beam.Map(
                file_prefix_generator(self._validate,
                                      self._custom_gcs_temp_location,
                                      temp_location)))

        destination_data_kv_pc = (
            pcoll
            | "RewindowIntoGlobal" >> self._window_fn()
            | "AppendDestination" >> beam.ParDo(
                bigquery_tools.AppendDestinationsFn(self.destination), *
                self.table_side_inputs))

        if not self.with_auto_sharding:
            all_destination_file_pairs_pc = self._write_files(
                destination_data_kv_pc, file_prefix_pcv)
        else:
            all_destination_file_pairs_pc = self._write_files_with_auto_sharding(
                destination_data_kv_pc, file_prefix_pcv)

        grouped_files_pc = (
            all_destination_file_pairs_pc
            | "GroupFilesByTableDestinations" >> beam.GroupByKey())

        partitions = (
            grouped_files_pc
            | beam.ParDo(
                PartitionFiles(self.max_partition_size,
                               self.max_files_per_partition)).with_outputs(
                                   PartitionFiles.MULTIPLE_PARTITIONS_TAG,
                                   PartitionFiles.SINGLE_PARTITION_TAG))

        multiple_partitions_per_destination_pc = partitions[
            PartitionFiles.MULTIPLE_PARTITIONS_TAG]
        single_partition_per_destination_pc = partitions[
            PartitionFiles.SINGLE_PARTITION_TAG]

        # When using dynamic destinations, elements with a single partition as
        # well as those with multiple partitions are loaded into BigQuery using
        # temporary tables to ensure atomicity.
        if self.dynamic_destinations:
            all_partitions = ((multiple_partitions_per_destination_pc,
                               single_partition_per_destination_pc)
                              | "FlattenPartitions" >> beam.Flatten())
            destination_load_job_ids_pc, destination_copy_job_ids_pc = (
                self._load_data(all_partitions, empty_pc, load_job_name_pcv,
                                schema_mod_job_name_pcv, copy_job_name_pcv, p,
                                step_name))
        else:
            destination_load_job_ids_pc, destination_copy_job_ids_pc = (
                self._load_data(multiple_partitions_per_destination_pc,
                                single_partition_per_destination_pc,
                                load_job_name_pcv, schema_mod_job_name_pcv,
                                copy_job_name_pcv, p, step_name))

        return {
            self.DESTINATION_JOBID_PAIRS: destination_load_job_ids_pc,
            self.DESTINATION_FILE_PAIRS: all_destination_file_pairs_pc,
            self.DESTINATION_COPY_JOBID_PAIRS: destination_copy_job_ids_pc,
        }
def run(argv=None):
    """Main function.

    Main function containing the Apache Beam pipeline describing how to process
    the input BigQuery data to generate the LTV predictions.
    """
    parser = argparse.ArgumentParser()
    _, pipeline_args = parser.parse_known_args(argv)
    options = pipeline_options.PipelineOptions(pipeline_args)
    runtime_options = options.view_as(RuntimeOptions)

    with beam.Pipeline(options=options) as pipeline:
        options = (pipeline
                   | 'Create single element Stream containing options dict' >>
                   beam.Create([options.get_all_options()])
                   | beam.Map(
                       lambda x: {
                           k: v.get() if isinstance(
                               v, value_provider.ValueProvider) else v
                           for (k, v) in x.items()
                       })
                   | beam.Map(c.set_extra_options))

        full_elog = (
            pipeline
            | bq_mod.ReadFromBigQuery(
                project=getattr(runtime_options, c._OPTION_INPUT_BQ_PROJECT),
                query=getattr(runtime_options, c._OPTION_INPUT_BQ_QUERY),
                gcs_location=getattr(runtime_options,
                                     c._OPTION_TEMP_GCS_LOCATION),
                use_standard_sql=True)
            | beam.FlatMap(
                c.bq_row_to_list,
                pvalue.AsSingleton(options))  # (customer_id, date_str, date,
            #  sales, extra_dimension?)
        )

        full_elog_merged = (
            full_elog
            | beam.Filter(lambda x: x[3] > 0)  # sales > 0
            | beam.Map(lambda x: ((x[0], x[1]), x))  # key: (customer_id, date)
            | 'Group full elog by customer and date' >> beam.GroupByKey()
            | beam.Map(c.merge_full_elog_by_customer_and_date)  # (customer_id,
            #  date_str, date,
            #  sales)
        )

        min_max_dates = (
            full_elog_merged
            | beam.Map(lambda x: x[2])  # date
            | beam.CombineGlobally(c.MinMaxDatesFn())
            | beam.Map(c.min_max_dates_dict))

        limits_dates = (min_max_dates
                        | beam.FlatMap(c.limit_dates_boundaries,
                                       pvalue.AsSingleton(options)))

        cohort = (full_elog_merged
                  | beam.FlatMap(c.filter_customers_in_cohort,
                                 pvalue.AsSingleton(limits_dates))
                  | 'Distinct Customer IDs in Cohort' >> util.Distinct())

        cohort_count = (
            cohort
            | 'Count cohort entries' >> beam.combiners.Count.Globally())

        cohort_set = (cohort | beam.Map(lambda x: (x, 1)))

        all_customer_ids = (
            full_elog_merged
            | beam.Map(lambda x: x[0])  # key: customer_id
            | 'Distinct all Customer IDs' >> util.Distinct())

        all_customer_ids_count = (
            all_customer_ids
            | 'Count all customers' >> beam.combiners.Count.Globally())

        num_customers = (
            pipeline
            | 'Create single elem Stream I' >> beam.Create([1])
            | beam.FlatMap(c.count_customers, pvalue.AsSingleton(cohort_count),
                           pvalue.AsSingleton(all_customer_ids_count),
                           pvalue.AsSingleton(options)))

        cal_hol_elog = (full_elog_merged
                        | beam.FlatMap(c.filter_cohort_records_in_cal_hol,
                                       pvalue.AsDict(cohort_set),
                                       pvalue.AsSingleton(limits_dates)))

        cal_hol_elog_count = (
            cal_hol_elog
            | 'Count cal hol elog entries' >> beam.combiners.Count.Globally())

        calibration = (cal_hol_elog
                       | beam.FlatMap(c.filter_records_in_calibration,
                                      pvalue.AsSingleton(limits_dates)))

        num_txns_total = (
            full_elog_merged
            | beam.FlatMap(c.filter_records_in_cal_hol,
                           pvalue.AsSingleton(limits_dates))
            | 'Count num txns total' >> beam.combiners.Count.Globally())

        num_txns = (pipeline
                    | 'Create single elem Stream II' >> beam.Create([1])
                    | beam.FlatMap(c.count_txns,
                                   pvalue.AsSingleton(cal_hol_elog_count),
                                   pvalue.AsSingleton(num_txns_total),
                                   pvalue.AsSingleton(options)))

        calcbs = (
            calibration
            | beam.Map(lambda x: (x[0], x))
            | 'Group calibration elog by customer id' >> beam.GroupByKey()
            | beam.FlatMap(
                c.create_cal_cbs, pvalue.AsSingleton(options),
                pvalue.AsSingleton(limits_dates)
            )  # (customer_id, number_of_transactions, average_order_value,
            #  frequency, recency, total_time_observed)
        )

        first_transaction_dates_by_customer = (
            cal_hol_elog
            | beam.Map(lambda x: (x[0], x))  # customer_id
            | 'Group cal hol elog by customer id' >> beam.GroupByKey()
            | beam.Map(lambda x: (x[0], min(map(operator.itemgetter(2), x[1])))
                       )  # item 2 -> date
        )

        cal_hol_elog_repeat = (
            cal_hol_elog
            | beam.FlatMap(c.filter_first_transaction_date_records,
                           pvalue.AsDict(first_transaction_dates_by_customer))
            | beam.FlatMap(
                c.calculate_time_unit_numbers,  # (customer_id, date,
                #  time_unit_number)
                pvalue.AsSingleton(options),
                pvalue.AsSingleton(limits_dates))
            | beam.Map(lambda x: (x[2], 1))  # key: time_unit_number
            | 'Group cal hol elog repeat by time unit number' >>
            beam.GroupByKey()
            | beam.Map(lambda x:
                       (x[0], sum(x[1])))  # (time_unit_number, occurrences)
        )

        repeat_tx = (
            pipeline
            | 'Create single elem Stream III' >> beam.Create([1])
            | beam.FlatMap(c.calculate_cumulative_repeat_transactions,
                           pvalue.AsIter(cal_hol_elog_repeat)
                           )  # (time_unit_number, repeat_transactions,
            #  repeat_transactions_cumulative)
        )

        model_validation = (
            pipeline
            | 'Create single elem Stream IV' >> beam.Create([1])
            | beam.FlatMap(
                c.calculate_model_fit_validation, pvalue.AsSingleton(options),
                pvalue.AsSingleton(limits_dates), pvalue.AsIter(calcbs),
                pvalue.AsIter(repeat_tx), pvalue.AsSingleton(num_customers),
                pvalue.AsSingleton(num_txns)))

        _ = (model_validation | beam.Map(c.raise_error_if_invalid_mape))

        _ = (model_validation
             | beam.Map(lambda x: x[0])
             | 'Write to validation_params table' >> io.WriteToBigQuery(
                 table=c.TableValueProvider(
                     getattr(runtime_options, c._OPTION_OUTPUT_BQ_PROJECT),
                     getattr(runtime_options, c._OPTION_OUTPUT_BQ_DATASET),
                     'validation_params'),
                 custom_gcs_temp_location=getattr(runtime_options,
                                                  c._OPTION_TEMP_GCS_LOCATION),
                 validate=False,
                 schema={
                     'fields': [{
                         'name': 'calibration_start_date',
                         'type': 'STRING'
                     }, {
                         'name': 'calibration_end_date',
                         'type': 'STRING'
                     }, {
                         'name': 'cohort_end_date',
                         'type': 'STRING'
                     }, {
                         'name': 'holdout_end_date',
                         'type': 'STRING'
                     }, {
                         'name': 'model_time_granularity',
                         'type': 'STRING'
                     }, {
                         'name':
                         'model',
                         'type':
                         'RECORD',
                         'fields': [
                             {
                                 'name': 'frequency_model',
                                 'type': 'STRING'
                             },
                             {
                                 'name': 'num_customers_cohort',
                                 'type': 'INTEGER'
                             },
                             {
                                 'name': 'perc_customers_cohort',
                                 'type': 'FLOAT'
                             },
                             {
                                 'name': 'num_transactions_validation',
                                 'type': 'INTEGER'
                             },
                             {
                                 'name': 'perc_transactions_validation',
                                 'type': 'FLOAT'
                             },
                             {
                                 'name': 'mape',
                                 'type': 'FLOAT'
                             },
                         ]
                     }]
                 },
                 write_disposition=io.BigQueryDisposition.WRITE_TRUNCATE,
                 create_disposition=io.BigQueryDisposition.CREATE_IF_NEEDED))

        fullcbs_without_extra_dimension = (
            full_elog_merged
            | beam.Map(lambda x: (x[0], x))  # key: customer_id
            | 'Group full merged elog by customer id' >> beam.GroupByKey()
            | beam.FlatMap(
                c.create_fullcbs, pvalue.AsSingleton(options),
                pvalue.AsSingleton(min_max_dates)
            )  # (customer_id, number_of_transactions, historical_aov,
            #  frequency, recency, total_time_observed)
        )

        full_elog_if_extra_dimension = (
            full_elog
            | 'Discard records if no extra dimension' >> beam.FlatMap(
                c.discard_if_no_extra_dimension, pvalue.AsSingleton(options)))

        extra_dimensions_stats = (
            full_elog_if_extra_dimension
            | beam.Map(lambda x: (
                (x[0], x[4]), x))  # key: (customer_id, extra_dimension)
            | 'Group full elog by customer id and extra dimension' >>
            beam.GroupByKey()
            | beam.Map(
                c.create_extra_dimensions_stats
            )  # (customer_id, extra_dimension, dimension_count, tot_sales,
            #  max_dimension_date)
        )

        top_dimension_per_customer = (
            extra_dimensions_stats
            | beam.Map(lambda x: (x[0], x))  # customer_id
            |
            'Group extra dimension stats by customer id' >> beam.GroupByKey()
            | beam.Map(
                c.extract_top_extra_dimension
            )  # (customer_id, extra_dimension, dimension_count, tot_sales,
            #  max_dimension_date)
        )

        customer_dimension_map = (
            top_dimension_per_customer
            | beam.Map(lambda x:
                       (x[0], x[1]))  # (customer_id, extra_dimension)
        )

        fullcbs = (
            fullcbs_without_extra_dimension
            | beam.FlatMap(
                c.add_top_extra_dimension_to_fullcbs,
                pvalue.AsSingleton(options),
                pvalue.AsDict(customer_dimension_map)
            )  # (customer_id, number_of_transactions, historical_aov,
            #  frequency, recency, total_time_observed,
            #  extra_dimension?)
        )

        prediction = (
            pipeline
            | 'Create single elem Stream V' >> beam.Create([1])
            | beam.FlatMap(
                c.calculate_prediction, pvalue.AsSingleton(options),
                pvalue.AsIter(fullcbs), pvalue.AsSingleton(num_customers),
                pvalue.AsSingleton(num_txns)
            )  # [customer_id, p_alive, predicted_purchases, future_aov,
            #  historical_aov, expected_value, frequency, recency,
            #  total_time_observed, extra_dimension?], prediction_params
        )

        prediction_by_customer_no_segments = (
            prediction
            | beam.FlatMap(lambda x: x[0])  # Extract predictions by customer
        )

        _ = (
            prediction
            | beam.Map(lambda x: x[1])  # Extract prediction params
            | 'Write to prediction_params table' >> io.WriteToBigQuery(
                table=c.TableValueProvider(
                    getattr(runtime_options, c._OPTION_OUTPUT_BQ_PROJECT),
                    getattr(runtime_options, c._OPTION_OUTPUT_BQ_DATASET),
                    'prediction_params'),
                custom_gcs_temp_location=getattr(runtime_options,
                                                 c._OPTION_TEMP_GCS_LOCATION),
                validate=False,
                schema={
                    'fields': [{
                        'name': 'prediction_period',
                        'type': 'INTEGER'
                    }, {
                        'name': 'prediction_period_unit',
                        'type': 'STRING'
                    }, {
                        'name': 'model_time_granularity',
                        'type': 'STRING'
                    }, {
                        'name': 'customers_modeled',
                        'type': 'INTEGER'
                    }, {
                        'name': 'transactions_observed',
                        'type': 'INTEGER'
                    }, {
                        'name': 'frequency_model',
                        'type': 'STRING'
                    }, {
                        'name':
                        'bgnbd_model_params',
                        'type':
                        'RECORD',
                        'fields': [{
                            'name': 'a',
                            'type': 'FLOAT'
                        }, {
                            'name': 'b',
                            'type': 'FLOAT'
                        }, {
                            'name': 'r',
                            'type': 'FLOAT'
                        }, {
                            'name': 'alpha',
                            'type': 'FLOAT'
                        }]
                    }, {
                        'name':
                        'paretonbd_model_params',
                        'type':
                        'RECORD',
                        'fields': [{
                            'name': 'r',
                            'type': 'FLOAT'
                        }, {
                            'name': 's',
                            'type': 'FLOAT'
                        }, {
                            'name': 'alpha',
                            'type': 'FLOAT'
                        }, {
                            'name': 'beta',
                            'type': 'FLOAT'
                        }]
                    }, {
                        'name':
                        'gamma_gamma_params',
                        'type':
                        'RECORD',
                        'fields': [{
                            'name': 'p',
                            'type': 'FLOAT'
                        }, {
                            'name': 'q',
                            'type': 'FLOAT'
                        }, {
                            'name': 'v',
                            'type': 'FLOAT'
                        }]
                    }]
                },
                write_disposition=io.BigQueryDisposition.WRITE_TRUNCATE,
                create_disposition=io.BigQueryDisposition.CREATE_IF_NEEDED))

        num_rows = (full_elog_merged
                    | 'Count num rows in full elog merged' >>
                    beam.combiners.Count.Globally())

        segment_predictions_exact = (
            pipeline
            | 'Create single elem Stream VII' >> beam.Create([1])
            | beam.FlatMap(
                lambda _, rows_count:
                [rows_count <= c._SEGMENT_PREDICTION_THRESHOLD],
                pvalue.AsSingleton(num_rows)))

        sharded_cust_predictions_no_segments_exact, \
            sharded_cust_predictions_no_segments_hash = (
                prediction_by_customer_no_segments
                | beam.FlatMap(
                    c.prediction_sharded,
                    pvalue.AsSingleton(options),
                    pvalue.AsSingleton(segment_predictions_exact)
                )  # [customer_id, p_alive, predicted_purchases, future_aov,
                   #  historical_aov, expected_value, frequency, recency,
                   #  total_time_observed, extra_dimension?]
                | beam.Partition(lambda x, _: 0 if x[1] else 1, 2)
            )

        # BEGIN of "exact" branch
        prediction_by_customer_exact = (
            pipeline
            | 'Create single elem Stream VIII' >> beam.Create([1])
            | beam.FlatMap(
                c.split_in_ntiles_exact, pvalue.AsSingleton(options),
                pvalue.AsIter(sharded_cust_predictions_no_segments_exact
                              ))  # [customer_id, p_alive, predicted_purchases,
            #  future_aov, historical_aov, expected_value,
            #  frequency, recency, total_time_observed,
            #  segment, extra_dimension?]
        )
        # END of "exact" branch

        # BEGIN of "hash" branch
        customer_count_by_expected_value = (
            sharded_cust_predictions_no_segments_hash
            | beam.Map(lambda x: (x[0][5], 1))  # (expected_value, 1)
            | 'Group customer predictions by expected value' >>
            beam.GroupByKey()
            | beam.Map(lambda x:
                       (x[0], sum(x[1])))  # expected_value, customers_count
        )

        hash_segment_limits = (
            pipeline
            | 'Create single elem Stream IX' >> beam.Create([1])
            | beam.FlatMap(c.expected_values_segment_limits,
                           pvalue.AsSingleton(options),
                           pvalue.AsIter(customer_count_by_expected_value),
                           pvalue.AsSingleton(all_customer_ids_count)))

        prediction_by_customer_hash = (
            sharded_cust_predictions_no_segments_hash
            | beam.Map(lambda x: x[0])
            | beam.FlatMap(c.split_in_ntiles_hash,
                           pvalue.AsSingleton(hash_segment_limits)
                           )  # [customer_id, p_alive, predicted_purchases,
            #  future_aov, historical_aov, expected_value,
            #  frequency, recency, total_time_observed,
            #  segment, extra_dimension?]
        )
        # END of "hash" branch

        prediction_by_customer = (
            # only one of these two streams will contain values
            (prediction_by_customer_exact, prediction_by_customer_hash)
            | beam.Flatten()
            | beam.Map(c.clean_nan_and_inf))

        _ = (prediction_by_customer
             | beam.FlatMap(
                 lambda x, opts: [x + ['']]
                 if not opts[c._OPTION_EXTRA_DIMENSION_EXISTS] else [x],
                 pvalue.AsSingleton(options))
             | 'prediction_by_customer to Dict' >>
             beam.Map(c.list_to_dict, [
                 'customer_id', 'p_alive', 'predicted_purchases', 'future_aov',
                 'historical_aov', 'expected_value', 'frequency', 'recency',
                 'total_time_observed', 'segment', 'extra_dimension'
             ])
             | 'Write to prediction_by_customer table' >> io.WriteToBigQuery(
                 table=c.TableValueProvider(
                     getattr(runtime_options, c._OPTION_OUTPUT_BQ_PROJECT),
                     getattr(runtime_options, c._OPTION_OUTPUT_BQ_DATASET),
                     'prediction_by_customer'),
                 custom_gcs_temp_location=getattr(runtime_options,
                                                  c._OPTION_TEMP_GCS_LOCATION),
                 validate=False,
                 schema='customer_id:STRING, p_alive:FLOAT64'
                 ', predicted_purchases:FLOAT64'
                 ', future_aov:FLOAT64, historical_aov:FLOAT64'
                 ', expected_value:FLOAT64, frequency:INT64'
                 ', recency:FLOAT64'
                 ', total_time_observed:FLOAT64, segment:INT64'
                 ', extra_dimension:STRING',
                 write_disposition=io.BigQueryDisposition.WRITE_TRUNCATE,
                 create_disposition=io.BigQueryDisposition.CREATE_IF_NEEDED))

        prediction_summary_temp = (
            prediction_by_customer
            | beam.Map(lambda x: (x[9], x))  # key: segment
            | 'Group customer predictions by segment' >> beam.GroupByKey()
            | beam.FlatMap(
                c.generate_prediction_summary, pvalue.AsSingleton(
                    options))  # (segment, average_retention_probability,
            #  average_predicted_customer_value,
            #  average_predicted_order_value,
            #  average_predicted_purchases, total_customer_value,
            #  number_of_customers)
        )

        tot_equity = (
            prediction_summary_temp
            | beam.Map(lambda x: x[5])  # total_customer_value
            | beam.CombineGlobally(sum))

        prediction_summary = (
            prediction_summary_temp
            | beam.FlatMap(
                c.calculate_perc_of_total_customer_value,
                pvalue.AsSingleton(tot_equity), pvalue.AsSingleton(
                    options))  # (segment, average_retention_probability,
            #  average_predicted_customer_value,
            #  average_predicted_order_value,
            #  average_predicted_purchases,
            #  total_customer_value, number_of_customers,
            #  perc_of_total_customer_value)
        )

        _ = (
            prediction_summary
            | 'prediction_summary to Dict' >> beam.Map(c.list_to_dict, [
                'segment', 'average_retention_probability',
                'average_predicted_customer_value',
                'average_predicted_order_value', 'average_predicted_purchases',
                'total_customer_value', 'number_of_customers',
                'perc_of_total_customer_value'
            ])
            | 'Write to prediction_summary table' >> io.WriteToBigQuery(
                table=c.TableValueProvider(
                    getattr(runtime_options, c._OPTION_OUTPUT_BQ_PROJECT),
                    getattr(runtime_options, c._OPTION_OUTPUT_BQ_DATASET),
                    'prediction_summary'),
                custom_gcs_temp_location=getattr(runtime_options,
                                                 c._OPTION_TEMP_GCS_LOCATION),
                validate=False,
                schema='segment:INT64, average_retention_probability:FLOAT64'
                ', average_predicted_customer_value:FLOAT64'
                ', average_predicted_order_value:FLOAT64'
                ', average_predicted_purchases:FLOAT64'
                ', total_customer_value:FLOAT64'
                ', number_of_customers:FLOAT64'
                ', perc_of_total_customer_value:FLOAT64',
                write_disposition=io.BigQueryDisposition.WRITE_TRUNCATE,
                create_disposition=io.BigQueryDisposition.CREATE_IF_NEEDED))

        prediction_summary_extra_dimension = (
            prediction_by_customer
            | 'Discard prediction if there is no extra dimension' >>
            beam.FlatMap(c.discard_if_no_extra_dimension,
                         pvalue.AsSingleton(options))
            | beam.Map(lambda x: (x[10], x))  # extra dimension
            | 'Group customer predictions by extra dimension' >>
            beam.GroupByKey()
            | beam.FlatMap(c.generate_prediction_summary_extra_dimension,
                           pvalue.AsSingleton(tot_equity),
                           pvalue.AsSingleton(options)))

        _ = (prediction_summary_extra_dimension
             | 'prediction_summary_extra_dimension to Dict' >> beam.Map(
                 c.list_to_dict, [
                     'extra_dimension', 'average_retention_probability',
                     'average_predicted_customer_value',
                     'average_predicted_order_value',
                     'average_predicted_purchases', 'total_customer_value',
                     'number_of_customers', 'perc_of_total_customer_value'
                 ])
             | 'Write to prediction_summary_extra_dimension table' >>
             io.WriteToBigQuery(
                 table=c.TableValueProvider(
                     getattr(runtime_options, c._OPTION_OUTPUT_BQ_PROJECT),
                     getattr(runtime_options, c._OPTION_OUTPUT_BQ_DATASET),
                     'prediction_summary_extra_dimension'),
                 custom_gcs_temp_location=getattr(runtime_options,
                                                  c._OPTION_TEMP_GCS_LOCATION),
                 validate=False,
                 schema='extra_dimension:STRING'
                 ', average_retention_probability:FLOAT64'
                 ', average_predicted_customer_value:FLOAT64'
                 ', average_predicted_order_value:FLOAT64'
                 ', average_predicted_purchases:FLOAT64'
                 ', total_customer_value:FLOAT64'
                 ', number_of_customers:INT64'
                 ', perc_of_total_customer_value:FLOAT64',
                 write_disposition=io.BigQueryDisposition.WRITE_TRUNCATE,
                 create_disposition=io.BigQueryDisposition.CREATE_IF_NEEDED))
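
# A minimal, self-contained sketch (illustrative names only, not part of the
# pipeline above) of the side-input pattern used repeatedly here: compute a
# global aggregate once, then read it inside another transform via
# pvalue.AsSingleton to derive a per-element share.
import apache_beam as beam
from apache_beam import pvalue

with beam.Pipeline() as sketch_pipeline:
    values = sketch_pipeline | beam.Create([10.0, 30.0, 60.0])
    # Single-element PCollection holding the global total (100.0).
    total = values | beam.CombineGlobally(sum)
    shares = values | beam.Map(
        # tot arrives as a plain float, extracted from the singleton side input.
        lambda v, tot: v / tot if tot else 0.0,
        pvalue.AsSingleton(total))
    shares | beam.Map(print)  # 0.1, 0.3 and 0.6, in any order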
Beispiel #18
0
def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the pipeline."""

    parser = argparse.ArgumentParser()
    parser.add_argument('--from-bigquery',
                        dest='from_bigquery',
                        const=True,
                        default=False,
                        nargs='?',
                        help='Whether to load from BigQuery')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        # CHANGE 2/5: (OPTIONAL) Change this to DataflowRunner to
        # run your pipeline on the Google Cloud Dataflow Service.
        '--runner=DirectRunner',
        # CHANGE 3/5: Your project ID is required in order to run your pipeline on
        # the Google Cloud Dataflow Service.
        '--project=aou-res-curation-test',
        # CHANGE 4/5: Your Google Cloud Storage path is required for staging local
        # files.
        '--staging_location=gs://YOUR_BUCKET_NAME/AND_STAGING_DIRECTORY',
        # CHANGE 5/5: Your Google Cloud Storage path is required for temporary
        # files.
        '--temp_location=gs://YOUR_BUCKET_NAME/AND_TEMP_DIRECTORY',
        '--job_name=your-wordcount-job',
    ])

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session
    with beam.Pipeline(options=pipeline_options) as p:

        # Read all of the EHR inputs, into a dictionary of:
        #   table -> site_name -> PCollection of table rows
        ehr_inputs = {}
        for tbl in ['person', 'measurement', 'condition_occurrence']:
            ehr_inputs[tbl] = {}
            for site in ['nyc', 'pitt']:
                if known_args.from_bigquery:
                    ehr_inputs[tbl][site] = (p | f"{site}_{tbl}" >> beam.io.Read(
                        beam.io.BigQuerySource(
                            query=
                            f"SELECT * FROM `aou-res-curation-test.calbach_prototype.{site}_{tbl}`",
                            use_standard_sql=True)))
                else:
                    ehr_inputs[tbl][site] = (
                        p
                        | f"read {site}_{tbl}" >>
                        ReadFromText(f"../test_data/{site}/{tbl}.json")
                        | f"{site}_{tbl} from JSON" >> beam.Map(json.loads))

        # Merge tables across all sites, resulting in:
        #  table -> PCollection of table rows
        # Question: How should these ID spaces be reconciled?
        combined_by_domain = {}
        for tbl, data_by_site in ehr_inputs.items():
            combined_by_domain[tbl] = (
                tuple(data_by_site.values())
                | f"ehr merge for {tbl}" >> beam.Flatten())

        # 1. Move data from person table elsewhere.

        # Transform person rows, generate new measurement rows.
        combined_by_domain["person"], extracted_meas_rows = (
            combined_by_domain["person"]
            | beam.ParDo(ExtractDobAsMeasurement()).with_outputs(
                ExtractDobAsMeasurement.OUTPUT_TAG_MEASUREMENT, main='person'))

        # Merge the new measurement rows into the larger collection.
        combined_by_domain["measurement"] = (
            combined_by_domain["measurement"],
            extracted_meas_rows) | beam.Flatten()

        # 2. Perform a row-level table transform
        combined_by_domain["condition_occurrence"] = (
            combined_by_domain["condition_occurrence"]
            | beam.Map(clamp_condition_start_datetime))

        # 3. Retract participants by ID.
        person_id_blacklist = (
            combined_by_domain['person']
            | beam.Map(lambda p: p['person_id'])
            # Simulates more complex criteria here, likely involving other tables.
            | "generate the person ID blacklist" >>
            beam.Filter(lambda pid: int(pid) % 2 == 0)
            | beam.Map(lambda pid: (pid, True))
            | beam.combiners.ToDict())
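        # beam.combiners.ToDict() yields a single dict element mapping
        # person_id -> True, consumed below as a singleton side input.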

        # Drop all data for blacklisted participants from all tables.
        for (domain, data) in combined_by_domain.items():
            combined_by_domain[domain] = (
                data
                | beam.Filter(
                    filter_by_id,
                    blacklist=pvalue.AsSingleton(person_id_blacklist)))
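        # filter_by_id is defined elsewhere in this module; a minimal sketch of
        # what it presumably looks like (assumed, not shown here):
        #
        #   def filter_by_id(row, blacklist):
        #       # Keep the row only if its person_id is not a blacklisted key.
        #       return row['person_id'] not in blacklist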

        # 4. Group-by-participant transforms, e.g. remove duplicate measurements
        combined_by_domain['measurement'] = (
            combined_by_domain['measurement']
            # Define unique rows as person+measurement concept ID.
            | beam.Map(lambda m:
                       ((m["person_id"], m["measurement_concept_id"]), m))
            # We don't care which duplicate we keep; comparing the row JSON gives
            # a deterministic result.
            | beam.combiners.Top.PerKey(1, key=lambda m: str(m))
            # Top.PerKey(1) emits (key, [row]) pairs; unwrap back to plain rows.
            | beam.FlatMap(lambda kv: kv[1]))

        # XXX: Need to figure out how ID generation is meant to work here. That will impact
        # how we go about creating the mapping tables.
        # Initial idea is that we likely attach some payload to the in-flight representation
        # of a row.

        for domain, data in combined_by_domain.items():
            data | f"output for {domain}" >> beam.io.WriteToText(
                f"out/{domain}.txt")
Beispiel #19
0
    def expand(self, pcoll):
        p = pcoll.pipeline

        load_job_name_pcv = pvalue.AsSingleton(
            p
            | "ImpulseJobName" >> beam.Create([None])
            | beam.Map(lambda _: _generate_load_job_name()))

        file_prefix_pcv = pvalue.AsSingleton(
            p
            | "CreateFilePrefixView" >> beam.Create([self._input_gs_location])
            | "GenerateFilePrefix" >> beam.Map(_generate_file_prefix))

        outputs = (
            pcoll
            |
            "ApplyGlobalWindow" >> beam.WindowInto(beam.window.GlobalWindows())
            | "AppendDestination" >> beam.ParDo(
                _AppendDestinationsFn(self.destination))
            | beam.ParDo(WriteRecordsToFile(
                max_files_per_bundle=self.max_files_per_bundle,
                max_file_size=self.max_file_size,
                coder=self.coder),
                         file_prefix=file_prefix_pcv).with_outputs(
                             WriteRecordsToFile.UNWRITTEN_RECORD_TAG,
                             WriteRecordsToFile.WRITTEN_FILE_TAG))

        # A PCollection of (destination, file) tuples. It lists files with records,
        # and the destination each file is meant to be imported into.
        destination_files_kv_pc = outputs[WriteRecordsToFile.WRITTEN_FILE_TAG]

        # A PCollection of (destination, record) tuples. These are later sharded,
        # grouped, and all records for each destination shard are written to files.
        # This PCollection is necessary because not all records can be written into
        # files in ``WriteRecordsToFile``.
        unwritten_records_pc = outputs[WriteRecordsToFile.UNWRITTEN_RECORD_TAG]

        more_destination_files_kv_pc = (
            unwritten_records_pc
            | beam.ParDo(_ShardDestinations())
            | "GroupShardedRows" >> beam.GroupByKey()
            | "DropShardNumber" >> beam.Map(lambda x: (x[0][0], x[1]))
            | "WriteGroupedRecordsToFile" >> beam.ParDo(
                WriteGroupedRecordsToFile(coder=self.coder),
                file_prefix=file_prefix_pcv))

        all_destination_file_pairs_pc = (
            (destination_files_kv_pc, more_destination_files_kv_pc)
            | "DestinationFilesUnion" >> beam.Flatten())

        grouped_files_pc = (
            all_destination_file_pairs_pc
            | "GroupFilesByTableDestinations" >> beam.GroupByKey())

        # Load jobs are triggered into temporary tables, and those are later copied
        # to the actual destination table. This ensures atomicity when only some of
        # the load jobs fail: if any of them fails, the copy jobs are not triggered.
        trigger_loads_outputs = (grouped_files_pc | beam.ParDo(
            TriggerLoadJobs(schema=self.schema,
                            write_disposition=self.write_disposition,
                            create_disposition=self.create_disposition,
                            test_client=self.test_client,
                            temporary_tables=self.temp_tables),
            load_job_name_pcv).with_outputs(TriggerLoadJobs.TEMP_TABLES,
                                            main='main'))

        destination_job_ids_pc = trigger_loads_outputs['main']
        temp_tables_pc = trigger_loads_outputs[TriggerLoadJobs.TEMP_TABLES]

        destination_copy_job_ids_pc = (
            p
            | "ImpulseMonitorLoadJobs" >> beam.Create([None])
            | "WaitForLoadJobs" >> beam.ParDo(
                WaitForBQJobs(self.test_client),
                beam.pvalue.AsList(destination_job_ids_pc))
            | beam.ParDo(
                TriggerCopyJobs(create_disposition=self.create_disposition,
                                write_disposition=self.write_disposition,
                                temporary_tables=self.temp_tables,
                                test_client=self.test_client),
                load_job_name_pcv))

        finished_copy_jobs_pc = (
            p
            | "ImpulseMonitorCopyJobs" >> beam.Create([None])
            | "WaitForCopyJobs" >> beam.ParDo(
                WaitForBQJobs(self.test_client),
                beam.pvalue.AsList(destination_copy_job_ids_pc)))
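        # The single-element "impulse" plus the AsList side input ensures
        # WaitForBQJobs runs exactly once, and only after all copy job ids have
        # been produced.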

        _ = (finished_copy_jobs_pc
             | "RemoveTempTables/PassTables" >> beam.FlatMap(
                 lambda x, deleting_tables: deleting_tables,
                 pvalue.AsIter(temp_tables_pc))
             | "RemoveTempTables/DeduplicateTables" >> Count.PerElement()
             | "RemoveTempTables/GetTableNames" >> beam.Map(lambda elm: elm[0])
             | "RemoveTempTables/Delete" >> beam.ParDo(DeleteTablesFn()))

        return {
            self.DESTINATION_JOBID_PAIRS: destination_job_ids_pc,
            self.DESTINATION_FILE_PAIRS: all_destination_file_pairs_pc,
            self.DESTINATION_COPY_JOBID_PAIRS: destination_copy_job_ids_pc,
        }
def run(argv=None):
    """Main entry point; defines and runs the wordcount pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    # Read the text file[pattern] into a PCollection.
    lines = p | 'read' >> ReadFromText(known_args.input, coder=JsonCoder())

    results = (
        lines
        | 'Populate' >> beam.ParDo(Populate()).with_outputs('monitorsPC',
                                                            'infoPC',
                                                            'testsuitesPC',
                                                            'useractionsPC',
                                                            'campaignPC',
                                                            main='eventsPC'))
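    # Populate() routes each parsed record to one of the tagged outputs
    # (monitorsPC, infoPC, testsuitesPC, useractionsPC, campaignPC); records it
    # does not tag go to the main output, exposed below as results.eventsPC.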

    useractionsPColl = results.useractionsPC
    useractionsKPI = (
        useractionsPColl
        | 'KV UA uniqTcId' >> beam.Map(lambda x: (x['uniqTcId'], x))
        | 'group UA uniqTcId' >> beam.GroupByKey()
        | 'KV UA name' >> beam.ParDo(KVForUAByName("name"))
        | 'group UA name' >> beam.GroupByKey()
        | 'Filter TestFileTransfer' >>
        beam.Filter(lambda kv: kv[0] == 'TestFileTransfer')
        | 'LOG ua' >> beam.ParDo(LogLen()))

    eventsPColl = results.eventsPC
    eventGroup = (eventsPColl
                  | 'KV Event' >> beam.Map(lambda x: (x['name'], x))
                  | 'group Event' >> beam.GroupByKey())

    eventKPI = (
        eventGroup
        | 'Filter batt' >>
        beam.Filter(lambda kv: kv[0] == 'batteryLevelRemaining')
        #              | 'LOG event' >> beam.ParDo(Log())
    )

    eventSort = (
        eventGroup
        | 'SortAndComplete' >> beam.ParDo(SortAndComplete())
        | 'Filter RAT' >> beam.Filter(lambda kv: kv[0] == 'psRatChd')
        | 'KpiEventUserAction' >> beam.FlatMap(
            KpiEventUserActionbeam, ua=pvalue.AsSingleton(useractionsKPI)))
    '''
    monitorsPColl = results.monitorsPC
    monitorsKPI = (monitorsPColl
                   | 'LOG monitors' >> beam.ParDo(Log()))
    infoPColl = results.infoPC
    infoKPI = (infoPColl
               | 'LOG info' >> beam.ParDo(Log()))
    testsuitesPColl = results.testsuitesPC
    testsuitesKPI = (testsuitesPColl
                     | 'LOG testsuite' >> beam.ParDo(Log()))
    campaignPColl = results.campaignPC
    campaignKPI = (campaignPColl
                   | 'LOG campaign' >> beam.ParDo(Log()))
    '''

    result = p.run()
    result.wait_until_finish()
Beispiel #21
0
    """平均以上の文字数を持つ文字列をフィルタリングする."""
    def __init__(self):
        super(FilterAboveMeanLengthFn, self).__init__()

    def process(self, element, mean_word_length):
        if element >= mean_word_length:
            yield element


if __name__ == '__main__':
    p = beam.Pipeline(options=PipelineOptions())

    inputs = ["good morning.", "good afternoon.", "good evening."]

    # Main input
    word_lengths = (
        p
        | 'create inputs' >> beam.Create(inputs)
        | 'compute word length' >> beam.Map(lambda element: len(element)))

    # Side input
    mean_word_length = word_lengths | 'compute mean word length' >> beam.CombineGlobally(
        beam.combiners.MeanCombineFn())

    (word_lengths
     | 'filter above mean length' >> beam.ParDo(
         FilterAboveMeanLengthFn(), pvalue.AsSingleton(mean_word_length))
     | 'write to text' >> beam.io.WriteToText("./output.txt"))

    p.run().wait_until_finish()
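
# Note: mean_word_length always holds exactly one element here, which is what
# pvalue.AsSingleton requires; in general it fails at runtime if the side-input
# PCollection has more than one element, and an empty PCollection can be
# handled by passing a default, e.g.
# pvalue.AsSingleton(mean_word_length, default_value=0.0) (illustrative value).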