def expand(self, inputs):
    pcoll, = inputs
    if self._top_k is not None and self._top_k < 0:
      raise ValueError('top_k for VocabularyImpl should be >= 0 or None, got '
                       '{}.'.format(self._top_k))
    if self._frequency_threshold is not None and self._frequency_threshold < 0:
      raise ValueError(
          'frequency_threshold for VocabularyImpl should be >= 0 or None, '
          'got {}.'.format(self._frequency_threshold))
    if self._coverage_top_k is not None and self._coverage_top_k < 0:
      raise ValueError('coverage_top_k for VocabularyImpl should be >= 0 or '
                       'None, got {}.'.format(self._coverage_top_k))
    if (self._coverage_frequency_threshold is not None and
        self._coverage_frequency_threshold < 0):
      raise ValueError(
          'coverage_frequency_threshold for VocabularyImpl should be >= 0 or '
          'None, got {}.'.format(self._coverage_frequency_threshold))

    # Creates a PCollection of (count, element) pairs, then iterates over
    # this to create a single element PCollection containing this list of
    # pairs in sorted order by decreasing counts (and by values for equal
    # counts).

    def is_problematic_string(kv):
      string, _ = kv  # Ignore counts.
      return string and b'\n' not in string and b'\r' not in string

    if (self._vocab_ordering_type ==
        tf_utils.VocabOrderingType.WEIGHTED_MUTUAL_INFORMATION):
      flatten_map_fn = _flatten_to_key_and_means_accumulator_list
      combine_transform = _MutualInformationTransform(  # pylint: disable=no-value-for-parameter
          self._use_adjusted_mutual_info, self._min_diff_from_avg)
    elif (self._vocab_ordering_type ==
          tf_utils.VocabOrderingType.WEIGHTED_FREQUENCY):
      flatten_map_fn = _flatten_value_and_weights_to_list_of_tuples
      combine_transform = beam.CombinePerKey(sum)
    else:
      flatten_map_fn = _flatten_value_to_list
      combine_transform = beam.combiners.Count.PerElement()

    raw_counts = (
        pcoll
        | 'FlattenStringsAndMaybeWeightsLabels' >> beam.FlatMap(flatten_map_fn)
        | 'CountPerString' >> combine_transform
        | 'FilterProblematicStrings' >> beam.Filter(is_problematic_string)
        | 'SwapStringsAndCounts' >> beam.KvSwap())

    counts = (
        raw_counts | 'ApplyFrequencyThresholdAndTopK' >> (
            _ApplyFrequencyThresholdAndTopK(  # pylint: disable=no-value-for-parameter
                self._frequency_threshold, self._top_k, None)))

    if self._key_fn:
      coverage_counts = (
          raw_counts | 'ApplyCoverageFrequencyThresholdAndTopK' >> (
              _ApplyFrequencyThresholdAndTopK(  # pylint: disable=no-value-for-parameter
                  self._coverage_frequency_threshold, self._coverage_top_k,
                  self._key_fn)))

      counts = (
          (counts, coverage_counts)
          | 'MergeStandardAndCoverageArms' >> beam.Flatten()
          | 'RemoveDuplicates' >> beam.RemoveDuplicates())

    return counts | 'WriteVocabFile' >> (
        _WriteVocabFile(  # pylint: disable=no-value-for-parameter
            self._base_temp_dir, self._vocab_filename, self._store_frequency))
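For orientation, here is a minimal, self-contained sketch of the same count/filter/swap/top-k flow above, using only core Beam primitives (_ApplyFrequencyThresholdAndTopK and _WriteVocabFile are internal tf.Transform transforms; the toy data and cutoffs below are assumptions):

import apache_beam as beam

FREQUENCY_THRESHOLD = 2  # keep tokens seen at least twice (assumed cutoff)
TOP_K = 3                # keep at most the 3 most frequent tokens (assumed)

with beam.Pipeline() as p:
  (p
   | beam.Create([b'a', b'b', b'a', b'c', b'a', b'b'])
   | 'CountPerString' >> beam.combiners.Count.PerElement()
   | 'SwapStringsAndCounts' >> beam.KvSwap()  # (string, count) -> (count, string)
   | 'ApplyFrequencyThreshold' >> beam.Filter(
       lambda kv: kv[0] >= FREQUENCY_THRESHOLD)
   | 'TopK' >> beam.combiners.Top.Largest(TOP_K)
   | 'FlattenList' >> beam.FlatMap(lambda kvs: kvs)
   | beam.Map(print))  # prints (3, b'a') and (2, b'b')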
Example #2
  def testMultiClassConfusionMatrixPlotWithStringLabels(self):
    computations = (
        multi_class_confusion_matrix_plot.MultiClassConfusionMatrixPlot()
        .computations(example_weighted=True))
    matrices = computations[0]
    plot = computations[1]

    # Examples from b/149558504.
    example1 = {
        'labels': np.array([['unacc']]),
        'predictions': {
            'probabilities':
                np.array([[
                    1.0000000e+00, 6.9407083e-24, 2.7419115e-38, 0.0000000e+00
                ]]),
            'all_classes':
                np.array([['unacc', 'acc', 'vgood', 'good']]),
        },
        'example_weights': np.array([0.5])
    }
    example2 = {
        'labels': np.array([['vgood']]),
        'predictions': {
            'probabilities': np.array([[0.2, 0.3, 0.4, 0.1]]),
            'all_classes': np.array([['unacc', 'acc', 'vgood', 'good']]),
        },
        'example_weights': np.array([1.0])
    }

    with beam.Pipeline() as pipeline:
      # pylint: disable=no-value-for-parameter
      result = (
          pipeline
          | 'Create' >> beam.Create([example1, example2])
          | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
          | 'AddSlice' >> beam.Map(lambda x: ((), x))
          | 'ComputeMatrices' >> beam.CombinePerKey(matrices.combiner)
          | 'ComputePlot' >> beam.Map(lambda x: (x[0], plot.result(x[1]))))

      # pylint: enable=no-value-for-parameter

      def check_result(got):
        try:
          self.assertLen(got, 1)
          got_slice_key, got_plots = got[0]
          self.assertEqual(got_slice_key, ())
          self.assertLen(got_plots, 1)
          key = metric_types.PlotKey(
              name='multi_class_confusion_matrix_plot', example_weighted=True)
          got_matrix = got_plots[key]
          self.assertProtoEquals(
              """
              matrices {
                threshold: 0.0
                entries {
                  actual_class_id: 0
                  predicted_class_id: 0
                  num_weighted_examples: 0.5
                }
                entries {
                  actual_class_id: 2
                  predicted_class_id: 2
                  num_weighted_examples: 1.0
                }
              }
          """, got_matrix)

        except AssertionError as err:
          raise util.BeamAssertException(err)

      util.assert_that(result, check_result, label='result')
Example #3
    def testQueryStatistics(self):
        metrics = query_statistics.QueryStatistics().computations(
            query_key='query')[0]

        query1_example1 = {
            'labels': np.array([1.0]),
            'predictions': np.array([0.2]),
            'example_weights': np.array([1.0]),
            'features': {
                'query': np.array(['query1']),
                'gain': np.array([1.0])
            }
        }
        query1_example2 = {
            'labels': np.array([0.0]),
            'predictions': np.array([0.8]),
            'example_weights': np.array([1.0]),
            'features': {
                'query': np.array(['query1']),
                'gain': np.array([0.5])
            }
        }
        query2_example1 = {
            'labels': np.array([0.0]),
            'predictions': np.array([0.5]),
            'example_weights': np.array([2.0]),
            'features': {
                'query': np.array(['query2']),
                'gain': np.array([0.5])
            }
        }
        query2_example2 = {
            'labels': np.array([1.0]),
            'predictions': np.array([0.9]),
            'example_weights': np.array([2.0]),
            'features': {
                'query': np.array(['query2']),
                'gain': np.array([1.0])
            }
        }
        query2_example3 = {
            'labels': np.array([0.0]),
            'predictions': np.array([0.1]),
            'example_weights': np.array([2.0]),
            'features': {
                'query': np.array(['query2']),
                'gain': np.array([0.1])
            }
        }
        query3_example1 = {
            'labels': np.array([1.0]),
            'predictions': np.array([0.9]),
            'example_weights': np.array([3.0]),
            'features': {
                'query': np.array(['query3']),
                'gain': np.array([1.0])
            }
        }
        examples = [
            tfma_util.merge_extracts([query1_example1, query1_example2]),
            tfma_util.merge_extracts(
                [query2_example1, query2_example2, query2_example3]),
            tfma_util.merge_extracts([query3_example1])
        ]

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            result = (pipeline
                      | 'Create' >> beam.Create(examples)
                      | 'Process' >> beam.Map(
                          metric_util.to_standard_metric_inputs, True)
                      | 'AddSlice' >> beam.Map(lambda x: ((), x))
                      | 'Combine' >> beam.CombinePerKey(metrics.combiner))

            # pylint: enable=no-value-for-parameter

            def check_result(got):
                try:
                    self.assertLen(got, 1)
                    got_slice_key, got_metrics = got[0]
                    self.assertEqual(got_slice_key, ())
                    total_queries_key = metric_types.MetricKey(
                        name='total_queries')
                    total_documents_key = metric_types.MetricKey(
                        name='total_documents')
                    min_documents_key = metric_types.MetricKey(
                        name='min_documents')
                    max_documents_key = metric_types.MetricKey(
                        name='max_documents')
                    self.assertDictElementsAlmostEqual(got_metrics, {
                        total_queries_key: 3,
                        total_documents_key: 6,
                        min_documents_key: 1,
                        max_documents_key: 3
                    },
                                                       places=5)

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(result, check_result, label='result')
Example #4
 def expand(self, pcoll):
     return (pcoll
             | beam.Map(lambda elem: (elem[self.field], elem['score']))
             | beam.CombinePerKey(sum))
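For context, here is a sketch of the complete PTransform that the expand() above plausibly belongs to, plus a tiny pipeline applying it (the class name SumScoresByField and the sample records are assumptions):

import apache_beam as beam

class SumScoresByField(beam.PTransform):  # hypothetical name
  def __init__(self, field):
    super().__init__()
    self.field = field

  def expand(self, pcoll):
    return (pcoll
            | beam.Map(lambda elem: (elem[self.field], elem['score']))
            | beam.CombinePerKey(sum))

with beam.Pipeline() as p:
  (p
   | beam.Create([{'team': 'red', 'score': 3},
                  {'team': 'red', 'score': 2},
                  {'team': 'blue', 'score': 5}])
   | SumScoresByField('team')
   | beam.Map(print))  # ('red', 5), ('blue', 5)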
  def testFlipCountWitEvalConfig(self):
    eval_config = text_format.Parse(
        """
        model_specs: {
          name: "original"
        }
        model_specs: {
          name: "counterfactual"
          is_baseline: true
        }
        """, config_pb2.EvalConfig())
    computations = flip_count.FlipCount(
        thresholds=[0.3], example_id_key='example_id_key').computations(
            eval_config=eval_config,
            example_weighted=True,
            model_names=['original', 'counterfactual'],
            output_names=[''])
    binary_confusion_matrix = computations[0]
    matrices = computations[1]
    metrics = computations[2]
    original_model_name = 'original'
    counterfactual_model_name = 'counterfactual'
    examples = [
        {
            'labels': None,
            'predictions': {
                original_model_name: np.array([0.5]),
                counterfactual_model_name: np.array([0.7]),
            },
            'example_weights': np.array([1.0]),
            'features': {
                'example_id_key': np.array(['id_1']),
            },
        },
        {
            'labels': None,
            'predictions': {
                original_model_name: np.array([0.1, 0.7]),  # to test flattening
                counterfactual_model_name: np.array([1.0, 0.1])
            },
            'example_weights': np.array([3.0]),
            'features': {
                'example_id_key': np.array(['id_2']),
            },
        },
        {
            'labels': None,
            'predictions': {
                original_model_name: np.array([0.5, 0.2]),
                counterfactual_model_name: np.array([0.2, 0.4]),
            },
            'example_weights': np.array([2.0]),
            'features': {
                'example_id_key': np.array(['id_3']),
            },
        },
        {
            'labels': None,
            'predictions': {
                original_model_name: np.array([0.2, 0.1]),
                counterfactual_model_name: np.array([0.4, 0.5]),
            },
            'example_weights': np.array([1.0]),
            'features': {
                'example_id_key': np.array(['id_4']),
            },
        }
    ]

    with beam.Pipeline() as pipeline:
      # pylint: disable=no-value-for-parameter
      result = (
          pipeline
          | 'Create' >> beam.Create(examples)
          | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs, True)
          | 'AddSlice' >> beam.Map(lambda x: ((), x))
          | 'ComputeBinaryConfusionMatrix' >> beam.CombinePerKey(
              binary_confusion_matrix.combiner)
          |
          'ComputeMatrices' >> beam.Map(lambda x: (x[0], matrices.result(x[1])))
          |
          'ComputeMetrics' >> beam.Map(lambda x: (x[0], metrics.result(x[1]))))

      # pylint: enable=no-value-for-parameter
      def check_result(got):
        try:
          self.assertLen(got, 1)
          got_slice_key, got_metrics = got[0]
          self.assertEqual(got_slice_key, ())
          self.assertLen(got_metrics, 6)
          self.assertDictElementsAlmostEqual(
              got_metrics, {
                  metric_types.MetricKey(
                      name='flip_count/[email protected]',
                      model_name='original',
                      example_weighted=True):
                      5.0,
                  metric_types.MetricKey(
                      name='flip_count/[email protected]',
                      model_name='original',
                      example_weighted=True):
                      7.0,
                  metric_types.MetricKey(
                      name='flip_count/[email protected]',
                      model_name='original',
                      example_weighted=True):
                      6.0,
                  metric_types.MetricKey(
                      name='flip_count/[email protected]',
                      model_name='original',
                      example_weighted=True):
                      7.0,
              })
          self.assertAllEqual(
              got_metrics[metric_types.MetricKey(
                  name='flip_count/[email protected]',
                  model_name='original',
                  example_weighted=True)], np.array([['id_2'], ['id_3']]))
          self.assertAllEqual(
              got_metrics[metric_types.MetricKey(
                  name='flip_count/[email protected]',
                  model_name='original',
                  example_weighted=True)],
              np.array([['id_2'], ['id_3'], ['id_4']]))
        except AssertionError as err:
          raise util.BeamAssertException(err)

      util.assert_that(result, check_result, label='result')
Example #6
 def expand(self, pcoll):
     return (pcoll
             | beam.Map(lambda info: (info[self.field], info['score']))
             | beam.CombinePerKey(sum_ints))
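sum_ints is not defined in this snippet; since beam.CombinePerKey hands a plain-function combiner an iterable of each key's values (like the builtin sum used elsewhere in these examples), a plausible stand-in (an assumption) is:

def sum_ints(values):
  # CombinePerKey calls this with an iterable of the values for one key.
  return sum(values)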
  def testConfusionMatrixAtThresholds(self):
    computations = confusion_matrix_metrics.ConfusionMatrixAtThresholds(
        thresholds=[0.3, 0.5, 0.8]).computations()
    histogram = computations[0]
    matrices = computations[1]
    metrics = computations[2]

    example1 = {
        'labels': np.array([0.0]),
        'predictions': np.array([0.0]),
        'example_weights': np.array([1.0]),
    }
    example2 = {
        'labels': np.array([0.0]),
        'predictions': np.array([0.5]),
        'example_weights': np.array([1.0]),
    }
    example3 = {
        'labels': np.array([1.0]),
        'predictions': np.array([0.3]),
        'example_weights': np.array([1.0]),
    }
    example4 = {
        'labels': np.array([1.0]),
        'predictions': np.array([0.9]),
        'example_weights': np.array([1.0]),
    }

    with beam.Pipeline() as pipeline:
      # pylint: disable=no-value-for-parameter
      result = (
          pipeline
          | 'Create' >> beam.Create([example1, example2, example3, example4])
          | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
          | 'AddSlice' >> beam.Map(lambda x: ((), x))
          | 'ComputeHistogram' >> beam.CombinePerKey(histogram.combiner)
          | 'ComputeMatrices' >> beam.Map(
              lambda x: (x[0], matrices.result(x[1])))  # pyformat: ignore
          | 'ComputeMetrics' >> beam.Map(lambda x: (x[0], metrics.result(x[1])))
      )  # pyformat: ignore

      # pylint: enable=no-value-for-parameter

      def check_result(got):
        try:
          self.assertLen(got, 1)
          got_slice_key, got_metrics = got[0]
          self.assertEqual(got_slice_key, ())
          self.assertLen(got_metrics, 1)
          key = metric_types.MetricKey(name='confusion_matrix_at_thresholds')
          self.assertIn(key, got_metrics)
          got_metric = got_metrics[key]
          self.assertProtoEquals(
              """
              matrices {
                threshold: 0.3
                false_negatives: 1.0
                true_negatives: 1.0
                false_positives: 1.0
                true_positives: 1.0
                precision: 0.5
                recall: 0.5
              }
              matrices {
                threshold: 0.5
                false_negatives: 1.0
                true_negatives: 2.0
                true_positives: 1.0
                precision: 1.0
                recall: 0.5
              }
              matrices {
                threshold: 0.8
                false_negatives: 1.0
                true_negatives: 2.0
                true_positives: 1.0
                precision: 1.0
                recall: 0.5
              }
          """, got_metric)

        except AssertionError as err:
          raise util.BeamAssertException(err)

      util.assert_that(result, check_result, label='result')
Example #8
 def expand(self, input_or_inputs):
     return (input_or_inputs | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
             | beam.CombinePerKey(sum))
Example #9
@beam.ptransform_fn
def count_decorated_fn(input_or_inputs):
    """Count as a decorated function."""
    return (input_or_inputs | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
            | beam.CombinePerKey(sum))
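A usage sketch for the decorated transform above (the input data is made up); @beam.ptransform_fn lets the function be applied like any other composite transform:

import apache_beam as beam

with beam.Pipeline() as p:
  (p
   | beam.Create(['a', 'b', 'a'])
   | 'CountWords' >> count_decorated_fn()  # pylint: disable=no-value-for-parameter
   | beam.Map(print))  # ('a', 2), ('b', 1)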
p = beam.Pipeline()

# Input Pcollection
input_collection = (
    p
    | "Read from file" >>
    beam.io.ReadFromText('structure_of_beam_apps/dept-data.txt')
    | "Split rows" >> beam.Map(SplitRow))

# Accounts dept pipeline branch
accounts_count = (input_collection
                  | "Get all Accounts dept person" >>
                  beam.Filter(lambda record: record[3] == 'Accounts')
                  | "Pair each accounts employee name with 1" >>
                  beam.Map(lambda record: (record[1], 1))
                  | "Group and sum Accounts" >> beam.CombinePerKey(sum)
                  | "Write results for accounts" >>
                  beam.io.WriteToText('structure_of_beam_apps/data/Account'))

# HR dept pipeline branch
hr_count = (
    input_collection
    |
    "Get all HR dept persons" >> beam.Filter(lambda record: record[3] == 'HR')
    | "Pair each hr employee name with 1" >> beam.Map(lambda record:
                                                      (record[1], 1))
    | "Group and sum HR" >> beam.CombinePerKey(sum)
    | "Write results for HR" >>
    beam.io.WriteToText('structure_of_beam_apps/data/HR'))
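SplitRow is not shown here; given the record[1] (name) and record[3] (department) indexing above, a plausible stand-in (an assumption about the file layout) is:

def SplitRow(line):
  # dept-data.txt rows are assumed comma-separated, with the employee name in
  # column 1 and the department in column 3.
  return line.split(',')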

# Run pipeline
p.run().wait_until_finish()
Example #11
def run(argv=None, save_main_session=True):
  parser = argparse.ArgumentParser()

  parser.add_argument('--target_month_start',
      type=int,
      required=True,
      help='Start date of the expiration target month')
  parser.add_argument('--target_month_end',
      type=int,
      required=True,
      help='End date of the expiration target month')
  parser.add_argument('--user_logs_start',
      type=int,
      required=True,
      help='Start date for filtering users by activity')
  parser.add_argument('--dataset',
      type=str,
      required=True,
      help='Real or fake data')
  parser.add_argument('--credentials',
      type=str,
      required=True,
      help='Path to service account JSON')


  args, pipeline_args = parser.parse_known_args(argv)

  options = PipelineOptions(pipeline_args)

  # Set GCP credentials
  os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = args.credentials
  
  #"C:\\Users\\Owner\\Google Drive\\Northeastern\\DS 5500 Capstone\\ds5500-capstone\\amiable-octane-267022-061a6f297eeb.json"

  google_cloud_options = options.view_as(GoogleCloudOptions)
  gcs_project = google_cloud_options.project
  google_cloud_options.region = "us-east1"
  # TODO update as parameter
  google_cloud_options.staging_location = 'gs://pipeline_beam/temp' #'gs://arr-beam-test/temp'
  google_cloud_options.temp_location = 'gs://pipeline_beam/temp' #'gs://arr-beam-test/temp'
  
  # look into not using public IPs 
  # https://cloud.google.com/dataflow/docs/guides/specifying-exec-params
  # https://cloud.google.com/dataflow/docs/guides/specifying-networks#public_ip_parameter
  # no_use_public_ips: the public IPs parameter requires the Beam SDK for Python; the Dataflow SDK for Python does not support it.

  # We also require the --project option to access --dataset
  if google_cloud_options.project is None:
    parser.print_usage()
    print(sys.argv[0] + ': error: argument --project is required')
    sys.exit(1)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  options.view_as(SetupOptions).save_main_session = save_main_session

  with beam.Pipeline(options=options) as p:
      ## Find MSNOs with an expiration in the target month
      target_month_start = args.target_month_start
      target_month_end = args.target_month_end
      user_logs_start = args.user_logs_start

      ## Table params; fake dataset for testing purposes
      if args.dataset == "fake":
          project_input_dataset = gcs_project + ":beam."
          project_input_dataset_standard = gcs_project + ".beam."
          user_logs_tbl = "user_logs_fake" 
          users_tbl = "users_fake" 
          transactions_tbl = "transactions_fake" 
      else:
          project_input_dataset = gcs_project + ":kkbox."
          project_input_dataset_standard = gcs_project + ".kkbox."
          user_logs_tbl = "user_logs"
          users_tbl = "members" 
          transactions_tbl = "transactions" 
          
      project_output_dataset = gcs_project + ":kkbox."
      name_suffix = str(int(datetime.now().timestamp()))
      features_output_train_tbl = project_output_dataset + "output_train_" + name_suffix 
      features_output_val_tbl = project_output_dataset + "output_val_" + name_suffix 
      features_output_test_tbl = project_output_dataset + "output_test_" + name_suffix 

      # Read users from BQ
      users = (
          p | 'Query Users table in BQ' >> beam.io.Read(beam.io.BigQuerySource(table=project_input_dataset+users_tbl))
          | beam.ParDo(ParseUsersBQ(target_month_start))
          )

      # Read valid user transactions from BQ
      query_params = {'users_tbl':project_input_dataset_standard+users_tbl,
                'transactions_tbl':project_input_dataset_standard+transactions_tbl,
                'user_logs_tbl':project_input_dataset_standard+user_logs_tbl,
                'target_month_start':target_month_start,
                'target_month_end':target_month_end,
                'user_logs_start':user_logs_start}

      # Filter to include only transactions with users who had expirations in the target month
      transactions = (
          p | 'Query valid user transactions' >> beam.io.Read(beam.io.BigQuerySource(query = 
            """#standardSQL
            SELECT t.transaction_date,t.membership_expire_date,t.msno,t.payment_method_id,payment_plan_days,t.plan_list_price,t.actual_amount_paid,t.is_auto_renew,t.is_cancel
                from `{transactions_tbl}` t
            where t.msno in
                (SELECT MSNO from `{transactions_tbl}` where membership_expire_date >= {target_month_start} and membership_expire_date <= {target_month_end})
            """.format(**query_params), use_standard_sql=True)) | beam.ParDo(ParseTransactionsBQ())
          )

      user_log_features = (
          p | 'Query user_logs' >> beam.io.Read(beam.io.BigQuerySource(query = 
            """ #standardSQL
                SELECT msno,
                count(msno) as user_log_entries,
                avg(num_25) as avg_num_25,
                avg(num_50) as avg_num_50,
                avg(num_75) as avg_num_75,
                avg(num_985) as avg_num_985,
                avg(num_100) as avg_num_100,
                avg(num_unq) as avg_num_unq,
                sum(num_25) as sum_num_25,
                sum(num_50) as sum_num_50,
                sum(num_75) as sum_num_75,
                sum(num_985) as sum_num_985,
                sum(num_100) as sum_num_100,
                sum(num_unq) as sum_num_unq
                from `{user_logs_tbl}` 
                group by msno having min(CAST(date as INT64)) <= {user_logs_start}
                
            """.format(**query_params), use_standard_sql=True))
          | beam.ParDo(ParseUserLogsBQ())
          )
      
      ###
      # Generate Labels
      ###
      # TODO add these as cmd line arguments
      final_month_start = 20170301
      final_month_end = 20170331
      
      # Find users who did not churn based on transaction records to later assign 1 or 0
      not_churned_users = (
        transactions | 'Filter Transactions in Feb AND Expiration is not Feb' >> 
        beam.Filter(lambda elem: elem['transaction_date'] >= target_month_start 
                    and elem['transaction_date'] <= final_month_end 
                    and elem['membership_expire_date'] >= final_month_start
                    and elem['is_cancel'] == 0)
        | 'Create key-value pair with 0 for no churn' >> beam.Map(lambda elem: (elem['msno'],0)) 
            )
      
      ####
      ## Feature engineering
      ####

      # Always Auto-Renew: 1 if the user consistently has auto-renew on all transactions, 0 otherwise
      feature_autorenew = (
      transactions | beam.Map(lambda x: (x["msno"],x["is_auto_renew"])) | beam.CombinePerKey(min))

      # Discount amount mean
      feature_discount_mean = (
      transactions | beam.Map(lambda x: (x["msno"],x["discount_amount"])) | beam.CombinePerKey(AverageFn())
      )

      # Payment ID encoding
      feature_payment_id_encoded = (
          # Convert transactions into key-value pairs of (msno, one-hot-encoded payment method)
      transactions | beam.Map(lambda x: (x["msno"],OneHotPaymentId({"payment_method_id":x["payment_method_id"]})))
          | beam.CombinePerKey(SummarisePaymentId())
      )
      
      
      def print_fun(x):
          print(x)

      ##
      # Combine all data
      ##

      # Combine all user features with users and filter users with no data
      all_output = (
          {'user_demo': users, 
           'feature_user_log_counts':user_log_features,
           'feature_autorenew':feature_autorenew,
           'feature_discount_mean':feature_discount_mean,
           'feature_payment_id_encoded':feature_payment_id_encoded,
           'not_churned_users':not_churned_users} 
          | "Combine users, labels, and features" >> beam.CoGroupByKey()
          | "Filter out empty entries" >> beam.Filter(lambda x: len(x[1]["user_demo"]) > 0 and
                                                      len(x[1]["feature_user_log_counts"]) > 0 and
                                                      len(x[1]["feature_autorenew"]) > 0 and 
                                                      len(x[1]["feature_discount_mean"]) > 0 and
                                                      len(x[1]["feature_payment_id_encoded"]) > 0)
      | "Flatten to single dictionary for BQ" >> beam.Map(format_for_BQ) 
      )

      
      ## partition into train/val/test based on random float
      train, val, test = (all_output | beam.Partition(train_test_split, 3))
      
      # write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE causes the function to sleep for 150 seconds to wait for delete to finalize before write

      # output to BQ
      #features_output_schema
      val | "Validation output" >> beam.io.WriteToBigQuery(table=features_output_val_tbl,schema=features_output_schema)
      train | "Training output" >> beam.io.WriteToBigQuery(table=features_output_train_tbl,schema=features_output_schema)
      test | "Testing output" >> beam.io.WriteToBigQuery(table=features_output_test_tbl,schema=features_output_schema)
Example #12
  def expand(self, pcoll):
    top_k = self._spec.top_k
    frequency_threshold = self._spec.frequency_threshold
    assert top_k is None or top_k >= 0
    assert frequency_threshold is None or frequency_threshold >= 0

    # Creates a PCollection of (count, element) pairs, then iterates over
    # this to create a single element PCollection containing this list of
    # pairs in sorted order by decreasing counts (and by values for equal
    # counts).
    counts = (
        pcoll
        | 'FlattenValueToList' >> beam.Map(_flatten_value_to_list)
        | 'CountWithinList' >>
        # Specification of with_output_types allows for combiner optimizations.
        (beam.FlatMap(lambda lst: six.iteritems(collections.Counter(lst))).
         with_output_types(KV[common.PRIMITIVE_TYPE, int]))
        | 'CountGlobally' >> beam.CombinePerKey(sum))

    counts = (
        counts
        | 'FilterProblematicStrings' >> beam.Filter(
            lambda kv: kv[0] and '\n' not in kv[0] and '\r' not in kv[0])
        | 'SwapElementsAndCounts' >> beam.KvSwap())

    # Filter is cheaper than TopK computation and the two commute, so
    # filter first.
    if frequency_threshold is not None:
      counts |= ('FilterByFrequencyThreshold(%s)' % frequency_threshold >>
                 beam.Filter(lambda kv: kv[0] >= frequency_threshold))

    if top_k is not None:
      counts = (counts
                | 'Top(%s)' % top_k
                >> beam.transforms.combiners.Top.Largest(top_k)
                | 'FlattenList' >> beam.FlatMap(lambda lst: lst))

    # Performance optimization to obviate reading from finely sharded files
    # via AsIter. By forcing all data into a single group we end up reading
    # from a single file.
    #
    @beam.ptransform_fn
    def Reshard(pcoll):  # pylint: disable=invalid-name
      return (
          pcoll
          | 'PairWithNone' >> beam.Map(lambda x: (None, x))
          | 'GroupByNone' >> beam.GroupByKey()
          | 'ExtractValues' >> beam.FlatMap(lambda x: x[1]))
    counts |= 'ReshardToOneGroup' >> Reshard()  # pylint: disable=no-value-for-parameter

    # Using AsIter instead of AsList below in order to reduce max memory
    # usage (due to AsList caching).
    def order_by_decreasing_counts(ignored, counts_iter, store_frequency):
      """Sort the vocabulary by frequency count."""
      del ignored
      counts = list(counts_iter)
      if not counts:
        counts = [(1, '49d0cd50-04bb-48c0-bc6f-5b575dce351a')]
      counts.sort(reverse=True)  # Largest first.
      if store_frequency:
        # Returns ['count1 element1', ... ]
        return ['{} {}'.format(count, element) for count, element in counts]
      else:
        return [element for _, element in counts]

    vocabulary_file = os.path.join(self._temp_assets_dir,
                                   self._spec.vocab_filename)
    vocab_is_written = (
        pcoll.pipeline
        | 'Prepare' >> beam.Create([None])
        | 'OrderByDecreasingCounts' >> beam.FlatMap(
            order_by_decreasing_counts,
            counts_iter=beam.pvalue.AsIter(counts),
            store_frequency=self._spec.store_frequency)
        | 'WriteToFile' >> beam.io.WriteToText(vocabulary_file,
                                               shard_name_template=''))
    # Return the vocabulary path.
    wait_for_vocabulary_transform = (
        pcoll.pipeline
        | 'CreatePath' >> beam.Create([[vocabulary_file]])
        # Ensure that the analysis returns only after the file is written.
        | 'WaitForVocabularyFile' >> beam.Map(
            lambda x, y: x, y=beam.pvalue.AsIter(vocab_is_written)))
    return wait_for_vocabulary_transform
Example #13
def _MutualInformationTransformAccumulate(pcol):  # pylint: disable=invalid-name
    """Accumulates information needed for mutual information computation."""
    return (pcol | 'VocabCountPerLabelPerTokenAccumulate' >>
            beam.CombinePerKey(_WeightedMeanCombineFn()))
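_WeightedMeanCombineFn is internal to tf.Transform; as an illustration of the beam.CombineFn interface it implements, here is a minimal unweighted mean combiner with the same create/add/merge/extract shape (a sketch, not the tf.Transform implementation):

import apache_beam as beam

class MeanCombineFn(beam.CombineFn):
  """Tracks a (sum, count) accumulator and emits the mean."""

  def create_accumulator(self):
    return 0.0, 0

  def add_input(self, accumulator, element):
    total, count = accumulator
    return total + element, count + 1

  def merge_accumulators(self, accumulators):
    totals, counts = zip(*accumulators)
    return sum(totals), sum(counts)

  def extract_output(self, accumulator):
    total, count = accumulator
    return total / count if count else float('nan')

# e.g. pcoll | beam.CombinePerKey(MeanCombineFn())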
Example #14
def _ComputePerSlice(  # pylint: disable=invalid-name
    sliced_extracts: beam.pvalue.PCollection,
    computations: List[metric_types.MetricComputation],
    derived_computations: List[metric_types.DerivedMetricComputation],
    cross_slice_computations: List[metric_types.CrossSliceMetricComputation],
    cross_slice_specs: Optional[Iterable[config.CrossSlicingSpec]] = None,
    compute_with_sampling: Optional[bool] = False,
    num_jackknife_samples: int = 0,
    skip_ci_metric_keys: Set[metric_types.MetricKey] = frozenset(),
    random_seed_for_testing: Optional[int] = None,
    baseline_model_name: Optional[Text] = None) -> beam.pvalue.PCollection:  # pytype: disable=annotation-type-mismatch
    """PTransform for computing, aggregating and combining metrics and plots.

  Args:
    sliced_extracts: Incoming PCollection consisting of slice key and extracts.
    computations: List of MetricComputations.
    derived_computations: List of DerivedMetricComputations.
    cross_slice_computations: List of CrossSliceMetricComputation.
    cross_slice_specs: List of CrossSlicingSpec.
    compute_with_sampling: True to compute with bootstrap sampling. This allows
      _ComputePerSlice to be used to generate unsampled values from the whole
      data set, as well as bootstrap resamples, in which each element is treated
      as if it showed up p ~ Poisson(1) times.
    num_jackknife_samples: number of delete-d jackknife estimates to use in
      computing standard errors on metrics.
    skip_ci_metric_keys: List of metric keys for which to skip confidence
      interval computation.
    random_seed_for_testing: Seed to use for unit testing.
    baseline_model_name: Name for baseline model.

  Returns:
    PCollection of (slice key, dict of metrics).
  """
    # TODO(b/123516222): Remove this workaround per discussions in CL/227944001
    sliced_extracts.element_type = beam.typehints.Any

    def convert_and_add_derived_values(
        sliced_results: Tuple[slicer.SliceKeyType,
                              Tuple[metric_types.MetricsDict, ...]],
        derived_computations: List[metric_types.DerivedMetricComputation],
    ) -> Tuple[slicer.SliceKeyType, metric_types.MetricsDict]:
        """Converts per slice tuple of dicts into single dict and adds derived."""
        result = {}
        for v in sliced_results[1]:
            result.update(v)
        for c in derived_computations:
            result.update(c.result(result))
        return sliced_results[0], result

    def add_diff_metrics(
        sliced_metrics: Tuple[Union[slicer.SliceKeyType,
                                    slicer.CrossSliceKeyType],
                              Dict[metric_types.MetricKey, Any]],
        baseline_model_name: Optional[Text],
    ) -> Tuple[slicer.SliceKeyType, Dict[metric_types.MetricKey, Any]]:
        """Add diff metrics if there is a baseline model."""

        result = copy.copy(sliced_metrics[1])

        if baseline_model_name:
            diff_result = {}
            for k, v in result.items():
                if _is_private_metrics(k):
                    continue
                if k.model_name != baseline_model_name and k.make_baseline_key(
                        baseline_model_name) in result:
                    # Check if metric is diffable, skip plots and non-numerical values.
                    if _is_metric_diffable(v):
                        diff_result[k.make_diff_key()] = v - result[
                            k.make_baseline_key(baseline_model_name)]
            result.update(diff_result)

        # Remove private metrics
        _remove_private_metrics(result)

        return (sliced_metrics[0], result)

    combiner = _ComputationsCombineFn(
        computations=computations,
        compute_with_sampling=compute_with_sampling,
        random_seed_for_testing=random_seed_for_testing)
    if num_jackknife_samples:
        # We do not use the hot key fanout hint used by the non-jackknife path because
        # the random jackknife partitioning naturally mitigates hot keys.
        sliced_combiner_outputs = (
            sliced_extracts
            | 'JackknifeCombinePerSliceKey' >>
            jackknife.JackknifeCombinePerKey(combiner, num_jackknife_samples))
    else:
        sliced_combiner_outputs = (
            sliced_extracts
            | 'CombinePerSliceKey' >> beam.CombinePerKey(combiner).
            with_hot_key_fanout(_COMBINE_PER_SLICE_KEY_HOT_KEY_FANOUT))

    sliced_derived_values_and_diffs = (
        sliced_combiner_outputs
        | 'ConvertAndAddDerivedValues' >> beam.Map(
            convert_and_add_derived_values, derived_computations)
        | 'AddCrossSliceMetrics' >> _AddCrossSliceMetrics(  # pylint: disable=no-value-for-parameter
            cross_slice_specs, cross_slice_computations)
        | 'AddDiffMetrics' >> beam.Map(add_diff_metrics, baseline_model_name))

    if num_jackknife_samples:
        return (sliced_derived_values_and_diffs
                | 'MergeJackknifeSamples' >> jackknife.MergeJackknifeSamples(
                    num_jackknife_samples, skip_ci_metric_keys))
    else:
        return sliced_derived_values_and_diffs
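The with_hot_key_fanout call above is worth noting: it pre-combines each key's inputs across intermediate shards before the final per-key combine, which mitigates hot keys. A minimal standalone use (the fanout of 16 is an arbitrary choice):

import apache_beam as beam

with beam.Pipeline() as p:
  (p
   | beam.Create([('hot', i) for i in range(1000)])
   | beam.CombinePerKey(sum).with_hot_key_fanout(16)
   | beam.Map(print))  # ('hot', 499500)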
Example #15
 def expand(self, p):
     return (p
             | 'extract_field' >> beam.Map(lambda x:
                                           (vars(x)[self.field], x.score))
             | beam.CombinePerKey(sum))
Example #16
@beam.ptransform_fn
def count_decorated_with_side_input_fn(input_or_inputs, factor=1):
    """Count as a decorated function with a side input."""
    return (input_or_inputs
            | 'pair_with_one' >> beam.Map(lambda x: (x, factor))
            | beam.CombinePerKey(sum))
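With @beam.ptransform_fn, the extra argument becomes a transform parameter, so applying this would look like pcoll | count_decorated_with_side_input_fn(factor=2) (a usage sketch; the surrounding pipeline is assumed).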
    def testTjuDicriminationMetricsWithWeights(self, metric, expected_value):
        computations = metric.computations()
        shared_metrics = computations[0]
        metric = computations[1]

        # Positive labels: 1.0 * 0.0 + 2.0 * 1.0 + 3.0 * 1.0 + 4.0 * 0.0 = 5.0
        # Negative labels: 1.0 * 1.0 + 2.0 * 0.0 + 3.0 * 0.0 + 4.0 * 1.0 = 5.0
        # Positive predictions: 1.0 * 0.0 * 0.8 + 2.0 * 1.0 * 0.3 + 3.0 * 1.0 * 0.9
        #                       + 4.0 * 0.0 * 0.2 = 3.3
        # Negative predictions: 1.0 * 1.0 * 0.8 + 2.0 * 0.0 * 0.7 + 3.0 * 0.0 * 0.1
        #                       + 4.0 * 1.0 * 0.2 = 1.6
        example1 = {
            'labels': np.array([0.0]),
            'predictions': np.array([0.8]),
            'example_weights': np.array([1.0]),
        }
        example2 = {
            'labels': np.array([1.0]),
            'predictions': np.array([0.3]),
            'example_weights': np.array([2.0]),
        }
        example3 = {
            'labels': np.array([1.0]),
            'predictions': np.array([0.9]),
            'example_weights': np.array([3.0]),
        }
        example4 = {
            'labels': np.array([0.0]),
            'predictions': np.array([0.2]),
            'example_weights': np.array([4.0]),
        }

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            result = (
                pipeline
                | 'Create' >> beam.Create(
                    [example1, example2, example3, example4])
                | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
                | 'AddSlice' >> beam.Map(lambda x: ((), x))
                | 'ComputeWeightedTotals' >> beam.CombinePerKey(
                    shared_metrics.combiner)
                | 'ComputeMetric' >> beam.Map(lambda x:
                                              (x[0], metric.result(x[1]))))

            # pylint: enable=no-value-for-parameter

            def check_result(got):
                try:
                    self.assertLen(got, 1)
                    got_slice_key, got_metrics = got[0]
                    self.assertEqual(got_slice_key, ())
                    key = metric.keys[0]
                    self.assertDictElementsAlmostEqual(got_metrics,
                                                       {key: expected_value},
                                                       places=5)

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(result, check_result, label='result')
Example #18
    def testCalibrationPlot(self):
        computations = calibration_plot.CalibrationPlot(
            num_buckets=10).computations()
        histogram = computations[0]
        plot = computations[1]

        example1 = {
            'labels': np.array([0.0]),
            'predictions': np.array([0.2]),
            'example_weights': np.array([1.0])
        }
        example2 = {
            'labels': np.array([1.0]),
            'predictions': np.array([0.8]),
            'example_weights': np.array([2.0])
        }
        example3 = {
            'labels': np.array([0.0]),
            'predictions': np.array([0.5]),
            'example_weights': np.array([3.0])
        }
        example4 = {
            'labels': np.array([1.0]),
            'predictions': np.array([-0.1]),
            'example_weights': np.array([4.0])
        }
        example5 = {
            'labels': np.array([1.0]),
            'predictions': np.array([0.5]),
            'example_weights': np.array([5.0])
        }
        example6 = {
            'labels': np.array([1.0]),
            'predictions': np.array([0.8]),
            'example_weights': np.array([6.0])
        }
        example7 = {
            'labels': np.array([0.0]),
            'predictions': np.array([0.2]),
            'example_weights': np.array([7.0])
        }
        example8 = {
            'labels': np.array([1.0]),
            'predictions': np.array([1.1]),
            'example_weights': np.array([8.0])
        }

        with beam.Pipeline() as pipeline:
            # pylint: disable=no-value-for-parameter
            result = (
                pipeline
                | 'Create' >> beam.Create([
                    example1, example2, example3, example4, example5, example6,
                    example7, example8
                ])
                | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
                | 'AddSlice' >> beam.Map(lambda x: ((), x))
                | 'ComputeHistogram' >> beam.CombinePerKey(histogram.combiner)
                |
                'ComputePlot' >> beam.Map(lambda x: (x[0], plot.result(x[1]))))

            # pylint: enable=no-value-for-parameter

            def check_result(got):
                try:
                    self.assertLen(got, 1)
                    got_slice_key, got_plots = got[0]
                    self.assertEqual(got_slice_key, ())
                    self.assertLen(got_plots, 1)
                    key = metric_types.PlotKey(name='calibration_plot')
                    self.assertIn(key, got_plots)
                    got_plot = got_plots[key]
                    self.assertProtoEquals(
                        """
              buckets {
                lower_threshold_inclusive: -inf
                upper_threshold_exclusive: 0.0
                total_weighted_label {
                  value: 4.0
                }
                total_weighted_refined_prediction {
                  value: -0.4
                }
                num_weighted_examples {
                  value: 4.0
                }
              }
              buckets {
                lower_threshold_inclusive: 0.0
                upper_threshold_exclusive: 0.1
                total_weighted_label {
                }
                total_weighted_refined_prediction {
                }
                num_weighted_examples {
                }
              }
              buckets {
                lower_threshold_inclusive: 0.1
                upper_threshold_exclusive: 0.2
                total_weighted_label {
                }
                total_weighted_refined_prediction {
                }
                num_weighted_examples {
                }
              }
              buckets {
                lower_threshold_inclusive: 0.2
                upper_threshold_exclusive: 0.3
                total_weighted_label {
                }
                total_weighted_refined_prediction {
                  value: 1.6
                }
                num_weighted_examples {
                  value: 8.0
                }
              }
              buckets {
                lower_threshold_inclusive: 0.3
                upper_threshold_exclusive: 0.4
                total_weighted_label {
                }
                total_weighted_refined_prediction {
                }
                num_weighted_examples {
                }
              }
              buckets {
                lower_threshold_inclusive: 0.4
                upper_threshold_exclusive: 0.5
                total_weighted_label {
                }
                total_weighted_refined_prediction {
                }
                num_weighted_examples {
                }
              }
              buckets {
                lower_threshold_inclusive: 0.5
                upper_threshold_exclusive: 0.6
                total_weighted_label {
                  value: 5.0
                }
                total_weighted_refined_prediction {
                  value: 4.0
                }
                num_weighted_examples {
                  value: 8.0
                }
              }
              buckets {
                lower_threshold_inclusive: 0.6
                upper_threshold_exclusive: 0.7
                total_weighted_label {
                }
                total_weighted_refined_prediction {
                }
                num_weighted_examples {
                }
              }
              buckets {
                lower_threshold_inclusive: 0.7
                upper_threshold_exclusive: 0.8
                total_weighted_label {
                }
                total_weighted_refined_prediction {
                }
                num_weighted_examples {
                }
              }
              buckets {
                lower_threshold_inclusive: 0.8
                upper_threshold_exclusive: 0.9
                total_weighted_label {
                  value: 8.0
                }
                total_weighted_refined_prediction {
                  value: 6.4
                }
                num_weighted_examples {
                  value: 8.0
                }
              }
              buckets {
                lower_threshold_inclusive: 0.9
                upper_threshold_exclusive: 1.0
                total_weighted_label {
                }
                total_weighted_refined_prediction {
                }
                num_weighted_examples {
                }
              }
              buckets {
                lower_threshold_inclusive: 1.0
                upper_threshold_exclusive: inf
                total_weighted_label {
                  value: 8.0
                }
                total_weighted_refined_prediction {
                  value: 8.8
                }
                num_weighted_examples {
                  value: 8.0
                }
              }
          """, got_plot)

                except AssertionError as err:
                    raise util.BeamAssertException(err)

            util.assert_that(result, check_result, label='result')
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Find the most used Java packages')
    parser.add_argument('--output_prefix',
                        default='/tmp/output',
                        help='Output prefix')
    parser.add_argument(
        '--input',
        default=
        '../javahelp/src/main/java/com/google/cloud/training/dataanalyst/javahelp/',
        help='Input directory')

    options, pipeline_args = parser.parse_known_args()
    p = beam.Pipeline(argv=pipeline_args)

    input = '{0}*.java'.format(options.input)
    output_prefix = options.output_prefix
    keyword = 'import'

    # find most used packages
    (p
     | 'GetJava' >> beam.io.ReadFromText(input)
     | 'GetImports' >> beam.FlatMap(lambda line: startsWith(line, keyword))
     | 'PackageUse' >> beam.FlatMap(lambda line: packageUse(line, keyword))
     | 'TotalUse' >> beam.CombinePerKey(sum)
     | 'Top_5' >> beam.transforms.combiners.Top.Of(5, by_value)
     | 'write' >> beam.io.WriteToText(output_prefix))

    p.run().wait_until_finish()
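startsWith, packageUse and by_value are helpers defined elsewhere in this snippet's source; hypothetical stand-ins consistent with their call sites (the exact import-line parsing is an assumption):

def startsWith(line, term):
  # FlatMap helper: keep only lines beginning with the search term.
  if line.strip().startswith(term):
    yield line.strip()

def packageUse(line, keyword):
  # FlatMap helper: pair the imported package with a count of 1,
  # e.g. 'import com.example.Foo;' -> ('com.example.Foo', 1).
  package = line[len(keyword):].strip().rstrip(';')
  if package:
    yield package, 1

def by_value(kv1, kv2):
  # Comparator for Top.Of: rank (package, count) pairs by count.
  return kv1[1] < kv2[1]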
Example #20
 def test_combine_per_key(self):
   with self.create_pipeline() as p:
     res = (p
            | beam.Create([('a', 1), ('a', 2), ('b', 3)])
            | beam.CombinePerKey(beam.combiners.MeanCombineFn()))
     assert_that(res, equal_to([('a', 1.5), ('b', 3.0)]))
  def testConfusionMatrixMetrics(self, metric, expected_value):
    computations = metric.computations()
    histogram = computations[0]
    matrices = computations[1]
    metrics = computations[2]

    # tp = 1
    # tn = 2
    # fp = 3
    # fn = 4
    example1 = {
        'labels': np.array([1.0]),
        'predictions': np.array([0.6]),
        'example_weights': np.array([1.0]),
    }
    example2 = {
        'labels': np.array([0.0]),
        'predictions': np.array([0.3]),
        'example_weights': np.array([1.0]),
    }
    example3 = {
        'labels': np.array([0.0]),
        'predictions': np.array([0.2]),
        'example_weights': np.array([1.0]),
    }
    example4 = {
        'labels': np.array([0.0]),
        'predictions': np.array([0.6]),
        'example_weights': np.array([1.0]),
    }
    example5 = {
        'labels': np.array([0.0]),
        'predictions': np.array([0.7]),
        'example_weights': np.array([1.0]),
    }
    example6 = {
        'labels': np.array([0.0]),
        'predictions': np.array([0.8]),
        'example_weights': np.array([1.0]),
    }
    example7 = {
        'labels': np.array([1.0]),
        'predictions': np.array([0.1]),
        'example_weights': np.array([1.0]),
    }
    example8 = {
        'labels': np.array([1.0]),
        'predictions': np.array([0.2]),
        'example_weights': np.array([1.0]),
    }
    example9 = {
        'labels': np.array([1.0]),
        'predictions': np.array([0.3]),
        'example_weights': np.array([1.0]),
    }
    example10 = {
        'labels': np.array([1.0]),
        'predictions': np.array([0.4]),
        'example_weights': np.array([1.0]),
    }

    with beam.Pipeline() as pipeline:
      # pylint: disable=no-value-for-parameter
      result = (
          pipeline
          | 'Create' >> beam.Create([
              example1, example2, example3, example4, example5, example6,
              example7, example8, example9, example10
          ])
          | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
          | 'AddSlice' >> beam.Map(lambda x: ((), x))
          | 'ComputeHistogram' >> beam.CombinePerKey(histogram.combiner)
          | 'ComputeMatrices' >> beam.Map(
              lambda x: (x[0], matrices.result(x[1])))  # pyformat: ignore
          | 'ComputeMetrics' >> beam.Map(lambda x: (x[0], metrics.result(x[1])))
      )  # pyformat: ignore

      # pylint: enable=no-value-for-parameter

      def check_result(got):
        try:
          self.assertLen(got, 1)
          got_slice_key, got_metrics = got[0]
          self.assertEqual(got_slice_key, ())
          self.assertLen(got_metrics, 1)
          key = metrics.keys[0]
          self.assertDictElementsAlmostEqual(
              got_metrics, {key: expected_value}, places=5)
        except AssertionError as err:
          raise util.BeamAssertException(err)

      util.assert_that(result, check_result, label='result')
Example #22
def run(argv=None):
    """Main entry point; defines and runs the pipeline."""
    logging.info("Starting pipeline.")

    parser = argparse.ArgumentParser()
    parser.add_argument('--usps_key',
                        dest='usps_key',
                        default=None,
                        help='USPS API key')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend(
        ['--project=voterdb-test', '--job_name=voter-pipeline'])

    #if not known_args.usps_key:
    #    raise Exception("Provide USPS API key.")

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=pipeline_options) as p:

        # TODO: Select rather than hard-code bucket/file name
        raw = (
            p
            | "AllNYSVoters_2017-03-27.csv" >>
            beam.io.ReadFromText("gs://upload-raw/AllNYSVoters_2018-03-13.csv")
            | "DictFromRawLine" >> beam.ParDo(DictFromRawLine()))

        elections = (p
                     | "Voter.ElectionCodes" >> beam.io.Read(
                         beam.io.BigQuerySource(table='Voter.ElectionCodes',
                                                validate=True))
                     | "beam.Map(make_kv_pair, 'Election')" >> beam.Map(
                         make_kv_pair, 'Election'))

        counties = (
            p
            | "Voter.CountyCodes" >> beam.io.Read(
                beam.io.BigQuerySource(table='Voter.CountyCodes',
                                       validate=True))
            |
            "beam.Map(make_kv_pair, 'Code')" >> beam.Map(make_kv_pair, 'Code'))

        output = (
            raw
            | "Voter.Raw" >> beam.io.WriteToBigQuery(
                table='Voter.Raw',
                schema=RAW_VF_SCHEMA,
                write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
                create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED
            ))

        output = (
            raw
            | "key_by_county_type" >> beam.Map(key_by_county_type,
                                               beam.pvalue.AsDict(counties))
            | "beam.CombinePerKey" >> beam.CombinePerKey(sum)
            | "flatten_sum" >> beam.Map(flatten_sum)
            | "Voter.Counts" >> beam.io.WriteToBigQuery(
                table='Voter.Counts',
                schema=COUNTS_SCHEMA,
                write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
                create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED
            ))

        output = (
            raw
            # | "BatchElements" >> beam.BatchElements()
            # | "BatchRunner" >> beam.ParDo(BatchRunner(), known_args.usps_key)
            | "build_formatted" >> beam.FlatMap(build_formatted,
                                                beam.pvalue.AsDict(elections),
                                                beam.pvalue.AsDict(counties))
            | "Voter.Formatted" >> beam.io.WriteToBigQuery(
                table='Voter.Formatted',
                schema=FORMATTED_SCHEMA,
                write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
                create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED
            ))
def needs_help(pcoll):
    return (pcoll
            | 'PackageHelp' >> beam.FlatMap(
                lambda rowdict: packageHelp(rowdict['content'], 'package'))
            | 'TotalHelp' >> beam.CombinePerKey(sum)
            | 'DropZero' >> beam.Filter(lambda packages: packages[1] > 0))
Example #24
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.

# beam-playground:
#   name: CombinePerKey
#   description: Task from katas to implement the summation of scores per player.
#   multifile: false
#   pipeline_options:
#   categories:
#     - Combiners

import apache_beam as beam

from log_elements import LogElements

PLAYER_1 = 'Player 1'
PLAYER_2 = 'Player 2'
PLAYER_3 = 'Player 3'

with beam.Pipeline() as p:

  (p | beam.Create([(PLAYER_1, 15), (PLAYER_2, 10), (PLAYER_1, 100),
                    (PLAYER_3, 25), (PLAYER_2, 75)])
     | beam.CombinePerKey(sum)
     | LogElements())
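With these inputs, CombinePerKey(sum) emits one total per player, so LogElements logs ('Player 1', 115), ('Player 2', 85) and ('Player 3', 25), in no guaranteed order.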

  def testFlipCount(self):
    computations = flip_count.FlipCount(
        thresholds=[0.3],
        counterfactual_prediction_key='counterfactual_pred_key',
        example_id_key='example_id_key').computations(example_weighted=True)
    binary_confusion_matrix = computations[0]
    matrices = computations[1]
    metrics = computations[2]
    # TODO(b/171180441): Handle absence of ground truth labels in counterfactual
    # examples while computing flip count metrics.
    examples = [
        {
            'labels': None,
            'predictions': np.array([0.5]),
            'example_weights': np.array([1.0]),
            'features': {
                'counterfactual_pred_key': np.array([0.7]),
                'example_id_key': np.array(['id_1']),
            },
        },
        {
            'labels': None,
            'predictions': np.array([0.1, 0.7]),  # to test flattening
            'example_weights': np.array([3.0]),
            'features': {
                'counterfactual_pred_key': np.array([1.0, 0.1]),
                'example_id_key': np.array(['id_2']),
            },
        },
        {
            'labels': None,
            'predictions': np.array([0.5, 0.2]),
            'example_weights': np.array([2.0]),
            'features': {
                'counterfactual_pred_key': np.array([0.2, 0.4]),
                'example_id_key': np.array(['id_3']),
            },
        },
        {
            'labels': None,
            'predictions': np.array([0.2, 0.1]),
            'example_weights': np.array([1.0]),
            'features': {
                'counterfactual_pred_key': np.array([0.4, 0.5]),
                'example_id_key': np.array(['id_4']),
            },
        }
    ]

    with beam.Pipeline() as pipeline:
      # pylint: disable=no-value-for-parameter
      result = (
          pipeline
          | 'Create' >> beam.Create(examples)
          | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs, True)
          | 'AddSlice' >> beam.Map(lambda x: ((), x))
          | 'ComputeBinaryConfusionMatrix' >> beam.CombinePerKey(
              binary_confusion_matrix.combiner)
          | 'ComputeMatrices' >> beam.Map(
              lambda x: (x[0], matrices.result(x[1])))  # pyformat: ignore
          | 'ComputeMetrics' >> beam.Map(
              lambda x: (x[0], metrics.result(x[1]))))

      # pylint: enable=no-value-for-parameter

      def check_result(got):
        try:
          self.assertLen(got, 1)
          got_slice_key, got_metrics = got[0]
          self.assertEqual(got_slice_key, ())
          self.assertLen(got_metrics, 6)
          self.assertDictElementsAlmostEqual(
              got_metrics, {
                  metric_types.MetricKey(
                      name='flip_count/positive_to_negative@0.3',
                      example_weighted=True):
                      5.0,
                  metric_types.MetricKey(
                      name='flip_count/negative_to_positive@0.3',
                      example_weighted=True):
                      7.0,
                  metric_types.MetricKey(
                      name='flip_count/positive_examples_count@0.3',
                      example_weighted=True):
                      6.0,
                  metric_types.MetricKey(
                      name='flip_count/negative_examples_count@0.3',
                      example_weighted=True):
                      7.0,
              })
          self.assertAllEqual(
              got_metrics[metric_types.MetricKey(
                  name='flip_count/positive_to_negative_examples_ids@0.3',
                  example_weighted=True)], np.array([['id_2'], ['id_3']]))
          self.assertAllEqual(
              got_metrics[metric_types.MetricKey(
                  name='flip_count/negative_to_positive_examples_ids@0.3',
                  example_weighted=True)],
              np.array([['id_2'], ['id_3'], ['id_4']]))
        except AssertionError as err:
          raise util.BeamAssertException(err)

      util.assert_that(result, check_result, label='result')
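
The 'AddSlice' step keys every element by the empty tuple, so each following
CombinePerKey effectively computes a single global (unsliced) aggregate. A
minimal sketch of that trick:

import apache_beam as beam
from apache_beam.testing.util import assert_that, equal_to

with beam.Pipeline() as p:
  total = (p
           | beam.Create([1, 2, 3])
           | 'AddSlice' >> beam.Map(lambda x: ((), x))
           | beam.CombinePerKey(sum))
  assert_that(total, equal_to([((), 6)]))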
Example #26

# options is assumed to be a PipelineOptions instance defined earlier in the
# source file, as are the selectData and filtering callables used below.
pipeline1 = beam.Pipeline(options=options)

table_spec = 'crazy-hippo-01:department_dataflow.group_by_name'

table_schema = 'name:STRING, count:INTEGER'

#with beam.Pipeline() as pipeline1:
dep_data_count = (
    pipeline1
    | 'Read from file' >> beam.io.ReadFromText(
        'gs://crazy-hippo-01/dataflow_beam_data/dept-data.txt')
    | 'Select_data' >> beam.Map(selectData)
    | 'Filter record on Accounts' >> beam.Filter(filtering)
    | 'Create Dict of Records' >> beam.Map(lambda record: (record[1], 1))
    | 'Apply CombinePerKey on Records' >> beam.CombinePerKey(sum)
    | 'Make into Dict' >> beam.Map(lambda x: {
        "name": x[0],
        "count": x[1]
    })
    #|'Write to Cloud Storage' >> beam.io.WriteToText('gs://crazy-hippo-01/dataflow_beam_data/output_new')
    | 'Write to BQ' >> beam.io.WriteToBigQuery(
        table_spec,
        schema=table_schema,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED))

pipeline1.run().wait_until_finish()
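
Because the pipeline is not used as a context manager, the explicit
run().wait_until_finish() call is required; the commented-out "with" form
above is equivalent. A sketch:

import apache_beam as beam

# run() and wait_until_finish() are invoked automatically when the
# with-block exits, so no explicit call is needed.
with beam.Pipeline() as p:
    _ = (p
         | beam.Create([('a', 1), ('a', 2)])
         | beam.CombinePerKey(sum)
         | beam.Map(print))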
Example #27
  def testMultiClassConfusionMatrixPlot(self):
    computations = (
        multi_class_confusion_matrix_plot.MultiClassConfusionMatrixPlot()
        .computations(example_weighted=True))
    matrices = computations[0]
    plot = computations[1]

    example1 = {
        'labels': np.array([2.0]),
        'predictions': np.array([0.2, 0.3, 0.5]),
        'example_weights': np.array([0.5])
    }
    example2 = {
        'labels': np.array([0.0]),
        'predictions': np.array([0.1, 0.4, 0.5]),
        'example_weights': np.array([1.0])
    }
    example3 = {
        'labels': np.array([1.0]),
        'predictions': np.array([0.3, 0.2, 0.5]),
        'example_weights': np.array([0.25])
    }
    example4 = {
        'labels': np.array([1.0]),
        'predictions': np.array([0.1, 0.9, 0.0]),
        'example_weights': np.array([1.0])
    }
    example5 = {
        'labels': np.array([1.0]),
        'predictions': np.array([0.1, 0.8, 0.1]),
        'example_weights': np.array([1.0])
    }
    example6 = {
        'labels': np.array([2.0]),
        'predictions': np.array([0.3, 0.2, 0.5]),
        'example_weights': np.array([1.0])
    }

    with beam.Pipeline() as pipeline:
      # pylint: disable=no-value-for-parameter
      result = (
          pipeline
          | 'Create' >> beam.Create(
              [example1, example2, example3, example4, example5, example6])
          | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
          | 'AddSlice' >> beam.Map(lambda x: ((), x))
          | 'ComputeMatrices' >> beam.CombinePerKey(matrices.combiner)
          | 'ComputePlot' >> beam.Map(lambda x: (x[0], plot.result(x[1]))))

      # pylint: enable=no-value-for-parameter

      def check_result(got):
        try:
          self.assertLen(got, 1)
          got_slice_key, got_plots = got[0]
          self.assertEqual(got_slice_key, ())
          self.assertLen(got_plots, 1)
          key = metric_types.PlotKey(
              name='multi_class_confusion_matrix_plot', example_weighted=True)
          got_matrix = got_plots[key]
          self.assertProtoEquals(
              """
              matrices {
                threshold: 0.0
                entries {
                  actual_class_id: 0
                  predicted_class_id: 2
                  num_weighted_examples: 1.0
                }
                entries {
                  actual_class_id: 1
                  predicted_class_id: 1
                  num_weighted_examples: 2.0
                }
                entries {
                  actual_class_id: 1
                  predicted_class_id: 2
                  num_weighted_examples: 0.25
                }
                entries {
                  actual_class_id: 2
                  predicted_class_id: 2
                  num_weighted_examples: 1.5
                }
              }
          """, got_matrix)

        except AssertionError as err:
          raise util.BeamAssertException(err)

      util.assert_that(result, check_result, label='result')
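
Unlike the plain sum examples, this test hands CombinePerKey a full CombineFn
(matrices.combiner). A hedged sketch of that shape with a simple hand-written
mean combiner; MeanFn is invented here for illustration:

import apache_beam as beam
from apache_beam.testing.util import assert_that, equal_to

class MeanFn(beam.CombineFn):
  def create_accumulator(self):
    return (0.0, 0)  # (running total, count)

  def add_input(self, accumulator, value):
    total, count = accumulator
    return total + value, count + 1

  def merge_accumulators(self, accumulators):
    totals, counts = zip(*accumulators)
    return sum(totals), sum(counts)

  def extract_output(self, accumulator):
    total, count = accumulator
    return total / count if count else float('nan')

with beam.Pipeline() as p:
  means = (p
           | beam.Create([('a', 1.0), ('a', 3.0), ('b', 5.0)])
           | beam.CombinePerKey(MeanFn()))
  assert_that(means, equal_to([('a', 2.0), ('b', 5.0)]))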
Example #28
def expand(self, pcoll):
    return (pcoll
            | 'PairWithOne' >> beam.Map(lambda v: (v, 1))
            | beam.CombinePerKey(sum))
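
This expand is the classic counting composite: pair each element with 1, then
sum per key. Assuming the goal is element counting, Beam's built-in
Count.PerElement is equivalent; a sketch:

import apache_beam as beam
from apache_beam.testing.util import assert_that, equal_to

with beam.Pipeline() as p:
  counts = (p
            | beam.Create(['a', 'b', 'a'])
            | beam.combiners.Count.PerElement())
  assert_that(counts, equal_to([('a', 2), ('b', 1)]))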
Example #29
def run(argv=None):
    """Main entry point; defines and runs the wordcount pipeline."""

    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        default='gs://lswa-scalica/input/df_input.txt',
                        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        default='gs://lswa-scalica/output/df_output.txt',
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        '--runner=DataflowRunner',
        '--project=scalica-224416',
        '--staging_location=gs://lswa-scalica/staging',
        '--temp_location=gs://lswa-scalica/tmp',
        '--job_name=scalica-job',
    ])

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=pipeline_options) as p:
        # format input into a dictionary
        def format_input(line):
            split_line = line.split(',')
            user_id = split_line[0]
            followees = split_line[1].split('-')
            followees = [int(followee) for followee in followees if followee]
            followers = split_line[2].split('-')
            followers = [int(follower) for follower in followers if follower]

            return {
                'user_id': user_id,
                'followees': followees,
                'followers': followers
            }

        # split followees and followers into list of pairs
        def split(user_data):
            follower_pairs = []

            for followee in user_data['followees']:
                for follower in user_data['followers']:
                    if followee != follower:
                        follower_pair = str(followee) + ',' + str(follower)
                        follower_pairs.append(follower_pair)

            return follower_pairs

        # emit a count for each follower_pair
        def map_count(follower_pair):
            return (follower_pair, 1)

        # format each follower pair + counter
        def format_result(map_pair):
            (follower_pair, count) = map_pair
            return '%s: %s' % (follower_pair, count)

        logging.info('reading from input')

        # Read the input file
        lines = p | ReadFromText(known_args.input)

        suggestions = (lines
                       | 'FormatInput' >> beam.Map(format_input)
                       | 'Split' >> beam.FlatMap(split)
                       | 'MapCount' >> beam.Map(map_count)
                       | 'GroupAndSum' >> beam.CombinePerKey(sum))

        logging.info('generated suggestions')

        output = suggestions | 'Format' >> beam.Map(format_result)

        # For convenience, write a single shard; for scalability, omit
        # num_shards and let Dataflow choose the shard count.
        output | WriteToText(known_args.output, num_shards=1)
Example #30
def expand(self, pcoll):
    return (pcoll
            | beam.CombinePerKey(sum).with_output_types(
                typing.Tuple[str, int]))  # was typing.Tuple[unicode, int] (Py2)
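
with_output_types attaches the declared element type to the transform so
Beam's pipeline type checking can validate downstream steps. A minimal usage
sketch, with str standing in for the Python 2 unicode of the original:

import typing

import apache_beam as beam
from apache_beam.testing.util import assert_that, equal_to

with beam.Pipeline() as p:
  sums = (p
          | beam.Create([('a', 1), ('a', 2), ('b', 3)])
          | beam.CombinePerKey(sum).with_output_types(typing.Tuple[str, int]))
  assert_that(sums, equal_to([('a', 3), ('b', 3)]))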