def expand(self, inputs):
  pcoll, = inputs
  if self._top_k is not None and self._top_k < 0:
    raise ValueError('top_k for VocabularyImpl should be >= 0 or None, got '
                     '{}.'.format(self._top_k))
  if self._frequency_threshold is not None and self._frequency_threshold < 0:
    raise ValueError(
        'frequency_threshold for VocabularyImpl should be >= 0 or None, '
        'got {}.'.format(self._frequency_threshold))
  if self._coverage_top_k is not None and self._coverage_top_k < 0:
    raise ValueError('coverage_top_k for VocabularyImpl should be >= 0 or '
                     'None, got {}.'.format(self._coverage_top_k))
  if (self._coverage_frequency_threshold is not None and
      self._coverage_frequency_threshold < 0):
    raise ValueError(
        'coverage_frequency_threshold for VocabularyImpl should be >= 0 or '
        'None, got {}.'.format(self._coverage_frequency_threshold))

  # Create a PCollection of (count, element) pairs, then iterate over it to
  # create a single-element PCollection containing this list of pairs in
  # sorted order by decreasing count (and by value for equal counts).
  def is_not_problematic_string(kv):
    """Returns True for strings that are safe to keep in the vocabulary."""
    string, _ = kv  # Ignore counts.
    return string and b'\n' not in string and b'\r' not in string

  if (self._vocab_ordering_type ==
      tf_utils.VocabOrderingType.WEIGHTED_MUTUAL_INFORMATION):
    flatten_map_fn = _flatten_to_key_and_means_accumulator_list
    combine_transform = _MutualInformationTransform(  # pylint: disable=no-value-for-parameter
        self._use_adjusted_mutual_info, self._min_diff_from_avg)
  elif (self._vocab_ordering_type ==
        tf_utils.VocabOrderingType.WEIGHTED_FREQUENCY):
    flatten_map_fn = _flatten_value_and_weights_to_list_of_tuples
    combine_transform = beam.CombinePerKey(sum)
  else:
    flatten_map_fn = _flatten_value_to_list
    combine_transform = beam.combiners.Count.PerElement()

  raw_counts = (
      pcoll
      | 'FlattenStringsAndMaybeWeightsLabels' >> beam.FlatMap(flatten_map_fn)
      | 'CountPerString' >> combine_transform
      | 'FilterProblematicStrings' >> beam.Filter(is_not_problematic_string)
      | 'SwapStringsAndCounts' >> beam.KvSwap())

  counts = (
      raw_counts
      | 'ApplyFrequencyThresholdAndTopK' >> (
          _ApplyFrequencyThresholdAndTopK(  # pylint: disable=no-value-for-parameter
              self._frequency_threshold, self._top_k, None)))

  if self._key_fn:
    coverage_counts = (
        raw_counts
        | 'ApplyCoverageFrequencyThresholdAndTopK' >> (
            _ApplyFrequencyThresholdAndTopK(  # pylint: disable=no-value-for-parameter
                self._coverage_frequency_threshold, self._coverage_top_k,
                self._key_fn)))
    counts = (
        (counts, coverage_counts)
        | 'MergeStandardAndCoverageArms' >> beam.Flatten()
        | 'RemoveDuplicates' >> beam.RemoveDuplicates())

  return counts | 'WriteVocabFile' >> (
      _WriteVocabFile(  # pylint: disable=no-value-for-parameter
          self._base_temp_dir, self._vocab_filename, self._store_frequency))
def testMultiClassConfusionMatrixPlotWithStringLabels(self):
  computations = (
      multi_class_confusion_matrix_plot.MultiClassConfusionMatrixPlot()
      .computations(example_weighted=True))
  matrices = computations[0]
  plot = computations[1]

  # Examples from b/149558504.
  example1 = {
      'labels': np.array([['unacc']]),
      'predictions': {
          'probabilities':
              np.array([[
                  1.0000000e+00, 6.9407083e-24, 2.7419115e-38, 0.0000000e+00
              ]]),
          'all_classes': np.array([['unacc', 'acc', 'vgood', 'good']]),
      },
      'example_weights': np.array([0.5])
  }
  example2 = {
      'labels': np.array([['vgood']]),
      'predictions': {
          'probabilities': np.array([[0.2, 0.3, 0.4, 0.1]]),
          'all_classes': np.array([['unacc', 'acc', 'vgood', 'good']]),
      },
      'example_weights': np.array([1.0])
  }

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([example1, example2])
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'ComputeMatrices' >> beam.CombinePerKey(matrices.combiner)
        | 'ComputePlot' >> beam.Map(lambda x: (x[0], plot.result(x[1]))))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_plots = got[0]
        self.assertEqual(got_slice_key, ())
        self.assertLen(got_plots, 1)
        key = metric_types.PlotKey(
            name='multi_class_confusion_matrix_plot', example_weighted=True)
        got_matrix = got_plots[key]
        self.assertProtoEquals(
            """
            matrices {
              threshold: 0.0
              entries {
                actual_class_id: 0
                predicted_class_id: 0
                num_weighted_examples: 0.5
              }
              entries {
                actual_class_id: 2
                predicted_class_id: 2
                num_weighted_examples: 1.0
              }
            }
            """, got_matrix)
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')
def testQueryStatistics(self):
  metrics = query_statistics.QueryStatistics().computations(
      query_key='query')[0]

  query1_example1 = {
      'labels': np.array([1.0]),
      'predictions': np.array([0.2]),
      'example_weights': np.array([1.0]),
      'features': {
          'query': np.array(['query1']),
          'gain': np.array([1.0])
      }
  }
  query1_example2 = {
      'labels': np.array([0.0]),
      'predictions': np.array([0.8]),
      'example_weights': np.array([1.0]),
      'features': {
          'query': np.array(['query1']),
          'gain': np.array([0.5])
      }
  }
  query2_example1 = {
      'labels': np.array([0.0]),
      'predictions': np.array([0.5]),
      'example_weights': np.array([2.0]),
      'features': {
          'query': np.array(['query2']),
          'gain': np.array([0.5])
      }
  }
  query2_example2 = {
      'labels': np.array([1.0]),
      'predictions': np.array([0.9]),
      'example_weights': np.array([2.0]),
      'features': {
          'query': np.array(['query2']),
          'gain': np.array([1.0])
      }
  }
  query2_example3 = {
      'labels': np.array([0.0]),
      'predictions': np.array([0.1]),
      'example_weights': np.array([2.0]),
      'features': {
          'query': np.array(['query2']),
          'gain': np.array([0.1])
      }
  }
  query3_example1 = {
      'labels': np.array([1.0]),
      'predictions': np.array([0.9]),
      'example_weights': np.array([3.0]),
      'features': {
          'query': np.array(['query3']),
          'gain': np.array([1.0])
      }
  }
  examples = [
      tfma_util.merge_extracts([query1_example1, query1_example2]),
      tfma_util.merge_extracts(
          [query2_example1, query2_example2, query2_example3]),
      tfma_util.merge_extracts([query3_example1])
  ]

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (pipeline
              | 'Create' >> beam.Create(examples)
              | 'Process' >> beam.Map(
                  metric_util.to_standard_metric_inputs, True)
              | 'AddSlice' >> beam.Map(lambda x: ((), x))
              | 'Combine' >> beam.CombinePerKey(metrics.combiner))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        total_queries_key = metric_types.MetricKey(name='total_queries')
        total_documents_key = metric_types.MetricKey(name='total_documents')
        min_documents_key = metric_types.MetricKey(name='min_documents')
        max_documents_key = metric_types.MetricKey(name='max_documents')
        self.assertDictElementsAlmostEqual(
            got_metrics, {
                total_queries_key: 3,
                total_documents_key: 6,
                min_documents_key: 1,
                max_documents_key: 3
            },
            places=5)
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')
def expand(self, pcoll):
  return (pcoll
          | beam.Map(lambda elem: (elem[self.field], elem['score']))
          | beam.CombinePerKey(sum))
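# A minimal usage sketch for the expand() above. It presumably lives on a
# PTransform with a `field` constructor argument; the class name
# `SumScoresByField` and the sample data here are assumptions, not from the
# source.
import apache_beam as beam


class SumScoresByField(beam.PTransform):

    def __init__(self, field):
        self.field = field

    def expand(self, pcoll):
        return (pcoll
                | beam.Map(lambda elem: (elem[self.field], elem['score']))
                | beam.CombinePerKey(sum))


with beam.Pipeline() as p:
    _ = (p
         | beam.Create([{'team': 'A', 'score': 1},
                        {'team': 'B', 'score': 2},
                        {'team': 'A', 'score': 3}])
         | SumScoresByField('team')  # yields ('A', 4) and ('B', 2)
         | beam.Map(print))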
def testFlipCountWithEvalConfig(self):
  eval_config = text_format.Parse(
      """
      model_specs: { name: "original" }
      model_specs: { name: "counterfactual" is_baseline: true }
      """, config_pb2.EvalConfig())
  computations = flip_count.FlipCount(
      thresholds=[0.3], example_id_key='example_id_key').computations(
          eval_config=eval_config,
          example_weighted=True,
          model_names=['original', 'counterfactual'],
          output_names=[''])
  binary_confusion_matrix = computations[0]
  matrices = computations[1]
  metrics = computations[2]

  original_model_name = 'original'
  counterfactual_model_name = 'counterfactual'

  examples = [
      {
          'labels': None,
          'predictions': {
              original_model_name: np.array([0.5]),
              counterfactual_model_name: np.array([0.7]),
          },
          'example_weights': np.array([1.0]),
          'features': {
              'example_id_key': np.array(['id_1']),
          },
      },
      {
          'labels': None,
          'predictions': {
              original_model_name: np.array([0.1, 0.7]),  # to test flattening
              counterfactual_model_name: np.array([1.0, 0.1])
          },
          'example_weights': np.array([3.0]),
          'features': {
              'example_id_key': np.array(['id_2']),
          },
      },
      {
          'labels': None,
          'predictions': {
              original_model_name: np.array([0.5, 0.2]),
              counterfactual_model_name: np.array([0.2, 0.4]),
          },
          'example_weights': np.array([2.0]),
          'features': {
              'example_id_key': np.array(['id_3']),
          },
      },
      {
          'labels': None,
          'predictions': {
              original_model_name: np.array([0.2, 0.1]),
              counterfactual_model_name: np.array([0.4, 0.5]),
          },
          'example_weights': np.array([1.0]),
          'features': {
              'example_id_key': np.array(['id_4']),
          },
      },
  ]

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create(examples)
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs, True)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'ComputeBinaryConfusionMatrix' >> beam.CombinePerKey(
            binary_confusion_matrix.combiner)
        | 'ComputeMatrices' >> beam.Map(
            lambda x: (x[0], matrices.result(x[1])))
        | 'ComputeMetrics' >> beam.Map(lambda x: (x[0], metrics.result(x[1]))))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        self.assertLen(got_metrics, 6)
        self.assertDictElementsAlmostEqual(
            got_metrics, {
                metric_types.MetricKey(
                    name='flip_count/[email protected]',
                    model_name='original',
                    example_weighted=True): 5.0,
                metric_types.MetricKey(
                    name='flip_count/[email protected]',
                    model_name='original',
                    example_weighted=True): 7.0,
                metric_types.MetricKey(
                    name='flip_count/[email protected]',
                    model_name='original',
                    example_weighted=True): 6.0,
                metric_types.MetricKey(
                    name='flip_count/[email protected]',
                    model_name='original',
                    example_weighted=True): 7.0,
            })
        self.assertAllEqual(
            got_metrics[metric_types.MetricKey(
                name='flip_count/[email protected]',
                model_name='original',
                example_weighted=True)], np.array([['id_2'], ['id_3']]))
        self.assertAllEqual(
            got_metrics[metric_types.MetricKey(
                name='flip_count/[email protected]',
                model_name='original',
                example_weighted=True)],
            np.array([['id_2'], ['id_3'], ['id_4']]))
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')
def expand(self, pcoll):
  return (pcoll
          | beam.Map(lambda info: (info[self.field], info['score']))
          | beam.CombinePerKey(sum_ints))
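# `sum_ints` is referenced above but not defined in the snippet. A plausible
# implementation (an assumption, not the author's code) is a plain function
# that reduces an iterable of ints, which is all CombinePerKey requires of a
# callable combiner:
def sum_ints(values):
    # CombinePerKey calls this with an iterable of the values for one key.
    return sum(values)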
def testConfusionMatrixAtThresholds(self):
  computations = confusion_matrix_metrics.ConfusionMatrixAtThresholds(
      thresholds=[0.3, 0.5, 0.8]).computations()
  histogram = computations[0]
  matrices = computations[1]
  metrics = computations[2]

  example1 = {
      'labels': np.array([0.0]),
      'predictions': np.array([0.0]),
      'example_weights': np.array([1.0]),
  }
  example2 = {
      'labels': np.array([0.0]),
      'predictions': np.array([0.5]),
      'example_weights': np.array([1.0]),
  }
  example3 = {
      'labels': np.array([1.0]),
      'predictions': np.array([0.3]),
      'example_weights': np.array([1.0]),
  }
  example4 = {
      'labels': np.array([1.0]),
      'predictions': np.array([0.9]),
      'example_weights': np.array([1.0]),
  }

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([example1, example2, example3, example4])
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'ComputeHistogram' >> beam.CombinePerKey(histogram.combiner)
        | 'ComputeMatrices' >> beam.Map(
            lambda x: (x[0], matrices.result(x[1])))  # pyformat: ignore
        | 'ComputeMetrics' >> beam.Map(
            lambda x: (x[0], metrics.result(x[1]))))  # pyformat: ignore
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        self.assertLen(got_metrics, 1)
        key = metric_types.MetricKey(name='confusion_matrix_at_thresholds')
        self.assertIn(key, got_metrics)
        got_metric = got_metrics[key]
        self.assertProtoEquals(
            """
            matrices {
              threshold: 0.3
              false_negatives: 1.0
              true_negatives: 1.0
              false_positives: 1.0
              true_positives: 1.0
              precision: 0.5
              recall: 0.5
            }
            matrices {
              threshold: 0.5
              false_negatives: 1.0
              true_negatives: 2.0
              true_positives: 1.0
              precision: 1.0
              recall: 0.5
            }
            matrices {
              threshold: 0.8
              false_negatives: 1.0
              true_negatives: 2.0
              true_positives: 1.0
              precision: 1.0
              recall: 0.5
            }
            """, got_metric)
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')
def expand(self, input_or_inputs):
  return (input_or_inputs
          | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
          | beam.CombinePerKey(sum))
def count_decorated_fn(input_or_inputs):
  """Count as a decorated function."""
  return (input_or_inputs
          | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
          | beam.CombinePerKey(sum))
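# The name suggests the function above is decorated with beam.ptransform_fn,
# which turns a plain function into an applicable PTransform. A minimal
# self-contained sketch of that pattern (the decorator and sample data are
# assumptions, not shown in the snippet above):
import apache_beam as beam


@beam.ptransform_fn
def Count(pcoll):
    """Counts occurrences of each element in the input PCollection."""
    return (pcoll
            | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
            | beam.CombinePerKey(sum))


with beam.Pipeline() as p:
    _ = (p
         | beam.Create(['a', 'b', 'a'])
         | 'CountElements' >> Count()  # pylint: disable=no-value-for-parameter
         | beam.Map(print))            # ('a', 2), ('b', 1)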
p = beam.Pipeline()

# Input PCollection
input_collection = (
    p
    | "Read from file" >> beam.io.ReadFromText(
        'structure_of_beam_apps/dept-data.txt')
    | "Split rows" >> beam.Map(SplitRow))

# Accounts dept pipeline branch
accounts_count = (
    input_collection
    | "Get all Accounts dept persons" >> beam.Filter(
        lambda record: record[3] == 'Accounts')
    | "Pair each accounts employee name with 1" >> beam.Map(
        lambda record: (record[1], 1))
    | "Group and sum Accounts" >> beam.CombinePerKey(sum)
    | "Write results for accounts" >> beam.io.WriteToText(
        'structure_of_beam_apps/data/Account'))

# HR dept pipeline branch
hr_count = (
    input_collection
    | "Get all HR dept persons" >> beam.Filter(
        lambda record: record[3] == 'HR')
    | "Pair each hr employee name with 1" >> beam.Map(
        lambda record: (record[1], 1))
    | "Group and sum HR" >> beam.CombinePerKey(sum)
    | "Write results for HR" >> beam.io.WriteToText(
        'structure_of_beam_apps/data/HR'))

# Run pipeline
p.run().wait_until_finish()
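# `SplitRow` is referenced above but not defined in the snippet. For
# comma-separated rows where record[1] is the employee name and record[3]
# the department (matching the indexing used in both branches), a plausible
# implementation (an assumption) is:
def SplitRow(line):
    # Turn each CSV line into a list of fields.
    return line.split(',')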
def run(argv=None, save_main_session=True):
  parser = argparse.ArgumentParser()
  parser.add_argument('--target_month_start', type=int, required=True,
                      help='Start date of the expiration target month')
  parser.add_argument('--target_month_end', type=int, required=True,
                      help='End date of the expiration target month')
  parser.add_argument('--user_logs_start', type=int, required=True,
                      help='Start date for filtering users by activity')
  parser.add_argument('--dataset', type=str, required=True,
                      help='Real or fake data')
  parser.add_argument('--credentials', type=str, required=True,
                      help='Path to service account JSON')
  args, pipeline_args = parser.parse_known_args(argv)
  options = PipelineOptions(pipeline_args)

  # Set GCP credentials
  os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = args.credentials
  # e.g. "C:\\Users\\Owner\\Google Drive\\Northeastern\\DS 5500 Capstone\\ds5500-capstone\\amiable-octane-267022-061a6f297eeb.json"

  google_cloud_options = options.view_as(GoogleCloudOptions)
  gcs_project = google_cloud_options.project
  google_cloud_options.region = "us-east1"  # TODO update as parameter
  google_cloud_options.staging_location = 'gs://pipeline_beam/temp'  # 'gs://arr-beam-test/temp'
  google_cloud_options.temp_location = 'gs://pipeline_beam/temp'  # 'gs://arr-beam-test/temp'

  # Look into not using public IPs:
  # https://cloud.google.com/dataflow/docs/guides/specifying-exec-params
  # https://cloud.google.com/dataflow/docs/guides/specifying-networks#public_ip_parameter
  # The no_use_public_ips parameter requires the Beam SDK for Python; the
  # Dataflow SDK for Python does not support it.

  # We also require the --project option to access --dataset.
  if google_cloud_options.project is None:
    parser.print_usage()
    print(sys.argv[0] + ': error: argument --project is required')
    sys.exit(1)

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  options.view_as(SetupOptions).save_main_session = save_main_session

  with beam.Pipeline(options=options) as p:
    ## Find MSNOs with an expiration in the target month.
    target_month_start = args.target_month_start
    target_month_end = args.target_month_end
    user_logs_start = args.user_logs_start

    ## Table params; fake dataset for testing purposes.
    if args.dataset == "fake":
      project_input_dataset = gcs_project + ":beam."
      project_input_dataset_standard = gcs_project + ".beam."
      user_logs_tbl = "user_logs_fake"
      users_tbl = "users_fake"
      transactions_tbl = "transactions_fake"
    else:
      project_input_dataset = gcs_project + ":kkbox."
      project_input_dataset_standard = gcs_project + ".kkbox."
      user_logs_tbl = "user_logs"
      users_tbl = "members"
      transactions_tbl = "transactions"

    project_output_dataset = gcs_project + ":kkbox."
    name_suffix = str(int(datetime.now().timestamp()))
    features_output_train_tbl = project_output_dataset + "output_train_" + name_suffix
    features_output_val_tbl = project_output_dataset + "output_val_" + name_suffix
    features_output_test_tbl = project_output_dataset + "output_test_" + name_suffix

    # Read users from BQ
    users = (
        p
        | 'Query Users table in BQ' >> beam.io.Read(
            beam.io.BigQuerySource(table=project_input_dataset + users_tbl))
        | beam.ParDo(ParseUsersBQ(target_month_start)))

    # Read valid user transactions from BQ
    query_params = {
        'users_tbl': project_input_dataset_standard + users_tbl,
        'transactions_tbl': project_input_dataset_standard + transactions_tbl,
        'user_logs_tbl': project_input_dataset_standard + user_logs_tbl,
        'target_month_start': target_month_start,
        'target_month_end': target_month_end,
        'user_logs_start': user_logs_start
    }

    # Filter to include only transactions of users who had expirations in the
    # target month.
    transactions = (
        p
        | 'Query valid user transactions' >> beam.io.Read(
            beam.io.BigQuerySource(
                query="""#standardSQL
                SELECT t.transaction_date, t.membership_expire_date, t.msno,
                       t.payment_method_id, payment_plan_days,
                       t.plan_list_price, t.actual_amount_paid,
                       t.is_auto_renew, t.is_cancel
                from `{transactions_tbl}` t
                where t.msno in (
                    SELECT msno from `{transactions_tbl}`
                    where membership_expire_date >= {target_month_start}
                    and membership_expire_date <= {target_month_end})
                """.format(**query_params),
                use_standard_sql=True))
        | beam.ParDo(ParseTransactionsBQ()))

    user_log_features = (
        p
        | 'Query user_logs' >> beam.io.Read(
            beam.io.BigQuerySource(
                query="""#standardSQL
                SELECT msno,
                       count(msno) as user_log_entries,
                       avg(num_25) as avg_num_25,
                       avg(num_50) as avg_num_50,
                       avg(num_75) as avg_num_75,
                       avg(num_985) as avg_num_985,
                       avg(num_100) as avg_num_100,
                       avg(num_unq) as avg_num_unq,
                       sum(num_25) as sum_num_25,
                       sum(num_50) as sum_num_50,
                       sum(num_75) as sum_num_75,
                       sum(num_985) as sum_num_985,
                       sum(num_100) as sum_num_100,
                       sum(num_unq) as sum_num_unq
                from `{user_logs_tbl}`
                group by msno
                having min(CAST(date as INT64)) <= {user_logs_start}
                """.format(**query_params),
                use_standard_sql=True))
        | beam.ParDo(ParseUserLogsBQ()))

    ###
    # Generate Labels
    ###

    # TODO add these as cmd line arguments
    final_month_start = 20170301
    final_month_end = 20170331

    # Find users who did not churn based on transaction records, to later
    # assign 1 or 0.
    not_churned_users = (
        transactions
        | 'Filter Transactions in Feb AND Expiration is not Feb' >> beam.Filter(
            lambda elem: elem['transaction_date'] >= target_month_start
            and elem['transaction_date'] <= final_month_end
            and elem['membership_expire_date'] >= final_month_start
            and elem['is_cancel'] == 0)
        | 'Create key-value pair with 0 for no churn' >> beam.Map(
            lambda elem: (elem['msno'], 0)))

    ####
    ## Feature engineering
    ####

    # Always Auto-Renew: 1 if the user consistently has auto-renew on all
    # transactions, 0 otherwise.
    feature_autorenew = (
        transactions
        | beam.Map(lambda x: (x["msno"], x["is_auto_renew"]))
        | beam.CombinePerKey(min))

    # Discount amount mean
    feature_discount_mean = (
        transactions
        | beam.Map(lambda x: (x["msno"], x["discount_amount"]))
        | beam.CombinePerKey(AverageFn()))

    # Payment ID encoding
    feature_payment_id_encoded = (
        # Convert transactions into key-value pairs of
        # (msno, one-hot-encoded payment method).
        transactions
        | beam.Map(lambda x: (x["msno"], OneHotPaymentId(
            {"payment_method_id": x["payment_method_id"]})))
        | beam.CombinePerKey(SummarisePaymentId()))

    def print_fun(x):
      print(x)

    ##
    # Combine all data
    ##

    # Combine all user features with users and filter users with no data.
    all_output = (
        {'user_demo': users,
         'feature_user_log_counts': user_log_features,
         'feature_autorenew': feature_autorenew,
         'feature_discount_mean': feature_discount_mean,
         'feature_payment_id_encoded': feature_payment_id_encoded,
         'not_churned_users': not_churned_users}
        | "Combine users, labels, and features" >> beam.CoGroupByKey()
        | "Filter out empty entries" >> beam.Filter(
            lambda x: len(x[1]["user_demo"]) > 0 and
            len(x[1]["feature_user_log_counts"]) > 0 and
            len(x[1]["feature_autorenew"]) > 0 and
            len(x[1]["feature_discount_mean"]) > 0 and
            len(x[1]["feature_payment_id_encoded"]) > 0)
        | "Flatten to single dictionary for BQ" >> beam.Map(format_for_BQ))

    ## Partition into train/val/test based on a random float.
    train, val, test = (all_output | beam.Partition(train_test_split, 3))

    # Note: write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE causes
    # the function to sleep for 150 seconds to wait for the delete to finalize
    # before the write.

    # Output to BQ
    val | "Validation output" >> beam.io.WriteToBigQuery(
        table=features_output_val_tbl, schema=features_output_schema)
    train | "Training output" >> beam.io.WriteToBigQuery(
        table=features_output_train_tbl, schema=features_output_schema)
    test | "Testing output" >> beam.io.WriteToBigQuery(
        table=features_output_test_tbl, schema=features_output_schema)
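# `AverageFn` is used above but not defined in the snippet. The standard Beam
# mean CombineFn looks like this (a sketch of the likely shape, not
# necessarily the author's exact code):
import apache_beam as beam


class AverageFn(beam.CombineFn):
    """Computes the mean of the values combined per key."""

    def create_accumulator(self):
        return (0.0, 0)  # (running sum, count)

    def add_input(self, accumulator, value):
        total, count = accumulator
        return total + value, count + 1

    def merge_accumulators(self, accumulators):
        totals, counts = zip(*accumulators)
        return sum(totals), sum(counts)

    def extract_output(self, accumulator):
        total, count = accumulator
        return total / count if count else float('NaN')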
def expand(self, pcoll):
  top_k = self._spec.top_k
  frequency_threshold = self._spec.frequency_threshold
  assert top_k is None or top_k >= 0
  assert frequency_threshold is None or frequency_threshold >= 0

  # Creates a PCollection of (count, element) pairs, then iterates over
  # this to create a single-element PCollection containing this list of
  # pairs in sorted order by decreasing counts (and by values for equal
  # counts).
  counts = (
      pcoll
      | 'FlattenValueToList' >> beam.Map(_flatten_value_to_list)
      | 'CountWithinList' >>
      # Specification of with_output_types allows for combiner optimizations.
      (beam.FlatMap(lambda lst: six.iteritems(collections.Counter(lst))).
       with_output_types(KV[common.PRIMITIVE_TYPE, int]))
      | 'CountGlobally' >> beam.CombinePerKey(sum))

  counts = (
      counts
      | 'FilterProblematicStrings' >> beam.Filter(
          lambda kv: kv[0] and '\n' not in kv[0] and '\r' not in kv[0])
      | 'SwapElementsAndCounts' >> beam.KvSwap())

  # Filter is cheaper than the TopK computation and the two commute, so
  # filter first.
  if frequency_threshold is not None:
    counts |= ('FilterByFrequencyThreshold(%s)' % frequency_threshold >>
               beam.Filter(lambda kv: kv[0] >= frequency_threshold))

  if top_k is not None:
    counts = (counts
              | 'Top(%s)' % top_k >>
              beam.transforms.combiners.Top.Largest(top_k)
              | 'FlattenList' >> beam.FlatMap(lambda lst: lst))

  # Performance optimization to obviate reading from finely sharded files
  # via AsIter. By forcing all data into a single group we end up reading
  # from a single file.
  @beam.ptransform_fn
  def Reshard(pcoll):  # pylint: disable=invalid-name
    return (
        pcoll
        | 'PairWithNone' >> beam.Map(lambda x: (None, x))
        | 'GroupByNone' >> beam.GroupByKey()
        | 'ExtractValues' >> beam.FlatMap(lambda x: x[1]))

  counts |= 'ReshardToOneGroup' >> Reshard()  # pylint: disable=no-value-for-parameter

  # Using AsIter instead of AsList below in order to reduce max memory
  # usage (due to AsList caching).
  def order_by_decreasing_counts(ignored, counts_iter, store_frequency):
    """Sorts the vocabulary by frequency count."""
    del ignored
    counts = list(counts_iter)
    if not counts:
      counts = [(1, '49d0cd50-04bb-48c0-bc6f-5b575dce351a')]
    counts.sort(reverse=True)  # Largest first.
    if store_frequency:
      # Returns ['count1 element1', ... ]
      return ['{} {}'.format(count, element) for count, element in counts]
    else:
      return [element for _, element in counts]

  vocabulary_file = os.path.join(self._temp_assets_dir,
                                 self._spec.vocab_filename)
  vocab_is_written = (
      pcoll.pipeline
      | 'Prepare' >> beam.Create([None])
      | 'OrderByDecreasingCounts' >> beam.FlatMap(
          order_by_decreasing_counts,
          counts_iter=beam.pvalue.AsIter(counts),
          store_frequency=self._spec.store_frequency)
      | 'WriteToFile' >> beam.io.WriteToText(vocabulary_file,
                                             shard_name_template=''))

  # Return the vocabulary path.
  wait_for_vocabulary_transform = (
      pcoll.pipeline
      | 'CreatePath' >> beam.Create([[vocabulary_file]])
      # Ensure that the analysis returns only after the file is written.
      | 'WaitForVocabularyFile' >> beam.Map(
          lambda x, y: x, y=beam.pvalue.AsIter(vocab_is_written)))
  return wait_for_vocabulary_transform
def _MutualInformationTransformAccumulate(pcol):  # pylint: disable=invalid-name
  """Accumulates information needed for mutual information computation."""
  return (pcol
          | 'VocabCountPerLabelPerTokenAccumulate' >> beam.CombinePerKey(
              _WeightedMeanCombineFn()))
def _ComputePerSlice(  # pylint: disable=invalid-name
    sliced_extracts: beam.pvalue.PCollection,
    computations: List[metric_types.MetricComputation],
    derived_computations: List[metric_types.DerivedMetricComputation],
    cross_slice_computations: List[metric_types.CrossSliceMetricComputation],
    cross_slice_specs: Optional[Iterable[config.CrossSlicingSpec]] = None,
    compute_with_sampling: Optional[bool] = False,
    num_jackknife_samples: int = 0,
    skip_ci_metric_keys: Set[metric_types.MetricKey] = frozenset(),
    random_seed_for_testing: Optional[int] = None,
    baseline_model_name: Optional[Text] = None
) -> beam.pvalue.PCollection:  # pytype: disable=annotation-type-mismatch
  """PTransform for computing, aggregating and combining metrics and plots.

  Args:
    sliced_extracts: Incoming PCollection consisting of slice key and extracts.
    computations: List of MetricComputations.
    derived_computations: List of DerivedMetricComputations.
    cross_slice_computations: List of CrossSliceMetricComputations.
    cross_slice_specs: List of CrossSlicingSpecs.
    compute_with_sampling: True to compute with bootstrap sampling. This
      allows _ComputePerSlice to be used to generate unsampled values from the
      whole data set, as well as bootstrap resamples, in which each element is
      treated as if it showed up p ~ poisson(1) times.
    num_jackknife_samples: Number of delete-d jackknife estimates to use in
      computing standard errors on metrics.
    skip_ci_metric_keys: List of metric keys for which to skip confidence
      interval computation.
    random_seed_for_testing: Seed to use for unit testing.
    baseline_model_name: Name of the baseline model.

  Returns:
    PCollection of (slice key, dict of metrics).
  """
  # TODO(b/123516222): Remove this workaround per discussions in CL/227944001
  sliced_extracts.element_type = beam.typehints.Any

  def convert_and_add_derived_values(
      sliced_results: Tuple[slicer.SliceKeyType,
                            Tuple[metric_types.MetricsDict, ...]],
      derived_computations: List[metric_types.DerivedMetricComputation],
  ) -> Tuple[slicer.SliceKeyType, metric_types.MetricsDict]:
    """Converts per-slice tuple of dicts into a single dict and adds derived."""
    result = {}
    for v in sliced_results[1]:
      result.update(v)
    for c in derived_computations:
      result.update(c.result(result))
    return sliced_results[0], result

  def add_diff_metrics(
      sliced_metrics: Tuple[Union[slicer.SliceKeyType,
                                  slicer.CrossSliceKeyType],
                            Dict[metric_types.MetricKey, Any]],
      baseline_model_name: Optional[Text],
  ) -> Tuple[slicer.SliceKeyType, Dict[metric_types.MetricKey, Any]]:
    """Adds diff metrics if there is a baseline model."""
    result = copy.copy(sliced_metrics[1])

    if baseline_model_name:
      diff_result = {}
      for k, v in result.items():
        if _is_private_metrics(k):
          continue
        if k.model_name != baseline_model_name and k.make_baseline_key(
            baseline_model_name) in result:
          # Check if the metric is diffable; skip plots and non-numerical
          # values.
          if _is_metric_diffable(v):
            diff_result[k.make_diff_key()] = v - result[k.make_baseline_key(
                baseline_model_name)]
      result.update(diff_result)

    # Remove private metrics.
    _remove_private_metrics(result)
    return (sliced_metrics[0], result)

  combiner = _ComputationsCombineFn(
      computations=computations,
      compute_with_sampling=compute_with_sampling,
      random_seed_for_testing=random_seed_for_testing)
  if num_jackknife_samples:
    # We do not use the hot key fanout hint used by the non-jackknife path
    # because the random jackknife partitioning naturally mitigates hot keys.
    sliced_combiner_outputs = (
        sliced_extracts
        | 'JackknifeCombinePerSliceKey' >> jackknife.JackknifeCombinePerKey(
            combiner, num_jackknife_samples))
  else:
    sliced_combiner_outputs = (
        sliced_extracts
        | 'CombinePerSliceKey' >> beam.CombinePerKey(combiner)
        .with_hot_key_fanout(_COMBINE_PER_SLICE_KEY_HOT_KEY_FANOUT))

  sliced_derived_values_and_diffs = (
      sliced_combiner_outputs
      | 'ConvertAndAddDerivedValues' >> beam.Map(
          convert_and_add_derived_values, derived_computations)
      | 'AddCrossSliceMetrics' >> _AddCrossSliceMetrics(  # pylint: disable=no-value-for-parameter
          cross_slice_specs, cross_slice_computations)
      | 'AddDiffMetrics' >> beam.Map(add_diff_metrics, baseline_model_name))

  if num_jackknife_samples:
    return (sliced_derived_values_and_diffs
            | 'MergeJackknifeSamples' >> jackknife.MergeJackknifeSamples(
                num_jackknife_samples, skip_ci_metric_keys))
  else:
    return sliced_derived_values_and_diffs
def expand(self, p):
  return (p
          | 'extract_field' >> beam.Map(
              lambda x: (vars(x)[self.field], x.score))
          | beam.CombinePerKey(sum))
def count_decorated_with_side_input_fn(input_or_inputs, factor=1):
  """Count as a decorated function with a side input."""
  return (input_or_inputs
          | 'pair_with_one' >> beam.Map(lambda x: (x, factor))
          | beam.CombinePerKey(sum))
def testTjurDiscriminationMetricsWithWeights(self, metric, expected_value):
  computations = metric.computations()
  shared_metrics = computations[0]
  metric = computations[1]

  # Positive labels: 1.0 * 0.0 + 2.0 * 1.0 + 3.0 * 1.0 + 4.0 * 0.0 = 5.0
  # Negative labels: 1.0 * 1.0 + 2.0 * 0.0 + 3.0 * 0.0 + 4.0 * 1.0 = 5.0
  # Positive predictions: 1.0 * 0.0 * 0.8 + 2.0 * 1.0 * 0.3 + 3.0 * 1.0 * 0.9
  #     + 4.0 * 0.0 * 0.2 = 3.3
  # Negative predictions: 1.0 * 1.0 * 0.8 + 2.0 * 0.0 * 0.7 + 3.0 * 0.0 * 0.1
  #     + 4.0 * 1.0 * 0.2 = 1.6
  example1 = {
      'labels': np.array([0.0]),
      'predictions': np.array([0.8]),
      'example_weights': np.array([1.0]),
  }
  example2 = {
      'labels': np.array([1.0]),
      'predictions': np.array([0.3]),
      'example_weights': np.array([2.0]),
  }
  example3 = {
      'labels': np.array([1.0]),
      'predictions': np.array([0.9]),
      'example_weights': np.array([3.0]),
  }
  example4 = {
      'labels': np.array([0.0]),
      'predictions': np.array([0.2]),
      'example_weights': np.array([4.0]),
  }

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([example1, example2, example3, example4])
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'ComputeWeightedTotals' >> beam.CombinePerKey(
            shared_metrics.combiner)
        | 'ComputeMetric' >> beam.Map(lambda x: (x[0], metric.result(x[1]))))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        key = metric.keys[0]
        self.assertDictElementsAlmostEqual(
            got_metrics, {key: expected_value}, places=5)
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')
def testCalibrationPlot(self):
  computations = calibration_plot.CalibrationPlot(
      num_buckets=10).computations()
  histogram = computations[0]
  plot = computations[1]

  example1 = {
      'labels': np.array([0.0]),
      'predictions': np.array([0.2]),
      'example_weights': np.array([1.0])
  }
  example2 = {
      'labels': np.array([1.0]),
      'predictions': np.array([0.8]),
      'example_weights': np.array([2.0])
  }
  example3 = {
      'labels': np.array([0.0]),
      'predictions': np.array([0.5]),
      'example_weights': np.array([3.0])
  }
  example4 = {
      'labels': np.array([1.0]),
      'predictions': np.array([-0.1]),
      'example_weights': np.array([4.0])
  }
  example5 = {
      'labels': np.array([1.0]),
      'predictions': np.array([0.5]),
      'example_weights': np.array([5.0])
  }
  example6 = {
      'labels': np.array([1.0]),
      'predictions': np.array([0.8]),
      'example_weights': np.array([6.0])
  }
  example7 = {
      'labels': np.array([0.0]),
      'predictions': np.array([0.2]),
      'example_weights': np.array([7.0])
  }
  example8 = {
      'labels': np.array([1.0]),
      'predictions': np.array([1.1]),
      'example_weights': np.array([8.0])
  }

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([
            example1, example2, example3, example4, example5, example6,
            example7, example8
        ])
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'ComputeHistogram' >> beam.CombinePerKey(histogram.combiner)
        | 'ComputePlot' >> beam.Map(lambda x: (x[0], plot.result(x[1]))))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_plots = got[0]
        self.assertEqual(got_slice_key, ())
        self.assertLen(got_plots, 1)
        key = metric_types.PlotKey(name='calibration_plot')
        self.assertIn(key, got_plots)
        got_plot = got_plots[key]
        self.assertProtoEquals(
            """
            buckets {
              lower_threshold_inclusive: -inf
              upper_threshold_exclusive: 0.0
              total_weighted_label {
                value: 4.0
              }
              total_weighted_refined_prediction {
                value: -0.4
              }
              num_weighted_examples {
                value: 4.0
              }
            }
            buckets {
              lower_threshold_inclusive: 0.0
              upper_threshold_exclusive: 0.1
              total_weighted_label {}
              total_weighted_refined_prediction {}
              num_weighted_examples {}
            }
            buckets {
              lower_threshold_inclusive: 0.1
              upper_threshold_exclusive: 0.2
              total_weighted_label {}
              total_weighted_refined_prediction {}
              num_weighted_examples {}
            }
            buckets {
              lower_threshold_inclusive: 0.2
              upper_threshold_exclusive: 0.3
              total_weighted_label {}
              total_weighted_refined_prediction {
                value: 1.6
              }
              num_weighted_examples {
                value: 8.0
              }
            }
            buckets {
              lower_threshold_inclusive: 0.3
              upper_threshold_exclusive: 0.4
              total_weighted_label {}
              total_weighted_refined_prediction {}
              num_weighted_examples {}
            }
            buckets {
              lower_threshold_inclusive: 0.4
              upper_threshold_exclusive: 0.5
              total_weighted_label {}
              total_weighted_refined_prediction {}
              num_weighted_examples {}
            }
            buckets {
              lower_threshold_inclusive: 0.5
              upper_threshold_exclusive: 0.6
              total_weighted_label {
                value: 5.0
              }
              total_weighted_refined_prediction {
                value: 4.0
              }
              num_weighted_examples {
                value: 8.0
              }
            }
            buckets {
              lower_threshold_inclusive: 0.6
              upper_threshold_exclusive: 0.7
              total_weighted_label {}
              total_weighted_refined_prediction {}
              num_weighted_examples {}
            }
            buckets {
              lower_threshold_inclusive: 0.7
              upper_threshold_exclusive: 0.8
              total_weighted_label {}
              total_weighted_refined_prediction {}
              num_weighted_examples {}
            }
            buckets {
              lower_threshold_inclusive: 0.8
              upper_threshold_exclusive: 0.9
              total_weighted_label {
                value: 8.0
              }
              total_weighted_refined_prediction {
                value: 6.4
              }
              num_weighted_examples {
                value: 8.0
              }
            }
            buckets {
              lower_threshold_inclusive: 0.9
              upper_threshold_exclusive: 1.0
              total_weighted_label {}
              total_weighted_refined_prediction {}
              num_weighted_examples {}
            }
            buckets {
              lower_threshold_inclusive: 1.0
              upper_threshold_exclusive: inf
              total_weighted_label {
                value: 8.0
              }
              total_weighted_refined_prediction {
                value: 8.8
              }
              num_weighted_examples {
                value: 8.0
              }
            }
            """, got_plot)
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')
if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Find the most used Java packages')
    parser.add_argument('--output_prefix', default='/tmp/output',
                        help='Output prefix')
    parser.add_argument(
        '--input',
        default='../javahelp/src/main/java/com/google/cloud/training/dataanalyst/javahelp/',
        help='Input directory')

    options, pipeline_args = parser.parse_known_args()
    p = beam.Pipeline(argv=pipeline_args)

    input = '{0}*.java'.format(options.input)
    output_prefix = options.output_prefix
    keyword = 'import'

    # Find the most used packages.
    (p
     | 'GetJava' >> beam.io.ReadFromText(input)
     | 'GetImports' >> beam.FlatMap(lambda line: startsWith(line, keyword))
     | 'PackageUse' >> beam.FlatMap(lambda line: packageUse(line, keyword))
     | 'TotalUse' >> beam.CombinePerKey(sum)
     | 'Top_5' >> beam.transforms.combiners.Top.Of(5, by_value)
     | 'write' >> beam.io.WriteToText(output_prefix))

    p.run().wait_until_finish()
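# The helpers `startsWith`, `packageUse`, and `by_value` are not shown above.
# Implementations consistent with how they are called might look like this
# (a sketch, not necessarily the author's exact code):
def startsWith(line, term):
    # Emit only lines that begin with the keyword (e.g. 'import').
    if line.startswith(term):
        yield line


def packageUse(line, keyword):
    # Extract the package name after the keyword and emit (prefix, 1) for
    # every package prefix, so usage counts roll up the package hierarchy.
    start = line.find(keyword) + len(keyword)
    end = line.find(';', start)
    if start < end:
        package = line[start:end].strip()
        parts = package.split('.')
        for i in range(1, len(parts) + 1):
            yield ('.'.join(parts[:i]), 1)


def by_value(kv1, kv2):
    # Comparator for Top.Of: order (package, count) pairs by count.
    key1, value1 = kv1
    key2, value2 = kv2
    return value1 < value2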
def test_combine_per_key(self):
  with self.create_pipeline() as p:
    res = (p
           | beam.Create([('a', 1), ('a', 2), ('b', 3)])
           | beam.CombinePerKey(beam.combiners.MeanCombineFn()))
    assert_that(res, equal_to([('a', 1.5), ('b', 3.0)]))
def testConfusionMatrixMetrics(self, metric, expected_value):
  computations = metric.computations()
  histogram = computations[0]
  matrices = computations[1]
  metrics = computations[2]

  # tp = 1
  # tn = 2
  # fp = 3
  # fn = 4
  example1 = {
      'labels': np.array([1.0]),
      'predictions': np.array([0.6]),
      'example_weights': np.array([1.0]),
  }
  example2 = {
      'labels': np.array([0.0]),
      'predictions': np.array([0.3]),
      'example_weights': np.array([1.0]),
  }
  example3 = {
      'labels': np.array([0.0]),
      'predictions': np.array([0.2]),
      'example_weights': np.array([1.0]),
  }
  example4 = {
      'labels': np.array([0.0]),
      'predictions': np.array([0.6]),
      'example_weights': np.array([1.0]),
  }
  example5 = {
      'labels': np.array([0.0]),
      'predictions': np.array([0.7]),
      'example_weights': np.array([1.0]),
  }
  example6 = {
      'labels': np.array([0.0]),
      'predictions': np.array([0.8]),
      'example_weights': np.array([1.0]),
  }
  example7 = {
      'labels': np.array([1.0]),
      'predictions': np.array([0.1]),
      'example_weights': np.array([1.0]),
  }
  example8 = {
      'labels': np.array([1.0]),
      'predictions': np.array([0.2]),
      'example_weights': np.array([1.0]),
  }
  example9 = {
      'labels': np.array([1.0]),
      'predictions': np.array([0.3]),
      'example_weights': np.array([1.0]),
  }
  example10 = {
      'labels': np.array([1.0]),
      'predictions': np.array([0.4]),
      'example_weights': np.array([1.0]),
  }

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create([
            example1, example2, example3, example4, example5, example6,
            example7, example8, example9, example10
        ])
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'ComputeHistogram' >> beam.CombinePerKey(histogram.combiner)
        | 'ComputeMatrices' >> beam.Map(
            lambda x: (x[0], matrices.result(x[1])))  # pyformat: ignore
        | 'ComputeMetrics' >> beam.Map(
            lambda x: (x[0], metrics.result(x[1]))))  # pyformat: ignore
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        self.assertLen(got_metrics, 1)
        key = metrics.keys[0]
        self.assertDictElementsAlmostEqual(
            got_metrics, {key: expected_value}, places=5)
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')
def run(argv=None):
  """Main entry point; defines and runs the pipeline."""
  logging.info("Starting pipeline.")
  parser = argparse.ArgumentParser()
  parser.add_argument('--usps_key',
                      dest='usps_key',
                      default=None,
                      help='USPS API key')
  known_args, pipeline_args = parser.parse_known_args(argv)
  pipeline_args.extend(['--project=voterdb-test', '--job_name=voter-pipeline'])

  # if not known_args.usps_key:
  #   raise Exception("Provide USPS API key.")

  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  with beam.Pipeline(options=pipeline_options) as p:
    # TODO: Select rather than hard-code bucket/file name.
    raw = (
        p
        | "AllNYSVoters_2018-03-13.csv" >> beam.io.ReadFromText(
            "gs://upload-raw/AllNYSVoters_2018-03-13.csv")
        | "DictFromRawLine" >> beam.ParDo(DictFromRawLine()))

    elections = (
        p
        | "Voter.ElectionCodes" >> beam.io.Read(
            beam.io.BigQuerySource(table='Voter.ElectionCodes', validate=True))
        | "beam.Map(make_kv_pair, 'Election')" >> beam.Map(
            make_kv_pair, 'Election'))

    counties = (
        p
        | "Voter.CountyCodes" >> beam.io.Read(
            beam.io.BigQuerySource(table='Voter.CountyCodes', validate=True))
        | "beam.Map(make_kv_pair, 'Code')" >> beam.Map(make_kv_pair, 'Code'))

    output = (
        raw
        | "Voter.Raw" >> beam.io.WriteToBigQuery(
            table='Voter.Raw',
            schema=RAW_VF_SCHEMA,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED))

    output = (
        raw
        | "key_by_county_type" >> beam.Map(key_by_county_type,
                                           beam.pvalue.AsDict(counties))
        | "beam.CombinePerKey" >> beam.CombinePerKey(sum)
        | "flatten_sum" >> beam.Map(flatten_sum)
        | "Voter.Counts" >> beam.io.WriteToBigQuery(
            table='Voter.Counts',
            schema=COUNTS_SCHEMA,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED))

    output = (
        raw
        # | "BatchElements" >> beam.BatchElements()
        # | "BatchRunner" >> beam.ParDo(BatchRunner(), known_args.usps_key)
        | "build_formatted" >> beam.FlatMap(build_formatted,
                                            beam.pvalue.AsDict(elections),
                                            beam.pvalue.AsDict(counties))
        | "Voter.Formatted" >> beam.io.WriteToBigQuery(
            table='Voter.Formatted',
            schema=FORMATTED_SCHEMA,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED))
def needs_help(pcoll):
  return (pcoll
          | 'PackageHelp' >> beam.FlatMap(
              lambda rowdict: packageHelp(rowdict['content'], 'package'))
          | 'TotalHelp' >> beam.CombinePerKey(sum)
          | 'DropZero' >> beam.Filter(lambda packages: packages[1] > 0))
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# beam-playground:
#   name: CombinePerKey
#   description: Task from katas to implement the summation of scores per player.
#   multifile: false
#   pipeline_options:
#   categories:
#     - Combiners

import apache_beam as beam

from log_elements import LogElements

PLAYER_1 = 'Player 1'
PLAYER_2 = 'Player 2'
PLAYER_3 = 'Player 3'

with beam.Pipeline() as p:

  (p
   | beam.Create([(PLAYER_1, 15), (PLAYER_2, 10), (PLAYER_1, 100),
                  (PLAYER_3, 25), (PLAYER_2, 75)])
   | beam.CombinePerKey(sum)
   | LogElements())
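# For reference, the summation above yields (element order may vary between
# runs, since PCollections are unordered):
#   ('Player 1', 115)   # 15 + 100
#   ('Player 2', 85)    # 10 + 75
#   ('Player 3', 25)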
def testFlipCount(self):
  computations = flip_count.FlipCount(
      thresholds=[0.3],
      counterfactual_prediction_key='counterfactual_pred_key',
      example_id_key='example_id_key').computations(example_weighted=True)
  binary_confusion_matrix = computations[0]
  matrices = computations[1]
  metrics = computations[2]

  # TODO(b/171180441): Handle absence of ground truth labels in
  # counterfactual examples while computing flip count metrics.
  examples = [
      {
          'labels': None,
          'predictions': np.array([0.5]),
          'example_weights': np.array([1.0]),
          'features': {
              'counterfactual_pred_key': np.array([0.7]),
              'example_id_key': np.array(['id_1']),
          },
      },
      {
          'labels': None,
          'predictions': np.array([0.1, 0.7]),  # to test flattening
          'example_weights': np.array([3.0]),
          'features': {
              'counterfactual_pred_key': np.array([1.0, 0.1]),
              'example_id_key': np.array(['id_2']),
          },
      },
      {
          'labels': None,
          'predictions': np.array([0.5, 0.2]),
          'example_weights': np.array([2.0]),
          'features': {
              'counterfactual_pred_key': np.array([0.2, 0.4]),
              'example_id_key': np.array(['id_3']),
          },
      },
      {
          'labels': None,
          'predictions': np.array([0.2, 0.1]),
          'example_weights': np.array([1.0]),
          'features': {
              'counterfactual_pred_key': np.array([0.4, 0.5]),
              'example_id_key': np.array(['id_4']),
          },
      },
  ]

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create(examples)
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs, True)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'ComputeBinaryConfusionMatrix' >> beam.CombinePerKey(
            binary_confusion_matrix.combiner)
        | 'ComputeMatrices' >> beam.Map(
            lambda x: (x[0], matrices.result(x[1])))  # pyformat: ignore
        | 'ComputeMetrics' >> beam.Map(lambda x: (x[0], metrics.result(x[1]))))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_metrics = got[0]
        self.assertEqual(got_slice_key, ())
        self.assertLen(got_metrics, 6)
        self.assertDictElementsAlmostEqual(
            got_metrics, {
                metric_types.MetricKey(
                    name='flip_count/[email protected]',
                    example_weighted=True): 5.0,
                metric_types.MetricKey(
                    name='flip_count/[email protected]',
                    example_weighted=True): 7.0,
                metric_types.MetricKey(
                    name='flip_count/[email protected]',
                    example_weighted=True): 6.0,
                metric_types.MetricKey(
                    name='flip_count/[email protected]',
                    example_weighted=True): 7.0,
            })
        self.assertAllEqual(
            got_metrics[metric_types.MetricKey(
                name='flip_count/[email protected]',
                example_weighted=True)], np.array([['id_2'], ['id_3']]))
        self.assertAllEqual(
            got_metrics[metric_types.MetricKey(
                name='flip_count/[email protected]',
                example_weighted=True)],
            np.array([['id_2'], ['id_3'], ['id_4']]))
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')
# p1 = beam.Pipeline()
pipeline1 = beam.Pipeline(options=options)

table_spec = 'crazy-hippo-01:department_dataflow.group_by_name'
table_schema = 'name:STRING, count:INTEGER'

# with beam.Pipeline() as pipeline1:
dep_data_count = (
    pipeline1
    | 'Read from file' >> beam.io.ReadFromText(
        'gs://crazy-hippo-01/dataflow_beam_data/dept-data.txt')
    | 'Select_data' >> beam.Map(selectData)
    | 'Filter record on Accounts' >> beam.Filter(filtering)
    | 'Create Dict of Records' >> beam.Map(lambda record: (record[1], 1))
    | 'Apply CombinePerKey on Records' >> beam.CombinePerKey(sum)
    | 'Make into Dict' >> beam.Map(lambda x: {"name": x[0], "count": x[1]})
    # | 'Write to Cloud Storage' >> beam.io.WriteToText(
    #     'gs://crazy-hippo-01/dataflow_beam_data/output_new')
    | 'Write to BQ' >> beam.io.WriteToBigQuery(
        table_spec,
        schema=table_schema,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED))

pipeline1.run().wait_until_finish()
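# `selectData` and `filtering` are not shown in the snippet above.
# Implementations consistent with how they are called might look like this
# (assumptions; the field indexes mirror the similar dept-data pipeline
# earlier in this section):
def selectData(line):
    # Split a CSV row into fields; record[1] is the name used as the key.
    return line.split(',')


def filtering(record):
    # Keep only rows for the Accounts department.
    return record[3] == 'Accounts'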
def testMultiClassConfusionMatrixPlot(self):
  computations = (
      multi_class_confusion_matrix_plot.MultiClassConfusionMatrixPlot()
      .computations(example_weighted=True))
  matrices = computations[0]
  plot = computations[1]

  example1 = {
      'labels': np.array([2.0]),
      'predictions': np.array([0.2, 0.3, 0.5]),
      'example_weights': np.array([0.5])
  }
  example2 = {
      'labels': np.array([0.0]),
      'predictions': np.array([0.1, 0.4, 0.5]),
      'example_weights': np.array([1.0])
  }
  example3 = {
      'labels': np.array([1.0]),
      'predictions': np.array([0.3, 0.2, 0.5]),
      'example_weights': np.array([0.25])
  }
  example4 = {
      'labels': np.array([1.0]),
      'predictions': np.array([0.1, 0.9, 0.0]),
      'example_weights': np.array([1.0])
  }
  example5 = {
      'labels': np.array([1.0]),
      'predictions': np.array([0.1, 0.8, 0.1]),
      'example_weights': np.array([1.0])
  }
  example6 = {
      'labels': np.array([2.0]),
      'predictions': np.array([0.3, 0.2, 0.5]),
      'example_weights': np.array([1.0])
  }

  with beam.Pipeline() as pipeline:
    # pylint: disable=no-value-for-parameter
    result = (
        pipeline
        | 'Create' >> beam.Create(
            [example1, example2, example3, example4, example5, example6])
        | 'Process' >> beam.Map(metric_util.to_standard_metric_inputs)
        | 'AddSlice' >> beam.Map(lambda x: ((), x))
        | 'ComputeMatrices' >> beam.CombinePerKey(matrices.combiner)
        | 'ComputePlot' >> beam.Map(lambda x: (x[0], plot.result(x[1]))))
    # pylint: enable=no-value-for-parameter

    def check_result(got):
      try:
        self.assertLen(got, 1)
        got_slice_key, got_plots = got[0]
        self.assertEqual(got_slice_key, ())
        self.assertLen(got_plots, 1)
        key = metric_types.PlotKey(
            name='multi_class_confusion_matrix_plot', example_weighted=True)
        got_matrix = got_plots[key]
        self.assertProtoEquals(
            """
            matrices {
              threshold: 0.0
              entries {
                actual_class_id: 0
                predicted_class_id: 2
                num_weighted_examples: 1.0
              }
              entries {
                actual_class_id: 1
                predicted_class_id: 1
                num_weighted_examples: 2.0
              }
              entries {
                actual_class_id: 1
                predicted_class_id: 2
                num_weighted_examples: 0.25
              }
              entries {
                actual_class_id: 2
                predicted_class_id: 2
                num_weighted_examples: 1.5
              }
            }
            """, got_matrix)
      except AssertionError as err:
        raise util.BeamAssertException(err)

    util.assert_that(result, check_result, label='result')
def expand(self, pcoll):
  return (pcoll
          | 'PairWithOne' >> beam.Map(lambda v: (v, 1))
          | beam.CombinePerKey(sum))
def run(argv=None):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      dest='input',
                      default='gs://lswa-scalica/input/df_input.txt',
                      help='Input file to process.')
  parser.add_argument('--output',
                      dest='output',
                      default='gs://lswa-scalica/output/df_output.txt',
                      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)
  pipeline_args.extend([
      '--runner=DataflowRunner',
      '--project=scalica-224416',
      '--staging_location=gs://lswa-scalica/staging',
      '--temp_location=gs://lswa-scalica/tmp',
      '--job_name=scalica-job',
  ])

  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  with beam.Pipeline(options=pipeline_options) as p:

    # Format an input line into a dictionary.
    def format_input(line):
      split_line = line.split(',')
      user_id = split_line[0]
      followees = split_line[1].split('-')
      followees = [int(followee) for followee in followees if followee]
      followers = split_line[2].split('-')
      followers = [int(follower) for follower in followers if follower]
      return {
          'user_id': user_id,
          'followees': followees,
          'followers': followers
      }

    # Split followees and followers into a list of pairs.
    def split(user_data):
      follower_pairs = []
      for followee in user_data['followees']:
        for follower in user_data['followers']:
          if followee != follower:
            follower_pair = str(followee) + ',' + str(follower)
            follower_pairs.append(follower_pair)
      return follower_pairs

    # Emit a count for each follower_pair.
    def map_count(follower_pair):
      return (follower_pair, 1)

    # Format each follower pair + count.
    def format_result(map_pair):
      (follower_pair, count) = map_pair
      return '%s: %s' % (follower_pair, count)

    logging.info('reading from input')

    # Read the input file.
    lines = p | ReadFromText(known_args.input)

    suggestions = (lines
                   | 'FormatInput' >> beam.Map(format_input)
                   | 'Split' >> beam.FlatMap(split)
                   | 'MapCount' >> beam.Map(map_count)
                   | 'GroupAndSum' >> beam.CombinePerKey(sum))
    logging.info('generated suggestions')

    output = suggestions | 'Format' >> beam.Map(format_result)

    # For convenience, write to a single shard only. For scalability, omit
    # num_shards so Dataflow chooses the sharding automatically.
    output | WriteToText(known_args.output, num_shards=1)
def expand(self, pcoll):
  return (pcoll
          | beam.CombinePerKey(sum).with_output_types(
              typing.Tuple[unicode, int]))
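# Note: `unicode` is a Python 2 builtin; under Python 3 the equivalent type
# hint is `str`. A minimal Python 3 sketch of the same typed combine (the
# wrapper class name `CountPerKey` is an assumption):
import typing

import apache_beam as beam


class CountPerKey(beam.PTransform):

    def expand(self, pcoll):
        # Declaring output types lets the runner apply combiner optimizations.
        return (pcoll
                | beam.CombinePerKey(sum).with_output_types(
                    typing.Tuple[str, int]))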