def _AddCrossSliceMetrics( # pylint: disable=invalid-name sliced_combiner_outputs: beam.pvalue.PCollection, cross_slice_specs: Optional[Iterable[config.CrossSlicingSpec]] ) -> Tuple[slicer.SliceKeyOrCrossSliceKeyType, metric_types.MetricsDict]: """Generates CrossSlice metrics from SingleSlices.""" def is_slice_applicable( sliced_combiner_output: Tuple[slicer.SliceKeyType, metric_types.MetricsDict], slicing_specs: Union[config.SlicingSpec, Iterable[config.SlicingSpec]] ) -> bool: slice_key, _ = sliced_combiner_output for slicing_spec in slicing_specs: if slicer.SingleSliceSpec( spec=slicing_spec).is_slice_applicable(slice_key): return True return False def compute_cross_slices( baseline_slice: Tuple[slicer.SliceKeyType, metric_types.MetricsDict], comparison_slices: Iterable[Tuple[slicer.SliceKeyType, Dict[metric_types.MetricKey, Any]]] ) -> Iterator[Tuple[slicer.CrossSliceKeyType, Dict[metric_types.MetricKey, Any]]]: baseline_slice_key, baseline_metrics = baseline_slice for (comparison_slice_key, comparison_metrics) in comparison_slices: result = {} for (comparison_metric_key, comparison_metric_value) in comparison_metrics.items(): if comparison_metric_key not in baseline_metrics: continue result[comparison_metric_key] = ( baseline_metrics[comparison_metric_key] - comparison_metric_value) yield ((baseline_slice_key, comparison_slice_key), result) cross_slice_outputs = [] for cross_slice_ind, cross_slice_spec in enumerate(cross_slice_specs): baseline_slices = ( sliced_combiner_outputs | 'FilterBaselineSlices(%d)' % cross_slice_ind >> beam.Filter( is_slice_applicable, [cross_slice_spec.baseline_spec])) slicing_specs = list(cross_slice_spec.slicing_specs) comparison_slices = ( sliced_combiner_outputs | 'FilterComparisonSlices(%d)' % cross_slice_ind >> beam.Filter( is_slice_applicable, slicing_specs)) cross_slice_outputs.append( baseline_slices | 'GenerateCrossSlices(%d)' % cross_slice_ind >> beam.FlatMap( compute_cross_slices, comparison_slices=beam.pvalue.AsIter(comparison_slices))) if cross_slice_outputs: cross_slice_outputs = (cross_slice_outputs | 'FlattenCrossSliceResults' >> beam.Flatten()) return ([sliced_combiner_outputs, cross_slice_outputs] | 'CombineSingleSlicesWithCrossSlice' >> beam.Flatten()) else: return sliced_combiner_outputs
def expand(self, pcoll: beam.pvalue.PCollection) -> beam.pvalue.PCollection: def _sum_pairwise( iter_of_pairs: Iterator[Tuple[Union[int, float], Union[int, float]]] ) -> Tuple[Union[int, float], Union[int, float]]: """Computes sum of counts and weights.""" # We take advantage of the fact that constructing a np array from a list # is much faster as the length is known beforehand. if isinstance(iter_of_pairs, list): arr = np.array( iter_of_pairs, dtype=[('c', np.int64), ('w', np.float)]) else: arr = np.fromiter( iter_of_pairs, dtype=[('c', np.int64), ('w', np.float)]) return arr['c'].sum(), arr['w'].sum() if self._weight_feature is not None: sum_fn = _sum_pairwise else: # For non-weighted case, use sum combine fn over integers to allow Beam # to use Cython combiner. sum_fn = sum top_k_tuples_combined = ( pcoll | 'ToTopKTuples' >> beam.FlatMap( _to_topk_tuples, bytes_features=self._bytes_features, categorical_features=self._categorical_features, weight_feature=self._weight_feature) | 'CombineCountsAndWeights' >> beam.CombinePerKey(sum_fn) | 'Rearrange' >> beam.MapTuple(lambda k, v: ((k[0], k[1]), (v, k[2])))) # (slice_key, feature), (count_and_maybe_weight, value) top_k = top_k_tuples_combined if self._weight_feature is not None: top_k |= 'Unweighted_DropWeightsAndRearrange' >> beam.MapTuple( lambda k, v: (k, (v[0][0], v[1]))) # (slice_key, feature), (count, value) top_k = ( top_k | 'Unweighted_TopK' >> beam.combiners.Top().PerKey( max(self._num_top_values, self._num_rank_histogram_buckets)) | 'Unweighted_ToFeatureValueCount' >> beam.MapTuple( lambda k, v: (k, [FeatureValueCount(t[1], t[0]) for t in v])) | 'Unweighted_ToProto' >> beam.Map( _make_dataset_feature_stats_proto_with_topk_for_single_feature, categorical_features=self._categorical_features, is_weighted_stats=False, num_top_values=self._num_top_values, frequency_threshold=self._frequency_threshold, num_rank_histogram_buckets=self._num_rank_histogram_buckets)) uniques = ( top_k_tuples_combined | 'Uniques_Keys' >> beam.Keys() | 'Uniques_CountPerFeatureName' >> beam.combiners.Count().PerElement() | 'Uniques_ConvertToSingleFeatureStats' >> beam.Map( _make_dataset_feature_stats_proto_with_uniques_for_single_feature, categorical_features=self._categorical_features)) result_protos = [top_k, uniques] if self._weight_feature is not None: weighted_top_k = ( top_k_tuples_combined | 'Weighted_DropCountsAndRearrange' >> beam.MapTuple(lambda k, v: (k, (v[0][1], v[1]))) # (slice_key, feature), (weight, value) | 'Weighted_TopK' >> beam.combiners.Top().PerKey( max(self._num_top_values, self._num_rank_histogram_buckets)) | 'Weighted_ToFeatureValueCount' >> beam.MapTuple( lambda k, v: (k, [FeatureValueCount(t[1], t[0]) for t in v])) | 'Weighted_ToProto' >> beam.Map( _make_dataset_feature_stats_proto_with_topk_for_single_feature, categorical_features=self._categorical_features, is_weighted_stats=True, num_top_values=self._num_top_values, frequency_threshold=self._weighted_frequency_threshold, num_rank_histogram_buckets=self._num_rank_histogram_buckets)) result_protos.append(weighted_top_k) return (result_protos | 'FlattenTopKUniquesFeatureStatsProtos' >> beam.Flatten())
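# A minimal, standalone sketch of the structured-dtype trick used by
# _sum_pairwise above: packing (count, weight) pairs into a single NumPy
# record array lets both columns be summed without a Python-level loop.
# Note the original uses np.float, which newer NumPy releases have removed;
# np.float64 is assumed here instead.
import numpy as np

pairs = [(3, 0.5), (2, 1.25), (5, 0.25)]  # (count, weight) pairs
arr = np.array(pairs, dtype=[('c', np.int64), ('w', np.float64)])
total_count, total_weight = arr['c'].sum(), arr['w'].sum()
assert total_count == 10 and abs(total_weight - 2.0) < 1e-9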
# Imports needed by this fragment; project_id, job_id and bucket_id are
# assumed to be defined elsewhere in the original script.
import datetime
import re

import apache_beam as beam
from apache_beam.options.pipeline_options import GoogleCloudOptions
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import StandardOptions
from google.cloud.bigtable import row

options = PipelineOptions()
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.project = project_id
google_cloud_options.job_name = job_id
google_cloud_options.staging_location = bucket_id
google_cloud_options.temp_location = bucket_id
options.view_as(StandardOptions).runner = 'DataflowRunner'


class FormatAsRow(beam.DoFn):
    """Formats a prediction dict as a Bigtable DirectRow."""

    def process(self, element):
        cf = 'cf'
        column_names = ['prediction', 'time', 'prob_0', 'prob_1']
        direct_row = row.DirectRow(row_key=element['event_id'])
        for name in column_names:
            direct_row.set_cell(column_family_id=cf,
                                column=name,
                                value=element[name],
                                timestamp=datetime.datetime.now())
        yield direct_row


with beam.Pipeline(options=options) as p:
    _ = (p
         | beam.io.ReadFromText(bucket_id + 'kinglear.txt')
         | 'ExtractWords' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
         | beam.combiners.Count.PerElement()
         | beam.Map(lambda word_count: '%s: %s' % (word_count[0], word_count[1]))
         | beam.io.WriteToText(bucket_id + 'counts.txt'))
# The `with` block runs the pipeline on exit; a separate p.run() call after it
# would attempt to run the pipeline a second time and is not needed.
def expand(self, pcoll):
    return (pcoll
            | beam.FlatMap(lambda line: map(int, line.split(',')))
            | beam.Map(lambda num: num * 10))
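# A small usage sketch of the steps in the expand() above. The transform class
# name is not shown in the original, so the same FlatMap/Map steps are inlined
# here; the input lines and expected output are illustrative.
import apache_beam as beam
from apache_beam.testing.util import assert_that, equal_to

with beam.Pipeline() as p:
    nums = (p
            | beam.Create(['1,2,3', '4,5'])
            | beam.FlatMap(lambda line: map(int, line.split(',')))
            | beam.Map(lambda num: num * 10))
    assert_that(nums, equal_to([10, 20, 30, 40, 50]))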
def expand(self, uri_to_content): # Compute the total number of documents, and prepare a singleton # PCollection to use as side input. total_documents = (uri_to_content | 'GetUris 1' >> beam.Keys() | 'GetUniqueUris' >> beam.RemoveDuplicates() | 'CountUris' >> beam.combiners.Count.Globally()) # Create a collection of pairs mapping a URI to each of the words # in the document associated with that that URI. def split_into_words(uri_line): (uri, line) = uri_line return [(uri, w.lower()) for w in re.findall(r'[A-Za-z\']+', line)] uri_to_words = (uri_to_content | 'SplitWords' >> beam.FlatMap(split_into_words)) # Compute a mapping from each word to the total number of documents # in which it appears. word_to_doc_count = ( uri_to_words | 'GetUniqueWordsPerDoc' >> beam.RemoveDuplicates() | 'GetWords' >> beam.Values() | 'CountDocsPerWord' >> beam.combiners.Count.PerElement()) # Compute a mapping from each URI to the total number of words in the # document associated with that URI. uri_to_word_total = ( uri_to_words | 'GetUris 2' >> beam.Keys() | 'CountWordsInDoc' >> beam.combiners.Count.PerElement()) # Count, for each (URI, word) pair, the number of occurrences of that word # in the document associated with the URI. uri_and_word_to_count = ( uri_to_words | 'CountWord-DocPairs' >> beam.combiners.Count.PerElement()) # Adjust the above collection to a mapping from (URI, word) pairs to counts # into an isomorphic mapping from URI to (word, count) pairs, to prepare # for a join by the URI key. def shift_keys(uri_word_count): return (uri_word_count[0][0], (uri_word_count[0][1], uri_word_count[1])) uri_to_word_and_count = (uri_and_word_to_count | 'ShiftKeys' >> beam.Map(shift_keys)) # Perform a CoGroupByKey (a sort of pre-join) on the prepared # uri_to_word_total and uri_to_word_and_count tagged by 'word totals' and # 'word counts' strings. This yields a mapping from URI to a dictionary # that maps the above mentioned tag strings to an iterable containing the # word total for that URI and word and count respectively. # # A diagram (in which '[]' just means 'iterable'): # # URI: {'word totals': [count], # Total words within this URI's document. # 'word counts': [(word, count), # Counts of specific words # (word, count), # within this URI's document. # ... ]} uri_to_word_and_count_and_total = ( { 'word totals': uri_to_word_total, 'word counts': uri_to_word_and_count } | 'CoGroupByUri' >> beam.CoGroupByKey()) # Compute a mapping from each word to a (URI, term frequency) pair for each # URI. A word's term frequency for a document is simply the number of times # that word occurs in the document divided by the total number of words in # the document. def compute_term_frequency(uri_count_and_total): (uri, count_and_total) = uri_count_and_total word_and_count = count_and_total['word counts'] # We have an iterable for one element that we want extracted. [word_total] = count_and_total['word totals'] for word, count in word_and_count: yield word, (uri, float(count) / word_total) word_to_uri_and_tf = ( uri_to_word_and_count_and_total | 'ComputeTermFrequencies' >> beam.FlatMap(compute_term_frequency)) # Compute a mapping from each word to its document frequency. # A word's document frequency in a corpus is the number of # documents in which the word appears divided by the total # number of documents in the corpus. # # This calculation uses a side input, a Dataflow-computed auxiliary value # presented to each invocation of our MapFn lambda. 
    # The second argument to the lambda (called total) receives the value we
    # listed after the lambda in Map(); tuple-unpacking lambda parameters are
    # Python 2 only, so the (word, count) pair is indexed instead. Additional
    # side inputs (and ordinary Python values, too) can be provided to MapFns
    # and DoFns in this way.
    word_to_df = (
        word_to_doc_count
        | 'ComputeDocFrequencies' >> beam.Map(
            lambda word_count, total: (word_count[0], float(word_count[1]) / total),
            AsSingleton(total_documents)))

    # Join the term frequency and document frequency collections,
    # each keyed on the word.
    word_to_uri_and_tf_and_df = (
        {'tf': word_to_uri_and_tf, 'df': word_to_df}
        | 'CoGroupWordsByTf-df' >> beam.CoGroupByKey())

    # Compute a mapping from each word to a (URI, TF-IDF) score for each URI.
    # There are a variety of definitions of TF-IDF
    # ("term frequency - inverse document frequency") score; here we use a
    # basic version: the term frequency multiplied by the log of the inverse
    # document frequency.
    def compute_tf_idf(word_tf_and_df):
        (word, tf_and_df) = word_tf_and_df
        [docf] = tf_and_df['df']
        for uri, tf in tf_and_df['tf']:
            yield word, (uri, tf * math.log(1 / docf))

    word_to_uri_and_tfidf = (
        word_to_uri_and_tf_and_df
        | 'ComputeTf-idf' >> beam.FlatMap(compute_tf_idf))

    return word_to_uri_and_tfidf
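# A worked example of the scoring used in compute_tf_idf above: with a term
# frequency of 3/12 for a word in one document and a document frequency of
# 2/4 across the corpus, the score is tf * log(1/df). Numbers are illustrative.
import math

tf = 3 / 12   # word occurs 3 times in a 12-word document
df = 2 / 4    # word appears in 2 of the 4 documents
tf_idf = tf * math.log(1 / df)
print(round(tf_idf, 4))  # ~0.1733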
import sys

import apache_beam as beam


def my_grep(line, term):
    if line.startswith(term):
        yield line


if __name__ == '__main__':
    p = beam.Pipeline(argv=sys.argv)
    input = '../javahelp/src/main/java/com/google/cloud/training/dataanalyst/javahelp/*.java'
    output_prefix = '/tmp/output'
    searchTerm = 'import'

    # find all lines that contain the searchTerm
    (p
     | 'GetJava' >> beam.io.ReadFromText(input)
     | 'Grep' >> beam.FlatMap(lambda line: my_grep(line, searchTerm))
     | 'write' >> beam.io.WriteToText(output_prefix))

    p.run().wait_until_finish()
def test_typed_flatmap(self):
  def fn(element: int) -> typehints.Iterable[int]:
    yield element * 2

  result = [1, 2, 3] | beam.FlatMap(fn)
  self.assertCountEqual([2, 4, 6], result)
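# Type hints can also be attached to the transform itself rather than to the
# callable's annotations; a minimal sketch of the equivalent step, again piping
# a plain Python list through the transform as the test above does.
import apache_beam as beam

doubled = (
    [1, 2, 3]
    | beam.FlatMap(lambda element: [element * 2])
      .with_input_types(int)
      .with_output_types(int))
assert sorted(doubled) == [2, 4, 6]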
import apache_beam as beam

project = 'teak-proton-148317'
input_table = 'clouddataflow-readonly:samples.weather_stations'
output_table = 'mydataset.weather_copy_from_dataflow1'

p = beam.Pipeline(argv=['--project', project])

read = beam.Read(beam.io.BigQuerySource(input_table))
tornadoesMonths = beam.FlatMap(
    lambda row: [(int(row['month']), 1)] if row['tornado'] else [])
monthlyCount = beam.CombinePerKey(sum)
# Tuple-unpacking lambda parameters are Python 2 only; index into the
# (key, value) pair instead so this runs under Python 3.
frmat = beam.Map(lambda kv: {'month': kv[0], 'tornado_count': kv[1]})
sve = beam.Write(
    beam.io.BigQuerySink(
        output_table,
        schema='month:INTEGER, tornado_count:INTEGER',
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

(p | read | tornadoesMonths | monthlyCount | frmat | sve)
p.run()
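# The snippet above uses the long-deprecated beam.Read/beam.Write wrappers with
# BigQuerySource/BigQuerySink. A hedged sketch of the same monthly tornado
# count using the current ReadFromBigQuery/WriteToBigQuery transforms (table
# names are illustrative, and a GCS temp_location is still required via
# pipeline options when running for real):
import apache_beam as beam

with beam.Pipeline() as p:
    (p
     | beam.io.ReadFromBigQuery(table='clouddataflow-readonly:samples.weather_stations')
     | beam.FlatMap(lambda row: [(int(row['month']), 1)] if row['tornado'] else [])
     | beam.CombinePerKey(sum)
     | beam.Map(lambda kv: {'month': kv[0], 'tornado_count': kv[1]})
     | beam.io.WriteToBigQuery(
         'mydataset.weather_copy_from_dataflow1',
         schema='month:INTEGER, tornado_count:INTEGER',
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))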
def make_beam_pipeline( root, input_filenames, sample_rate, debug, embedding_names, embedding_modules, module_output_keys, audio_key, sample_rate_key, label_key, speaker_id_key, average_over_time, delete_audio_from_output, output_filename, input_format='tfrecord', output_format='tfrecord', suffix='Main'): """Construct beam pipeline for mapping from audio to embeddings. Args: root: The beam root node. input_filenames: Python list. List of input files. sample_rate: Python int, or `None`. The sample rate for all embeddings, or `None` if this is a TFDS dataset, or if each example has its own sample rate. debug: Python bool. Whether to operate in debug mode. embedding_names: Python list of embeddings. embedding_modules: Python list of TF-Hub modules. module_output_keys: Python list of strings, names of output modules. audio_key: Python string, the key of the audio. sample_rate_key: Python string or `None`, the key for. label_key: Python string. Field for label. speaker_id_key: Python string or `None`. Key for speaker ID, or `None`. average_over_time: Python bool. If `True`, average over the time axis. delete_audio_from_output: Python bool. Whether to remove audio fromm outputs. output_filename: Python string. Output filename. input_format: Python string. Must correspond to a function in `reader_functions`. output_format: Python string. Must correspond to a function `writer_functions`. suffix: Python string. Suffix to stage names to make them unique. """ tf_examples_key_ = 'tf_examples' assert tf_examples_key_ not in embedding_names s = suffix # for code brevity. # Read from input. input_examples = reader_functions[input_format](root, input_filenames, s) # In debug mode, take one input example. if debug: input_examples = ( input_examples | f'TakeOne{s}' >> beam.transforms.combiners.Sample.FixedSizeGlobally(1) # Sampling generates lists, so flatten back into one collection. | f'DebugFlatten{s}' >> beam.FlatMap(lambda x: x)) # Compute all the embeddings simultaneously. embedding_tables = {} for name, mod, out_key in zip( embedding_names, embedding_modules, module_output_keys): logging.info('Adding signal: %s %s, %s', name, mod, out_key) tbl = input_examples | f'ComputeEmbedding-{name}-{s}' >> beam.ParDo( ComputeEmbeddingMapFn( name=name, module=mod, output_key=out_key, audio_key=audio_key, sample_rate_key=sample_rate_key, sample_rate=sample_rate, average_over_time=average_over_time)) embedding_tables[name] = tbl assert tf_examples_key_ not in embedding_tables embedding_tables[tf_examples_key_] = input_examples logging.info('embedding_tables: %s', embedding_tables) # Combine embeddings and tf.train.Example, using the common key. combined_tbl = ( embedding_tables | f'CombineEmbeddingTables-{s}' >> beam.CoGroupByKey() | f'AddEmbeddings-{s}' >> beam.Map( _add_embedding_column_map_fn, original_example_key=tf_examples_key_, delete_audio_from_output=delete_audio_from_output, audio_key=audio_key, label_key=label_key, speaker_id_key=speaker_id_key)) output_filename = f'{output_filename}@*' logging.info('Writing to %s', output_filename) writer_functions[output_format](combined_tbl, output_filename, s)
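# A hedged usage sketch of make_beam_pipeline above. The file patterns,
# TF-Hub module path, and feature keys below are illustrative placeholders,
# not values taken from the original code.
import apache_beam as beam

with beam.Pipeline() as root:
    make_beam_pipeline(
        root,
        input_filenames=['/tmp/audio_examples.tfrecord'],
        sample_rate=16000,
        debug=False,
        embedding_names=['trill'],
        embedding_modules=['https://tfhub.dev/google/nonsemantic-speech-benchmark/trill/3'],
        module_output_keys=['embedding'],
        audio_key='audio',
        sample_rate_key=None,
        label_key='label',
        speaker_id_key=None,
        average_over_time=True,
        delete_audio_from_output=True,
        output_filename='/tmp/embeddings.tfrecord')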
def test_progress_metrics(self): p = self.create_pipeline() if not isinstance(p.runner, fn_api_runner.FnApiRunner): # This test is inherited by others that may not support the same # internal way of accessing progress metrics. self.skipTest('Progress metrics not supported.') return _ = (p | beam.Create([0, 0, 0, 5e-3 * DEFAULT_SAMPLING_PERIOD_MS]) | beam.Map(time.sleep) | beam.Map(lambda x: ('key', x)) | beam.GroupByKey() | 'm_out' >> beam.FlatMap(lambda x: [ 1, 2, 3, 4, 5, beam.pvalue.TaggedOutput('once', x), beam.pvalue.TaggedOutput('twice', x), beam.pvalue.TaggedOutput('twice', x) ])) res = p.run() res.wait_until_finish() def has_mi_for_ptransform(monitoring_infos, ptransform): for mi in monitoring_infos: if ptransform in mi.labels['PTRANSFORM']: return True return False try: # TODO(ajamato): Delete this block after deleting the legacy metrics code. # Test the DEPRECATED legacy metrics pregbk_metrics, postgbk_metrics = list( res._metrics_by_stage.values()) if 'Create/Read' not in pregbk_metrics.ptransforms: # The metrics above are actually unordered. Swap. pregbk_metrics, postgbk_metrics = postgbk_metrics, pregbk_metrics self.assertEqual( 4, pregbk_metrics.ptransforms['Create/Read']. processed_elements.measured.output_element_counts['out']) self.assertEqual( 4, pregbk_metrics.ptransforms['Map(sleep)'].processed_elements. measured.output_element_counts['None']) self.assertLessEqual( 4e-3 * DEFAULT_SAMPLING_PERIOD_MS, pregbk_metrics.ptransforms['Map(sleep)'].processed_elements. measured.total_time_spent) self.assertEqual( 1, postgbk_metrics.ptransforms['GroupByKey/Read']. processed_elements.measured.output_element_counts['None']) # The actual stage name ends up being something like 'm_out/lamdbda...' m_out, = [ metrics for name, metrics in list(postgbk_metrics.ptransforms.items()) if name.startswith('m_out') ] self.assertEqual( 5, m_out.processed_elements.measured. output_element_counts['None']) self.assertEqual( 1, m_out.processed_elements.measured. output_element_counts['once']) self.assertEqual( 2, m_out.processed_elements.measured. output_element_counts['twice']) # Test the new MonitoringInfo monitoring format. self.assertEqual(2, len(res._monitoring_infos_by_stage)) pregbk_mis, postgbk_mis = list( res._monitoring_infos_by_stage.values()) if not has_mi_for_ptransform(pregbk_mis, 'Create/Read'): # The monitoring infos above are actually unordered. Swap. pregbk_mis, postgbk_mis = postgbk_mis, pregbk_mis def assert_has_monitoring_info(monitoring_infos, urn, labels, value=None, ge_value=None): def contains_labels(monitoring_info, labels): return len([ x for x in labels.items() if x[0] in monitoring_info.labels and monitoring_info.labels[x[0]] == x[1] ]) == len(labels) # TODO(ajamato): Consider adding a matcher framework found = 0 for mi in monitoring_infos: if contains_labels(mi, labels) and mi.urn == urn: if (ge_value is not None and mi.metric.counter_data.int64_value >= ge_value): found = found + 1 elif (value is not None and mi.metric.counter_data.int64_value == value): found = found + 1 ge_value_str = {'ge_value': ge_value} if ge_value else '' value_str = {'value': value} if value else '' self.assertEqual( 1, found, "Found (%s) Expected only 1 monitoring_info for %s." 
% ( found, (urn, labels, value_str, ge_value_str), )) # pregbk monitoring infos labels = {'PCOLLECTION': 'ref_PCollection_PCollection_1'} assert_has_monitoring_info(pregbk_mis, monitoring_infos.ELEMENT_COUNT_URN, labels, value=4) labels = {'PCOLLECTION': 'ref_PCollection_PCollection_2'} assert_has_monitoring_info(pregbk_mis, monitoring_infos.ELEMENT_COUNT_URN, labels, value=4) labels = {'PTRANSFORM': 'Map(sleep)'} assert_has_monitoring_info(pregbk_mis, monitoring_infos.TOTAL_MSECS_URN, labels, ge_value=4 * DEFAULT_SAMPLING_PERIOD_MS) # postgbk monitoring infos labels = {'PCOLLECTION': 'ref_PCollection_PCollection_6'} assert_has_monitoring_info(postgbk_mis, monitoring_infos.ELEMENT_COUNT_URN, labels, value=1) labels = {'PCOLLECTION': 'ref_PCollection_PCollection_7'} assert_has_monitoring_info(postgbk_mis, monitoring_infos.ELEMENT_COUNT_URN, labels, value=5) except: print(res._monitoring_infos_by_stage) raise
    publicdata.samples.natality
  WHERE year > 2000
    AND weight_pounds > 0
    AND mother_age > 0
    AND plurality > 0
    AND gestation_weeks > 0
    AND month > 0
  """
  if in_test_mode:
    query = query + ' LIMIT 100'

  for step in ['train', 'eval']:
    if step == 'train':
      selquery = 'SELECT * FROM ({}) WHERE MOD(ABS(hashmonth),4) < 3'.format(query)
    else:
      selquery = 'SELECT * FROM ({}) WHERE MOD(ABS(hashmonth),4) = 3'.format(query)
    (p
     | '{}_read'.format(step) >> beam.io.Read(
         beam.io.BigQuerySource(query=selquery, use_standard_sql=True))
     | '{}_csv'.format(step) >> beam.FlatMap(to_csv)
     | '{}_out'.format(step) >> beam.io.Write(
         beam.io.WriteToText(os.path.join(OUTPUT_DIR, '{}.csv'.format(step)))))

  job = p.run()
  if in_test_mode:
    job.wait_until_finish()
    print("Done!")


preprocess(in_test_mode=False)
def _load_data(self, partitions_using_temp_tables, partitions_direct_to_destination, load_job_name_pcv, copy_job_name_pcv, singleton_pc): """Load data to BigQuery Data is loaded into BigQuery in the following two ways: 1. Single partition: When there is a single partition of files destined to a single destination, a single load job is triggered. 2. Multiple partitions and/or Dynamic Destinations: When there are multiple partitions of files destined for a single destination or when Dynamic Destinations are used, multiple load jobs need to be triggered for each partition/destination. Load Jobs are triggered to temporary tables, and those are later copied to the actual appropriate destination table. This ensures atomicity when only some of the load jobs would fail but not other. If any of them fails, then copy jobs are not triggered. """ # Load data using temp tables trigger_loads_outputs = ( partitions_using_temp_tables | "TriggerLoadJobsWithTempTables" >> beam.ParDo( TriggerLoadJobs( schema=self.schema, write_disposition=self.write_disposition, create_disposition=self.create_disposition, test_client=self.test_client, temporary_tables=True, additional_bq_parameters=self.additional_bq_parameters, source_format=self._temp_file_format), load_job_name_pcv, * self.schema_side_inputs).with_outputs( TriggerLoadJobs.TEMP_TABLES, main='main')) temp_tables_load_job_ids_pc = trigger_loads_outputs['main'] temp_tables_pc = trigger_loads_outputs[TriggerLoadJobs.TEMP_TABLES] destination_copy_job_ids_pc = ( singleton_pc | "WaitForTempTableLoadJobs" >> beam.ParDo( WaitForBQJobs(self.test_client), beam.pvalue.AsList(temp_tables_load_job_ids_pc)) | beam.ParDo( TriggerCopyJobs(create_disposition=self.create_disposition, write_disposition=self.write_disposition, test_client=self.test_client), copy_job_name_pcv)) finished_copy_jobs_pc = ( singleton_pc | "WaitForCopyJobs" >> beam.ParDo( WaitForBQJobs(self.test_client), beam.pvalue.AsList(destination_copy_job_ids_pc))) _ = ( finished_copy_jobs_pc | "RemoveTempTables/PassTables" >> beam.FlatMap( lambda x, deleting_tables: deleting_tables, pvalue.AsIter(temp_tables_pc)) | "RemoveTempTables/AddUselessValue" >> beam.Map(lambda x: (x, None)) | "RemoveTempTables/DeduplicateTables" >> beam.GroupByKey() | "RemoveTempTables/GetTableNames" >> beam.Map(lambda elm: elm[0]) | "RemoveTempTables/Delete" >> beam.ParDo( DeleteTablesFn(self.test_client))) # Load data directly to destination table destination_load_job_ids_pc = ( partitions_direct_to_destination | "TriggerLoadJobsWithoutTempTables" >> beam.ParDo( TriggerLoadJobs( schema=self.schema, write_disposition=self.write_disposition, create_disposition=self.create_disposition, test_client=self.test_client, temporary_tables=False, additional_bq_parameters=self.additional_bq_parameters, source_format=self._temp_file_format), load_job_name_pcv, * self.schema_side_inputs)) _ = (singleton_pc | "WaitForDestinationLoadJobs" >> beam.ParDo( WaitForBQJobs(self.test_client), beam.pvalue.AsList(destination_load_job_ids_pc))) destination_load_job_ids_pc = ( (temp_tables_load_job_ids_pc, destination_load_job_ids_pc) | beam.Flatten()) return destination_load_job_ids_pc, destination_copy_job_ids_pc
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

p = beam.Pipeline(options=PipelineOptions())


class ComputeWordLengthFn(beam.DoFn):
    def process(self, element):
        element = element.split(' ')
        element = [(len(i), i) for i in element if len(i)]
        return element


def find_rangeFn(element, lower, upper):
    if lower <= element[0] <= upper:
        return [element]
    # Explicitly return an empty list when the length is outside the range so
    # the FlatMap emits nothing for that element.
    return []


(p
 | "Read the file" >> beam.io.ReadFromText('/mybeam/beam_programs/create_pipeline.py')
 | "length of line" >> beam.ParDo(ComputeWordLengthFn())
 | "find word between range" >> beam.FlatMap(find_rangeFn, 10, 100)
 | "Write Output" >> beam.io.WriteToText('mybeam/sideinputs.txt'))

p.run().wait_until_finish()
def make_beam_pipeline(root, input_filenames, sample_rate, debug, embedding_names, embedding_modules, module_output_keys, audio_key, sample_rate_key, label_key, speaker_id_key, average_over_time, delete_audio_from_output, output_filename, split_embeddings_into_separate_tables=False, use_frontend_fn=False, input_format='tfrecord', output_format='tfrecord', suffix='Main'): """Construct beam pipeline for mapping from audio to embeddings. Args: root: The beam root node. input_filenames: Python list. List of input files. sample_rate: Python int, or `None`. The sample rate for all embeddings, or `None` if this is a TFDS dataset, or if each example has its own sample rate. debug: Python bool. Whether to operate in debug mode. embedding_names: Python list of embeddings. embedding_modules: Python list of TF-Hub modules. module_output_keys: Python list of strings, names of output modules. audio_key: Python string, the key of the audio. sample_rate_key: Python string or `None`, the key for. label_key: Python string. Field for label. speaker_id_key: Python string or `None`. Key for speaker ID, or `None`. average_over_time: Python bool. If `True`, average over the time axis. delete_audio_from_output: Python bool. Whether to remove audio fromm outputs. output_filename: Python string. Output filename. split_embeddings_into_separate_tables: Python bool. If true, write each embedding to a separate table. use_frontend_fn: If `true`, call frontend fn on audio before passing to the model. input_format: Python string. Must correspond to a function in `reader_functions`. output_format: Python string. Must correspond to a function `writer_functions`. suffix: Python string. Suffix to stage names to make them unique. """ tf_examples_key_ = 'tf_examples' assert tf_examples_key_ not in embedding_names s = suffix # for code brevity. # Read from input. input_examples = reader_functions[input_format](root, input_filenames, s) # In debug mode, take one input example. if debug: input_examples = ( input_examples | f'TakeOne{s}' >> beam.transforms.combiners.Sample.FixedSizeGlobally(1) # Sampling generates lists, so flatten back into one collection. | f'DebugFlatten{s}' >> beam.FlatMap(lambda x: x)) # Compute all the embeddings simultaneously. embedding_tables = {} for name, mod, out_key in zip(embedding_names, embedding_modules, module_output_keys): logging.info('Adding signal: %s %s, %s', name, mod, out_key) tbl = input_examples | f'ComputeEmbedding-{name}-{s}' >> beam.ParDo( ComputeEmbeddingMapFn( name=name, module=mod, output_key=out_key, audio_key=audio_key, sample_rate_key=sample_rate_key, sample_rate=sample_rate, average_over_time=average_over_time, feature_fn=_default_feature_fn if use_frontend_fn else None)) embedding_tables[name] = tbl assert tf_examples_key_ not in embedding_tables embedding_tables[tf_examples_key_] = input_examples logging.info('embedding_tables: %s', embedding_tables) # Either write to one table with all embeddings, or one table per embedding. if split_embeddings_into_separate_tables: output_table_dicts = [(k, { k: v, tf_examples_key_: input_examples }) for k, v in embedding_tables.items() if k != tf_examples_key_] else: output_table_dicts = [('all', embedding_tables)] # Combine embeddings and tf.train.Example, using the common key. writer_function = writer_functions[output_format] for name, embedding_tables in output_table_dicts: if split_embeddings_into_separate_tables: cur_s = f'{name}-{s}' # Add `name` as a subdir. 
dirname, basename = os.path.split(output_filename) cur_output_filename = os.path.join(dirname, name, f'{basename}@*') else: cur_s = s cur_output_filename = f'{output_filename}@*' combined_tbl = ( embedding_tables | f'CombineEmbeddingTables-{cur_s}' >> beam.CoGroupByKey() | f'AddEmbeddings-{cur_s}' >> beam.Map( _add_embedding_column_map_fn, original_example_key=tf_examples_key_, delete_audio_from_output=delete_audio_from_output, audio_key=audio_key, label_key=label_key, speaker_id_key=speaker_id_key)) logging.info('Writing to %s', cur_output_filename) writer_function(combined_tbl, cur_output_filename, cur_s)
def ComputeQueryBasedMetrics( # pylint: disable=invalid-name extracts: beam.pvalue.PCollection, prediction_key: str, query_id: str, combine_fns: List[beam.CombineFn], ) -> beam.pvalue.PCollection: """Computes metrics and plots using the EvalSavedModel. Args: extracts: PCollection of Extracts. The extracts MUST contain a FeaturesPredictionsLabels extract keyed by tfma.FEATURE_PREDICTIONS_LABELS_KEY and a list of SliceKeyType extracts keyed by tfma.SLICE_KEY_TYPES_KEY. Typically these will be added by calling the default_extractors function. prediction_key: Key in predictions dictionary to use as the prediction (for sorting examples within the query). Use the empty string if the Estimator returns a predictions Tensor (not a dictionary). query_id: Key of query ID column in the features dictionary. combine_fns: List of query based metrics combine functions. Returns: PCollection of (slice key, query-based metrics). """ missing_query_id_counter = beam.metrics.Metrics.counter( constants.METRICS_NAMESPACE, 'missing_query_id') def key_by_query_id(extract: types.Extracts, query_id: str) -> Iterator[Tuple[str, types.Extracts]]: """Extract the query ID from the extract and key by that.""" features = extract[constants.FEATURES_PREDICTIONS_LABELS_KEY].features if query_id not in features: missing_query_id_counter.inc() return feature_value = features[query_id][encoding.NODE_SUFFIX] if isinstance(feature_value, tf.compat.v1.SparseTensorValue): feature_value = feature_value.values if feature_value.size != 1: raise ValueError( 'Query ID feature "%s" should have exactly 1 value, but ' 'found %d instead. Values were: %s' % (query_id, feature_value.size(), feature_value)) yield ('{}'.format(np.asscalar(feature_value)), extract) def merge_dictionaries( dictionaries: Tuple[Dict[str, Any], ...]) -> Dict[str, Any]: """Merge dictionaries in a tuple into a single dictionary.""" result = dict() for d in dictionaries: intersection = set(d.keys()) & set(result.keys()) if intersection: raise ValueError( 'Overlapping keys found when merging dictionaries. ' 'Intersection was: %s. Keys up to this point: %s ' 'keys from next dictionary: %s' % (intersection, result.keys(), d.keys())) result.update(d) return result # pylint: disable=no-value-for-parameter return ( extracts | 'KeyByQueryId' >> beam.FlatMap(key_by_query_id, query_id) | 'CreateQueryExamples' >> beam.CombinePerKey( CreateQueryExamples(prediction_key=prediction_key)) | 'DropQueryId' >> beam.Map(lambda kv: kv[1]._replace(query_id=kv[0])) | 'CombineGlobally' >> beam.CombineGlobally( beam.combiners.SingleInputTupleCombineFn(*combine_fns)) | 'MergeDictionaries' >> beam.Map(merge_dictionaries) | 'AddOverallSliceKey' >> beam.Map(lambda v: ((), v)))
# pipeline3.py: Read data from a file and write the results to another file
import apache_beam as beam
from apache_beam.io import WriteToText, ReadFromText

with beam.Pipeline() as pipeline:
    lines = pipeline | ReadFromText('sample1.txt')

    subjects = (lines | 'Subjects' >> beam.FlatMap(str.split))

    subjects | WriteToText(file_path_prefix='subjects',
                           file_name_suffix='.txt',
                           shard_name_template='')
""" This dataflow program will find all the winary exists in california on the basis of provided data """ import apache_beam as beam import sys def find_wineries(line, searchText): if searchText in line: yield line if __name__ == "__main__": p = beam.Pipeline(argv=sys.argv) input = '../data/spikey_winery_list.csv' output = '../output/calWineries' searchText = 'California' (p | 'ReadData' >> beam.io.ReadFromText(input) | 'GrepSearchText' >> beam.FlatMap(lambda line: find_wineries(line, searchText)) | 'WriteOutput' >> beam.io.WriteToText(output)) p.run().wait_until_finish()
def expand(self, dataset_and_transform_fn): """Transforms the dataset using the transform_fn. Args: dataset_and_transform_fn: A tuple of dataset and preprocessing function. Returns: A dataset transformed according to the transform_fn. """ (input_values, input_metadata), (transform_fn, output_metadata) = ( dataset_and_transform_fn) # If exclude_outputs is set, update the output metadata. if self._exclude_outputs is not None: if isinstance(output_metadata, beam_metadata_io.BeamDatasetMetadata): # Unwrap BeamDatasetMetadata into DatasetMetadata and pcollections dict. output_metadata, pcollections = output_metadata schema = output_metadata.schema # Update DatasetMetadata to remove excluded outputs output_metadata = dataset_metadata.DatasetMetadata( schema=dataset_schema.Schema({ key: column_schema for key, column_schema in six.iteritems(schema.column_schemas) if key not in self._exclude_outputs })) # Update pcollections to keep only pcollections that resolve futures in # the updated metadata. unresolved_future_names = set( future.name for future in output_metadata.substitute_futures({})) pcollections = { name: pcollection for name, pcollection in six.iteritems(pcollections) if name in unresolved_future_names } # Wrap DatasetMetadata and pcollections as BeamDatasetMetadata output_metadata = beam_metadata_io.BeamDatasetMetadata( output_metadata, pcollections) else: schema = output_metadata.schema output_metadata = dataset_metadata.DatasetMetadata( schema=dataset_schema.Schema({ key: column_schema for key, column_schema in six.iteritems(schema.column_schemas) if key not in self._exclude_outputs })) def convert_and_unbatch(batch_dict): return impl_helper.to_instance_dicts(output_metadata.schema, batch_dict) serialized_tf_config = ( analyzer_impls._DEFAULT_TENSORFLOW_CONFIG_BY_RUNNER.get( # pylint: disable=protected-access self.pipeline.runner)) output_instances = ( input_values | 'Batch' >> _BatchElements() | 'Transform' >> beam.ParDo( _RunMetaGraphDoFn( input_metadata.schema, serialized_tf_config, shared_graph_state_handle=shared.Shared(), exclude_outputs=self._exclude_outputs), saved_model_dir=beam.pvalue.AsSingleton(transform_fn)) | 'ConvertAndUnbatch' >> beam.FlatMap(convert_and_unbatch)) _clear_shared_state_after_barrier(self.pipeline, output_instances) return (output_instances, output_metadata)
def prepare_tfrecord(input_audio_paths, output_tfrecord_path, num_shards=None, sample_rate=16000, frame_rate=250, window_secs=4, hop_secs=1, eval_split_fraction=0.0, coarse_chunk_secs=20.0, pipeline_options=''): """Prepares a TFRecord for use in training, evaluation, and prediction. Args: input_audio_paths: An iterable of paths to audio files to include in TFRecord. output_tfrecord_path: The prefix path to the output TFRecord. Shard numbers will be added to actual path(s). num_shards: The number of shards to use for the TFRecord. If None, this number will be determined automatically. sample_rate: The sample rate to use for the audio. frame_rate: The frame rate to use for f0 and loudness features. If set to None, these features will not be computed. window_secs: The size of the sliding window (in seconds) to use to split the audio and features. If 0, they will not be split. hop_secs: The number of seconds to hop when computing the sliding windows. eval_split_fraction: Fraction of the dataset to reserve for eval split. If set to 0, no eval split is created. coarse_chunk_secs: Chunk size in seconds used to split the input audio files. This is used to split large audio files into manageable chunks for better parallelization and to enable non-overlapping train/eval splits. pipeline_options: An iterable of command line arguments to be used as options for the Beam Pipeline. """ pipeline_options = beam.options.pipeline_options.PipelineOptions( pipeline_options) with beam.Pipeline(options=pipeline_options) as pipeline: examples = ( pipeline | beam.Create(input_audio_paths) | beam.Map(_load_audio, sample_rate)) if frame_rate: examples = ( examples | beam.Map(_add_f0_estimate, sample_rate, frame_rate) | beam.Map(add_loudness, sample_rate, frame_rate)) if coarse_chunk_secs: examples |= beam.FlatMap(split_example, sample_rate, frame_rate, coarse_chunk_secs, coarse_chunk_secs) def postprocess_pipeline(examples, output_path, stage_name=''): if stage_name: stage_name = f'_{stage_name}' if window_secs: examples |= f'create_batches{stage_name}' >> beam.FlatMap( split_example, sample_rate, frame_rate, window_secs, hop_secs) _ = ( examples | f'reshuffle{stage_name}' >> beam.Reshuffle() | f'make_tfexample{stage_name}' >> beam.Map(float_dict_to_tfexample) | f'write{stage_name}' >> beam.io.tfrecordio.WriteToTFRecord( output_path, num_shards=num_shards, coder=beam.coders.ProtoCoder(tf.train.Example))) if eval_split_fraction: examples |= beam.Map(add_key) keys = examples | beam.Keys() splits = examples | beam.Partition(eval_split_partition_fn, 2, eval_split_fraction, beam.pvalue.AsList(keys)) # Remove ids. eval_split = splits[0] | 'remove_id_eval' >> beam.Map(lambda x: x[1]) train_split = splits[1] | 'remove_id_train' >> beam.Map(lambda x: x[1]) postprocess_pipeline(eval_split, f'{output_tfrecord_path}-eval', 'eval') postprocess_pipeline(train_split, f'{output_tfrecord_path}-train', 'train') else: postprocess_pipeline(examples, output_tfrecord_path)
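# A hedged usage sketch of prepare_tfrecord above; the paths are illustrative
# placeholders and the remaining arguments fall back to the defaults
# documented in the docstring.
prepare_tfrecord(
    input_audio_paths=['/tmp/audio/take_1.wav', '/tmp/audio/take_2.wav'],
    output_tfrecord_path='/tmp/ddsp_dataset/train.tfrecord',
    num_shards=10,
    sample_rate=16000,
    frame_rate=250,
    window_secs=4,
    hop_secs=1)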
        list_words_clean = list(map(cleanpunc, list_words))
        list_words_lower = list(
            map(lambda x: x.lower().strip(), list_words_clean))
        list_word_key = list(map(lambda x: (x, 1), list_words_lower))
        return list_word_key
    except:
        pass
    # try:
    #     return element.split()
    # except:
    #     pass


def cleanpunc(sentence):
    # function to clean the word of any punctuation or special characters
    cleaned = re.sub(r'[?|%|!|+|*|@|&|^|`|~|\'|"|#|=]', r'', sentence)
    cleaned = re.sub(r'[:|;|.|)|(|,|\|/|_|-]', r'', cleaned)
    # cleaned = re.sub(r'\s+', r' ', cleaned)
    return cleaned


p1 = beam.Pipeline()

Word_Count = (p1
              | beam.io.ReadFromText(input_file)
              | beam.FlatMap(split)
              | beam.CombinePerKey(sum)
              | beam.io.WriteToText(output_file))

p1.run()
def _build_pcollection(self, pipeline, filepaths, language): def _extract_content(filepath): # Extracts article content from a single WikiMedia XML file. context = etree.iterparse(filepath, events=("end", ), encoding="utf-8") context = iter(context) # To clear root, to free-up more memory than just `elem.clear()`. _, root = next(context) for _, elem in context: if not elem.tag.endswith("page"): continue namespace = elem.tag[:-4] title = elem.find("./{0}title".format(namespace)).text ns = elem.find("./{0}ns".format(namespace)).text id_ = elem.find("./{0}id".format(namespace)).text # Filter pages that are not in the "main" namespace. if ns != "0": root.clear() continue raw_content = elem.find( "./{0}revision/{0}text".format(namespace)).text root.clear() # Filter redirects. if raw_content is None or raw_content.lower().startswith( "#redirect"): beam.metrics.Metrics.counter(language, "filtered-redirects").inc() continue beam.metrics.Metrics.counter(language, "extracted-examples").inc() yield (id_, title, raw_content) def _clean_content(inputs): id_, title, raw_content = inputs try: text = _parse_and_clean_wikicode(raw_content) except (mwparserfromhell.parser.ParserError) as e: beam.metrics.Metrics.counter(language, "parser-error").inc() return if not text: beam.metrics.Metrics.counter(language, "empty-clean-examples").inc() return beam.metrics.Metrics.counter(language, "cleaned-examples").inc() yield id_, {"title": title, "text": text} feedback("Creating pipeline: extract => shuffle => parse/clean...") return (pipeline | beam.Create(filepaths) | beam.FlatMap(_extract_content) | beam.transforms.Reshuffle() | beam.FlatMap(_clean_content))
import re

import apache_beam as beam

# Run the pipeline locally with the direct runner.
pipeline = beam.Pipeline('DirectRunner')

(pipeline
 | 'read file' >> beam.io.ReadFromText('gs://dataflow-samples/shakespeare/kinglear.txt')
 | 'get words' >> beam.FlatMap(lambda x: re.findall(r'\w+', x)).with_output_types(str)
 | 'count words' >> beam.combiners.Count.PerElement()
 | 'save' >> beam.io.WriteToText('./wordcount_output'))

pipeline.run()
def expand(self, pcolls): scalar_inputs = [ expr for expr in self.stage.inputs if is_scalar(expr) ] tabular_inputs = [ expr for expr in self.stage.inputs if not is_scalar(expr) ] if len(tabular_inputs) == 0: partitioned_pcoll = next( pcolls.values()).pipeline | beam.Create([{}]) elif self.stage.partitioning != partitionings.Nothing(): # Partitioning required for these operations. # Compute the number of partitions to use for the inputs based on # the estimated size of the inputs. if self.stage.partitioning == partitionings.Singleton(): # Always a single partition, don't waste time computing sizes. num_partitions = 1 else: # Estimate the sizes from the outputs of a *previous* stage such # that using these estimates will not cause a fusion break. input_sizes = [ estimate_size(input, same_stage_ok=False) for input in tabular_inputs ] if None in input_sizes: # We were unable to (cheaply) compute the size of one or more # inputs. num_partitions = DEFAULT_PARTITIONS else: num_partitions = beam.pvalue.AsSingleton( input_sizes | 'FlattenSizes' >> beam.Flatten() | 'SumSizes' >> beam.CombineGlobally(sum) | 'NumPartitions' >> beam.Map(lambda size: max( MIN_PARTITIONS, min(MAX_PARTITIONS, size // TARGET_PARTITION_SIZE)))) partition_fn = self.stage.partitioning.partition_fn class Partition(beam.PTransform): def expand(self, pcoll): return ( pcoll # Attempt to create batches of reasonable size. | beam.ParDo(_PreBatch()) # Actually partition. | beam.FlatMap(partition_fn, num_partitions) # Don't bother shuffling empty partitions. | beam.Filter(lambda k_df: len(k_df[1]))) # Arrange such that partitioned_pcoll is properly partitioned. main_pcolls = { expr._id: pcolls[expr._id] | 'Partition_%s_%s' % (self.stage.partitioning, expr._id) >> Partition() for expr in tabular_inputs } | beam.CoGroupByKey() partitioned_pcoll = main_pcolls | beam.ParDo(_ReBatch()) else: # Already partitioned, or no partitioning needed. assert len(tabular_inputs) == 1 tag = tabular_inputs[0]._id partitioned_pcoll = pcolls[tag] | beam.Map( lambda df: {tag: df}) side_pcolls = { expr._id: beam.pvalue.AsSingleton(pcolls[expr._id]) for expr in scalar_inputs } # Actually evaluate the expressions. def evaluate(partition, stage=self.stage, **side_inputs): def lookup(expr): # Use proxy if there's no data in this partition return expr.proxy().iloc[:0] if partition[ expr._id] is None else partition[expr._id] session = expressions.Session( dict([(expr, lookup(expr)) for expr in tabular_inputs] + [(expr, side_inputs[expr._id]) for expr in scalar_inputs])) for expr in stage.outputs: yield beam.pvalue.TaggedOutput( expr._id, expr.evaluate_at(session)) return partitioned_pcoll | beam.FlatMap( evaluate, **side_pcolls).with_outputs()
import apache_beam as beam
from apache_beam.io import ReadFromText
from apache_beam.io.gcp.internal.clients import bigquery
from apache_beam.options.pipeline_options import PipelineOptions

with beam.Pipeline(options=PipelineOptions()) as p:
    table_schema = {'fields': [{'name': 'key', 'type': 'STRING', 'mode': 'NULLABLE'},
                               {'name': 'value', 'type': 'INTEGER', 'mode': 'NULLABLE'}]}

    table_spec = bigquery.TableReference(
        projectId='XXXXX',
        datasetId='XXXXx',
        tableId='word_cnt')

    def sum_val(tup):
        (key, val) = tup
        return {'key': key, 'value': sum(val)}  # '%s - %d' % (key, sum(val))

    out = (
        p
        | "read from txt" >> ReadFromText(r"F:\codebase\Dataengineering_stuff\Dataflow\dee.txt.txt")
        | beam.FlatMap(lambda x: x.split(' '))
        | beam.Map(lambda x: (x, 1))
        | beam.GroupByKey()
        | beam.Map(sum_val)
        # | WriteToText(r"F:\codebase\Dataengineering_stuff\Dataflow\dee1.txt")
    )

    out | beam.io.WriteToBigQuery(
        table_spec,
        schema=table_schema,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
        # create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED
    )
def run(argv=None): """Runs the workflow.""" parser = argparse.ArgumentParser() parser.add_argument('--input', required=True, help='Input file to process.') parser.add_argument('--output', required=True, help='Output BigQuery table: PROJECT:DATASET.TABLE') known_args, pipeline_args = parser.parse_known_args(argv) schema = bigquery.TableSchema() schema.fields.append(field('Alexa_rank', 'integer')) schema.fields.append(field('Alexa_domain')) schema.fields.append(field('DMOZ_title')) schema.fields.append(field('DMOZ_description')) schema.fields.append(field('DMOZ_url')) schema.fields.append(field('DMOZ_topic', 'string', 'repeated')) schema.fields.append(field('Host')) schema.fields.append(field('FinalLocation')) schema.fields.append(field('HTTPOk', 'boolean')) schema.fields.append(field('HTTPSOk', 'boolean')) schema.fields.append(field('HTTPSOnly', 'boolean')) schema.fields.append(build_response_schema('HTTPResponses')) schema.fields.append(build_response_schema('HTTPSResponses')) schema.fields.append(field('Error')) options = PipelineOptions(pipeline_args) # We use the save_main_session option because one or more DoFn's in this # workflow rely on global context (e.g., a module imported at module level). options.view_as(SetupOptions).save_main_session = True # https://cloud.google.com/dataflow/pipelines/specifying-exec-params gc_options = options.view_as(GoogleCloudOptions) gc_options.project = 'httparchive' gc_options.job_name = 'host-scan-import-' + str(datetime.date.today()) gc_options.staging_location = 'gs://httparchive/dataflow-binaries' gc_options.temp_location = 'gs://httparchive/dataflow-tmp' wk_options = options.view_as(WorkerOptions) wk_options.num_workers = 10 # options.view_as(StandardOptions).runner = 'DirectPipelineRunner' options.view_as(StandardOptions).runner = 'DataflowPipelineRunner' p = beam.Pipeline(options=options) (p | 'read' >> beam.Read( beam.io.TextFileSource(known_args.input, coder=JsonCoder())) | 'process' >> beam.FlatMap(process_record) # | 'local-write' >> beam.Write(beam.io.TextFileSink('./results'))) | 'bq-write' >> beam.io.Write( beam.io.BigQuerySink( known_args.output, schema=schema, create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED, write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))) p.run()
def expand(self, pc):
  return (
      pc
      | beam.Map(rotate_key)
      | beam.GroupByKey()
      | 'Ungroup' >> beam.FlatMap(lambda elm: [(elm[0], v) for v in elm[1]]))
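# A minimal sketch of the 'Ungroup' step above in isolation: GroupByKey yields
# (key, iterable-of-values) pairs, and the FlatMap re-emits one (key, value)
# pair per grouped value. rotate_key is omitted here since it is defined
# elsewhere in the original code; the input pairs are illustrative.
import apache_beam as beam
from apache_beam.testing.util import assert_that, equal_to

with beam.Pipeline() as p:
    ungrouped = (p
                 | beam.Create([('a', 1), ('a', 2), ('b', 3)])
                 | beam.GroupByKey()
                 | 'Ungroup' >> beam.FlatMap(lambda elm: [(elm[0], v) for v in elm[1]]))
    assert_that(ungrouped, equal_to([('a', 1), ('a', 2), ('b', 3)]))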
def expand(self, pcoll):
  return pcoll | 'ReadGCSNotifications' >> beam.FlatMap(self.parse_element)
def expand(self, pc):
  return (
      pc
      | beam.Map(rotate_key)
      | beam.Map(
          lambda elem, ignored: elem,
          beam.pvalue.AsIter(pc | beam.FlatMap(lambda elem: None))))
   # convert all times to UTC
   dep_airport_id = fields[6]
   arr_airport_id = fields[10]
   dep_timezone = airport_timezones[dep_airport_id][2]
   arr_timezone = airport_timezones[arr_airport_id][2]

   for f in [13, 14, 17]:  # crsdeptime, deptime, wheelsoff
      fields[f] = as_utc(fields[0], fields[f], dep_timezone)
   for f in [18, 20, 21]:  # wheelson, crsarrtime, arrtime
      fields[f] = as_utc(fields[0], fields[f], arr_timezone)

   yield ','.join(fields)

if __name__ == '__main__':
   with beam.Pipeline('DirectRunner') as pipeline:
      airports = (pipeline
         | 'airports:read' >> beam.io.ReadFromText('airports.csv.gz')
         | 'airports:fields' >> beam.Map(lambda line: next(csv.reader([line])))
         | 'airports:tz' >> beam.Map(lambda fields: (fields[0], addtimezone(fields[21], fields[26]))))

      flights = (pipeline
         | 'flights:read' >> beam.io.ReadFromText('201501_part.csv')
         | 'flights:tzcorr' >> beam.FlatMap(tz_correct, beam.pvalue.AsDict(airports)))

      flights | beam.io.textio.WriteToText('all_flights')

      pipeline.run()
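# A minimal sketch of the beam.pvalue.AsDict side-input pattern used for
# 'flights:tzcorr' above: the (key, value) PCollection is materialized as a
# plain dict and passed as an extra argument to every FlatMap call. The
# airport IDs and timezones here are illustrative.
import apache_beam as beam
from apache_beam.testing.util import assert_that, equal_to


def lookup_timezone(flight, airport_tz):
    airport, dep_time = flight
    yield (airport, dep_time, airport_tz[airport])


with beam.Pipeline() as p:
    airports = p | 'tz' >> beam.Create([('JFK', 'America/New_York'),
                                        ('SFO', 'America/Los_Angeles')])
    flights = p | 'flights' >> beam.Create([('JFK', '09:00'), ('SFO', '11:30')])
    tagged = flights | beam.FlatMap(lookup_timezone, beam.pvalue.AsDict(airports))
    assert_that(tagged, equal_to([('JFK', '09:00', 'America/New_York'),
                                  ('SFO', '11:30', 'America/Los_Angeles')]))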
def expand(self, pcoll): """Computes top-k most frequent values and number of uniques.""" # Convert input example to tuples of form # (slice_key, feature_name, feature_value_list, optional weight) # corresponding to each example. feature_values_with_weights = ( pcoll | 'TopKUniques_ConvertInputToFeatureValuesWithWeights' >> beam.FlatMap(_convert_input_to_feature_values_with_weights, categorical_features=self._categorical_features, weight_feature=self._weight_feature)) # Lambda to convert from ((slice_key, feature_name, feature_value), count) # to ((slice_key, feature_name), (feature_value, count)) modify_key = (lambda x: ((x[0][0], x[0][1]), FeatureValueCount(x[0][2], x[1]))) sliced_feature_name_value_count = ( feature_values_with_weights # Flatten (slice_key, feature_name, feature_value_list, optional weight) # to (slice_key, feature_name, feature_value) | 'TopKUniques_FlattenToSlicedFeatureNameValueTuples' >> beam.FlatMap(_flatten_value_list) # Compute the frequency of each feature_value per slice. Output is a # PCollection of ((slice_key, feature_name, feature_value), count) | 'TopKUniques_CountSlicedFeatureNameValueTuple' >> beam.combiners.Count().PerElement() # Convert from ((slice_key, feature_name, feature_value), count) to # ((slice_key, feature_name), (feature_value, count)) | 'TopKUniques_ModifyKeyToSlicedFeatureName' >> beam.Map(modify_key)) result_protos = [] # Find topk values for each feature. topk = ( sliced_feature_name_value_count # Obtain the top-k most frequent feature value for each feature in a # slice. | 'TopK_GetTopK' >> beam.combiners.Top().PerKey( max(self._num_top_values, self._num_rank_histogram_buckets), _feature_value_count_comparator) | 'TopK_ConvertToSingleFeatureStats' >> beam.Map( _make_dataset_feature_stats_proto_with_topk_for_single_feature, categorical_features=self._categorical_features, is_weighted_stats=False, num_top_values=self._num_top_values, frequency_threshold=self._frequency_threshold, num_rank_histogram_buckets=self._num_rank_histogram_buckets)) result_protos.append(topk) # If a weight feature is provided, find the weighted topk values for each # feature. if self._weight_feature is not None: weighted_topk = ( # Flatten (slice_key, feature_name, feature_value_list, weight) to # ((slice_key, feature_name, feature_value), weight) feature_values_with_weights | 'TopKWeighted_FlattenToSlicedFeatureNameValueTuples' >> beam.FlatMap(_flatten_weighted_value_list) # Sum the weights of each feature_value per slice. Output is a # PCollection of # ((slice_key, feature_name, feature_value), weighted_count) | 'TopKWeighted_CountSlicedFeatureNameValueTuple' >> beam.CombinePerKey(sum) # Convert from # ((slice_key, feature_name, feature_value), weighted_count) to # ((slice_key, feature_name), (feature_value, weighted_count)) | 'TopKWeighted_ModifyKeyToSlicedFeatureName' >> beam.Map(modify_key) # Obtain the top-k most frequent feature value for each feature in a # slice. 
| 'TopKWeighted_GetTopK' >> beam.combiners.Top().PerKey( max(self._num_top_values, self._num_rank_histogram_buckets), _feature_value_count_comparator) | 'TopKWeighted_ConvertToSingleFeatureStats' >> beam.Map( _make_dataset_feature_stats_proto_with_topk_for_single_feature, categorical_features=self._categorical_features, is_weighted_stats=True, num_top_values=self._num_top_values, frequency_threshold=self._weighted_frequency_threshold, num_rank_histogram_buckets=self._num_rank_histogram_buckets )) result_protos.append(weighted_topk) uniques = ( sliced_feature_name_value_count # Drop the values to only have the slice_key and feature_name with # each repeated the number of unique values times. | 'Uniques_DropValues' >> beam.Keys() | 'Uniques_CountPerFeatureName' >> beam.combiners.Count().PerElement() | 'Uniques_ConvertToSingleFeatureStats' >> beam.Map( _make_dataset_feature_stats_proto_with_uniques_for_single_feature, categorical_features=self._categorical_features)) result_protos.append(uniques) def _deserialize_sliced_feature_stats_proto(entry): feature_stats_proto = statistics_pb2.DatasetFeatureStatistics() feature_stats_proto.ParseFromString(entry[1]) return entry[0], feature_stats_proto return ( result_protos | 'FlattenTopKUniquesResults' >> beam.Flatten() # TODO(b/121152126): This deserialization stage is a workaround. # Remove this once it is no longer needed. | 'DeserializeTopKUniquesFeatureStatsProto' >> beam.Map(_deserialize_sliced_feature_stats_proto))