def expand(self, lifts: beam.pvalue.PCollection) -> beam.pvalue.PCollection:
    """Keeps only the top k and bottom k x values (by lift) per slice and y.

    Args:
      lifts: A PCollection of tuples of the form:
        (_SlicedFeatureKey(slice_key, x_path),
         _LiftInfo(x, y, lift, xy_count, x_count, y_count)).

    Returns:
      A PCollection of tuples of the form:
        (_SlicedFeatureKey(slice_key, x_path),
         _LiftSeries(y, y_count, [_LiftValue(x, lift, xy_count, x_count)])),
      where the lift values of each series have been limited to the top k
      and/or bottom k elements when the corresponding limit is set on this
      transform. When neither limit is set, all values are grouped.
    """

    def to_series_keyed(key, value):
        # Push y and y_count into the key so that the per-key combiners below
        # operate per slice, per x_path and per y.
        slice_key, x_path = key
        series_key = _LiftSeriesKey(
            slice_key=slice_key, x_path=x_path, y=value.y,
            y_count=value.y_count)
        lift_value = _LiftValue(
            x=value.x, lift=value.lift, xy_count=value.xy_count,
            x_count=value.x_count)
        return (series_key, lift_value)

    def pair_each(series_key, lift_values):
        # Re-expand a (key, [values]) pair into individual (key, value) pairs.
        return ((series_key, lift_value) for lift_value in lift_values)

    def to_feature_keyed(key, lift_values):
        # Move y and y_count back out of the key and into the value.
        return (_SlicedFeatureKey(key.slice_key, key.x_path),
                _LiftSeries(y=key.y, y_count=key.y_count,
                            lift_values=lift_values))

    # (_LiftSeriesKey(slice, x_path, y, y_count),
    #  _LiftValue(x, lift, xy_count, x_count))
    lifts = lifts | 'MoveYToKey' >> beam.MapTuple(to_series_keyed)

    # Order by lift first and use x to break ties.
    order_by = operator.attrgetter('lift', 'x')
    top_n = self._top_k_per_y
    bottom_n = self._bottom_k_per_y

    # Each entry is a PCollection of
    # (_LiftSeriesKey(slice, x_path, y, y_count),
    #  [_LiftValue(x, lift, xy_count, x_count)])
    selections = []
    if top_n:
        selections.append(
            lifts
            | 'TopK' >> beam.transforms.combiners.Top.PerKey(
                n=top_n, key=order_by))
    if bottom_n:
        selections.append(
            lifts
            | 'BottomK' >> beam.transforms.combiners.Top.PerKey(
                n=bottom_n, reverse=True, key=order_by))

    if len(selections) == 2:
        # Both limits are set: merge the two selections and regroup them so
        # that every key ends up with a single stream of values.
        grouped_lifts = (
            tuple(selections)
            | 'MergeTopAndBottom' >> beam.Flatten()
            | 'FlattenTopAndBottomLifts' >> beam.FlatMapTuple(pair_each)
            | 'ReGroupTopAndBottom' >> beam.GroupByKey())
    elif selections:
        grouped_lifts = selections[0]
    else:
        grouped_lifts = lifts | 'GroupByYs' >> beam.GroupByKey()

    # (_SlicedFeatureKey(slice, x_path),
    #  _LiftSeries(y, y_count, [_LiftValue(x, lift, xy_count, x_count)]))
    return grouped_lifts | 'MoveYInfoToValue' >> beam.MapTuple(to_feature_keyed)
def _load_data(self, partitions_using_temp_tables,
               partitions_direct_to_destination, load_job_name_pcv,
               singleton_pc):
    """Triggers the BigQuery load jobs for both kinds of partitions.

    Data reaches BigQuery in one of two ways:

    1. Single partition: when there is a single partition of files destined
       to a single destination, a single load job is triggered directly
       against the destination table.
    2. Multiple partitions and/or dynamic destinations: several load jobs are
       triggered, each into a temporary table. Once those load jobs have been
       waited on, copy jobs move the data to the destination tables and the
       temporary tables are deleted. Copy jobs are only triggered after the
       wait on the load jobs, so a failed load does not leave a destination
       partially written.

    Args:
      partitions_using_temp_tables: PCollection of partitions that are loaded
        through temporary tables.
      partitions_direct_to_destination: PCollection of partitions that are
        loaded straight into their destination.
      load_job_name_pcv: Side input passed to the load and copy job triggers.
      singleton_pc: PCollection used as the main input that drives the
        "wait for jobs" steps.

    Returns:
      A tuple ``(load_job_ids, copy_job_ids)`` where ``load_job_ids`` is the
      flattened PCollection of the main outputs of both load-job triggers and
      ``copy_job_ids`` is the output of the copy-job trigger.
    """

    def make_trigger_load_jobs(temporary_tables):
        # Both load paths share every setting except `temporary_tables`.
        return TriggerLoadJobs(
            schema=self.schema,
            write_disposition=self.write_disposition,
            create_disposition=self.create_disposition,
            test_client=self.test_client,
            temporary_tables=temporary_tables,
            additional_bq_parameters=self.additional_bq_parameters,
            source_format=self._temp_file_format)

    # --- Path 1: load into temporary tables, then copy. ---
    temp_load_outputs = (
        partitions_using_temp_tables
        | "TriggerLoadJobsWithTempTables" >> beam.ParDo(
            make_trigger_load_jobs(True),
            load_job_name_pcv,
            *self.schema_side_inputs).with_outputs(
                TriggerLoadJobs.TEMP_TABLES, main='main'))
    temp_tables_load_job_ids_pc = temp_load_outputs['main']
    temp_tables_pc = temp_load_outputs[TriggerLoadJobs.TEMP_TABLES]

    loads_finished_pc = (
        singleton_pc
        | "WaitForTempTableLoadJobs" >> beam.ParDo(
            WaitForBQJobs(self.test_client),
            beam.pvalue.AsList(temp_tables_load_job_ids_pc)))
    destination_copy_job_ids_pc = (
        loads_finished_pc
        | beam.ParDo(
            TriggerCopyJobs(
                create_disposition=self.create_disposition,
                write_disposition=self.write_disposition,
                test_client=self.test_client),
            load_job_name_pcv))

    finished_copy_jobs_pc = (
        singleton_pc
        | "WaitForCopyJobs" >> beam.ParDo(
            WaitForBQJobs(self.test_client),
            beam.pvalue.AsList(destination_copy_job_ids_pc)))

    # Once the copy jobs are done, emit every temp table name, de-duplicate
    # the names through a GroupByKey, and delete the tables.
    _ = (
        finished_copy_jobs_pc
        | "RemoveTempTables/PassTables" >> beam.FlatMap(
            lambda unused_elm, tables_to_delete: tables_to_delete,
            pvalue.AsIter(temp_tables_pc))
        | "RemoveTempTables/AddUselessValue" >> beam.Map(
            lambda table: (table, None))
        | "RemoveTempTables/DeduplicateTables" >> beam.GroupByKey()
        | "RemoveTempTables/GetTableNames" >> beam.Map(
            lambda keyed: keyed[0])
        | "RemoveTempTables/Delete" >> beam.ParDo(
            DeleteTablesFn(self.test_client)))

    # --- Path 2: load directly into the destination table. ---
    direct_load_job_ids_pc = (
        partitions_direct_to_destination
        | "TriggerLoadJobsWithoutTempTables" >> beam.ParDo(
            make_trigger_load_jobs(False),
            load_job_name_pcv,
            *self.schema_side_inputs))

    _ = (
        singleton_pc
        | "WaitForDestinationLoadJobs" >> beam.ParDo(
            WaitForBQJobs(self.test_client),
            beam.pvalue.AsList(direct_load_job_ids_pc)))

    all_load_job_ids_pc = (
        (temp_tables_load_job_ids_pc, direct_load_job_ids_pc)
        | beam.Flatten())

    return all_load_job_ids_pc, destination_copy_job_ids_pc
def run():
    """Builds and runs the Dataflow job that rewrites the City table.

    The job reads rows from ``kaggle_modeled.City`` in BigQuery, applies
    ``FilterDateFn``, groups the result by key, applies
    ``DedupCityRecordsFn`` and writes the outcome to the BigQuery table
    ``kaggle_modeled.City_Beam_DF`` (created if needed, truncated on write).
    Each intermediate PCollection is also dumped as text under a timestamped
    ``output/`` directory of the bucket. The call blocks until the job has
    finished.
    """
    PROJECT_ID = 'electric-spark-266716'  # change to your project id
    BUCKET = 'gs://global_surface_temperatures'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set the PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name, staging location,
    # temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'city-beam-dataflow'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    # Query selecting all elements for cleansing. The literal is assembled
    # with implicit concatenation: a backslash continuation inside the quotes
    # would leave stray characters in the SQL text.
    sql = ('SELECT dt, AverageTemperature, AverageTemperatureUncertainty, '
           'City, Country, Latitude, Longitude, major_city '
           'FROM kaggle_modeled.City as x')
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    # Read the desired table from BigQuery.
    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # Write the raw inputs to input.txt.
    query_results | 'Write input' >> WriteToText(DIR_PATH + 'input.txt')

    # Apply FilterDateFn to every row.
    formatted_date_pcoll = query_results | 'Filter Dates' >> beam.ParDo(
        FilterDateFn())

    # Write the filtered rows to filtered.txt.
    formatted_date_pcoll | 'Write filtered dates' >> WriteToText(
        DIR_PATH + 'filtered.txt')

    # Group the city records by the key emitted by FilterDateFn
    # (presumably a (dt, city) tuple -- confirm against FilterDateFn).
    grouped_city_pcoll = (
        formatted_date_pcoll | 'Group by city, dt' >> beam.GroupByKey())

    # Dump the grouped city records.
    grouped_city_pcoll | 'Write group by' >> WriteToText(
        DIR_PATH + 'grouped.txt')

    # Remove duplicate city records.
    distinct_city_pcoll = (
        grouped_city_pcoll
        | 'Delete duplicate records' >> beam.ParDo(DedupCityRecordsFn()))

    # Write the resulting PCollection to output.txt.
    distinct_city_pcoll | 'Write output' >> WriteToText(
        DIR_PATH + 'output.txt')

    # Destination table in BigQuery. The schema string must be a plain
    # comma-separated list of name:type pairs, so it is assembled without any
    # embedded continuation characters or whitespace.
    dataset_id = 'kaggle_modeled'
    table_id = 'City_Beam_DF'
    schema_id = ('dt:DATE,AverageTemperature:FLOAT,'
                 'AverageTemperatureUncertainty:FLOAT,'
                 'City:STRING,Country:STRING,Latitude:STRING,'
                 'Longitude:STRING,major_city:INTEGER')

    # Write the de-duplicated PCollection to the new BigQuery table.
    distinct_city_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

    result = p.run()
    result.wait_until_finish()
def Reshard(pcoll):  # pylint: disable=invalid-name
    """Routes every element through one GroupByKey on a constant key.

    Each element is paired with the key ``None``, all pairs are grouped, and
    the grouped values are emitted again one by one.

    Args:
      pcoll: The input PCollection.

    Returns:
      A PCollection holding the same elements as ``pcoll``.
    """
    keyed = pcoll | 'PairWithNone' >> beam.Map(lambda element: (None, element))
    grouped = keyed | 'GroupByNone' >> beam.GroupByKey()
    return grouped | 'ExtractValues' >> beam.FlatMap(
        lambda key_and_values: key_and_values[1])
def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the wordcount pipeline.

    Args:
      argv: Command-line arguments. ``--input`` and ``--output`` are consumed
        here; everything else is forwarded to ``PipelineOptions``.
      save_main_session: Value stored in ``SetupOptions.save_main_session``.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    arg_parser.add_argument(
        '--output',
        dest='output',
        required=True,
        help='Output file to write results to.')
    known_args, pipeline_args = arg_parser.parse_known_args(argv)

    # The save_main_session option is used because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module
    # level).
    pipeline_options = PipelineOptions(pipeline_args)
    setup_options = pipeline_options.view_as(SetupOptions)
    setup_options.save_main_session = save_main_session
    p = beam.Pipeline(options=pipeline_options)

    def sum_ones(keyed_ones):
        # (word, iterable of 1s) -> (word, total)
        word, ones = keyed_ones
        return (word, sum(ones))

    def to_text(keyed_total):
        # (word, total) -> 'word: total'
        word, total = keyed_total
        return '%s: %d' % (word, total)

    # Read the text file[pattern] into a PCollection.
    lines = p | 'read' >> ReadFromText(known_args.input)

    # Count the occurrences of each word.
    words = lines | 'split' >> (
        beam.ParDo(WordExtractingDoFn()).with_output_types(unicode))
    paired = words | 'pair_with_one' >> beam.Map(lambda word: (word, 1))
    grouped = paired | 'group' >> beam.GroupByKey()
    counts = grouped | 'count' >> beam.Map(sum_ones)

    # Format the counts into a PCollection of strings.
    output = counts | 'format' >> beam.Map(to_text)

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | 'write' >> WriteToText(known_args.output)

    result = p.run()
    result.wait_until_finish()

    # Do not query metrics when creating a template which doesn't run: a
    # result without `has_job` comes from the direct runner, and a falsy
    # `has_job` means only a template was created.
    if hasattr(result, 'has_job') and not result.has_job:
        return

    empty_lines = result.metrics().query(
        MetricsFilter().with_name('empty_lines'))
    if empty_lines['counters']:
        logging.info('number of empty lines: %d',
                     empty_lines['counters'][0].result)

    word_lengths = result.metrics().query(
        MetricsFilter().with_name('word_len_dist'))
    if word_lengths['distributions']:
        logging.info('average word length: %d',
                     word_lengths['distributions'][0].result.mean)
def test_wordcount(self):
    """Runs a small wordcount interactively and checks all result views.

    The counts are checked three ways: as plain values from the pipeline
    result, as a DataFrame from ``ib.collect`` with window info, and as
    reified ``WindowedValue`` objects from the pipeline result.
    """

    class WordExtractingDoFn(beam.DoFn):
        def process(self, element):
            return element.strip().split()

    p = beam.Pipeline(runner=interactive_runner.InteractiveRunner(
        direct_runner.DirectRunner()))

    # Count the occurrences of each word.
    counts = (
        p
        | beam.Create(['to be or not to be that is the question'])
        | 'split' >> beam.ParDo(WordExtractingDoFn())
        | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
        | 'group' >> beam.GroupByKey()
        | 'count' >> beam.Map(lambda wordones: (wordones[0], sum(wordones[1]))))

    # Watch the local scope for Interactive Beam so that counts will be
    # cached. No extra locals are introduced before this call on purpose.
    ib.watch(locals())

    result = p.run()
    result.wait_until_finish()

    actual = list(result.get(counts))
    expected_counts = {
        ('or', 1),
        ('that', 1),
        ('be', 2),
        ('is', 1),
        ('question', 1),
        ('to', 2),
        ('the', 1),
        ('not', 1),
    }
    self.assertSetEqual(set(actual), expected_counts)

    def on_time_pane():
        return PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0)

    # Truncate the precision to millis because the window coder uses millis
    # as units then gets upcast to micros.
    end_of_window = (GlobalWindow().max_timestamp().micros // 1000) * 1000

    df_counts = ib.collect(counts, include_window_info=True)
    expected_columns = {
        0: [pair[0] for pair in actual],
        1: [pair[1] for pair in actual],
        'event_time': [end_of_window] * len(actual),
        'windows': [[GlobalWindow()] for _ in actual],
        'pane_info': [on_time_pane() for _ in actual],
    }
    df_expected = pd.DataFrame(
        expected_columns,
        columns=[0, 1, 'event_time', 'windows', 'pane_info'])
    pd.testing.assert_frame_equal(df_expected, df_counts)

    actual_reified = result.get(counts, include_window_info=True)
    expected_reified = []
    for pair in actual:
        expected_reified.append(
            WindowedValue(pair, Timestamp(micros=end_of_window),
                          [GlobalWindow()], on_time_pane()))
    self.assertEqual(actual_reified, expected_reified)
events = (csv_formatted_data | "FormatToBigquery" >> beam.Map(format_csv_data_bq) | "WriteAllDataToBigQuery" >> beam.io.WriteToBigQuery( "samhitha-data228-project:Vaccine_Dataset.vaccine_format", write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)) (csv_formatted_data | "FormatToBigquery old" >> beam.Map(format_data_old) | "Write all data to BigQuery asd" >> beam.io.WriteToBigQuery( "samhitha-data228-project:Vaccine_Dataset.Vaccination_Data_new", write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)) grouped_by_country = ( csv_formatted_data | "CollectingCountryKey" >> beam.ParDo(CollectLocationKey()) | "GroupingByCountry" >> beam.GroupByKey() #|beam.Map(print) ) # Which country Vaccinating more people? Total_vaccinations = ( grouped_by_country | "ExtractTotalVaccinations" >> beam.Map(lambda x: { 'country': x[0], 'total_vaccines': sum(x[1]) })) Top_10_country_vaccinations = ( Total_vaccinations | 'AddKey' >> beam.Map(addKey)
# Locations of the input CSV and of the text output.
input_filename = "./data/sp500.csv"
output_filename = "./output/result.txt"

# Build the pipeline; leaving the `with` block runs it.
options = PipelineOptions()
with beam.Pipeline(options=options) as p:
    # Read the CSV (skipping its header line) and let Split() turn each line
    # into the elements to retain.
    raw_lines = p | beam.io.ReadFromText(input_filename, skip_header_lines=1)
    csv_lines = raw_lines | beam.ParDo(Split())

    # Mean of the Open values: collect, group by key, then average each group.
    open_values = csv_lines | beam.ParDo(CollectOpen())
    open_grouped = open_values | "Grouping keys Open" >> beam.GroupByKey()
    mean_open = open_grouped | "Calculating mean for Open" >> beam.CombineValues(
        beam.combiners.MeanCombineFn())

    # Mean of the Close values, computed the same way.
    close_values = csv_lines | beam.ParDo(CollectClose())
    close_grouped = close_values | "Grouping keys Close" >> beam.GroupByKey()
    mean_close = close_grouped | "Calculating mean for Close" >> beam.CombineValues(
        beam.combiners.MeanCombineFn())

    # Join both means by key and write the joined result to the output file.
    means_by_name = {'Mean Open': mean_open, 'Mean Close': mean_close}
    joined_means = means_by_name | beam.CoGroupByKey()
    output = joined_means | beam.io.WriteToText(output_filename)
def test_multiple_outputs_with_watermark_advancement(self):
    """Tests that the TestStream can independently control output watermarks."""
    # The watermark of 'numbers' is purposely set to 20 and the one of
    # 'letters' to 5 to check that watermark advancement is per PCollection.
    #
    # Two PCollections are created, (a, b, c) and (1, 2, 3), emitted at
    # different times so that they land in different windows. Watermark
    # advancement is verified through those windows: if the watermark does
    # not advance, the windows will be [-inf, -inf); if the watermarks do not
    # advance separately, both PCollections end up windowed in [15, 30).
    letters_elements = [
        TimestampedValue(value, timestamp)
        for value, timestamp in (('a', 6), ('b', 7), ('c', 8))
    ]
    numbers_elements = [
        TimestampedValue(value, timestamp)
        for value, timestamp in (('1', 21), ('2', 22), ('3', 23))
    ]

    test_stream = TestStream()
    test_stream = test_stream.advance_watermark_to(0, tag='letters')
    test_stream = test_stream.advance_watermark_to(0, tag='numbers')
    test_stream = test_stream.advance_watermark_to(20, tag='numbers')
    test_stream = test_stream.advance_watermark_to(5, tag='letters')
    test_stream = test_stream.add_elements(letters_elements, tag='letters')
    test_stream = test_stream.advance_watermark_to(10, tag='letters')
    test_stream = test_stream.add_elements(numbers_elements, tag='numbers')
    test_stream = test_stream.advance_watermark_to(30, tag='numbers')

    options = StandardOptions(streaming=True)
    options.view_as(DebugOptions).add_experiment(
        'passthrough_pcollection_output_ids')
    p = TestPipeline(options=options)

    main = p | test_stream

    def window_key_and_group(pcoll, label_prefix):
        # An AfterWatermark trigger with an early firing is used to test that
        # the watermark is advancing properly and that the elements are
        # emitted in the correct window.
        return (
            pcoll
            | (label_prefix + ' windows') >> beam.WindowInto(
                FixedWindows(15),
                trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
                accumulation_mode=trigger.AccumulationMode.DISCARDING)
            | (label_prefix + ' with key') >> beam.Map(lambda x: ('k', x))
            | (label_prefix + ' gbk') >> beam.GroupByKey())

    letters = window_key_and_group(main['letters'], 'letter')
    numbers = window_key_and_group(main['numbers'], 'number')

    # The letters were emitted when the watermark was at 5, so they are
    # expected in the [0, 15) window. The early trigger makes sure that the
    # ON_TIME empty pane is also emitted with a TestStream: that pane has no
    # data because the early trigger fires the elements before the end of the
    # window and the accumulation mode discards data once the trigger fired.
    letters_window = window.IntervalWindow(0, 15)
    expected_letters = {
        letters_window: [
            ('k', ['a', 'b', 'c']),
            ('k', []),
        ],
    }

    # Same here, except the numbers were emitted at watermark = 20, thus
    # they are in the [15, 30) window.
    numbers_window = window.IntervalWindow(15, 30)
    expected_numbers = {
        numbers_window: [
            ('k', ['1', '2', '3']),
            ('k', []),
        ],
    }

    assert_that(
        letters,
        equal_to_per_window(expected_letters),
        label='letters assert per window')
    assert_that(
        numbers,
        equal_to_per_window(expected_numbers),
        label='numbers assert per window')

    p.run()
def pipeline(root):
    """Builds the conformer merging and export Beam pipeline.

    The pipeline parses the stage1/stage2 inputs and the equivalent files,
    merges conformers that share a conformer_id, cleans and updates them,
    merges duplicate information, and writes CSV side reports (merge
    conflicts, bond lengths, SMILES mismatches, stats), the bond topology
    summary, and the complete/standard conformers as TFRecord and gzipped
    JSON. All inputs and outputs are taken from FLAGS.

    Args:
      root: the root of the pipeline.
    """
    stage1_conformers = dat_input_and_parsing_pipeline(root, 'stage1')
    stage2_conformers = dat_input_and_parsing_pipeline(root, 'stage2')

    # Conformers carrying the duplicate information.
    equiv_paths = gfile.glob(FLAGS.input_equivalent_glob)
    equiv_conformers = (
        root
        | 'CreateEquivInputs' >> beam.Create(equiv_paths)
        | 'ParseEquiv' >> beam.FlatMap(parse_equivalent_file))

    # Merge all sources by conformer_id; merge conflicts go to a tagged
    # output.
    all_sources = (stage1_conformers, stage2_conformers, equiv_conformers)
    merge_outputs = (
        all_sources
        | 'FlattenAllConformers' >> beam.Flatten()
        | 'GroupByCID' >> beam.GroupBy(lambda c: c.conformer_id)
        | 'MergeConformers' >> beam.ParDo(MergeConformersFn()).with_outputs(
            MergeConformersFn.OUTPUT_TAG_MERGE_CONFLICT, main='conformers'))
    merged = merge_outputs['conformers']
    merge_conflicts = merge_outputs[MergeConformersFn.OUTPUT_TAG_MERGE_CONFLICT]

    # Write out the merge conflicts.
    _ = (
        merge_conflicts
        | 'ConflictsCSVFormat' >> beam.Map(csv_format)
        | 'ConflictsReshuffle' >> beam.Reshuffle()
        | 'WriteConflictsCSV' >> beam.io.WriteToText(
            FLAGS.output_stem + '_conflicts',
            header=csv_format(smu_utils_lib.MERGE_CONFLICT_FIELDS),
            num_shards=1,
            file_name_suffix='.csv'))

    cleaned = merged | 'CleanUpConformers' >> beam.Map(clean_up_conformer)

    # Bond length distributions, collected into a single list element.
    bond_length_dists = (
        cleaned
        | 'ExtractBondLengths' >> beam.FlatMap(
            extract_bond_lengths,
            dist_sig_digits=_BOND_LENGTHS_SIG_DIGITS,
            unbonded_max=_BOND_LENGTHS_UNBONDED_MAX)
        | 'CountBondLengths' >> beam.combiners.Count.PerElement()
        | 'ToListBondLengths' >> beam.combiners.ToList())
    _ = (
        bond_length_dists
        | 'WriteBondLengths' >> beam.ParDo(
            write_bond_lengths,
            filename=f'{FLAGS.output_stem}_bond_lengths.csv'))

    # SMILES to id mapping needed for UpdateConformerFn.
    smiles_ids = (
        root
        | 'BTInputForSmiles' >> beam.Create([FLAGS.input_bond_topology_csv])
        | 'GenerateSmilesToID' >> beam.FlatMap(smiles_to_id))
    smiles_id_side_input = beam.pvalue.AsDict(smiles_ids)

    # Various per conformer processing; SMILES mismatches go to a tagged
    # output.
    update_outputs = (
        cleaned
        | 'UpdateConformers' >> beam.ParDo(
            UpdateConformerFn(),
            beam.pvalue.AsSingleton(bond_length_dists),
            smiles_id_side_input).with_outputs(
                UpdateConformerFn.OUTPUT_TAG_SMILES_MISMATCH,
                main='conformers'))
    updated = update_outputs['conformers']
    smiles_mismatches = update_outputs[
        UpdateConformerFn.OUTPUT_TAG_SMILES_MISMATCH]

    # Output SMILES mismatches.
    _ = (
        smiles_mismatches
        | 'ReshuffleSmilesOutput' >> beam.Reshuffle()
        | 'SmilesCSVFormat' >> beam.Map(csv_format)
        | 'WriteSmilesCSV' >> beam.io.WriteToText(
            FLAGS.output_stem + '_smiles_compare',
            header='conformer_id,compare,smiles_given,smiles_with_h,smiles_without_h',
            num_shards=1,
            file_name_suffix='.csv'))

    # Process duplicate information.
    final_conformers = (
        updated
        | 'KeyedForDuplicates' >> beam.FlatMap(
            generate_keyed_conformers_for_duplicates)
        | 'DupGroupByKey' >> beam.GroupByKey()
        | 'MergeDupInfo' >> beam.MapTuple(merge_duplicate_information))

    # Pull the stats of various sorts and write them to a file.
    _ = (
        final_conformers
        | 'ExtractStats' >> beam.FlatMap(conformer_to_stat_values)
        | 'CountStats' >> beam.combiners.Count.PerElement()
        | 'StatsCSVFormat' >> beam.MapTuple(lambda x, c: f'{x[0]},{x[1]},{c}')
        | 'WriteStatsCSV' >> beam.io.WriteToText(
            FLAGS.output_stem + '_stats',
            header='primary_key,secondary_key,count',
            num_shards=1,
            file_name_suffix='.csv'))

    # Generate the summary by bond topology.
    bare_summaries = (
        root
        | 'BondTopologyInput' >> beam.Create([FLAGS.input_bond_topology_csv])
        | 'GenerateBareBTSummaries' >> beam.FlatMap(
            bond_topology_summaries_from_csv))
    real_summaries = (
        final_conformers
        | 'GenerateBTSummaries' >> beam.FlatMap(to_keyed_bond_topology_summary))
    _ = (
        (bare_summaries, real_summaries)
        | 'FlattenAllBTSummaries' >> beam.Flatten()
        | 'FinishBTSummary' >> CombineAndWriteBondTopologySummary())

    # Make the filtered versions of the dataset.
    complete_conformers = (
        final_conformers | 'MakeComplete' >> beam.Map(make_complete_conformer))
    standard_conformers = (
        final_conformers
        | 'MakeStandard' >> beam.FlatMap(make_standard_conformer))

    # Write the complete and standard conformers as binary protobuf in
    # TFRecord.
    tfrecord_targets = (
        ('complete', complete_conformers),
        ('standard', standard_conformers),
    )
    for id_str, collection in tfrecord_targets:
        _ = (
            collection
            | ('TFRecordReshuffle_' + id_str) >> beam.Reshuffle()
            | ('WriteTFRecord_' + id_str) >> beam.io.tfrecordio.WriteToTFRecord(
                f'{FLAGS.output_stem}_{id_str}_tfrecord',
                coder=beam.coders.ProtoCoder(dataset_pb2.Conformer),
                num_shards=FLAGS.output_shards))

    # Write the complete and standard conformers as JSON.
    # Bit of a hack here: the slowest part of the whole pipeline is writing
    # out the JSON for the complete conformers, so the shard count is simply
    # hard coded to triple for them to get more parallelism.
    json_targets = (
        ('complete', complete_conformers, FLAGS.output_shards * 3),
        ('standard', standard_conformers, FLAGS.output_shards),
    )
    for id_str, collection, num_shards in json_targets:
        _ = (
            collection
            | ('JSONReshuffle_' + id_str) >> beam.Reshuffle()
            | ('ToJSON_' + id_str) >> beam.Map(conformer_to_json)
            | ('WriteJSON_' + id_str) >> beam.io.WriteToText(
                f'{FLAGS.output_stem}_{id_str}_json',
                compression_type='gzip',
                num_shards=num_shards,
                file_name_suffix='.json.gz'))
def Shuffle(examples):  # pylint: disable=invalid-name
    """Shuffles a PCollection by grouping its elements on random keys.

    Every element is paired with a fresh ``random.random()`` key, the pairs
    are grouped by that key, and the keys are dropped again.

    Args:
      examples: The input PCollection.

    Returns:
      A PCollection holding the same elements as ``examples``.
    """
    return (
        examples
        | 'PairWithRandom' >> beam.Map(lambda x: (random.random(), x))
        | 'GroupByRandom' >> beam.GroupByKey()
        # Tuple parameters in a lambda signature (`lambda (k, vs): ...`) are
        # a syntax error on Python 3, so the grouped values are taken by
        # index instead.
        | 'DropRandom' >> beam.FlatMap(lambda key_and_values: key_and_values[1]))
def test_progress_metrics(self):
    """Checks legacy and MonitoringInfo progress metrics around a GroupByKey.

    The pipeline below is deliberately left unlabeled where the original was:
    the assertions rely on the automatically generated transform names and
    PCollection ids.
    """
    p = self.create_pipeline()
    if not isinstance(p.runner, fn_api_runner.FnApiRunner):
        # This test is inherited by others that may not support the same
        # internal way of accessing progress metrics.
        self.skipTest('Progress metrics not supported.')
        return

    _ = (p
         | beam.Create([0, 0, 0, 5e-3 * DEFAULT_SAMPLING_PERIOD_MS])
         | beam.Map(time.sleep)
         | beam.Map(lambda x: ('key', x))
         | beam.GroupByKey()
         | 'm_out' >> beam.FlatMap(lambda x: [
             1, 2, 3, 4, 5,
             beam.pvalue.TaggedOutput('once', x),
             beam.pvalue.TaggedOutput('twice', x),
             beam.pvalue.TaggedOutput('twice', x)]))

    res = p.run()
    res.wait_until_finish()

    def has_mi_for_ptransform(infos, ptransform):
        # True when any monitoring info names `ptransform` in its PTRANSFORM
        # label.
        for mi in infos:
            if ptransform in mi.labels['PTRANSFORM']:
                return True
        return False

    def measured(stage_metrics, ptransform_name):
        # Shortcut into the legacy metrics structure.
        return stage_metrics.ptransforms[
            ptransform_name].processed_elements.measured

    def assert_has_monitoring_info(
            infos, urn, labels, value=None, ge_value=None):
        # Asserts that exactly one monitoring info with the given urn and
        # labels has a counter equal to `value` or at least `ge_value`.
        def contains_labels(monitoring_info, labels):
            return all(
                name in monitoring_info.labels
                and monitoring_info.labels[name] == expected
                for name, expected in labels.items())

        # TODO(ajamato): Consider adding a matcher framework
        found = 0
        for mi in infos:
            if not (contains_labels(mi, labels) and mi.urn == urn):
                continue
            if (ge_value is not None and
                    mi.metric.counter_data.int64_value >= ge_value):
                found += 1
            elif (value is not None and
                  mi.metric.counter_data.int64_value == value):
                found += 1
        ge_value_str = {'ge_value': ge_value} if ge_value else ''
        value_str = {'value': value} if value else ''
        self.assertEqual(
            1, found,
            "Found (%s) Expected only 1 monitoring_info for %s."
            % (found, (urn, labels, value_str, ge_value_str),))

    try:
        # TODO(ajamato): Delete this block after deleting the legacy metrics
        # code.
        # Test the DEPRECATED legacy metrics
        pregbk_metrics, postgbk_metrics = list(res._metrics_by_stage.values())
        if 'Create/Read' not in pregbk_metrics.ptransforms:
            # The metrics above are actually unordered. Swap.
            pregbk_metrics, postgbk_metrics = postgbk_metrics, pregbk_metrics

        self.assertEqual(
            4,
            measured(pregbk_metrics, 'Create/Read')
            .output_element_counts['out'])
        self.assertEqual(
            4,
            measured(pregbk_metrics, 'Map(sleep)')
            .output_element_counts['None'])
        self.assertLessEqual(
            4e-3 * DEFAULT_SAMPLING_PERIOD_MS,
            measured(pregbk_metrics, 'Map(sleep)').total_time_spent)
        self.assertEqual(
            1,
            measured(postgbk_metrics, 'GroupByKey/Read')
            .output_element_counts['None'])

        # The actual stage name ends up being something like
        # 'm_out/lambda...'
        (m_out,) = [
            metrics
            for name, metrics in list(postgbk_metrics.ptransforms.items())
            if name.startswith('m_out')]
        m_out_counts = m_out.processed_elements.measured.output_element_counts
        self.assertEqual(5, m_out_counts['None'])
        self.assertEqual(1, m_out_counts['once'])
        self.assertEqual(2, m_out_counts['twice'])

        # Test the new MonitoringInfo monitoring format.
        self.assertEqual(2, len(res._monitoring_infos_by_stage))
        pregbk_mis, postgbk_mis = list(
            res._monitoring_infos_by_stage.values())
        if not has_mi_for_ptransform(pregbk_mis, 'Create/Read'):
            # The monitoring infos above are actually unordered. Swap.
            pregbk_mis, postgbk_mis = postgbk_mis, pregbk_mis

        # pregbk monitoring infos
        labels = {'PCOLLECTION': 'ref_PCollection_PCollection_1'}
        assert_has_monitoring_info(
            pregbk_mis, monitoring_infos.ELEMENT_COUNT_URN, labels, value=4)
        labels = {'PCOLLECTION': 'ref_PCollection_PCollection_2'}
        assert_has_monitoring_info(
            pregbk_mis, monitoring_infos.ELEMENT_COUNT_URN, labels, value=4)
        labels = {'PTRANSFORM': 'Map(sleep)'}
        assert_has_monitoring_info(
            pregbk_mis, monitoring_infos.TOTAL_MSECS_URN,
            labels, ge_value=4 * DEFAULT_SAMPLING_PERIOD_MS)

        # postgbk monitoring infos
        labels = {'PCOLLECTION': 'ref_PCollection_PCollection_6'}
        assert_has_monitoring_info(
            postgbk_mis, monitoring_infos.ELEMENT_COUNT_URN, labels, value=1)
        labels = {'PCOLLECTION': 'ref_PCollection_PCollection_7'}
        assert_has_monitoring_info(
            postgbk_mis, monitoring_infos.ELEMENT_COUNT_URN, labels, value=5)
    except:
        # Dump the raw monitoring infos to help debugging, then re-raise.
        print(res._monitoring_infos_by_stage)
        raise
def run_pipeline(root, input_note_events, input_ratings, output_path, vocab,
                 section_markers, cur_augmentation_config):
    """Create beam pipeline to generate TF examples.

    Ratings and notes are read, joined by key, split into rated and non-rated
    notes (the latter downsampled), processed into features and labels, and
    finally converted to and saved as TF examples.

    Args:
      root: beam.Pipeline root.
      input_note_events: Path to csv of notes.
      input_ratings: Path to csv of ratings.
      output_path: Directory path to write output to.
      vocab: List of tokens in the vocabulary.
      section_markers: Dict of markers as accepted by note sectioning.
      cur_augmentation_config: AugmentationConfig dataclass instance, defines
        the kinds of augmentations to apply.
    """

    def rated_partition_index(joined_element, n_part):
        # 1 when the joined element carries at least one rating, else 0.
        return int(bool(joined_element[1]["ratings"]))

    # Load and process ratings:
    raw_ratings = data_lib.read_raw_ratings(root, input_ratings)
    ratings = (
        raw_ratings
        | "GetLabels" >> beam.Map(data_lib.convert_ratings)
        | "GroupRatingsByNoteId" >> beam.GroupByKey()
        | "UnpackRatings" >> beam.Map(lambda kv: (kv[0], list(kv[1]))))

    # Load and process notes:
    notes = data_lib.read_filter_notes(root, input_note_events)

    # Distinct (note_id, partition) pairs taken from the raw ratings.
    note_partitions = (
        raw_ratings
        | "PartitionMap" >> (
            beam.Map(lambda rating: (str(rating.note_id), rating.partition))
        ).with_output_types(Tuple[str, str])
        | "DedupPartitionMap" >> beam.Distinct())

    # Join, then split by whether the note has ratings.
    join_inputs = {
        "ratings": ratings,
        "notes": notes,
        "note_partition": note_partitions,
    }
    joined = join_inputs | "Join" >> beam.CoGroupByKey().with_output_types(
        Tuple[str, Dict[str, Any]])
    non_rated_notes, rated_notes = (
        joined | "SplitRated" >> beam.Partition(rated_partition_index, 2))

    # Downsample non-rated.
    non_rated_notes = data_lib.downsample(
        non_rated_notes, _N_DOWNSAMPLE.value, _RANDOM_SEED.value)

    # Process notes.
    all_notes = (non_rated_notes, rated_notes) | beam.Flatten()
    features_and_labels = (
        all_notes
        | "ReshuffleJoin" >> beam.Reshuffle()
        | "ProcessAPData" >> beam.ParDo(
            data_lib.ProcessAPData(), section_markers)
        | "FilterAPData" >> beam.Filter(data_lib.filter_by_labels)
        | "ReshuffleForSubjectId" >> beam.Reshuffle()
        | "RekeyBySubjectId" >> beam.Map(
            lambda kv: (kv[1].subject_id, kv[1]))
        | "GroupBySubjectId" >> beam.GroupByKey()
        | "OneNoteIdPerRatedSubjectId" >> beam.ParDo(
            data_lib.OneNoteIdPerRatedSubjectId(), seed=_RANDOM_SEED.value)
        | "RekeyByNoteId" >> beam.Map(lambda note: (note.note_id, note))
        | "ApplyAugmentations" >> beam.ParDo(
            data_lib.ApplyAugmentations(), cur_augmentation_config,
            _RANDOM_SEED.value)
        | "GetFeaturesAndLabels" >> beam.ParDo(
            data_lib.ProcessFeaturesAndLabels(vocab, _MAX_SEQ_LENGTH.value))
        | "ReshuffleFeaturesAndLabels" >> beam.Reshuffle())

    # Convert and save tf examples:
    data_lib.convert_and_save_tf_examples(
        features_and_labels, output_path, _DEBUG_OUTPUT.value)
def run(argv=None, save_main_session=True):
    """Main entry point to the character bigram counting pipeline.

    Lines are read either from a corpus (when ``--corpus_home`` is given, by
    following the top-level and secondary index files) or from the single
    file named by ``--input``. Lines matching any ignore pattern are dropped,
    the remaining lines are split by ``CharBigramExtractingDoFn``, and the
    per-bigram counts are written to ``--output`` as tab-separated text.

    Args:
      argv: Command-line arguments; unrecognised ones are forwarded to
        ``PipelineOptions``.
      save_main_session: Value stored in ``SetupOptions.save_main_session``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--corpus_home',
                        dest='corpus_home',
                        help='The directory or bucke of the corpus home')
    parser.add_argument('--input',
                        dest='input',
                        help='A single input file')
    parser.add_argument('--corpus_prefix',
                        dest='corpus_prefix',
                        help='Prefix after corpus home where the files are')
    parser.add_argument('--ignorelines',
                        dest='ignorelines',
                        help='Ignore lines containing these words')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
    p = beam.Pipeline(options=pipeline_options)

    ignorepatterns = []
    if known_args.ignorelines:
        ignorepatterns = load_ignore(known_args.ignorelines)

    if known_args.corpus_home:
        logging.info('corpus_home: %s', known_args.corpus_home)
        corpus_data_dir = '{}/data/corpus'.format(known_args.corpus_home)
        corpus_index = '{}/collections.csv'.format(corpus_data_dir)
        corpus_dir = known_args.corpus_home
        if known_args.corpus_prefix:
            corpus_dir = '{}/{}'.format(known_args.corpus_home,
                                        known_args.corpus_prefix)
        # Top-level index -> secondary index files -> corpus files -> lines.
        lines = (
            p
            | 'read_top_index' >> ReadFromText(corpus_index)
            | 'split_top_index' >> beam.ParDo(ExtractIndexEntry())
            | 'add_prefix_corpus_data' >> beam.FlatMap(
                add_prefix, corpus_data_dir)
            | 'read_secondary_index' >> ReadAllFromText()
            | 'split_secondary_index' >> beam.ParDo(ExtractIndexEntry())
            | 'add_prefix_corpus_dir' >> beam.FlatMap(add_prefix, corpus_dir)
            | 'read_files' >> ReadAllFromText())
    else:
        lines = p | 'read' >> ReadFromText(known_args.input)

    # Count the occurrences of each character.
    def count_ones(char_ones):
        (c, ones) = char_ones
        return (c, sum(ones))

    # Ignore counts for lines that are boilerplate (copyright notices, etc).
    # The patterns are compiled once here, outside of the per-line filter.
    re_patterns = [
        re.compile('.*{}.*'.format(val), re.IGNORECASE)
        for val in ignorepatterns
    ]

    def not_boilerplate(line):
        """Returns True if the line does not match a boilerplate pattern."""
        return not any(
            re_pattern.match(line) is not None for re_pattern in re_patterns)

    counts = (
        lines
        | 'filter' >> beam.Filter(not_boilerplate)
        | 'split' >> (beam.ParDo(CharBigramExtractingDoFn())
                      .with_output_types(unicode))
        | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
        | 'group' >> beam.GroupByKey()
        | 'count' >> beam.Map(count_ones))

    # Format the result.
    def format_result(char_bigram_count):
        (char_bigram, count) = char_bigram_count
        return '%s\t%d' % (char_bigram, count)

    output = counts | 'format' >> beam.Map(format_result)
    output | 'write' >> WriteToText(known_args.output)

    result = p.run()
    result.wait_until_finish()

    # Only query metrics when a job actually ran: a result without `has_job`
    # is queried as well.
    if (not hasattr(result, 'has_job') or result.has_job):
        char_bigram_filter = MetricsFilter().with_name('char_bigrams')
        query_result = result.metrics().query(char_bigram_filter)
        if query_result['counters']:
            char_bigram_counter = query_result['counters'][0]
            logging.info('Total char bigrams: %d', char_bigram_counter.result)
def ComputeWithConfidenceIntervals(  # pylint: disable=invalid-name
    sliced_extracts: beam.pvalue.PCollection,
    compute_per_slice_metrics_cls: Type[beam.PTransform],
    num_bootstrap_samples: Optional[int] = DEFAULT_NUM_BOOTSTRAP_SAMPLES,
    random_seed_for_testing: Optional[int] = None,
    **kwargs) -> beam.pvalue.PCollection:
    """Computes per-slice metrics, optionally with bootstrap replicas.

    The unsampled metrics are always computed. When more than one bootstrap
    sample is requested, one sampled computation is added per replica and the
    replicas are flattened, grouped by slice and merged by `_MergeBootstrap`,
    which receives the unsampled results as a dict side input.

    Args:
      sliced_extracts: Incoming PCollection consisting of slice key and
        extracts.
      compute_per_slice_metrics_cls: PTransform class that takes a PCollection
        of (slice key, extracts) as input and returns (slice key, dict of
        metrics) as output. It is instantiated once without sampling and once
        per bootstrap replica, each time with the kwargs
        'compute_with_sampling' and 'random_seed_for_testing' plus **kwargs.
      num_bootstrap_samples: Number of bootstrap replicas. A falsy value is
        treated as 1; with 1, only the unsampled metrics are returned.
      random_seed_for_testing: Seed for unit tests. Replica i uses this value
        + i; None leaves every replica unseeded.
      **kwargs: Additional args to pass to compute_per_slice_metrics_cls init.

    Returns:
      PCollection of (slice key, dict of metrics).

    Raises:
      ValueError: If num_bootstrap_samples is less than 1.
    """
    sample_count = num_bootstrap_samples or 1
    # TODO(ckuhn): Cap the number of bootstrap samples at 20.
    if sample_count < 1:
        raise ValueError('num_bootstrap_samples should be > 0, got %d' %
                         sample_count)

    unsampled_metrics = (
        sliced_extracts
        | 'ComputeUnsampledMetrics' >> compute_per_slice_metrics_cls(
            compute_with_sampling=False,
            random_seed_for_testing=None,
            **kwargs))

    result = unsampled_metrics
    if sample_count > 1:
        replicas = [
            sliced_extracts
            | 'ComputeSampledMetrics%d' % replica >>
            compute_per_slice_metrics_cls(
                compute_with_sampling=True,
                random_seed_for_testing=(
                    None if random_seed_for_testing is None else
                    random_seed_for_testing + replica),
                **kwargs)
            for replica in range(sample_count)
        ]
        flattened = replicas | 'FlattenBootstrapPartitions' >> beam.Flatten()
        per_slice = flattened | 'GroupBySlice' >> beam.GroupByKey()
        result = per_slice | 'MergeBootstrap' >> beam.ParDo(
            _MergeBootstrap(), beam.pvalue.AsDict(unsampled_metrics))
    return result
def test_windowing(self):
    """Checks the events ReverseTestStream emits for windowed, grouped input.

    Two batches of elements are fed through a TestStream, placed in
    FixedWindows(5), keyed with 'k' and grouped. The ReverseTestStream output
    is then compared against the expected sequence of processing-time,
    watermark and element events.
    """
    # First batch at watermark 0, second batch at watermark 5; afterwards the
    # watermark and the processing time each advance by one per step up to 15.
    test_stream = (TestStream()
                   .advance_watermark_to(0)
                   .add_elements(['a', 'b', 'c'])
                   .advance_processing_time(1)
                   .advance_processing_time(1)
                   .advance_processing_time(1)
                   .advance_processing_time(1)
                   .advance_processing_time(1)
                   .advance_watermark_to(5)
                   .add_elements(['1', '2', '3'])
                   .advance_processing_time(1)
                   .advance_watermark_to(6)
                   .advance_processing_time(1)
                   .advance_watermark_to(7)
                   .advance_processing_time(1)
                   .advance_watermark_to(8)
                   .advance_processing_time(1)
                   .advance_watermark_to(9)
                   .advance_processing_time(1)
                   .advance_watermark_to(10)
                   .advance_processing_time(1)
                   .advance_watermark_to(11)
                   .advance_processing_time(1)
                   .advance_watermark_to(12)
                   .advance_processing_time(1)
                   .advance_watermark_to(13)
                   .advance_processing_time(1)
                   .advance_watermark_to(14)
                   .advance_processing_time(1)
                   .advance_watermark_to(15)
                   .advance_processing_time(1)
                   ) # yapf: disable

    options = StandardOptions(streaming=True)
    p = TestPipeline(options=options)

    records = (p
               | test_stream
               | 'letter windows' >> beam.WindowInto(
                   FixedWindows(5),
                   accumulation_mode=trigger.AccumulationMode.DISCARDING)
               | 'letter with key' >> beam.Map(lambda x: ('k', x))
               | 'letter gbk' >> beam.GroupByKey()
               | ReverseTestStream(sample_resolution_sec=1, output_tag=None))

    # Each inner list is one expected output of ReverseTestStream; all of them
    # are expected in the global window.
    assert_that(
        records,
        equal_to_per_window({
            beam.window.GlobalWindow(): [
                [ProcessingTimeEvent(5), WatermarkEvent(4999998)],
                [
                    ElementEvent([
                        TimestampedValue(('k', ['a', 'b', 'c']), 4.999999)
                    ])
                ],
                [ProcessingTimeEvent(1), WatermarkEvent(5000000)],
                [ProcessingTimeEvent(1), WatermarkEvent(6000000)],
                [ProcessingTimeEvent(1), WatermarkEvent(7000000)],
                [ProcessingTimeEvent(1), WatermarkEvent(8000000)],
                [ProcessingTimeEvent(1), WatermarkEvent(9000000)],
                [
                    ElementEvent([
                        TimestampedValue(('k', ['1', '2', '3']), 9.999999)
                    ])
                ],
                [ProcessingTimeEvent(1), WatermarkEvent(10000000)],
                [ProcessingTimeEvent(1), WatermarkEvent(11000000)],
                [ProcessingTimeEvent(1), WatermarkEvent(12000000)],
                [ProcessingTimeEvent(1), WatermarkEvent(13000000)],
                [ProcessingTimeEvent(1), WatermarkEvent(14000000)],
                [ProcessingTimeEvent(1), WatermarkEvent(15000000)],
            ],
        }))

    p.run()
def test_streaming_wordcount(self):
    """Runs a streaming word count on the InteractiveRunner and checks results.

    Two batches of words are fed from a TestStream into FixedWindows(10).
    `ib.collect` is used to fetch both the raw windowed words and the
    per-window counts as DataFrames, which are compared with expected frames.
    """
    class WordExtractingDoFn(beam.DoFn):
        def process(self, element):
            # Split a stripped line on whitespace and emit each word.
            text_line = element.strip()
            words = text_line.split()
            return words

    # Add the TestStream so that it can be cached.
    ib.options.capturable_sources.add(TestStream)
    ib.options.capture_duration = timedelta(seconds=1)

    p = beam.Pipeline(runner=interactive_runner.InteractiveRunner(),
                      options=StandardOptions(streaming=True))

    data = (
        p
        | TestStream()
            .advance_watermark_to(0)
            .advance_processing_time(1)
            .add_elements(['to', 'be', 'or', 'not', 'to', 'be'])
            .advance_watermark_to(20)
            .advance_processing_time(1)
            .add_elements(['that', 'is', 'the', 'question'])
        | beam.WindowInto(beam.window.FixedWindows(10))) # yapf: disable

    counts = (data
              | 'split' >> beam.ParDo(WordExtractingDoFn())
              | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
              | 'group' >> beam.GroupByKey()
              | 'count' >> beam.Map(lambda wordones: (wordones[0], sum(wordones[1]))))

    # Watch the local scope for Interactive Beam so that referenced PCollections
    # will be cached.
    ib.watch(locals())

    # This is normally done in the interactive_utils when a transform is
    # applied but needs an IPython environment. So we manually run this here.
    ie.current_env().track_user_pipelines()

    # This tests that the data was correctly cached.
    pane_info = PaneInfo(True, True, PaneInfoTiming.UNKNOWN, 0, 0)
    expected_data_df = pd.DataFrame([
        ('to', 0, [IntervalWindow(0, 10)], pane_info),
        ('be', 0, [IntervalWindow(0, 10)], pane_info),
        ('or', 0, [IntervalWindow(0, 10)], pane_info),
        ('not', 0, [IntervalWindow(0, 10)], pane_info),
        ('to', 0, [IntervalWindow(0, 10)], pane_info),
        ('be', 0, [IntervalWindow(0, 10)], pane_info),
        ('that', 20000000, [IntervalWindow(20, 30)], pane_info),
        ('is', 20000000, [IntervalWindow(20, 30)], pane_info),
        ('the', 20000000, [IntervalWindow(20, 30)], pane_info),
        ('question', 20000000, [IntervalWindow(20, 30)], pane_info)
    ], columns=[0, 'event_time', 'windows', 'pane_info']) # yapf: disable

    data_df = ib.collect(data, include_window_info=True)
    pd.testing.assert_frame_equal(expected_data_df, data_df)

    # This tests that the windowing was passed correctly so that all the data
    # is aggregated also correctly.
    pane_info = PaneInfo(True, False, PaneInfoTiming.ON_TIME, 0, 0)
    expected_counts_df = pd.DataFrame([
        ('be', 2, 9999999, [IntervalWindow(0, 10)], pane_info),
        ('not', 1, 9999999, [IntervalWindow(0, 10)], pane_info),
        ('or', 1, 9999999, [IntervalWindow(0, 10)], pane_info),
        ('to', 2, 9999999, [IntervalWindow(0, 10)], pane_info),
        ('is', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
        ('question', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
        ('that', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
        ('the', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
    ], columns=[0, 1, 'event_time', 'windows', 'pane_info']) # yapf: disable

    counts_df = ib.collect(counts, include_window_info=True)

    # The group by key has no guarantee of order. So we post-process the DF by
    # sorting so we can test equality.
    sorted_counts_df = (counts_df
                        .sort_values(['event_time', 0], ascending=True)
                        .reset_index(drop=True)) # yapf: disable
    pd.testing.assert_frame_equal(expected_counts_df, sorted_counts_df)
class Order(object):
    """Plain record holding the four fields of one order row."""

    def __init__(self, date, time, transaction, item):
        self.date, self.time = date, time
        self.transaction, self.item = transaction, item


class parse_item(beam.DoFn):
    """DoFn turning a comma-separated line into a single `Order`."""

    def process(self, element):
        # Falsy input (e.g. an empty line) produces no output.
        if not element:
            return None
        date, time, transaction, item = element.split(',')
        order = Order(date, time, transaction, item)
        return [order]


class GetTotal(beam.DoFn):
    """DoFn summing the grouped values of one key."""

    def process(self, element):
        # element is indexed as (key, iterable of numbers); the key is
        # stringified and paired with the sum of its values.
        label = str(element[0])
        total = sum(element[1])
        return [(label, total)]


# Count the rows per value of the first CSV column and write the totals out.
data_from_source = (
    p
    | 'ReadMyFile' >> ReadFromText('gs://play_with_data/BreadBasket_DMS.csv')
    | 'Splitter using beam.Map' >> beam.Map(
        lambda record: record.split(',')[0])
    | 'Map record to 1' >> beam.Map(lambda record: (record, 1))
    | 'GroupBy the data' >> beam.GroupByKey()
    | 'Get the total in each day' >> beam.ParDo(GetTotal())
    | 'Export results to new file' >> WriteToText(
        'gs://play_with_data/output/day-list', '.txt')
)

result = p.run()
def test_combine_values(self): occurences = [('cat', 1), ('cat', 5), ('cat', 9), ('dog', 5), ('dog', 2)] # [START combine_values] first_occurences = occurences | beam.GroupByKey() | beam.CombineValues(min) # [END combine_values] self.assertEqual({('cat', 1), ('dog', 2)}, set(first_occurences))
# NOTE(review): the next statement is the tail of a function whose `def` line
# is outside this view; presumably it is `assign_timevalue`, used by the
# "assign tv" step below - confirm.
    return window.TimestampedValue(v, int(time.time()) + random.randint(0, 1))


def modify_data3(kvpair):
    """Turn one grouped (key, values) pair into an output row dict."""
    # GroupByKey passes a tuple of the key and the list of values having that
    # key. Because the data is split by window, each list holds fewer items.
    # kvpair = (u'word only', [4, 4, 6, 6, 7])
    return {'count_type': kvpair[0], 'sum': sum(kvpair[1])}


p7 = beam.Pipeline(options=options)

query = 'SELECT * FROM [PROJECTID:testdataset.testtable3] LIMIT 20'

# Read rows, timestamp them, window them into FixedWindows(1), sum
# `word_count` per `count_type` within each window, and write the sums to
# BigQuery.
(p7
 | 'read' >> beam.io.Read(
     beam.io.BigQuerySource(
         project='PROJECTID', use_standard_sql=False, query=query))
 | "assign tv" >> beam.Map(assign_timevalue)
 | 'window' >> beam.WindowInto(beam.window.FixedWindows(1))
 | 'pair' >> beam.Map(lambda x: (x['count_type'], x['word_count']))
 | "groupby" >> beam.GroupByKey()
 | 'modify' >> beam.Map(modify_data3)
 | 'write' >> beam.io.Write(
     beam.io.BigQuerySink(
         'testdataset.testtable5',
         schema='count_type:STRING, sum:INTEGER',
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))

p7.run()  # .wait_until_finish()
def test_multiple_destinations_transform(self):
    """Writes _ELEMENTS to four tables through two WriteToBigQuery transforms.

    The first write picks its table from a side-input dict; the second picks
    it from the enclosing scope. Both take the schema from a side-input dict
    keyed by destination table. Each table is verified by a
    BigqueryFullResultMatcher attached as the pipeline's on-success matcher.
    """
    output_table_1 = '%s%s' % (self.output_table, 1)
    output_table_2 = '%s%s' % (self.output_table, 2)
    output_table_3 = '%s%s' % (self.output_table, 3)
    output_table_4 = '%s%s' % (self.output_table, 4)
    schema1 = bigquery.WriteToBigQuery.get_dict_table_schema(
        bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA))
    schema2 = bigquery.WriteToBigQuery.get_dict_table_schema(
        bigquery_tools.parse_table_schema_from_json(
            self.BIG_QUERY_SCHEMA_2))

    # Tables 1 and 3 share schema1; tables 2 and 4 share schema2.
    schema_kv_pairs = [
        (output_table_1, schema1),
        (output_table_2, schema2),
        (output_table_3, schema1),
        (output_table_4, schema2)
    ]
    # Elements with a 'language' key are expected in tables 1 and 3, elements
    # with a 'foundation' key in tables 2 and 4.
    pipeline_verifiers = [
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_1,
            data=[(d['name'], d['language'])
                  for d in _ELEMENTS
                  if 'language' in d]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_2,
            data=[(d['name'], d['foundation'])
                  for d in _ELEMENTS
                  if 'foundation' in d]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_3,
            data=[(d['name'], d['language'])
                  for d in _ELEMENTS
                  if 'language' in d]),
        BigqueryFullResultMatcher(
            project=self.project,
            query="SELECT * FROM %s" % output_table_4,
            data=[(d['name'], d['foundation'])
                  for d in _ELEMENTS
                  if 'foundation' in d])
    ]

    args = self.test_pipeline.get_full_options_as_args(
        on_success_matcher=all_of(*pipeline_verifiers),
        experiments='use_beam_bq_sink')

    with beam.Pipeline(argv=args) as p:
        input = p | beam.Create(_ELEMENTS)

        schema_map_pcv = beam.pvalue.AsDict(
            p | "MakeSchemas" >> beam.Create(schema_kv_pairs))

        table_record_pcv = beam.pvalue.AsDict(
            p | "MakeTables" >> beam.Create([('table1', output_table_1),
                                             ('table2', output_table_2)]))

        # Get all input in same machine
        input = (input
                 | beam.Map(lambda x: (None, x))
                 | beam.GroupByKey()
                 | beam.FlatMap(lambda elm: elm[1]))

        # Destination table looked up in the `tables` side input.
        _ = (
            input
            | "WriteWithMultipleDestsFreely" >> bigquery.WriteToBigQuery(
                table=lambda x, tables: (tables['table1'] if
                                         'language' in x else tables['table2']),
                table_side_inputs=(table_record_pcv, ),
                schema=lambda dest, schema_map: schema_map.get(dest, None),
                schema_side_inputs=(schema_map_pcv, ),
                create_disposition=beam.io.BigQueryDisposition.
                CREATE_IF_NEEDED,
                write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))

        # Destination table taken from the enclosing scope.
        _ = (input
             | "WriteWithMultipleDests" >> bigquery.WriteToBigQuery(
                 table=lambda x: (output_table_3 if
                                  'language' in x else output_table_4),
                 schema=lambda dest, schema_map: schema_map.get(dest, None),
                 schema_side_inputs=(schema_map_pcv, ),
                 create_disposition=beam.io.BigQueryDisposition.
                 CREATE_IF_NEEDED,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY,
                 max_file_size=20,
                 max_files_per_bundle=-1))
# Run ActorActressCountFn over the input and split its results into the two
# tagged outputs declared on the DoFn class.
out_pcoll = in_pcoll | 'Extract Actor and Actress' >> beam.ParDo(
    ActorActressCountFn()).with_outputs(
        ActorActressCountFn.OUTPUT_TAG_ACTOR_COUNT,
        ActorActressCountFn.OUTPUT_TAG_ACTRESS_COUNT)

actor_pcoll = out_pcoll[ActorActressCountFn.OUTPUT_TAG_ACTOR_COUNT]
actress_pcoll = out_pcoll[ActorActressCountFn.OUTPUT_TAG_ACTRESS_COUNT]

# write PCollections to files
actor_pcoll | 'Write Actor File 1' >> WriteToText(DIR_PATH_OUT +
                                                  'actor_output.txt')
actress_pcoll | 'Write Actress File 1' >> WriteToText(DIR_PATH_OUT +
                                                      'actress_output.txt')

# apply GroupByKey
# (presumably the tagged elements are (name, count) pairs - confirm against
# ActorActressCountFn)
grouped_actor_pcoll = actor_pcoll | 'Group by Actor' >> beam.GroupByKey()
grouped_actress_pcoll = actress_pcoll | 'Group by Actress' >> beam.GroupByKey(
)

# write PCollections to files
grouped_actor_pcoll | 'Write Actor File 2' >> WriteToText(
    DIR_PATH_OUT + 'grouped_actor_output.txt')
grouped_actress_pcoll | 'Write Actress File 2' >> WriteToText(
    DIR_PATH_OUT + 'grouped_actress_output.txt')

# apply ParDo
summed_actor_pcoll = grouped_actor_pcoll | 'Sum up Actor Nominations' >> beam.ParDo(
    SumNominationsFn())
summed_actress_pcoll = grouped_actress_pcoll | 'Sum up Actress Nominations' >> beam.ParDo(
    SumNominationsFn())
def _get_page_content(self, pipeline, file_paths, dl_manager):
    """Assemble the PCollection of whole (un-split) pages.

    Stages, in order: gather WET file paths (local ones plus, when
    "wet_urls" is present, downloaded ones), split and length-filter them,
    optionally keep RealNews domains, normalize and dedupe by URL, optionally
    keep WebText-like URLs, optionally clean and remove duplicate text, and
    optionally filter by language.

    Args:
        pipeline: Beam pipeline the transforms are attached to.
        file_paths: Mapping read for "wet_files" and, depending on the
            config, "wet_urls", "realnews_domains", "openwebtext_urls_zip"
            and "badwords".
        dl_manager: Download manager used to fetch the WET URLs.

    Returns:
        The resulting PCollection (url, text per the stage comments).
    """
    import apache_beam as beam

    wet_file_paths = pipeline | "create_wet_files" >> beam.Create(
        file_paths["wet_files"])

    if "wet_urls" in file_paths:

        def download_url(url, downloader, pipeline):
            local_path = downloader.download(url)
            if pipeline.is_local():
                return local_path
            # Non-local pipelines need the file shipped with the pipeline.
            return downloader.ship_files_with_pipeline(local_path, pipeline)

        wet_urls = pipeline | "create_wet_urls" >> beam.Create(
            file_paths["wet_urls"])
        dl_wet_file_paths = wet_urls | beam.Map(
            download_url, downloader=dl_manager, pipeline=pipeline)
        wet_file_paths = (wet_file_paths, dl_wet_file_paths) | beam.Flatten()

    # Parse WET files and filter by length.
    # Output: url, text
    split_pages = wet_file_paths | beam.FlatMap(split_wet_file)
    page_content = split_pages | beam.Filter(is_valid_length)

    # Optionally filter for RealNews domains.
    # Output: url, text
    if self.config.realnewslike:
        with open(file_paths["realnews_domains"], "r",
                  encoding="utf-8") as domains_file:
            realnews_domains = json.load(domains_file)
        page_content = page_content | beam.Filter(is_realnews_domain,
                                                  realnews_domains)

    # Normalize and deduplicate by URL.
    # Output: url, text
    normalized = page_content | "normalize_url" >> beam.Map(normalize_url)
    grouped_by_url = normalized | "group_url" >> beam.GroupByKey()
    page_content = grouped_by_url | beam.Map(dedupe_urls)

    # Optionally filter for WebText-like URLs.
    # Output: url, text
    if self.config.webtextlike:
        url_file_pattern = os.path.join(file_paths["openwebtext_urls_zip"],
                                        _OPENWEBTEXT_URLS_FILE_PATTERN)
        webtextlike_urls = (
            pipeline
            | "read_webtextlike_urls" >> beam.io.ReadFromText(url_file_pattern)
            | "add_dummy_page" >> beam.Map(lambda x: (x, ""))
            | "normal_webtext_url" >> beam.Map(normalize_url))
        joined = {
            "text": page_content,
            "webtextlike_urls": webtextlike_urls
        } | "group_webtextlike_urls" >> beam.CoGroupByKey()
        page_content = joined | beam.FlatMap(filter_by_webtextlike)

    # Optionally clean pages of badwords, boilerplate text, and duplicate
    # spans of sentences.
    # Output: url, text
    if self.config.clean:
        with open(file_paths["badwords"], "r",
                  encoding="utf-8") as badwords_file:
            badwords = [line.strip() for line in badwords_file]
        page_content = page_content | "clean_pages" >> beam.FlatMap(
            get_clean_page_fn(badwords))
        page_content = remove_duplicate_text(page_content)

    # Optionally filter out non-`language` pages. We do this after cleaning
    # since it may change the predominate language.
    if self.config.lang != "all":
        page_content = page_content | beam.Filter(
            is_language, language=self.config.lang)

    return page_content
def _execute(self, window_fn, trigger_fn, accumulation_mode,
             timestamp_combiner, transcript, spec):
    """Replays a transcript through a streaming pipeline and checks outputs.

    The transcript is converted into a TestStream that interleaves tagged
    'input' elements with tagged 'expect' elements. Inputs are windowed with
    the given settings and grouped; the grouped results are then compared
    with the expectations by the stateful `Check` DoFn.

    Args:
        window_fn: Windowing function passed to `beam.WindowInto`.
        trigger_fn: Trigger passed to `beam.WindowInto`.
        accumulation_mode: Accumulation mode passed to `beam.WindowInto`.
        timestamp_combiner: Timestamp combiner passed to `beam.WindowInto`.
        transcript: Iterable of (action, params) pairs. Recognised actions
            are 'expect', 'input', 'watermark', 'clock' and 'state'.
        spec: Mapping; its optional 'broken_on' entry lists runner class
            names on which the test is skipped.

    Raises:
        ValueError: If the transcript contains an unknown action.
    """
    runner_name = TestPipeline().runner.__class__.__name__
    if runner_name in spec.get('broken_on', ()):
        self.skipTest('Known to be broken on %s' % runner_name)

    test_stream = TestStream()
    for action, params in transcript:
        if action == 'expect':
            test_stream.add_elements([('expect', params)])
        else:
            # Every non-expect action is preceded by an empty expectation, so
            # Check compares whatever was buffered so far against "nothing".
            test_stream.add_elements([('expect', [])])
            if action == 'input':
                test_stream.add_elements([('input', e) for e in params])
            elif action == 'watermark':
                test_stream.advance_watermark_to(params)
            elif action == 'clock':
                test_stream.advance_processing_time(params)
            elif action == 'state':
                pass  # Requires inspection of implementation details.
            else:
                raise ValueError('Unexpected action: %s' % action)
    # Final empty expectation after the last action.
    test_stream.add_elements([('expect', [])])

    class Check(beam.DoFn):
        """A StatefulDoFn that verifies outputs are produced as expected.

        This DoFn takes in two kinds of inputs, actual outputs and
        expected outputs.  When an actual output is received, it is buffered
        into state, and when an expected output is received, this buffered
        state is retrieved and compared against the expected value(s) to ensure
        they match.

        The key is ignored, but all items must be on the same key to share state.
        """
        def process(
            self,
            element,
            seen=beam.DoFn.StateParam(
                beam.transforms.userstate.BagStateSpec(
                    'seen', beam.coders.FastPrimitivesCoder()))):
            _, (action, data) = element

            if action == 'actual':
                seen.add(data)

            elif action == 'expect':
                # Take everything buffered since the last expectation.
                actual = list(seen.read())
                seen.clear()

                if len(actual) > len(data):
                    raise AssertionError(
                        'Unexpected output: expected %s but got %s' %
                        (data, actual))
                elif len(data) > len(actual):
                    raise AssertionError(
                        'Unmatched output: expected %s but got %s' %
                        (data, actual))
                else:

                    def diff(actual, expected):
                        # Return the first key (in reverse-sorted order) that
                        # is present in both dicts with different values;
                        # implicitly None when there is no such key.
                        for key in sorted(expected.keys(), reverse=True):
                            if key in actual:
                                if actual[key] != expected[key]:
                                    return key

                    for output in actual:
                        diffs = [
                            diff(output, expected) for expected in data
                        ]
                        # An output is unmatched only if it differs from
                        # every expected entry.
                        if all(diffs):
                            raise AssertionError(
                                'Unmatched output: %s not found in %s (diffs in %s)'
                                % (output, data, diffs))

            else:
                raise ValueError('Unexpected action: %s' % action)

    with TestPipeline(options=PipelineOptions(streaming=True)) as p:
        # Split the test stream into a branch of to-be-processed elements, and
        # a branch of expected results.
        inputs, expected = (
            p
            | test_stream
            | beam.MapTuple(lambda tag, value: beam.pvalue.TaggedOutput(
                tag, ('key', value))).with_outputs('input', 'expect'))

        # Process the inputs with the given windowing to produce actual outputs.
        outputs = (
            inputs
            | beam.MapTuple(lambda key, value: TimestampedValue(
                (key, value), value))
            | beam.WindowInto(window_fn,
                              trigger=trigger_fn,
                              accumulation_mode=accumulation_mode,
                              timestamp_combiner=timestamp_combiner)
            | beam.GroupByKey()
            | beam.MapTuple(
                lambda k, vs, window=beam.DoFn.WindowParam, t=beam.DoFn.
                TimestampParam: (k, self._windowed_value_info(
                    WindowedValue(vs, windows=[window], timestamp=t))))
            # Place outputs back into the global window to allow flattening
            # and share a single state in Check.
            | 'Global' >> beam.WindowInto(
                beam.transforms.window.GlobalWindows()))

        # Feed both the expected and actual outputs to Check() for comparison.
        tagged_expected = (
            expected
            | beam.MapTuple(lambda key, value: (key, ('expect', value))))
        tagged_outputs = (
            outputs
            | beam.MapTuple(lambda key, value: (key, ('actual', value))))
        # pylint: disable=expression-not-assigned
        (tagged_expected, tagged_outputs) | beam.Flatten() | beam.ParDo(
            Check())
options.view_as(StandardOptions).runner = 'DirectRunner' #Set pipeline options p = beam.Pipeline(options=options) # Lines transform read the text from input file and to create a PCollection which contains all the text lines lines = p | "read" >> ReadFromText(inFile) #Counts is a ParDo transform that invokes a function process_lines #on each element that tokenizes the text lines into individual words #this is then transformed to a tuple ('word',count) and grouped and counted to #emit the outputs. debug = 2 if (debug == 0): counts = (lines | "split" >> beam.ParDo(classExample).with_output_types(str) | "pair_with_1" >> beam.Map(lambda x: (x, 1)) | "group" >> beam.GroupByKey() | "count" >> beam.Map(lambda x: (x[0], sum(x[1])))) output = counts | "format" >> beam.Map(lambda x: "%s: %s" % (x[0], x[1])) output | "write" >> WriteToText(outFile) elif (debug == 1): sum = (lines | "get_counts" >> beam.ParDo(countWordsOfLine2) | "pair_with_1" >> beam.Map(lambda x: (x, 1)) | "group" >> beam.GroupByKey() | "total" >> beam.Map(lambda x: (x[0], sum(x[1])))) output2 = sum | "format" >> beam.Map(lambda x: "%s: %s" % (x[0], x[1])) output2 | "write" >> WriteToText(outFile2) else: counts = (lines | "split" >> beam.ParDo(classExample).with_output_types(str)
def run(argv=None):
    """Main entry point; defines and runs the Medline enrichment pipeline.

    Two input modes are supported:

    * XML input (``--input_baseline`` and/or ``--input_updates``): files are
      read, parsed, grouped by key, reduced with ``GetLatestVersion``, run
      through ``NLPAnalysis``, serialised with ``ToJSON`` and written to
      ``--output_enriched``.
    * Already enriched JSON (``--input_enriched``): read back as text lines.

    When ``--output_splitted`` is given, the enriched JSON is additionally
    split into concept, bioentity, tagged-text and small-article outputs.

    Args:
        argv: Command-line arguments; arguments not recognised here are
            forwarded to ``PipelineOptions``.

    Raises:
        AttributeError: If neither an XML input nor ``--input_enriched`` is
            supplied.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_baseline',
                        required=False,
                        help='baseline URIs to process.')
    parser.add_argument('--input_updates',
                        required=False,
                        help='updates URIs to process.')
    parser.add_argument('--input_enriched',
                        required=False,
                        help='enriched JSON URIs to process.')
    parser.add_argument('--output',
                        required=False,
                        help='Output file to write results to.')
    parser.add_argument('--output_enriched',
                        required=False,
                        help='Output file prefix for the enriched JSON.')
    parser.add_argument('--output_splitted',
                        required=False,
                        help='Output file prefix for the split JSON outputs.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # NOTE(review): bq_table_schema is not used below; the call is kept
    # because it still parses BQ_SCHEMA at start-up.
    bq_table_schema = parse_bq_json_schema(json.loads(BQ_SCHEMA))

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=pipeline_options) as p:

        # Collect whichever XML sources were requested, baseline first.
        xml_sources = []
        if known_args.input_baseline:
            xml_sources.append(
                p | 'BaselineEmitXML' >> ReadMedlineFiles(
                    known_args.input_baseline))
        if known_args.input_updates:
            xml_sources.append(
                p | 'UpdatesEmitXML' >> ReadMedlineFiles(
                    known_args.input_updates))

        if xml_sources:
            if len(xml_sources) == 1:
                medline_articles = xml_sources[0]
            else:
                medline_articles = tuple(xml_sources) | beam.Flatten()

            parsed_medline_articles = (
                medline_articles
                | 'ParseXMLtoDict' >> beam.ParDo(MedlineXMLParser()))

            medline_articles_grouped_by_id = (
                parsed_medline_articles | 'GroupByPMID' >> beam.GroupByKey())

            unique_medline_articles = (
                medline_articles_grouped_by_id
                | 'SortByFilename' >> beam.ParDo(GetLatestVersion()))

            enriched_articles = (
                unique_medline_articles
                | 'NLPAnalysis' >> beam.ParDo(NLPAnalysis()))

            json_enriched_medline_articles = (
                enriched_articles
                | 'EnrichedMedlineToJSON' >> beam.ParDo(ToJSON()))

            json_enriched_medline_articles | 'WriteEnrichedJSONToGS' >> WriteToText(
                known_args.output_enriched,
                file_name_suffix='_enriched.json.gz')

        elif known_args.input_enriched:
            json_enriched_medline_articles = (
                p | 'GetEnrichedArticles' >> ReadFromText(
                    known_args.input_enriched))

        else:
            raise AttributeError('missing json enriched data input')

        if known_args.output_splitted:

            concepts = (json_enriched_medline_articles
                        | 'ArticleToConcepts' >> beam.ParDo(ExtractConcepts()))
            concepts | 'WriteConceptJSONToGS' >> WriteToText(
                known_args.output_splitted,
                file_name_suffix='_concepts.json.gz')

            bioentities = (
                json_enriched_medline_articles
                | 'ArticleToBioentities' >> beam.ParDo(ExtractBioentities()))
            bioentities | 'WriteBioentityJSONToGS' >> WriteToText(
                known_args.output_splitted,
                file_name_suffix='_bioentities.json.gz')

            taggedtext = (
                json_enriched_medline_articles
                | 'ArticleToTaggedText' >> beam.ParDo(ExtractTaggedText()))
            taggedtext | 'WriteTaggedTextJSONToGS' >> WriteToText(
                known_args.output_splitted,
                file_name_suffix='_taggedtext.json.gz')

            smallarticles = (
                json_enriched_medline_articles
                | 'ArticleToSmallArticles' >> beam.ParDo(CleanPublication()))
            smallarticles | 'WriteSmallArticleJSONToGS' >> WriteToText(
                known_args.output_splitted,
                file_name_suffix='_small.json.gz')
def expand(self, pcoll):
    """Write the input to files, partition them and hand them to `_load_data`.

    Args:
        pcoll: Input PCollection of elements to load.

    Returns:
        Dict with three entries, keyed by `DESTINATION_JOBID_PAIRS`,
        `DESTINATION_FILE_PAIRS` and `DESTINATION_COPY_JOBID_PAIRS`, holding
        the load-job ids, the (destination, file) pairs and the copy-job ids
        respectively.
    """
    pipeline = pcoll.pipeline
    gcs_temp = pipeline.options.view_as(GoogleCloudOptions).temp_location

    empty_pc = pipeline | "ImpulseEmptyPC" >> beam.Create([])
    singleton_pc = pipeline | "ImpulseSingleElementPC" >> beam.Create([None])

    # Side inputs computed once from the single-element PCollection.
    job_name_pc = singleton_pc | beam.Map(
        lambda _: _generate_load_job_name())
    load_job_name_pcv = pvalue.AsSingleton(job_name_pc)

    prefix_fn = file_prefix_generator(self._validate,
                                      self._custom_gcs_temp_location,
                                      gcs_temp)
    file_prefix_pcv = pvalue.AsSingleton(
        singleton_pc | "GenerateFilePrefix" >> beam.Map(prefix_fn))

    rewindowed = pcoll | "RewindowIntoGlobal" >> self._window_fn()
    destination_data_kv_pc = rewindowed | "AppendDestination" >> beam.ParDo(
        bigquery_tools.AppendDestinationsFn(self.destination),
        *self.table_side_inputs)

    all_destination_file_pairs_pc = self._write_files(destination_data_kv_pc,
                                                      file_prefix_pcv)

    files_by_destination = (
        all_destination_file_pairs_pc
        | "GroupFilesByTableDestinations" >> beam.GroupByKey())

    partitioner = beam.ParDo(
        PartitionFiles(self.max_partition_size,
                       self.max_files_per_partition)).with_outputs(
                           PartitionFiles.MULTIPLE_PARTITIONS_TAG,
                           PartitionFiles.SINGLE_PARTITION_TAG)
    partitions = files_by_destination | partitioner

    multi_partition_pc = partitions[PartitionFiles.MULTIPLE_PARTITIONS_TAG]
    single_partition_pc = partitions[PartitionFiles.SINGLE_PARTITION_TAG]

    # When using dynamic destinations, elements with both single as well as
    # multiple partitions are loaded into BigQuery using temporary tables to
    # ensure atomicity.
    if self.dynamic_destinations:
        first_arg = ((multi_partition_pc, single_partition_pc)
                     | "FlattenPartitions" >> beam.Flatten())
        second_arg = empty_pc
    else:
        first_arg = multi_partition_pc
        second_arg = single_partition_pc

    load_job_ids_pc, copy_job_ids_pc = self._load_data(
        first_arg, second_arg, load_job_name_pcv, singleton_pc)

    return {
        self.DESTINATION_JOBID_PAIRS: load_job_ids_pc,
        self.DESTINATION_FILE_PAIRS: all_destination_file_pairs_pc,
        self.DESTINATION_COPY_JOBID_PAIRS: copy_job_ids_pc,
    }
def expand(self, pcoll):
    """Apply a GroupByKey labelled 'TestLabel' to the input PCollection."""
    grouped = pcoll | 'TestLabel' >> beam.GroupByKey()
    return grouped
# Get the BQ file I want to manipulate sql = 'SELECT * FROM seanlahman_modeled.players2' bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True) query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source) # write PCollection to log file query_results | 'Write to log 1' >> WriteToText(DIR_PATH + 'input.txt') # apply a ParDo to the PCollection t1_pcoll = query_results | 'Extract Players' >> beam.ParDo(CombineDOBFn()) # apply GroupByKey to the PCollection intermediate_pcoll = t1_pcoll | 'Group by players' >> beam.GroupByKey() # write PCollection to a file intermediate_pcoll | 'Write File Intermediately' >> WriteToText(DIR_PATH + 'unwindowed.txt') #nipulate the file to send to BQ done = intermediate_pcoll | 'Make BQ Record' >> beam.ParDo(MakeRecordFn()) # Make the clean data a txt file done | 'Write File' >> WriteToText(DIR_PATH + 'output.txt') # make the BQ table qualified_table_name = PROJECT_ID + ':seanlahman_modeled.players2_Beam_DF' dataset_id = 'seanlahman_modeled' table_id = 'players2_Beam_DF'
# Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. import apache_beam as beam from log_elements import LogElements p = beam.Pipeline() (p | beam.Create(['apple', 'ball', 'car', 'bear', 'cheetah', 'ant']) | beam.Map(lambda word: (word[0], word)) | beam.GroupByKey() | LogElements()) p.run()