def main():
  # bq_source = BigQuerySource(query="""
  #                            SELECT created_at, text
  #                            FROM got_sentiment.got_tweets
  #                            """,
  #                            validate=False, coder=None,
  #                            use_standard_sql=True, flatten_results=True,
  #                            kms_key=None)

  # Removed attributes from ReadFromPubSub:
  #   with_attributes=False,
  #   timestamp_attribute='created_at'

  # Create the Pipeline with the specified options.
  with Pipeline(options=options) as p:
    results = (
        p
        | 'read_from_topic' >> ReadFromPubSub(topic=PUBSUB_TOPIC)
        | 'Window' >> WindowInto(window.FixedWindows(60))
        | 'Emit_needed_values' >> FlatMap(emit_values, entity_map)
        | 'Combine' >> CombinePerKey(EntityScoreCombine())
        | 'Add Window Timestamp' >> beam.ParDo(AddWindowTimestampFn())
        | 'FormatForWrite' >> Map(format_for_write)
        | 'Write' >> WriteToBigQuery(
            'streaming_scores',
            dataset=BQ_DATASET,
            project=PROJECT_ID,
            create_disposition='CREATE_IF_NEEDED',
            write_disposition='WRITE_APPEND',
            batch_size=20))
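# emit_values, entity_map, EntityScoreCombine, format_for_write and
# AddWindowTimestampFn are defined elsewhere in this module. Below is a minimal
# sketch of what AddWindowTimestampFn might look like, assuming each element
# reaching it is an (entity, score) pair; it is an illustration under those
# assumptions, not the actual implementation.
import apache_beam as beam


class AddWindowTimestampFn(beam.DoFn):
  """Attaches the start of the enclosing fixed window to each element."""
  def process(self, element, window=beam.DoFn.WindowParam):
    entity, score = element
    # window.start is a Beam Timestamp; to_utc_datetime() is BigQuery-friendly.
    yield (entity, score, window.start.to_utc_datetime())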
def test(self):
  def format_record(record):
    import base64
    return base64.b64encode(record[1])

  def make_insert_mutations(element):
    import uuid  # pylint: disable=reimported
    from apache_beam.io.gcp.experimental.spannerio import WriteMutation
    ins_mutation = WriteMutation.insert(
        table='test',
        columns=('id', 'data'),
        values=[(str(uuid.uuid1()), element)])
    return [ins_mutation]

  (  # pylint: disable=expression-not-assigned
      self.pipeline
      | 'Produce rows' >> Read(
          SyntheticSource(self.parse_synthetic_source_options()))
      | 'Count messages' >> ParDo(CountMessages(self.metrics_namespace))
      | 'Format' >> Map(format_record)
      | 'Make mutations' >> FlatMap(make_insert_mutations)
      | 'Measure time' >> ParDo(MeasureTime(self.metrics_namespace))
      | 'Write to Spanner' >> WriteToSpanner(
          project_id=self.project,
          instance_id=self.spanner_instance,
          database_id=self.TEST_DATABASE,
          max_batch_size_bytes=5120))
def _create_input_data(self):
  """
  Runs an additional pipeline which creates test data and waits for its
  completion.
  """
  def format_record(record):
    import base64
    return base64.b64encode(record[1])

  def make_insert_mutations(element):
    import uuid
    from apache_beam.io.gcp.experimental.spannerio import WriteMutation
    ins_mutation = WriteMutation.insert(
        table='test_data',
        columns=('id', 'data'),
        values=[(str(uuid.uuid1()), element)])
    return [ins_mutation]

  with TestPipeline() as p:
    (  # pylint: disable=expression-not-assigned
        p
        | 'Produce rows' >> Read(
            SyntheticSource(self.parse_synthetic_source_options()))
        | 'Format' >> Map(format_record)
        | 'Make mutations' >> FlatMap(make_insert_mutations)
        | 'Write to Spanner' >> WriteToSpanner(
            project_id=self.project,
            instance_id=self.spanner_instance,
            database_id=self.spanner_database,
            max_batch_size_bytes=5120))
def _verify_data(self, pcol, init_size, data_size):
  read = pcol | 'read' >> ReadAllFromParquet()
  v1 = (
      read
      | 'get_number' >> Map(lambda x: x['number'])
      | 'sum_globally' >> CombineGlobally(sum)
      | 'validate_number' >> FlatMap(
          lambda x: TestParquetIT._sum_verifier(init_size, data_size, x)))
  v2 = (
      read
      | 'make_pair' >> Map(lambda x: (x['name'], x['number']))
      | 'count_per_key' >> Count.PerKey()
      | 'validate_name' >> FlatMap(
          lambda x: TestParquetIT._count_verifier(init_size, data_size, x)))
  _ = ((v1, v2, pcol)
       | 'flatten' >> Flatten()
       | 'reshuffle' >> Reshuffle()
       | 'cleanup' >> Map(lambda x: FileSystems.delete([x])))
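# TestParquetIT._sum_verifier and TestParquetIT._count_verifier are static
# helpers that this excerpt does not include. A hedged sketch of the
# FlatMap-style contract they appear to follow: yield nothing on success and a
# failure marker on mismatch (the expected-value formula below is illustrative
# only, not the real check).
import logging


def _sum_verifier_sketch(init_size, data_size, actual_sum):
  expected = sum(range(init_size)) * data_size  # assumption, for illustration
  if actual_sum != expected:
    logging.error('Unexpected sum: got %s, expected %s', actual_sum, expected)
    yield 'sum mismatch'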
def main():
  options = PipelineOptions()
  options.view_as(SetupOptions).save_main_session = True
  p = Pipeline(options=options)
  (
      p
      | Create(list(range(NUM_SHARDS)))
      | FlatMap(
          lambda _: (
              bytes(ELEMENT_BYTES) for _ in range(NUM_ELEMENTS_PER_SHARD)))
      | WithKeys('')
      | ParDo(BigBagDoFn()))
  p.run()
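# BigBagDoFn is not defined in this excerpt. A plausible sketch, assuming the
# goal is to stress bag state by accumulating every payload written under the
# single key produced by WithKeys('') above; names and behavior here are
# illustrative only, not the actual implementation.
import apache_beam as beam
from apache_beam.coders import coders
from apache_beam.transforms.userstate import BagStateSpec


class BigBagDoFn(beam.DoFn):
  """Adds each element's payload to a per-key bag state."""
  BAG_STATE = BagStateSpec('big_bag', coders.BytesCoder())

  def process(self, element, bag=beam.DoFn.StateParam(BAG_STATE)):
    _, payload = element
    bag.add(payload)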
def expand(self, xs):
  return (
      xs
      | Map(self.tag_with_id)
      | "Group by id" >> GroupByKey()
      | Map(self.sort_by_time)
      | FlatMap(self.compute_encounters))
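# The helpers used by expand() above belong to the enclosing PTransform and are
# not shown in this excerpt. A rough skeleton of those helpers, under the
# assumption that each input record exposes an id and a timestamp; the field
# names and the pairing logic are illustrative only, not the actual
# implementation.
import apache_beam as beam


class ComputeEncountersSketch(beam.PTransform):
  # expand() above would sit on a class like this one.

  def tag_with_id(self, record):
    # Key each record by its id so GroupByKey gathers one id's records together.
    return (record.id, record)

  def sort_by_time(self, keyed):
    record_id, records = keyed
    return (record_id, sorted(records, key=lambda r: r.timestamp))

  def compute_encounters(self, keyed):
    # Placeholder: treat each pair of consecutive, time-ordered records as one
    # "encounter"; the real logic presumably applies a proximity or time window
    # condition.
    _, records = keyed
    for first, second in zip(records, records[1:]):
      yield (first, second)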