def _check_worker_count(self, pipeline_options, expected=0, exception=False):
  if exception:
    self.assertRaises(
        Exception,
        sdk_worker_main._get_worker_count,
        PipelineOptions.from_dictionary(json.loads(pipeline_options)))
  else:
    self.assertEqual(
        sdk_worker_main._get_worker_count(
            PipelineOptions.from_dictionary(json.loads(pipeline_options))),
        expected)
def _parse_pipeline_options(options_json):
  options = json.loads(options_json)
  # Check the options field first for backward compatibility.
  if 'options' in options:
    return PipelineOptions.from_dictionary(options.get('options'))
  else:
    # Remove extra urn part from the key.
    portable_option_regex = r'^beam:option:(?P<key>.*):v1$'
    return PipelineOptions.from_dictionary({
        re.match(portable_option_regex, k).group('key')
        if re.match(portable_option_regex, k) else k: v
        for k, v in options.items()
    })
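A minimal usage sketch (not taken from any of the files above) of the two payload shapes `_parse_pipeline_options` accepts; the option name and value are made up for illustration, assuming `job_name` is a registered pipeline option.

import json

# Hypothetical payloads: a legacy 'options'-wrapped dict and a portable,
# urn-prefixed dict. Both should parse to equivalent PipelineOptions.
wrapped = json.dumps({'options': {'job_name': 'example-job'}})
urn_prefixed = json.dumps({'beam:option:job_name:v1': 'example-job'})

print(_parse_pipeline_options(wrapped).get_all_options(drop_default=True))
print(_parse_pipeline_options(urn_prefixed).get_all_options(drop_default=True))
# Expected (assumption): both print a dict containing {'job_name': 'example-job'},
# because the 'beam:option:' prefix and ':v1' suffix are stripped from the key.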
def main(unused_argv):
  """Main entry point for SDK Fn Harness."""
  if 'LOGGING_API_SERVICE_DESCRIPTOR' in os.environ:
    logging_service_descriptor = endpoints_pb2.ApiServiceDescriptor()
    text_format.Merge(os.environ['LOGGING_API_SERVICE_DESCRIPTOR'],
                      logging_service_descriptor)

    # Send all logs to the runner.
    fn_log_handler = FnApiLogRecordHandler(logging_service_descriptor)
    # TODO(BEAM-5468): This should be picked up from pipeline options.
    logging.getLogger().setLevel(logging.INFO)
    logging.getLogger().addHandler(fn_log_handler)
    logging.info('Logging handler created.')
  else:
    fn_log_handler = None

  # Start status HTTP server thread.
  thread = threading.Thread(target=StatusServer().start)
  thread.daemon = True
  thread.setName('status-server-demon')
  thread.start()

  if 'PIPELINE_OPTIONS' in os.environ:
    sdk_pipeline_options = _parse_pipeline_options(
        os.environ['PIPELINE_OPTIONS'])
  else:
    sdk_pipeline_options = PipelineOptions.from_dictionary({})

  if 'SEMI_PERSISTENT_DIRECTORY' in os.environ:
    semi_persistent_directory = os.environ['SEMI_PERSISTENT_DIRECTORY']
  else:
    semi_persistent_directory = None
  logging.info('semi_persistent_directory: %s', semi_persistent_directory)

  try:
    _load_main_session(semi_persistent_directory)
  except Exception:  # pylint: disable=broad-except
    exception_details = traceback.format_exc()
    logging.error(
        'Could not load main session: %s', exception_details, exc_info=True)

  try:
    logging.info('Python sdk harness started with pipeline_options: %s',
                 sdk_pipeline_options.get_all_options(drop_default=True))
    service_descriptor = endpoints_pb2.ApiServiceDescriptor()
    text_format.Merge(os.environ['CONTROL_API_SERVICE_DESCRIPTOR'],
                      service_descriptor)
    # TODO(robertwb): Support credentials.
    assert not service_descriptor.oauth2_client_credentials_grant.url
    SdkHarness(
        control_address=service_descriptor.url,
        worker_count=_get_worker_count(sdk_pipeline_options)).run()
    logging.info('Python sdk harness exiting.')
  except:  # pylint: disable=broad-except
    logging.exception('Python sdk harness failed: ')
    raise
  finally:
    if fn_log_handler:
      fn_log_handler.close()
def test__create_default_environment(self):
  docker_image = PortableRunner.default_docker_image()
  self.assertEqual(
      PortableRunner._create_environment(PipelineOptions.from_dictionary({})),
      beam_runner_api_pb2.Environment(
          urn=common_urns.environments.DOCKER.urn,
          payload=beam_runner_api_pb2.DockerPayload(
              container_image=docker_image).SerializeToString()))
def test_option_with_space(self):
  options = PipelineOptions(flags=['--option with space= value with space'])
  self.assertEqual(
      getattr(options.view_as(PipelineOptionsTest.MockOptions),
              'option with space'),
      ' value with space')
  options_from_dict = PipelineOptions.from_dictionary(
      options.get_all_options())
  self.assertEqual(
      getattr(options_from_dict.view_as(PipelineOptionsTest.MockOptions),
              'option with space'),
      ' value with space')
def test_from_dictionary(self):
  for case in PipelineOptionsTest.TEST_CASES:
    options = PipelineOptions(flags=case['flags'])
    all_options_dict = options.get_all_options()
    options_from_dict = PipelineOptions.from_dictionary(all_options_dict)
    self.assertEqual(
        options_from_dict.view_as(PipelineOptionsTest.MockOptions).mock_flag,
        case['expected']['mock_flag'])
    self.assertEqual(
        options.view_as(PipelineOptionsTest.MockOptions).mock_option,
        case['expected']['mock_option'])
def test__create_docker_environment(self):
  docker_image = 'py-docker'
  self.assertEqual(
      PortableRunner._create_environment(
          PipelineOptions.from_dictionary({
              'environment_type': 'DOCKER',
              'environment_config': docker_image,
          })),
      beam_runner_api_pb2.Environment(
          urn=common_urns.environments.DOCKER.urn,
          payload=beam_runner_api_pb2.DockerPayload(
              container_image=docker_image).SerializeToString()))
def create_options(self):
  def get_pipeline_name():
    for _, _, _, method_name, _, _ in inspect.stack():
      if method_name.find('test') != -1:
        return method_name
    return 'unknown_test'

  # Set the job name for better debugging.
  options = PipelineOptions.from_dictionary({
      'job_name': get_pipeline_name() + '_' + str(time.time())
  })
  options.view_as(PortableOptions).job_endpoint = self._get_job_endpoint()
  return options
def test__create_process_environment(self):
  self.assertEqual(
      PortableRunner._create_environment(
          PipelineOptions.from_dictionary({
              'environment_type': 'PROCESS',
              'environment_config': '{"os": "linux", "arch": "amd64", '
                                    '"command": "run.sh", '
                                    '"env":{"k1": "v1"} }',
          })),
      beam_runner_api_pb2.Environment(
          urn=common_urns.environments.PROCESS.urn,
          payload=beam_runner_api_pb2.ProcessPayload(
              os='linux',
              arch='amd64',
              command='run.sh',
              env={'k1': 'v1'},
          ).SerializeToString()))
  self.assertEqual(
      PortableRunner._create_environment(
          PipelineOptions.from_dictionary({
              'environment_type': 'PROCESS',
              'environment_config': '{"command": "run.sh"}',
          })),
      beam_runner_api_pb2.Environment(
          urn=common_urns.environments.PROCESS.urn,
          payload=beam_runner_api_pb2.ProcessPayload(
              command='run.sh',
          ).SerializeToString()))
def create_options(self):
  def get_pipeline_name():
    for _, _, _, method_name, _, _ in inspect.stack():
      if method_name.find('test') != -1:
        return method_name
    return 'unknown_test'

  # Set the job name for better debugging.
  options = PipelineOptions.from_dictionary({
      'job_name': get_pipeline_name() + '_' + str(time.time())
  })
  options.view_as(PortableOptions).job_endpoint = self._get_job_endpoint()
  # Override the default environment type for testing.
  options.view_as(PortableOptions).environment_type = (
      python_urns.EMBEDDED_PYTHON)
  return options
def main(unused_argv):
  """Main entry point for SDK Fn Harness."""
  if 'LOGGING_API_SERVICE_DESCRIPTOR' in os.environ:
    try:
      logging_service_descriptor = endpoints_pb2.ApiServiceDescriptor()
      text_format.Merge(os.environ['LOGGING_API_SERVICE_DESCRIPTOR'],
                        logging_service_descriptor)

      # Send all logs to the runner.
      fn_log_handler = FnApiLogRecordHandler(logging_service_descriptor)
      # TODO(BEAM-5468): This should be picked up from pipeline options.
      logging.getLogger().setLevel(logging.INFO)
      logging.getLogger().addHandler(fn_log_handler)
      logging.info('Logging handler created.')
    except Exception:
      logging.error(
          'Failed to set up logging handler, continuing without.',
          exc_info=True)
      fn_log_handler = None
  else:
    fn_log_handler = None

  # Start status HTTP server thread.
  thread = threading.Thread(name='status_http_server',
                            target=StatusServer().start)
  thread.daemon = True
  thread.setName('status-server-demon')
  thread.start()

  if 'PIPELINE_OPTIONS' in os.environ:
    sdk_pipeline_options = _parse_pipeline_options(
        os.environ['PIPELINE_OPTIONS'])
  else:
    sdk_pipeline_options = PipelineOptions.from_dictionary({})

  if 'SEMI_PERSISTENT_DIRECTORY' in os.environ:
    semi_persistent_directory = os.environ['SEMI_PERSISTENT_DIRECTORY']
  else:
    semi_persistent_directory = None
  logging.info('semi_persistent_directory: %s', semi_persistent_directory)

  _worker_id = os.environ.get('WORKER_ID', None)

  try:
    _load_main_session(semi_persistent_directory)
  except Exception:  # pylint: disable=broad-except
    exception_details = traceback.format_exc()
    logging.error('Could not load main session: %s', exception_details,
                  exc_info=True)

  try:
    logging.info('Python sdk harness started with pipeline_options: %s',
                 sdk_pipeline_options.get_all_options(drop_default=True))
    service_descriptor = endpoints_pb2.ApiServiceDescriptor()
    text_format.Merge(os.environ['CONTROL_API_SERVICE_DESCRIPTOR'],
                      service_descriptor)
    # TODO(robertwb): Support credentials.
    assert not service_descriptor.oauth2_client_credentials_grant.url
    SdkHarness(
        control_address=service_descriptor.url,
        worker_count=_get_worker_count(sdk_pipeline_options),
        worker_id=_worker_id,
        profiler_factory=profiler.Profile.factory_from_options(
            sdk_pipeline_options.view_as(ProfilingOptions))).run()
    logging.info('Python sdk harness exiting.')
  except:  # pylint: disable=broad-except
    logging.exception('Python sdk harness failed: ')
    raise
  finally:
    if fn_log_handler:
      fn_log_handler.close()
def run(in_args=None):
  """Runs the pre-processing pipeline."""
  pipeline_options = PipelineOptions.from_dictionary(vars(in_args))
  with beam.Pipeline(options=pipeline_options) as p:
    configure_pipeline(p, in_args)
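A hedged sketch of how a function like `run` above might be driven from argparse; the flag names (`--project`, `--runner`) are illustrative assumptions, not taken from the source.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--project')  # hypothetical flag, for illustration only
parser.add_argument('--runner', default='DirectRunner')
in_args = parser.parse_args(['--project', 'my-project'])

# vars() turns the argparse Namespace into the plain dict that
# PipelineOptions.from_dictionary expects.
print(vars(in_args))  # {'project': 'my-project', 'runner': 'DirectRunner'}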
def run():
  client_bq = bigquery.Client.from_service_account_json(
      os.environ['GOOGLE_APPLICATION_CREDENTIALS'], location=args.location)
  bigquery_asset_list = [
      # (dataset, table_name, table_schema, table_partition_column)
      ('logs', f'native_events_{args.environment}', 'logs', 'event_ds'),
      ('logs', f'native_events_debug_{args.environment}', 'logs', 'event_ds'),
      ('logs', f'dataflow_backfill_{args.environment}', 'logs', 'event_ds'),
      ('native', f'events_{args.event_schema}_{args.environment}',
       args.event_schema, 'event_timestamp')
  ]
  try:
    source_bigquery_assets(client_bq, bigquery_asset_list)
  except Exception:
    generate_bigquery_assets(client_bq, bigquery_asset_list)

  # https://github.com/apache/beam/blob/master/sdks/python/apache_beam/options/pipeline_options.py
  po, event_category = PipelineOptions(), args.event_category.replace('_', '-')
  job_name = (f'p1-gcs-to-bq-backfill-{args.event_schema}-{event_category}-'
              f'{args.event_ds_start}-to-{args.event_ds_stop}-'
              f'{time_part_name}-{int(time.time())}')

  # https://cloud.google.com/dataflow/docs/guides/specifying-exec-params
  pipeline_options = po.from_dictionary({
      'project': args.gcp,
      'staging_location': f'gs://{args.bucket_name}/data_type=dataflow/batch/staging/{job_name}/',
      'temp_location': f'gs://{args.bucket_name}/data_type=dataflow/batch/temp/{job_name}/',
      'runner': args.execution_environment,  # {DirectRunner, DataflowRunner}
      'setup_file': args.setup_file,
      'service_account_email': f'dataflow-batch-{args.environment}@{args.gcp}.iam.gserviceaccount.com',
      'job_name': job_name,
      'region': args.gcp_region
  })
  pipeline_options.view_as(SetupOptions).save_main_session = True

  p1 = beam.Pipeline(options=pipeline_options)
  fileListGcs = (p1
                 | 'CreateGcsIterators' >> beam.Create(
                     list(
                         generate_gcs_file_list(
                             args.bucket_name, args.event_schema,
                             args.event_environment, category_list,
                             args.event_ds_start, args.event_ds_stop,
                             time_part_list, args.scale_test_name)))
                 | 'GetGcsFileList' >> beam.ParDo(GetGcsFileList())
                 | 'GcsListPairWithOne' >> beam.Map(lambda x: (x, 1)))

  fileListBq = (p1
                | 'ParseBqFileList' >> beam.io.Read(
                    beam.io.BigQuerySource(  # "What is already in BQ?"
                        query=generate_backfill_query(
                            args.gcp, args.environment, args.event_schema,
                            args.event_environment,
                            (safe_convert_list_to_sql_tuple(category_list),
                             category_name),
                            args.event_ds_start, args.event_ds_stop,
                            (safe_convert_list_to_sql_tuple(time_part_list),
                             time_part_name),
                            args.scale_test_name),
                        use_standard_sql=True))
                | 'BqListPairWithOne' >> beam.Map(lambda x: (x['gspath'], 1)))

  parseList = ({'fileListGcs': fileListGcs, 'fileListBq': fileListBq}
               | 'CoGroupByKey' >> beam.CoGroupByKey()
               | 'UnionMinusIntersect' >> beam.Filter(
                   lambda x: (len(x[1]['fileListGcs']) == 1 and
                              len(x[1]['fileListBq']) == 0))
               | 'ExtractKeysParseList' >> beam.Map(lambda x: x[0]))

  # Write to BigQuery:
  logsList = (parseList
              | 'AddParseInitiatedInfo' >> beam.Map(
                  lambda gspath: {
                      'job_name': job_name,
                      'processed_timestamp': time.time(),
                      'batch_id': hashlib.md5(gspath.encode('utf-8')).hexdigest(),
                      'event_schema': parse_gspath(gspath, 'event_schema='),
                      'event_category': parse_gspath(gspath, 'event_category='),
                      'event_environment': parse_gspath(gspath, 'event_environment='),
                      'event_ds': parse_gspath(gspath, 'event_ds='),
                      'event_time': parse_gspath(gspath, 'event_time='),
                      'event': 'parse_initiated',
                      'gspath': gspath
                  })
              | 'WriteParseInitiated' >> beam.io.WriteToBigQuery(
                  table=f'dataflow_backfill_{args.environment}',
                  dataset='logs',
                  project=args.gcp,
                  method='FILE_LOADS',
                  create_disposition=beam.io.gcp.bigquery.BigQueryDisposition.CREATE_IF_NEEDED,
                  write_disposition=beam.io.gcp.bigquery.BigQueryDisposition.WRITE_APPEND,
                  insert_retry_strategy=beam.io.gcp.bigquery_tools.RetryStrategy.RETRY_ON_TRANSIENT_ERROR,
                  schema=('job_name:STRING,processed_timestamp:TIMESTAMP,'
                          'batch_id:STRING,event_schema:STRING,'
                          'event_environment:STRING,event_category:STRING,'
                          'event_ds:DATE,event_time:STRING,'
                          'event:STRING,gspath:STRING')))

  # Write to Pub/Sub:
  PDone = (parseList
           | 'DumpParseListPubSub' >> beam.io.WriteToText(
               f'gs://{args.bucket_name}/data_type=dataflow/batch/output/{job_name}/parselist')
           | 'WriteToPubSub' >> beam.ParDo(WriteToPubSub(), job_name,
                                           args.topic, args.gcp,
                                           args.bucket_name))

  p1.run().wait_until_finish()
  return job_name
EntityWrapper("", "processedTweets", "root").make_entity) | 'processed tweet write' >> WriteToDatastore(project)) # Actually run the pipeline. return p.run() if __name__ == '__main__': PROJECT = os.environ['PROJECT'] BUCKET = os.environ['BUCKET'] pipeline_options = { 'project': PROJECT, 'staging_location': 'gs://' + BUCKET + '/staging', 'runner': 'direct', 'setup_file': './setup.py', 'job_name': PROJECT + '-twcount', 'temp_location': 'gs://' + BUCKET + '/temp', 'template_location': 'gs://' + BUCKET + '/templates/' + PROJECT + '-twproc_tmpl' } # define and launch the pipeline (non-blocking), which will create the template. pipeline_options = PipelineOptions.from_dictionary(pipeline_options) process_datastore_tweets(PROJECT, pipeline_options)
def test__create_default_environment(self):
  docker_image = environments.DockerEnvironment.default_docker_image()
  self.assertEqual(
      PortableRunner._create_environment(
          PipelineOptions.from_dictionary({'sdk_location': 'container'})),
      environments.DockerEnvironment(container_image=docker_image))
def create_harness(environment, dry_run=False):
  """Creates SDK Fn Harness."""
  if 'LOGGING_API_SERVICE_DESCRIPTOR' in environment:
    try:
      logging_service_descriptor = endpoints_pb2.ApiServiceDescriptor()
      text_format.Merge(environment['LOGGING_API_SERVICE_DESCRIPTOR'],
                        logging_service_descriptor)

      # Send all logs to the runner.
      fn_log_handler = FnApiLogRecordHandler(logging_service_descriptor)
      # TODO(BEAM-5468): This should be picked up from pipeline options.
      logging.getLogger().setLevel(logging.INFO)
      logging.getLogger().addHandler(fn_log_handler)
      _LOGGER.info('Logging handler created.')
    except Exception:
      _LOGGER.error(
          'Failed to set up logging handler, continuing without.',
          exc_info=True)
      fn_log_handler = None
  else:
    fn_log_handler = None

  pipeline_options_dict = _load_pipeline_options(
      environment.get('PIPELINE_OPTIONS'))
  # These are used for dataflow templates.
  RuntimeValueProvider.set_runtime_options(pipeline_options_dict)
  sdk_pipeline_options = PipelineOptions.from_dictionary(pipeline_options_dict)
  filesystems.FileSystems.set_options(sdk_pipeline_options)
  pickler.set_library(
      sdk_pipeline_options.view_as(SetupOptions).pickle_library)

  if 'SEMI_PERSISTENT_DIRECTORY' in environment:
    semi_persistent_directory = environment['SEMI_PERSISTENT_DIRECTORY']
  else:
    semi_persistent_directory = None
  _LOGGER.info('semi_persistent_directory: %s', semi_persistent_directory)

  _worker_id = environment.get('WORKER_ID', None)

  try:
    _load_main_session(semi_persistent_directory)
  except CorruptMainSessionException:
    exception_details = traceback.format_exc()
    _LOGGER.error('Could not load main session: %s', exception_details,
                  exc_info=True)
    raise
  except Exception:  # pylint: disable=broad-except
    exception_details = traceback.format_exc()
    _LOGGER.error('Could not load main session: %s', exception_details,
                  exc_info=True)

  _LOGGER.info('Pipeline_options: %s',
               sdk_pipeline_options.get_all_options(drop_default=True))
  control_service_descriptor = endpoints_pb2.ApiServiceDescriptor()
  status_service_descriptor = endpoints_pb2.ApiServiceDescriptor()
  text_format.Merge(environment['CONTROL_API_SERVICE_DESCRIPTOR'],
                    control_service_descriptor)
  if 'STATUS_API_SERVICE_DESCRIPTOR' in environment:
    text_format.Merge(environment['STATUS_API_SERVICE_DESCRIPTOR'],
                      status_service_descriptor)
  # TODO(robertwb): Support authentication.
  assert not control_service_descriptor.HasField('authentication')

  experiments = sdk_pipeline_options.view_as(DebugOptions).experiments or []
  enable_heap_dump = 'enable_heap_dump' in experiments
  if dry_run:
    return

  sdk_harness = SdkHarness(
      control_address=control_service_descriptor.url,
      status_address=status_service_descriptor.url,
      worker_id=_worker_id,
      state_cache_size=_get_state_cache_size(experiments),
      data_buffer_time_limit_ms=_get_data_buffer_time_limit_ms(experiments),
      profiler_factory=profiler.Profile.factory_from_options(
          sdk_pipeline_options.view_as(ProfilingOptions)),
      enable_heap_dump=enable_heap_dump)

  return fn_log_handler, sdk_harness, sdk_pipeline_options
def _parse_pipeline_options(options_json):
  return PipelineOptions.from_dictionary(
      _load_pipeline_options(options_json))