  def _check_worker_count(self, pipeline_options, expected=0, exception=False):
    if exception:
      self.assertRaises(
          Exception, sdk_worker_main._get_worker_count,
          PipelineOptions.from_dictionary(json.loads(pipeline_options)))
    else:
      self.assertEqual(
          sdk_worker_main._get_worker_count(
              PipelineOptions.from_dictionary(json.loads(pipeline_options))),
          expected)
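  # A hypothetical companion test (illustrative payloads, not asserted Beam
  # defaults); it assumes _get_worker_count understands a "worker_threads=<n>"
  # experiment, and that _check_worker_count feeds the parsed JSON straight
  # into PipelineOptions.from_dictionary, so the keys are plain option names.
  def test_worker_count_from_experiments(self):
    self._check_worker_count(
        '{"experiments": ["worker_threads=5"]}', expected=5)
    self._check_worker_count(
        '{"experiments": ["worker_threads=not_a_number"]}', exception=True)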
def _parse_pipeline_options(options_json):
  options = json.loads(options_json)
  # Check the options field first for backward compatibility.
  if 'options' in options:
    return PipelineOptions.from_dictionary(options.get('options'))
  else:
    # Remove extra urn part from the key.
    portable_option_regex = r'^beam:option:(?P<key>.*):v1$'
    return PipelineOptions.from_dictionary({
        re.match(portable_option_regex, k).group('key')
        if re.match(portable_option_regex, k) else k: v
        for k, v in options.items()
    })
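# A minimal usage sketch for _parse_pipeline_options (option values made up):
# it accepts both the legacy {"options": {...}} wrapper and urn-style keys,
# reducing 'beam:option:job_name:v1' to 'job_name' before building the options.
legacy = _parse_pipeline_options('{"options": {"job_name": "demo-job"}}')
portable = _parse_pipeline_options('{"beam:option:job_name:v1": "demo-job"}')
assert (legacy.get_all_options()['job_name'] ==
        portable.get_all_options()['job_name'] == 'demo-job')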
def main(unused_argv):
  """Main entry point for SDK Fn Harness."""
  if 'LOGGING_API_SERVICE_DESCRIPTOR' in os.environ:
    logging_service_descriptor = endpoints_pb2.ApiServiceDescriptor()
    text_format.Merge(os.environ['LOGGING_API_SERVICE_DESCRIPTOR'],
                      logging_service_descriptor)

    # Send all logs to the runner.
    fn_log_handler = FnApiLogRecordHandler(logging_service_descriptor)
    # TODO(BEAM-5468): This should be picked up from pipeline options.
    logging.getLogger().setLevel(logging.INFO)
    logging.getLogger().addHandler(fn_log_handler)
    logging.info('Logging handler created.')
  else:
    fn_log_handler = None

  # Start status HTTP server thread.
  thread = threading.Thread(target=StatusServer().start)
  thread.daemon = True
  thread.setName('status-server-daemon')
  thread.start()

  if 'PIPELINE_OPTIONS' in os.environ:
    sdk_pipeline_options = _parse_pipeline_options(
        os.environ['PIPELINE_OPTIONS'])
  else:
    sdk_pipeline_options = PipelineOptions.from_dictionary({})

  if 'SEMI_PERSISTENT_DIRECTORY' in os.environ:
    semi_persistent_directory = os.environ['SEMI_PERSISTENT_DIRECTORY']
  else:
    semi_persistent_directory = None

  logging.info('semi_persistent_directory: %s', semi_persistent_directory)

  try:
    _load_main_session(semi_persistent_directory)
  except Exception:  # pylint: disable=broad-except
    exception_details = traceback.format_exc()
    logging.error(
        'Could not load main session: %s', exception_details, exc_info=True)

  try:
    logging.info('Python sdk harness started with pipeline_options: %s',
                 sdk_pipeline_options.get_all_options(drop_default=True))
    service_descriptor = endpoints_pb2.ApiServiceDescriptor()
    text_format.Merge(os.environ['CONTROL_API_SERVICE_DESCRIPTOR'],
                      service_descriptor)
    # TODO(robertwb): Support credentials.
    assert not service_descriptor.oauth2_client_credentials_grant.url
    SdkHarness(
        control_address=service_descriptor.url,
        worker_count=_get_worker_count(sdk_pipeline_options)).run()
    logging.info('Python sdk harness exiting.')
  except:  # pylint: disable=broad-except
    logging.exception('Python sdk harness failed: ')
    raise
  finally:
    if fn_log_handler:
      fn_log_handler.close()
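# For reference, a minimal sketch (endpoint address made up) of the text-format
# payload main() expects in the *_API_SERVICE_DESCRIPTOR environment variables;
# text_format.Merge fills the proto in place and the harness reads its url.
_example_descriptor = endpoints_pb2.ApiServiceDescriptor()
text_format.Merge('url: "localhost:50051"', _example_descriptor)
assert _example_descriptor.url == 'localhost:50051'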
  def test__create_default_environment(self):
    docker_image = PortableRunner.default_docker_image()
    self.assertEqual(
        PortableRunner._create_environment(PipelineOptions.from_dictionary({})),
        beam_runner_api_pb2.Environment(
            urn=common_urns.environments.DOCKER.urn,
            payload=beam_runner_api_pb2.DockerPayload(
                container_image=docker_image
            ).SerializeToString()))
  def test_option_with_space(self):
    options = PipelineOptions(flags=['--option with space= value with space'])
    self.assertEqual(
        getattr(options.view_as(PipelineOptionsTest.MockOptions),
                'option with space'), ' value with space')
    options_from_dict = PipelineOptions.from_dictionary(
        options.get_all_options())
    self.assertEqual(
        getattr(options_from_dict.view_as(PipelineOptionsTest.MockOptions),
                'option with space'), ' value with space')
  def test_from_dictionary(self):
    for case in PipelineOptionsTest.TEST_CASES:
      options = PipelineOptions(flags=case['flags'])
      all_options_dict = options.get_all_options()
      options_from_dict = PipelineOptions.from_dictionary(all_options_dict)
      self.assertEqual(options_from_dict.view_as(
          PipelineOptionsTest.MockOptions).mock_flag,
                       case['expected']['mock_flag'])
      self.assertEqual(options.view_as(
          PipelineOptionsTest.MockOptions).mock_option,
                       case['expected']['mock_option'])
  def test__create_docker_environment(self):
    docker_image = 'py-docker'
    self.assertEqual(
        PortableRunner._create_environment(PipelineOptions.from_dictionary({
            'environment_type': 'DOCKER',
            'environment_config': docker_image,
        })), beam_runner_api_pb2.Environment(
            urn=common_urns.environments.DOCKER.urn,
            payload=beam_runner_api_pb2.DockerPayload(
                container_image=docker_image
            ).SerializeToString()))
  def create_options(self):
    def get_pipeline_name():
      for _, _, _, method_name, _, _ in inspect.stack():
        if method_name.find('test') != -1:
          return method_name
      return 'unknown_test'

    # Set the job name for better debugging.
    options = PipelineOptions.from_dictionary({
        'job_name': get_pipeline_name() + '_' + str(time.time())
    })
    options.view_as(PortableOptions).job_endpoint = self._get_job_endpoint()
    return options
  def test__create_process_environment(self):
    self.assertEqual(
        PortableRunner._create_environment(PipelineOptions.from_dictionary({
            'environment_type': 'PROCESS',
            'environment_config': '{"os": "linux", "arch": "amd64", '
                                  '"command": "run.sh", '
                                  '"env":{"k1": "v1"} }',
        })), beam_runner_api_pb2.Environment(
            urn=common_urns.environments.PROCESS.urn,
            payload=beam_runner_api_pb2.ProcessPayload(
                os='linux',
                arch='amd64',
                command='run.sh',
                env={'k1': 'v1'},
            ).SerializeToString()))
    self.assertEqual(
        PortableRunner._create_environment(PipelineOptions.from_dictionary({
            'environment_type': 'PROCESS',
            'environment_config': '{"command": "run.sh"}',
        })), beam_runner_api_pb2.Environment(
            urn=common_urns.environments.PROCESS.urn,
            payload=beam_runner_api_pb2.ProcessPayload(
                command='run.sh',
            ).SerializeToString()))
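  # A hypothetical helper, not part of the original test: it sketches how the
  # JSON environment_config above maps one-to-one onto ProcessPayload fields,
  # which is the mapping the assertions above rely on.
  def _expected_process_payload(self):
    import json  # local import keeps this sketch self-contained
    config = json.loads('{"os": "linux", "arch": "amd64", '
                        '"command": "run.sh", "env": {"k1": "v1"}}')
    return beam_runner_api_pb2.ProcessPayload(
        os=config.get('os', ''),
        arch=config.get('arch', ''),
        command=config['command'],
        env=config.get('env', {}))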
Example #10
  def create_options(self):
    def get_pipeline_name():
      for _, _, _, method_name, _, _ in inspect.stack():
        if method_name.find('test') != -1:
          return method_name
      return 'unknown_test'

    # Set the job name for better debugging.
    options = PipelineOptions.from_dictionary({
        'job_name': get_pipeline_name() + '_' + str(time.time())
    })
    options.view_as(PortableOptions).job_endpoint = self._get_job_endpoint()
    # Override the default environment type for testing.
    options.view_as(PortableOptions).environment_type = (
        python_urns.EMBEDDED_PYTHON)
    return options
def main(unused_argv):
    """Main entry point for SDK Fn Harness."""
    if 'LOGGING_API_SERVICE_DESCRIPTOR' in os.environ:
        try:
            logging_service_descriptor = endpoints_pb2.ApiServiceDescriptor()
            text_format.Merge(os.environ['LOGGING_API_SERVICE_DESCRIPTOR'],
                              logging_service_descriptor)

            # Send all logs to the runner.
            fn_log_handler = FnApiLogRecordHandler(logging_service_descriptor)
            # TODO(BEAM-5468): This should be picked up from pipeline options.
            logging.getLogger().setLevel(logging.INFO)
            logging.getLogger().addHandler(fn_log_handler)
            logging.info('Logging handler created.')
        except Exception:
            logging.error(
                "Failed to set up logging handler, continuing without.",
                exc_info=True)
            fn_log_handler = None
    else:
        fn_log_handler = None

    # Start status HTTP server thread.
    thread = threading.Thread(name='status_http_server',
                              target=StatusServer().start)
    thread.daemon = True
    thread.start()

    if 'PIPELINE_OPTIONS' in os.environ:
        sdk_pipeline_options = _parse_pipeline_options(
            os.environ['PIPELINE_OPTIONS'])
    else:
        sdk_pipeline_options = PipelineOptions.from_dictionary({})

    if 'SEMI_PERSISTENT_DIRECTORY' in os.environ:
        semi_persistent_directory = os.environ['SEMI_PERSISTENT_DIRECTORY']
    else:
        semi_persistent_directory = None

    logging.info('semi_persistent_directory: %s', semi_persistent_directory)
    _worker_id = os.environ.get('WORKER_ID', None)

    try:
        _load_main_session(semi_persistent_directory)
    except Exception:  # pylint: disable=broad-except
        exception_details = traceback.format_exc()
        logging.error('Could not load main session: %s',
                      exception_details,
                      exc_info=True)

    try:
        logging.info('Python sdk harness started with pipeline_options: %s',
                     sdk_pipeline_options.get_all_options(drop_default=True))
        service_descriptor = endpoints_pb2.ApiServiceDescriptor()
        text_format.Merge(os.environ['CONTROL_API_SERVICE_DESCRIPTOR'],
                          service_descriptor)
        # TODO(robertwb): Support credentials.
        assert not service_descriptor.oauth2_client_credentials_grant.url
        SdkHarness(control_address=service_descriptor.url,
                   worker_count=_get_worker_count(sdk_pipeline_options),
                   worker_id=_worker_id,
                   profiler_factory=profiler.Profile.factory_from_options(
                       sdk_pipeline_options.view_as(ProfilingOptions))).run()
        logging.info('Python sdk harness exiting.')
    except:  # pylint: disable=broad-except
        logging.exception('Python sdk harness failed: ')
        raise
    finally:
        if fn_log_handler:
            fn_log_handler.close()
Example #12
def run(in_args=None):
  """Runs the pre-processing pipeline."""

  pipeline_options = PipelineOptions.from_dictionary(vars(in_args))
  with beam.Pipeline(options=pipeline_options) as p:
    configure_pipeline(p, in_args)
def run():

    client_bq = bigquery.Client.from_service_account_json(
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'], location=args.location)
    bigquery_asset_list = [
        # (dataset, table_name, table_schema, table_partition_column)
        ('logs', f'native_events_{args.environment}', 'logs', 'event_ds'),
        ('logs', f'native_events_debug_{args.environment}', 'logs',
         'event_ds'),
        ('logs', f'dataflow_backfill_{args.environment}', 'logs', 'event_ds'),
        ('native', f'events_{args.event_schema}_{args.environment}',
         args.event_schema, 'event_timestamp')
    ]
    try:
        source_bigquery_assets(client_bq, bigquery_asset_list)
    except Exception:
        generate_bigquery_assets(client_bq, bigquery_asset_list)

    # https://github.com/apache/beam/blob/master/sdks/python/apache_beam/options/pipeline_options.py
    event_category = args.event_category.replace('_', '-')
    job_name = f'p1-gcs-to-bq-backfill-{args.event_schema}-{event_category}-{args.event_ds_start}-to-{args.event_ds_stop}-{time_part_name}-{int(time.time())}'
    # https://cloud.google.com/dataflow/docs/guides/specifying-exec-params
    pipeline_options = PipelineOptions.from_dictionary({
        'project': args.gcp,
        'staging_location':
            f'gs://{args.bucket_name}/data_type=dataflow/batch/staging/{job_name}/',
        'temp_location':
            f'gs://{args.bucket_name}/data_type=dataflow/batch/temp/{job_name}/',
        'runner': args.execution_environment,  # {DirectRunner, DataflowRunner}
        'setup_file': args.setup_file,
        'service_account_email':
            f'dataflow-batch-{args.environment}@{args.gcp}.iam.gserviceaccount.com',
        'job_name': job_name,
        'region': args.gcp_region
    })
    pipeline_options.view_as(SetupOptions).save_main_session = True

    p1 = beam.Pipeline(options=pipeline_options)
    fileListGcs = (p1 | 'CreateGcsIterators' >> beam.Create(
        list(
            generate_gcs_file_list(args.bucket_name, args.event_schema,
                                   args.event_environment, category_list,
                                   args.event_ds_start, args.event_ds_stop,
                                   time_part_list, args.scale_test_name)))
                   | 'GetGcsFileList' >> beam.ParDo(GetGcsFileList())
                   | 'GcsListPairWithOne' >> beam.Map(lambda x: (x, 1)))

    fileListBq = (
        p1 | 'ParseBqFileList' >> beam.io.Read(
            beam.io.BigQuerySource(
                # "What is already in BQ?"
                query=generate_backfill_query(
                    args.gcp, args.environment, args.event_schema,
                    args.event_environment,
                    (safe_convert_list_to_sql_tuple(category_list),
                     category_name), args.event_ds_start, args.event_ds_stop,
                    (safe_convert_list_to_sql_tuple(time_part_list),
                     time_part_name), args.scale_test_name),
                use_standard_sql=True))
        | 'BqListPairWithOne' >> beam.Map(lambda x: (x['gspath'], 1)))

    parseList = ({
        'fileListGcs': fileListGcs,
        'fileListBq': fileListBq
    }
                 | 'CoGroupByKey' >> beam.CoGroupByKey()
                 | 'UnionMinusIntersect' >> beam.Filter(lambda x: (len(x[1][
                     'fileListGcs']) == 1 and len(x[1]['fileListBq']) == 0))
                 | 'ExtractKeysParseList' >> beam.Map(lambda x: x[0]))

    # Write to BigQuery:
    logsList = (
        parseList | 'AddParseInitiatedInfo' >> beam.Map(
            lambda gspath: {
                'job_name': job_name,
                'processed_timestamp': time.time(),
                'batch_id': hashlib.md5(gspath.encode('utf-8')).hexdigest(),
                'event_schema': parse_gspath(gspath, 'event_schema='),
                'event_category': parse_gspath(gspath, 'event_category='),
                'event_environment': parse_gspath(gspath, 'event_environment='
                                                  ),
                'event_ds': parse_gspath(gspath, 'event_ds='),
                'event_time': parse_gspath(gspath, 'event_time='),
                'event': 'parse_initiated',
                'gspath': gspath
            })
        | 'WriteParseInitiated' >> beam.io.WriteToBigQuery(
            table=f'dataflow_backfill_{args.environment}',
            dataset='logs',
            project=args.gcp,
            method='FILE_LOADS',
            create_disposition=beam.io.gcp.bigquery.BigQueryDisposition.
            CREATE_IF_NEEDED,
            write_disposition=beam.io.gcp.bigquery.BigQueryDisposition.
            WRITE_APPEND,
            insert_retry_strategy=beam.io.gcp.bigquery_tools.RetryStrategy.
            RETRY_ON_TRANSIENT_ERROR,
            schema=
            'job_name:STRING,processed_timestamp:TIMESTAMP,batch_id:STRING,event_schema:STRING,event_environment:STRING,event_category:STRING,event_ds:DATE,event_time:STRING,event:STRING,gspath:STRING'
        ))

    # Write to Pub/Sub:
    PDone = (parseList | 'DumpParseListPubSub' >> beam.io.WriteToText(
        f'gs://{args.bucket_name}/data_type=dataflow/batch/output/{job_name}/parselist'
    )
             | 'WriteToPubSub' >>
             beam.ParDo(WriteToPubSub(), job_name, args.topic, args.gcp,
                        args.bucket_name))

    p1.run().wait_until_finish()
    return job_name
Example #14
         EntityWrapper("", "processedTweets", "root").make_entity)
     | 'processed tweet write' >> WriteToDatastore(project))

    # Actually run the pipeline.
    return p.run()


if __name__ == '__main__':
    PROJECT = os.environ['PROJECT']
    BUCKET = os.environ['BUCKET']

    pipeline_options = {
        'project': PROJECT,
        'staging_location': 'gs://' + BUCKET + '/staging',
        'runner': 'direct',
        'setup_file': './setup.py',
        'job_name': PROJECT + '-twcount',
        'temp_location': 'gs://' + BUCKET + '/temp',
        'template_location':
            'gs://' + BUCKET + '/templates/' + PROJECT + '-twproc_tmpl'
    }
    # define and launch the pipeline (non-blocking), which will create the template.
    pipeline_options = PipelineOptions.from_dictionary(pipeline_options)
    process_datastore_tweets(PROJECT, pipeline_options)
Example #15
  def test__create_default_environment(self):
    docker_image = environments.DockerEnvironment.default_docker_image()
    self.assertEqual(
        PortableRunner._create_environment(
            PipelineOptions.from_dictionary({'sdk_location': 'container'})),
        environments.DockerEnvironment(container_image=docker_image))
Example #16
def create_harness(environment, dry_run=False):
    """Creates SDK Fn Harness."""
    if 'LOGGING_API_SERVICE_DESCRIPTOR' in environment:
        try:
            logging_service_descriptor = endpoints_pb2.ApiServiceDescriptor()
            text_format.Merge(environment['LOGGING_API_SERVICE_DESCRIPTOR'],
                              logging_service_descriptor)

            # Send all logs to the runner.
            fn_log_handler = FnApiLogRecordHandler(logging_service_descriptor)
            # TODO(BEAM-5468): This should be picked up from pipeline options.
            logging.getLogger().setLevel(logging.INFO)
            logging.getLogger().addHandler(fn_log_handler)
            _LOGGER.info('Logging handler created.')
        except Exception:
            _LOGGER.error(
                "Failed to set up logging handler, continuing without.",
                exc_info=True)
            fn_log_handler = None
    else:
        fn_log_handler = None

    pipeline_options_dict = _load_pipeline_options(
        environment.get('PIPELINE_OPTIONS'))
    # These are used for dataflow templates.
    RuntimeValueProvider.set_runtime_options(pipeline_options_dict)
    sdk_pipeline_options = PipelineOptions.from_dictionary(
        pipeline_options_dict)
    filesystems.FileSystems.set_options(sdk_pipeline_options)
    pickler.set_library(
        sdk_pipeline_options.view_as(SetupOptions).pickle_library)

    if 'SEMI_PERSISTENT_DIRECTORY' in environment:
        semi_persistent_directory = environment['SEMI_PERSISTENT_DIRECTORY']
    else:
        semi_persistent_directory = None

    _LOGGER.info('semi_persistent_directory: %s', semi_persistent_directory)
    _worker_id = environment.get('WORKER_ID', None)

    try:
        _load_main_session(semi_persistent_directory)
    except CorruptMainSessionException:
        exception_details = traceback.format_exc()
        _LOGGER.error('Could not load main session: %s',
                      exception_details,
                      exc_info=True)
        raise
    except Exception:  # pylint: disable=broad-except
        exception_details = traceback.format_exc()
        _LOGGER.error('Could not load main session: %s',
                      exception_details,
                      exc_info=True)

    _LOGGER.info('Pipeline_options: %s',
                 sdk_pipeline_options.get_all_options(drop_default=True))
    control_service_descriptor = endpoints_pb2.ApiServiceDescriptor()
    status_service_descriptor = endpoints_pb2.ApiServiceDescriptor()
    text_format.Merge(environment['CONTROL_API_SERVICE_DESCRIPTOR'],
                      control_service_descriptor)
    if 'STATUS_API_SERVICE_DESCRIPTOR' in environment:
        text_format.Merge(environment['STATUS_API_SERVICE_DESCRIPTOR'],
                          status_service_descriptor)
    # TODO(robertwb): Support authentication.
    assert not control_service_descriptor.HasField('authentication')

    experiments = sdk_pipeline_options.view_as(DebugOptions).experiments or []
    enable_heap_dump = 'enable_heap_dump' in experiments
    if dry_run:
        return
    sdk_harness = SdkHarness(
        control_address=control_service_descriptor.url,
        status_address=status_service_descriptor.url,
        worker_id=_worker_id,
        state_cache_size=_get_state_cache_size(experiments),
        data_buffer_time_limit_ms=_get_data_buffer_time_limit_ms(experiments),
        profiler_factory=profiler.Profile.factory_from_options(
            sdk_pipeline_options.view_as(ProfilingOptions)),
        enable_heap_dump=enable_heap_dump)
    return fn_log_handler, sdk_harness, sdk_pipeline_options
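# A hedged usage sketch (hypothetical driver, not part of the module above):
# build the harness from os.environ and close the log handler on exit, as the
# earlier main() examples in this listing do.
def _run_harness_example():
    fn_log_handler, sdk_harness, _ = create_harness(dict(os.environ))
    try:
        sdk_harness.run()
    finally:
        if fn_log_handler:
            fn_log_handler.close()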
Example #17
def _parse_pipeline_options(options_json):
    return PipelineOptions.from_dictionary(
        _load_pipeline_options(options_json))
Example #18
def run(in_args=None):
    """Runs the pre-processing pipeline."""

    pipeline_options = PipelineOptions.from_dictionary(vars(in_args))
    with beam.Pipeline(options=pipeline_options) as p:
        configure_pipeline(p, in_args)
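# A hypothetical way to invoke run(in_args) above: argparse flags become the
# dictionary that PipelineOptions.from_dictionary expects via vars(in_args).
# The flag names and defaults here are illustrative assumptions.
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--runner', default='DirectRunner')
    parser.add_argument('--temp_location', default='/tmp/preprocess')
    run(parser.parse_args())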