Example no. 1
# Imports shared by the snippets below; `preprocess` and the various
# parse/config helpers are module-local.
import os
import sys

import apache_beam as beam
def main():
    """Configures pipeline and spawns preprocessing job."""

    args = _parse_arguments(sys.argv)
    config_path = os.path.abspath(
        os.path.join(__file__, os.pardir, 'preprocessing_config.ini'))
    config = _parse_config('CLOUD' if args.cloud else 'LOCAL', config_path)
    ml_project = args.project_id
    options = {'project': ml_project}

    if args.cloud:
        if not args.job_name:
            raise ValueError('Job name must be specified for cloud runs.')
        options.update({
            'job_name': args.job_name,
            'num_workers': int(config.get('num_workers')),
            'max_num_workers': int(config.get('max_num_workers')),
            'staging_location': os.path.join(args.job_dir, 'staging'),
            'temp_location': os.path.join(args.job_dir, 'tmp'),
            'region': config.get('region'),
            'setup_file': os.path.abspath(
                os.path.join(__file__, '../..', 'dataflow_setup.py')),
        })
    pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
    _set_logging(config.get('log_level'))

    with beam.Pipeline(config.get('runner'), options=pipeline_options) as p:
        preprocess.run(p, args.input_data, args.job_dir)
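
# A minimal sketch of the `_parse_arguments` helper the snippet above assumes.
# The flag names are inferred from the attributes read off `args`; the real
# parser may define more options.
import argparse

def _parse_arguments(argv):
    """Parses the command-line flags used by main()."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--cloud', action='store_true',
                        help='Run the job on Google Cloud Dataflow.')
    parser.add_argument('--job_name', help='Dataflow job name (cloud runs only).')
    parser.add_argument('--project_id', help='GCP project id.')
    parser.add_argument('--job_dir', help='Directory for staging, temp and output.')
    parser.add_argument('--input_data', help='Path to the input data.')
    return parser.parse_args(argv[1:])
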
def main():
  """Configures pipeline and spawns preprocessing job."""

  args = _parse_arguments(sys.argv)
  config_path = os.path.abspath(
      os.path.join(__file__, os.pardir, 'preprocessing_config.ini'))
  config = _parse_config('CLOUD' if args.cloud else 'LOCAL',
                         config_path)
  ml_project = args.project_id
  options = {'project': ml_project}

  if args.cloud:
    if not args.job_name:
      raise ValueError('Job name must be specified for cloud runs.')
    options.update({
        'job_name': args.job_name,
        'num_workers': int(config.get('num_workers')),
        'max_num_workers': int(config.get('max_num_workers')),
        'staging_location': os.path.join(args.job_dir, 'staging'),
        'temp_location': os.path.join(args.job_dir, 'tmp'),
        'region': config.get('region'),
        'setup_file': os.path.abspath(
            os.path.join(__file__, '../..', 'dataflow_setup.py')),
    })
  pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
  _set_logging(config.get('log_level'))

  with beam.Pipeline(
      config.get('runner'), options=pipeline_options) as pipeline:
    preprocess.run(pipeline, args.input_data, args.job_dir)
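
# A possible `_parse_config` implementation, assuming the INI file defines
# [CLOUD] and [LOCAL] sections (hypothetical, shown only for context); a
# configparser section supports the `config.get(...)` calls used above.
import configparser

def _parse_config(env, config_path):
    """Returns the config section for the given environment."""
    config = configparser.ConfigParser()
    config.read(config_path)
    return config[env]
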
def main():
    """Configures and runs a pipeline."""
    args = parse_arguments(sys.argv)
    config = parse_config("CLOUD" if args.cloud else "LOCAL",
                          get_relative_path("config.ini"))
    set_logging(config.get("log_level"))
    options = get_pipeline_options(args, config)
    runner = str(config.get("runner"))

    with beam.Pipeline(runner, options=options) as pipeline:
        with beam_impl.Context(
                temp_dir=os.path.join(args.tft_dir, constants.TMP_DIR)):
            preprocess.run(pipeline, args)
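
# Extra imports this variant assumes: the `beam_impl` alias follows the usual
# tensorflow_transform convention, while `constants` is a project-local module
# defining names such as TMP_DIR.
#
#   import tensorflow_transform.beam.impl as beam_impl
#
# Wrapping the pipeline in `beam_impl.Context(temp_dir=...)` tells
# tf.Transform where to write temporary artifacts during analysis.
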
def run(params):
    """Sets and runs Beam preprocessing pipeline.

    Args:
      params: Object holding a set of parameters as name-value pairs.

    Raises:
      ValueError: If `gcp` argument is `True` and `project_id` or `job_name`
        are not specified.
    """

    options = {}
    if params.gcp:
        if not params.project_id or not params.job_name:
            raise ValueError(
                'project_id and job_name must be specified when gcp is True.')
        options = {
            'project': params.project_id,
            'job_name': params.job_name,
            'temp_location': os.path.join(params.output_dir, 'temp'),
            'staging_location': os.path.join(params.output_dir, 'staging'),
            'setup_file': os.path.abspath(
                os.path.join(os.path.dirname(__file__), 'setup.py')),
        }

        def _update(param_name):
            param_value = getattr(params, param_name)
            if param_value:
                options.update({param_name: param_value})

        _update('worker_machine_type')
        _update('num_workers')
        _update('region')

    pipeline_options = beam.pipeline.PipelineOptions(flags=[], **options)
    runner = _DATAFLOW_RUNNER if params.gcp else _DIRECT_RUNNER
    with beam.Pipeline(runner, options=pipeline_options) as p:
        preprocess.run(p=p, params=params)
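
# The runner constants referenced above are presumably the standard Beam
# runner names:
_DATAFLOW_RUNNER = 'DataflowRunner'
_DIRECT_RUNNER = 'DirectRunner'

# Example invocation with a hypothetical parameter object; any object exposing
# these attributes (e.g. an argparse.Namespace) would work:
import argparse

params = argparse.Namespace(
    gcp=False,
    project_id=None,
    job_name=None,
    output_dir='/tmp/preprocess_output',
    worker_machine_type=None,
    num_workers=None,
    region=None,
)
run(params)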