Example #1
def run(argv=None):
  known_args, pipeline_args = get_args(argv)
  options = PipelineOptions(pipeline_args)

  run_count1(known_args, options)
  run_count2(known_args, options)
  run_count3(known_args, options)
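Note: the get_args helper used above is not shown in these examples. A minimal sketch of what it typically looks like, using argparse.parse_known_args with hypothetical --input/--output flags, might be:

import argparse

def get_args(argv=None):
  # Hypothetical sketch: parse the flags this module cares about and hand
  # everything else back so it can be forwarded to PipelineOptions.
  parser = argparse.ArgumentParser()
  parser.add_argument('--input', required=True)
  parser.add_argument('--output', required=True)
  known_args, pipeline_args = parser.parse_known_args(argv)
  return known_args, pipeline_args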
    def test_with_setup_file(self):
        staging_dir = tempfile.mkdtemp()
        source_dir = tempfile.mkdtemp()
        self.create_temp_file(os.path.join(source_dir, 'setup.py'), 'notused')

        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).setup_file = os.path.join(
            source_dir, 'setup.py')

        self.assertEqual(
            [
                dependency.WORKFLOW_TARBALL_FILE,
                names.PICKLED_MAIN_SESSION_FILE
            ],
            dependency.stage_job_resources(
                options,
                # We replace the build setup command because a realistic one would
                # require the setuptools package to be installed. Note that we can't
                # use "touch" here to create the expected output tarball file, since
                # touch is not available on Windows, so we invoke python to produce
                # equivalent behavior.
                build_setup_args=[
                    'python', '-c', 'open(__import__("sys").argv[1], "a")',
                    os.path.join(source_dir, dependency.WORKFLOW_TARBALL_FILE)
                ],
                temp_dir=source_dir))
        self.assertTrue(
            os.path.isfile(
                os.path.join(staging_dir, dependency.WORKFLOW_TARBALL_FILE)))
Example #3
 def test_table_schema_without_project(self):
     # Writer should pick executing project by default.
     sink = df.io.BigQuerySink(table='mydataset.mytable')
     options = PipelineOptions(flags=['--project', 'myproject'])
     sink.pipeline_options = options
     writer = sink.writer()
     self.assertEqual('myproject', writer.project_id)
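For reference, the '--project' flag parsed here is exposed through the GoogleCloudOptions view of the same options object. A minimal sketch, assuming GoogleCloudOptions is importable from the SDK's utils.options module as in Example #21 below:

from google.cloud.dataflow.utils.options import GoogleCloudOptions, PipelineOptions

options = PipelineOptions(flags=['--project', 'myproject'])
# The --project flag surfaces as the project attribute of the GoogleCloudOptions view.
print(options.view_as(GoogleCloudOptions).project)  # prints 'myproject'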
    def _run_write_test(self,
                        data,
                        return_init_result=True,
                        return_write_results=True):
        write_to_test_sink = WriteToTestSink(return_init_result,
                                             return_write_results)
        p = Pipeline(options=PipelineOptions([]))
        result = p | df.Create('start', data) | write_to_test_sink

        assert_that(result, is_empty())
        p.run()

        sink = write_to_test_sink.last_sink
        self.assertIsNotNone(sink)

        self.assertEqual(sink.state, _TestSink.STATE_FINALIZED)
        if data:
            self.assertIsNotNone(sink.last_writer)
            self.assertEqual(sink.last_writer.state, _TestWriter.STATE_CLOSED)
            self.assertEqual(sink.last_writer.write_output, data)
            if return_init_result:
                self.assertEqual(sink.last_writer.init_result,
                                 _TestSink.TEST_INIT_RESULT)
                self.assertEqual(sink.init_result_at_finalize,
                                 _TestSink.TEST_INIT_RESULT)
            self.assertIsNotNone(sink.last_writer.uid)
            if return_write_results:
                self.assertEqual(sink.write_results_at_finalize,
                                 [_TestWriter.TEST_WRITE_RESULT])
        else:
            self.assertIsNone(sink.last_writer)
Example #5
 def test_table_schema_without_project(self):
     # Reader should pick executing project by default.
     source = df.io.BigQuerySource(table='mydataset.mytable')
     options = PipelineOptions(flags=['--project', 'myproject'])
     source.pipeline_options = options
     reader = source.reader()
     self.assertEqual('SELECT * FROM [myproject:mydataset.mytable];',
                      reader.query)
Example #6
    def test_override_options(self):
        base_flags = ['--num_workers', '5']
        options = PipelineOptions(base_flags)
        self.assertEqual(options.get_all_options()['num_workers'], 5)
        self.assertEqual(options.get_all_options()['mock_flag'], False)

        options.view_as(PipelineOptionsTest.MockOptions).mock_flag = True
        self.assertEqual(options.get_all_options()['num_workers'], 5)
        self.assertEqual(options.get_all_options()['mock_flag'], True)
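The MockOptions class referenced above is not included in these examples. A plausible sketch of how such a PipelineOptions subclass is declared, with _add_argparse_args as the hook subclasses override to register their flags (the exact flags here are assumptions based on the assertions above):

class MockOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        # Flags registered here become attributes on any view of the options;
        # mock_flag defaults to False, matching the assertion above.
        parser.add_argument('--mock_flag', action='store_true')
        parser.add_argument('--mock_option')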
    def test_no_main_session(self):
        staging_dir = tempfile.mkdtemp()
        options = PipelineOptions()

        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        options.view_as(SetupOptions).save_main_session = False
        self.update_options(options)

        self.assertEqual([], dependency.stage_job_resources(options))
    def test_missing_required_options(self):
        options = PipelineOptions([''])
        runner = MockRunners.DataflowPipelineRunner()
        validator = PipelineOptionsValidator(options, runner)
        errors = validator.validate()

        self.assertEqual(
            self.check_errors_for_arguments(
                errors,
                ['project', 'job_name', 'staging_location', 'temp_location']),
            [])
Example #9
 def test_get_all_options(self):
     for case in PipelineOptionsTest.TEST_CASES:
         options = PipelineOptions(flags=case['flags'])
         self.assertDictContainsSubset(case['expected'],
                                       options.get_all_options())
         self.assertEqual(
             options.view_as(PipelineOptionsTest.MockOptions).mock_flag,
             case['expected']['mock_flag'])
         self.assertEqual(
             options.view_as(PipelineOptionsTest.MockOptions).mock_option,
             case['expected']['mock_option'])
Example #10
 def test_option_with_space(self):
     options = PipelineOptions(
         flags=['--option with space= value with space'])
     self.assertEqual(
         getattr(options.view_as(PipelineOptionsTest.MockOptions),
                 'option with space'), ' value with space')
     options_from_dict = PipelineOptions.from_dictionary(
         options.get_all_options())
     self.assertEqual(
         getattr(options_from_dict.view_as(PipelineOptionsTest.MockOptions),
                 'option with space'), ' value with space')
    def test_default_resources(self):
        staging_dir = tempfile.mkdtemp()
        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)

        self.assertEqual([names.PICKLED_MAIN_SESSION_FILE],
                         dependency.stage_job_resources(options))
        self.assertTrue(
            os.path.isfile(
                os.path.join(staging_dir, names.PICKLED_MAIN_SESSION_FILE)))
 def test_no_temp_location(self):
     staging_dir = tempfile.mkdtemp()
     options = PipelineOptions()
     google_cloud_options = options.view_as(GoogleCloudOptions)
     google_cloud_options.staging_location = staging_dir
     self.update_options(options)
     google_cloud_options.temp_location = None
     with self.assertRaises(RuntimeError) as cm:
         dependency.stage_job_resources(options)
     self.assertEqual('The --temp_location option must be specified.',
                      cm.exception.message)
 def test_requirements_file_not_present(self):
     staging_dir = tempfile.mkdtemp()
     with self.assertRaises(RuntimeError) as cm:
         options = PipelineOptions()
         options.view_as(GoogleCloudOptions).staging_location = staging_dir
         self.update_options(options)
         options.view_as(SetupOptions).requirements_file = 'nosuchfile'
         dependency.stage_job_resources(options)
     self.assertEqual(
         cm.exception.message,
         'The file %s cannot be found. It was specified in the '
         '--requirements_file command line option.' % 'nosuchfile')
    def test_sdk_location_gcs(self):
        staging_dir = tempfile.mkdtemp()
        sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz'
        self.override_file_copy(sdk_location, staging_dir)

        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).sdk_location = sdk_location

        self.assertEqual(
            [names.PICKLED_MAIN_SESSION_FILE, names.DATAFLOW_SDK_TARBALL_FILE],
            dependency.stage_job_resources(options))
Example #15
 def test_from_dictionary(self):
     for case in PipelineOptionsTest.TEST_CASES:
         options = PipelineOptions(flags=case['flags'])
         all_options_dict = options.get_all_options()
         options_from_dict = PipelineOptions.from_dictionary(
             all_options_dict)
         self.assertEqual(
             options_from_dict.view_as(
                 PipelineOptionsTest.MockOptions).mock_flag,
             case['expected']['mock_flag'])
         self.assertEqual(
             options.view_as(PipelineOptionsTest.MockOptions).mock_option,
             case['expected']['mock_option'])
        def get_validator(temp_location):
            options = [
                '--project=example:example', '--job_name=job',
                '--staging_location=gs://foo/bar'
            ]

            if temp_location is not None:
                options.append('--temp_location=' + temp_location)

            pipeline_options = PipelineOptions(options)
            runner = MockRunners.DataflowPipelineRunner()
            validator = PipelineOptionsValidator(pipeline_options, runner)
            return validator
    def run_pipeline(self, count_implementation, factor=1):
        input_path = self.create_temp_file('CAT\nDOG\nCAT\nCAT\nDOG\n')
        output_path = input_path + '.result'

        known_args, pipeline_args = custom_ptransform.get_args(
            ['--input=%s*' % input_path,
             '--output=%s' % output_path])

        count_implementation(known_args, PipelineOptions(pipeline_args))
        self.assertEqual([
            """(u'CAT', %d)""" % (3 * factor),
            """(u'DOG', %d)""" % (2 * factor)
        ], self.get_output(output_path))
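The count implementations exercised by run_pipeline live in the custom_ptransform example module and are not shown here. A rough, hypothetical sketch of one such implementation, assuming the SDK's df.Read/df.Write transforms together with df.io.TextFileSource/TextFileSink (none of which appear in these snippets), could look like:

import google.cloud.dataflow as df

def run_count(known_args, options):
    # Hypothetical sketch: one word per input line, counted per key and
    # written out as the repr of each (word, count) pair.
    p = df.Pipeline(options=options)
    (p
     | df.Read('read', df.io.TextFileSource(known_args.input))
     | df.Map('pair_with_one', lambda word: (word, 1))
     | df.GroupByKey('group')
     | df.Map('count', lambda kv: (kv[0], sum(kv[1])))
     | df.Map('format', str)
     | df.Write('write', df.io.TextFileSink(known_args.output)))
    p.run()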
Example #18
def get_service_credentials():
    """Get credentials to access Google services."""
    user_agent = 'dataflow-python-sdk/1.0'
    if is_running_in_gce:
        # We are currently running as a GCE taskrunner worker.
        #
        # TODO(ccy): It's not entirely clear whether these credentials are
        # thread-safe. If they are, we could cache them to save the overhead
        # of creating them again.
        return GCEMetadataCredentials(user_agent=user_agent)
    else:
        # We are currently being run from the command line.
        google_cloud_options = PipelineOptions(
            sys.argv).view_as(GoogleCloudOptions)
        if google_cloud_options.service_account_name:
            if not google_cloud_options.service_account_key_file:
                raise AuthenticationException(
                    'key file not provided for service account.')
            if not os.path.exists(
                    google_cloud_options.service_account_key_file):
                raise AuthenticationException(
                    'Specified service account key file does not exist.')
            client_scopes = [
                'https://www.googleapis.com/auth/bigquery',
                'https://www.googleapis.com/auth/cloud-platform',
                'https://www.googleapis.com/auth/devstorage.full_control',
                'https://www.googleapis.com/auth/userinfo.email',
                'https://www.googleapis.com/auth/datastore'
            ]

            # The following code uses oauth2client >=2.0.0 functionality and if this
            # is not available due to import errors will use 1.5.2 functionality.
            try:
                from oauth2client.service_account import ServiceAccountCredentials
                return ServiceAccountCredentials.from_p12_keyfile(
                    google_cloud_options.service_account_name,
                    google_cloud_options.service_account_key_file,
                    client_scopes,
                    user_agent=user_agent)
            except ImportError:
                with open(google_cloud_options.service_account_key_file) as f:
                    service_account_key = f.read()
                from oauth2client.client import SignedJwtAssertionCredentials
                return SignedJwtAssertionCredentials(
                    google_cloud_options.service_account_name,
                    service_account_key,
                    client_scopes,
                    user_agent=user_agent)

        else:
            return _GCloudWrapperCredentials(user_agent)
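The service-account branch above is driven entirely by command-line flags picked up from sys.argv. A minimal sketch of setting the same options explicitly, with hypothetical flag values (the flag names are inferred from the attribute names used above, so treat them as assumptions):

from google.cloud.dataflow.utils.options import GoogleCloudOptions, PipelineOptions

options = PipelineOptions([
    '--service_account_name=robot@my-project.iam.gserviceaccount.com',
    '--service_account_key_file=/path/to/key.p12',
]).view_as(GoogleCloudOptions)
# These attributes are what get_service_credentials() inspects above.
print('%s %s' % (options.service_account_name, options.service_account_key_file))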
    def test_sdk_location_local_not_present(self):
        staging_dir = tempfile.mkdtemp()
        sdk_location = 'nosuchdir'
        with self.assertRaises(RuntimeError) as cm:
            options = PipelineOptions()
            options.view_as(GoogleCloudOptions).staging_location = staging_dir
            self.update_options(options)
            options.view_as(SetupOptions).sdk_location = sdk_location

            dependency.stage_job_resources(options)
        self.assertEqual(
            'The file "%s" cannot be found. Its '
            'location was specified by the --sdk_location command-line option.'
            % sdk_location, cm.exception.message)
Example #21
 def __ror__(self, left):
     """Used to apply this PTransform to non-PValues, e.g., a tuple."""
     pvalueish, pvalues = self._extract_input_pvalues(left)
     pipelines = [
         v.pipeline for v in pvalues if isinstance(v, pvalue.PValue)
     ]
     if pvalues and not pipelines:
         deferred = False
         # pylint: disable=g-import-not-at-top
         from google.cloud.dataflow import pipeline
         from google.cloud.dataflow.utils.options import PipelineOptions
         # pylint: enable=g-import-not-at-top
         p = pipeline.Pipeline('DirectPipelineRunner',
                               PipelineOptions(sys.argv))
     else:
         if not pipelines:
             if self.pipeline is not None:
                 p = self.pipeline
             else:
                 raise ValueError(
                     '"%s" requires a pipeline to be specified '
                     'as there are no deferred inputs.' % self.label)
         else:
             p = self.pipeline or pipelines[0]
             for pp in pipelines:
                 if p != pp:
                     raise ValueError(
                          'Mixing values from different pipelines is not allowed.'
                     )
         deferred = not getattr(p.runner, 'is_eager', False)
     # pylint: disable=g-import-not-at-top
     from google.cloud.dataflow.transforms.core import Create
     # pylint: enable=g-import-not-at-top
     replacements = {
         id(v): p | Create('CreatePInput%s' % ix, v)
         for ix, v in enumerate(pvalues)
         if not isinstance(v, pvalue.PValue) and v is not None
     }
     pvalueish = _SetInputPValues().visit(pvalueish, replacements)
     self.pipeline = p
     result = p.apply(self, pvalueish)
     if deferred:
         return result
     else:
          # Get a reference to the runner's internal cache; otherwise the
          # runner may clear it after the run.
         cache = p.runner.cache
         p.run()
         return _MaterializePValues(cache).visit(result)
    def test_remote_runner_translation(self):
        remote_runner = DataflowPipelineRunner()
        p = Pipeline(remote_runner,
                     options=PipelineOptions([
                         '--dataflow_endpoint=ignored', '--job_name=test-job',
                         '--project=test-project',
                         '--staging_location=ignored',
                         '--temp_location=/dev/null', '--no_auth=True'
                     ]))

        res = (p | ptransform.Create('create', [1, 2, 3])
               | ptransform.FlatMap('do', lambda x: [(x, x)])
               | ptransform.GroupByKey('gbk'))
        remote_runner.job = apiclient.Job(p.options)
        super(DataflowPipelineRunner, remote_runner).run(p)
Example #23
    def test_deferred_side_input_iterable(self):
        @typehints.with_input_types(str, typehints.Iterable[str])
        def concat(glue, items):
            return glue.join(sorted(items))

        p = df.Pipeline(options=PipelineOptions([]))
        main_input = p | df.Create(['a', 'bb', 'c'])
        side_input = p | df.Create('side', ['x', 'y', 'z'])
        result = main_input | df.Map(concat, pvalue.AsIter(side_input))
        assert_that(result, equal_to(['xayaz', 'xbbybbz', 'xcycz']))
        p.run()

        bad_side_input = p | df.Create('bad_side', [1, 2, 3])
        with self.assertRaises(typehints.TypeCheckError):
            main_input | df.Map('fail', concat, pvalue.AsIter(bad_side_input))
 def test_with_extra_packages_invalid_file_name(self):
     staging_dir = tempfile.mkdtemp()
     source_dir = tempfile.mkdtemp()
     self.create_temp_file(os.path.join(source_dir, 'abc.tgz'), 'nothing')
     with self.assertRaises(RuntimeError) as cm:
         options = PipelineOptions()
         options.view_as(GoogleCloudOptions).staging_location = staging_dir
         self.update_options(options)
         options.view_as(SetupOptions).extra_packages = [
             os.path.join(source_dir, 'abc.tgz')
         ]
         dependency.stage_job_resources(options)
     self.assertEqual(
         cm.exception.message,
         'The --extra_packages option expects a full path ending with '
         '\'.tar.gz\' instead of %s' % os.path.join(source_dir, 'abc.tgz'))
Example #25
    def test_deferred_side_inputs(self):
        @typehints.with_input_types(str, int)
        def repeat(s, times):
            return s * times

        p = df.Pipeline(options=PipelineOptions([]))
        main_input = p | df.Create(['a', 'bb', 'c'])
        side_input = p | df.Create('side', [3])
        result = main_input | df.Map(repeat, pvalue.AsSingleton(side_input))
        assert_that(result, equal_to(['aaa', 'bbbbbb', 'ccc']))
        p.run()

        bad_side_input = p | df.Create('bad_side', ['z'])
        with self.assertRaises(typehints.TypeCheckError):
            main_input | df.Map('again', repeat,
                                pvalue.AsSingleton(bad_side_input))
    def test_with_extra_packages_missing_files(self):
        staging_dir = tempfile.mkdtemp()
        with self.assertRaises(RuntimeError) as cm:

            options = PipelineOptions()
            options.view_as(GoogleCloudOptions).staging_location = staging_dir
            self.update_options(options)
            options.view_as(SetupOptions).extra_packages = [
                'nosuchfile.tar.gz'
            ]

            dependency.stage_job_resources(options)
        self.assertEqual(
            cm.exception.message,
            'The file %s cannot be found. It was specified in the '
            '--extra_packages command line option.' % 'nosuchfile.tar.gz')
    def test_with_extra_packages(self):
        staging_dir = tempfile.mkdtemp()
        source_dir = tempfile.mkdtemp()
        self.create_temp_file(os.path.join(source_dir, 'abc.tar.gz'),
                              'nothing')
        self.create_temp_file(os.path.join(source_dir, 'xyz.tar.gz'),
                              'nothing')
        self.create_temp_file(
            os.path.join(source_dir, dependency.EXTRA_PACKAGES_FILE),
            'nothing')

        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).extra_packages = [
            os.path.join(source_dir, 'abc.tar.gz'),
            os.path.join(source_dir, 'xyz.tar.gz'),
            'gs://my-gcs-bucket/gcs.tar.gz'
        ]

        gcs_copied_files = []

        def file_copy(from_path, to_path):
            if from_path.startswith('gs://'):
                gcs_copied_files.append(from_path)
                _, from_name = os.path.split(from_path)
                self.create_temp_file(os.path.join(to_path, from_name),
                                      'nothing')
                logging.info('Fake copied GCS file: %s to %s', from_path,
                             to_path)
            elif to_path.startswith('gs://'):
                logging.info('Faking file_copy(%s, %s)', from_path, to_path)
            else:
                shutil.copyfile(from_path, to_path)

        dependency._dependency_file_copy = file_copy

        self.assertEqual([
            'abc.tar.gz', 'xyz.tar.gz', 'gcs.tar.gz',
            dependency.EXTRA_PACKAGES_FILE, names.PICKLED_MAIN_SESSION_FILE
        ], dependency.stage_job_resources(options))
        with open(os.path.join(staging_dir,
                               dependency.EXTRA_PACKAGES_FILE)) as f:
            self.assertEqual(['abc.tar.gz\n', 'xyz.tar.gz\n', 'gcs.tar.gz\n'],
                             f.readlines())
        self.assertEqual(['gs://my-gcs-bucket/gcs.tar.gz'], gcs_copied_files)
    def test_sdk_location_default(self):
        staging_dir = tempfile.mkdtemp()
        expected_from_url = '%s/v%s.tar.gz' % (dependency.PACKAGES_URL_PREFIX,
                                               __version__)
        expected_from_path = self.override_file_download(
            expected_from_url, staging_dir)
        self.override_file_copy(expected_from_path, staging_dir)

        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).sdk_location = 'default'

        self.assertEqual(
            [names.PICKLED_MAIN_SESSION_FILE, names.DATAFLOW_SDK_TARBALL_FILE],
            dependency.stage_job_resources(
                options, file_copy=dependency._dependency_file_copy))
    def test_with_requirements_file(self):
        staging_dir = tempfile.mkdtemp()
        source_dir = tempfile.mkdtemp()

        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).requirements_file = os.path.join(
            source_dir, dependency.REQUIREMENTS_FILE)
        self.create_temp_file(
            os.path.join(source_dir, dependency.REQUIREMENTS_FILE), 'nothing')
        self.assertEqual(
            [dependency.REQUIREMENTS_FILE, names.PICKLED_MAIN_SESSION_FILE],
            dependency.stage_job_resources(options))
        self.assertTrue(
            os.path.isfile(
                os.path.join(staging_dir, dependency.REQUIREMENTS_FILE)))
    def test_setup_file_not_named_setup_dot_py(self):
        staging_dir = tempfile.mkdtemp()
        source_dir = tempfile.mkdtemp()

        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).setup_file = (os.path.join(
            source_dir, 'xyz-setup.py'))

        self.create_temp_file(os.path.join(source_dir, 'xyz-setup.py'),
                              'notused')
        with self.assertRaises(RuntimeError) as cm:
            dependency.stage_job_resources(options)
        self.assertTrue(
            cm.exception.message.startswith(
                'The --setup_file option expects the full path to a file named '
                'setup.py instead of '))