Example #1
0
 def test_no_temp_location(self):
   """stage_job_resources must fail fast when --temp_location is unset."""
   staging_dir = self.make_temp_dir()
   options = PipelineOptions()
   google_cloud_options = options.view_as(GoogleCloudOptions)
   google_cloud_options.staging_location = staging_dir
   self.update_options(options)
   google_cloud_options.temp_location = None
   with self.assertRaises(RuntimeError) as cm:
     dependency.stage_job_resources(options)
   # BaseException.message does not exist on Python 3; args[0] is portable.
   self.assertEqual('The --temp_location option must be specified.',
                    cm.exception.args[0])
Example #2
0
 def test_no_temp_location(self):
     """stage_job_resources must fail fast when --temp_location is unset."""
     staging_dir = tempfile.mkdtemp()
     options = PipelineOptions()
     google_cloud_options = options.view_as(GoogleCloudOptions)
     google_cloud_options.staging_location = staging_dir
     self.update_options(options)
     google_cloud_options.temp_location = None
     with self.assertRaises(RuntimeError) as cm:
         dependency.stage_job_resources(options)
     # BaseException.message does not exist on Python 3; args[0] is portable.
     self.assertEqual('The --temp_location option must be specified.',
                      cm.exception.args[0])
Example #3
0
 def test_requirements_file_not_present(self):
   """A missing --requirements_file should raise a descriptive error."""
   staging_dir = self.make_temp_dir()
   with self.assertRaises(RuntimeError) as cm:
     options = PipelineOptions()
     options.view_as(GoogleCloudOptions).staging_location = staging_dir
     self.update_options(options)
     options.view_as(SetupOptions).requirements_file = 'nosuchfile'
     dependency.stage_job_resources(
         options, populate_requirements_cache=self.populate_requirements_cache)
   # BaseException.message does not exist on Python 3; args[0] is portable.
   self.assertEqual(
       cm.exception.args[0],
       'The file %s cannot be found. It was specified in the '
       '--requirements_file command line option.' % 'nosuchfile')
Example #4
0
  def test_with_extra_packages_missing_files(self):
    """A missing extra package path should raise a descriptive error."""
    staging_dir = self.make_temp_dir()
    with self.assertRaises(RuntimeError) as cm:

      options = PipelineOptions()
      options.view_as(GoogleCloudOptions).staging_location = staging_dir
      self.update_options(options)
      options.view_as(SetupOptions).extra_packages = ['nosuchfile.tar.gz']

      dependency.stage_job_resources(options)
    # BaseException.message does not exist on Python 3; args[0] is portable.
    self.assertEqual(
        cm.exception.args[0],
        'The file %s cannot be found. It was specified in the '
        '--extra_packages command line option.' % 'nosuchfile.tar.gz')
  def test_setup_file_not_present(self):
    """A missing --setup_file should raise a descriptive error."""
    staging_dir = tempfile.mkdtemp()

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).setup_file = 'nosuchfile'

    with self.assertRaises(RuntimeError) as cm:
      dependency.stage_job_resources(options)
    # BaseException.message does not exist on Python 3; args[0] is portable.
    self.assertEqual(
        cm.exception.args[0],
        'The file %s cannot be found. It was specified in the '
        '--setup_file command line option.' % 'nosuchfile')
Example #6
0
    def test_setup_file_not_present(self):
        """A missing --setup_file should raise a descriptive error."""
        staging_dir = tempfile.mkdtemp()

        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).setup_file = 'nosuchfile'

        with self.assertRaises(RuntimeError) as cm:
            dependency.stage_job_resources(options)
        # BaseException.message does not exist on Python 3; args[0] is portable.
        self.assertEqual(
            cm.exception.args[0],
            'The file %s cannot be found. It was specified in the '
            '--setup_file command line option.' % 'nosuchfile')
Example #7
0
    def test_sdk_location_local_not_present(self):
        """A nonexistent local --sdk_location should raise an error."""
        staging_dir = tempfile.mkdtemp()
        sdk_location = 'nosuchdir'
        with self.assertRaises(RuntimeError) as cm:
            options = PipelineOptions()
            options.view_as(GoogleCloudOptions).staging_location = staging_dir
            self.update_options(options)
            options.view_as(SetupOptions).sdk_location = sdk_location

            dependency.stage_job_resources(options)
        # BaseException.message does not exist on Python 3; args[0] is portable.
        self.assertEqual(
            'The file "%s" cannot be found. Its '
            'location was specified by the --sdk_location command-line option.'
            % sdk_location, cm.exception.args[0])
Example #8
0
  def test_sdk_location_local_not_present(self):
    """A nonexistent local --sdk_location should raise an error."""
    staging_dir = self.make_temp_dir()
    sdk_location = 'nosuchdir'
    with self.assertRaises(RuntimeError) as cm:
      options = PipelineOptions()
      options.view_as(GoogleCloudOptions).staging_location = staging_dir
      self.update_options(options)
      options.view_as(SetupOptions).sdk_location = sdk_location

      dependency.stage_job_resources(options)
    # BaseException.message does not exist on Python 3; args[0] is portable.
    self.assertEqual(
        'The file "%s" cannot be found. Its '
        'location was specified by the --sdk_location command-line option.' %
        sdk_location,
        cm.exception.args[0])
Example #9
0
    def test_default_resources(self):
        """With only a staging location set, nothing is staged."""
        pipeline_options = PipelineOptions()
        gcs_view = pipeline_options.view_as(GoogleCloudOptions)
        gcs_view.staging_location = tempfile.mkdtemp()
        self.update_options(pipeline_options)

        self.assertEqual(dependency.stage_job_resources(pipeline_options), [])
Example #10
0
    def test_sdk_location_http(self):
        """Staging an SDK tarball from an HTTP URL.

        The download helper is patched out; the fake writes a local file
        whose contents must end up staged under the canonical tarball name.
        """
        staging_dir = self.make_temp_dir()
        sdk_location = 'http://storage.googleapis.com/my-gcs-bucket/tarball.tar.gz'

        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).sdk_location = sdk_location

        def file_download(_, to_folder):
            # Fake download: write a marker file instead of fetching the URL.
            tarball_path = os.path.join(to_folder, 'sdk-tarball')
            with open(tarball_path, 'w') as f:
                f.write('Package content.')
            return tarball_path

        with mock.patch(
                'apache_beam.runners.dataflow.internal.'
                'dependency._dependency_file_download', file_download):
            self.assertEqual([names.DATAFLOW_SDK_TARBALL_FILE],
                             dependency.stage_job_resources(options))

        # The staged copy must contain the bytes the fake download produced.
        tarball_path = os.path.join(staging_dir,
                                    names.DATAFLOW_SDK_TARBALL_FILE)
        with open(tarball_path) as f:
            self.assertEqual(f.read(), 'Package content.')
Example #11
0
    def test_with_requirements_file_and_cache(self):
        """Staging a requirements file with a custom requirements cache dir.

        The fake populate_requirements_cache produces abc.txt and def.txt,
        which must be staged alongside the requirements file itself.
        """
        staging_dir = tempfile.mkdtemp()
        source_dir = tempfile.mkdtemp()

        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).requirements_file = os.path.join(
            source_dir, dependency.REQUIREMENTS_FILE)
        options.view_as(SetupOptions).requirements_cache = os.path.join(
            tempfile.gettempdir(), 'alternative-cache-dir')
        self.create_temp_file(
            os.path.join(source_dir, dependency.REQUIREMENTS_FILE), 'nothing')
        # Staging order is not guaranteed, so compare as sorted lists.
        self.assertEqual(
            sorted([dependency.REQUIREMENTS_FILE, 'abc.txt', 'def.txt']),
            sorted(
                dependency.stage_job_resources(
                    options,
                    populate_requirements_cache=self.
                    populate_requirements_cache)))
        self.assertTrue(
            os.path.isfile(
                os.path.join(staging_dir, dependency.REQUIREMENTS_FILE)))
        self.assertTrue(os.path.isfile(os.path.join(staging_dir, 'abc.txt')))
        self.assertTrue(os.path.isfile(os.path.join(staging_dir, 'def.txt')))
  def test_sdk_location_http(self):
    """Staging an SDK tarball from an HTTP URL (download patched out)."""
    staging_dir = self.make_temp_dir()
    sdk_location = 'http://storage.googleapis.com/my-gcs-bucket/tarball.tar.gz'

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = sdk_location

    def file_download(_, to_folder):
      # Fake download: write a marker file instead of fetching the URL.
      tarball_path = os.path.join(to_folder, 'sdk-tarball')
      with open(tarball_path, 'w') as f:
        f.write('Package content.')
      return tarball_path

    with mock.patch('apache_beam.runners.dataflow.internal.'
                    'dependency._dependency_file_download', file_download):
      self.assertEqual(
          [names.DATAFLOW_SDK_TARBALL_FILE],
          dependency.stage_job_resources(options))

    # The staged copy must contain the bytes the fake download produced.
    tarball_path = os.path.join(
        staging_dir, names.DATAFLOW_SDK_TARBALL_FILE)
    with open(tarball_path) as f:
      self.assertEqual(f.read(), 'Package content.')
Example #13
0
    def test_with_setup_file(self):
        """Staging with --setup_file builds and stages the workflow tarball."""
        staging_dir = tempfile.mkdtemp()
        source_dir = tempfile.mkdtemp()
        self.create_temp_file(os.path.join(source_dir, 'setup.py'), 'notused')

        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).setup_file = os.path.join(
            source_dir, 'setup.py')

        self.assertEqual(
            [dependency.WORKFLOW_TARBALL_FILE],
            dependency.stage_job_resources(
                options,
                # We replace the build setup command because a realistic one would
                # require the setuptools package to be installed. Note that we can't
                # use "touch" here to create the expected output tarball file, since
                # touch is not available on Windows, so we invoke python to produce
                # equivalent behavior.
                build_setup_args=[
                    'python', '-c', 'open(__import__("sys").argv[1], "a")',
                    os.path.join(source_dir, dependency.WORKFLOW_TARBALL_FILE)
                ],
                temp_dir=source_dir))
        self.assertTrue(
            os.path.isfile(
                os.path.join(staging_dir, dependency.WORKFLOW_TARBALL_FILE)))
Example #14
0
 def test_with_extra_packages_invalid_file_name(self):
     """Extra packages must end with .tar or .tar.gz; .tgz is rejected."""
     staging_dir = tempfile.mkdtemp()
     source_dir = tempfile.mkdtemp()
     self.create_temp_file(os.path.join(source_dir, 'abc.tgz'), 'nothing')
     with self.assertRaises(RuntimeError) as cm:
         options = PipelineOptions()
         options.view_as(GoogleCloudOptions).staging_location = staging_dir
         self.update_options(options)
         options.view_as(SetupOptions).extra_packages = [
             os.path.join(source_dir, 'abc.tgz')
         ]
         dependency.stage_job_resources(options)
     # BaseException.message does not exist on Python 3; args[0] is portable.
     self.assertEqual(
         cm.exception.args[0],
         'The --extra_package option expects a full path ending with ".tar" or '
         '".tar.gz" instead of %s' % os.path.join(source_dir, 'abc.tgz'))
Example #15
0
  def test_with_requirements_file(self):
    """End-to-end staging of a requirements file plus cached packages."""
    # Bind the temp dirs before entering the try block: if they were created
    # inside it and a later mkdtemp raised, the finally clause would hit a
    # NameError on the unassigned name and mask the real failure.
    staging_dir = tempfile.mkdtemp()
    requirements_cache_dir = tempfile.mkdtemp()
    source_dir = tempfile.mkdtemp()
    try:
      options = PipelineOptions()
      options.view_as(GoogleCloudOptions).staging_location = staging_dir
      self.update_options(options)
      options.view_as(SetupOptions).requirements_cache = requirements_cache_dir
      options.view_as(SetupOptions).requirements_file = os.path.join(
          source_dir, dependency.REQUIREMENTS_FILE)
      self.create_temp_file(
          os.path.join(source_dir, dependency.REQUIREMENTS_FILE), 'nothing')
      # Staging order is not guaranteed, so compare as sorted lists.
      self.assertEqual(
          sorted([dependency.REQUIREMENTS_FILE,
                  'abc.txt', 'def.txt']),
          sorted(dependency.stage_job_resources(
              options,
              populate_requirements_cache=self.populate_requirements_cache)))
      self.assertTrue(
          os.path.isfile(
              os.path.join(staging_dir, dependency.REQUIREMENTS_FILE)))
      self.assertTrue(os.path.isfile(os.path.join(staging_dir, 'abc.txt')))
      self.assertTrue(os.path.isfile(os.path.join(staging_dir, 'def.txt')))
    finally:
      shutil.rmtree(staging_dir)
      shutil.rmtree(requirements_cache_dir)
      shutil.rmtree(source_dir)
Example #16
0
 def test_with_extra_packages_invalid_file_name(self):
   """Extra packages must end with .tar or .tar.gz; .tgz is rejected."""
   staging_dir = tempfile.mkdtemp()
   source_dir = tempfile.mkdtemp()
   self.create_temp_file(
       os.path.join(source_dir, 'abc.tgz'), 'nothing')
   with self.assertRaises(RuntimeError) as cm:
     options = PipelineOptions()
     options.view_as(GoogleCloudOptions).staging_location = staging_dir
     self.update_options(options)
     options.view_as(SetupOptions).extra_packages = [
         os.path.join(source_dir, 'abc.tgz')]
     dependency.stage_job_resources(options)
   # BaseException.message does not exist on Python 3; args[0] is portable.
   self.assertEqual(
       cm.exception.args[0],
       'The --extra_package option expects a full path ending with ".tar" or '
       '".tar.gz" instead of %s' % os.path.join(source_dir, 'abc.tgz'))
Example #17
0
  def test_with_setup_file(self):
    """Staging with --setup_file builds and stages the workflow tarball."""
    staging_dir = self.make_temp_dir()
    source_dir = self.make_temp_dir()
    self.create_temp_file(
        os.path.join(source_dir, 'setup.py'), 'notused')

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).setup_file = os.path.join(
        source_dir, 'setup.py')

    self.assertEqual(
        [dependency.WORKFLOW_TARBALL_FILE],
        dependency.stage_job_resources(
            options,
            # We replace the build setup command because a realistic one would
            # require the setuptools package to be installed. Note that we can't
            # use "touch" here to create the expected output tarball file, since
            # touch is not available on Windows, so we invoke python to produce
            # equivalent behavior.
            build_setup_args=[
                'python', '-c', 'open(__import__("sys").argv[1], "a")',
                os.path.join(source_dir, dependency.WORKFLOW_TARBALL_FILE)],
            temp_dir=source_dir))
    self.assertTrue(
        os.path.isfile(
            os.path.join(staging_dir, dependency.WORKFLOW_TARBALL_FILE)))
Example #18
0
 def create_job_description(self, job):
   """Populates the job proto's environment from staged resources."""
   staged = dependency.stage_job_resources(
       job.options, file_copy=self._gcs_file_copy)
   environment = Environment(
       packages=staged,
       options=job.options,
       environment_version=self.environment_version)
   job.proto.environment = environment.proto
   logging.debug('JOB: %s', job)
Example #19
0
    def test_setup_file_not_named_setup_dot_py(self):
        """A setup file not named exactly setup.py is rejected."""
        staging_dir = self.make_temp_dir()
        source_dir = self.make_temp_dir()
        bad_setup_path = os.path.join(source_dir, 'xyz-setup.py')

        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).setup_file = bad_setup_path

        self.create_temp_file(bad_setup_path, 'notused')
        with self.assertRaises(RuntimeError) as cm:
            dependency.stage_job_resources(options)
        expected_prefix = (
            'The --setup_file option expects the full path to a file named '
            'setup.py instead of ')
        self.assertTrue(cm.exception.args[0].startswith(expected_prefix))
Example #20
0
 def create_job_description(self, job):
   """Attaches an Environment proto built from staged job resources."""
   staged = dependency.stage_job_resources(
       job.options, file_copy=self._gcs_file_copy)
   env = Environment(
       packages=staged, options=job.options,
       environment_version=self.environment_version)
   job.proto.environment = env.proto
   logging.debug('JOB: %s', job)
Example #21
0
  def test_default_resources(self):
    """With only a staging location set, nothing is staged."""
    pipeline_options = PipelineOptions()
    gcs_view = pipeline_options.view_as(GoogleCloudOptions)
    gcs_view.staging_location = self.make_temp_dir()
    self.update_options(pipeline_options)

    self.assertEqual(dependency.stage_job_resources(pipeline_options), [])
Example #22
0
    def test_no_main_session(self):
        """Disabling save_main_session stages no files."""
        pipeline_options = PipelineOptions()

        gcs_view = pipeline_options.view_as(GoogleCloudOptions)
        gcs_view.staging_location = tempfile.mkdtemp()
        pipeline_options.view_as(SetupOptions).save_main_session = False
        self.update_options(pipeline_options)

        self.assertEqual(dependency.stage_job_resources(pipeline_options), [])
Example #23
0
  def test_setup_file_not_named_setup_dot_py(self):
    """A setup file not named exactly setup.py is rejected."""
    staging_dir = self.make_temp_dir()
    source_dir = self.make_temp_dir()

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).setup_file = (
        os.path.join(source_dir, 'xyz-setup.py'))

    self.create_temp_file(
        os.path.join(source_dir, 'xyz-setup.py'), 'notused')
    with self.assertRaises(RuntimeError) as cm:
      dependency.stage_job_resources(options)
    # BaseException.message does not exist on Python 3; args[0] is portable.
    self.assertTrue(
        cm.exception.args[0].startswith(
            'The --setup_file option expects the full path to a file named '
            'setup.py instead of '))
Example #24
0
 def create_job_description(self, job):
   """Builds the job's environment proto from staged resources."""
   staged_packages = dependency.stage_job_resources(
       job.options, file_copy=self._gcs_file_copy)
   env = Environment(
       packages=staged_packages,
       options=job.options,
       environment_version=self.environment_version)
   job.proto.environment = env.proto
   # TODO(silviuc): Remove the debug logging eventually.
   logging.info('JOB: %s', job)
Example #25
0
 def create_job_description(self, job):
     """Fills in the job's environment proto from staged resources."""
     staged = dependency.stage_job_resources(
         job.options, file_copy=self._gcs_file_copy)
     environment = Environment(
         packages=staged, options=job.options,
         environment_version=self.environment_version)
     job.proto.environment = environment.proto
     # TODO(silviuc): Remove the debug logging eventually.
     logging.info('JOB: %s', job)
Example #26
0
  def test_no_main_session(self):
    """Disabling save_main_session stages no files."""
    pipeline_options = PipelineOptions()

    gcs_view = pipeline_options.view_as(GoogleCloudOptions)
    gcs_view.staging_location = self.make_temp_dir()
    pipeline_options.view_as(SetupOptions).save_main_session = False
    self.update_options(pipeline_options)

    self.assertEqual(dependency.stage_job_resources(pipeline_options), [])
Example #27
0
    def test_with_extra_packages(self):
        """Stages local tar/whl packages plus a GCS one and the manifest.

        GCS copies are faked; local copies go through shutil. The staged
        EXTRA_PACKAGES_FILE must list every package, one per line.
        """
        staging_dir = self.make_temp_dir()
        source_dir = self.make_temp_dir()
        self.create_temp_file(os.path.join(source_dir, 'abc.tar.gz'),
                              'nothing')
        self.create_temp_file(os.path.join(source_dir, 'xyz.tar.gz'),
                              'nothing')
        self.create_temp_file(os.path.join(source_dir, 'xyz2.tar'), 'nothing')
        self.create_temp_file(os.path.join(source_dir, 'whl.whl'), 'nothing')
        self.create_temp_file(
            os.path.join(source_dir, dependency.EXTRA_PACKAGES_FILE),
            'nothing')

        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).extra_packages = [
            os.path.join(source_dir, 'abc.tar.gz'),
            os.path.join(source_dir, 'xyz.tar.gz'),
            os.path.join(source_dir, 'xyz2.tar'),
            os.path.join(source_dir, 'whl.whl'),
            'gs://my-gcs-bucket/gcs.tar.gz'
        ]

        gcs_copied_files = []

        def file_copy(from_path, to_path):
            # Fake GCS downloads/uploads; fall back to a real local copy.
            if from_path.startswith('gs://'):
                gcs_copied_files.append(from_path)
                _, from_name = os.path.split(from_path)
                if os.path.isdir(to_path):
                    to_path = os.path.join(to_path, from_name)
                self.create_temp_file(to_path, 'nothing')
                logging.info('Fake copied GCS file: %s to %s', from_path,
                             to_path)
            elif to_path.startswith('gs://'):
                logging.info('Faking file_copy(%s, %s)', from_path, to_path)
            else:
                shutil.copyfile(from_path, to_path)

        # Patch the module-global copy hook, restoring it afterwards so the
        # fake does not leak into tests that run later in the same process.
        original_file_copy = dependency._dependency_file_copy
        dependency._dependency_file_copy = file_copy
        try:
            self.assertEqual([
                'abc.tar.gz', 'xyz.tar.gz', 'xyz2.tar', 'whl.whl',
                'gcs.tar.gz', dependency.EXTRA_PACKAGES_FILE
            ], dependency.stage_job_resources(options))
            with open(os.path.join(staging_dir,
                                   dependency.EXTRA_PACKAGES_FILE)) as f:
                self.assertEqual([
                    'abc.tar.gz\n', 'xyz.tar.gz\n', 'xyz2.tar\n', 'whl.whl\n',
                    'gcs.tar.gz\n'
                ], f.readlines())
            self.assertEqual(['gs://my-gcs-bucket/gcs.tar.gz'],
                             gcs_copied_files)
        finally:
            dependency._dependency_file_copy = original_file_copy
Example #28
0
    def test_sdk_location_gcs(self):
        """An SDK tarball in GCS is staged under the canonical name."""
        staging_dir = tempfile.mkdtemp()
        sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz'
        self.override_file_copy(sdk_location, staging_dir)

        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).sdk_location = sdk_location

        staged = dependency.stage_job_resources(options)
        self.assertEqual(staged, [names.DATAFLOW_SDK_TARBALL_FILE])
Example #29
0
    def test_with_main_session(self):
        """Enabling save_main_session stages the pickled session file."""
        staging_dir = tempfile.mkdtemp()
        options = PipelineOptions()

        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        options.view_as(SetupOptions).save_main_session = True
        self.update_options(options)

        staged = dependency.stage_job_resources(options)
        self.assertEqual(staged, [names.PICKLED_MAIN_SESSION_FILE])
        session_path = os.path.join(
            staging_dir, names.PICKLED_MAIN_SESSION_FILE)
        self.assertTrue(os.path.isfile(session_path))
Example #30
0
    def test_sdk_location_gcs_source_file(self):
        """A GCS source tarball is staged without a real copy."""
        staging_dir = self.make_temp_dir()
        sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz'

        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).sdk_location = sdk_location

        # Patch out the copy so no network access happens.
        copy_target = ('apache_beam.runners.dataflow.internal.'
                       'dependency._dependency_file_copy')
        with mock.patch(copy_target):
            staged = dependency.stage_job_resources(options)
            self.assertEqual(staged, [names.DATAFLOW_SDK_TARBALL_FILE])
Example #31
0
  def test_sdk_location_gcs(self):
    """An SDK tarball in GCS is staged under the canonical name."""
    staging = self.make_temp_dir()
    sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz'
    self.override_file_copy(sdk_location, staging)

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = sdk_location

    staged = dependency.stage_job_resources(options)
    self.assertEqual(staged, [names.DATAFLOW_SDK_TARBALL_FILE])
Example #32
0
    def test_sdk_location_gcs_wheel_file(self):
        """A GCS wheel is staged under its own file name; copy patched out."""
        staging_dir = self.make_temp_dir()
        sdk_filename = 'apache_beam-1.0.0-cp27-cp27mu-manylinux1_x86_64.whl'
        sdk_location = 'gs://my-gcs-bucket/' + sdk_filename

        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).sdk_location = sdk_location

        copy_target = ('apache_beam.runners.dataflow.internal.'
                       'dependency._dependency_file_copy')
        with mock.patch(copy_target):
            staged = dependency.stage_job_resources(options)
            self.assertEqual(staged, [sdk_filename])
Example #33
0
  def test_with_main_session(self):
    """Enabling save_main_session stages the pickled session file."""
    staging = self.make_temp_dir()
    options = PipelineOptions()

    options.view_as(GoogleCloudOptions).staging_location = staging
    options.view_as(SetupOptions).save_main_session = True
    self.update_options(options)

    staged = dependency.stage_job_resources(options)
    self.assertEqual(staged, [names.PICKLED_MAIN_SESSION_FILE])
    session_path = os.path.join(staging, names.PICKLED_MAIN_SESSION_FILE)
    self.assertTrue(os.path.isfile(session_path))
  def test_sdk_location_gcs_source_file(self):
    """A GCS source tarball is staged without a real copy."""
    staging = self.make_temp_dir()
    sdk_location = 'gs://my-gcs-bucket/tarball.tar.gz'

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = sdk_location

    # Patch out the copy so no network access happens.
    copy_target = ('apache_beam.runners.dataflow.internal.'
                   'dependency._dependency_file_copy')
    with mock.patch(copy_target):
      staged = dependency.stage_job_resources(options)
      self.assertEqual(staged, [names.DATAFLOW_SDK_TARBALL_FILE])
Example #35
0
  def test_with_extra_packages(self):
    """Stages local tar/whl packages plus a GCS one and the manifest.

    GCS copies are faked; local copies go through shutil. The staged
    EXTRA_PACKAGES_FILE must list every package, one per line.
    """
    staging_dir = self.make_temp_dir()
    source_dir = self.make_temp_dir()
    self.create_temp_file(
        os.path.join(source_dir, 'abc.tar.gz'), 'nothing')
    self.create_temp_file(
        os.path.join(source_dir, 'xyz.tar.gz'), 'nothing')
    self.create_temp_file(
        os.path.join(source_dir, 'xyz2.tar'), 'nothing')
    self.create_temp_file(
        os.path.join(source_dir, 'whl.whl'), 'nothing')
    self.create_temp_file(
        os.path.join(source_dir, dependency.EXTRA_PACKAGES_FILE), 'nothing')

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).extra_packages = [
        os.path.join(source_dir, 'abc.tar.gz'),
        os.path.join(source_dir, 'xyz.tar.gz'),
        os.path.join(source_dir, 'xyz2.tar'),
        os.path.join(source_dir, 'whl.whl'),
        'gs://my-gcs-bucket/gcs.tar.gz']

    gcs_copied_files = []

    def file_copy(from_path, to_path):
      # Fake GCS downloads/uploads; fall back to a real local copy.
      if from_path.startswith('gs://'):
        gcs_copied_files.append(from_path)
        _, from_name = os.path.split(from_path)
        if os.path.isdir(to_path):
          to_path = os.path.join(to_path, from_name)
        self.create_temp_file(to_path, 'nothing')
        logging.info('Fake copied GCS file: %s to %s', from_path, to_path)
      elif to_path.startswith('gs://'):
        logging.info('Faking file_copy(%s, %s)', from_path, to_path)
      else:
        shutil.copyfile(from_path, to_path)

    # Patch the module-global copy hook, restoring it afterwards so the
    # fake does not leak into tests that run later in the same process.
    original_file_copy = dependency._dependency_file_copy
    dependency._dependency_file_copy = file_copy
    try:
      self.assertEqual(
          ['abc.tar.gz', 'xyz.tar.gz', 'xyz2.tar', 'whl.whl', 'gcs.tar.gz',
           dependency.EXTRA_PACKAGES_FILE],
          dependency.stage_job_resources(options))
      with open(os.path.join(staging_dir, dependency.EXTRA_PACKAGES_FILE)) as f:
        self.assertEqual(['abc.tar.gz\n', 'xyz.tar.gz\n', 'xyz2.tar\n',
                          'whl.whl\n', 'gcs.tar.gz\n'], f.readlines())
      self.assertEqual(['gs://my-gcs-bucket/gcs.tar.gz'], gcs_copied_files)
    finally:
      dependency._dependency_file_copy = original_file_copy
  def test_sdk_location_gcs_wheel_file(self):
    """A GCS wheel is staged under its own file name; copy patched out."""
    staging = self.make_temp_dir()
    sdk_filename = 'apache_beam-1.0.0-cp27-cp27mu-manylinux1_x86_64.whl'
    sdk_location = 'gs://my-gcs-bucket/' + sdk_filename

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = sdk_location

    copy_target = ('apache_beam.runners.dataflow.internal.'
                   'dependency._dependency_file_copy')
    with mock.patch(copy_target):
      staged = dependency.stage_job_resources(options)
      self.assertEqual(staged, [sdk_filename])
Example #37
0
    def test_sdk_location_default(self):
        """'default' SDK location downloads from PyPI and stages a tarball."""
        staging_dir = tempfile.mkdtemp()
        expected_from_url = 'pypi'
        expected_from_path = self.override_pypi_download(
            expected_from_url, staging_dir)
        self.override_file_copy(expected_from_path, staging_dir)

        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).sdk_location = 'default'

        staged = dependency.stage_job_resources(
            options, file_copy=dependency._dependency_file_copy)
        self.assertEqual(staged, [names.DATAFLOW_SDK_TARBALL_FILE])
Example #38
0
  def test_sdk_location_default(self):
    """'default' SDK location downloads from PyPI and stages a tarball."""
    staging = self.make_temp_dir()
    expected_from_url = 'pypi'
    expected_from_path = self.override_pypi_download(
        expected_from_url, staging)
    self.override_file_copy(expected_from_path, staging)

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = 'default'

    staged = dependency.stage_job_resources(
        options, file_copy=dependency._dependency_file_copy)
    self.assertEqual(staged, [names.DATAFLOW_SDK_TARBALL_FILE])
Example #39
0
    def test_sdk_location_local_wheel_file(self):
        """A local wheel is staged under its own file name."""
        staging_dir = self.make_temp_dir()
        sdk_directory = self.make_temp_dir()
        sdk_filename = 'apache_beam-1.0.0-cp27-cp27mu-manylinux1_x86_64.whl'
        sdk_location = os.path.join(sdk_directory, sdk_filename)
        self.create_temp_file(sdk_location, 'Package content.')

        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).sdk_location = sdk_location

        staged = dependency.stage_job_resources(options)
        self.assertEqual(staged, [sdk_filename])
        staged_path = os.path.join(staging_dir, sdk_filename)
        with open(staged_path) as f:
            self.assertEqual(f.read(), 'Package content.')
Example #40
0
    def test_sdk_location_local_source_file(self):
        """A local source tarball is staged under the canonical SDK name."""
        staging_dir = self.make_temp_dir()
        sdk_directory = self.make_temp_dir()
        sdk_filename = 'apache-beam-3.0.0.tar.gz'
        sdk_location = os.path.join(sdk_directory, sdk_filename)
        self.create_temp_file(sdk_location, 'Package content.')

        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).sdk_location = sdk_location

        staged = dependency.stage_job_resources(options)
        self.assertEqual(staged, [names.DATAFLOW_SDK_TARBALL_FILE])
        staged_tarball = os.path.join(
            staging_dir, names.DATAFLOW_SDK_TARBALL_FILE)
        with open(staged_tarball) as f:
            self.assertEqual(f.read(), 'Package content.')
Example #41
0
    def test_sdk_location_local(self):
        """A local SDK directory containing the tarball is staged."""
        staging_dir = tempfile.mkdtemp()
        sdk_location = tempfile.mkdtemp()
        self.create_temp_file(
            os.path.join(sdk_location, names.DATAFLOW_SDK_TARBALL_FILE),
            'contents')

        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).sdk_location = sdk_location

        staged = dependency.stage_job_resources(options)
        self.assertEqual(staged, [names.DATAFLOW_SDK_TARBALL_FILE])
        staged_tarball = os.path.join(
            staging_dir, names.DATAFLOW_SDK_TARBALL_FILE)
        with open(staged_tarball) as f:
            self.assertEqual(f.read(), 'contents')
Example #42
0
  def create_job_description(self, job):
    """Stages the pipeline and resources, then fills the environment proto."""

    # Stage the pipeline itself for the runner harness.
    staging_location = job.google_cloud_options.staging_location
    self.stage_file(staging_location,
                    names.STAGED_PIPELINE_FILENAME,
                    StringIO(job.proto_pipeline.SerializeToString()))

    # Stage the remaining resources for the SDK harness.
    staged_resources = dependency.stage_job_resources(
        job.options, file_copy=self._gcs_file_copy)

    pipeline_url = FileSystems.join(staging_location,
                                    names.STAGED_PIPELINE_FILENAME)
    job.proto.environment = Environment(
        pipeline_url=pipeline_url,
        packages=staged_resources,
        options=job.options,
        environment_version=self.environment_version).proto
    logging.debug('JOB: %s', job)
  def test_sdk_location_local_source_file(self):
    """A local SDK source tarball is staged under the canonical tarball name."""
    staging_dir = self.make_temp_dir()
    sdk_directory = self.make_temp_dir()
    local_sdk_path = os.path.join(sdk_directory, 'apache-beam-3.0.0.tar.gz')
    self.create_temp_file(local_sdk_path, 'Package content.')

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = local_sdk_path

    staged = dependency.stage_job_resources(options)
    self.assertEqual(staged, [names.DATAFLOW_SDK_TARBALL_FILE])

    # Content must be copied unchanged, only the file name is normalized.
    staged_tarball = os.path.join(staging_dir, names.DATAFLOW_SDK_TARBALL_FILE)
    with open(staged_tarball) as staged_file:
      self.assertEqual(staged_file.read(), 'Package content.')
  def test_sdk_location_local_wheel_file(self):
    """A local SDK wheel is staged under its own (version-specific) name."""
    staging_dir = self.make_temp_dir()
    sdk_directory = self.make_temp_dir()
    wheel_name = 'apache_beam-1.0.0-cp27-cp27mu-manylinux1_x86_64.whl'
    wheel_location = os.path.join(sdk_directory, wheel_name)
    self.create_temp_file(wheel_location, 'Package content.')

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = wheel_location

    # Unlike source tarballs, wheels keep their original file name.
    self.assertEqual([wheel_name],
                     dependency.stage_job_resources(options))

    staged_wheel = os.path.join(staging_dir, wheel_name)
    with open(staged_wheel) as staged_file:
      self.assertEqual(staged_file.read(), 'Package content.')
Example #45
0
    def test_sdk_location_default(self):
        """With sdk_location 'default' the SDK tarball is fetched and staged."""
        staging_dir = self.make_temp_dir()
        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).sdk_location = 'default'

        # Intercept the pip download subprocess with a fake that writes
        # 'Package content.' into the requested tarball.
        fake_check_call = self.build_fake_pip_download_command_handler(
            has_wheels=False)
        with mock.patch('apache_beam.utils.processes.check_call',
                        fake_check_call):
            staged_resources = dependency.stage_job_resources(
                options, temp_dir=self.make_temp_dir())

        self.assertEqual(staged_resources, [names.DATAFLOW_SDK_TARBALL_FILE])

        staged_tarball = os.path.join(staging_dir,
                                      names.DATAFLOW_SDK_TARBALL_FILE)
        with open(staged_tarball) as staged_file:
            self.assertEqual(staged_file.read(), 'Package content.')
  def test_sdk_location_default(self):
    """With sdk_location 'default' the SDK tarball is fetched and staged."""
    staging_dir = self.make_temp_dir()
    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = 'default'

    # Replace the pip download subprocess with a fake that writes
    # 'Package content.' into the requested tarball (no wheels).
    fake_handler = self.build_fake_pip_download_command_handler(
        has_wheels=False)
    with mock.patch('apache_beam.utils.processes.check_call', fake_handler):
      staged_resources = dependency.stage_job_resources(
          options, temp_dir=self.make_temp_dir())

    self.assertEqual(staged_resources, [names.DATAFLOW_SDK_TARBALL_FILE])

    staged_tarball = os.path.join(staging_dir,
                                  names.DATAFLOW_SDK_TARBALL_FILE)
    with open(staged_tarball) as staged_file:
      self.assertEqual(staged_file.read(), 'Package content.')
Example #47
0
  def test_sdk_location_local(self):
    """A tarball found in a local sdk_location directory is staged verbatim."""
    staging_dir = self.make_temp_dir()
    sdk_location = self.make_temp_dir()
    tarball_source = os.path.join(
        sdk_location, names.DATAFLOW_SDK_TARBALL_FILE)
    self.create_temp_file(tarball_source, 'contents')

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = sdk_location

    staged = dependency.stage_job_resources(options)
    self.assertEqual(staged, [names.DATAFLOW_SDK_TARBALL_FILE])

    # The staged copy must carry the exact bytes of the local tarball.
    staged_tarball = os.path.join(
        staging_dir, names.DATAFLOW_SDK_TARBALL_FILE)
    with open(staged_tarball) as staged_file:
      self.assertEqual(staged_file.read(), 'contents')
Example #48
0
    def test_sdk_location_default_with_wheels(self):
        """Default SDK location stages both the source tarball and a wheel."""
        staging_dir = self.make_temp_dir()

        options = PipelineOptions()
        options.view_as(GoogleCloudOptions).staging_location = staging_dir
        self.update_options(options)
        options.view_as(SetupOptions).sdk_location = 'default'

        with mock.patch(
                'apache_beam.utils.processes.check_call',
                self.build_fake_pip_download_command_handler(has_wheels=True)):
            staged_resources = dependency.stage_job_resources(
                options, temp_dir=self.make_temp_dir())

            # BUG FIX: was `assertTrue(len(staged_resources), 2)`, which
            # treats 2 as the failure message and passes for any non-empty
            # list; assertEqual actually verifies the staged-file count.
            self.assertEqual(len(staged_resources), 2)
            self.assertEqual(staged_resources[0],
                             names.DATAFLOW_SDK_TARBALL_FILE)
            # Exact name depends on the version of the SDK.
            self.assertTrue(staged_resources[1].endswith('whl'))
            for name in staged_resources:
                with open(os.path.join(staging_dir, name)) as f:
                    self.assertEqual(f.read(), 'Package content.')
  def test_sdk_location_default_with_wheels(self):
    """Default SDK location stages both the source tarball and a wheel."""
    staging_dir = self.make_temp_dir()

    options = PipelineOptions()
    options.view_as(GoogleCloudOptions).staging_location = staging_dir
    self.update_options(options)
    options.view_as(SetupOptions).sdk_location = 'default'

    with mock.patch(
        'apache_beam.utils.processes.check_call',
        self.build_fake_pip_download_command_handler(has_wheels=True)):
      staged_resources = dependency.stage_job_resources(
          options,
          temp_dir=self.make_temp_dir())

      # BUG FIX: was `assertTrue(len(staged_resources), 2)`, which treats 2
      # as the failure message and passes for any non-empty list;
      # assertEqual actually verifies the staged-file count.
      self.assertEqual(len(staged_resources), 2)
      self.assertEqual(staged_resources[0], names.DATAFLOW_SDK_TARBALL_FILE)
      # Exact name depends on the version of the SDK.
      self.assertTrue(staged_resources[1].endswith('whl'))
      for name in staged_resources:
        with open(os.path.join(staging_dir, name)) as f:
          self.assertEqual(f.read(), 'Package content.')
Example #50
0
 def test_no_staging_location(self):
   """Staging resources without --staging_location raises RuntimeError."""
   with self.assertRaises(RuntimeError) as error_context:
     dependency.stage_job_resources(PipelineOptions())
   self.assertEqual(
       error_context.exception.message,
       'The --staging_location option must be specified.')
Example #51
0
 def test_no_staging_location(self):
     """Staging resources without --staging_location raises RuntimeError."""
     expected_message = 'The --staging_location option must be specified.'
     with self.assertRaises(RuntimeError) as cm:
         dependency.stage_job_resources(PipelineOptions())
     self.assertEqual(expected_message, cm.exception.message)