def stage_file(self, gcs_or_local_path, file_name, stream,
               mime_type='application/octet-stream', total_size=None):
  """Stages a file at a GCS or local path with stream-supplied contents."""
  if not gcs_or_local_path.startswith('gs://'):
    local_path = FileSystems.join(gcs_or_local_path, file_name)
    _LOGGER.info('Staging file locally to %s', local_path)
    with open(local_path, 'wb') as f:
      f.write(stream.read())
    return
  gcs_location = FileSystems.join(gcs_or_local_path, file_name)
  bucket, name = gcs_location[5:].split('/', 1)

  request = storage.StorageObjectsInsertRequest(bucket=bucket, name=name)
  start_time = time.time()
  _LOGGER.info('Starting GCS upload to %s...', gcs_location)
  upload = storage.Upload(stream, mime_type, total_size)
  try:
    response = self._storage_client.objects.Insert(request, upload=upload)
  except exceptions.HttpError as e:
    reportable_errors = {
        403: 'access denied',
        404: 'bucket not found',
    }
    if e.status_code in reportable_errors:
      raise IOError(
          ('Could not upload to GCS path %s: %s. Please verify '
           'that credentials are valid and that you have write '
           'access to the specified path.') %
          (gcs_or_local_path, reportable_errors[e.status_code]))
    raise
  _LOGGER.info('Completed GCS upload to %s in %s seconds.', gcs_location,
               int(time.time() - start_time))
  return response
def stage_file(self, gcs_or_local_path, file_name, stream,
               mime_type='application/octet-stream'):
  """Stages a file at a GCS or local path with stream-supplied contents."""
  if not gcs_or_local_path.startswith('gs://'):
    local_path = FileSystems.join(gcs_or_local_path, file_name)
    logging.info('Staging file locally to %s', local_path)
    with open(local_path, 'wb') as f:
      f.write(stream.read())
    return
  gcs_location = FileSystems.join(gcs_or_local_path, file_name)
  bucket, name = gcs_location[5:].split('/', 1)

  request = storage.StorageObjectsInsertRequest(bucket=bucket, name=name)
  logging.info('Starting GCS upload to %s...', gcs_location)
  upload = storage.Upload(stream, mime_type)
  try:
    response = self._storage_client.objects.Insert(request, upload=upload)
  except exceptions.HttpError as e:
    reportable_errors = {
        403: 'access denied',
        404: 'bucket not found',
    }
    if e.status_code in reportable_errors:
      raise IOError(
          ('Could not upload to GCS path %s: %s. Please verify '
           'that credentials are valid and that you have write '
           'access to the specified path.') %
          (gcs_or_local_path, reportable_errors[e.status_code]))
    raise
  logging.info('Completed GCS upload to %s', gcs_location)
  return response
def __init__(self, options, proto_pipeline):
  self.options = options
  self.proto_pipeline = proto_pipeline
  self.google_cloud_options = options.view_as(GoogleCloudOptions)
  if not self.google_cloud_options.job_name:
    self.google_cloud_options.job_name = self.default_job_name(
        self.google_cloud_options.job_name)

  required_google_cloud_options = ['project', 'job_name', 'temp_location']
  missing = [
      option for option in required_google_cloud_options
      if not getattr(self.google_cloud_options, option)
  ]
  if missing:
    raise ValueError(
        'Missing required configuration parameters: %s' % missing)

  if not self.google_cloud_options.staging_location:
    logging.info(
        'Defaulting to the temp_location as staging_location: %s',
        self.google_cloud_options.temp_location)
    (self.google_cloud_options.staging_location
     ) = self.google_cloud_options.temp_location

  # Make the staging and temp locations job name and time specific. This is
  # needed to avoid clashes between job submissions using the same staging
  # area or team members using same job names. This method is not entirely
  # foolproof since two job submissions with same name can happen at exactly
  # the same time. However the window is extremely small given that
  # time.time() has at least microseconds granularity. We add the suffix only
  # for GCS staging locations where the potential for such clashes is high.
  if self.google_cloud_options.staging_location.startswith('gs://'):
    path_suffix = '%s.%f' % (self.google_cloud_options.job_name, time.time())
    self.google_cloud_options.staging_location = FileSystems.join(
        self.google_cloud_options.staging_location, path_suffix)
    self.google_cloud_options.temp_location = FileSystems.join(
        self.google_cloud_options.temp_location, path_suffix)

  self.proto = dataflow.Job(name=self.google_cloud_options.job_name)
  if self.options.view_as(StandardOptions).streaming:
    self.proto.type = dataflow.Job.TypeValueValuesEnum.JOB_TYPE_STREAMING
  else:
    self.proto.type = dataflow.Job.TypeValueValuesEnum.JOB_TYPE_BATCH

  # Labels.
  if self.google_cloud_options.labels:
    self.proto.labels = dataflow.Job.LabelsValue()
    for label in self.google_cloud_options.labels:
      parts = label.split('=', 1)
      key = parts[0]
      value = parts[1] if len(parts) > 1 else ''
      self.proto.labels.additionalProperties.append(
          dataflow.Job.LabelsValue.AdditionalProperty(key=key, value=value))

  self.base64_str_re = re.compile(r'^[A-Za-z0-9+/]*=*$')
  self.coder_str_re = re.compile(r'^([A-Za-z]+\$)([A-Za-z0-9+/]*=*)$')
def bigquery_export_destination_uri(
    gcs_location_vp: Optional[ValueProvider],
    temp_location: Optional[str],
    unique_id: str,
    directory_only: bool = False,
) -> str:
  """Returns the fully qualified Google Cloud Storage URI where the
  extracted table should be written.
  """
  file_pattern = 'bigquery-table-dump-*.json'

  gcs_location = None
  if gcs_location_vp is not None:
    gcs_location = gcs_location_vp.get()

  if gcs_location is not None:
    gcs_base = gcs_location
  elif temp_location is not None:
    gcs_base = temp_location
    _LOGGER.debug("gcs_location is empty, using temp_location instead")
  else:
    raise ValueError(
        'ReadFromBigQuery requires a GCS location to be provided. Neither '
        'gcs_location in the constructor nor the fallback option '
        '--temp_location is set.')

  if not unique_id:
    unique_id = uuid.uuid4().hex

  if directory_only:
    return FileSystems.join(gcs_base, unique_id)
  else:
    return FileSystems.join(gcs_base, unique_id, file_pattern)
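For reference, the URI composed by the helper above can be reproduced directly with FileSystems.join. A minimal sketch under assumed values; the bucket path and identifiers below are hypothetical, not from the source:

import uuid

from apache_beam.io.filesystems import FileSystems

gcs_base = 'gs://example-temp-bucket/bq'  # hypothetical bucket path
unique_id = uuid.uuid4().hex
file_pattern = 'bigquery-table-dump-*.json'

# Mirrors the two return branches above: directory-only vs. full file pattern.
print(FileSystems.join(gcs_base, unique_id))
print(FileSystems.join(gcs_base, unique_id, file_pattern))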
def _stage_beam_sdk(self, sdk_remote_location, staging_location, temp_dir):
  """Stages a Beam SDK file with the appropriate version.

  Args:
    sdk_remote_location: A URL from which the file can be downloaded or a
      remote file location. The SDK file can be a tarball or a wheel. Set
      to 'pypi' to download and stage a wheel and source SDK from PyPI.
    staging_location: Location where the SDK file should be copied.
    temp_dir: path to temporary location where the file should be
      downloaded.

  Returns:
    A list of SDK files that were staged to the staging location.

  Raises:
    RuntimeError: if staging was not successful.
  """
  if sdk_remote_location == 'pypi':
    sdk_local_file = Stager._download_pypi_sdk_package(temp_dir)
    sdk_sources_staged_name = Stager.\
        _desired_sdk_filename_in_staging_location(sdk_local_file)
    staged_path = FileSystems.join(staging_location, sdk_sources_staged_name)
    logging.info('Staging SDK sources from PyPI to %s', staged_path)
    self.stage_artifact(sdk_local_file, staged_path)
    staged_sdk_files = [sdk_sources_staged_name]
    try:
      # Stage binary distribution of the SDK, for now on a best-effort basis.
      sdk_local_file = Stager._download_pypi_sdk_package(
          temp_dir, fetch_binary=True)
      sdk_binary_staged_name = Stager.\
          _desired_sdk_filename_in_staging_location(sdk_local_file)
      staged_path = FileSystems.join(staging_location, sdk_binary_staged_name)
      logging.info(
          'Staging binary distribution of the SDK from PyPI to %s',
          staged_path)
      self.stage_artifact(sdk_local_file, staged_path)
      staged_sdk_files.append(sdk_binary_staged_name)
    except RuntimeError as e:
      logging.warn(
          'Failed to download requested binary distribution '
          'of the SDK: %s', repr(e))

    return staged_sdk_files
  elif Stager._is_remote_path(sdk_remote_location):
    local_download_file = os.path.join(temp_dir, 'beam-sdk.tar.gz')
    Stager._download_file(sdk_remote_location, local_download_file)
    staged_name = Stager._desired_sdk_filename_in_staging_location(
        sdk_remote_location)
    staged_path = FileSystems.join(staging_location, staged_name)
    logging.info('Staging Beam SDK from %s to %s', sdk_remote_location,
                 staged_path)
    self.stage_artifact(local_download_file, staged_path)
    return [staged_name]
  else:
    raise RuntimeError(
        'The --sdk_location option was used with an unsupported '
        'type of location: %s' % sdk_remote_location)
def test_windows_path_join(self, *unused_mocks):
  # Test joining of Windows paths.
  localfilesystem.os.path.join.side_effect = _gen_fake_join('\\')
  self.assertEqual(r'C:\tmp\path\to\file',
                   FileSystems.join(r'C:\tmp\path', 'to', 'file'))
  self.assertEqual(r'C:\tmp\path\to\file',
                   FileSystems.join(r'C:\tmp\path', r'to\file'))
  self.assertEqual(r'C:\tmp\path\to\file',
                   FileSystems.join(r'C:\tmp\path\\', 'to', 'file'))
def __init__(self, options, proto_pipeline):
  self.options = options
  self.proto_pipeline = proto_pipeline
  self.google_cloud_options = options.view_as(GoogleCloudOptions)
  if not self.google_cloud_options.job_name:
    self.google_cloud_options.job_name = self.default_job_name(
        self.google_cloud_options.job_name)

  required_google_cloud_options = ['project', 'job_name', 'temp_location']
  missing = [
      option for option in required_google_cloud_options
      if not getattr(self.google_cloud_options, option)]
  if missing:
    raise ValueError(
        'Missing required configuration parameters: %s' % missing)

  if not self.google_cloud_options.staging_location:
    logging.info('Defaulting to the temp_location as staging_location: %s',
                 self.google_cloud_options.temp_location)
    (self.google_cloud_options
     .staging_location) = self.google_cloud_options.temp_location

  # Make the staging and temp locations job name and time specific. This is
  # needed to avoid clashes between job submissions using the same staging
  # area or team members using same job names. This method is not entirely
  # foolproof since two job submissions with same name can happen at exactly
  # the same time. However the window is extremely small given that
  # time.time() has at least microseconds granularity. We add the suffix only
  # for GCS staging locations where the potential for such clashes is high.
  if self.google_cloud_options.staging_location.startswith('gs://'):
    path_suffix = '%s.%f' % (self.google_cloud_options.job_name, time.time())
    self.google_cloud_options.staging_location = FileSystems.join(
        self.google_cloud_options.staging_location, path_suffix)
    self.google_cloud_options.temp_location = FileSystems.join(
        self.google_cloud_options.temp_location, path_suffix)

  self.proto = dataflow.Job(name=self.google_cloud_options.job_name)
  if self.options.view_as(StandardOptions).streaming:
    self.proto.type = dataflow.Job.TypeValueValuesEnum.JOB_TYPE_STREAMING
  else:
    self.proto.type = dataflow.Job.TypeValueValuesEnum.JOB_TYPE_BATCH
  if self.google_cloud_options.update:
    self.proto.replaceJobId = self.job_id_for_name(self.proto.name)

  # Labels.
  if self.google_cloud_options.labels:
    self.proto.labels = dataflow.Job.LabelsValue()
    for label in self.google_cloud_options.labels:
      parts = label.split('=', 1)
      key = parts[0]
      value = parts[1] if len(parts) > 1 else ''
      self.proto.labels.additionalProperties.append(
          dataflow.Job.LabelsValue.AdditionalProperty(key=key, value=value))

  self.base64_str_re = re.compile(r'^[A-Za-z0-9+/]*=*$')
  self.coder_str_re = re.compile(r'^([A-Za-z]+\$)([A-Za-z0-9+/]*=*)$')
def _stage_beam_sdk(self, sdk_remote_location, staging_location, temp_dir):
  """Stages a Beam SDK file with the appropriate version.

  Args:
    sdk_remote_location: A URL from which the file can be downloaded or a
      remote file location. The SDK file can be a tarball or a wheel. Set
      to 'pypi' to download and stage a wheel and source SDK from PyPI.
    staging_location: Location where the SDK file should be copied.
    temp_dir: path to temporary location where the file should be
      downloaded.

  Returns:
    A list of SDK files that were staged to the staging location.

  Raises:
    RuntimeError: if staging was not successful.
  """
  if sdk_remote_location == 'pypi':
    sdk_local_file = Stager._download_pypi_sdk_package(temp_dir)
    sdk_sources_staged_name = Stager.\
        _desired_sdk_filename_in_staging_location(sdk_local_file)
    staged_path = FileSystems.join(staging_location, sdk_sources_staged_name)
    logging.info('Staging SDK sources from PyPI to %s', staged_path)
    self.stage_artifact(sdk_local_file, staged_path)
    staged_sdk_files = [sdk_sources_staged_name]
    try:
      # Stage binary distribution of the SDK, for now on a best-effort basis.
      sdk_local_file = Stager._download_pypi_sdk_package(
          temp_dir, fetch_binary=True)
      sdk_binary_staged_name = Stager.\
          _desired_sdk_filename_in_staging_location(sdk_local_file)
      staged_path = FileSystems.join(staging_location, sdk_binary_staged_name)
      logging.info('Staging binary distribution of the SDK from PyPI to %s',
                   staged_path)
      self.stage_artifact(sdk_local_file, staged_path)
      staged_sdk_files.append(sdk_binary_staged_name)
    except RuntimeError as e:
      logging.warn(
          'Failed to download requested binary distribution '
          'of the SDK: %s', repr(e))

    return staged_sdk_files
  elif Stager._is_remote_path(sdk_remote_location):
    local_download_file = os.path.join(temp_dir, 'beam-sdk.tar.gz')
    Stager._download_file(sdk_remote_location, local_download_file)
    staged_name = Stager._desired_sdk_filename_in_staging_location(
        sdk_remote_location)
    staged_path = FileSystems.join(staging_location, staged_name)
    logging.info('Staging Beam SDK from %s to %s', sdk_remote_location,
                 staged_path)
    self.stage_artifact(local_download_file, staged_path)
    return [staged_name]
  else:
    raise RuntimeError(
        'The --sdk_location option was used with an unsupported '
        'type of location: %s' % sdk_remote_location)
def test_unix_path_join(self, *unused_mocks):
  # Test joining of Unix paths.
  localfilesystem.os.path.join.side_effect = _gen_fake_join('/')
  self.assertEqual('/tmp/path/to/file',
                   FileSystems.join('/tmp/path', 'to', 'file'))
  self.assertEqual('/tmp/path/to/file',
                   FileSystems.join('/tmp/path', 'to/file'))
  self.assertEqual('/tmp/path/to/file',
                   FileSystems.join('/', 'tmp/path', 'to/file'))
  self.assertEqual('/tmp/path/to/file',
                   FileSystems.join('/tmp/', 'path', 'to/file'))
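Outside the mocked tests above, FileSystems.join dispatches on the path scheme and delegates to the matching filesystem. A minimal sketch with illustrative local paths (GCS-style 'gs://' paths route to the GCS filesystem when the apache_beam[gcp] extra is installed):

from apache_beam.io.filesystems import FileSystems

# Local paths resolve through LocalFileSystem.
print(FileSystems.join('/tmp/path', 'to', 'file'))  # /tmp/path/to/file
print(FileSystems.join('/tmp/path', 'to/file'))     # /tmp/path/to/file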
def write_orphaned_file(temp_dir, writer_key):
  temp_dir_path = FileSystems.join(dir, temp_dir)
  file_prefix_dir = FileSystems.join(temp_dir_path,
                                     str(abs(hash(writer_key))))
  file_name = '%s_%s' % (file_prefix_dir, uuid.uuid4())
  with FileSystems.create(file_name) as f:
    f.write(b'Hello y\'all')
  return file_name
def _stage_jar_packages(self, jar_packages, staging_location, temp_dir):
  # type: (...) -> List[str]

  """Stages a list of local jar packages for Java SDK Harness.

  :param jar_packages: Ordered list of local paths to jar packages to be
    staged. Only packages on the local file system and GCS are supported.
  :param staging_location: Staging location for the packages.
  :param temp_dir: Temporary folder where the resource building can happen.
  :return: A list of file names (no paths) for the resource staged. All the
    files are assumed to be staged in staging_location.
  :raises: RuntimeError: If files specified are not found or do not have
    expected name patterns.
  """
  resources = []  # type: List[str]
  staging_temp_dir = tempfile.mkdtemp(dir=temp_dir)
  local_packages = []  # type: List[str]
  for package in jar_packages:
    if not os.path.basename(package).endswith('.jar'):
      raise RuntimeError(
          'The --experiment=\'jar_packages=\' option expects a full path '
          'ending with ".jar" instead of %s' % package)

    if not os.path.isfile(package):
      if Stager._is_remote_path(package):
        # Download remote package.
        _LOGGER.info(
            'Downloading jar package: %s locally before staging', package)
        _, last_component = FileSystems.split(package)
        local_file_path = FileSystems.join(staging_temp_dir, last_component)
        Stager._download_file(package, local_file_path)
      else:
        raise RuntimeError(
            'The file %s cannot be found. It was specified in the '
            '--experiment=\'jar_packages=\' command line option.' % package)
    else:
      local_packages.append(package)

  local_packages.extend([
      FileSystems.join(staging_temp_dir, f)
      for f in os.listdir(staging_temp_dir)
  ])

  for package in local_packages:
    basename = os.path.basename(package)
    staged_path = FileSystems.join(staging_location, basename)
    self.stage_artifact(package, staged_path)
    resources.append(basename)

  return resources
def run(argv=None):
  pipeline_options = PipelineOptions(argv)
  options = pipeline_options.view_as(ParkdataPipelineOptions)
  # Save the main session that defines global import, functions and
  # variables. Otherwise they are not saved during the serialization.
  # Details see
  # https://cloud.google.com/dataflow/docs/resources/faq#how_do_i_handle_nameerrors
  pipeline_options.view_as(
      SetupOptions).save_main_session = options.save_session

  with beam.Pipeline(options=pipeline_options) as p:
    wikidata_data, commons_ids = (
        p
        | "wikidata_query/create" >> beam.Create(wd_queries())
        | "wikidata/query" >> wikidata.Query(
            FileSystems.join(options.base_path, "wikidata_query_cache.sqlite"),
            user_agent=options.user_agent,
        )
        | "wikidata/group" >> beam.GroupByKey()
        | "wikidata/fetch" >> wikidata.Transform(
            options.supported_languages(),
            cache_file=FileSystems.join(options.base_path,
                                        "wikidata_cache.sqlite"),
            user_agent=options.user_agent,
        ))
    commons_data = commons_ids | "commons" >> commons.Transform(
        FileSystems.join(options.base_path, "commons_cache.sqlite"),
        user_agent=options.user_agent)
    wikipedia_data = wikidata_data | "wikipedia" >> wikipedia.Transform(
        FileSystems.join(options.base_path, "wikipedia_qache.sqlite"),
        user_agent=options.user_agent)
    changed_places = (
        {
            Combine.TAG_COMMONS: commons_data,
            Combine.TAG_WIKIDATA: wikidata_data,
            Combine.TAG_WIKIPEDIA: wikipedia_data,
        }
        | "combine/group_by_key" >> beam.CoGroupByKey()
        | "combine/combine" >> beam.ParDo(Combine())
        | "combine/changed" >> beam.ParDo(
            OutputNewOrChangedEntires(
                FileSystems.join(options.base_path, "output.sqlite"))))
    (changed_places
     | "firestore_output/convert_types" >> beam.MapTuple(use_firestore_types)
     | "firestore_output/write" >> beam.ParDo(
         FirestoreWrite(project=options.project_id,
                        collection="places_v4",
                        credentials="gcp-service-account.json")))
def test_store_fileio_file_small_buffer_flush(self, FakeClient):
  input_dict = {}
  input_dict['project_id'] = "test_project"
  input_dict['region'] = "test_region"
  input_dict['dataset_id'] = "test_dataset_id"
  input_dict['dicom_store_id'] = "test_dicom_store_id"

  fc = FakeHttpClient()
  FakeClient.return_value = fc

  temp_dir = '%s%s' % (self._new_tempdir(), os.sep)
  dict_input_1 = {
      'PatientName': 'George', 'Age': 23, 'TestResult': 'Negative'
  }
  str_input_1 = json.dumps(dict_input_1)
  self._create_temp_file(dir=temp_dir, content=str_input_1)
  dict_input_2 = {'PatientName': 'Peter', 'Age': 54, 'TestResult': 'Positive'}
  str_input_2 = json.dumps(dict_input_2)
  self._create_temp_file(dir=temp_dir, content=str_input_2)
  dict_input_3 = {'PatientName': 'Zen', 'Age': 27, 'TestResult': 'Negative'}
  str_input_3 = json.dumps(dict_input_3)
  self._create_temp_file(dir=temp_dir, content=str_input_3)

  with TestPipeline() as p:
    results = (
        p
        | beam.Create([FileSystems.join(temp_dir, '*')])
        | fileio.MatchAll()
        | fileio.ReadMatches()
        | UploadToDicomStore(input_dict, 'fileio', buffer_size=1)
        | beam.Map(lambda x: x['success']))
    assert_that(results, equal_to([True] * 3))
  self.assertTrue(dict_input_1 in fc.dicom_metadata)
  self.assertTrue(dict_input_2 in fc.dicom_metadata)
  self.assertTrue(dict_input_3 in fc.dicom_metadata)
def create_job(self, job):
  """Creates job description. May stage and/or submit for remote execution."""
  self.create_job_description(job)

  # Stage and submit the job when necessary
  dataflow_job_file = job.options.view_as(DebugOptions).dataflow_job_file
  template_location = (
      job.options.view_as(GoogleCloudOptions).template_location)

  job_location = template_location or dataflow_job_file
  if job_location:
    gcs_or_local_path = os.path.dirname(job_location)
    file_name = os.path.basename(job_location)
    self.stage_file(gcs_or_local_path, file_name,
                    io.BytesIO(job.json().encode('utf-8')))

  if job.options.view_as(DebugOptions).lookup_experiment('upload_graph'):
    self.stage_file(
        job.options.view_as(GoogleCloudOptions).staging_location,
        "dataflow_graph.json", io.BytesIO(job.json().encode('utf-8')))
    del job.proto.steps[:]
    job.proto.stepsLocation = FileSystems.join(
        job.options.view_as(GoogleCloudOptions).staging_location,
        "dataflow_graph.json")

  if not template_location:
    return self.submit_job_description(job)

  _LOGGER.info('A template was just created at location %s',
               template_location)
  return None
def test_write_to_dynamic_destination(self):
  sink_params = [
      fileio.TextSink,    # pass a type signature
      fileio.TextSink()   # pass a FileSink object
  ]

  for sink in sink_params:
    dir = self._new_tempdir()

    with TestPipeline() as p:
      _ = (
          p
          | "Create" >> beam.Create(range(100))
          | beam.Map(lambda x: str(x))
          | fileio.WriteToFiles(
              path=dir,
              destination=lambda n: "odd" if int(n) % 2 else "even",
              sink=sink,
              file_naming=fileio.destination_prefix_naming("test")))

    with TestPipeline() as p:
      result = (
          p
          | fileio.MatchFiles(FileSystems.join(dir, '*'))
          | fileio.ReadMatches()
          | beam.Map(lambda f: (
              os.path.basename(f.metadata.path).split('-')[0],
              sorted(map(int, f.read_utf8().strip().split('\n'))))))

      assert_that(
          result,
          equal_to([('odd', list(range(1, 100, 2))),
                    ('even', list(range(0, 100, 2)))]))
def test_valid(self):
  file_pattern = FileSystems.join(self.test_data_dir, 'detail.json')
  expected_valid = [(1, {
      'error': [],
      'first_name': 'Bart',
      'last_name': 'Bruck',
      'email': '*****@*****.**',
      'id': 1
  }), (3, {
      'error': [u"email 'wtuppeny2bandcamp.com' is invalid"],
      'first_name': 'Winny',
      'last_name': 'Tuppeny',
      'email': None,
      'id': 3
  })]
  expected_broken = [{
      'error': 'id is missing',
      'element':
          '{"first_name":"Alfonso","last_name":"Koenen","email":"*****@*****.**"}'
  }]
  # Make use of the TestPipeline from the Beam testing util.
  with TestPipeline() as p:
    actual_valid, actual_broken = (p | Prepare(file_pattern))
    # The labels are required because otherwise the assert_that Transform
    # does not have a stable unique label.
    assert_that(actual_valid, equal_to(expected_valid), label='valid')
    assert_that(actual_broken, equal_to(expected_broken), label='broken')
def stage_job_resources(self,
                        resources,  # type: List[Tuple[str, str]]
                        staging_location=None  # type: Optional[str]
                       ):
  """For internal use only; no backwards-compatibility guarantees.

  Stages job resources to staging_location.

  Args:
    resources: A list of tuples of local file paths and file names (no
      paths) to be used for staging resources.
    staging_location: Location to stage the file.

  Returns:
    A list of file names (no paths) for the resources staged. All the
    files are assumed to be staged at staging_location.

  Raises:
    RuntimeError: If files specified are not found or error encountered
      while trying to create the resources (e.g., build a setup package).
  """
  # Make sure that all required options are specified.
  if staging_location is None:
    raise RuntimeError('The staging_location must be specified.')

  staged_resources = []
  for file_path, staged_path in resources:
    self.stage_artifact(
        file_path, FileSystems.join(staging_location, staged_path))
    staged_resources.append(staged_path)

  return staged_resources
def test_write_to_different_file_types_some_spilling(self):
  dir = self._new_tempdir()

  with TestPipeline() as p:
    _ = (
        p
        | beam.Create(WriteFilesTest.SIMPLE_COLLECTION)
        | beam.io.fileio.WriteToFiles(
            path=dir,
            destination=lambda record: record['foundation'],
            sink=lambda dest: (
                WriteFilesTest.CsvSink(WriteFilesTest.CSV_HEADERS)
                if dest == 'apache' else WriteFilesTest.JsonSink()),
            file_naming=fileio.destination_prefix_naming(),
            max_writers_per_bundle=1))

  with TestPipeline() as p:
    cncf_res = (
        p
        | fileio.MatchFiles(FileSystems.join(dir, 'cncf*'))
        | fileio.ReadMatches()
        | beam.FlatMap(lambda f: f.read_utf8().strip().split('\n'))
        | beam.Map(json.loads))

    apache_res = (
        p
        | "MatchApache" >> fileio.MatchFiles(FileSystems.join(dir, 'apache*'))
        | "ReadApache" >> fileio.ReadMatches()
        | "MapApache" >>
        beam.FlatMap(lambda rf: csv.reader(_get_file_reader(rf))))

    assert_that(
        cncf_res,
        equal_to([
            row for row in self.SIMPLE_COLLECTION
            if row['foundation'] == 'cncf'
        ]),
        label='verifyCNCF')

    assert_that(
        apache_res,
        equal_to([[row['project'], row['foundation']]
                  for row in self.SIMPLE_COLLECTION
                  if row['foundation'] == 'apache']),
        label='verifyApache')
class TestPrepare(unittest.TestCase):
  test_data_dir = FileSystems.join(
      os.path.dirname(os.path.realpath(__file__)), 'testdata')

  def test_valid(self):
    file_pattern = FileSystems.join(self.test_data_dir, 'order.json')
    expected_valid = [
        (880, {
            'id': 880,
            'customer_id': 1,
            'total_price': Decimal('287.69'),
            'error': [],
        }),
        (1342, {
            'id': 1342,
            'customer_id': 2,
            'total_price': Decimal('194.52'),
            'error': [],
        }),
        (1766, {
            'id': 1766,
            'customer_id': 2,
            'total_price': Decimal('985.00'),
            'error': [],
        }),
        (2924, {
            'id': 2924,
            'customer_id': 2,
            'total_price': Decimal('837.23'),
            'error': [],
        }),
        (3607, {
            'id': 3607,
            'customer_id': 3,
            'total_price': Decimal('373.02'),
            'error': [],
        }),
        (3949, {
            'id': 3949,
            'customer_id': 3,
            'total_price': Decimal('702.88'),
            'error': [],
        }),
    ]
    expected_broken = [{
        'error': 'id is missing',
        'element': '{"customer_id":3,"total_price":"707.16"}'
    }]
    # Make use of the TestPipeline from the Beam testing util.
    with TestPipeline() as p:
      actual_valid, actual_broken = (p | Prepare(file_pattern))
      # The labels are required because otherwise the assert_that Transform
      # does not have a stable unique label.
      assert_that(actual_valid, equal_to(expected_valid), label='valid')
      assert_that(actual_broken, equal_to(expected_broken), label='broken')
def file_copy(from_path, to_path):
  if not from_path.endswith(names.PICKLED_MAIN_SESSION_FILE):
    self.assertEqual(expected_from_path, from_path)
    self.assertEqual(
        FileSystems.join(expected_to_dir, names.DATAFLOW_SDK_TARBALL_FILE),
        to_path)
  if from_path.startswith('gs://') or to_path.startswith('gs://'):
    logging.info('Faking file_copy(%s, %s)', from_path, to_path)
  else:
    shutil.copyfile(from_path, to_path)
def open_writer(self, init_result, uid):
  # A proper suffix is needed for AUTO compression detection.
  # We also ensure there will be no collisions with uid and a
  # (possibly unsharded) file_path_prefix and a (possibly empty)
  # file_name_suffix.
  file_path_prefix = self.file_path_prefix.get()
  file_name_suffix = self.file_name_suffix.get()
  suffix = ('.' + os.path.basename(file_path_prefix) + file_name_suffix)
  writer_path = FileSystems.join(init_result, uid) + suffix
  return FileBasedSinkWriter(self, writer_path)
def _create_temp_dir(self, file_path_prefix):
  base_path, last_component = FileSystems.split(file_path_prefix)
  if not last_component:
    # Trying to re-split the base_path to check if it's a root.
    new_base_path, _ = FileSystems.split(base_path)
    if base_path == new_base_path:
      raise ValueError('Cannot create a temporary directory for root path '
                       'prefix %s. Please specify a file path prefix with '
                       'at least two components.' % file_path_prefix)
  path_components = [
      base_path, 'beam-temp-' + last_component + '-' + uuid.uuid1().hex
  ]
  return FileSystems.join(*path_components)
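A minimal sketch of the path arithmetic performed above, with a hypothetical output prefix (not from the source), showing the shape of the temporary directory created next to the final output:

import uuid

from apache_beam.io.filesystems import FileSystems

file_path_prefix = '/tmp/output/results'  # hypothetical output prefix
base_path, last_component = FileSystems.split(file_path_prefix)
temp_dir = FileSystems.join(
    base_path, 'beam-temp-' + last_component + '-' + uuid.uuid1().hex)
print(temp_dir)  # e.g. /tmp/output/beam-temp-results-<hex>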
def _check_state_for_finalize_write(self, writer_results, num_shards):
  """Checks writer output files' states.

  Returns:
    src_files, dst_files: Lists of files to rename. For each i,
      finalize_write should rename(src_files[i], dst_files[i]).
    delete_files: Src files to delete. These could be leftovers from an
      incomplete (non-atomic) rename operation.
    num_skipped: Tally of writer results files already renamed, such as
      from a previous run of finalize_write().
  """
  if not writer_results:
    return [], [], [], 0

  src_glob = FileSystems.join(
      FileSystems.split(writer_results[0])[0], '*')
  dst_glob = self._get_final_name_glob(num_shards)
  src_glob_files = set(
      file_metadata.path for mr in FileSystems.match([src_glob])
      for file_metadata in mr.metadata_list)
  dst_glob_files = set(
      file_metadata.path for mr in FileSystems.match([dst_glob])
      for file_metadata in mr.metadata_list)

  src_files = []
  dst_files = []
  delete_files = []
  num_skipped = 0
  for shard_num, src in enumerate(writer_results):
    final_name = self._get_final_name(shard_num, num_shards)
    dst = final_name
    src_exists = src in src_glob_files
    dst_exists = dst in dst_glob_files
    if not src_exists and not dst_exists:
      raise BeamIOError(
          'src and dst files do not exist. src: %s, dst: %s' % (src, dst))
    if not src_exists and dst_exists:
      logging.debug('src: %s -> dst: %s already renamed, skipping', src, dst)
      num_skipped += 1
      continue
    if (src_exists and dst_exists and
        FileSystems.checksum(src) == FileSystems.checksum(dst)):
      logging.debug('src: %s == dst: %s, deleting src', src, dst)
      delete_files.append(src)
      continue

    src_files.append(src)
    dst_files.append(dst)

  return src_files, dst_files, delete_files, num_skipped
def test_basic_file_name_provided(self):
  content = 'TestingMyContent\nIn multiple lines\nhaha!'
  dir = '%s%s' % (self._new_tempdir(), os.sep)
  self._create_temp_file(dir=dir, content=content)

  with TestPipeline() as p:
    content_pc = (
        p
        | beam.Create([FileSystems.join(dir, '*')])
        | fileio.MatchAll()
        | fileio.ReadMatches()
        | beam.FlatMap(lambda f: f.read().decode('utf-8').splitlines()))

    assert_that(content_pc, equal_to(content.splitlines()))
def test_basic_two_files(self):
  files = []
  tempdir = '%s%s' % (self._new_tempdir(), os.sep)

  # Create a couple files to be matched
  files.append(self._create_temp_file(dir=tempdir))
  files.append(self._create_temp_file(dir=tempdir))

  with TestPipeline() as p:
    files_pc = (
        p
        | fileio.MatchFiles(FileSystems.join(tempdir, '*'))
        | beam.Map(lambda x: x.path))

    assert_that(files_pc, equal_to(files))
def test_csv_file_source(self):
  content = 'name,year,place\ngoogle,1999,CA\nspotify,2006,sweden'
  rows = [r.split(',') for r in content.split('\n')]

  dir = '%s%s' % (self._new_tempdir(), os.sep)
  self._create_temp_file(dir=dir, content=content)

  with TestPipeline() as p:
    content_pc = (
        p
        | beam.Create([FileSystems.join(dir, '*')])
        | fileio.MatchAll()
        | fileio.ReadMatches()
        | beam.FlatMap(lambda rf: csv.reader(_get_file_reader(rf))))

    assert_that(content_pc, equal_to(rows))
def test_find_orphaned_files(self):
  dir = self._new_tempdir()

  write_transform = beam.io.fileio.WriteToFiles(path=dir)

  def write_orphaned_file(temp_dir, writer_key):
    temp_dir_path = FileSystems.join(dir, temp_dir)
    file_prefix_dir = FileSystems.join(temp_dir_path,
                                       str(abs(hash(writer_key))))
    file_name = '%s_%s' % (file_prefix_dir, uuid.uuid4())
    with FileSystems.create(file_name) as f:
      f.write(b'Hello y\'all')
    return file_name

  with TestPipeline() as p:
    _ = (
        p
        | beam.Create(WriteFilesTest.SIMPLE_COLLECTION)
        | "Serialize" >> beam.Map(json.dumps)
        | write_transform)

    # Pre-create the temp directory.
    temp_dir_path = FileSystems.mkdirs(
        FileSystems.join(dir, write_transform._temp_directory.get()))
    write_orphaned_file(write_transform._temp_directory.get(),
                        (None, GlobalWindow()))
    f2 = write_orphaned_file(write_transform._temp_directory.get(),
                             ('other-dest', GlobalWindow()))

  temp_dir_path = FileSystems.join(dir, write_transform._temp_directory.get())
  leftovers = FileSystems.match(['%s%s*' % (temp_dir_path, os.sep)])
  found_files = [m.path for m in leftovers[0].metadata_list]
  self.assertListEqual(found_files, [f2])
def _check_state_for_finalize_write(self, writer_results, num_shards):
  """Checks writer output files' states.

  Returns:
    src_files, dst_files: Lists of files to rename. For each i,
      finalize_write should rename(src_files[i], dst_files[i]).
    delete_files: Src files to delete. These could be leftovers from an
      incomplete (non-atomic) rename operation.
    num_skipped: Tally of writer results files already renamed, such as
      from a previous run of finalize_write().
  """
  if not writer_results:
    return [], [], [], 0

  src_glob = FileSystems.join(FileSystems.split(writer_results[0])[0], '*')
  dst_glob = self._get_final_name_glob(num_shards)
  src_glob_files = set(
      file_metadata.path for mr in FileSystems.match([src_glob])
      for file_metadata in mr.metadata_list)
  dst_glob_files = set(
      file_metadata.path for mr in FileSystems.match([dst_glob])
      for file_metadata in mr.metadata_list)

  src_files = []
  dst_files = []
  delete_files = []
  num_skipped = 0
  for shard_num, src in enumerate(writer_results):
    final_name = self._get_final_name(shard_num, num_shards)
    dst = final_name
    src_exists = src in src_glob_files
    dst_exists = dst in dst_glob_files
    if not src_exists and not dst_exists:
      raise BeamIOError('src and dst files do not exist. src: %s, dst: %s' % (
          src, dst))
    if not src_exists and dst_exists:
      logging.debug('src: %s -> dst: %s already renamed, skipping', src, dst)
      num_skipped += 1
      continue
    if (src_exists and dst_exists and
        FileSystems.checksum(src) == FileSystems.checksum(dst)):
      logging.debug('src: %s == dst: %s, deleting src', src, dst)
      delete_files.append(src)
      continue

    src_files.append(src)
    dst_files.append(dst)

  return src_files, dst_files, delete_files, num_skipped
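The src_glob computed above is simply the parent directory of the first writer result joined with a wildcard. A minimal sketch with a hypothetical writer-result path (not from the source):

from apache_beam.io.filesystems import FileSystems

writer_results = ['/tmp/out/beam-temp-results-abc123/uid-0.txt']  # hypothetical
src_glob = FileSystems.join(FileSystems.split(writer_results[0])[0], '*')
print(src_glob)  # /tmp/out/beam-temp-results-abc123/*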
def pip_fake(args):
  """Fakes fetching a package from pip by creating a temporary file.

  Args:
    args: a complete list of command line arguments to invoke pip.
      The fake is sensitive to the order of the arguments.
      Supported commands:

      1) Download SDK sources file:
      python pip -m download --dest /tmp/dir apache-beam==2.0.0 \
          --no-deps --no-binary :all:

      2) Download SDK binary wheel file:
      python pip -m download --dest /tmp/dir apache-beam==2.0.0 \
          --no-deps --no-binary :all: --python-version 27 \
          --implementation cp --abi cp27mu --platform manylinux1_x86_64
  """
  package_file = None
  if len(args) >= 8:
    # package_name==x.y.z
    if '==' in args[6]:
      distribution_name = args[6][0:args[6].find('==')]
      distribution_version = args[6][args[6].find('==') + 2:]

      if args[8] == '--no-binary':
        package_file = '%s-%s.zip' % (distribution_name,
                                      distribution_version)
      elif args[8] == '--only-binary' and len(args) >= 18:
        if not has_wheels:
          # Imitate the case when desired wheel distribution is not in PyPI.
          raise RuntimeError('No matching distribution.')

        # Per PEP-0427 in wheel filenames non-alphanumeric characters
        # in distribution name are replaced with underscore.
        distribution_name = distribution_name.replace('-', '_')
        package_file = '%s-%s-%s%s-%s-%s.whl' % (
            distribution_name,
            distribution_version,
            args[13],  # implementation
            args[11],  # python version
            args[15],  # abi tag
            args[17]  # platform
        )

  assert package_file, 'Pip fake does not support the command: ' + str(args)
  self.create_temp_file(
      FileSystems.join(args[5], package_file), 'Package content.')
def test_run_example_with_setup_file(self):
  pipeline = TestPipeline(is_integration_test=True)
  coordinate_output = FileSystems.join(
      pipeline.get_option('output'),
      'juliaset-{}'.format(str(uuid.uuid4())),
      'coordinates.txt')
  extra_args = {
      'coordinate_output': coordinate_output,
      'grid_size': self.GRID_SIZE,
      'setup_file': os.path.normpath(
          os.path.join(os.path.dirname(__file__), '..', 'setup.py')),
      'on_success_matcher': all_of(PipelineStateMatcher(PipelineState.DONE)),
  }
  args = pipeline.get_full_options_as_args(**extra_args)

  juliaset.run(args)
def pip_fake(args):
  """Fakes fetching a package from pip by creating a temporary file.

  Args:
    args: a complete list of command line arguments to invoke pip.
      The fake is sensitive to the order of the arguments.
      Supported commands:

      1) Download SDK sources file:
      python pip -m download --dest /tmp/dir apache-beam==2.0.0 \
          --no-deps --no-binary :all:

      2) Download SDK binary wheel file:
      python pip -m download --dest /tmp/dir apache-beam==2.0.0 \
          --no-deps --no-binary :all: --python-version 27 \
          --implementation cp --abi cp27mu --platform manylinux1_x86_64
  """
  package_file = None
  if len(args) >= 8:
    # package_name==x.y.z
    if '==' in args[6]:
      distribution_name = args[6][0:args[6].find('==')]
      distribution_version = args[6][args[6].find('==') + 2:]

      if args[8] == '--no-binary':
        package_file = '%s-%s.zip' % (distribution_name,
                                      distribution_version)
      elif args[8] == '--only-binary' and len(args) >= 18:
        if not has_wheels:
          # Imitate the case when desired wheel distribution is not in PyPI.
          raise RuntimeError('No matching distribution.')

        # Per PEP-0427 in wheel filenames non-alphanumeric characters
        # in distribution name are replaced with underscore.
        distribution_name = distribution_name.replace('-', '_')
        package_file = '%s-%s-%s%s-%s-%s.whl' % (
            distribution_name,
            distribution_version,
            args[13],  # implementation
            args[11],  # python version
            args[15],  # abi tag
            args[17]  # platform
        )

  assert package_file, 'Pip fake does not support the command: ' + str(args)
  self.create_temp_file(
      FileSystems.join(args[5], package_file), 'Package content.')
def test_match_files_one_directory_failure(self):
  directories = [
      '%s%s' % (self._new_tempdir(), os.sep),
      '%s%s' % (self._new_tempdir(), os.sep)
  ]

  files = list()
  files.append(self._create_temp_file(dir=directories[0]))
  files.append(self._create_temp_file(dir=directories[0]))

  with TestPipeline() as p:
    files_pc = (
        p
        | beam.Create([FileSystems.join(d, '*') for d in directories])
        | fileio.MatchAll(fileio.EmptyMatchTreatment.ALLOW_IF_WILDCARD)
        | beam.Map(lambda x: x.path))

    assert_that(files_pc, equal_to(files))
def _merge_headers(known_args, pipeline_args, pipeline_mode):
  """Merges VCF headers using beam based on pipeline_mode."""
  if known_args.representative_header_file:
    return

  options = PipelineOptions(pipeline_args)

  # Always run pipeline locally if data is small.
  if (pipeline_mode == PipelineModes.SMALL and
      not known_args.infer_undefined_headers):
    options.view_as(StandardOptions).runner = 'DirectRunner'

  google_cloud_options = options.view_as(GoogleCloudOptions)
  if google_cloud_options.job_name:
    google_cloud_options.job_name += '-' + _MERGE_HEADERS_JOB_NAME
  else:
    google_cloud_options.job_name = _MERGE_HEADERS_JOB_NAME

  temp_directory = google_cloud_options.temp_location or tempfile.mkdtemp()
  # Add a time prefix to ensure files are unique in case multiple
  # pipelines are run at the same time.
  temp_merged_headers_file_name = '-'.join([
      datetime.datetime.now().strftime('%Y%m%d-%H%M%S'),
      google_cloud_options.job_name,
      _MERGE_HEADERS_FILE_NAME])
  known_args.representative_header_file = FileSystems.join(
      temp_directory, temp_merged_headers_file_name)

  with beam.Pipeline(options=options) as p:
    headers = p
    if pipeline_mode == PipelineModes.LARGE:
      headers |= (beam.Create([known_args.input_pattern])
                  | vcf_header_io.ReadAllVcfHeaders())
    else:
      headers |= vcf_header_io.ReadVcfHeaders(known_args.input_pattern)

    merged_header = (headers
                     | 'MergeHeaders' >> merge_headers.MergeHeaders(
                         known_args.split_alternate_allele_info_fields))
    if known_args.infer_undefined_headers:
      merged_header = _add_inferred_headers(p, known_args, merged_header)

    _ = (merged_header
         | 'WriteHeaders' >> vcf_header_io.WriteVcfHeaders(
             known_args.representative_header_file))
def create_job_description(self, job):
  """Creates a job described by the workflow proto."""

  # Stage the pipeline for the runner harness
  self.stage_file(job.google_cloud_options.staging_location,
                  names.STAGED_PIPELINE_FILENAME,
                  io.BytesIO(job.proto_pipeline.SerializeToString()))

  # Stage other resources for the SDK harness
  resources = self._stage_resources(job.options)

  job.proto.environment = Environment(
      pipeline_url=FileSystems.join(job.google_cloud_options.staging_location,
                                    names.STAGED_PIPELINE_FILENAME),
      packages=resources,
      options=job.options,
      environment_version=self.environment_version).proto
  logging.debug('JOB: %s', job)
def create_job_description(self, job):
  """Creates a job described by the workflow proto."""

  # Stage the pipeline for the runner harness
  self.stage_file(job.google_cloud_options.staging_location,
                  shared_names.STAGED_PIPELINE_FILENAME,
                  io.BytesIO(job.proto_pipeline.SerializeToString()))

  # Stage other resources for the SDK harness
  resources = self._stage_resources(job.options)

  job.proto.environment = Environment(
      pipeline_url=FileSystems.join(job.google_cloud_options.staging_location,
                                    shared_names.STAGED_PIPELINE_FILENAME),
      packages=resources,
      options=job.options,
      environment_version=self.environment_version).proto
  logging.debug('JOB: %s', job)
def test_read_gzip_compressed_file_without_suffix(self):
  dir = '%s%s' % (self._new_tempdir(), os.sep)

  file_contents = b'compressed_contents!'
  import gzip
  with gzip.GzipFile(os.path.join(dir, 'compressed'), 'w') as f:
    f.write(file_contents)

  with TestPipeline() as p:
    content_pc = (
        p
        | beam.Create([FileSystems.join(dir, '*')])
        | fileio.MatchAll()
        | fileio.ReadMatches()
        | beam.Map(lambda rf: rf.open(
            compression_type=CompressionTypes.GZIP).read(len(file_contents))))

    assert_that(content_pc, equal_to([file_contents]))
def _stage_beam_sdk(sdk_remote_location, staging_location, temp_dir):
  """Stages a Beam SDK file with the appropriate version.

  Args:
    sdk_remote_location: A GCS path to an SDK file or a URL from which the
      file can be downloaded. The SDK file can be a tarball or a wheel.
      Set to 'pypi' to download and stage a wheel and source SDK from PyPI.
    staging_location: A GCS bucket where the SDK file should be copied.
    temp_dir: path to temporary location where the file should be
      downloaded.

  Returns:
    A list of SDK files that were staged to the staging location.

  Raises:
    RuntimeError: if staging was not successful.
  """
  if (sdk_remote_location.startswith('http://') or
      sdk_remote_location.startswith('https://')):
    local_download_file = _dependency_file_download(
        sdk_remote_location, temp_dir)
    staged_name = _desired_sdk_filename_in_staging_location(
        local_download_file)
    staged_path = FileSystems.join(staging_location, staged_name)
    logging.info(
        'Staging Beam SDK from %s to %s', sdk_remote_location, staged_path)
    _dependency_file_copy(local_download_file, staged_path)
    return [staged_name]
  elif sdk_remote_location.startswith('gs://'):
    # Stage the file to the GCS staging area.
    staged_name = _desired_sdk_filename_in_staging_location(
        sdk_remote_location)
    staged_path = FileSystems.join(staging_location, staged_name)
    logging.info(
        'Staging Beam SDK from %s to %s', sdk_remote_location, staged_path)
    _dependency_file_copy(sdk_remote_location, staged_path)
    return [staged_name]
  elif sdk_remote_location == 'pypi':
    sdk_local_file = _download_pypi_sdk_package(temp_dir)
    sdk_sources_staged_name = _desired_sdk_filename_in_staging_location(
        sdk_local_file)
    staged_path = FileSystems.join(staging_location, sdk_sources_staged_name)
    logging.info('Staging SDK sources from PyPI to %s', staged_path)
    _dependency_file_copy(sdk_local_file, staged_path)
    staged_sdk_files = [sdk_sources_staged_name]
    try:
      # Stage binary distribution of the SDK, for now on a best-effort basis.
      sdk_local_file = _download_pypi_sdk_package(temp_dir, fetch_binary=True)
      sdk_binary_staged_name = _desired_sdk_filename_in_staging_location(
          sdk_local_file)
      staged_path = FileSystems.join(staging_location, sdk_binary_staged_name)
      logging.info('Staging binary distribution of the SDK from PyPI to %s',
                   staged_path)
      _dependency_file_copy(sdk_local_file, staged_path)
      staged_sdk_files.append(sdk_binary_staged_name)
    except RuntimeError as e:
      logging.warn('Failed to download requested binary distribution '
                   'of the SDK: %s', repr(e))
    return staged_sdk_files
  else:
    raise RuntimeError(
        'The --sdk_location option was used with an unsupported '
        'type of location: %s' % sdk_remote_location)
def stage_job_resources(
    options, file_copy=_dependency_file_copy, build_setup_args=None,
    temp_dir=None, populate_requirements_cache=_populate_requirements_cache):
  """For internal use only; no backwards-compatibility guarantees.

  Creates (if needed) and stages job resources to options.staging_location.

  Args:
    options: Command line options. More specifically the function will
      expect staging_location, requirements_file, setup_file, and
      save_main_session options to be present.
    file_copy: Callable for copying files. The default version will copy
      from a local file to a GCS location using the gsutil tool available
      in the Google Cloud SDK package.
    build_setup_args: A list of command line arguments used to build a
      setup package. Used only if options.setup_file is not None. Used
      only for testing.
    temp_dir: Temporary folder where the resource building can happen. If
      None then a unique temp directory will be created. Used only for
      testing.
    populate_requirements_cache: Callable for populating the requirements
      cache. Used only for testing.

  Returns:
    A list of file names (no paths) for the resources staged. All the
    files are assumed to be staged in options.staging_location.

  Raises:
    RuntimeError: If files specified are not found or error encountered
      while trying to create the resources (e.g., build a setup package).
  """
  temp_dir = temp_dir or tempfile.mkdtemp()
  resources = []

  google_cloud_options = options.view_as(GoogleCloudOptions)
  setup_options = options.view_as(SetupOptions)
  # Make sure that all required options are specified. There are a few that
  # have defaults to support local running scenarios.
  if google_cloud_options.staging_location is None:
    raise RuntimeError('The --staging_location option must be specified.')
  if google_cloud_options.temp_location is None:
    raise RuntimeError('The --temp_location option must be specified.')

  # Stage a requirements file if present.
  if setup_options.requirements_file is not None:
    if not os.path.isfile(setup_options.requirements_file):
      raise RuntimeError(
          'The file %s cannot be found. It was specified in the '
          '--requirements_file command line option.' %
          setup_options.requirements_file)
    staged_path = FileSystems.join(google_cloud_options.staging_location,
                                   REQUIREMENTS_FILE)
    file_copy(setup_options.requirements_file, staged_path)
    resources.append(REQUIREMENTS_FILE)
    requirements_cache_path = (
        os.path.join(tempfile.gettempdir(), 'dataflow-requirements-cache')
        if setup_options.requirements_cache is None
        else setup_options.requirements_cache)
    # Populate cache with packages from requirements and stage the files
    # in the cache.
    if not os.path.exists(requirements_cache_path):
      os.makedirs(requirements_cache_path)
    populate_requirements_cache(
        setup_options.requirements_file, requirements_cache_path)
    for pkg in glob.glob(os.path.join(requirements_cache_path, '*')):
      file_copy(pkg, FileSystems.join(google_cloud_options.staging_location,
                                      os.path.basename(pkg)))
      resources.append(os.path.basename(pkg))

  # Handle a setup file if present.
  # We will build the setup package locally and then copy it to the staging
  # location because the staging location is a GCS path and the file cannot
  # be created directly there.
  if setup_options.setup_file is not None:
    if not os.path.isfile(setup_options.setup_file):
      raise RuntimeError(
          'The file %s cannot be found. It was specified in the '
          '--setup_file command line option.' % setup_options.setup_file)
    if os.path.basename(setup_options.setup_file) != 'setup.py':
      raise RuntimeError(
          'The --setup_file option expects the full path to a file named '
          'setup.py instead of %s' % setup_options.setup_file)
    tarball_file = _build_setup_package(setup_options.setup_file, temp_dir,
                                        build_setup_args)
    staged_path = FileSystems.join(google_cloud_options.staging_location,
                                   WORKFLOW_TARBALL_FILE)
    file_copy(tarball_file, staged_path)
    resources.append(WORKFLOW_TARBALL_FILE)

  # Handle extra local packages that should be staged.
  if setup_options.extra_packages is not None:
    resources.extend(
        _stage_extra_packages(setup_options.extra_packages,
                              google_cloud_options.staging_location,
                              temp_dir=temp_dir, file_copy=file_copy))

  # Pickle the main session if requested.
  # We will create the pickled main session locally and then copy it to the
  # staging location because the staging location is a GCS path and the file
  # cannot be created directly there.
  if setup_options.save_main_session:
    pickled_session_file = os.path.join(temp_dir,
                                        names.PICKLED_MAIN_SESSION_FILE)
    pickler.dump_session(pickled_session_file)
    staged_path = FileSystems.join(google_cloud_options.staging_location,
                                   names.PICKLED_MAIN_SESSION_FILE)
    file_copy(pickled_session_file, staged_path)
    resources.append(names.PICKLED_MAIN_SESSION_FILE)

  if hasattr(setup_options, 'sdk_location'):
    if setup_options.sdk_location == 'default':
      stage_tarball_from_remote_location = True
    elif (setup_options.sdk_location.startswith('gs://') or
          setup_options.sdk_location.startswith('http://') or
          setup_options.sdk_location.startswith('https://')):
      stage_tarball_from_remote_location = True
    else:
      stage_tarball_from_remote_location = False

    staged_path = FileSystems.join(google_cloud_options.staging_location,
                                   names.DATAFLOW_SDK_TARBALL_FILE)
    if stage_tarball_from_remote_location:
      # If --sdk_location is not specified then the appropriate package
      # will be obtained from PyPI (https://pypi.python.org) based on the
      # version of the currently running SDK. If the option is
      # present then no version matching is made and the exact URL or path
      # is expected.
      #
      # Unit tests running in the 'python setup.py test' context will
      # not have the sdk_location attribute present and therefore we
      # will not stage a tarball.
      if setup_options.sdk_location == 'default':
        sdk_remote_location = 'pypi'
      else:
        sdk_remote_location = setup_options.sdk_location
      _stage_beam_sdk_tarball(sdk_remote_location, staged_path, temp_dir)
      resources.append(names.DATAFLOW_SDK_TARBALL_FILE)
    else:
      # Check if we have a local Beam SDK tarball present. This branch is
      # used by tests running with the SDK built at head.
      if setup_options.sdk_location == 'default':
        module_path = os.path.abspath(__file__)
        sdk_path = os.path.join(os.path.dirname(module_path), '..', '..',
                                '..', names.DATAFLOW_SDK_TARBALL_FILE)
      elif os.path.isdir(setup_options.sdk_location):
        sdk_path = os.path.join(setup_options.sdk_location,
                                names.DATAFLOW_SDK_TARBALL_FILE)
      else:
        sdk_path = setup_options.sdk_location
      if os.path.isfile(sdk_path):
        logging.info('Copying Beam SDK "%s" to staging location.', sdk_path)
        file_copy(sdk_path, staged_path)
        resources.append(names.DATAFLOW_SDK_TARBALL_FILE)
      else:
        if setup_options.sdk_location == 'default':
          raise RuntimeError(
              'Cannot find default Beam SDK tar file "%s"' % sdk_path)
        elif not setup_options.sdk_location:
          logging.info('Beam SDK will not be staged since --sdk_location '
                       'is empty.')
        else:
          raise RuntimeError(
              'The file "%s" cannot be found. Its location was specified by '
              'the --sdk_location command-line option.' % sdk_path)

  # Delete all temp files created while staging job resources.
  shutil.rmtree(temp_dir)
  return resources
def _stage_extra_packages(extra_packages, staging_location, temp_dir,
                          file_copy=_dependency_file_copy):
  """Stages a list of local extra packages.

  Args:
    extra_packages: Ordered list of local paths to extra packages to be
      staged.
    staging_location: Staging location for the packages.
    temp_dir: Temporary folder where the resource building can happen.
      Caller is responsible for cleaning up this folder after this function
      returns.
    file_copy: Callable for copying files. The default version will copy
      from a local file to a GCS location using the gsutil tool available
      in the Google Cloud SDK package.

  Returns:
    A list of file names (no paths) for the resources staged. All the
    files are assumed to be staged in staging_location.

  Raises:
    RuntimeError: If files specified are not found or do not have expected
      name patterns.
  """
  resources = []
  staging_temp_dir = None
  local_packages = []
  for package in extra_packages:
    if not (os.path.basename(package).endswith('.tar') or
            os.path.basename(package).endswith('.tar.gz') or
            os.path.basename(package).endswith('.whl') or
            os.path.basename(package).endswith('.zip')):
      raise RuntimeError(
          'The --extra_package option expects a full path ending with '
          '".tar", ".tar.gz", ".whl" or ".zip" instead of %s' % package)
    if os.path.basename(package).endswith('.whl'):
      logging.warning(
          'The .whl package "%s" is provided in --extra_package. '
          'This functionality is not officially supported. Since wheel '
          'packages are binary distributions, this package must be '
          'binary-compatible with the worker environment (e.g. Python 2.7 '
          'running on an x64 Linux host).' % package)

    if not os.path.isfile(package):
      if package.startswith('gs://'):
        if not staging_temp_dir:
          staging_temp_dir = tempfile.mkdtemp(dir=temp_dir)
        logging.info('Downloading extra package: %s locally before staging',
                     package)
        if os.path.isfile(staging_temp_dir):
          local_file_path = staging_temp_dir
        else:
          _, last_component = FileSystems.split(package)
          local_file_path = FileSystems.join(staging_temp_dir, last_component)
        _dependency_file_copy(package, local_file_path)
      else:
        raise RuntimeError(
            'The file %s cannot be found. It was specified in the '
            '--extra_packages command line option.' % package)
    else:
      local_packages.append(package)

  if staging_temp_dir:
    local_packages.extend(
        [FileSystems.join(staging_temp_dir, f)
         for f in os.listdir(staging_temp_dir)])

  for package in local_packages:
    basename = os.path.basename(package)
    staged_path = FileSystems.join(staging_location, basename)
    file_copy(package, staged_path)
    resources.append(basename)

  # Create a file containing the list of extra packages and stage it.
  # The file is important so that in the worker the packages are installed
  # exactly in the order specified. This approach will avoid extra PyPI
  # requests. For example if package A depends on package B and package A
  # is installed first then the installer will try to satisfy the
  # dependency on B by downloading the package from PyPI. If package B is
  # installed first this is avoided.
  with open(os.path.join(temp_dir, EXTRA_PACKAGES_FILE), 'wt') as f:
    for package in local_packages:
      f.write('%s\n' % os.path.basename(package))
  staged_path = FileSystems.join(staging_location, EXTRA_PACKAGES_FILE)
  # Note that the caller of this function is responsible for deleting the
  # temporary folder where all temp files are created, including this one.
  file_copy(os.path.join(temp_dir, EXTRA_PACKAGES_FILE), staged_path)
  resources.append(EXTRA_PACKAGES_FILE)
  return resources
def stage_job_resources(self,
                        options,
                        build_setup_args=None,
                        temp_dir=None,
                        populate_requirements_cache=None,
                        staging_location=None):
  """For internal use only; no backwards-compatibility guarantees.

  Creates (if needed) and stages job resources to staging_location.

  Args:
    options: Command line options. More specifically the function will
      expect requirements_file, setup_file, and save_main_session options
      to be present.
    build_setup_args: A list of command line arguments used to build a
      setup package. Used only if options.setup_file is not None. Used
      only for testing.
    temp_dir: Temporary folder where the resource building can happen. If
      None then a unique temp directory will be created. Used only for
      testing.
    populate_requirements_cache: Callable for populating the requirements
      cache. Used only for testing.
    staging_location: Location to stage the file.

  Returns:
    A list of file names (no paths) for the resources staged. All the
    files are assumed to be staged at staging_location.

  Raises:
    RuntimeError: If files specified are not found or error encountered
      while trying to create the resources (e.g., build a setup package).
  """
  temp_dir = temp_dir or tempfile.mkdtemp()
  resources = []

  setup_options = options.view_as(SetupOptions)
  # Make sure that all required options are specified.
  if staging_location is None:
    raise RuntimeError('The staging_location must be specified.')

  # Stage a requirements file if present.
  if setup_options.requirements_file is not None:
    if not os.path.isfile(setup_options.requirements_file):
      raise RuntimeError(
          'The file %s cannot be found. It was specified in the '
          '--requirements_file command line option.' %
          setup_options.requirements_file)
    staged_path = FileSystems.join(staging_location, REQUIREMENTS_FILE)
    self.stage_artifact(setup_options.requirements_file, staged_path)
    resources.append(REQUIREMENTS_FILE)
    requirements_cache_path = (
        os.path.join(tempfile.gettempdir(), 'dataflow-requirements-cache')
        if setup_options.requirements_cache is None else
        setup_options.requirements_cache)
    # Populate cache with packages from requirements and stage the files
    # in the cache.
    if not os.path.exists(requirements_cache_path):
      os.makedirs(requirements_cache_path)
    (populate_requirements_cache if populate_requirements_cache else
     Stager._populate_requirements_cache)(setup_options.requirements_file,
                                          requirements_cache_path)
    for pkg in glob.glob(os.path.join(requirements_cache_path, '*')):
      self.stage_artifact(
          pkg, FileSystems.join(staging_location, os.path.basename(pkg)))
      resources.append(os.path.basename(pkg))

  # Handle a setup file if present.
  # We will build the setup package locally and then copy it to the staging
  # location because the staging location is a remote path and the file
  # cannot be created directly there.
  if setup_options.setup_file is not None:
    if not os.path.isfile(setup_options.setup_file):
      raise RuntimeError(
          'The file %s cannot be found. It was specified in the '
          '--setup_file command line option.' % setup_options.setup_file)
    if os.path.basename(setup_options.setup_file) != 'setup.py':
      raise RuntimeError(
          'The --setup_file option expects the full path to a file named '
          'setup.py instead of %s' % setup_options.setup_file)
    tarball_file = Stager._build_setup_package(setup_options.setup_file,
                                               temp_dir, build_setup_args)
    staged_path = FileSystems.join(staging_location, WORKFLOW_TARBALL_FILE)
    self.stage_artifact(tarball_file, staged_path)
    resources.append(WORKFLOW_TARBALL_FILE)

  # Handle extra local packages that should be staged.
  if setup_options.extra_packages is not None:
    resources.extend(
        self._stage_extra_packages(
            setup_options.extra_packages, staging_location,
            temp_dir=temp_dir))

  # Pickle the main session if requested.
  # We will create the pickled main session locally and then copy it to the
  # staging location because the staging location is a remote path and the
  # file cannot be created directly there.
  if setup_options.save_main_session:
    pickled_session_file = os.path.join(temp_dir,
                                        names.PICKLED_MAIN_SESSION_FILE)
    pickler.dump_session(pickled_session_file)
    staged_path = FileSystems.join(staging_location,
                                   names.PICKLED_MAIN_SESSION_FILE)
    self.stage_artifact(pickled_session_file, staged_path)
    resources.append(names.PICKLED_MAIN_SESSION_FILE)

  if hasattr(setup_options, 'sdk_location'):
    if (setup_options.sdk_location == 'default') or Stager._is_remote_path(
        setup_options.sdk_location):
      # If --sdk_location is not specified then the appropriate package
      # will be obtained from PyPI (https://pypi.python.org) based on the
      # version of the currently running SDK. If the option is
      # present then no version matching is made and the exact URL or path
      # is expected.
      #
      # Unit tests running in the 'python setup.py test' context will
      # not have the sdk_location attribute present and therefore we
      # will not stage SDK.
      sdk_remote_location = 'pypi' if (
          setup_options.sdk_location == 'default'
      ) else setup_options.sdk_location
      resources.extend(
          self._stage_beam_sdk(sdk_remote_location, staging_location,
                               temp_dir))
    else:
      # This branch is also used by internal tests running with the SDK
      # built at head.
      if os.path.isdir(setup_options.sdk_location):
        # TODO(angoenka): remove reference to Dataflow
        sdk_path = os.path.join(setup_options.sdk_location,
                                names.DATAFLOW_SDK_TARBALL_FILE)
      else:
        sdk_path = setup_options.sdk_location
      if os.path.isfile(sdk_path):
        logging.info('Copying Beam SDK "%s" to staging location.', sdk_path)
        staged_path = FileSystems.join(
            staging_location,
            Stager._desired_sdk_filename_in_staging_location(
                setup_options.sdk_location))
        self.stage_artifact(sdk_path, staged_path)
        _, sdk_staged_filename = FileSystems.split(staged_path)
        resources.append(sdk_staged_filename)
      else:
        if setup_options.sdk_location == 'default':
          raise RuntimeError(
              'Cannot find default Beam SDK tar file "%s"' % sdk_path)
        elif not setup_options.sdk_location:
          logging.info('Beam SDK will not be staged since --sdk_location '
                       'is empty.')
        else:
          raise RuntimeError(
              'The file "%s" cannot be found. Its location was specified by '
              'the --sdk_location command-line option.' % sdk_path)

  # Delete all temp files created while staging job resources.
  shutil.rmtree(temp_dir)
  self.commit_manifest()
  return resources