def __init__(self,
               file_pattern,
               min_bundle_size=0,
               compression_type=CompressionTypes.AUTO,
               splittable=True,
               validate=True):
    """Initializes :class:`FileBasedSource`.

    Args:
      file_pattern (str): the file glob to read, a string or a
        :class:`~apache_beam.options.value_provider.ValueProvider`
        (placeholder to inject a runtime value).
      min_bundle_size (int): minimum size of bundles that should be generated
        when performing initial splitting on this source.
      compression_type (str): Used to handle compressed input files.
        Typical value is :attr:`CompressionTypes.AUTO
        <apache_beam.io.filesystem.CompressionTypes.AUTO>`,
        in which case the final file path's extension will be used to detect
        the compression.
      splittable (bool): whether :class:`FileBasedSource` should try to
        logically split a single file into data ranges so that different parts
        of the same file can be read in parallel. If set to :data:`False`,
        :class:`FileBasedSource` will prevent both initial and dynamic splitting
        of sources for single files. File patterns that represent multiple files
        may still get split into sources for individual files. Even if set to
        :data:`True` by the user, :class:`FileBasedSource` may choose to not
        split the file, for example, for compressed files where currently it is
        not possible to efficiently read a data range without decompressing the
        whole file.
      validate (bool): flag to verify that the files exist during pipeline
        creation time.

    Raises:
      ~exceptions.TypeError: when **compression_type** is not valid or if
        **file_pattern** is not a :class:`str` or a
        :class:`~apache_beam.options.value_provider.ValueProvider`.
      ~exceptions.ValueError: when a compressed file is specified together
        with **splittable** set to :data:`True`.
      ~exceptions.IOError: when the file pattern specified yields an empty
        result.
    """

    if not isinstance(file_pattern, (basestring, ValueProvider)):
      raise TypeError('%s: file_pattern must be of type string'
                      ' or ValueProvider; got %r instead'
                      % (self.__class__.__name__, file_pattern))

    if isinstance(file_pattern, basestring):
      file_pattern = StaticValueProvider(str, file_pattern)
    self._pattern = file_pattern

    self._concat_source = None
    self._min_bundle_size = min_bundle_size
    if not CompressionTypes.is_valid_compression_type(compression_type):
      raise TypeError('compression_type must be CompressionType object but '
                      'was %s' % type(compression_type))
    self._compression_type = compression_type
    self._splittable = splittable
    if validate and file_pattern.is_accessible():
      self._validate()
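The constructor above accepts either a literal glob string or a deferred value; a minimal sketch of the two forms (the bucket path is illustrative):

from apache_beam.options.value_provider import StaticValueProvider

# A plain string: __init__ above wraps it into a StaticValueProvider itself.
pattern = 'gs://my-bucket/input/*.txt'  # hypothetical path

# Explicit wrapping, equivalent to what __init__ does. A StaticValueProvider
# is always accessible, so .get() already works at construction time.
pattern_vp = StaticValueProvider(str, pattern)
assert pattern_vp.is_accessible()
assert pattern_vp.get() == pattern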
Example #2
 def __init__(self, load_time, group_by):
     if isinstance(load_time, string_types):
         load_time = StaticValueProvider(str, load_time)
     if isinstance(group_by, string_types):
         group_by = StaticValueProvider(str, group_by)
     self.load_time = load_time
     self.group_by = group_by
Example #3
    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_value_provider_argument(
            '--group_by',
            default=StaticValueProvider(str, 'ASSET_TYPE'),
            choices=['ASSET_TYPE', 'ASSET_TYPE_VERSION'],
            help='How to group exported resources into Bigquery tables.')

        parser.add_value_provider_argument(
            '--write_disposition',
            default=StaticValueProvider(str, 'WRITE_APPEND'),
            choices=['WRITE_APPEND', 'WRITE_EMPTY'],
            help='To append to or overwrite BigQuery tables.')

        parser.add_value_provider_argument(
            '--input', help='A glob of all input asset json files to process.')

        parser.add_value_provider_argument(
            '--stage',
            help='GCS location to write intermediary BigQuery load files.')

        parser.add_value_provider_argument(
            '--load_time',
            default=StaticValueProvider(str,
                                        datetime.now().isoformat()),
            help='Load time of the data (YYYY-MM-DD[HH:MM:SS]).')

        parser.add_value_provider_argument('--dataset',
                                           help='BigQuery dataset to load to.')
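The value-provider arguments registered above surface as attributes on the options object; a short sketch, assuming a hypothetical options class that registers the same --group_by flag:

from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.value_provider import StaticValueProvider

class AssetExportOptions(PipelineOptions):  # hypothetical class name
    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_value_provider_argument(
            '--group_by',
            default=StaticValueProvider(str, 'ASSET_TYPE'),
            choices=['ASSET_TYPE', 'ASSET_TYPE_VERSION'],
            help='How to group exported resources into BigQuery tables.')

options = AssetExportOptions(['--group_by', 'ASSET_TYPE_VERSION'])
# A value supplied on the command line arrives as a StaticValueProvider;
# an omitted value-provider argument would arrive as a RuntimeValueProvider.
assert options.group_by.get() == 'ASSET_TYPE_VERSION'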
Example #5
    def __init__(self,
                 aggregator_dict=None,
                 user_project_id="",
                 user_job_id="",
                 tags=tag_constants.SERVING,
                 signature_name=(
                     signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY),
                 skip_preprocessing=False,
                 target="",
                 config=None):
        """Constructor of Prediction beam.DoFn class.

    Args:
      aggregator_dict: A dict of aggregators containing maps from counter name
                       to the aggregator.
      user_project_id: A string. The project to which the logs will be sent.
      user_job_id:     A string. The job to which the logs will be sent.
      tags: A comma-separated string that contains a list of tags for the
            serving graph.
      signature_name: A string to map into the signature map to get the serving
                     signature.
      skip_preprocessing: bool whether to skip preprocessing even when
                          the metadata.yaml/metadata.json file exists.
      target: The execution engine to connect to. See target in tf.Session(). In
              most cases, users should not set the target.
      config: A ConfigProto proto with configuration options. See config in
              tf.Session()

    Side Inputs:
      model_dir: The directory containing the model to load and the
                 checkpoint files to restore the session.
    """
        self._target = target

        # TODO(user): Remove the "if" section when the direct use of
        # PredictionDoFn() is retired from ml_transform.
        if isinstance(user_project_id, basestring):
            user_project_id = StaticValueProvider(str, user_project_id)
        if isinstance(user_job_id, basestring):
            user_job_id = StaticValueProvider(str, user_job_id)
        if isinstance(tags, basestring):
            tags = StaticValueProvider(str, tags)
        if isinstance(signature_name, basestring):
            signature_name = StaticValueProvider(str, signature_name)

        self._user_project_id = user_project_id
        self._user_job_id = user_job_id
        self._tags = tags
        self._signature_name = signature_name
        self._skip_preprocessing = skip_preprocessing
        self._config = config
        self._aggregator_dict = aggregator_dict
        self._model_state = None
        self._cloud_logger = None
        self._tag_list = []

        # Metrics.
        self._model_load_seconds_distribution = beam.metrics.Metrics.distribution(
            _METRICS_NAMESPACE, "model_load_seconds")
Example #6
def uploader(mocker):
    credential_id = StaticValueProvider(str, 'id')
    secret = StaticValueProvider(str, 'secret')
    access = StaticValueProvider(str, 'access')
    refresh = StaticValueProvider(str, 'refresh')
    credentials = OAuthCredentials(credential_id, secret, access, refresh)

    return CampaignManagerConversionUploaderDoFn(credentials)
Example #7
    def __init__(self, group_by, num_shards):
        if isinstance(group_by, string_types):
            group_by = StaticValueProvider(str, group_by)
        if isinstance(num_shards, str):
            num_shards = StaticValueProvider(str, num_shards)

        self.num_shards = num_shards
        self.group_by = group_by
        self.shard_map = None
Example #8
def uploader(mocker):
    mocker.patch('googleads.oauth2.GoogleRefreshTokenClient')
    mocker.patch('googleads.adwords.AdWordsClient')
    client_id = StaticValueProvider(str, 'id')
    secret = StaticValueProvider(str, 'secret')
    access = StaticValueProvider(str, 'access')
    refresh = StaticValueProvider(str, 'refresh')
    credentials = OAuthCredentials(client_id, secret, access, refresh)
    return GoogleAnalyticsDataImportUploaderDoFn(credentials)
Example #9
    def __init__(self, stage_dir, load_time):
        if isinstance(stage_dir, string_types):
            stage_dir = StaticValueProvider(str, stage_dir)
        if isinstance(load_time, string_types):
            load_time = StaticValueProvider(str, load_time)

        self.stage_dir = stage_dir
        self.load_time = load_time
        self.open_files = {}
Example #10
    def __init__(self,
                 file_pattern,
                 min_bundle_size=0,
                 compression_type=CompressionTypes.AUTO,
                 splittable=True,
                 validate=True):
        """Initializes ``FileBasedSource``.

    Args:
      file_pattern: the file glob to read, a string or a ValueProvider
                    (placeholder to inject a runtime value).
      min_bundle_size: minimum size of bundles that should be generated when
                       performing initial splitting on this source.
      compression_type: compression type to use
      splittable: whether FileBasedSource should try to logically split a single
                  file into data ranges so that different parts of the same file
                  can be read in parallel. If set to False, FileBasedSource will
                  prevent both initial and dynamic splitting of sources for
                  single files. File patterns that represent multiple files may
                  still get split into sources for individual files. Even if set
                  to True by the user, FileBasedSource may choose to not split
                  the file, for example, for compressed files where currently
                  it is not possible to efficiently read a data range without
                  decompressing the whole file.
      validate: Boolean flag to verify that the files exist during the pipeline
                creation time.
    Raises:
      TypeError: when compression_type is not valid or if file_pattern is not a
                 string or a ValueProvider.
      ValueError: when compression and splittable files are specified.
      IOError: when the file pattern specified yields an empty result.
    """

        if not isinstance(file_pattern, (basestring, ValueProvider)):
            raise TypeError('%s: file_pattern must be of type string'
                            ' or ValueProvider; got %r instead' %
                            (self.__class__.__name__, file_pattern))

        if isinstance(file_pattern, basestring):
            file_pattern = StaticValueProvider(str, file_pattern)
        self._pattern = file_pattern

        self._concat_source = None
        self._min_bundle_size = min_bundle_size
        if not CompressionTypes.is_valid_compression_type(compression_type):
            raise TypeError(
                'compression_type must be CompressionType object but '
                'was %s' % type(compression_type))
        self._compression_type = compression_type
        if compression_type in (CompressionTypes.UNCOMPRESSED,
                                CompressionTypes.AUTO):
            self._splittable = splittable
        else:
            # We can't split compressed files efficiently so turn off splitting.
            self._splittable = False
        if validate and file_pattern.is_accessible():
            self._validate()
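Note that this variant clears splittable whenever a real compression type is given; a sketch of the effect, assuming this constructor belongs to a concrete subclass named MyFileBasedSource (hypothetical):

from apache_beam.io.filesystem import CompressionTypes

src = MyFileBasedSource(
    'gs://my-bucket/logs/*.gz',              # illustrative glob
    compression_type=CompressionTypes.GZIP,
    splittable=True,                         # request is overridden below
    validate=False)                          # skip the existence check here
# A range of a gzip stream cannot be read without decompressing the whole
# file, so the constructor leaves the source unsplittable.
assert src._splittable is False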
Example #11
def eraser(mocker):
    mocker.patch('googleads.oauth2.GoogleRefreshTokenClient')
    mocker.patch('googleads.adwords.AdWordsClient')
    client_id = StaticValueProvider(str, "id")
    secret = StaticValueProvider(str, "secret")
    access = StaticValueProvider(str, "access")
    refresh = StaticValueProvider(str, "refresh")
    credentials = OAuthCredentials(client_id, secret, access, refresh)
    return GoogleAnalyticsDataImportEraser(credentials)
Example #12
def uploader(mocker):
    mocker.patch('googleads.oauth2.GoogleRefreshTokenClient')
    mocker.patch('googleads.adwords.AdWordsClient')
    credential_id = StaticValueProvider(str, 'id')
    secret = StaticValueProvider(str, 'secret')
    access = StaticValueProvider(str, 'access')
    refresh = StaticValueProvider(str, 'refresh')
    credentials = OAuthCredentials(credential_id, secret, access, refresh)
    return GoogleAdsOfflineUploaderDoFn(credentials,
                                        StaticValueProvider(str, 'devtoken'))
Example #13
 def __init__(self, dataset, write_disposition):
     if isinstance(dataset, string_types):
         dataset = StaticValueProvider(str, dataset)
     if isinstance(write_disposition, string_types):
         write_disposition = StaticValueProvider(str, write_disposition)
     self.write_disposition = write_disposition
     self.dataset = dataset
     self.bigquery_client = None
     self.dataset_location = None
     self.load_jobs = {}
Example #14
def test_init():
    id = StaticValueProvider(str, "id")
    secret = StaticValueProvider(str, "secret")
    access = StaticValueProvider(str, "access")
    refresh = StaticValueProvider(str, "refresh")
    credentials = OAuthCredentials(id, secret, access, refresh)
    assert credentials.get_client_id() == "id"
    assert credentials.get_client_secret() == "secret"
    assert credentials.get_access_token() == "access"
    assert credentials.get_refresh_token() == "refresh"
Example #15
    def expand(self, pcoll):
        p = pcoll.pipeline

        if not self._temp_directory:
            temp_location = (
                p.options.view_as(GoogleCloudOptions).temp_location
                or self.path.get())
            dir_uid = str(uuid.uuid4())
            self._temp_directory = StaticValueProvider(
                str,
                filesystems.FileSystems.join(temp_location,
                                             '.temp%s' % dir_uid))
            _LOGGER.info('Added temporary directory %s',
                         self._temp_directory.get())

        output = (pcoll
                  | beam.ParDo(
                      _WriteUnshardedRecordsFn(
                          base_path=self._temp_directory,
                          destination_fn=self.destination_fn,
                          sink_fn=self.sink_fn,
                          max_writers_per_bundle=self.
                          _max_num_writers_per_bundle)).with_outputs(
                              _WriteUnshardedRecordsFn.SPILLED_RECORDS,
                              _WriteUnshardedRecordsFn.WRITTEN_FILES))

        written_files_pc = output[_WriteUnshardedRecordsFn.WRITTEN_FILES]
        spilled_records_pc = output[_WriteUnshardedRecordsFn.SPILLED_RECORDS]

        more_written_files_pc = (
            spilled_records_pc
            | beam.ParDo(
                _AppendShardedDestination(self.destination_fn, self.shards))
            | "GroupRecordsByDestinationAndShard" >> beam.GroupByKey()
            | beam.ParDo(
                _WriteShardedRecordsFn(self._temp_directory, self.sink_fn,
                                       self.shards)))

        files_by_destination_pc = (
            (written_files_pc, more_written_files_pc)
            | beam.Flatten()
            | beam.Map(lambda file_result:
                       (file_result.destination, file_result))
            | "GroupTempFilesByDestination" >> beam.GroupByKey())

        # Now take the temporary files and write them to the final
        # destination with their proper names.

        file_results = (files_by_destination_pc
                        | beam.ParDo(
                            _MoveTempFilesIntoFinalDestinationFn(
                                self.path, self.file_naming_fn,
                                self._temp_directory)))

        return file_results
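The per-pipeline temporary directory built at the top of expand() can be reproduced in isolation; a small sketch, assuming a writable base path:

import uuid

from apache_beam.io import filesystems
from apache_beam.options.value_provider import StaticValueProvider

temp_location = '/tmp/beam-staging'  # illustrative base path
temp_directory = StaticValueProvider(
    str,
    filesystems.FileSystems.join(temp_location, '.temp%s' % uuid.uuid4()))
# Wrapping the joined path keeps the downstream DoFns agnostic about whether
# the path was known at construction time or only at runtime.
print(temp_directory.get())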
Example #16
def test_not_active(mocker, caplog):
    credential_id = StaticValueProvider(str, 'id')
    secret = StaticValueProvider(str, 'secret')
    access = StaticValueProvider(str, 'access')
    refresh = StaticValueProvider(str, 'refresh')
    credentials = OAuthCredentials(credential_id, secret, access, refresh)
    uploader_dofn = GoogleAdsOfflineUploaderDoFn(credentials, None)
    mocker.patch.object(uploader_dofn, '_get_oc_service')
    uploader_dofn.process(Batch(None, []))
    uploader_dofn._get_oc_service.assert_not_called()
    assert 'Skipping upload, parameters not configured.' in caplog.text
Example #17
 def test_static_value_provider_empty_write(self):
   temp_path = StaticValueProvider(value_type=str,
                                   value=tempfile.NamedTemporaryFile().name)
   sink = MyFileBasedSink(
       temp_path,
       file_name_suffix=StaticValueProvider(value_type=str, value='.output'),
       coder=coders.ToStringCoder()
   )
   with TestPipeline() as p:
     p | beam.Create([]) | beam.io.Write(sink)  # pylint: disable=expression-not-assigned
   self.assertEqual(
       open(temp_path.get() + '-00000-of-00001.output').read(), '[start][end]')
Example #19
  def __init__(
      self,
      file_path_prefix,
      coder,
      file_name_suffix='',
      num_shards=0,
      shard_name_template=None,
      mime_type='application/octet-stream',
      compression_type=CompressionTypes.AUTO,
      skip_if_empty=False):
    """
     Raises:
      TypeError: if file path parameters are not a :class:`str` or
        :class:`~apache_beam.options.value_provider.ValueProvider`, or if
        **compression_type** is not member of
        :class:`~apache_beam.io.filesystem.CompressionTypes`.
      ValueError: if **shard_name_template** is not of expected
        format.
    """
    if not isinstance(file_path_prefix, (str, ValueProvider)):
      raise TypeError(
          'file_path_prefix must be a string or ValueProvider;'
          'got %r instead' % file_path_prefix)
    if not isinstance(file_name_suffix, (str, ValueProvider)):
      raise TypeError(
          'file_name_suffix must be a string or ValueProvider;'
          'got %r instead' % file_name_suffix)

    if not CompressionTypes.is_valid_compression_type(compression_type):
      raise TypeError(
          'compression_type must be CompressionType object but '
          'was %s' % type(compression_type))
    if shard_name_template is None:
      shard_name_template = DEFAULT_SHARD_NAME_TEMPLATE
    elif shard_name_template == '':
      num_shards = 1
    if isinstance(file_path_prefix, str):
      file_path_prefix = StaticValueProvider(str, file_path_prefix)
    if isinstance(file_name_suffix, str):
      file_name_suffix = StaticValueProvider(str, file_name_suffix)
    self.file_path_prefix = file_path_prefix
    self.file_name_suffix = file_name_suffix
    self.num_shards = num_shards
    self.coder = coder
    self.shard_name_format = self._template_to_format(shard_name_template)
    self.shard_name_glob_format = self._template_to_glob_format(
        shard_name_template)
    self.compression_type = compression_type
    self.mime_type = mime_type
    self.skip_if_empty = skip_if_empty
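As the empty-write test above shows, the default shard name template expands to names like <prefix>-00000-of-00001<suffix>; a minimal construction sketch reusing the MyFileBasedSink test subclass from that test (its definition is not shown here) with value-provider path parameters:

from apache_beam import coders
from apache_beam.options.value_provider import StaticValueProvider

sink = MyFileBasedSink(
    StaticValueProvider(str, '/tmp/out'),    # file_path_prefix
    coder=coders.ToStringCoder(),
    file_name_suffix=StaticValueProvider(str, '.txt'))
# With a single shard and the default template, the sink writes
# /tmp/out-00000-of-00001.txt.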
Example #20
    def test_value_provider_options(self):
        class UserOptions(PipelineOptions):
            @classmethod
            def _add_argparse_args(cls, parser):
                parser.add_value_provider_argument(
                    '--pot_vp_arg1', help='This flag is a value provider')

                parser.add_value_provider_argument('--pot_vp_arg2',
                                                   default=1,
                                                   type=int)

                parser.add_argument('--pot_non_vp_arg1', default=1, type=int)

        # Provide values: if not provided, the option becomes a RuntimeValueProvider
        options = UserOptions(['--pot_vp_arg1', 'hello'])
        self.assertIsInstance(options.pot_vp_arg1, StaticValueProvider)
        self.assertIsInstance(options.pot_vp_arg2, RuntimeValueProvider)
        self.assertIsInstance(options.pot_non_vp_arg1, int)

        # Values can be overwritten
        options = UserOptions(pot_vp_arg1=5,
                              pot_vp_arg2=StaticValueProvider(value_type=str,
                                                              value='bye'),
                              pot_non_vp_arg1=RuntimeValueProvider(
                                  option_name='foo',
                                  value_type=int,
                                  default_value=10))
        self.assertEqual(options.pot_vp_arg1, 5)
        self.assertTrue(options.pot_vp_arg2.is_accessible(),
                        '%s is not accessible' % options.pot_vp_arg2)
        self.assertEqual(options.pot_vp_arg2.get(), 'bye')
        self.assertFalse(options.pot_non_vp_arg1.is_accessible())

        with self.assertRaises(RuntimeError):
            options.pot_non_vp_arg1.get()
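In a real pipeline such options are usually handed to a transform and resolved only inside process(), which is what lets templated pipelines defer the value; a hedged sketch of that pattern (the DoFn name and prefix option are illustrative):

import apache_beam as beam

class PrefixElementsFn(beam.DoFn):  # hypothetical DoFn
    def __init__(self, prefix_vp):
        # Store the ValueProvider itself; it may not be accessible yet.
        self._prefix_vp = prefix_vp

    def process(self, element):
        # .get() is only safe at execution time for RuntimeValueProviders.
        yield self._prefix_vp.get() + element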
Example #21
 def test_iobase_source(self):
   query = StaticValueProvider(str, self.query)
   with beam.Pipeline(argv=self.args) as p:
     result = (
         p | 'read with value provider query' >> beam.io.ReadFromBigQuery(
             query=query, use_standard_sql=True, project=self.project))
     assert_that(result, equal_to(self.TABLE_DATA))
Example #22
 def __init__(self, dataset):
     if isinstance(dataset, string_types):
         dataset = StaticValueProvider(str, dataset)
     self.dataset = dataset
     self.bigquery_client = None
     self.dataset_location = None
     self.load_jobs = {}
Example #23
 def __init__(self, dataset, write_disposition):
     # Can't use super().
     # https://issues.apache.org/jira/browse/BEAM-6158?focusedCommentId=16919945
     # super(DeleteDataSetTables, self).__init__(dataset)
     BigQueryDoFn.__init__(self, dataset)
     if isinstance(write_disposition, string_types):
         write_disposition = StaticValueProvider(str, write_disposition)
     self.write_disposition = write_disposition
Example #24
 def __init__(self, dataset, load_time):
     # Can't use super().
     # https://issues.apache.org/jira/browse/BEAM-6158?focusedCommentId=16919945
     # super(LoadToBigQuery, self).__init__(dataset)
     BigQueryDoFn.__init__(self, dataset)
     if isinstance(load_time, string_types):
         load_time = StaticValueProvider(str, load_time)
     self.load_time = load_time
Example #25
 def __init__(self, **kwargs):
     self._conn = None
     self._max_id = None
     self._current_id = 0
     for k, v in kwargs.items():
         if isinstance(v, (str, unicode)):
             v = StaticValueProvider(str, v)
         setattr(self, '_' + k, v)
Example #26
  def __init__(self,
               aggregator_dict=None,
               user_project_id="",
               user_job_id="",
               skip_preprocessing=False,
               target="",
               config=None):
    """Constructor of Prediction beam.DoFn class.

    Args:
      aggregator_dict: A dict of aggregators containing maps from counter name
                       to the aggregator.
      user_project_id: A string. The project to which the logs will be sent.
      user_job_id:     A string. The job to which the logs will be sent.
      skip_preprocessing: bool whether to skip preprocessing even when
                          the metadata.yaml/metadata.json file exists.
      target: The execution engine to connect to. See target in tf.Session(). In
              most cases, users should not set the target.
      config: A ConfigProto proto with configuration options. See config in
              tf.Session()

    Side Inputs:
      model_dir: The directory containing the model to load and the
                 checkpoint files to restore the session.
    """
    self._target = target

    # TODO(user): Remove the "if" section when the direct use of
    # PredictionDoFn() is retired from ml_transform.
    if isinstance(user_project_id, basestring):
      user_project_id = StaticValueProvider(str, user_project_id)
    if isinstance(user_job_id, basestring):
      user_job_id = StaticValueProvider(str, user_job_id)

    self._user_project_id = user_project_id
    self._user_job_id = user_job_id
    self._skip_preprocessing = skip_preprocessing
    self._config = config
    self._aggregator_dict = aggregator_dict
    self._model_state = None
    self._cloud_logger = None

    # Metrics.
    self._model_load_seconds_distribution = beam.metrics.Metrics.distribution(
        self.__class__, "model_load_seconds")
Example #27
  def testValueProviderNamespace(self):
    self.vp_namespace = StaticValueProvider(str, 'vp_namespace')
    self.expected_namespace = 'vp_namespace'

    q = Query(kind='kind', project=self._PROJECT, namespace=self.vp_namespace)
    cq = q._to_client_query(self._test_client)
    self.assertEqual(self.expected_namespace, cq.namespace)

    _LOGGER.info('query: %s', q)  # Test __repr__()
Example #28
    def __init__(self,
                 file_path_prefix,
                 coder,
                 file_name_suffix='',
                 num_shards=0,
                 shard_name_template=None,
                 mime_type='application/octet-stream',
                 compression_type=CompressionTypes.AUTO):
        """
     Raises:
      TypeError: if file path parameters are not a string or ValueProvider,
                 or if compression_type is not member of CompressionTypes.
      ValueError: if shard_name_template is not of expected format.
    """
        if not isinstance(file_path_prefix, (basestring, ValueProvider)):
            raise TypeError(
                'file_path_prefix must be a string or ValueProvider;'
                'got %r instead' % file_path_prefix)
        if not isinstance(file_name_suffix, (basestring, ValueProvider)):
            raise TypeError(
                'file_name_suffix must be a string or ValueProvider;'
                'got %r instead' % file_name_suffix)

        if not CompressionTypes.is_valid_compression_type(compression_type):
            raise TypeError(
                'compression_type must be CompressionType object but '
                'was %s' % type(compression_type))
        if shard_name_template is None:
            shard_name_template = DEFAULT_SHARD_NAME_TEMPLATE
        elif shard_name_template == '':
            num_shards = 1
        if isinstance(file_path_prefix, basestring):
            file_path_prefix = StaticValueProvider(str, file_path_prefix)
        if isinstance(file_name_suffix, basestring):
            file_name_suffix = StaticValueProvider(str, file_name_suffix)
        self.file_path_prefix = file_path_prefix
        self.file_name_suffix = file_name_suffix
        self.num_shards = num_shards
        self.coder = coder
        self.shard_name_format = self._template_to_format(shard_name_template)
        self.compression_type = compression_type
        self.mime_type = mime_type
Example #29
    def __init__(
            self,
            # gcs_location=None,
            get_destination_uri=None,
            table=None,
            dataset=None,
            project=None,
            query=None,
            validate=False,
            coder=None,
            use_standard_sql=False,
            flatten_results=True,
            kms_key=None):
        if table is not None and query is not None:
            raise ValueError(
                'Both a BigQuery table and a query were specified.'
                ' Please specify only one of these.')
        elif table is None and query is None:
            raise ValueError('A BigQuery table or a query must be specified')
        elif table is not None:
            self.table_reference = bigquery_tools.parse_table_reference(
                table, dataset, project)
            self.query = None
            self.use_legacy_sql = True
        else:
            if isinstance(query, (str, unicode)):
                query = StaticValueProvider(str, query)
            self.query = query
            # TODO(BEAM-1082): Change the internal flag to be standard_sql
            self.use_legacy_sql = not use_standard_sql
            self.table_reference = None

        self.get_destination_uri = get_destination_uri
        # self.gcs_location = gcs_location
        if isinstance(project, (str, unicode)):
            project = StaticValueProvider(str, project)
        self.project = project
        self.validate = validate
        self.flatten_results = flatten_results
        self.coder = coder or _JsonToDictCoder
        self.kms_key = kms_key
        self.split_result = None
Example #30
    def __init__(self, file_patterns, **kwargs):
        # Handle the templated values.
        if not isinstance(file_patterns, (basestring, ValueProvider)):
            raise TypeError('%s: file_pattern must be of type string'
                            ' or ValueProvider; got %r instead' %
                            (self.__class__.__name__, file_patterns))

        if isinstance(file_patterns, basestring):
            file_patterns = StaticValueProvider(str, file_patterns)
        self._file_patterns = file_patterns
        self._sources = []
        self._kwargs = kwargs
Example #31
 def test_iobase_source_with_query_and_filters(self):
     EXPECTED_TABLE_DATA = [{'string': u'привет'}]
     query = StaticValueProvider(str, self.query)
     with beam.Pipeline(argv=self.args) as p:
         result = (p | 'Direct read with query' >> beam.io.ReadFromBigQuery(
             method=beam.io.ReadFromBigQuery.Method.DIRECT_READ,
             row_restriction='number > 2',
             selected_fields=['string'],
             use_standard_sql=True,
             project=self.project,
             query=query))
         assert_that(result, equal_to(EXPECTED_TABLE_DATA))
Example #32
    def testValueProviderFilters(self):
        self.vp_filters = [
            [(StaticValueProvider(str, 'property_name'),
              StaticValueProvider(str, '='), StaticValueProvider(str,
                                                                 'value'))],
            [(StaticValueProvider(str, 'property_name'),
              StaticValueProvider(str, '='), StaticValueProvider(str,
                                                                 'value')),
             ('property_name', '=', 'value')],
        ]
        self.expected_filters = [
            [('property_name', '=', 'value')],
            [('property_name', '=', 'value'), ('property_name', '=', 'value')],
        ]

        for vp_filter, exp_filter in zip(self.vp_filters,
                                         self.expected_filters):
            q = Query(kind='kind',
                      project=self._PROJECT,
                      namespace=self._NAMESPACE,
                      filters=vp_filter)
            cq = q._to_client_query(self._test_client)
            self.assertEqual(exp_filter, cq.filters)

            logging.info('query: %s', q)  # Test __repr__()