Example 1
# Imports assumed for this snippet (the original is a module-level TFX helper):
from typing import Text

import apache_beam as beam
from apache_beam.io.gcp import bigquery as beam_bigquery
from tfx.utils import telemetry_utils


def _ReadFromBigQueryImpl(  # pylint: disable=invalid-name
    pipeline: beam.Pipeline,
    query: Text,
    use_bigquery_source: bool = False) -> beam.pvalue.PCollection:
  """Read from BigQuery.

  Args:
    pipeline: Beam pipeline.
    query: A BigQuery SQL string.
    use_bigquery_source: Whether to use BigQuerySource instead of the
      experimental `ReadFromBigQuery` PTransform.

  Returns:
    PCollection of dict.
  """
  # TODO(b/155441037): Consolidate to ReadFromBigQuery once its performance
  # on dataflow runner is on par with BigQuerySource.
  if use_bigquery_source:
    return (
        pipeline
        | 'ReadFromBigQuerySource' >> beam.io.Read(
            beam.io.BigQuerySource(query=query, use_standard_sql=True)))

  return (pipeline
          | 'ReadFromBigQuery' >> beam_bigquery.ReadFromBigQuery(
              query=query,
              use_standard_sql=True,
              bigquery_job_labels=telemetry_utils.get_labels_dict()))
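A minimal, hypothetical usage sketch of the helper above; the query string and the print-based sink are placeholders, not part of the TFX source:

import apache_beam as beam

# Hypothetical invocation of _ReadFromBigQueryImpl inside a pipeline.
with beam.Pipeline() as p:
  rows = _ReadFromBigQueryImpl(
      p, query='SELECT name, value FROM `project.dataset.table`')
  # Each element of `rows` is a dict keyed by column name.
  _ = rows | 'CountRows' >> beam.combiners.Count.Globally() | beam.Map(print)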
Example 2
# Imports assumed for this snippet:
from typing import Text

import apache_beam as beam
from apache_beam.io.gcp import bigquery
from tfx.utils import telemetry_utils


def ReadFromBigQuery(
    pipeline: beam.Pipeline, query: Text) -> beam.pvalue.PCollection:
  """Read data from BigQuery.

  Args:
    pipeline: Beam pipeline.
    query: A BigQuery SQL string.

  Returns:
    PCollection of dict.
  """
  return (pipeline
          | 'ReadFromBigQuery' >> bigquery.ReadFromBigQuery(
              query=query,
              use_standard_sql=True,
              bigquery_job_labels=telemetry_utils.get_labels_dict()))
Example 3
# Imports assumed for this snippet (same TFX module style as Example 1):
from typing import Text

import apache_beam as beam
from apache_beam.io.gcp import bigquery as beam_bigquery
from tfx.utils import telemetry_utils


def _ReadFromBigQueryImpl(  # pylint: disable=invalid-name
    pipeline: beam.Pipeline, query: Text) -> beam.pvalue.PCollection:
  """Read from BigQuery.

  Args:
    pipeline: Beam pipeline.
    query: A BigQuery SQL string.

  Returns:
    PCollection of dict.
  """
  return (pipeline
          | 'ReadFromBigQuery' >> beam_bigquery.ReadFromBigQuery(
              query=query,
              use_standard_sql=True,
              bigquery_job_labels=telemetry_utils.get_labels_dict()))
Example 4
# Imports assumed for this snippet:
from typing import Optional, Text

import apache_beam as beam
from apache_beam.io.gcp import bigquery as beam_bigquery


def ReadFromBigQuery(pipeline: beam.Pipeline,
                     query_project: Text,
                     query_dataset: Text,
                     query_table: Text,
                     gcs_location: Text,
                     dest_project: Text,
                     query_limit: Optional[int] = None
                    ) -> beam.pvalue.PCollection:
    """The Beam PTransform used to read data from a specific BQ table.

    Args:
        pipeline: Input beam.Pipeline object coming from a TFX Executor.
        query_project: Google Cloud project where the target table is
            located.
        query_dataset: BigQuery dataset where the target table is located.
        query_table: Name of the target BigQuery table.
        gcs_location: Google Cloud Storage path where the extracted table
            contents are staged.
        dest_project: Google Cloud project under which the BigQuery read
            job runs; forwarded as the `project` argument below.
        query_limit: Optional maximum number of rows to read from the
            specified BQ table.

    Returns:
        A beam.PCollection of data points. Each row in the BigQuery table
        represents a single data point.
    """
    query = f'SELECT * FROM `{query_project}.{query_dataset}.{query_table}`'

    if query_limit is not None:
        query += f'\nLIMIT {query_limit}'

    return (pipeline
            | 'ReadFromBigQuery' >> beam_bigquery.ReadFromBigQuery(
                project=dest_project,
                gcs_location=gcs_location,
                query=query,
                use_standard_sql=True))
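For reference, a quick sketch of the SQL string this helper builds; the project, dataset, and table names below are placeholders:

# Placeholder values, to show the generated query with a limit applied.
query_project, query_dataset, query_table = 'my-project', 'my_dataset', 'events'
query_limit = 1000
query = f'SELECT * FROM `{query_project}.{query_dataset}.{query_table}`'
if query_limit is not None:
    query += f'\nLIMIT {query_limit}'
print(query)
# SELECT * FROM `my-project.my_dataset.events`
# LIMIT 1000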
Example 5
# Fragment of a Klio transform; `beam_bq` is assumed to alias apache_beam.io.gcp.bigquery.
def __init__(self, *args, klio_message_columns=None, **kwargs):
    # Delegate the actual read to Beam's ReadFromBigQuery transform.
    self.__reader = beam_bq.ReadFromBigQuery(*args, **kwargs)
    # Map each row dict into a Klio message, keeping only the given columns.
    self.__mapper = _KlioReadFromBigQueryMapper(klio_message_columns)
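To show how the reader and mapper would typically compose, here is an illustrative sketch of a wrapping PTransform; the class name and the column-selection logic are assumptions for illustration, not Klio's actual implementation:

import apache_beam as beam
from apache_beam.io.gcp import bigquery as beam_bq


class _SketchReadFromBigQuery(beam.PTransform):
    """Illustrative only: reads rows, then trims them to selected columns."""

    def __init__(self, *args, klio_message_columns=None, **kwargs):
        super().__init__()
        self._reader = beam_bq.ReadFromBigQuery(*args, **kwargs)
        self._columns = klio_message_columns

    def expand(self, pcoll):
        rows = pcoll | 'ReadRows' >> self._reader
        if self._columns:
            # Keep only the requested columns from each row dict.
            rows = rows | 'SelectColumns' >> beam.Map(
                lambda row: {k: row[k] for k in self._columns})
        return rows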