Example #1
    def _load_data(self, partitions_using_temp_tables,
                   partitions_direct_to_destination, load_job_name_pcv,
                   schema_mod_job_name_pcv, copy_job_name_pcv, p, step_name):
        """Load data to BigQuery

    Data is loaded into BigQuery in the following two ways:
      1. Single partition:
         When there is a single partition of files destined to a single
         destination, a single load job is triggered.
      2. Multiple partitions and/or Dynamic Destinations:
         When there are multiple partitions of files destined for a single
         destination or when Dynamic Destinations are used, multiple load jobs
         need to be triggered for each partition/destination. Load Jobs are
         triggered to temporary tables, and those are later copied to the actual
         appropriate destination table. This preserves atomicity when only some
         of the load jobs fail: if any of them fails, the copy jobs are not
         triggered.
    """
        # Load data using temp tables
        trigger_loads_outputs = (
            partitions_using_temp_tables
            | "TriggerLoadJobsWithTempTables" >> beam.ParDo(
                TriggerLoadJobs(
                    schema=self.schema,
                    write_disposition=self.write_disposition,
                    create_disposition=self.create_disposition,
                    test_client=self.test_client,
                    temporary_tables=True,
                    additional_bq_parameters=self.additional_bq_parameters,
                    source_format=self._temp_file_format,
                    step_name=step_name), load_job_name_pcv, *
                self.schema_side_inputs).with_outputs(
                    TriggerLoadJobs.TEMP_TABLES, main='main'))

        temp_tables_load_job_ids_pc = trigger_loads_outputs['main']
        temp_tables_pc = trigger_loads_outputs[TriggerLoadJobs.TEMP_TABLES]

        finished_temp_tables_load_jobs_pc = (
            p
            | "ImpulseMonitorLoadJobs" >> beam.Create([None])
            | "WaitForTempTableLoadJobs" >> beam.ParDo(
                WaitForBQJobs(self.test_client),
                pvalue.AsList(temp_tables_load_job_ids_pc)))

        schema_mod_job_ids_pc = (
            finished_temp_tables_load_jobs_pc
            | beam.ParDo(
                UpdateDestinationSchema(
                    write_disposition=self.write_disposition,
                    test_client=self.test_client,
                    additional_bq_parameters=self.additional_bq_parameters,
                    step_name=step_name), schema_mod_job_name_pcv))

        finished_schema_mod_jobs_pc = (
            p
            | "ImpulseMonitorSchemaModJobs" >> beam.Create([None])
            | "WaitForSchemaModJobs" >> beam.ParDo(
                WaitForBQJobs(self.test_client),
                pvalue.AsList(schema_mod_job_ids_pc)))

        destination_copy_job_ids_pc = (
            finished_temp_tables_load_jobs_pc
            | beam.ParDo(
                TriggerCopyJobs(create_disposition=self.create_disposition,
                                write_disposition=self.write_disposition,
                                test_client=self.test_client,
                                step_name=step_name), copy_job_name_pcv,
                pvalue.AsIter(finished_schema_mod_jobs_pc)))

        finished_copy_jobs_pc = (
            p
            | "ImpulseMonitorCopyJobs" >> beam.Create([None])
            | "WaitForCopyJobs" >> beam.ParDo(
                WaitForBQJobs(self.test_client),
                pvalue.AsList(destination_copy_job_ids_pc)))

        _ = (
            p
            | "RemoveTempTables/Impulse" >> beam.Create([None])
            | "RemoveTempTables/PassTables" >> beam.FlatMap(
                lambda _, unused_copy_jobs, deleting_tables: deleting_tables,
                pvalue.AsIter(finished_copy_jobs_pc),
                pvalue.AsIter(temp_tables_pc))
            |
            "RemoveTempTables/AddUselessValue" >> beam.Map(lambda x: (x, None))
            | "RemoveTempTables/DeduplicateTables" >> beam.GroupByKey()
            | "RemoveTempTables/GetTableNames" >> beam.Keys()
            | "RemoveTempTables/Delete" >> beam.ParDo(
                DeleteTablesFn(self.test_client)))

        # Load data directly to destination table
        destination_load_job_ids_pc = (
            partitions_direct_to_destination
            | "TriggerLoadJobsWithoutTempTables" >> beam.ParDo(
                TriggerLoadJobs(
                    schema=self.schema,
                    write_disposition=self.write_disposition,
                    create_disposition=self.create_disposition,
                    test_client=self.test_client,
                    temporary_tables=False,
                    additional_bq_parameters=self.additional_bq_parameters,
                    source_format=self._temp_file_format,
                    step_name=step_name), load_job_name_pcv, *
                self.schema_side_inputs))

        _ = (p
             | "ImpulseMonitorDestinationLoadJobs" >> beam.Create([None])
             | "WaitForDestinationLoadJobs" >> beam.ParDo(
                 WaitForBQJobs(self.test_client),
                 pvalue.AsList(destination_load_job_ids_pc)))

        destination_load_job_ids_pc = (
            (temp_tables_load_job_ids_pc, destination_load_job_ids_pc)
            | beam.Flatten())

        return destination_load_job_ids_pc, destination_copy_job_ids_pc
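The docstring above describes the two file-load paths: direct loads for a single partition, and temporary tables plus copy jobs for atomicity otherwise. A minimal user-level sketch of how this path is typically reached, assuming the standard WriteToBigQuery transform with file loads; the table and schema names here are placeholders:

import apache_beam as beam

with beam.Pipeline() as pipe:
    _ = (
        pipe
        | beam.Create([{'name': 'a', 'value': 1}, {'name': 'b', 'value': 2}])
        | beam.io.WriteToBigQuery(
            'my-project:my_dataset.my_table',
            schema='name:STRING,value:INTEGER',
            method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

With several partitions per destination or dynamic destinations, the transform takes the temporary-table branch implemented above; otherwise it loads directly into the destination table.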
Example #2
def run(bigquery_dataset, bigquery_table, storage_bucket="coffeecircle"):
    """
    Extracts e-commerce sales data from the provided data.csv file and loads it into BigQuery.

    :param bigquery_dataset: Name of the dataset used to load the data
    :param bigquery_table: Name of the table used to load the data
    :param storage_bucket: Name of the bucket used to store Dataflow logs
    """

    # Dataflow requires the imports to live inside the function or context where
    # the pipeline is defined, even though this goes against Python best practices.
    import csv

    import apache_beam as beam
    from apache_beam.io import ReadFromText
    from apache_beam.options.pipeline_options import PipelineOptions, GoogleCloudOptions, StandardOptions
    from datetime import datetime

    class Split(beam.DoFn):
        def process(self, element):
            reader = csv.reader(element.split('\n'), delimiter=',')
            try:
                for row in reader:
                    return [{
                        'invoice_no':
                        str(row[0]),
                        'stock_code':
                        str(row[1]),
                        'description':
                        str(row[2]),
                        'quantity':
                        int(row[3]),
                        'invoice_date':
                        str(datetime.strptime(row[4], '%m/%d/%Y %H:%M')),
                        'unit_price':
                        float(row[5]),
                        'customer_id':
                        int(row[6]) if row[6] else None,
                        'country':
                        str(row[7])
                    }]
            except Exception as exc:
                print(exc)

    # BigQuery schema definition
    bigquery_schema = ('invoice_no:STRING,stock_code:STRING,description:STRING,'
                       'quantity:INTEGER,invoice_date:TIMESTAMP,unit_price:FLOAT,'
                       'customer_id:INTEGER,country:STRING')

    # Storage bucket for logs
    storage_bucket = "gs://{}/dataflow".format(storage_bucket)

    # Retrieve the project id from GoogleCloudOptions and store it in the global PROJECT
    global PROJECT
    PROJECT = PipelineOptions().view_as(GoogleCloudOptions).project

    # Create and set your PipelineOptions.
    options = PipelineOptions()

    # For Cloud execution, set the Cloud Platform project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = 'coffeecircle'
    google_cloud_options.job_name = "test-" + \
        str(bigquery_table).replace('_', '-')
    google_cloud_options.staging_location = ("%s/staging_location" %
                                             storage_bucket)
    google_cloud_options.temp_location = ("%s/temp" % storage_bucket)
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = beam.Pipeline(options=options)

    # Transformation and loading steps
    rows = (p | ReadFromText('gs://coffeecircle/data.csv', skip_header_lines=1)
            | beam.ParDo(Split()))
    rows | 'Write data into Bigquery' >> beam.io.WriteToBigQuery(
        table=bigquery_table,
        dataset=bigquery_dataset,
        project='coffeecircle',
        schema=bigquery_schema,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)
    result = p.run()
    result.wait_until_finish()
Example #3
def run(apache_beam_pipeline_options: PipelineOptions,
        data_input: str,
        reference_input: str,
        output: str,
        calculation_month_count: int,
        metric_types: List[str],
        state_code: Optional[str],
        calculation_end_month: Optional[str],
        person_filter_ids: Optional[List[int]]):
    """Runs the supervision calculation pipeline."""

    # Workaround to load SQLAlchemy objects at start of pipeline. This is necessary because the BuildRootEntity
    # function tries to access attributes of relationship properties on the SQLAlchemy room_schema_class before they
    # have been loaded. However, if *any* SQLAlchemy objects have been instantiated, then the relationship properties
    # are loaded and their attributes can be successfully accessed.
    _ = schema.StatePerson()

    apache_beam_pipeline_options.view_as(SetupOptions).save_main_session = True

    # Get pipeline job details
    all_pipeline_options = apache_beam_pipeline_options.get_all_options()

    input_dataset = all_pipeline_options['project'] + '.' + data_input
    reference_dataset = all_pipeline_options['project'] + '.' + reference_input

    person_id_filter_set = set(person_filter_ids) if person_filter_ids else None

    with beam.Pipeline(options=apache_beam_pipeline_options) as p:
        # Get StatePersons
        persons = (p | 'Load Persons' >> BuildRootEntity(dataset=input_dataset,
                                                         root_entity_class=entities.StatePerson,
                                                         unifying_id_field=entities.StatePerson.get_class_id_name(),
                                                         build_related_entities=True,
                                                         unifying_id_field_filter_set=person_id_filter_set,
                                                         state_code=state_code))

        # Get StateIncarcerationPeriods
        incarceration_periods = (p | 'Load IncarcerationPeriods' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateIncarcerationPeriod,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code
        ))

        # Get StateSupervisionViolations
        supervision_violations = (p | 'Load SupervisionViolations' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionViolation,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code
        ))

        # TODO(2769): Don't bring this in as a root entity
        # Get StateSupervisionViolationResponses
        supervision_violation_responses = (p | 'Load SupervisionViolationResponses' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionViolationResponse,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code
        ))

        # Get StateSupervisionSentences
        supervision_sentences = (p | 'Load SupervisionSentences' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionSentence,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code
        ))

        # Get StateIncarcerationSentences
        incarceration_sentences = (p | 'Load IncarcerationSentences' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateIncarcerationSentence,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code
        ))

        # Get StateSupervisionPeriods
        supervision_periods = (p | 'Load SupervisionPeriods' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateSupervisionPeriod,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=True,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code
        ))

        # Get StateAssessments
        assessments = (p | 'Load Assessments' >> BuildRootEntity(
            dataset=input_dataset,
            root_entity_class=entities.StateAssessment,
            unifying_id_field=entities.StatePerson.get_class_id_name(),
            build_related_entities=False,
            unifying_id_field_filter_set=person_id_filter_set,
            state_code=state_code
        ))

        # Bring in the table that associates StateSupervisionViolationResponses to information about StateAgents
        ssvr_to_agent_association_query = f"SELECT * FROM `{reference_dataset}.ssvr_to_agent_association`"

        ssvr_to_agent_associations = (p | "Read SSVR to Agent table from BigQuery" >>
                                      beam.io.Read(beam.io.BigQuerySource
                                                   (query=ssvr_to_agent_association_query,
                                                    use_standard_sql=True)))

        # Convert the association table rows into key-value tuples with the value for the
        # supervision_violation_response_id column as the key
        ssvr_agent_associations_as_kv = (ssvr_to_agent_associations | 'Convert SSVR to Agent table to KV tuples' >>
                                         beam.ParDo(ConvertDictToKVTuple(),
                                                    'supervision_violation_response_id')
                                         )

        supervision_period_to_agent_association_query = f"SELECT * FROM `{reference_dataset}." \
                                                        f"supervision_period_to_agent_association`"

        supervision_period_to_agent_associations = (p | "Read Supervision Period to Agent table from BigQuery" >>
                                                    beam.io.Read(beam.io.BigQuerySource
                                                                 (query=supervision_period_to_agent_association_query,
                                                                  use_standard_sql=True)))

        # Convert the association table rows into key-value tuples with the value for the supervision_period_id column
        # as the key
        supervision_period_to_agent_associations_as_kv = (supervision_period_to_agent_associations |
                                                          'Convert Supervision Period to Agent table to KV tuples' >>
                                                          beam.ParDo(ConvertDictToKVTuple(),
                                                                     'supervision_period_id')
                                                          )

        if state_code is None or state_code == 'US_MO':
            # Bring in the reference table that includes sentence status ranking information
            us_mo_sentence_status_query = f"SELECT * FROM `{reference_dataset}.us_mo_sentence_statuses`"

            us_mo_sentence_statuses = (p | "Read MO sentence status table from BigQuery" >>
                                       beam.io.Read(beam.io.BigQuerySource(query=us_mo_sentence_status_query,
                                                                           use_standard_sql=True)))
        else:
            us_mo_sentence_statuses = (p | f"Generate empty MO statuses list for non-MO state run: {state_code} " >>
                                       beam.Create([]))

        us_mo_sentence_status_rankings_as_kv = (
            us_mo_sentence_statuses |
            'Convert MO sentence status ranking table to KV tuples' >>
            beam.ParDo(ConvertDictToKVTuple(), 'person_id')
        )

        sentences_and_statuses = (
            {'incarceration_sentences': incarceration_sentences,
             'supervision_sentences': supervision_sentences,
             'sentence_statuses': us_mo_sentence_status_rankings_as_kv}
            | 'Group sentences to the sentence statuses for that person' >>
            beam.CoGroupByKey()
        )

        sentences_converted = (
            sentences_and_statuses
            | 'Convert to state-specific sentences' >>
            beam.ParDo(ConvertSentencesToStateSpecificType()).with_outputs('incarceration_sentences',
                                                                           'supervision_sentences')
        )

        # Group StateSupervisionViolationResponses and StateSupervisionViolations by person_id
        supervision_violations_and_responses = (
            {'violations': supervision_violations,
             'violation_responses': supervision_violation_responses
             } | 'Group StateSupervisionViolationResponses to '
                 'StateSupervisionViolations' >>
            beam.CoGroupByKey()
        )

        # Set the fully hydrated StateSupervisionViolation entities on the corresponding
        # StateSupervisionViolationResponses
        violation_responses_with_hydrated_violations = (
            supervision_violations_and_responses
            | 'Set hydrated StateSupervisionViolations on '
            'the StateSupervisionViolationResponses' >>
            beam.ParDo(SetViolationOnViolationsResponse()))

        # Group StateIncarcerationPeriods and StateSupervisionViolationResponses by person_id
        incarceration_periods_and_violation_responses = (
            {'incarceration_periods': incarceration_periods,
             'violation_responses':
                 violation_responses_with_hydrated_violations}
            | 'Group StateIncarcerationPeriods to '
              'StateSupervisionViolationResponses' >>
            beam.CoGroupByKey()
        )

        # Set the fully hydrated StateSupervisionViolationResponse entities on the corresponding
        # StateIncarcerationPeriods
        incarceration_periods_with_source_violations = (
            incarceration_periods_and_violation_responses
            | 'Set hydrated StateSupervisionViolationResponses on '
            'the StateIncarcerationPeriods' >>
            beam.ParDo(SetViolationResponseOnIncarcerationPeriod()))

        # Group each StatePerson with their StateIncarcerationPeriods and StateSupervisionSentences
        person_periods_and_sentences = (
            {'person': persons,
             'assessments': assessments,
             'incarceration_periods':
                 incarceration_periods_with_source_violations,
             'supervision_periods': supervision_periods,
             'supervision_sentences': sentences_converted.supervision_sentences,
             'incarceration_sentences': sentences_converted.incarceration_sentences,
             'violation_responses': violation_responses_with_hydrated_violations
             }
            | 'Group StatePerson to all entities' >>
            beam.CoGroupByKey()
        )

        # Identify SupervisionTimeBuckets from the StatePerson's StateSupervisionSentences and StateIncarcerationPeriods
        person_time_buckets = (
            person_periods_and_sentences
            | 'Get SupervisionTimeBuckets' >>
            beam.ParDo(ClassifySupervisionTimeBuckets(),
                       AsDict(ssvr_agent_associations_as_kv),
                       AsDict(supervision_period_to_agent_associations_as_kv)))

        # Get pipeline job details for accessing job_id
        all_pipeline_options = apache_beam_pipeline_options.get_all_options()

        # Get the type of metric to calculate
        metric_types_set = set(metric_types)

        # Add timestamp for local jobs
        job_timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H_%M_%S.%f')
        all_pipeline_options['job_timestamp'] = job_timestamp

        # Get supervision metrics
        supervision_metrics = (person_time_buckets | 'Get Supervision Metrics' >>
                               GetSupervisionMetrics(
                                   pipeline_options=all_pipeline_options,
                                   metric_types=metric_types_set,
                                   calculation_end_month=calculation_end_month,
                                   calculation_month_count=calculation_month_count))
        if person_id_filter_set:
            logging.warning("Non-empty person filter set - returning before writing metrics.")
            return

        # Convert the metrics into a format that's writable to BQ
        writable_metrics = (supervision_metrics | 'Convert to dict to be written to BQ' >>
                            beam.ParDo(
                                SupervisionMetricWritableDict()).with_outputs(
                                    'populations', 'revocations', 'successes',
                                    'successful_sentence_lengths', 'assessment_changes', 'revocation_analyses',
                                    'revocation_violation_type_analyses'
                                )
                            )

        # Write the metrics to the output tables in BigQuery
        populations_table = output + '.supervision_population_metrics'

        revocations_table = output + '.supervision_revocation_metrics'

        successes_table = output + '.supervision_success_metrics'

        successful_sentence_lengths_table = output + '.successful_supervision_sentence_days_served_metrics'

        assessment_changes_table = output + '.terminated_supervision_assessment_score_change_metrics'

        revocation_analysis_table = output + '.supervision_revocation_analysis_metrics'

        revocation_violation_type_analysis_table = output + \
            '.supervision_revocation_violation_type_analysis_metrics'

        _ = (writable_metrics.populations
             | f"Write population metrics to BQ table: {populations_table}" >>
             beam.io.WriteToBigQuery(
                 table=populations_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND
             ))

        _ = (writable_metrics.revocations
             | f"Write revocation metrics to BQ table: {revocations_table}" >>
             beam.io.WriteToBigQuery(
                 table=revocations_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND
             ))

        _ = (writable_metrics.successes
             | f"Write success metrics to BQ table: {successes_table}" >>
             beam.io.WriteToBigQuery(
                 table=successes_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND
             ))

        _ = (writable_metrics.successful_sentence_lengths
             | f"Write supervision successful sentence length metrics to BQ"
               f" table: {successful_sentence_lengths_table}" >>
             beam.io.WriteToBigQuery(
                 table=successful_sentence_lengths_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND
             ))

        _ = (writable_metrics.assessment_changes
             | f"Write assessment change metrics to BQ table: {assessment_changes_table}" >>
             beam.io.WriteToBigQuery(
                 table=assessment_changes_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND
             ))

        _ = (writable_metrics.revocation_analyses
             | f"Write revocation analyses metrics to BQ table: {revocation_analysis_table}" >>
             beam.io.WriteToBigQuery(
                 table=revocation_analysis_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND
             ))

        _ = (writable_metrics.revocation_violation_type_analyses
             | f"Write revocation violation type analyses metrics to BQ table: "
               f"{revocation_violation_type_analysis_table}" >>
             beam.io.WriteToBigQuery(
                 table=revocation_violation_type_analysis_table,
                 create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                 write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND
             ))
Example #4
def run(argv=None):
    """
    This function parses the command line arguments and runs the Beam Pipeline.

    Args:
        argv: list containing the commandline arguments for this call of the
         script.
    """
    # Keeps track of whether the schema was inferred from the input or output table.
    schema_inferred = False

    data_args, pipeline_args = parse_data_generator_args(argv)
    data_args, schema_inferred = fetch_schema(data_args, schema_inferred)
    pipeline_options = PipelineOptions(pipeline_args)

    temp_location = pipeline_options.display_data()['temp_location']
    temp_blob = write_n_line_file_to_gcs(
        pipeline_options.display_data()['project'], temp_location,
        data_args.num_records)

    data_gen = DataGenerator(bq_schema_filename=data_args.schema_file,
                             input_bq_table=data_args.input_bq_table,
                             p_null=data_args.p_null,
                             n_keys=data_args.n_keys,
                             min_date=data_args.min_date,
                             max_date=data_args.max_date,
                             only_pos=data_args.only_pos,
                             max_int=data_args.max_int,
                             max_float=data_args.max_float,
                             float_precision=data_args.float_precision,
                             write_disp=data_args.write_disp,
                             key_skew=data_args.key_skew,
                             primary_key_cols=data_args.primary_key_cols)

    # Initiate the pipeline using the pipeline arguments passed in from the
    # command line.  This includes information including where Dataflow should
    # store temp files, and what the project id is and what runner to use.
    p = beam.Pipeline(options=pipeline_options)

    rows = (
        p
        # Read the file we created with num_records newlines.
        | 'Read file with num_records lines' >> beam.io.ReadFromText(
            os.path.join('gs://', temp_blob.bucket.name, temp_blob.name))

        # Use our instance of the custom DataGenerator class to generate one fake
        # datum with the appropriate schema for each element in the PCollection
        # created above.
        | 'Generate Data' >> beam.ParDo(FakeRowGen(data_gen))
        | 'Parse Json Strings' >> beam.FlatMap(lambda row: [json.loads(row)]))

    if data_args.primary_key_cols:
        for key in data_args.primary_key_cols.split(','):
            rows |= 'Enforcing primary key: {}'.format(
                key) >> EnforcePrimaryKeys(key)

    if data_args.csv_schema_order:
        (rows
         | 'Order fields for CSV writing.' >> beam.FlatMap(
             lambda d: [dict_to_csv(d, data_args.csv_schema_order.split(','))])
         | 'Write to GCS' >> beam.io.textio.WriteToText(
             file_path_prefix=data_args.output_prefix, file_name_suffix='.csv')
         )

    if data_args.avro_schema_file:
        avsc = avro.schema.parse(open(data_args.avro_schema_file, 'rb').read())
        fastavro_avsc = fastavro.schema.load_schema(data_args.avro_schema_file)

        (rows
         # Need to convert time stamps from strings to timestamp-micros
         | 'Fix date and time Types for Avro.' >>
         beam.FlatMap(lambda row: fix_record_for_avro(row, avsc))
         | 'Write to Avro.' >> beam.io.avroio.WriteToAvro(
             file_path_prefix=data_args.output_prefix,
             codec='null',
             file_name_suffix='.avro',
             use_fastavro=True,
             schema=fastavro_avsc))

    if data_args.write_to_parquet:
        with open(data_args.schema_file, 'r') as infile:
            str_schema = json.load(infile)
        pa_schema = get_pyarrow_translated_schema(str_schema)
        (rows
         | 'Fix date and time Types for Parquet.' >>
         beam.FlatMap(lambda row: fix_record_for_parquet(row, str_schema))
         | 'Write to Parquet.' >> beam.io.WriteToParquet(
             file_path_prefix=data_args.output_prefix,
             codec='null',
             file_name_suffix='.parquet',
             schema=pa_schema))

    if data_args.output_bq_table:
        (rows
         | 'Write to BigQuery.' >> beam.io.gcp.bigquery.WriteToBigQuery(
             # The table name is a required argument for the BigQuery sink.
             # In this case we use the value passed in from the command
             # line.
             data_args.output_bq_table,
             schema=None if schema_inferred else data_gen.get_bq_schema(),
             # Creates the table in BigQuery if it does not yet exist.
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=data_gen.write_disp,
             # Use the max recommended batch size.
             batch_size=500))

    p.run().wait_until_finish()

    # Manually clean up of temp_num_records.txt because it will be outside this
    # job's directory and Dataflow will not remove it for us.
    temp_blob.delete()
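The 'Fix date and time Types for Avro.' step above depends on a fix_record_for_avro helper that is not shown. A minimal sketch of the idea with a hypothetical helper name, assuming timestamps arrive as strings in a known format and must become epoch microseconds for Avro's timestamp-micros logical type:

import datetime


def fix_record_for_avro_sketch(record, timestamp_fields):
    """Hypothetical helper: convert string timestamps to epoch microseconds."""
    fixed = dict(record)
    epoch = datetime.datetime(1970, 1, 1)
    for field in timestamp_fields:
        value = fixed.get(field)
        if isinstance(value, str):
            # Assumed input format; the real helper derives this from the Avro schema.
            parsed = datetime.datetime.strptime(value, '%Y-%m-%d %H:%M:%S')
            fixed[field] = int((parsed - epoch).total_seconds() * 1_000_000)
    return fixed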
Example #5
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--topic_prefix',
                        dest='topic_prefix',
                        default=default_topic)
    parser.add_argument('--bucket', dest='bucket', default=default_bucket)

    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        '--project={}'.format(project), '--streaming',
        '--experiments=allow_non_updatable_job'
    ])

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    class DiffOutputsFn(beam.DoFn):
        # These tags will be used to tag the outputs of this DoFn.
        OUTPUT_TAG_BUY = 'buy'
        OUTPUT_TAG_SELL = 'sell'
        OUTPUT_TAG_ERROR = 'error'

        def process(self, element):
            dictionary = yaml.safe_load(element)
            dictionary['timestamp'] = datetime.now().strftime(
                '%Y-%m-%d %H:%M:%S')
            if dictionary['type'] == 'buy':
                dictionary.pop('type')
                yield pvalue.TaggedOutput(self.OUTPUT_TAG_BUY, dictionary)
            elif dictionary['type'] == 'sell':
                dictionary.pop('type')
                yield pvalue.TaggedOutput(self.OUTPUT_TAG_SELL, dictionary)
            else:
                # we don't drop the key here, since we want to know where the mistake was
                yield pvalue.TaggedOutput(self.OUTPUT_TAG_ERROR, dictionary)

    def string_join(elements):
        string = str(elements)
        return string.replace('},', '};')

    with beam.Pipeline(options=pipeline_options) as p:

        output_buy = []
        output_sell = []
        output_error = []

        for branch in range(BRANCHES):
            current_topic = known_args.topic_prefix + str(branch)
            diff_outputs = (p | "ReadTopic{}".format(branch) >>
                            beam.io.ReadFromPubSub(topic=current_topic)
                            | "SplitOutputs{}".format(branch) >> beam.ParDo(
                                DiffOutputsFn()).with_outputs(
                                    DiffOutputsFn.OUTPUT_TAG_BUY,
                                    DiffOutputsFn.OUTPUT_TAG_SELL,
                                    DiffOutputsFn.OUTPUT_TAG_ERROR))

            # We need to make a list for each output type
            output_buy.append(diff_outputs.buy)
            output_sell.append(diff_outputs.sell)
            output_error.append(diff_outputs.error)

        buy = (
            tuple(output_buy) | "FlattenBuy" >> beam.Flatten()
            |
            "WindowBuy" >> beam.WindowInto(window.FixedWindows(WINDOW_LENGTH))
            | "CombineBuy" >>
            beam.CombineGlobally(string_join).without_defaults()
            | "WriteToGCSBuy" >>
            WriteToText(file_path_prefix=known_args.bucket + 'buy/'))

        sell = (
            tuple(output_sell) | "FlattenSell" >> beam.Flatten()
            |
            "WindowSell" >> beam.WindowInto(window.FixedWindows(WINDOW_LENGTH))
            | "CombineSell" >>
            beam.CombineGlobally(string_join).without_defaults()
            | "WriteToGCSSell" >>
            WriteToText(file_path_prefix=known_args.bucket + 'sell/'))

        error = (tuple(output_error) | "FlattenError" >> beam.Flatten()
                 | "WindowError" >> beam.WindowInto(
                     window.FixedWindows(WINDOW_LENGTH))
                 | "CombineError" >>
                 beam.CombineGlobally(string_join).without_defaults()
                 | "WriteToGCSError" >>
                 WriteToText(file_path_prefix=known_args.bucket + 'error/'))
Example #6
def ExtractSliceKeys(extracts: beam.pvalue.PCollection,
                     slice_spec: List[slicer.SingleSliceSpec],
                     eval_config: Optional[config_pb2.EvalConfig] = None,
                     materialize: bool = True) -> beam.pvalue.PCollection:
    return extracts | beam.ParDo(ExtractSliceKeysFn(eval_config, materialize),
                                 slice_spec=slice_spec)
Example #7
    def expand(self, deployed_model):
        """Apply the transform.

    Args:
      deployed_model: A PCollection should be the output of DeployVersion, or a
        tuple of (model, version).

    Returns:
         A PCollection with the results of the prediction.

    Raises:
       ValueError: If the arguments are invalid.
    """
        pipeline = deployed_model.pipeline

        # For the job name use a combination of the transform label and a
        # datestamp. The datestamp is intended to make it unique.
        now = datetime.datetime.now()
        # We add some salt to the job name to avoid collisions if we try to submit
        # multiple jobs at the same time.
        # N.B. The job_name is fixed at pipeline construction time. This is
        # critical because multiple invocation of the Train transform (e.g. because
        # of retries) need to use the same job name.
        salt = '%04x' % random.getrandbits(4 * 4)

        # TODO(b/28989568): We need to lower case the name because the backend
        # only allows lower case letters for job names. The backend should probably
        # do this automatically but currently it doesn't.
        job_name = '{0}_{1}_{2}'.format(self.label,
                                        now.strftime('%y%m%d_%H%M%S'),
                                        salt).lower().replace(' ', '_')

        options = pipeline.options
        # TODO(b/29163051) Options can be None depending on how the runner was
        # constructed.
        if options is None:
            options = df_options.PipelineOptions()

        cloud_options = options.view_as(df_options.GoogleCloudOptions)
        project_id = cloud_options.project

        if cloud_options.temp_location:
            temp_dir = cloud_options.temp_location
        elif cloud_options.staging_location:
            temp_dir = cloud_options.staging_location
        else:
            raise ValueError(
                '--staging_location must be specified to run in the cloud')

        if not self.output_uri:
            output_uri = os.path.join(temp_dir, 'prediction_results')
        else:
            output_uri = self.output_uri

        logging.info('Output uri : %s', output_uri)

        # Construct the batch prediction job.
        prediction_request = ml_func.PredictionJobRequest(
            project_id,
            job_name,
            self.input_uris,
            output_uri,
            self.region,
            self.data_format,
            endpoint=self.cloud_ml_endpoint,
            runtime_version=self.runtime_version)
        request = (
            pipeline | 'PredictRequest' >> beam.Create([prediction_request])
            | 'AugmentPredictArgs' >> beam.ParDo(
                ml_func._AugmentPredictArgsDo(),  # pylint: disable=protected-access
                beam.pvalue.AsSingleton(deployed_model)))

        # Run the batch prediction job
        predict_do = ml_func.BatchPredictionJobDo(api_class=self.api_version)
        unused_prediction_results = (
            request | 'BatchPrediction' >> beam.ParDo(predict_do))

        # Wait until the prediction job is done, then Read the results from the file
        # to which they were written and return.
        results = (
            pipeline
            | 'Read Results' >> beam.io.ReadFromText(output_uri, validate=False))
        return results
Example #8
  def expand(self, pcoll):
    return (
        pcoll
        | 'ProcessTransformLog' >> beam.ParDo(ProcessTransformLog()))
Example #9
  def expand(self, pcoll):
    return (
        pcoll
        | 'JoinTable' >> beam.ParDo(JoinTable()))
Example #10
    def process(self, element):
        """
        Prepares each row to be written in the csv
        """
        result = [
            "{},{},{}".format(element[0], element[1]['users'][0],
                              element[1]['timings'][0])
        ]
        with open(output_filename, 'a') as f:
            f.write(result[0] + "\n")
        return result


if __name__ == '__main__':
    with beam.Pipeline(options=options) as p:
        rows = (p | ReadFromText(input_filename) | beam.ParDo(Split()))

        timings = (rows | beam.ParDo(CollectTimings())
                   | "Grouping timings" >> beam.GroupByKey()
                   | "Calculating average" >> beam.CombineValues(
                       beam.combiners.MeanCombineFn()))

        users = (rows | beam.ParDo(CollectUsers())
                 | "Grouping users" >> beam.GroupByKey() | "Counting users" >>
                 beam.CombineValues(beam.combiners.CountCombineFn()))

        to_be_joined = ({
            'timings': timings,
            'users': users
        } | beam.CoGroupByKey() | beam.ParDo(WriteToCSV())
                        | WriteToText(output_filename))
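The main block above relies on Split, CollectTimings and CollectUsers DoFns defined earlier in the original script. A minimal sketch of what they might look like; the column layout assumed here (a per-request log line carrying a user id, a URL, and a timing) is an assumption, not the original definition:

import apache_beam as beam


class Split(beam.DoFn):
    # Assumed layout: "timestamp,source_ip,user_id,url,timing" per line.
    def process(self, element):
        timestamp, source_ip, user_id, url, timing = element.split(',')
        yield {'timestamp': timestamp, 'source_ip': source_ip,
               'user_id': user_id, 'url': url, 'timing': float(timing)}


class CollectTimings(beam.DoFn):
    # Emit (url, timing) pairs so timings can be averaged per URL.
    def process(self, element):
        yield element['url'], element['timing']


class CollectUsers(beam.DoFn):
    # Emit (url, user_id) pairs so users can be counted per URL.
    def process(self, element):
        yield element['url'], element['user_id']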
Example #11
    'temp_location': BUCKET + '/temp',
    'staging_location': BUCKET + '/staging',
    'machine_type': 'n1-standard-1', # machine types listed here: https://cloud.google.com/compute/docs/machine-types
    'num_workers': 1
}
opts = beam.pipeline.PipelineOptions(flags=[], **options)

with beam.Pipeline('DataflowRunner', options=opts) as p:

    takes_pcoll = p | 'Read from BQ Takes' >> beam.io.Read(beam.io.BigQuerySource(query='SELECT sid, cno, grade FROM college_split.Takes'))
    class_pcoll = p | 'Read from BQ Class' >> beam.io.Read(beam.io.BigQuerySource(query='SELECT cno FROM college_split.Class'))

    # write PCollections to log files
    takes_pcoll | 'Write log 1' >> WriteToText(DIR_PATH + 'takes_query_results.txt')
    class_pcoll | 'Write log 2' >> WriteToText(DIR_PATH + 'class_query_results.txt')

    # apply ParDo to check cno value's referential integrity 
    norm_takes_pcoll = takes_pcoll | 'Normalize Record' >> beam.ParDo(NormalizeTakesFn(), beam.pvalue.AsList(class_pcoll))

    # write PCollection to log file
    norm_takes_pcoll | 'Write log 3' >> WriteToText(DIR_PATH + 'norm_takes_pcoll.txt')
    
    qualified_table_name = PROJECT_ID + ':college_normalized.Takes'
    table_schema = 'sid:STRING,cno:STRING,grade:STRING'
    
    # write PCollection to new BQ table
    norm_takes_pcoll | 'Write BQ table' >> beam.io.Write(beam.io.BigQuerySink(qualified_table_name, 
                                                    schema=table_schema,  
                                                    create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                                                    write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))
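The 'Normalize Record' step above applies a NormalizeTakesFn that is not shown; per the comment, its job is to check the referential integrity of each cno value against the Class rows passed in as a side input. A minimal sketch under that assumption (the real DoFn may repair rather than drop offending rows):

import apache_beam as beam


class NormalizeTakesFn(beam.DoFn):
    def process(self, element, class_rows):
        # class_rows is the side-input list of Class rows; keep a Takes row only
        # if its cno actually exists in Class.
        valid_cnos = {row['cno'] for row in class_rows}
        if element.get('cno') in valid_cnos:
            yield element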
Example #12
def examples_wordcount_debugging(renames):
    """DebuggingWordCount example snippets."""
    import re

    import apache_beam as beam

    # [START example_wordcount_debugging_logging]
    # [START example_wordcount_debugging_aggregators]
    import logging

    class FilterTextFn(beam.DoFn):
        """A DoFn that filters for a specific key based on a regular expression."""

        def __init__(self, pattern):
            self.pattern = pattern
            # A custom metric can track values in your pipeline as it runs. Create
            # custom metrics matched_words and unmatched_words.
            self.matched_words = Metrics.counter(self.__class__, 'matched_words')
            self.unmatched_words = Metrics.counter(self.__class__, 'unmatched_words')

        def process(self, element):
            word, _ = element
            if re.match(self.pattern, word):
                # Log at INFO level each element we match. When executing this pipeline
                # using the Dataflow service, these log lines will appear in the Cloud
                # Logging UI.
                logging.info('Matched %s', word)

                # Add 1 to the custom metric counter matched_words
                self.matched_words.inc()
                yield element
            else:
                # Log at the "DEBUG" level each element that is not matched. Different
                # log levels can be used to control the verbosity of logging providing
                # an effective mechanism to filter less important information. Note
                # currently only "INFO" and higher level logs are emitted to the Cloud
                # Logger. This log message will not be visible in the Cloud Logger.
                logging.debug('Did not match %s', word)

                # Add 1 to the custom metric counter unmatched_words
                self.unmatched_words.inc()

    # [END example_wordcount_debugging_logging]
    # [END example_wordcount_debugging_aggregators]

    p = TestPipeline()  # Use TestPipeline for testing.
    filtered_words = (
        p
        | beam.io.ReadFromText(
            'gs://dataflow-samples/shakespeare/kinglear.txt')
        | 'ExtractWords' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
        | beam.combiners.Count.PerElement()
        | 'FilterText' >> beam.ParDo(FilterTextFn('Flourish|stomach')))

    # [START example_wordcount_debugging_assert]
    from apache_beam.testing.util import assert_that, equal_to
    assert_that(
        filtered_words, equal_to([('Flourish', 3), ('stomach', 1)]))
    # [END example_wordcount_debugging_assert]

    output = (filtered_words
              | 'format' >> beam.Map(lambda word_c: '%s: %s' % (word_c[0], word_c[1]))
              | 'Write' >> beam.io.WriteToText('gs://my-bucket/counts.txt'))

    p.visit(SnippetUtils.RenameFiles(renames))
    p.run()
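The counters declared in FilterTextFn are regular Beam metrics, so they can be read back from the pipeline result after the run. A hedged sketch of doing so with the standard metrics API, assuming the result of p.run() is captured instead of discarded as above:

from apache_beam.metrics.metric import MetricsFilter

result = p.run()
result.wait_until_finish()
query_result = result.metrics().query(MetricsFilter().with_name('matched_words'))
for counter in query_result['counters']:
    # committed holds the consolidated counter value for the completed run
    print(counter.key.metric.name, counter.committed)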
Example #13
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, GoogleCloudOptions, StandardOptions



def printer(data_item):
    print(data_item)




options = PipelineOptions()
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.project = 'first-gcp-wordcount'

# google_cloud_options.job_name = 'myjob'
# google_cloud_options.staging_location = 'gs://your-bucket-name-here/staging'
# google_cloud_options.temp_location = 'gs://your-bucket-name-here/temp'
options.view_as(StandardOptions).runner = 'DirectRunner'
options.view_as(StandardOptions).streaming = True

p = beam.Pipeline(options=options)

lines = (
    p
    | "read data from subscription :" >> beam.io.ReadFromPubSub(
        subscription="projects/first-gcp-wordcount/subscriptions/subscribe_test_twitter"
    ).with_output_types(bytes)
    # | "map the message :" >> beam.Map(lambda x: p)
    | "print the value :" >> beam.ParDo(printer)
    # | "makedataframe :" >> beam.ParDo(dataframe_val)
    # | "print the value 2:" >> beam.ParDo(printer)
)
p.run().wait_until_finish()
Example #14
import datetime

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from config import Config

from data.utils import dataloader

if __name__ == "__main__":
    options = PipelineOptions(
        runner=Config.RUNNER,
        project=Config.PROJECT_ID,
        job_name='generate-tfrecords' +
        datetime.datetime.now().strftime("%m-%d-%Y-%H-%M-%S"),
        temp_location='gs://raw_data_layer/temp',
        region='us-central1',
        setup_file="./setup.py")

    with beam.Pipeline(options=options) as pipeline:
        content = (
            pipeline
            | "create data" >> beam.io.ReadFromTFRecord(
                file_pattern=
                "/Volumes/STEF-EXT/object_detection/kitti/kitti/3.2.0/kitti-train.tfrecord*"
            )
            | "parse tfds examples" >> beam.ParDo(dataloader.ParseExample())
            | "create tf examples" >> beam.ParDo(dataloader.ConvertToExample())
            | "write to TFRecords" >>
            beam.io.WriteToTFRecord(file_path_prefix=Config.LABELS_TFRECORD))
Example #15
  def test_wordcount(self):
    class WordExtractingDoFn(beam.DoFn):
      def process(self, element):
        text_line = element.strip()
        words = text_line.split()
        return words

    p = beam.Pipeline(
        runner=interactive_runner.InteractiveRunner(
            direct_runner.DirectRunner()))

    # Count the occurrences of each word.
    counts = (
        p
        | beam.Create(['to be or not to be that is the question'])
        | 'split' >> beam.ParDo(WordExtractingDoFn())
        | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
        | 'group' >> beam.GroupByKey()
        | 'count' >> beam.Map(lambda wordones: (wordones[0], sum(wordones[1]))))

    # Watch the local scope for Interactive Beam so that counts will be cached.
    ib.watch(locals())

    result = p.run()
    result.wait_until_finish()

    actual = list(result.get(counts))
    self.assertSetEqual(
        set(actual),
        set([
            ('or', 1),
            ('that', 1),
            ('be', 2),
            ('is', 1),
            ('question', 1),
            ('to', 2),
            ('the', 1),
            ('not', 1),
        ]))

    # Truncate the precision to millis because the window coder uses millis
    # as units then gets upcast to micros.
    end_of_window = (GlobalWindow().max_timestamp().micros // 1000) * 1000
    df_counts = ib.collect(counts, include_window_info=True)
    df_expected = pd.DataFrame({
        0: [e[0] for e in actual],
        1: [e[1] for e in actual],
        'event_time': [end_of_window for _ in actual],
        'windows': [[GlobalWindow()] for _ in actual],
        'pane_info': [
            PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0) for _ in actual
        ]
    },
                               columns=[
                                   0, 1, 'event_time', 'windows', 'pane_info'
                               ])

    pd.testing.assert_frame_equal(df_expected, df_counts)

    actual_reified = result.get(counts, include_window_info=True)
    expected_reified = [
        WindowedValue(
            e,
            Timestamp(micros=end_of_window), [GlobalWindow()],
            PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0)) for e in actual
    ]
    self.assertEqual(actual_reified, expected_reified)
Example #16
    # Tail of a truncated helper (an AvgCancerStat DoFn, applied below): returns
    # the average age-adjusted and crude rates for the rows it collected.
    return [total_age / len(age_adjusted_rate), total_crude / len(crude_rate)]


import os

import apache_beam as beam
from apache_beam.io import WriteToText


PROJECT_ID = os.environ['dogwood-outcome-231223']

# Project ID is required when using the BQ source
options = {
    'project': PROJECT_ID
}
opts = beam.pipeline.PipelineOptions(flags=[], **options)

# Create beam pipeline using local runner
with beam.Pipeline('DirectRunner', options=opts) as p:

    query_results = p | 'Read Query' >> beam.io.Read(beam.io.BigQuerySource(query='SELECT crude_rate, population,age_adjusted_rate, area FROM cancer_stats.Cancer_By_Area_Incidence LIMIT 1000'))
    
    formatted_dob_pcoll = query_results | 'Format DOB' >> beam.ParDo(AvgCancerStat())

    # write PCollections to log files
    p | 'Write log' >> WriteToText('input.txt')

    # write PCollection to log file
    formatted_dob_pcoll | 'Write log 2' >> WriteToText('output.txt')
    
    # write PCollection to new BQ table
    formatted_dob_pcoll | 'Write BQ table' >> beam.io.Write(beam.io.BigQuerySink(
        PROJECT_ID + ':cancer_stats.AvgCancerArea',  # dataset assumed from the source query above
        schema='area:STRING,avg_crude:FLOAT,population:INTEGER,avg_age_adjusted_rate:FLOAT',
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))
Example #17
                else:
                    valenc = bytes(val)    
                row.set_cell('cf1',colname.encode("utf-8"),valenc, datetime.now())
        rows.append(row)    
        table.mutate_rows(rows)   
    except:   
        logging.error("Failed with input: ", str(element))
        raise
        
options = PipelineOptions()

google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.job_name = 'beamwordcount'
google_cloud_options.staging_location = 'gs://beamgcdeveloperwipro/staging'
google_cloud_options.temp_location = 'gs://beamgcdeveloperwipro/temp'
options.view_as(StandardOptions).runner = 'DataflowRunner'

avro_input = options.view_as(MyOptions).avro_input
json_input = options.view_as(MyOptions).json_input
project_id =  google_cloud_options.project

p = beam.Pipeline(options=options)

lines_avro  = p | "ReadAvroFromGCS" >> beam.io.avroio.ReadFromAvro(avro_input)
lines_text  = p | "ReadJsonFromGCS" >> beam.io.ReadFromText(json_input)
lines_avro | "CreateHbaseRowsFromAvro" >> beam.ParDo(CreateHbaseRow(project_id,
                                                                'mybigtable','customer'))
lines_json = lines_text | "ConvertToJson" >> beam.ParDo(ConvertToJson())  
lines_json | "CreateHbaseRowsFromJson" >> beam.ParDo(CreateHbaseRow(project_id,
                                                                'mybigtable','customerfromjson'))   
p.run()
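The JSON branch above uses a ConvertToJson DoFn that is not shown. A minimal sketch of what it presumably does, given that the downstream CreateHbaseRow step expects dict-like records:

import json

import apache_beam as beam


class ConvertToJson(beam.DoFn):
    # Hypothetical sketch: parse each text line read from GCS into a dict.
    def process(self, element):
        yield json.loads(element)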
Example #18
File: test2.py Project: basanthsk/GCP
                for dim, axis in item.items():
                    label = (title[dim].values[0])
                    d = {
                        u'fbkey': '{}'.format(element['fbkeyl2']),
                        'axisDim': int(dim),
                        'axisOrder': int(idx),
                        'axisValue': float(axis),
                        'axisTitle': u'{}'.format(label)
                    }
                    yield json.dumps(d)

        except Exception as e:
            yield beam.pvalue.TaggedOutput('exception',
                                           element['physical_measurement'])


if __name__ == '__main__':
    with beam.Pipeline(options=options) as pipeline:
        data, log = (
            pipeline
            | beam.io.ReadFromText(infile, coder=JsonCoder())
            | beam.Filter(lambda row: all(
                [row['content'] != 'notParse', row['type'] == 'measurement']))
            # | beam.Map(lambda e : (e['content'],e['physical_measurement']))
            | 'Print Results' >> beam.ParDo(DimTrans()).with_outputs(
                'exception', main='data'))

        data | beam.io.WriteToText(outfile)
        log | 'exception' >> beam.io.WriteToText('log file.txt')
Example #19
def run(argv=None, save_main_session=True):
    """Runs the workflow."""
    known_args, pipeline_args = parse_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session

    input_info = known_args.input

    with TestPipeline(options=pipeline_options) as p:
        source = SyntheticSource(input_info)

        # pylint: disable=expression-not-assigned
        barrier = known_args.barrier

        pc_list = []
        num_roots = 2**(len(known_args.steps) -
                        1) if (barrier == 'merge-gbk'
                               or barrier == 'merge-side-input') else 1
        for read_no in range(num_roots):
            pc_list.append((p | ('Read %d' % read_no) >> beam.io.Read(source)))

        for step_no, steps in enumerate(known_args.steps):
            if step_no != 0:
                new_pc_list = []
                for pc_no, pc in enumerate(pc_list):
                    if barrier == 'shuffle':
                        new_pc_list.append(
                            (pc | ('shuffle %d.%d' %
                                   (step_no, pc_no)) >> ShuffleBarrier()))
                    elif barrier == 'side-input':
                        new_pc_list.append(
                            (pc | ('side-input %d.%d' %
                                   (step_no, pc_no)) >> SideInputBarrier()))
                    elif barrier == 'expand-gbk':
                        new_pc_list.extend(
                            expand_using_gbk(
                                ('expand-gbk %d.%d' % (step_no, pc_no)), pc))
                    elif barrier == 'expand-second-output':
                        new_pc_list.extend(
                            expand_using_second_output(
                                ('expand-second-output %d.%d' %
                                 (step_no, pc_no)), pc))
                    elif barrier == 'merge-gbk':
                        if pc_no % 2 == 0:
                            new_pc_list.append(
                                merge_using_gbk(
                                    ('merge-gbk %d.%d' % (step_no, pc_no)), pc,
                                    pc_list[pc_no + 1]))
                        else:
                            continue
                    elif barrier == 'merge-side-input':
                        if pc_no % 2 == 0:
                            new_pc_list.append(
                                merge_using_side_input(
                                    ('merge-side-input %d.%d' %
                                     (step_no, pc_no)), pc,
                                    pc_list[pc_no + 1]))
                        else:
                            continue

                pc_list = new_pc_list

            new_pc_list = []
            for pc_no, pc in enumerate(pc_list):
                if steps['splittable']:
                    step = get_synthetic_sdf_step(
                        per_element_delay_sec=steps['per_element_delay'],
                        per_bundle_delay_sec=steps['per_bundle_delay'],
                        output_records_per_input_record=steps[
                            'output_records_per_input_record'],
                        output_filter_ratio=steps['output_filter_ratio'],
                        initial_splitting_num_bundles=steps[
                            'initial_splitting_num_bundles'],
                        initial_splitting_uneven_chunks=steps[
                            'initial_splitting_uneven_chunks'],
                        disable_liquid_sharding=steps[
                            'disable_liquid_sharding'],
                        size_estimate_override=steps['size_estimate_override'])
                else:
                    step = SyntheticStep(
                        per_element_delay_sec=steps['per_element_delay'],
                        per_bundle_delay_sec=steps['per_bundle_delay'],
                        output_records_per_input_record=steps[
                            'output_records_per_input_record'],
                        output_filter_ratio=steps['output_filter_ratio'])
                new_pc = pc | 'SyntheticStep %d.%d' % (
                    step_no, pc_no) >> beam.ParDo(step)
                new_pc_list.append(new_pc)
            pc_list = new_pc_list

        if known_args.output:
            # If an output location is provided we format and write output.
            if len(pc_list) == 1:
                (pc_list[0]
                 | 'FormatOutput' >> beam.Map(lambda elm: (elm[0] + elm[1]))
                 | 'WriteOutput' >> WriteToText(known_args.output))

    logging.info('Pipeline run completed.')
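
# ShuffleBarrier and SideInputBarrier above are composite transforms assumed to
# be defined elsewhere in this benchmark module. As a rough, hypothetical sketch
# (not necessarily the benchmark's exact implementation), a shuffle barrier can
# force a full shuffle by grouping on the element key and re-emitting the
# original (key, value) pairs:
class ShuffleBarrierSketch(beam.PTransform):
    """Hypothetical stand-in that forces a shuffle via GroupByKey."""
    def expand(self, pc):
        return (pc
                | beam.Map(lambda kv: (kv[0], kv[1]))  # assumes (key, value) inputs
                | beam.GroupByKey()
                | beam.FlatMap(lambda kv: [(kv[0], v) for v in kv[1]]))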
Example #20
0
    def test_streaming_complex_timing(self):
        # Use state on the TestCase class, since other references would be pickled
        # into a closure and not have the desired side effects.
        #
        # TODO(BEAM-5295): Use assert_that after it works for the cases here in
        # streaming mode.
        WriteFilesTest.all_records = []

        dir = '%s%s' % (self._new_tempdir(), os.sep)

        # Setting up the input (TestStream)
        ts = TestStream().advance_watermark_to(0)
        for elm in WriteFilesTest.LARGER_COLLECTION:
            timestamp = int(elm)

            ts.add_elements([('key', '%s' % elm)])
            if timestamp % 5 == 0 and timestamp != 0:
                # TODO(BEAM-3759): Add many firings per window after getting PaneInfo.
                ts.advance_processing_time(5)
                ts.advance_watermark_to(timestamp)
        ts.advance_watermark_to_infinity()

        def no_colon_file_naming(*args):
            file_name = fileio.destination_prefix_naming()(*args)
            return file_name.replace(':', '_')

        # The pipeline that we are testing
        options = PipelineOptions()
        options.view_as(StandardOptions).streaming = True
        with TestPipeline(options=options) as p:
            res = (p
                   | ts
                   | beam.WindowInto(
                       FixedWindows(10),
                       trigger=trigger.AfterWatermark(),
                       accumulation_mode=trigger.AccumulationMode.DISCARDING)
                   | beam.GroupByKey()
                   | beam.FlatMap(lambda x: x[1]))
            # Fixed 10-second windows, firing on the watermark and discarding
            # fired panes.

            _ = (res
                 | beam.io.fileio.WriteToFiles(
                     path=dir,
                     file_naming=no_colon_file_naming,
                     max_writers_per_bundle=0)
                 | beam.Map(lambda fr: FileSystems.join(dir, fr.file_name))
                 | beam.ParDo(self.record_dofn()))

        # Verification pipeline
        with TestPipeline() as p:
            files = (p | beam.io.fileio.MatchFiles(FileSystems.join(dir, '*')))

            file_names = (files | beam.Map(lambda fm: fm.path))

            file_contents = (
                files
                | beam.io.fileio.ReadMatches()
                | beam.Map(lambda rf: (rf.metadata.path,
                                       rf.read_utf8().strip().split('\n'))))

            content = (file_contents
                       | beam.FlatMap(lambda fc: [ln.strip() for ln in fc[1]]))

            assert_that(file_names,
                        equal_to(WriteFilesTest.all_records),
                        label='AssertFilesMatch')
            assert_that(content,
                        matches_all(WriteFilesTest.LARGER_COLLECTION),
                        label='AssertContentsMatch')
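
# self.record_dofn() above is assumed to return a DoFn instance defined on the
# test case. A plausible sketch (hypothetical, not necessarily the test's exact
# helper) appends every file path it sees to WriteFilesTest.all_records so the
# verification pipeline can compare the written file names against it:
class RecordDoFnSketch(beam.DoFn):
    def process(self, element):
        WriteFilesTest.all_records.append(element)
        yield element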
Example #21
0
    def expand(self, train_and_test_datasets):
        """Apply the transform.

    Args:
      train_and_test_datasets: A pair of (train, test) PCollections of
        json strings representing Example Protos

    Returns:
       A 2-tuple of
         A PCollection with a single TrainedModel, suitable for use by Predict
         A PCollection with a single TrainingJobResult that describes the
         result of training.

    Raises:
       ValueError: If the arguments are invalid.
    """
        train_dataset, test_dataset = train_and_test_datasets
        pipeline = train_dataset.pipeline

        # For the job name use a combination of the transform label and a
        # datestamp. The datestamp is intended to make it unique.
        now = datetime.datetime.now()
        # We add some salt to the job name to avoid collisions if we try to submit
        # multiple jobs at the same time.
        # N.B. The job_name is fixed at pipeline construction time. This is
        # critical because multiple invocations of the Train transform (e.g. because
        # of retries) need to use the same job name.
        salt = '%04x' % random.getrandbits(4 * 4)

        # TODO(b/28989568): We need to lower case the name because the backend
        # only allows lower case letters for job names. The backend should probably
        # do this automatically but currently it doesn't.
        job_name = '{0}_{1}_{2}'.format(self.label,
                                        now.strftime('%y%m%d_%H%M%S'),
                                        salt).lower()

        options = pipeline.options
        # TODO(b/29163051) Options can be None depending on how the runner was
        # constructed.
        if options is None:
            options = df_options.PipelineOptions()

        cloud_options = options.view_as(df_options.GoogleCloudOptions)
        run_on_cloud = self.use_cloud_ml

        if run_on_cloud is None:
            # TODO(user): Remove the fallback after the next Dataflow release.
            try:
                dataflow_runner = beam.runners.DataflowRunner
            except AttributeError:
                dataflow_runner = beam.runners.DataflowPipelineRunner

            # Choose a default based on the runner.
            if isinstance(pipeline.runner, dataflow_runner):
                run_on_cloud = True
            else:
                run_on_cloud = False

        if self.output_dir:
            temp_dir = self.output_dir
        elif run_on_cloud:
            cloud_options = options.view_as(df_options.GoogleCloudOptions)

            if cloud_options.temp_location:
                temp_dir = os.path.join(cloud_options.temp_location, job_name)
            elif cloud_options.staging_location:
                temp_dir = os.path.join(cloud_options.staging_location,
                                        job_name)
            else:
                raise ValueError(
                    'Either --temp_location or --staging_location must be '
                    'specified to run in the cloud')
        else:
            temp_dir = tempfile.mkdtemp(job_name)
        logging.info('Temp dir: %s', temp_dir)

        if run_on_cloud:
            train_do = ml_func.TrainingJobDo()
            project = cloud_options.project
        else:
            train_do = ml_func._TrainingJobLocalDo()  # pylint: disable=protected-access
            project = None

        _ = train_dataset | dfutil.CountPCollection('ml-train-input')

        # Write the train and test data to files so we can pass it to the trainer.
        train_data_path = os.path.join(temp_dir, 'training')
        test_data_path = os.path.join(temp_dir, 'testing')
        output_dir = os.path.join(temp_dir, 'model')
        # TODO(b/34839956) Make sure we can handle the tf.Transform metadata.
        metadata_path = os.path.join(output_dir, 'metadata.json')

        # This PTransform is primarily to avoid stage name collisions in writing
        # training and test data.
        # TODO(user): Figure out why using @beam.ptransform_fn breaks pickling.
        train_files = (
            train_dataset | 'WriteTrainData' >> ml_func._WrapCallable(  # pylint: disable=protected-access
                self.tf_main_spec.write_input_data, train_data_path))
        test_files = (
            test_dataset | 'WriteTestData' >> ml_func._WrapCallable(  # pylint: disable=protected-access
                self.tf_main_spec.write_input_data, test_data_path))
        if self.metadata:
            metadata_files = self.metadata | SaveMetadata(metadata_path)
        else:
            metadata_files = pipeline | beam.Create([None])

        # Construct and run the training job.
        train_request = self.tf_main_spec.train_request.copy()
        if not train_request.package_uris:
            train_request.package_uris = []
        if self.package_uris:
            if isinstance(self.package_uris, str):
                train_request.package_uris.extend([self.package_uris])
            else:
                train_request.package_uris.extend(self.package_uris)
        # Remove duplicate package URIs from train_request (order is not preserved).
        train_request.package_uris = list(set(train_request.package_uris))

        train_request.job_args = self.job_args or []
        if self.python_module:
            train_request.python_module = self.python_module
        if not train_request.project:
            train_request.parent = project
        if not train_request.job_name:
            train_request.job_name = job_name
        if not train_request.endpoint:
            train_request.endpoint = self.cloud_ml_endpoint
        if not train_request.hyperparameters:
            train_request.hyperparameters = self.hyperparameters
        if not train_request.region:
            train_request.region = self.region
        if not train_request.scale_tier:
            train_request.scale_tier = self.scale_tier
        if not train_request.worker_count:
            train_request.worker_count = self.worker_count
        if not train_request.ps_count:
            train_request.ps_count = self.ps_count
        if not train_request.worker_type:
            train_request.worker_type = self.worker_type
        if not train_request.ps_type:
            train_request.ps_type = self.ps_type
        if not train_request.master_type:
            train_request.master_type = self.master_type
        if not train_request.runtime_version:
            train_request.runtime_version = self.runtime_version

        requests = (
            pipeline | 'CreateRequest' >> beam.Create([train_request])
            | 'AugmentTrainingArgs' >> beam.ParDo(
                ml_func._AugmentTrainArgsDo(  # pylint: disable=protected-access
                    self.tf_main_spec),
                beam.pvalue.AsIter(train_files),
                beam.pvalue.AsIter(test_files),
                output_dir,
                beam.pvalue.AsSingleton(metadata_files)))

        train_results = requests | 'TrainModel' >> beam.ParDo(train_do)

        # Read and return the model directory and training results.
        model_directory = (
            train_results
            | 'CreateModel' >> beam.Map(self.tf_main_spec.read_model,
                                        output_dir, self.export_subdir))

        return model_directory, train_results
def run():
    # Command line arguments
    parser = argparse.ArgumentParser(
        description='Load from PubSub into BigQuery')
    parser.add_argument('--project',
                        required=True,
                        help='Specify Google Cloud project')
    parser.add_argument('--region',
                        required=True,
                        help='Specify Google Cloud region')
    parser.add_argument('--staging_location',
                        required=True,
                        help='Specify Cloud Storage bucket for staging')
    parser.add_argument('--temp_location',
                        required=True,
                        help='Specify Cloud Storage bucket for temp')
    parser.add_argument('--accum_mode',
                        required=True,
                        help='Accumulation mode for pipeline')

    opts, pipeline_args = parser.parse_known_args()

    options = PipelineOptions(pipeline_args, save_main_session=True)

    options.view_as(
        GoogleCloudOptions).job_name = f"{opts.accum_mode}-{time.time_ns()}"
    options.view_as(GoogleCloudOptions).project = opts.project
    options.view_as(GoogleCloudOptions).region = opts.region
    options.view_as(
        GoogleCloudOptions).staging_location = opts.staging_location
    options.view_as(GoogleCloudOptions).temp_location = opts.temp_location
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    table_schema = {
        "fields": [
            {
                "name": "taxi_events",
                "type": "INTEGER"
            },
            {
                "name": "timestamp",
                "type": "STRING"
            },
        ]
    }

    input_topic = "projects/pubsub-public-data/topics/taxirides-realtime"
    output_table = f"{opts.project}:dataflow_demos.{opts.accum_mode}"

    if opts.accum_mode == 'accumulating':
        accum_mode = beam.transforms.trigger.AccumulationMode.ACCUMULATING
    elif opts.accum_mode == 'discarding':
        accum_mode = beam.transforms.trigger.AccumulationMode.DISCARDING
    else:
        raise ValueError(
            "Invalid accumulation mode value. Use 'accumulating' or 'discarding'.")

    p = beam.Pipeline(options=options)

    (p | 'ReadFromPubSub' >> beam.io.ReadFromPubSub(input_topic)
     | 'ParseJson' >> beam.Map(parse_json).with_output_types(TaxiRide)
     | 'WindowByMinute' >> beam.WindowInto(
         beam.window.FixedWindows(60),
         trigger=AfterWatermark(early=AfterProcessingTime(10)),
         accumulation_mode=accum_mode)
     | "CountPerMinute" >> beam.CombineGlobally(
         CountCombineFn()).without_defaults()
     | "AddWindowTimestamp" >> beam.ParDo(GetTimestampFn())
     | 'WriteAggToBQ' >> beam.io.WriteToBigQuery(
         output_table,
         schema=table_schema,
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    logging.getLogger().setLevel(logging.INFO)
    logging.info("Building pipeline ...")

    p.run()
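
# GetTimestampFn, parse_json, TaxiRide and CountCombineFn are assumed to be
# defined elsewhere in this module. As a rough illustration only, GetTimestampFn
# could attach the window end time to each per-window count so that the emitted
# row matches the table_schema above:
class GetTimestampFnSketch(beam.DoFn):
    """Hypothetical stand-in: emits one BigQuery row per windowed count."""
    def process(self, count, window=beam.DoFn.WindowParam):
        yield {
            'taxi_events': count,
            'timestamp': window.end.to_utc_datetime().strftime('%Y-%m-%dT%H:%M:%S'),
        }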
    'temp_location': BUCKET + '/temp',
    'staging_location': BUCKET + '/staging',
    'machine_type': 'n1-standard-8',
    'num_workers': 8
}
opts = beam.pipeline.PipelineOptions(flags=[], **options)

with beam.Pipeline('DataflowRunner', options=opts) as p:

    query_results = p | beam.io.Read(
        beam.io.BigQuerySource(
            query='select * from tweets.tweets_data as a '
                  'LEFT OUTER JOIN tweets.stock_companyname_lookup as b '
                  'on a.company_names = b.name'))

    # write PCollection to a log file
    query_results | 'Write to File 1' >> WriteToText('input_joined_table.txt')

    # apply a ParDo to the PCollection
    out_pcoll = (
        query_results
        | 'Filter tweets by source handles starting with character C' >>
        beam.ParDo(FilterSourceFn()))

    # write PCollection to a log file
    out_pcoll | 'Write to File 2' >> WriteToText('output_joined_table_source_like_c.txt')

    qualified_table_name = 'avian-force-216105:beam_dataset.LeftJoin_Milestone8_Cluster'
    table_schema = 'a_id:INTEGER,a_text:STRING,a_timestamp:STRING,a_source:STRING,a_symbols:STRING,a_company_names:STRING,a_url:STRING,a_verified:BOOLEAN,b_ticker:STRING,b_name:STRING'

    # Existing schema (all columns NULLABLE):
    #   id             INTEGER
    #   text           STRING
    #   timestamp      STRING
    #   source         STRING
    #   symbols        STRING
    #   company_names  STRING
    #   url            STRING
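
# FilterSourceFn is assumed to be defined elsewhere. A hypothetical sketch that
# matches the label used above (the field name 'a_source' is taken from the
# output schema defined earlier, not from the original code) passes through only
# those rows whose source handle starts with the letter 'c':
class FilterSourceFnSketch(beam.DoFn):
    def process(self, row):
        source = row.get('a_source') or ''
        if source.lower().startswith('c'):
            yield row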
def make_beam_pipeline(root,
                       input_filenames,
                       sample_rate,
                       debug,
                       embedding_names,
                       embedding_modules,
                       module_output_keys,
                       audio_key,
                       sample_rate_key,
                       label_key,
                       speaker_id_key,
                       average_over_time,
                       delete_audio_from_output,
                       output_filename,
                       input_format='tfrecord',
                       output_format='tfrecord',
                       suffix='Main'):
    """Construct beam pipeline for mapping from audio to embeddings.

  Args:
    root: The beam root node.
    input_filenames: Python list. List of input files.
    sample_rate: Python int, or `None`. The sample rate for all embeddings,
      or `None` if this is a TFDS dataset, or if each example has its own sample
      rate.
    debug: Python bool. Whether to operate in debug mode.
    embedding_names: Python list of embeddings.
    embedding_modules: Python list of TF-Hub modules.
    module_output_keys: Python list of strings, the output key to read from
      each module.
    audio_key: Python string, the key of the audio.
    sample_rate_key: Python string or `None`, the key for the sample rate.
    label_key: Python string. Field for label.
    speaker_id_key: Python string or `None`. Key for speaker ID, or `None`.
    average_over_time: Python bool. If `True`, average over the time axis.
    delete_audio_from_output: Python bool. Whether to remove audio from
      outputs.
    output_filename: Python string. Output filename.
    input_format: Python string. Must correspond to a function in
      `reader_functions`.
    output_format: Python string. Must correspond to a function in
      `writer_functions`.
    suffix: Python string. Suffix to stage names to make them unique.
  """
    tf_examples_key_ = 'tf_examples'
    assert tf_examples_key_ not in embedding_names
    s = suffix  # for code brevity.

    # Read from input.
    input_examples = reader_functions[input_format](root, input_filenames, s)

    # In debug mode, take one input example.
    if debug:
        input_examples = (
            input_examples
            | f'TakeOne{s}' >>
            beam.transforms.combiners.Sample.FixedSizeGlobally(1)
            # Sampling generates lists, so flatten back into one collection.
            | f'DebugFlatten{s}' >> beam.FlatMap(lambda x: x))

    # Compute all the embeddings simultaneously.
    embedding_tables = {}
    for name, mod, out_key in zip(embedding_names, embedding_modules,
                                  module_output_keys):
        logging.info('Adding signal: %s %s, %s', name, mod, out_key)
        tbl = input_examples | f'ComputeEmbedding-{name}-{s}' >> beam.ParDo(
            ComputeEmbeddingMapFn(name=name,
                                  module=mod,
                                  output_key=out_key,
                                  audio_key=audio_key,
                                  sample_rate_key=sample_rate_key,
                                  sample_rate=sample_rate,
                                  average_over_time=average_over_time))
        embedding_tables[name] = tbl
    assert tf_examples_key_ not in embedding_tables
    embedding_tables[tf_examples_key_] = input_examples
    logging.info('embedding_tables: %s', embedding_tables)

    # Combine embeddings and tf.train.Example, using the common key.
    combined_tbl = (embedding_tables
                    | f'CombineEmbeddingTables-{s}' >> beam.CoGroupByKey()
                    | f'AddEmbeddings-{s}' >> beam.Map(
                        _add_embedding_column_map_fn,
                        original_example_key=tf_examples_key_,
                        delete_audio_from_output=delete_audio_from_output,
                        audio_key=audio_key,
                        label_key=label_key,
                        speaker_id_key=speaker_id_key))

    output_filename = f'{output_filename}@*'
    logging.info('Writing to %s', output_filename)
    writer_functions[output_format](combined_tbl, output_filename, s)
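
# A minimal usage sketch under assumed values; the bucket paths, module URL and
# key names below are placeholders, not taken from the original pipeline.
def _example_run():
    with beam.Pipeline() as root:
        make_beam_pipeline(
            root,
            input_filenames=['gs://my-bucket/audio/train-*.tfrecord'],
            sample_rate=16000,
            debug=False,
            embedding_names=['my_embedding'],
            embedding_modules=['https://tfhub.dev/some/module/1'],
            module_output_keys=['embedding'],
            audio_key='audio',
            sample_rate_key=None,
            label_key='label',
            speaker_id_key=None,
            average_over_time=True,
            delete_audio_from_output=True,
            output_filename='gs://my-bucket/output/embeddings')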
Example #25
0
 def expand(self, dataset):
     return (dataset
             | 'DetectAnomaliesInExamples' >> beam.Map(
                 _detect_anomalies_in_example, options=self.options)
             | 'GenerateAnomalyReasonKeys' >> beam.ParDo(
                 _GenerateAnomalyReasonSliceKeys()))
Example #26
0
def FilterOutSlices(  # pylint: disable=invalid-name
        values: beam.pvalue.PCollection, slices_count: beam.pvalue.PCollection,
        k_anonymization_count: int,
        error_metric_key: Text) -> beam.pvalue.PCollection:
    """Filter out slices with examples count lower than k_anonymization_count.

  Since small slices may be filtered out to preserve privacy, filtered-out
  slice keys are still emitted, but with empty data and a debug message
  explaining the omission, so that end users are aware of it.

  Args:
    values: PCollection of aggregated data keyed at slice_key
    slices_count: PCollection of slice keys and their example count.
    k_anonymization_count: If the number of examples in a specific slice is less
      than k_anonymization_count, then an error will be returned for that slice.
      This will be useful to ensure privacy by not displaying the aggregated
      data for smaller number of examples.
    error_metric_key: The special metric key to indicate errors.

  Returns:
    A PCollection keyed by every possible slice_key, containing the aggregated
    data for slice keys whose example count is at least k_anonymization_count,
    and an error message for the filtered-out slices.
  """
    class FilterOutSmallSlicesDoFn(beam.DoFn):
        """DoFn to filter out small slices."""
        def __init__(self, error_metric_key: Text):
            self.error_metric_key = error_metric_key

        def process(
            self, element: Tuple[SliceKeyType, Dict[Text, Any]]
        ) -> Generator[Tuple[SliceKeyType, Dict[Text, Any]], None, None]:
            """Filter out small slices.

      For slices (excluding overall slice) with examples count lower than
      k_anonymization_count, it adds an error message.

      Args:
        element: Tuple containing slice key and a dictionary containing
          corresponding elements from merged pcollections.

      Yields:
        PCollection of (slice_key, aggregated_data or error message)
      """
            (slice_key, value) = element
            if value['values']:
                if (not slice_key
                        or value['slices_count'][0] >= k_anonymization_count):
                    yield (slice_key, value['values'][0])
                else:
                    yield (slice_key, {
                        self.error_metric_key:
                        'Example count for this slice key is lower than '
                        'the minimum required value: %d. No data is aggregated for '
                        'this slice.' % k_anonymization_count
                    })

    return ({
        'values': values,
        'slices_count': slices_count
    }
            | 'CoGroupingSlicesCountAndAggregatedData' >> beam.CoGroupByKey()
            | 'FilterOutSmallSlices' >> beam.ParDo(
                FilterOutSmallSlicesDoFn(error_metric_key)))
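
# Illustrative wiring only; the slice keys, counts and metric values below are
# made up. `values` holds (slice_key, metrics) pairs and `slices_count` holds
# (slice_key, example_count) pairs; slices below k_anonymization_count come back
# with the error metric instead of their data.
def _example_filter(p):
    values = p | 'Values' >> beam.Create([
        ((), {'accuracy': 0.91}),
        ((('country', 'CA'),), {'accuracy': 0.70}),
    ])
    slices_count = p | 'Counts' >> beam.Create([
        ((), 1000),
        ((('country', 'CA'),), 3),
    ])
    return FilterOutSlices(values, slices_count,
                           k_anonymization_count=50,
                           error_metric_key='__ERROR__')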
Example #27
0
def ComputePerSliceMetrics(  # pylint: disable=invalid-name
    slice_result: beam.pvalue.PCollection,
    eval_shared_model: types.EvalSharedModel,
    desired_batch_size: Optional[int] = None,
    num_bootstrap_samples: Optional[int] = 1,
    random_seed: Optional[int] = None,
) -> beam.pvalue.PCollection:
    """PTransform for computing, aggregating and combining metrics.

  Args:
    slice_result: Incoming PCollection consisting of slice key and extracts.
    eval_shared_model: Shared model parameters for EvalSavedModel.
    desired_batch_size: Optional batch size for batching in Aggregate.
    num_bootstrap_samples: Number of replicas to use in calculating uncertainty
      using bootstrapping.  If 1 is provided (default), aggregate metrics will
      be calculated with no uncertainty. If num_bootstrap_samples is > 1,
      multiple samples of each slice will be calculated using the Poisson
      bootstrap method. To calculate standard errors, num_bootstrap_samples
      should be 20 or more in order to provide useful data. More is better, but
      you pay a performance cost.
    random_seed: Seed to use for testing, because nondeterministic tests stink.

  Returns:
    DoOutputsTuple. The tuple entries are
    PCollection of (slice key, metrics) and
    PCollection of (slice key, plot metrics).
  """
    # TODO(ckuhn): Remove this workaround per discussions in CL/227944001
    slice_result.element_type = beam.typehints.Any

    compute_with_sampling = False
    if not num_bootstrap_samples:
        num_bootstrap_samples = 1
    if num_bootstrap_samples < 1:
        raise ValueError('num_bootstrap_samples should be > 0, got %d' %
                         num_bootstrap_samples)

    if num_bootstrap_samples > 1:
        slice_result_sampled = slice_result | 'FanoutBootstrap' >> beam.ParDo(
            _FanoutBootstrapFn(num_bootstrap_samples))
        compute_with_sampling = True

    output_results = (
        slice_result
        | 'CombinePerSlice' >> beam.CombinePerKey(
            _AggregateCombineFn(eval_shared_model=eval_shared_model,
                                desired_batch_size=desired_batch_size,
                                compute_with_sampling=False))
        | 'InterpretOutput' >> beam.ParDo(
            _ExtractOutputDoFn(eval_shared_model=eval_shared_model)))
    if compute_with_sampling:
        output_results = (
            slice_result_sampled
            | 'CombinePerSliceWithSamples' >> beam.CombinePerKey(
                _AggregateCombineFn(eval_shared_model=eval_shared_model,
                                    desired_batch_size=desired_batch_size,
                                    compute_with_sampling=True,
                                    seed_for_testing=random_seed))
            | 'InterpretSampledOutput' >> beam.ParDo(
                _ExtractOutputDoFn(eval_shared_model=eval_shared_model))
            | beam.GroupByKey()
            | beam.ParDo(_MergeBootstrap(),
                         beam.pvalue.AsIter(output_results)))
    # Separate metrics and plots.
    return (output_results
            | beam.ParDo(_SeparateMetricsAndPlotsFn()).with_outputs(
                _SeparateMetricsAndPlotsFn.OUTPUT_TAG_PLOTS,
                main=_SeparateMetricsAndPlotsFn.OUTPUT_TAG_METRICS))
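
# Rough sketch of the Poisson bootstrap fan-out idea only (a hypothetical
# stand-in, not the library's _FanoutBootstrapFn): each element is replicated
# into each bootstrap sample a Poisson(1)-distributed number of times, keyed so
# that samples can be aggregated separately.
import numpy as np

class _PoissonFanoutSketch(beam.DoFn):
    def __init__(self, num_bootstrap_samples):
        self._num_samples = num_bootstrap_samples

    def process(self, element):
        slice_key, extracts = element
        for sample_id in range(self._num_samples):
            # Weight the element by an independent Poisson(1) draw per sample.
            for _ in range(np.random.poisson(1)):
                yield ((slice_key, sample_id), extracts)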
Example #28
0
  def test_streaming_wordcount(self):
    class WordExtractingDoFn(beam.DoFn):
      def process(self, element):
        text_line = element.strip()
        words = text_line.split()
        return words

    # Add the TestStream so that it can be cached.
    ib.options.capturable_sources.add(TestStream)

    p = beam.Pipeline(
        runner=interactive_runner.InteractiveRunner(),
        options=StandardOptions(streaming=True))

    data = (
        p
        | TestStream()
            .advance_watermark_to(0)
            .advance_processing_time(1)
            .add_elements(['to', 'be', 'or', 'not', 'to', 'be'])
            .advance_watermark_to(20)
            .advance_processing_time(1)
            .add_elements(['that', 'is', 'the', 'question'])
        | beam.WindowInto(beam.window.FixedWindows(10))) # yapf: disable

    counts = (
        data
        | 'split' >> beam.ParDo(WordExtractingDoFn())
        | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
        | 'group' >> beam.GroupByKey()
        | 'count' >> beam.Map(lambda wordones: (wordones[0], sum(wordones[1]))))

    # Watch the local scope for Interactive Beam so that referenced PCollections
    # will be cached.
    ib.watch(locals())

    # This is normally done in the interactive_utils when a transform is
    # applied but needs an IPython environment. So we manually run this here.
    ie.current_env().track_user_pipelines()

    # Create a fake limiter that cancels the background caching job (BCJ) once
    # the main job receives the expected number of results.
    class FakeLimiter:
      def __init__(self, p, pcoll):
        self.p = p
        self.pcoll = pcoll

      def is_triggered(self):
        result = ie.current_env().pipeline_result(self.p)
        if result:
          try:
            results = result.get(self.pcoll)
          except ValueError:
            return False
          return len(results) >= 10
        return False

    # This sets the limiters to stop reading when the test receives 10 elements.
    ie.current_env().options.capture_control.set_limiters_for_test(
        [FakeLimiter(p, data)])

    # This tests that the data was correctly cached.
    pane_info = PaneInfo(True, True, PaneInfoTiming.UNKNOWN, 0, 0)
    expected_data_df = pd.DataFrame([
        ('to', 0, [IntervalWindow(0, 10)], pane_info),
        ('be', 0, [IntervalWindow(0, 10)], pane_info),
        ('or', 0, [IntervalWindow(0, 10)], pane_info),
        ('not', 0, [IntervalWindow(0, 10)], pane_info),
        ('to', 0, [IntervalWindow(0, 10)], pane_info),
        ('be', 0, [IntervalWindow(0, 10)], pane_info),
        ('that', 20000000, [IntervalWindow(20, 30)], pane_info),
        ('is', 20000000, [IntervalWindow(20, 30)], pane_info),
        ('the', 20000000, [IntervalWindow(20, 30)], pane_info),
        ('question', 20000000, [IntervalWindow(20, 30)], pane_info)
    ], columns=[0, 'event_time', 'windows', 'pane_info']) # yapf: disable

    data_df = ib.collect(data, include_window_info=True)
    pd.testing.assert_frame_equal(expected_data_df, data_df)

    # This tests that the windowing was passed correctly so that all the data
    # is aggregated also correctly.
    pane_info = PaneInfo(True, False, PaneInfoTiming.ON_TIME, 0, 0)
    expected_counts_df = pd.DataFrame([
        ('be', 2, 9999999, [IntervalWindow(0, 10)], pane_info),
        ('not', 1, 9999999, [IntervalWindow(0, 10)], pane_info),
        ('or', 1, 9999999, [IntervalWindow(0, 10)], pane_info),
        ('to', 2, 9999999, [IntervalWindow(0, 10)], pane_info),
        ('is', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
        ('question', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
        ('that', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
        ('the', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
    ], columns=[0, 1, 'event_time', 'windows', 'pane_info']) # yapf: disable

    counts_df = ib.collect(counts, include_window_info=True)

    # The group by key has no guarantee of order. So we post-process the DF by
    # sorting so we can test equality.
    sorted_counts_df = (counts_df
                        .sort_values(['event_time', 0], ascending=True)
                        .reset_index(drop=True)) # yapf: disable
    pd.testing.assert_frame_equal(expected_counts_df, sorted_counts_df)
Example #29
0
}
opts = beam.pipeline.PipelineOptions(flags=[], **options)

with beam.Pipeline('DataflowRunner', options=opts) as p:

    query_results = p | beam.io.Read(
        beam.io.BigQuerySource(
            query='SELECT Year,State,Rate FROM Unemployment.unemployment_rate')
    )

    # write PCollection to a log file
    query_results | 'Write to File 1' >> WriteToText(DIR_PATH +
                                                     'unemployment_query.txt')

    # Apply a ParDo to the PCollection
    state_pcoll = query_results | 'Create State abb' >> beam.ParDo(StateName())

    # write PCollection to a file
    state_pcoll | 'Write to File 2' >> WriteToText(DIR_PATH +
                                                   'output_unemployment.txt')

    qualified_takes_table_name = 'han97jiayan:Unemployment.unemployment_transform_cluster'
    takes_table_schema = 'Year:INTEGER,State:STRING,Rate:FLOAT'

    state_pcoll | 'Write Takes to BigQuery' >> beam.io.Write(
        beam.io.BigQuerySink(
            qualified_takes_table_name,
            schema=takes_table_schema,
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))
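
# StateName is assumed to be defined elsewhere. A hypothetical sketch matching
# the 'Create State abb' label and the Year/State/Rate schema above would swap
# the full state name for its two-letter abbreviation (mapping truncated here):
class StateNameSketch(beam.DoFn):
    _ABBREV = {'California': 'CA', 'New York': 'NY', 'Texas': 'TX'}

    def process(self, row):
        yield {
            'Year': row['Year'],
            'State': self._ABBREV.get(row['State'], row['State']),
            'Rate': row['Rate'],
        }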
Example #30
0
    def expand(self, pcoll):
        p = pcoll.pipeline
        try:
            step_name = self.label
        except AttributeError:
            step_name = 'BigQueryBatchFileLoads_%d' % BigQueryBatchFileLoads.COUNT
            BigQueryBatchFileLoads.COUNT += 1

        temp_location = p.options.view_as(GoogleCloudOptions).temp_location
        job_name = (p.options.view_as(GoogleCloudOptions).job_name
                    or 'AUTOMATIC_JOB_NAME')

        empty_pc = p | "ImpulseEmptyPC" >> beam.Create([])
        singleton_pc = p | "ImpulseSingleElementPC" >> beam.Create([None])

        load_job_name_pcv = pvalue.AsSingleton(
            singleton_pc
            | "LoadJobNamePrefix" >> beam.Map(lambda _: _generate_job_name(
                job_name, bigquery_tools.BigQueryJobTypes.LOAD, 'LOAD_STEP')))

        schema_mod_job_name_pcv = pvalue.AsSingleton(
            singleton_pc
            | "SchemaModJobNamePrefix" >> beam.Map(lambda _: _generate_job_name(
                job_name, bigquery_tools.BigQueryJobTypes.LOAD,
                'SCHEMA_MOD_STEP')))

        copy_job_name_pcv = pvalue.AsSingleton(
            singleton_pc
            | "CopyJobNamePrefix" >> beam.Map(lambda _: _generate_job_name(
                job_name, bigquery_tools.BigQueryJobTypes.COPY, 'COPY_STEP')))

        file_prefix_pcv = pvalue.AsSingleton(
            singleton_pc
            | "GenerateFilePrefix" >> beam.Map(
                file_prefix_generator(self._validate,
                                      self._custom_gcs_temp_location,
                                      temp_location)))

        destination_data_kv_pc = (
            pcoll
            | "RewindowIntoGlobal" >> self._window_fn()
            | "AppendDestination" >> beam.ParDo(
                bigquery_tools.AppendDestinationsFn(self.destination), *
                self.table_side_inputs))

        if not self.with_auto_sharding:
            all_destination_file_pairs_pc = self._write_files(
                destination_data_kv_pc, file_prefix_pcv)
        else:
            all_destination_file_pairs_pc = self._write_files_with_auto_sharding(
                destination_data_kv_pc, file_prefix_pcv)

        grouped_files_pc = (
            all_destination_file_pairs_pc
            | "GroupFilesByTableDestinations" >> beam.GroupByKey())

        partitions = (
            grouped_files_pc
            | beam.ParDo(
                PartitionFiles(self.max_partition_size,
                               self.max_files_per_partition)).with_outputs(
                                   PartitionFiles.MULTIPLE_PARTITIONS_TAG,
                                   PartitionFiles.SINGLE_PARTITION_TAG))

        multiple_partitions_per_destination_pc = partitions[
            PartitionFiles.MULTIPLE_PARTITIONS_TAG]
        single_partition_per_destination_pc = partitions[
            PartitionFiles.SINGLE_PARTITION_TAG]

        # When using dynamic destinations, elements with both single as well as
        # multiple partitions are loaded into BigQuery using temporary tables to
        # ensure atomicity.
        if self.dynamic_destinations:
            all_partitions = ((multiple_partitions_per_destination_pc,
                               single_partition_per_destination_pc)
                              | "FlattenPartitions" >> beam.Flatten())
            destination_load_job_ids_pc, destination_copy_job_ids_pc = (
                self._load_data(all_partitions, empty_pc, load_job_name_pcv,
                                schema_mod_job_name_pcv, copy_job_name_pcv, p,
                                step_name))
        else:
            destination_load_job_ids_pc, destination_copy_job_ids_pc = (
                self._load_data(multiple_partitions_per_destination_pc,
                                single_partition_per_destination_pc,
                                load_job_name_pcv, schema_mod_job_name_pcv,
                                copy_job_name_pcv, p, step_name))

        return {
            self.DESTINATION_JOBID_PAIRS: destination_load_job_ids_pc,
            self.DESTINATION_FILE_PAIRS: all_destination_file_pairs_pc,
            self.DESTINATION_COPY_JOBID_PAIRS: destination_copy_job_ids_pc,
        }
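
# For context (not part of the transform above): the file-loads path is what a
# pipeline author typically reaches via WriteToBigQuery with method=FILE_LOADS.
# A minimal sketch, with a placeholder table spec and schema:
def _write_via_file_loads(rows):
    return rows | 'WriteViaFileLoads' >> beam.io.WriteToBigQuery(
        'my_project:my_dataset.my_table',
        schema='name:STRING,score:INTEGER',
        method=beam.io.WriteToBigQuery.Method.FILE_LOADS,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)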