Example 1
    def testSideInputNotCopied(self):
        with beam.Pipeline() as p:
            side = (p
                    | 'CreateSide' >> beam.Create(['s1', 's2', 's3'])
                    | beam.Map(lambda x: DeepCopyTest._CountingIdentityFn(
                        'SideInput', x)))
            main = (
                p
                | 'CreateMain' >> beam.Create([1, 2, 3])
                | beam.Map(
                    lambda x: DeepCopyTest._CountingIdentityFn('Main', x))
                | beam.Map(lambda e, s: (e, list(s)), pvalue.AsList(side)))
            copied = deep_copy.deep_copy(main)

            # Check that deep copy was performed.
            self.assertIsNot(main, copied)
            self.assertIsNot(main.producer, copied.producer)

            # Check that deep copy stops at the side input materialization boundary.
            self.assertIs(main.producer.side_inputs[0],
                          copied.producer.side_inputs[0])
            self.assertIs(main.producer.side_inputs[0].pvalue, side)

        # Check counts of processed items.
        self.assertEqual(DeepCopyTest._counts['SideInput'], 3)
        self.assertEqual(DeepCopyTest._counts['Main'], 6)
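
The test above verifies that deep_copy duplicates the main subgraph but stops at the side-input boundary, so 'Main' is processed twice (6 counts) while 'SideInput' is processed only once (3 counts). A minimal, self-contained sketch of the same pvalue.AsList side-input pattern, runnable on its own with the DirectRunner (the names here are illustrative and not taken from the test), might look like this:

import apache_beam as beam
from apache_beam import pvalue

with beam.Pipeline() as p:
    side = p | 'CreateSide' >> beam.Create(['s1', 's2', 's3'])
    _ = (
        p
        | 'CreateMain' >> beam.Create([1, 2, 3])
        # AsList materializes the side PCollection as a Python list that is
        # passed as an extra argument to the lambda for every main element.
        | 'PairWithSide' >> beam.Map(
            lambda e, s: (e, sorted(s)), pvalue.AsList(side))
        | 'Print' >> beam.Map(print))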
Example 2
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--key_file',
                        dest='key_file',
                        required=True,
                        help='Path to service account credentials JSON.')
    parser.add_argument('--input',
                        dest='input',
                        required=True,
                        help='GCS input file to sign.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    credentials = (
        p
        | 'Read Credentials from GCS' >> ReadFromText(known_args.key_file))

    (p
     | 'Read File from GCS' >> beam.Create([known_args.input])
     | 'Sign File' >> beam.ParDo(SignFileFn(), pvalue.AsList(credentials)))

    result = p.run()
    result.wait_until_finish()
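
SignFileFn is not part of the excerpt above. As an assumption about its shape, a DoFn with a matching signature, where the element is the GCS path created by beam.Create and the second argument is the list of credential-file lines delivered by pvalue.AsList, could look roughly like this:

import apache_beam as beam

class SignFileFnSketch(beam.DoFn):
    # Hypothetical stand-in for SignFileFn; the real implementation is not
    # shown in the example.
    def process(self, gcs_path, credentials_lines):
        # credentials_lines contains every line of the service-account JSON
        # file read by ReadFromText, materialized via pvalue.AsList.
        key_json = ''.join(credentials_lines)
        # ... build a signer from key_json and sign gcs_path here ...
        yield gcs_path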
Example 3
def run():
    parser = argparse.ArgumentParser()
    _, pipeline_args = parser.parse_known_args()

    # google cloud parameters
    pipeline_options = PipelineOptions(pipeline_args)
    google_cloud_options = pipeline_options.view_as(GoogleCloudOptions)
    if not google_cloud_options.project:
        raise ValueError("Project is a required input.")
    logging.info("Using project: %s", google_cloud_options.project)
    if not google_cloud_options.job_name:
        google_cloud_options.job_name = 'name-counts-{}'.format(
            dt.datetime.utcnow().strftime('%Y%m%d-%H%M%S'))
    logging.info("Dataflow job name: %s", google_cloud_options.job_name)

    # parameters used in the pipeline
    name_count_options = pipeline_options.view_as(NameCountOptions)
    gcs_input_path = name_count_options.input_path
    gcs_output_path_first_names = name_count_options.output_path_template.format(
        'first_names')
    gcs_output_path_last_names = name_count_options.output_path_template.format(
        'last_names')
    logging.info("Using input path: %s", gcs_input_path)
    logging.info("Using output path for first names: %s",
                 gcs_output_path_first_names)
    logging.info("Using output path for last names: %s",
                 gcs_output_path_last_names)

    p = beam.Pipeline(options=pipeline_options)

    # Setup pipeline
    pipeline_startup = (
        p
        | 'Create File Glob' >> Create([gcs_input_path])
        | 'Read All Files' >> beam.io.ReadAllFromText()
        | 'Split Into Names' >> beam.Map(lambda string: string.split(',')))

    filter_letters = (
        p
        | 'Letters Side Input' >> Create(['A', 'B', 'C', 'X', 'Y', 'Z']))

    names_filtered = (pipeline_startup
                      | 'Filter Names' >> beam.ParDo(
                          SplitAndFilterNames(),
                          filter_letters=pvalue.AsList(filter_letters)))

    first_names = (names_filtered
                   | 'Get First Names' >> beam.ParDo(GetFirstName())
                   | 'Group By First Name' >> beam.CombinePerKey(max)
                   | 'Store First Name Result' >> beam.io.WriteToText(
                       gcs_output_path_first_names, num_shards=1))

    last_names = (names_filtered
                  | 'Get Last Names' >> beam.ParDo(GetLastName())
                  | 'Group By Last Name' >> beam.CombinePerKey(min)
                  | 'Store Last Name Result' >> beam.io.WriteToText(
                      gcs_output_path_last_names, num_shards=1))

    result = p.run()
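
SplitAndFilterNames, GetFirstName, GetLastName and NameCountOptions are defined elsewhere and are not included in this excerpt. A hypothetical sketch of a filter DoFn that consumes the filter_letters side input delivered via pvalue.AsList (the exact filtering rule is an assumption) might look like this:

import apache_beam as beam

class SplitAndFilterNamesSketch(beam.DoFn):
    # Hypothetical stand-in for SplitAndFilterNames.
    def process(self, name_fields, filter_letters):
        # filter_letters is the full list ['A', 'B', 'C', 'X', 'Y', 'Z']
        # delivered via pvalue.AsList; keep only rows whose first field
        # starts with one of those letters.
        if name_fields and name_fields[0][:1].upper() in filter_letters:
            yield name_fields
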
Example 4
    def _load_data(self, partitions_using_temp_tables,
                   partitions_direct_to_destination, load_job_name_pcv,
                   schema_mod_job_name_pcv, copy_job_name_pcv, p, step_name):
        """Load data to BigQuery

    Data is loaded into BigQuery in the following two ways:
      1. Single partition:
         When there is a single partition of files destined to a single
         destination, a single load job is triggered.
      2. Multiple partitions and/or Dynamic Destinations:
         When there are multiple partitions of files destined for a single
         destination or when Dynamic Destinations are used, multiple load jobs
         need to be triggered for each partition/destination. Load Jobs are
         triggered to temporary tables, and those are later copied to the actual
         appropriate destination table. This ensures atomicity when only some
         of the load jobs would fail but not other. If any of them fails, then
         copy jobs are not triggered.
    """
        # Load data using temp tables
        trigger_loads_outputs = (
            partitions_using_temp_tables
            | "TriggerLoadJobsWithTempTables" >> beam.ParDo(
                TriggerLoadJobs(
                    schema=self.schema,
                    write_disposition=self.write_disposition,
                    create_disposition=self.create_disposition,
                    test_client=self.test_client,
                    temporary_tables=True,
                    additional_bq_parameters=self.additional_bq_parameters,
                    source_format=self._temp_file_format,
                    step_name=step_name),
                load_job_name_pcv,
                *self.schema_side_inputs).with_outputs(
                    TriggerLoadJobs.TEMP_TABLES, main='main'))

        temp_tables_load_job_ids_pc = trigger_loads_outputs['main']
        temp_tables_pc = trigger_loads_outputs[TriggerLoadJobs.TEMP_TABLES]

        finished_temp_tables_load_jobs_pc = (
            p
            | "ImpulseMonitorLoadJobs" >> beam.Create([None])
            | "WaitForTempTableLoadJobs" >> beam.ParDo(
                WaitForBQJobs(self.test_client),
                pvalue.AsList(temp_tables_load_job_ids_pc)))

        schema_mod_job_ids_pc = (
            finished_temp_tables_load_jobs_pc
            | beam.ParDo(
                UpdateDestinationSchema(
                    write_disposition=self.write_disposition,
                    test_client=self.test_client,
                    additional_bq_parameters=self.additional_bq_parameters,
                    step_name=step_name), schema_mod_job_name_pcv))

        finished_schema_mod_jobs_pc = (
            p
            | "ImpulseMonitorSchemaModJobs" >> beam.Create([None])
            | "WaitForSchemaModJobs" >> beam.ParDo(
                WaitForBQJobs(self.test_client),
                pvalue.AsList(schema_mod_job_ids_pc)))

        destination_copy_job_ids_pc = (
            finished_temp_tables_load_jobs_pc
            | beam.ParDo(
                TriggerCopyJobs(create_disposition=self.create_disposition,
                                write_disposition=self.write_disposition,
                                test_client=self.test_client,
                                step_name=step_name), copy_job_name_pcv,
                pvalue.AsIter(finished_schema_mod_jobs_pc)))

        finished_copy_jobs_pc = (
            p
            | "ImpulseMonitorCopyJobs" >> beam.Create([None])
            | "WaitForCopyJobs" >> beam.ParDo(
                WaitForBQJobs(self.test_client),
                pvalue.AsList(destination_copy_job_ids_pc)))

        _ = (
            p
            | "RemoveTempTables/Impulse" >> beam.Create([None])
            | "RemoveTempTables/PassTables" >> beam.FlatMap(
                lambda _, unused_copy_jobs, deleting_tables: deleting_tables,
                pvalue.AsIter(finished_copy_jobs_pc),
                pvalue.AsIter(temp_tables_pc))
            | "RemoveTempTables/AddUselessValue" >> beam.Map(
                lambda x: (x, None))
            | "RemoveTempTables/DeduplicateTables" >> beam.GroupByKey()
            | "RemoveTempTables/GetTableNames" >> beam.Keys()
            | "RemoveTempTables/Delete" >> beam.ParDo(
                DeleteTablesFn(self.test_client)))

        # Load data directly to destination table
        destination_load_job_ids_pc = (
            partitions_direct_to_destination
            | "TriggerLoadJobsWithoutTempTables" >> beam.ParDo(
                TriggerLoadJobs(
                    schema=self.schema,
                    write_disposition=self.write_disposition,
                    create_disposition=self.create_disposition,
                    test_client=self.test_client,
                    temporary_tables=False,
                    additional_bq_parameters=self.additional_bq_parameters,
                    source_format=self._temp_file_format,
                    step_name=step_name),
                load_job_name_pcv,
                *self.schema_side_inputs))

        _ = (p
             | "ImpulseMonitorDestinationLoadJobs" >> beam.Create([None])
             | "WaitForDestinationLoadJobs" >> beam.ParDo(
                 WaitForBQJobs(self.test_client),
                 pvalue.AsList(destination_load_job_ids_pc)))

        destination_load_job_ids_pc = (
            (temp_tables_load_job_ids_pc, destination_load_job_ids_pc)
            | beam.Flatten())

        return destination_load_job_ids_pc, destination_copy_job_ids_pc
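
_load_data uses the same 'impulse plus AsList' barrier pattern several times: a single None element drives exactly one DoFn invocation, and that invocation receives the complete list of job ids as a side input, so it can block until every job has finished before anything downstream runs. A generic sketch of the pattern (WaitForJobsSketch and wait_for_jobs are illustrative names, not part of the Beam API) might look like this:

import apache_beam as beam
from apache_beam import pvalue

class WaitForJobsSketch(beam.DoFn):
    # Illustrative stand-in for WaitForBQJobs.
    def process(self, unused_impulse, job_ids):
        for job_id in job_ids:
            # ... poll the service until job_id reaches a terminal state ...
            pass
        # Emit the ids so downstream steps only run after the wait completes.
        yield from job_ids

def wait_for_jobs(p, job_ids_pc, label):
    return (
        p
        | label + '/Impulse' >> beam.Create([None])
        | label + '/Wait' >> beam.ParDo(
            WaitForJobsSketch(), pvalue.AsList(job_ids_pc)))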