Example #1
# expand() of a custom PTransform. `beam` and WriteToBigTable are
# imported at module level; self._generate() must yield DirectRow
# objects (sketched below).
def expand(self, pvalue):
    beam_options = self.beam_options
    return (pvalue
            | beam.Create(self._generate())
            | WriteToBigTable(beam_options['project_id'],
                              beam_options['instance_id'],
                              beam_options['table_id']))
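
WriteToBigTable consumes a PCollection of google.cloud.bigtable DirectRow objects, so `self._generate()` above must yield rows of that type. A minimal sketch of such a generator; the row keys, column family ('cf1') and qualifier ('value') are placeholders, not taken from the original snippet:

from google.cloud.bigtable import row

def _generate(self):
    # Hypothetical generator: one DirectRow per record, with an
    # assumed key scheme and column layout.
    for i in range(10):
        direct_row = row.DirectRow(row_key=f'key-{i}'.encode())
        direct_row.set_cell('cf1', b'value', str(i).encode())
        yield direct_row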
Example #2
    def delete_opt_out(self,
                       days,
                       max_num_workers=1,
                       dataflow_service_account=None):
        import apache_beam as beam
        from apache_beam.io.gcp.bigtableio import WriteToBigTable

        # Select the client_ids that issued a deletion request within
        # the trailing `days` window.
        sql = f"""
        select distinct client_id
        from `moz-fx-data-shared-prod.telemetry.deletion_request`
        where date(submission_timestamp) >= DATE_SUB(DATE '{self.ISODATE_DASH}', INTERVAL {days} DAY)
              and date(submission_timestamp) <= '{self.ISODATE_DASH}'
        """

        options = get_dataflow_options(
            max_num_workers, self.GCP_PROJECT,
            f"taar-profile-delete-{self.ISODATE_NODASH}", self.GCS_BUCKET,
            self.SUBNETWORK, dataflow_service_account)

        with beam.Pipeline(options=options) as p:
            (p
             | "Read from BigQuery" >> beam.io.ReadFromBigQuery(
                 query=sql, use_standard_sql=True)
             | "Collect rows" >> beam.Map(delete_bigtable_rows)
             | "Delete in Cloud BigTable" >> WriteToBigTable(
                 project_id=self.GCP_PROJECT,
                 instance_id=self.BIGTABLE_INSTANCE_ID,
                 table_id=self.BIGTABLE_TABLE_ID,
             ))
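
`delete_bigtable_rows` is defined elsewhere in this module. WriteToBigTable simply applies whatever mutations each DirectRow carries, so a deletion can be expressed as a row whose only mutation is `delete()`. A sketch, assuming each BigQuery result is a dict whose `client_id` serves as the row key:

from google.cloud.bigtable import row

def delete_bigtable_rows(bq_row):
    # Sketch: turn one BigQuery result into a DirectRow whose sole
    # mutation deletes the entire row keyed by client_id.
    direct_row = row.DirectRow(row_key=bq_row['client_id'].encode())
    direct_row.delete()
    return direct_row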
Example #3
import apache_beam as beam
from apache_beam.io.gcp.bigtableio import WriteToBigTable

# BigtableOptions and CreateRowFn are defined elsewhere (sketched below).

def run(argv=None):
    """Build and run the pipeline."""
    options = BigtableOptions(argv)
    with beam.Pipeline(options=options) as p:
        (p
         | beam.Create(["phone#4c410523#20190501",
                        "phone#4c410523#20190502"])
         | beam.ParDo(CreateRowFn())
         | WriteToBigTable(project_id=options.bigtable_project,
                           instance_id=options.bigtable_instance,
                           table_id=options.bigtable_table))
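
A sketch of the two helpers this sample relies on, using Beam's standard custom-options hook (`_add_argparse_args`); the column family and qualifier in CreateRowFn are assumptions for illustration:

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from google.cloud.bigtable import row

class BigtableOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_argument('--bigtable_project', required=True)
        parser.add_argument('--bigtable_instance', required=True)
        parser.add_argument('--bigtable_table', required=True)

class CreateRowFn(beam.DoFn):
    def process(self, key):
        # Sketch: write one cell per key; family and qualifier are
        # placeholders, not confirmed by the snippet.
        direct_row = row.DirectRow(row_key=key.encode())
        direct_row.set_cell('stats_summary', b'os_build', b'android')
        yield direct_row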
Example #4
    def load_bigtable(self, max_num_workers=1, dataflow_service_account=None):
        import apache_beam as beam
        from apache_beam.io.gcp.bigtableio import WriteToBigTable

        self.create_table_in_bigtable()

        options = get_dataflow_options(
            max_num_workers, self.GCP_PROJECT,
            f"taar-profile-load-{self.ISODATE_NODASH}", self.GCS_BUCKET,
            self.SUBNETWORK, dataflow_service_account)
        with beam.Pipeline(options=options) as p:
            (p
             | "Read" >> beam.io.ReadFromAvro(
                 gcs_avro_uri(self.GCS_BUCKET, self.ISODATE_NODASH),
                 use_fastavro=True,
             )
             | "Create BigTable Rows" >> beam.Map(create_bigtable_rows)
             | "Write Records to Cloud BigTable" >> WriteToBigTable(
                 project_id=self.GCP_PROJECT,
                 instance_id=self.BIGTABLE_INSTANCE_ID,
                 table_id=self.BIGTABLE_TABLE_ID,
             ))
        print("Export to BigTable is complete")
Example #5
import argparse
import os

import apache_beam as beam
from apache_beam.io.gcp.bigtableio import WriteToBigTable
from apache_beam.options.pipeline_options import PipelineOptions

# Recommend, CreateDirectRow, FindTopItems, WriteToJson, TS_SUFFIX and
# TIMESTAMP are defined elsewhere in the original module.

def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_path', type=str, required=True)
    parser.add_argument('--destination', type=str, required=True)
    parser.add_argument('--project_id', type=str, required=True)
    parser.add_argument('--instance_id', type=str, required=True)
    parser.add_argument('--table_id', type=str, required=True)
    parser.add_argument('--k', type=str, required=True)
    known_args, pipeline_args = parser.parse_known_args(argv)

    # File paths.
    data_path = known_args.input_path
    user_map_path = os.path.join(data_path, 'user_map.avro')

    pipeline_options = PipelineOptions(pipeline_args)

    with beam.Pipeline(options=pipeline_options) as p:
        # Recommend emits news scores on its main output and per-user
        # recommendations on a tagged output.
        news_score, users_recommendation = (
            p
            | 'Initialize' >> beam.io.ReadFromAvro(user_map_path)
            | 'Recommend' >> beam.ParDo(
                Recommend(data_path), k=known_args.k).with_outputs(
                    'users_recommendation', main='news_score'))

        (news_score
         | 'FindTopNews' >> FindTopItems()
         | 'Write' >> WriteToJson(
             file_path_prefix=known_args.destination,
             file_name_suffix='-{}.json'.format(TS_SUFFIX),
             shard_name_template=''))

        (users_recommendation
         | 'CreateDirectRow' >> beam.ParDo(CreateDirectRow(), TIMESTAMP)
         | 'WriteToBigTable' >> WriteToBigTable(
             project_id=known_args.project_id,
             instance_id=known_args.instance_id,
             table_id=known_args.table_id))
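
`CreateDirectRow` receives TIMESTAMP as an extra positional argument to `process`. A sketch, assuming each element is a `(user_id, recommendations)` pair and TIMESTAMP is a datetime; the column names are placeholders:

import apache_beam as beam
from google.cloud.bigtable import row

class CreateDirectRow(beam.DoFn):
    def process(self, element, timestamp):
        # Sketch: element shape and column layout are assumptions.
        user_id, recommendations = element
        direct_row = row.DirectRow(row_key=user_id.encode())
        direct_row.set_cell(
            'recs', b'items', ','.join(recommendations).encode(), timestamp)
        yield direct_row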