import datetime

import apache_beam as beam
from apache_beam.io import WriteToText
from apache_beam.io.gcp.bigquery import ReadFromBigQuery, WriteToBigQuery
from apache_beam.options.pipeline_options import PipelineOptions


def run():
    PROJECT_ID = 'acquired-rarity-288205'
    BUCKET = 'gs://ykdb_beam_us'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    options = PipelineOptions(
        flags=None,
        runner='DataflowRunner',
        project=PROJECT_ID,
        job_name='employer',
        temp_location=BUCKET + '/temp',
        region='us-central1')
    p = beam.Pipeline(options=options)

    # ***************************** REMOVE DUPLICATES *****************************
    # FARM_FINGERPRINT over the JSON-serialized row yields a deterministic
    # surrogate key; the inner GROUP BY ... HAVING count = 1 keeps only rows
    # that occur exactly once.
    sql = (
        "SELECT FARM_FINGERPRINT(TO_JSON_STRING(t)) AS employer_id, * FROM "
        "(SELECT employer_name, employer_address, employer_city, "
        "employer_state, employer_postal_code, employer_country, "
        "employer_province, h_1b_dependent, willful_violator FROM "
        "(SELECT *, COUNT(*) AS count FROM H_1B_refined.Employer "
        "GROUP BY employer_name, employer_address, employer_city, "
        "employer_state, employer_postal_code, employer_country, "
        "employer_province, h_1b_dependent, willful_violator "
        "HAVING count = 1)) AS t")
    bq_source = ReadFromBigQuery(
        query=sql, use_standard_sql=True, gcs_location=BUCKET)

    # ReadFromBigQuery is itself a PTransform, so it is applied directly
    # rather than wrapped in beam.io.Read (which expects a Source).
    query_results = p | 'Read from BQ' >> bq_source

    out_pcoll = query_results | 'Remove Dups Employer' >> beam.ParDo(
        NoDuplicates())
    out_pcoll | 'Log output' >> WriteToText(DIR_PATH + 'output_employer.txt')

    # ***************************** INSERT INTO BQ *****************************
    dataset_id = 'H_1B_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Employer_Dataflow'
    schema_id = ('employer_id:INTEGER, employer_name:STRING, '
                 'employer_address:STRING, employer_city:STRING, '
                 'employer_state:STRING, employer_postal_code:STRING, '
                 'employer_country:STRING, employer_province:STRING, '
                 'h_1b_dependent:BOOLEAN, willful_violator:BOOLEAN')
    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
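NoDuplicates is a project-local DoFn that is not shown here. Since the SQL above already restricts the result to rows that occur exactly once, a minimal sketch (hypothetical, not the original project's code) only needs to forward each row:

import apache_beam as beam


class NoDuplicates(beam.DoFn):
    # Hypothetical sketch: the dedup work is done in SQL, so this DoFn just
    # forwards each BigQuery row (a dict) unchanged.
    def process(self, element):
        yield element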
import apache_beam as beam
from apache_beam.io import WriteToText
from apache_beam.io.gcp.bigquery import ReadFromBigQuery, WriteToBigQuery
from apache_beam.options.pipeline_options import PipelineOptions


def run():
    # Set up the project and staging location.
    PROJECT_ID = 'trim-cistern-288221'
    BUCKET = 'gs://bhnk-milestone1-data'
    options = {'project': PROJECT_ID}
    opts = PipelineOptions(flags=[], **options)

    # Executed with the DirectRunner.
    p = beam.Pipeline('DirectRunner', options=opts)

    # Retrieve the data from the imdb_refined dataset; the BigQuery export is
    # staged under gcs_location.
    sql = 'SELECT * FROM imdb_refined.Primary_Professions LIMIT 250'
    bq_source = ReadFromBigQuery(
        query=sql, use_standard_sql=True, gcs_location=BUCKET)

    # Apply the ReadFromBigQuery transform directly (it is a PTransform, not
    # a Source); the query results become the input PCollection.
    query_results = p | 'Read from BQ' >> bq_source

    # Use ParDo to call a function on each element of the query results.
    out_pcoll = query_results | 'Split Primary Professions' >> beam.ParDo(
        SplitPrimaryProfessions())
    out_pcoll | 'Log output' >> WriteToText('output.txt')

    dataset_id = 'imdb_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Primary_Professions_Beam'
    schema_id = 'nconst:STRING,primaryProfession:STRING'

    # Write to BigQuery using the temp location set above.
    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    # Run and display results after everything is finished.
    result = p.run()
    result.wait_until_finish()
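SplitPrimaryProfessions is defined elsewhere in the project. Judging from the output schema (one nconst/primaryProfession pair per row), a plausible sketch looks like the following; the input column names are assumptions:

import apache_beam as beam


class SplitPrimaryProfessions(beam.DoFn):
    # Hypothetical sketch: IMDb stores primaryProfession as a comma-separated
    # list; emit one {nconst, primaryProfession} record per profession.
    def process(self, element):
        professions = (element.get('primaryProfession') or '').split(',')
        for profession in professions:
            if profession:
                yield {'nconst': element['nconst'],
                       'primaryProfession': profession.strip()}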
import apache_beam as beam
from apache_beam.io import WriteToText
from apache_beam.io.gcp.bigquery import ReadFromBigQuery, WriteToBigQuery
from apache_beam.options.pipeline_options import PipelineOptions


def run():
    PROJECT_ID = 'acquired-rarity-288205'
    BUCKET = 'gs://ykdb_beam/temp'
    options = {'project': PROJECT_ID}
    opts = PipelineOptions(flags=[], **options)
    p = beam.Pipeline('DirectRunner', options=opts)

    # ***************************** REMOVE DUPLICATES *****************************
    # Keep rows that occur exactly once, restricted to suspect records:
    # yearly wages above 5000 and soc_codes longer than five characters that
    # are missing the standard hyphen.
    sql = (
        "SELECT job_title, employer_name, employer_city, "
        "employment_start_date, employment_end_date, soc_code, soc_title, "
        "prevailing_wage_YR, pw_wage_level, pw_wage_source, "
        "pw_wage_source_year, pw_wage_source_other, worksite_city, "
        "worksite_country, worksite_state, worksite_postal_code "
        "FROM (SELECT *, COUNT(*) AS count FROM H_1B_refined.Occupation "
        "WHERE prevailing_wage_YR > 5000 AND length(soc_code) > 5 "
        "AND soc_code NOT LIKE '%-%' "
        "GROUP BY job_title, employer_name, employer_city, "
        "employment_start_date, employment_end_date, soc_code, soc_title, "
        "prevailing_wage_YR, pw_wage_level, pw_wage_source, "
        "pw_wage_source_year, pw_wage_source_other, worksite_city, "
        "worksite_country, worksite_state, worksite_postal_code "
        "HAVING count = 1) LIMIT 50")
    bq_source = ReadFromBigQuery(
        query=sql, use_standard_sql=True, gcs_location=BUCKET)
    query_results = p | 'Read from BQ' >> bq_source

    out_pcoll_no_dup = (
        query_results
        | 'Format prevailing_wage_YR and Remove dups' >> beam.ParDo(
            NoDuplicates()))
    out_pcoll_fix_date = out_pcoll_no_dup | 'Format Date' >> beam.ParDo(
        FormatDate())
    out_pcoll = out_pcoll_fix_date | 'Format Soc' >> beam.ParDo(
        FormatSocCode())
    out_pcoll | 'Log output' >> WriteToText('output_occ_codeTest.txt')

    # ***************************** INSERT INTO BQ *****************************
    dataset_id = 'H_1B_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Occ_CodeTest'
    schema_id = ('job_title:STRING, employer_name:STRING, '
                 'employer_city:STRING, employment_start_date:DATE, '
                 'employment_end_date:DATE, soc_code:STRING, '
                 'soc_title:STRING, prevailing_wage_YR:FLOAT, '
                 'pw_wage_level:STRING, pw_wage_source:STRING, '
                 'pw_wage_source_year:INTEGER, pw_wage_source_other:STRING, '
                 'worksite_city:STRING, worksite_country:STRING, '
                 'worksite_state:STRING, worksite_postal_code:STRING')
    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
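FormatDate and FormatSocCode are project-local DoFns not shown here. Given the DATE columns and the SQL filter that selects soc_codes missing their hyphen, a plausible sketch follows; the exact field handling is an assumption, not the original code:

import apache_beam as beam


class FormatDate(beam.DoFn):
    # Hypothetical sketch: normalize MM/DD/YYYY strings to the YYYY-MM-DD
    # form that the BigQuery DATE type expects.
    def process(self, element):
        for key in ('employment_start_date', 'employment_end_date'):
            value = element.get(key)
            if value and '/' in value:
                month, day, year = value.split('/')
                element[key] = f'{year}-{month:0>2}-{day:0>2}'
        yield element


class FormatSocCode(beam.DoFn):
    # Hypothetical sketch: reinsert the hyphen into codes like '151132' so
    # they match the standard SOC format '15-1132'.
    def process(self, element):
        code = element.get('soc_code', '')
        if '-' not in code and len(code) > 5:
            element['soc_code'] = code[:2] + '-' + code[2:]
        yield element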
import argparse

import apache_beam as beam
from apache_beam.io.gcp.bigquery import ReadFromBigQuery
from apache_beam.options.pipeline_options import GoogleCloudOptions, PipelineOptions
from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to


def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input_table', required=True, help='Input table to process.')
    parser.add_argument(
        '--num_records',
        required=True,
        type=int,
        help='The expected number of records')
    parser.add_argument(
        '--num_slow',
        default=0,
        help=('Percentage of rows that will be slow. '
              'Must be in the range [0, 100)'))
    parser.add_argument(
        '--beam_bq_source',
        default=False,
        type=bool,
        help=('Whether to use the new ReadFromBigQuery transform, '
              'or the BigQuerySource.'))
    known_args, pipeline_args = parser.parse_known_args(argv)
    options = PipelineOptions(pipeline_args)

    with TestPipeline(options=options) as p:
        if known_args.beam_bq_source:
            reader = ReadFromBigQuery(
                table='%s:%s' % (options.view_as(GoogleCloudOptions).project,
                                 known_args.input_table))
        else:
            reader = beam.io.Read(
                beam.io.BigQuerySource(known_args.input_table))

        # pylint: disable=expression-not-assigned
        count = (
            p
            | 'read' >> reader
            | 'row to string' >> beam.ParDo(
                RowToStringWithSlowDown(), num_slow=known_args.num_slow)
            | 'count' >> beam.combiners.Count.Globally())
        assert_that(count, equal_to([known_args.num_records]))
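RowToStringWithSlowDown is defined in the same performance-test module and is not shown. A plausible sketch of its behavior, assuming num_slow is interpreted as the percentage of rows to delay artificially:

import random
import time

import apache_beam as beam


class RowToStringWithSlowDown(beam.DoFn):
    # Hypothetical sketch: emit a constant marker per row, sleeping on a
    # num_slow-percent sample to simulate a slow stage. num_slow arrives as
    # a side kwarg from beam.ParDo above.
    def process(self, element, num_slow=0):
        if num_slow and random.random() * 100 < float(num_slow):
            time.sleep(0.01)
        yield 'row'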
import datetime

import apache_beam as beam
from apache_beam.io import WriteToText
from apache_beam.io.gcp.bigquery import ReadFromBigQuery, WriteToBigQuery
from apache_beam.options.pipeline_options import PipelineOptions


def run():
    PROJECT_ID = 'trim-cistern-288221'
    BUCKET = 'gs://bhnk-milestone1-data'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # Use the DataflowRunner instead of the DirectRunner.
    options = PipelineOptions(
        flags=None,
        runner='DataflowRunner',
        project=PROJECT_ID,
        job_name='imdbwriters',
        temp_location=BUCKET + '/temp',
        region='us-central1')
    p = beam.Pipeline(options=options)

    sql = 'SELECT * FROM imdb_refined.Writers'
    bq_source = ReadFromBigQuery(
        query=sql, use_standard_sql=True, gcs_location=BUCKET)
    query_results = p | 'Read from BQ' >> bq_source

    # SplitWriters is a project-local DoFn analogous to
    # SplitPrimaryProfessions in the earlier example.
    out_pcoll = query_results | 'Split Writers' >> beam.ParDo(SplitWriters())
    out_pcoll | 'Log output' >> WriteToText(DIR_PATH + 'output.txt')

    dataset_id = 'imdb_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Writers_Dataflow'
    schema_id = 'tconst:STRING,writers:STRING'
    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
import apache_beam as beam
import tensorflow_model_analysis as tfma
from apache_beam.io.gcp.bigquery import ReadFromBigQuery
from apache_beam.metrics.metric import MetricsFilter

# taxi, MeasureTime, MetricsReader and MapEvalToPCollection are helpers
# defined elsewhere in this project.


def process_tfma(schema_file,
                 big_query_table=None,
                 eval_model_dir=None,
                 max_eval_rows=None,
                 pipeline_args=None,
                 publish_to_bq=False,
                 project=None,
                 metrics_table=None,
                 metrics_dataset=None):
    """Runs a batch job to evaluate the eval_model against the given input.

    Args:
      schema_file: A file containing a text-serialized Schema that describes
        the eval data.
      big_query_table: A BigQuery table name specified as DATASET.TABLE which
        should be the input for evaluation. This can only be set if input_csv
        is None.
      eval_model_dir: A directory where the eval model is located.
      max_eval_rows: Number of rows to query from BigQuery.
      pipeline_args: additional DataflowRunner or DirectRunner args passed to
        the beam pipeline.
      publish_to_bq: Whether to publish pipeline metrics to BigQuery.
      project: GCP project in which the metrics dataset lives.
      metrics_dataset: BigQuery dataset for the published metrics.
      metrics_table: BigQuery table for the published metrics.

    Raises:
      ValueError: if input_csv and big_query_table are not specified
        correctly.
    """
    if big_query_table is None:
        raise ValueError('--big_query_table should be provided.')

    slice_spec = [
        tfma.slicer.SingleSliceSpec(),
        tfma.slicer.SingleSliceSpec(columns=['trip_start_hour'])
    ]
    metrics_namespace = metrics_table

    schema = taxi.read_schema(schema_file)

    eval_shared_model = tfma.default_eval_shared_model(
        eval_saved_model_path=eval_model_dir,
        add_metrics_callbacks=[
            tfma.post_export_metrics.calibration_plot_and_prediction_histogram(),
            tfma.post_export_metrics.auc_plots()
        ])

    metrics_monitor = None
    if publish_to_bq:
        metrics_monitor = MetricsReader(
            publish_to_bq=publish_to_bq,
            project_name=project,
            bq_table=metrics_table,
            bq_dataset=metrics_dataset,
            filters=MetricsFilter().with_namespace(metrics_namespace))

    pipeline = beam.Pipeline(argv=pipeline_args)

    query = taxi.make_sql(big_query_table, max_eval_rows, for_eval=True)
    raw_feature_spec = taxi.get_raw_feature_spec(schema)
    raw_data = (
        pipeline
        | 'ReadBigQuery' >> ReadFromBigQuery(
            query=query, project=project, use_standard_sql=True)
        | 'Measure time: Start' >> beam.ParDo(MeasureTime(metrics_namespace))
        | 'CleanData' >> beam.Map(
            lambda x: taxi.clean_raw_data_dict(x, raw_feature_spec)))

    # Examples must be in clean tf-example format.
    coder = taxi.make_proto_coder(schema)

    # Prepare arguments for the Extract, Evaluate and Write steps.
    extractors = tfma.default_extractors(
        eval_shared_model=eval_shared_model,
        slice_spec=slice_spec,
        desired_batch_size=None,
        materialize=False)
    evaluators = tfma.default_evaluators(
        eval_shared_model=eval_shared_model,
        desired_batch_size=None,
        num_bootstrap_samples=1)

    _ = (
        raw_data
        | 'ToSerializedTFExample' >> beam.Map(coder.encode)
        | 'Extract Results' >> tfma.InputsToExtracts()
        | 'Extract and evaluate' >> tfma.ExtractAndEvaluate(
            extractors=extractors, evaluators=evaluators)
        | 'Map Evaluations to PCollection' >> MapEvalToPCollection()
        | 'Measure time: End' >> beam.ParDo(MeasureTime(metrics_namespace)))

    result = pipeline.run()
    result.wait_until_finish()
    if metrics_monitor:
        metrics_monitor.publish_metrics(result)
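MeasureTime appears in this example and the next but is defined in the projects' metrics utilities. A minimal sketch of the idea, assuming it records bundle timestamps in a Beam distribution metric under the given namespace:

import time

import apache_beam as beam
from apache_beam.metrics import Metrics


class MeasureTime(beam.DoFn):
    # Hypothetical sketch: update a distribution metric at bundle start and
    # end so a downstream MetricsReader can derive the stage runtime.
    def __init__(self, namespace):
        self.runtime = Metrics.distribution(namespace, 'runtime')

    def start_bundle(self):
        self.runtime.update(time.time())

    def finish_bundle(self):
        self.runtime.update(time.time())

    def process(self, element):
        yield element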
import os

import apache_beam as beam
import tensorflow as tf
import tensorflow_transform as transform
import tensorflow_transform.beam as tft_beam
from apache_beam.io.gcp.bigquery import ReadFromBigQuery
from apache_beam.metrics.metric import MetricsFilter
from tensorflow_transform.coders import example_proto_coder
from tensorflow_transform.tf_metadata import dataset_metadata, dataset_schema

# taxi, MeasureTime and MetricsReader are project-local helpers;
# _fill_in_missing is defined alongside (see the sketch below).


def transform_data(input_handle,
                   outfile_prefix,
                   working_dir,
                   schema_file,
                   transform_dir=None,
                   max_rows=None,
                   pipeline_args=None,
                   publish_to_bq=False,
                   project=None,
                   metrics_table=None,
                   metrics_dataset=None):
    """The main tf.transform method which analyzes and transforms data.

    Args:
      input_handle: BigQuery table name to process specified as DATASET.TABLE
        or path to csv file with input data.
      outfile_prefix: Filename prefix for emitted transformed examples.
      working_dir: Directory in which transformed examples and transform
        function will be emitted.
      schema_file: A file path that contains a text-serialized TensorFlow
        metadata schema of the input data.
      transform_dir: Directory in which the transform output is located. If
        provided, this will load the transform_fn from disk instead of
        computing it over the data. Hint: this is useful for transforming
        eval data.
      max_rows: Number of rows to query from BigQuery.
      pipeline_args: additional DataflowRunner or DirectRunner args passed to
        the beam pipeline.
    """

    def preprocessing_fn(inputs):
        """tf.transform's callback function for preprocessing inputs.

        Args:
          inputs: map from feature keys to raw not-yet-transformed features.

        Returns:
          Map from string feature key to transformed feature operations.
        """
        outputs = {}
        for key in taxi.DENSE_FLOAT_FEATURE_KEYS:
            # Preserve this feature as a dense float, setting nan's to the mean.
            outputs[taxi.transformed_name(key)] = transform.scale_to_z_score(
                _fill_in_missing(inputs[key]))

        for key in taxi.VOCAB_FEATURE_KEYS:
            # Build a vocabulary for this feature.
            outputs[taxi.transformed_name(key)] = (
                transform.compute_and_apply_vocabulary(
                    _fill_in_missing(inputs[key]),
                    top_k=taxi.VOCAB_SIZE,
                    num_oov_buckets=taxi.OOV_SIZE))

        for key in taxi.BUCKET_FEATURE_KEYS:
            outputs[taxi.transformed_name(key)] = transform.bucketize(
                _fill_in_missing(inputs[key]), taxi.FEATURE_BUCKET_COUNT)

        for key in taxi.CATEGORICAL_FEATURE_KEYS:
            outputs[taxi.transformed_name(key)] = _fill_in_missing(inputs[key])

        # Was this passenger a big tipper?
        taxi_fare = _fill_in_missing(inputs[taxi.FARE_KEY])
        tips = _fill_in_missing(inputs[taxi.LABEL_KEY])
        outputs[taxi.transformed_name(taxi.LABEL_KEY)] = tf.where(
            tf.is_nan(taxi_fare),
            tf.cast(tf.zeros_like(taxi_fare), tf.int64),
            # Test if the tip was > 20% of the fare.
            tf.cast(
                tf.greater(tips, tf.multiply(taxi_fare, tf.constant(0.2))),
                tf.int64))

        return outputs

    namespace = metrics_table
    metrics_monitor = None
    if publish_to_bq:
        metrics_monitor = MetricsReader(
            publish_to_bq=publish_to_bq,
            project_name=project,
            bq_table=metrics_table,
            bq_dataset=metrics_dataset,
            namespace=namespace,
            filters=MetricsFilter().with_namespace(namespace))

    schema = taxi.read_schema(schema_file)
    raw_feature_spec = taxi.get_raw_feature_spec(schema)
    raw_schema = dataset_schema.from_feature_spec(raw_feature_spec)
    raw_data_metadata = dataset_metadata.DatasetMetadata(raw_schema)

    pipeline = beam.Pipeline(argv=pipeline_args)
    with tft_beam.Context(temp_dir=working_dir):
        query = taxi.make_sql(input_handle, max_rows, for_eval=False)
        raw_data = (
            pipeline
            | 'ReadBigQuery' >> ReadFromBigQuery(
                query=query, project=project, use_standard_sql=True)
            | 'Measure time: start' >> beam.ParDo(MeasureTime(namespace)))
        decode_transform = beam.Map(
            taxi.clean_raw_data_dict, raw_feature_spec=raw_feature_spec)

        if transform_dir is None:
            decoded_data = raw_data | 'DecodeForAnalyze' >> decode_transform
            transform_fn = (
                (decoded_data, raw_data_metadata)
                | 'Analyze' >> tft_beam.AnalyzeDataset(preprocessing_fn))
            _ = (
                transform_fn
                | 'WriteTransformFn' >> tft_beam.WriteTransformFn(working_dir))
        else:
            transform_fn = pipeline | tft_beam.ReadTransformFn(transform_dir)

        # Shuffling the data before materialization will improve training
        # effectiveness downstream. Here we shuffle the raw_data (as opposed
        # to decoded data) since it has a compact representation.
        shuffled_data = raw_data | 'RandomizeData' >> beam.transforms.Reshuffle()
        decoded_data = shuffled_data | 'DecodeForTransform' >> decode_transform
        (transformed_data, transformed_metadata) = (
            ((decoded_data, raw_data_metadata), transform_fn)
            | 'Transform' >> tft_beam.TransformDataset())

        coder = example_proto_coder.ExampleProtoCoder(
            transformed_metadata.schema)
        _ = (
            transformed_data
            | 'SerializeExamples' >> beam.Map(coder.encode)
            | 'Measure time: end' >> beam.ParDo(MeasureTime(namespace))
            | 'WriteExamples' >> beam.io.WriteToTFRecord(
                os.path.join(working_dir, outfile_prefix),
                file_name_suffix='.gz'))

    result = pipeline.run()
    result.wait_until_finish()
    if metrics_monitor:
        metrics_monitor.publish_metrics(result)
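_fill_in_missing is referenced throughout preprocessing_fn but not shown. In Chicago-taxi-style pipelines it is typically implemented along these lines (a sketch, assuming each input feature arrives as a SparseTensor with at most one value per row):

import tensorflow as tf


def _fill_in_missing(x):
    """Replaces missing values in a SparseTensor and converts it to dense.

    Sketch of the usual helper: fills missing entries with '' or 0 depending
    on dtype, assuming x is a SparseTensor of shape [batch_size, 1].
    """
    default_value = '' if x.dtype == tf.string else 0
    return tf.squeeze(
        tf.sparse.to_dense(
            tf.SparseTensor(x.indices, x.values, [x.dense_shape[0], 1]),
            default_value),
        axis=1)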
def test_get_destination_uri_static_vp(self):
    unique_id = uuid.uuid4().hex
    uri = ReadFromBigQuery.get_destination_uri(
        StaticValueProvider(str, 'gs://bucket'), None, unique_id)
    self.assertEqual(
        uri, 'gs://bucket/' + unique_id + '/bigquery-table-dump-*.json')
def test_get_destination_uri_none(self):
    with self.assertRaisesRegex(
            ValueError,
            '^ReadFromBigQuery requires a GCS location to be provided'):
        ReadFromBigQuery.get_destination_uri(None, None, uuid.uuid4().hex)
from datetime import datetime

import apache_beam as beam
from apache_beam.io.gcp.bigquery import ReadFromBigQuery
from apache_beam.options.pipeline_options import PipelineOptions

# PROJECT_ID, DATAFLOW_REGION, GCS_DESTINATION, BQ_SOURCE and RUNNER are
# module-level constants defined elsewhere in this project.


class WriteIndexFile(beam.DoFn):
    # The enclosing class is inferred from the pipeline below; __init__
    # (which stores gcs_path and a gcsio factory) is not shown.
    def process(self, lines):
        # Write the globally combined lines out as a single CSV index file.
        with self.gcsio().open(f'{self.gcs_path}/index.csv', 'w',
                               mime_type='text/csv') as fp:
            fp.write(lines.encode())


# '%H%M%S' fixes the original '%H%I%S', which repeated the hour (12-hour
# form) instead of recording the minutes.
job_name = f"reviewr-automl--{datetime.utcnow().strftime('%Y%m%d-%H%M%S')}"
gcs_path = f'{GCS_DESTINATION}/{job_name}'
pipeline_options = PipelineOptions(project=PROJECT_ID,
                                   region=DATAFLOW_REGION,
                                   job_name=job_name,
                                   temp_location=f'{gcs_path}/temp')
p = beam.Pipeline(runner=RUNNER, options=pipeline_options)

# Limit rows on the DirectRunner for fast local iteration.
limit = ' LIMIT 10' if RUNNER == 'DirectRunner' else ''
bq_row = p | 'ReadFromBigQuery' >> ReadFromBigQuery(
    query=f'SELECT * FROM `{BQ_SOURCE}`{limit}',
    project=PROJECT_ID,
    use_standard_sql=True,
    gcs_location=f'{gcs_path}/temp')

bq_row | 'WriteExampleFile' >> beam.ParDo(WriteExampleFile(gcs_path))
(bq_row
 | 'CreateLine' >> beam.ParDo(CreateLine(gcs_path))
 | 'CombineLines' >> beam.CombineGlobally(lambda lines: '\n'.join(lines))
 | 'WriteIndexFile' >> beam.ParDo(WriteIndexFile(gcs_path)))

p.run()
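CreateLine and WriteExampleFile are project-local DoFns not shown here. Given that their output feeds an AutoML-style index.csv, a plausible sketch of CreateLine follows; the field names and CSV layout are assumptions:

import apache_beam as beam


class CreateLine(beam.DoFn):
    # Hypothetical sketch: emit one CSV index line per row, pointing at the
    # example file that WriteExampleFile created for the same row; the 'id'
    # and 'label' field names are assumptions.
    def __init__(self, gcs_path):
        self.gcs_path = gcs_path

    def process(self, row):
        yield f"{self.gcs_path}/examples/{row['id']}.txt,{row['label']}"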