def test(self):
    output = (
        self.pipeline
        | 'Read from BigQuery' >> Read(
            BigQuerySource(dataset=self.input_dataset, table=self.input_table))
        | 'Count messages' >> ParDo(CountMessages(self.metrics_namespace))
        | 'Measure time' >> ParDo(MeasureTime(self.metrics_namespace))
        | 'Count' >> Count.Globally())

    assert_that(output, equal_to([self.input_options['num_records']]))
def run(argv=None):
    known_args, options = pipeline_options(argv)

    with beam.Pipeline(options=options) as p:
        lines = p | "Read BigQuery" >> beam.io.Read(
            BigQuerySource(
                query="SELECT * FROM `bigquery-public-data.hacker_news.comments` LIMIT 100",
                use_standard_sql=True,
            ))

        with_author = (
            lines
            | "Add Author" >> beam.Map(lambda row: (row["author"], row))
            | "Group By Author" >> beam.GroupByKey())

        author_jsons = with_author | "Convert Row To JSON" >> beam.Map(row_to_json)

        author_jsons | beam.ParDo(OutputToFile(known_args.output))
def run(args=None):
    pipeline_options = PipelineOptions(args)
    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options.view_as(SetupOptions).save_main_session = True
    normalize_options = pipeline_options.view_as(NormalizeOptions)
    gcp_options = pipeline_options.view_as(GoogleCloudOptions)

    d1, d2 = parse_date_range(normalize_options.date_range)
    helper = QueryHelper(table=normalize_options.source_table,
                         first_date_ts=d1,
                         last_date_ts=d2)
    select_fields = ['mmsi', 'timestamp', 'seg_id', 'shipname', 'callsign', 'imo']
    where_sql = 'shipname is not null or callsign is not null or imo is not null'
    if normalize_options.mmsi_quotient > 1:
        where_sql = "hash(mmsi) % {} = 0 and ({})".format(
            normalize_options.mmsi_quotient, where_sql)
    source_schema = helper.filter_table_schema(select_fields)
    source = BigQuerySource(query=helper.build_query(include_fields=select_fields,
                                                     where_sql=where_sql))

    dest_schema = TableSchema(fields=source_schema.fields)
    dest_schema.fields.append(TableFieldSchema(name=NORMALIZED_SHIPNAME, type='STRING'))
    dest_schema.fields.append(TableFieldSchema(name=NORMALIZED_CALLSIGN, type='STRING'))
    dest_schema.fields.append(TableFieldSchema(name=VALID_IMO, type='INTEGER'))

    pipeline = beam.Pipeline(options=pipeline_options)
    (
        pipeline
        | "ReadSource" >> ReadAsJSONDict(source)
        | "ConvertTimestamp" >> beam.ParDo(ParseBeamBQStrTimestampDoFn())
        | "AddTimestamp" >> beam.ParDo(TimestampedValueDoFn())
        | "NormalizeNames" >> beam.ParDo(NormalizeNamesDoFn())
        | "WriteDest" >> WriteToBigQueryDatePartitioned(
            temp_gcs_location=gcp_options.temp_location,
            table=normalize_options.dest_table,
            schema=dest_schema,
            write_disposition=BigQueryDisposition.WRITE_TRUNCATE)
    )

    result = pipeline.run()
    success_states = set([PipelineState.DONE])
    if normalize_options.wait:
        result.wait_until_finish()
    else:
        success_states.add(PipelineState.RUNNING)

    return 0 if result.state in success_states else 1
def run():
    PROJECT_ID = 'data-lake-290221'
    BUCKET = 'gs://dataflow-log-data'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    options = PipelineOptions(flags=None,
                              runner='DirectRunner',
                              project=PROJECT_ID,
                              job_name='transpose',
                              temp_location=BUCKET + '/temp',
                              region='us-central1')
    options.view_as(SetupOptions).save_main_session = True
    p = beam.pipeline.Pipeline(options=options)

    sql = '''select farm_fingerprint(concat(cast(latitude as string), cast(longitude as string))) as location_id, * from covid19_confirmed.raw_cases'''

    #bq_source = ReadFromBigQuery(query=sql, use_standard_sql=True, gcs_location=BUCKET)
    bq_source = BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Transpose' >> beam.ParDo(Transpose())

    #out_pcoll | 'Write to log' >> WriteToText('records.txt')

    dataset_id = 'covid19_confirmed'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'daily_cases'
    schema_id = 'location_id:INTEGER,date:DATE,cases:INTEGER'

    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument("--date", required=True, type=str, help="")
    parser.add_argument("--output",
                        required=True,
                        type=str,
                        help="PROJECT:DATASET.TABLE")
    known_args, pipeline_args = parser.parse_known_args(argv)

    # file_path = os.path.join(known_args.output, known_args.date, "user", known_args.date+"-user")
    table_name = known_args.output + "$" + re.sub("-", "", known_args.date)

    pipeline_options = PipelineOptions(pipeline_args)
    with beam.Pipeline(options=pipeline_options) as p:
        query_of_user = get_query('user', date=known_args.date)

        # CSV FORMAT
        # result = (p
        #           | "ReadFromBQ" >> beam.io.Read(BigQuerySource(query=query_of_user, use_standard_sql=True))
        #           | "Projected" >> beam.ParDo(ProjectionBQ(), PROJECT_FIELDS_USER)
        #           | "Format" >> beam.ParDo(FormatAsCSV(), COLUMNS, False)
        #           | "Write" >> WriteToText(file_path, ".csv", shard_name_template="-SS", header=HEADERS))

        # BIGQUERY FORMAT
        result = (
            p
            | "ReadFromBQ" >> beam.io.Read(
                BigQuerySource(query=query_of_user, use_standard_sql=True))
            | "Projected" >> beam.ParDo(ProjectionBQ(), PROJECT_FIELDS_USER)
            | "Write" >> beam.io.Write(
                BigQuerySink(
                    table_name,
                    schema=SCHEMA,
                    create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                    write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)))
def run(argv=None, comments=None):
    """Run the beam pipeline.

    Args:
        argv: (optional) the command line flags to parse.
        comments: (optional) a list of comment JSON objects to process. Used in
            unit-tests to avoid requiring a BigQuery source.
    """
    args, pipeline_args = _parse_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    if comments is not None:
        comments = p | ("Read in-memory comments") >> beam.Create(comments)
    else:
        comments = p | ("Read " + args.reddit_table) >> Read(
            BigQuerySource(args.reddit_table))

    comments |= (
        "Normalise comments" >> beam.Map(
            partial(normalise_comment, max_length=args.max_length)))

    thread_id_to_comments = comments | (
        "Key by thread id" >> beam.Map(
            lambda comment: (comment.thread_id, comment)))
    threads = thread_id_to_comments | (
        "Group comments by thread ID" >> beam.GroupByKey())
    threads = threads | ("Get threads" >> beam.Map(lambda t: t[1]))

    examples = threads | (
        "Create {} examples".format(args.dataset_format) >> beam.FlatMap(
            partial(create_examples,
                    parent_depth=args.parent_depth,
                    min_length=args.min_length,
                    format=args.dataset_format,
                    )))
    examples = _shuffle(examples)

    # [START dataflow_molecules_split_to_train_and_eval_datasets]
    # Split the dataset into a training set and an evaluation set
    assert 0 < (100 - args.train_split * 100) < 100, 'eval_percent must be in the range (0-100)'
    eval_percent = 100 - args.train_split * 100
    train_dataset, eval_dataset = (
        examples
        | 'Split dataset' >> beam.Partition(
            lambda elem, _: int(random.uniform(0, 100) < eval_percent), 2))
    # [END dataflow_molecules_split_to_train_and_eval_datasets]

    if args.dataset_format == _JSON_FORMAT:
        write_sink = WriteToText
        file_name_suffix = ".json"
        serialize_fn = json.dumps

    serialized_train_examples = train_dataset | (
        "serialize {} examples".format('train') >> beam.Map(serialize_fn))
    (
        serialized_train_examples | ("write " + 'train') >> write_sink(
            os.path.join(args.output_dir, 'train'),
            file_name_suffix=file_name_suffix,
            num_shards=args.num_shards_train,
        )
    )

    serialized_test_examples = eval_dataset | (
        "serialize {} examples".format('valid') >> beam.Map(serialize_fn))
    (
        serialized_test_examples | ("write " + 'valid') >> write_sink(
            os.path.join(args.output_dir, 'valid'),
            file_name_suffix=file_name_suffix,
            num_shards=args.num_shards_train,
        )
    )

    result = p.run()
    result.wait_until_finish()
import logging

import apache_beam as beam
from apache_beam.io import BigQuerySource
from apache_beam.options.pipeline_options import (GoogleCloudOptions,
                                                  PipelineOptions,
                                                  StandardOptions)
from google.cloud import firestore

PROJECT = 'yelphelp'
BUCKET = 'yelp_help_dataflow'

options = PipelineOptions()
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.project = PROJECT
google_cloud_options.staging_location = "gs://" + BUCKET + "/staging"
google_cloud_options.temp_location = "gs://" + BUCKET + "/temp"
options.view_as(StandardOptions).runner = "DataFlowRunner"

source = BigQuerySource(
    query="""SELECT business_id, count(checkin) as checkins FROM `yelphelp.YearlyData.Checkin` group by business_id""",
    use_standard_sql=True)


class CreateEntities(beam.DoFn):
    def process(self, element):
        document_id = str(element.pop('business_id'))
        element['checkins'] = str(element['checkins'])
        return [(document_id, element)]


class FirestoreWriteDoFn(beam.DoFn):
    MAX_DOCUMENTS = 200

    def start_bundle(self):
        self._records = []
google_cloud_options.staging_location = "gs://projet_smart_gcp/staging"
google_cloud_options.temp_location = "gs://projet_smart_gcp/temp"
#options.view_as(StandardOptions).runner = "DirectRunner" # use this for debugging
options.view_as(StandardOptions).runner = "DataFlowRunner"
options.view_as(SetupOptions).setup_file = "./setup.py"

# see here for bigquery docs https://beam.apache.org/documentation/io/built-in/google-bigquery/
source_table_spec = bigquery.TableReference(
    projectId="studied-client-307710",
    datasetId="SMT_STG",
    tableId="referentiel"
)
sink_table_spec = bigquery.TableReference(
    projectId="studied-client-307710",
    datasetId="SMT_DWH",
    tableId="SMT_REF_WorkProgramRisk"
)

source = BigQuerySource(
    query="SELECT distinct CORE_ID_WorkProgramId as CORE_ID_WorkProgramId, null as CORE_ID_RiskId, CORE_LB_WorkProgramCode as CORE_LB_WorkProgramCode, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted FROM `studied-client-307710.SMT_STG.Referentiel` ref left outer join `studied-client-307710.SMT_DWH.SMT_REF_WorkProgram` wrk on ref.Code = wrk.CORE_LB_WorkProgramCode",
    use_standard_sql=True)  # you can also use SQL queries
#source = BigQuerySource(source_table_spec)

#target = BigQuerySink(sink_table_spec, schema=table_schema)
#target = beam.io.WriteToText("output.txt")


def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)

    with beam.Pipeline(argv=pipeline_args) as p:
        raw_values = (
            p
            | "ReadTable" >> beam.io.Read(source)
            | "cleanup" >> beam.ParDo(ElementCleanup())
            | "writeTable" >> beam.io.WriteToBigQuery(
                sink_table_spec,
                # schema=table_schema,
                write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,  #WRITE_TRUNCATE
google_cloud_options.staging_location = "gs://projet_smart_gcp/staging"
google_cloud_options.temp_location = "gs://projet_smart_gcp/temp"
#options.view_as(StandardOptions).runner = "DirectRunner" # use this for debugging
options.view_as(StandardOptions).runner = "DataFlowRunner"
options.view_as(SetupOptions).setup_file = "./setup.py"

# see here for bigquery docs https://beam.apache.org/documentation/io/built-in/google-bigquery/
source_table_spec = bigquery.TableReference(projectId="studied-client-307710",
                                            datasetId="SMT_STG",
                                            tableId="Missions")
sink_table_spec = bigquery.TableReference(projectId="studied-client-307710",
                                          datasetId="SMT_DWH",
                                          tableId="SMT_REF_AuditProcess")

source = BigQuerySource(
    query="SELECT distinct 1 as CORE_ID_AuditId, CORE_ID_ProcessId as CORE_ID_ProcessId, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted FROM `studied-client-307710.SMT_STG.Missions` miss left outer join `studied-client-307710.SMT_DWH.SMT_REF_Process` process on cast(miss.AuditedProcesses as INT64) = process.CORE_ID_ProcessSourceId",
    use_standard_sql=True)  # you can also use SQL queries
#source = BigQuerySource(source_table_spec)

#target = BigQuerySink(sink_table_spec, schema=table_schema)
#target = beam.io.WriteToText("output.txt")


def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)

    with beam.Pipeline(argv=pipeline_args) as p:
        raw_values = (
            p
            | "ReadTable" >> beam.io.Read(source)
            | "cleanup" >> beam.ParDo(ElementCleanup())
            | "writeTable" >> beam.io.WriteToBigQuery(
google_cloud_options.staging_location = "gs://projet_smart_gcp/staging"
google_cloud_options.temp_location = "gs://projet_smart_gcp/temp"
#options.view_as(StandardOptions).runner = "DirectRunner" # use this for debugging
options.view_as(StandardOptions).runner = "DataFlowRunner"
options.view_as(SetupOptions).setup_file = "./setup.py"

# see here for bigquery docs https://beam.apache.org/documentation/io/built-in/google-bigquery/
source_table_spec = bigquery.TableReference(projectId="studied-client-307710",
                                            datasetId="SMT_STG",
                                            tableId="CS_FindingsExportImport")
sink_table_spec = bigquery.TableReference(projectId="studied-client-307710",
                                          datasetId="SMT_DWH",
                                          tableId="SMT_REF_FindingDivision")

source = BigQuerySource(
    query="SELECT distinct fin.CORE_ID_FindingId as CORE_ID_FindingId, div.CORE_ID_DivisionId as CORE_ID_DivisionId, id as CORE_ID_FindingSourceId, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted FROM `studied-client-307710.SMT_STG.CS_FindingsExportImport` imp LEFT OUTER JOIN `studied-client-307710.SMT_DWH.SMT_REF_Finding` fin ON CAST(imp.Id AS int64) = fin.CORE_ID_FindingSourceId LEFT OUTER JOIN `studied-client-307710.SMT_DWH.SMT_REF_Division` div ON CAST(imp.CS_ConcernedDivisions AS int64) = div.CORE_ID_DivisionSourceId",
    use_standard_sql=True)  # you can also use SQL queries
#source = BigQuerySource(source_table_spec)

#target = BigQuerySink(sink_table_spec, schema=table_schema)
#target = beam.io.WriteToText("output.txt")


def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)

    with beam.Pipeline(argv=pipeline_args) as p:
        raw_values = (
            p
            | "ReadTable" >> beam.io.Read(source)
            | "cleanup" >> beam.ParDo(ElementCleanup())
            | "writeTable" >> beam.io.WriteToBigQuery(
import apache_beam as beam
from apache_beam.io import BigQuerySource
from apache_beam.options.pipeline_options import (GoogleCloudOptions,
                                                  PipelineOptions,
                                                  StandardOptions)
from google.cloud import firestore

PROJECT = 'yelphelp'
BUCKET = 'yelp_help_dataflow'

options = PipelineOptions()
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.project = PROJECT
google_cloud_options.staging_location = "gs://" + BUCKET + "/staging"
google_cloud_options.temp_location = "gs://" + BUCKET + "/temp"
options.view_as(StandardOptions).runner = "DataFlowRunner"

source = BigQuerySource(
    query="""SELECT business_id, ARRAY_AGG(t.text) as tips FROM (SELECT `yelphelp.YearlyData.Tip`.*, ROW_NUMBER() OVER (PARTITION BY business_id ORDER BY compliment_count, date DESC) AS seqnum FROM `yelphelp.YearlyData.Tip` ) t where seqnum <= 3 group by business_id""",
    use_standard_sql=True)


class CreateEntities(beam.DoFn):
    def process(self, element):
        document_id = str(element.pop('business_id'))
        return [(document_id, element)]


class FirestoreWriteDoFn(beam.DoFn):
    MAX_DOCUMENTS = 200

    def start_bundle(self):
        self._records = []
import datetime

import apache_beam as beam
from apache_beam.io import BigQuerySource
from apache_beam.options.pipeline_options import (GoogleCloudOptions,
                                                  PipelineOptions,
                                                  StandardOptions)

PROJECT = 'yelphelp'
BUCKET = 'yelp_help_dataflow'

options = PipelineOptions()
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.project = PROJECT
google_cloud_options.staging_location = "gs://" + BUCKET + "/staging"
google_cloud_options.temp_location = "gs://" + BUCKET + "/temp"
options.view_as(StandardOptions).runner = "DataFlowRunner"

source = BigQuerySource(
    query="""SELECT R.business_id, b.name, b.categories, b.city, b.state, sum(R.useful) as useful,\
sum(R.funny) as funny, sum(R.cool) as cool, avg(R.stars) as avg_stars, count(R.stars) as num_stars \
FROM `yelphelp.YearlyData.Review` as R LEFT JOIN `yelphelp.YearlyData.Business` as B ON \
R.business_id = B.business_id group by business_id, name, categories, state, city""",
    use_standard_sql=True)


class CreateEntities(beam.DoFn):
    def process(self, element):
        document_id = str(element.pop('business_id'))
        element['name'] = str(element['name'])
        element['categories'] = str(element['categories'])
        element['city'] = str(element['city'])
        element['state'] = str(element['state'])
        element['cool'] = int(element['cool'])
        element['funny'] = int(element['funny'])
        element['useful'] = int(element['useful'])
google_cloud_options.staging_location = "gs://projet_smart_gcp/staging"
google_cloud_options.temp_location = "gs://projet_smart_gcp/temp"
#options.view_as(StandardOptions).runner = "DirectRunner" # use this for debugging
options.view_as(StandardOptions).runner = "DataFlowRunner"
options.view_as(SetupOptions).setup_file = "./setup.py"

# see here for bigquery docs https://beam.apache.org/documentation/io/built-in/google-bigquery/
source_table_spec = bigquery.TableReference(
    projectId="studied-client-307710",
    datasetId="SMT_STG",
    tableId="CS_FindingsExportImport"
)
sink_table_spec = bigquery.TableReference(
    projectId="studied-client-307710",
    datasetId="SMT_DWH",
    tableId="SMT_REF_FindingConcernedFunction"
)

source = BigQuerySource(
    query="SELECT distinct fin.CORE_ID_FindingId as CORE_ID_FindingId, con.CORE_ID_ConcernedFunctionId as CORE_ID_ConcernedFunctionId, id as CORE_ID_FindingSourceId, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted FROM `studied-client-307710.SMT_STG.CS_FindingsExportImport` imp left outer join `studied-client-307710.SMT_DWH.SMT_REF_Finding` fin on cast(imp.Id as int64) = fin.CORE_ID_FindingSourceId left outer join `studied-client-307710.SMT_DWH.SMT_REF_ConcernedFunction` con on cast(imp.CS_ConcernedFunc as int64) = con.CORE_ID_ConcernedFunctionSourceId ",
    use_standard_sql=True)  # you can also use SQL queries
#source = BigQuerySource(source_table_spec)

#target = BigQuerySink(sink_table_spec, schema=table_schema)
#target = beam.io.WriteToText("output.txt")


def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)

    with beam.Pipeline(argv=pipeline_args) as p:
        raw_values = (
            p
            | "ReadTable" >> beam.io.Read(source)
            | "cleanup" >> beam.ParDo(ElementCleanup())
            | "writeTable" >> beam.io.WriteToBigQuery(
                sink_table_spec,
                # schema=table_schema,
                write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,  #WRITE_TRUNCATE
google_cloud_options.staging_location = "gs://projet_smart_gcp/staging"
google_cloud_options.temp_location = "gs://projet_smart_gcp/temp"
#options.view_as(StandardOptions).runner = "DirectRunner" # use this for debugging
options.view_as(StandardOptions).runner = "DataFlowRunner"
options.view_as(SetupOptions).setup_file = "./setup.py"

# see here for bigquery docs https://beam.apache.org/documentation/io/built-in/google-bigquery/
source_table_spec = bigquery.TableReference(projectId="studied-client-307710",
                                            datasetId="SMT_DWH",
                                            tableId="Processes")
sink_table_spec = bigquery.TableReference(projectId="studied-client-307710",
                                          datasetId="SMT_DWH",
                                          tableId="SMT_REF_Process")

source = BigQuerySource(
    query="SELECT ROW_NUMBER() over(order by NameFR) as CORE_ID_ProcessId, Parent as CORE_LB_ProcessParentReference, Level as CORE_LB_ProcessLevel, Ref as CORE_LB_ProcessReference, `Order` as CORE_LB_Order, Active as CORE_LB_Active, NameEN as CORE_LB_ProcessNameEN, NameFR as CORE_LB_ProcessNameFR, risk.CORE_ID_RiskId as CORE_ID_RiskId, DefEN as CORE_LB_ProcessDefEN, DefFR as CORE_LB_ProcessDefFR, Graph as CORE_LB_Graph, Id as CORE_ID_ProcessSourceId, LastMod as CORE_DT_LastModif, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted FROM `studied-client-307710.SMT_STG.Processes` proc left outer join `studied-client-307710.SMT_DWH.SMT_REF_Risk` risk on cast(proc.CS_Risk as INT64) = risk.CORE_ID_RiskSourceId",
    use_standard_sql=True)  # you can also use SQL queries
#source = BigQuerySource(source_table_spec)

#target = BigQuerySink(sink_table_spec, schema=table_schema)
#target = beam.io.WriteToText("output.txt")


def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)

    with beam.Pipeline(argv=pipeline_args) as p:
        raw_values = (
            p
            | "ReadTable" >> beam.io.Read(source)
            | "cleanup" >> beam.ParDo(ElementCleanup())
            | "writeTable" >> beam.io.WriteToBigQuery(
def test(self):
    self.result = (
        self.pipeline
        | 'Read from BigQuery' >> Read(
            BigQuerySource(dataset=self.input_dataset, table=self.input_table))
        | 'Count' >> Count.Globally())
google_cloud_options.staging_location = "gs://projet_smart_gcp/staging"
google_cloud_options.temp_location = "gs://projet_smart_gcp/temp"
#options.view_as(StandardOptions).runner = "DirectRunner" # use this for debugging
options.view_as(StandardOptions).runner = "DataFlowRunner"
options.view_as(SetupOptions).setup_file = "./setup.py"

# see here for bigquery docs https://beam.apache.org/documentation/io/built-in/google-bigquery/
source_table_spec = bigquery.TableReference(
    projectId="studied-client-307710",
    datasetId="SMT_STG",
    tableId="MissionTypes"
)
sink_table_spec = bigquery.TableReference(
    projectId="studied-client-307710",
    datasetId="SMT_DWH",
    tableId="SMT_REF_AuditType"
)

source = BigQuerySource(
    query="SELECT ROW_NUMBER() over(order by Code) as CORE_ID_AuditTypeId, Code as CORE_LB_AuditTypeCode, NameEN as CORE_LB_AuditTypeNameEN, NameFR as CORE_LB_AuditTypeNameFR, Active as CORE_LB_Active, `Order` as CORE_LB_Order, CreatedBy as CORE_LB_CreatedBy, CreatedOn as CORE_DT_CreatedOn, ModifiedBy as CORE_LB_ModifiedBy, ModifiedOn as CORE_DT_ModifiedOn, TrackingDisplay as CORE_LB_TrackingDisplay, Id as CORE_ID_AuditTypeSourceId, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted FROM `studied-client-307710.SMT_STG.MissionTypes`",
    use_standard_sql=True)  # you can also use SQL queries
#source = BigQuerySource(source_table_spec)

#target = BigQuerySink(sink_table_spec, schema=table_schema)
#target = beam.io.WriteToText("output.txt")


def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)

    with beam.Pipeline(argv=pipeline_args) as p:
        raw_values = (
            p
            | "ReadTable" >> beam.io.Read(source)
            | "cleanup" >> beam.ParDo(ElementCleanup())
            | "writeTable" >> beam.io.WriteToBigQuery(
                sink_table_spec,
                # schema=table_schema,
                write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,  #WRITE_TRUNCATE
google_cloud_options.staging_location = "gs://projet_smart_gcp/staging"
google_cloud_options.temp_location = "gs://projet_smart_gcp/temp"
#options.view_as(StandardOptions).runner = "DirectRunner" # use this for debugging
options.view_as(StandardOptions).runner = "DataFlowRunner"
options.view_as(SetupOptions).setup_file = "./setup.py"

# see here for bigquery docs https://beam.apache.org/documentation/io/built-in/google-bigquery/
source_table_spec = bigquery.TableReference(
    projectId="studied-client-307710",
    datasetId="SMT_STG",
    tableId="CS_FindingsExportImport"
)
sink_table_spec = bigquery.TableReference(
    projectId="studied-client-307710",
    datasetId="SMT_DWH",
    tableId="SMT_REF_FindingRisk"
)

source = BigQuerySource(
    query="SELECT distinct id as CORE_ID_FindingId, -1 as CORE_ID_RiskId, -1 as CORE_ID_FindingSourceId, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted FROM `studied-client-307710.SMT_STG.CS_FindingsExportImport` ",
    use_standard_sql=True)  # you can also use SQL queries
#source = BigQuerySource(source_table_spec)

#target = BigQuerySink(sink_table_spec, schema=table_schema)
#target = beam.io.WriteToText("output.txt")


def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)

    with beam.Pipeline(argv=pipeline_args) as p:
        raw_values = (
            p
            | "ReadTable" >> beam.io.Read(source)
            | "cleanup" >> beam.ParDo(ElementCleanup())
            | "writeTable" >> beam.io.WriteToBigQuery(
                sink_table_spec,
                # schema=table_schema,
                write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,  #WRITE_TRUNCATE
google_cloud_options.staging_location = "gs://projet_smart_gcp/staging"
google_cloud_options.temp_location = "gs://projet_smart_gcp/temp"
#options.view_as(StandardOptions).runner = "DirectRunner" # use this for debugging
options.view_as(StandardOptions).runner = "DataFlowRunner"
options.view_as(SetupOptions).setup_file = "./setup.py"

# see here for bigquery docs https://beam.apache.org/documentation/io/built-in/google-bigquery/
source_table_spec = bigquery.TableReference(
    projectId="studied-client-307710",
    datasetId="SMT_STG",
    tableId="Referentiel"
)
sink_table_spec = bigquery.TableReference(
    projectId="studied-client-307710",
    datasetId="SMT_DWH",
    tableId="SMT_REF_WorkProgram"
)

source = BigQuerySource(
    query="SELECT ROW_NUMBER() over(order by CS_WFLevel)as CORE_ID_WorkProgramId, CS_WFLevel as CORE_LB_WFLevel, Parent as CORE_LB_WorkProgramParentCode, MasterAuditReferential as CORE_LB_MasterAuditReferential, CS_AuditReferenceDataLink as CORE_LB_CS_AuditReferenceDataLink, Level as CORE_LB_WorkProgramLevel, OriginLevel as CORE_LB_ReferentieOriginLevel, Code as CORE_LB_WorkProgramCode, OrderCode as CORE_LB_Order, ProcessOrigin as CORE_LB_ProcessOrigin, CORE_ID_ProcessId as CORE_ID_ProcessId, AuditPointOrigin as CORE_LB_AuditPoINT64Origin, CS_ControlPoint as CORE_LB_ControlPoINT64, null as CORE_LB_WorkProgramNameEN, null as CORE_LB_Context, CS_CSNewItem as CORE_LB_CSNewITem, CS_Progress as CORE_LB_CS_Progress, KPCount as CORE_FL_KPCount, RecoCount as CORE_FL_RecoCount, null as CORE_LB_WFICone, null as CORE_LB_SynthesisMessage, null as CORE_LB_DirectoryWorkingPManagers, DirectoryWorkingPAuditors as CORE_LB_DirectoryWorkingPAuditors, CChoiceList as CORE_LB_CChoiceList, CAnswer as CORE_LB_CAnswer, AnswerAttachedFile as CORE_LB_AnswerAttachedFile, CS_AssessControl as CORE_LB_AssessControl, CS_SubProcessAssessment as CORE_LB_SubProcessAssessment, LevelNo as CORE_LB_LevelNo, Hyperlink as CORE_LB_HyperLink, CS_Hyperlink2 as CORE_LB_HyperLink2, CS_Hyperlink3 as CORE_LB_HyperLink3, CS_Hyperlink4 as CORE_LB_HyperLink4, CS_CopyOfAuditGuide as CORE_LB_CopyOfAuditGuide, MissionManagers as CORE_LB_MissionManagers, MissionAuditTeam as CORE_LB_MissionAuditTeam, CreatedBy as CORE_LB_CreatedBy, CreatedOn as CORE_DT_CreatedOn, ModifiedBy as CORE_LB_ModifiedBy, ModifiedOn as CORE_DT_ModifiedOn, null as CORE_LB_TrackingDisplay, null as CORE_LB_Source, null as CORE_LB_OfflineRisks, null as CORE_LB_CopyOfTitle, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted FROM `studied-client-307710.SMT_STG.Referentiel` ref left outer join `studied-client-307710.SMT_DWH.SMT_REF_Process` process on ref.Domain = cast(process.CORE_ID_ProcessSourceId as STRING)",
    use_standard_sql=True)  # you can also use SQL queries
#source = BigQuerySource(source_table_spec)

#target = BigQuerySink(sink_table_spec, schema=table_schema)
#target = beam.io.WriteToText("output.txt")


def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)

    with beam.Pipeline(argv=pipeline_args) as p:
        raw_values = (
            p
            | "ReadTable" >> beam.io.Read(source)
            | "cleanup" >> beam.ParDo(ElementCleanup())
            | "writeTable" >> beam.io.WriteToBigQuery(
                sink_table_spec,
                # schema=table_schema,
                write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,  #WRITE_TRUNCATE
google_cloud_options.staging_location = "gs://projet_smart_gcp/staging"
google_cloud_options.temp_location = "gs://projet_smart_gcp/temp"
#options.view_as(StandardOptions).runner = "DirectRunner" # use this for debugging
options.view_as(StandardOptions).runner = "DataFlowRunner"
options.view_as(SetupOptions).setup_file = "./setup.py"

# see here for bigquery docs https://beam.apache.org/documentation/io/built-in/google-bigquery/
source_table_spec = bigquery.TableReference(projectId="studied-client-307710",
                                            datasetId="SMT_STG",
                                            tableId="Plans")
sink_table_spec = bigquery.TableReference(projectId="studied-client-307710",
                                          datasetId="SMT_DWH",
                                          tableId="SMT_REF_AuditPlan")

source = BigQuerySource(
    query="SELECT ROW_NUMBER() over(order by CS_ReferencePlan) as CORE_ID_AuditPlanId, CS_ReferencePlan as CORE_LB_AuditPlanReference, Title as CORE_LB_AuditPlanTitle, Status as CORE_LB_AuditPlanStatus, Archived as CORE_LB_Archived, ArchivingDate as CORE_DT_ArchivedDate, ArchivingResponsibleFor as CORE_LB_ArchivingResponsibleFor, Reason as CORE_LB_Reason, AuditedSites as CORE_LB_AuditedSites, Services as CORE_LB_Services, CS_Objective as CORE_LB_Objective, AttachedFile as CORE_LB_AttachedFile, NbMissions as CORE_FL_NbMissions, StartDate as CORE_DT_StartDate, EndDate as CORE_DT_EndDate, Id as CORE_ID_AuditPlanSourceId, LastModif as CORE_DT_LastModif, Source as CORE_LB_Source, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted FROM `studied-client-307710.SMT_STG.Plans`",
    use_standard_sql=True)  # you can also use SQL queries
#source = BigQuerySource(source_table_spec)

#target = BigQuerySink(sink_table_spec, schema=table_schema)
#target = beam.io.WriteToText("output.txt")


def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)

    with beam.Pipeline(argv=pipeline_args) as p:
        raw_values = (
            p
            | "ReadTable" >> beam.io.Read(source)
            | "cleanup" >> beam.ParDo(ElementCleanup())
            | "writeTable" >> beam.io.WriteToBigQuery(
google_cloud_options.staging_location = "gs://projet_smart_gcp/staging"
google_cloud_options.temp_location = "gs://projet_smart_gcp/temp"
#options.view_as(StandardOptions).runner = "DirectRunner" # use this for debugging
options.view_as(StandardOptions).runner = "DataFlowRunner"
options.view_as(SetupOptions).setup_file = "./setup.py"

# see here for bigquery docs https://beam.apache.org/documentation/io/built-in/google-bigquery/
source_table_spec = bigquery.TableReference(projectId="studied-client-307710",
                                            datasetId="SMT_STG",
                                            tableId="CS_ConcernedFunctions")
sink_table_spec = bigquery.TableReference(projectId="studied-client-307710",
                                          datasetId="SMT_DWH",
                                          tableId="SMT_REF_ConcernedFunction")

source = BigQuerySource(
    query="SELECT ROW_NUMBER() over(order by CS_Order) as CORE_ID_ConcernedFunctionId, CS_Order as CORE_LB_ConcernedFunctionOrder, CS_ConcernedFunctionsEN as CORE_LB_ConcernedFunctionNameEN, CS_ConcernedFunctionsFR as CORE_LB_ConcernedFunctionNameFR, CS_Id as CORE_ID_ConcernedFunctionSourceId, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted FROM `studied-client-307710.SMT_STG.CS_ConcernedFunctions`",
    use_standard_sql=True)  # you can also use SQL queries
#source = BigQuerySource(source_table_spec)

#target = BigQuerySink(sink_table_spec, schema=table_schema)
#target = beam.io.WriteToText("output.txt")


def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)

    with beam.Pipeline(argv=pipeline_args) as p:
        raw_values = (
            p
            | "ReadTable" >> beam.io.Read(source)
            | "cleanup" >> beam.ParDo(ElementCleanup())
            | "writeTable" >> beam.io.WriteToBigQuery(
google_cloud_options.staging_location = "gs://projet_smart_gcp/staging"
google_cloud_options.temp_location = "gs://projet_smart_gcp/temp"
#options.view_as(StandardOptions).runner = "DirectRunner" # use this for debugging
options.view_as(StandardOptions).runner = "DataFlowRunner"
options.view_as(SetupOptions).setup_file = "./setup.py"

# see here for bigquery docs https://beam.apache.org/documentation/io/built-in/google-bigquery/
source_table_spec = bigquery.TableReference(
    projectId="studied-client-307710",
    datasetId="SMT_STG",
    tableId="Missions"
)
sink_table_spec = bigquery.TableReference(
    projectId="studied-client-307710",
    datasetId="SMT_DWH",
    tableId="SMT_REF_Audit"
)

source = BigQuerySource(
    query="SELECT ROW_NUMBER() over(order by CS_Ref) as CORE_ID_AuditId, CS_Ref as CORE_LB_AuditReference, Title as CORE_LB_AuditTitle, CS_CopyOfStatus as CORE_LB_AuditStatus, plan.CORE_ID_AuditPlanId as CORE_ID_AuditPlanId, CS_PreviousAudit as CORE_LB_PreviousAudit, CS_ReferenceAudits as CORE_LB_ReferenceAudits, LinkTypology as CORE_LB_LinkTypolygy, MissionTypology as CORE_LB_MissionTypolygy, CS_Cycle as CORE_LB_Cycle, Language as CORE_LB_Language, Archived as CORE_LB_Archived, ArchivingDate as CORE_DT_ArchivingDate, ArchivingResponsibleFor as CORE_LB_ArchivingResponsibleFor, Reason as CORN_LB_Reason, Services as CORE_LB_Services, zone.CORE_Id_ZoneId as CORE_ID_ZoneId, CS_CAnet as CORE_FL_CANet, CS_UnitsSold as CORE_FL_UnitsSold, CS_UnitsProduced as CORE_FL_UnitsProduced, CS_REXResultatDexploitation as CORE_FL_REXResultatDexploitation, CS_BAI as CORE_FL_BAI, CS_StatutoryEmployees as CORE_FL_StatutoryEmployees, CS_TotalEmployeesFTE as CORE_FL_TotalEmployeesFTE, CS_OverdueValue as CORE_FL_OverdueValue, CS_BadDebtProvisionValue as CORE_FL_BadDebtProvisionValue, CS_ReturnsValue as CORE_FL_ReturnsValue, CS_ServiceRateDivPerc as CORE_FL_ServiceRateDivPerc, CS_StockValue as CORE_FL_StockValue, CS_DestructionValue as CORE_FL_DestructionValue, CS_InfluencersValue as CORE_FL_InfluencersValue, CS_NbNonStatutoryEmployees as CORE_FL_NbNonStatutoryEmployees, Planned as CORE_LB_Planned, Criticity as CORE_LB_Criticity, typ.CORE_ID_AuditTypeId as CORE_ID_AuditTypeId, CS_CurrMissPha as CORE_LB_CurrMissPha, Initiator as CORE_LB_Initiator, Objective as CORT_LB_Objective, AuditContext as CORE_LB_AuditContext, ExNihilo as CORE_LB_ExNihilo, ToDuplicate as CORE_LB_ToDuplicate, RefCreate as CORE_LB_RefCreate, ActualStartDate as CORE_DT_ActualStartDate, ActualEndDate as CORE_DT_ActualEndDate, Agenda as CORE_LB_Agenda, null as CORE_FL_EntityDAF, CS_EntityICM as CORE_LB_EntityICMCode, InternalRN as CORE_FL_INT64ernalRN, ExternalRN as CORE_FL_ExternalRN, null as CORE_LB_CheckAvailabilityTA, CS_RecoTransDate as CORE_DT_RecoTransDate, CS_RecoAccepDate as CORE_DT_RecoAccepDate, CS_approvReco as CORE_FL_ApprovReco, CS_APValidated as CORE_LB_APValidated, CS_APValDate as CORE_DT_APValDate, CS_RepIssDate as CORE_DT_RepissDate, CS_ActionPlanClosed as CORE_LB_ActionPlanClosed, MissionManager as CORE_LB_MissionManager, GlobalMark as CORE_LB_GlobalMark, LabelMark as CORE_LB_LabelMark, null as CORE_LB_MarkDescription, null as CORE_LB_Conclusion, null as CORE_LB_Improve, null as CORE_LB_CopyOfSynthesis, null as CORE_LB_RATitle, null as CORE_LB_RASubTitle, null as CORE_DT_ARDate, null as CORE_LB_FinalAuditReportCode, null as CORE_DT_FinalAuditEmi, null as CORE_ID_MissionSourceId, null as CORE_FL_DurationReal, null as CORE_FL_DurationPlan, null as CORE_DT_LastUpdateReceived, null as CORE_DT_LastModif, null as CORE_LB_Source, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted FROM `studied-client-307710.SMT_STG.Missions` miss left outer join `studied-client-307710.SMT_DWH.SMT_REF_AuditPlan` plan on cast(miss.PlanCode as INT64) = plan.CORE_ID_AuditPlanSourceId left outer join `studied-client-307710.SMT_DWH.SMT_REF_AuditType` typ on cast(miss.Type as INT64) = typ.CORE_ID_AuditTypeSourceId left outer join `studied-client-307710.SMT_DWH.SMT_REF_Zone` zone on cast(miss.CS_ZoneScope as INT64) = zone.CORE_ID_ZoneSourceId",
    use_standard_sql=True)  # you can also use SQL queries
#source = BigQuerySource(source_table_spec)

#target = BigQuerySink(sink_table_spec, schema=table_schema)
#target = beam.io.WriteToText("output.txt")


def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)

    with beam.Pipeline(argv=pipeline_args) as p:
        raw_values = (
            p
            | "ReadTable" >> beam.io.Read(source)
            | "cleanup" >> beam.ParDo(ElementCleanup())
            | "writeTable" >> beam.io.WriteToBigQuery(
                sink_table_spec,
                # schema=table_schema,
                write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,  #WRITE_TRUNCATE
google_cloud_options.staging_location = "gs://projet_smart_gcp/staging"
google_cloud_options.temp_location = "gs://projet_smart_gcp/temp"
#options.view_as(StandardOptions).runner = "DirectRunner" # use this for debugging
options.view_as(StandardOptions).runner = "DataFlowRunner"
options.view_as(SetupOptions).setup_file = "./setup.py"

# see here for bigquery docs https://beam.apache.org/documentation/io/built-in/google-bigquery/
source_table_spec = bigquery.TableReference(
    projectId="studied-client-307710",
    datasetId="SMT_STG",
    tableId="Recommendations"
)
sink_table_spec = bigquery.TableReference(
    projectId="studied-client-307710",
    datasetId="SMT_DWH",
    tableId="SMT_REF_FindingRecommendation"
)

source = BigQuerySource(
    query="SELECT distinct fin.CORE_ID_FindingId as CORE_ID_FindingId, recom.CORE_ID_RecommendationId as CORE_ID_RecommendationId, id as CORE_ID_RecommendationSourceId, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted FROM `studied-client-307710.SMT_STG.Recommendations` rec left outer join `studied-client-307710.SMT_DWH.SMT_REF_Finding` fin on rec.AssociatedKeyPoint = cast(fin.CORE_ID_FindingSourceId as string) left outer join `studied-client-307710.SMT_DWH.SMT_REF_Recommendation` recom on rec.id = cast(recom.CORE_ID_RecommendationSourceId as string)",
    use_standard_sql=True)  # you can also use SQL queries
#source = BigQuerySource(source_table_spec)

#target = BigQuerySink(sink_table_spec, schema=table_schema)
#target = beam.io.WriteToText("output.txt")


def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)

    with beam.Pipeline(argv=pipeline_args) as p:
        raw_values = (
            p
            | "ReadTable" >> beam.io.Read(source)
            | "cleanup" >> beam.ParDo(ElementCleanup())
            | "writeTable" >> beam.io.WriteToBigQuery(
                sink_table_spec,
                # schema=table_schema,
                write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,  #WRITE_TRUNCATE
google_cloud_options.staging_location = "gs://projet_smart_gcp/staging"
google_cloud_options.temp_location = "gs://projet_smart_gcp/temp"
#options.view_as(StandardOptions).runner = "DirectRunner" # use this for debugging
options.view_as(StandardOptions).runner = "DataFlowRunner"
options.view_as(SetupOptions).setup_file = "./setup.py"

# see here for bigquery docs https://beam.apache.org/documentation/io/built-in/google-bigquery/
source_table_spec = bigquery.TableReference(
    projectId="studied-client-307710",
    datasetId="SMT_STG",
    tableId="referentiel"
)
sink_table_spec = bigquery.TableReference(
    projectId="studied-client-307710",
    datasetId="SMT_DWH",
    tableId="SMT_REF_WorkProgramAudit"
)

source = BigQuerySource(
    query="SELECT distinct ROW_NUMBER() over(order by Parent)as CORE_ID_WorkProgramAuditId, Parent as CORE_LB_WorkProgramParentCode, Level as CORE_LB_WorkProgramLevel, Code as CORE_LB_WorkProgramCode, CORE_ID_WorkProgramId as CORE_ID_WorkProgramId, null as CORE_ID_AuditId, CS_Progress as CORE_LB_CS_Progress, CS_AssessControl as CORE_LB_AssessControl, CS_SubProcessAssessment as CORE_LB_SubProcessAssessment, CreatedBy as CORE_LB_CreatedBy, CreatedOn as CORE_DT_CreatedOn, ModifiedBy as CORE_LB_ModifiedBy, ModifiedOn as CORE_DT_ModifiedOn, null as CORE_ID_WorkProgramSourceId, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted FROM `studied-client-307710.SMT_STG.Referentiel` ref left outer join `studied-client-307710.SMT_DWH.SMT_REF_WorkProgram` wrk on ref.Code = wrk.CORE_LB_WorkProgramCode",
    use_standard_sql=True)  # you can also use SQL queries
#source = BigQuerySource(source_table_spec)

#target = BigQuerySink(sink_table_spec, schema=table_schema)
#target = beam.io.WriteToText("output.txt")


def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)

    with beam.Pipeline(argv=pipeline_args) as p:
        raw_values = (
            p
            | "ReadTable" >> beam.io.Read(source)
            | "cleanup" >> beam.ParDo(ElementCleanup())
            | "writeTable" >> beam.io.WriteToBigQuery(
                sink_table_spec,
                # schema=table_schema,
                write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,  #WRITE_TRUNCATE
google_cloud_options.staging_location = "gs://projet_smart_gcp/staging"
google_cloud_options.temp_location = "gs://projet_smart_gcp/temp"
#options.view_as(StandardOptions).runner = "DirectRunner" # use this for debugging
options.view_as(StandardOptions).runner = "DataFlowRunner"
options.view_as(SetupOptions).setup_file = "./setup.py"

# see here for bigquery docs https://beam.apache.org/documentation/io/built-in/google-bigquery/
source_table_spec = bigquery.TableReference(projectId="studied-client-307710",
                                            datasetId="SMT_DWH",
                                            tableId="Risks")
sink_table_spec = bigquery.TableReference(projectId="studied-client-307710",
                                          datasetId="SMT_DWH",
                                          tableId="SMT_REF_Risk")

source = BigQuerySource(
    query="SELECT ROW_NUMBER() over(order by Parent) as CORE_ID_RiskId, Parent as CORE_LB_RiskParentReference, Level as CORE_LB_RiskLevel, Ref as CORE_LB_RiskReference, `Order` as CORE_LB_Order, Active as CORE_LB_Active, NameEN as CORE_LB_RiskNameEN, NameFR as CORE_LB_RiskNameFR, DefEN as CORE_LB_RiskDefEN, DefFR as CORE_LB_RiskDefFR, KeyRisk as CORE_LB_KeyRisk, NonConformityRisk as CORE_LB_NonConformityRisk, IdentificationDate as CORE_DT_IdentificationDate, Id as CORE_ID_RiskSourceId, LastMod as CORE_DT_LastModif, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted FROM `studied-client-307710.SMT_STG.Risks`",
    use_standard_sql=True)  # you can also use SQL queries
#source = BigQuerySource(source_table_spec)

#target = BigQuerySink(sink_table_spec, schema=table_schema)
#target = beam.io.WriteToText("output.txt")


def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)

    with beam.Pipeline(argv=pipeline_args) as p:
        raw_values = (
            p
            | "ReadTable" >> beam.io.Read(source)
            | "cleanup" >> beam.ParDo(ElementCleanup())
            | "writeTable" >> beam.io.WriteToBigQuery(
def run(argv=None, comments=None):
    """Run the beam pipeline.

    Args:
        argv: (optional) the command line flags to parse.
        comments: (optional) a list of comment JSON objects to process. Used in
            unit-tests to avoid requiring a BigQuery source.
    """
    args, pipeline_args = _parse_args(argv)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    if comments is not None:
        comments = p | ("Read in-memory comments") >> beam.Create(comments)
    else:
        comments = p | ("Read " + args.reddit_table) >> Read(
            BigQuerySource(args.reddit_table))

    comments |= ("Normalise comments" >> beam.Map(
        partial(normalise_comment, max_length=args.max_length)))

    thread_id_to_comments = comments | (
        "Key by thread id" >> beam.Map(lambda comment: (comment.thread_id, comment)))
    threads = thread_id_to_comments | (
        "Group comments by thread ID" >> beam.GroupByKey())
    threads = threads | ("Get threads" >> beam.Map(lambda t: t[1]))

    examples = threads | (
        "Create {} examples".format(args.dataset_format) >> beam.FlatMap(
            partial(
                create_examples,
                parent_depth=args.parent_depth,
                min_length=args.min_length,
                format=args.dataset_format,
            )))
    examples = _shuffle(examples)

    examples |= "split train and test" >> beam.ParDo(
        _TrainTestSplitFn(train_split=args.train_split)).with_outputs(
            _TrainTestSplitFn.TEST_TAG, _TrainTestSplitFn.TRAIN_TAG)

    if args.dataset_format == _JSON_FORMAT:
        write_sink = WriteToText
        file_name_suffix = ".json"
        serialize_fn = json.dumps
    else:
        assert args.dataset_format == _TF_FORMAT
        write_sink = WriteToTFRecord
        file_name_suffix = ".tfrecord"
        serialize_fn = _features_to_serialized_tf_example

    for name, tag in [("train", _TrainTestSplitFn.TRAIN_TAG),
                      ("test", _TrainTestSplitFn.TEST_TAG)]:
        serialized_examples = examples[tag] | (
            "serialize {} examples".format(name) >> beam.Map(serialize_fn))
        (serialized_examples | ("write " + name) >> write_sink(
            os.path.join(args.output_dir, name),
            file_name_suffix=file_name_suffix,
            num_shards=args.num_shards_train,
        ))

    result = p.run()
    result.wait_until_finish()
google_cloud_options.staging_location = "gs://projet_smart_gcp/staging"
google_cloud_options.temp_location = "gs://projet_smart_gcp/temp"
#options.view_as(StandardOptions).runner = "DirectRunner" # use this for debugging
options.view_as(StandardOptions).runner = "DataFlowRunner"
options.view_as(SetupOptions).setup_file = "./setup.py"

# see here for bigquery docs https://beam.apache.org/documentation/io/built-in/google-bigquery/
source_table_spec = bigquery.TableReference(projectId="studied-client-307710",
                                            datasetId="SMT_STG",
                                            tableId="CS_FindingsExportImport")
sink_table_spec = bigquery.TableReference(projectId="studied-client-307710",
                                          datasetId="SMT_DWH",
                                          tableId="SMT_REF_Finding")

source = BigQuerySource(
    query="SELECT distinct ROW_NUMBER() over(order by Reference) as CORE_ID_FindingId, Reference as CORE_LB_FindingReference, null as CORE_LB_FindingTitle, null as CORE_ID_AuditId, null as CORE_ID_WorkProgramAuditId, null as CORE_LB_FindingDescription, CS_ShowTop5Subject as CORE_LB_ShowTop5Subject, top.CORE_ID_Top5SubjectId as CORE_ID_Top5SubjectId, CS_PreviousAuditFinding as CORE_LB_PreviousAuditFinding, CS_Levelofrisk as CORE_LB_RiskLevel, null as CORE_LB_OtherSuggestedRisks, AttachedFileLink as CORE_LB_AttachedFileLink, pro.CORE_ID_ProcessId as CORE_ID_ProcessId, CS_WorkProgramProcess as CORE_LB_WorkProgramProcess, AuditReportDisplay as CORE_LB_AuditReportDisplay, AuditReportOrder as CORE_LB_AuditReportOrder, Id as CORE_ID_FindingSourceId, LastModif as CORE_DT_LastModif, Source as CORE_LB_Source, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted FROM `studied-client-307710.SMT_STG.CS_FindingsExportImport` imp left outer join `studied-client-307710.SMT_DWH.SMT_REF_Top5Subject` top on cast(top.CORE_ID_Top5SubjectSourceId as string) = imp.CS_CorresTop5Subject left outer join `studied-client-307710.SMT_DWH.SMT_REF_Process` pro on cast(pro.CORE_ID_ProcessSourceId as string ) = imp.CS_ReferenceProcess ",
    use_standard_sql=True)  # you can also use SQL queries
#source = BigQuerySource(source_table_spec)

#target = BigQuerySink(sink_table_spec, schema=table_schema)
#target = beam.io.WriteToText("output.txt")


def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)

    with beam.Pipeline(argv=pipeline_args) as p:
        raw_values = (
            p
            | "ReadTable" >> beam.io.Read(source)
            | "cleanup" >> beam.ParDo(ElementCleanup())
            | "writeTable" >> beam.io.WriteToBigQuery(
google_cloud_options.staging_location = "gs://projet_smart_gcp/staging"
google_cloud_options.temp_location = "gs://projet_smart_gcp/temp"
#options.view_as(StandardOptions).runner = "DirectRunner" # use this for debugging
options.view_as(StandardOptions).runner = "DataFlowRunner"
options.view_as(SetupOptions).setup_file = "./setup.py"

# see here for bigquery docs https://beam.apache.org/documentation/io/built-in/google-bigquery/
source_table_spec = bigquery.TableReference(projectId="studied-client-307710",
                                            datasetId="SMT_STG",
                                            tableId="CS_Top5Subjects")
sink_table_spec = bigquery.TableReference(projectId="studied-client-307710",
                                          datasetId="SMT_DWH",
                                          tableId="SMT_REF_Top5Subject")

source = BigQuerySource(
    query="SELECT ROW_NUMBER() over(order by CS_Id) as CORE_ID_Top5SubjectId, CS_Id as CORE_ID_Top5SubjectSourceId, CS_Reference as CORE_LB_Top5SubjectReference, CS_Ordre as CORE_LB_Top5SubjectOrder, CS_TitreFREN as CORE_LB_Top5SubjectNameEN, CS_TitreFRFR as CORE_LB_Top5SubjectNameFR, CS_Active as CORE_LB_Top5SubjectActive, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted FROM `studied-client-307710.SMT_STG.CS_Top5Subjects`",
    use_standard_sql=True)  # you can also use SQL queries
#source = BigQuerySource(source_table_spec)

#target = BigQuerySink(sink_table_spec, schema=table_schema)
#target = beam.io.WriteToText("output.txt")


def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)

    with beam.Pipeline(argv=pipeline_args) as p:
        raw_values = (
            p
            | "ReadTable" >> beam.io.Read(source)
            | "cleanup" >> beam.ParDo(ElementCleanup())
            | "writeTable" >> beam.io.WriteToBigQuery(
google_cloud_options.staging_location = "gs://projet_smart_gcp/staging"
google_cloud_options.temp_location = "gs://projet_smart_gcp/temp"
#options.view_as(StandardOptions).runner = "DirectRunner" # use this for debugging
options.view_as(StandardOptions).runner = "DataFlowRunner"
options.view_as(SetupOptions).setup_file = "./setup.py"

# see here for bigquery docs https://beam.apache.org/documentation/io/built-in/google-bigquery/
source_table_spec = bigquery.TableReference(
    projectId="studied-client-307710",
    datasetId="SMT_STG",
    tableId="Recommendations"
)
sink_table_spec = bigquery.TableReference(
    projectId="studied-client-307710",
    datasetId="SMT_DWH",
    tableId="SMT_REF_Recommendation"
)

source = BigQuerySource(
    query="SELECT distinct ROW_NUMBER() over(order by CS_MainRecommandation) as CORE_ID_RecommendationId, CS_LinkToAnExistingRecommandation as CORE_LB_LinkToAnExistingRecommandation, CS_MainRecommandation as CORE_LB_MainRecommandation, -1 as CORE_ID_AuditId, Template as CORE_LB_Template, Reference as CORE_LB_RecommendationReference, null as CORE_LB_RecommendationTitle, AllowFollowUp as CORE_LB_AllowFollowUp, -1 as CORE_ID_WorkProgramAuditId, pro.CORE_ID_ProcessId as CORE_ID_ProcessId, sub.CORE_ID_ProcessId as CORE_ID_SubProcessId, AuditedSite as CORE_LB_AuditedSiteCode, Archived as CORE_LB_Archived, ArchivingDate as CORE_LB_ArchivingDate, ArchivingResponsibleFor as CORE_LB_ArchivingResponsibleFor, Reason as CORE_LB_Reason, Services as CORE_LB_Services, CS_Levelofrisk as CORE_LB_LevelOfRisk, CS_OtherSuggestedRisks as CORE_LB_OtherSuggestedRisks, null as CORE_LB_OtherSuggestedRisksAll, null as CORE_LB_CopyOfDescription, CS_ZoneCommentsIA as CORE_LB_ZoneCommentsIA, CS_RecommendationAnswer as CORE_LB_RecommendationAnswer, CS_AcceptanceDate as CORE_DT_AcceptanceDate, null as CORE_LB_EntityComment, null as CORE_LB_AuditTeamAuditeesReason, CS_Status as CORE_LB_RecommendationStatus, CS_ImplemRatPercent as CORE_FL_ImplemRatPercent, CS_ImplemRatAudit as CORE_FL_ImplemRatAudit, Calendar as CORE_DT_Calendar, null as CORE_LB_Fonction, null as CORE_LB_InChargeOfRecoText, FUCreation as CORE_DT_FUCreation, CreatedBy as CORE_LB_CreatedBy, CreatedOn as CORE_DT_CreatedOn, ModifiedBy as CORE_LB_ModifiedBy, ModifiedOn as CORE_DT_ModifiedOn, Id as CORE_ID_RecommendationSourceId, LastModif as CORE_DT_LastModif, Source as CORE_LB_Source, AuditTypology as CORE_LB_AuditTypology, CS_FollowupCreation as CORE_DT_FollowUpCreation, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted FROM `studied-client-307710.SMT_STG.Recommendations` rec left outer join `studied-client-307710.SMT_DWH.SMT_REF_Process` pro on cast(pro.CORE_ID_ProcessSourceId as STRING) = rec.CS_ProcessLevel left outer join `studied-client-307710.SMT_DWH.SMT_REF_Process` sub on cast(sub.CORE_ID_ProcessSourceId as STRING) = rec.CS_SubprocessLevel ",
    use_standard_sql=True)  # you can also use SQL queries
#source = BigQuerySource(source_table_spec)

#target = BigQuerySink(sink_table_spec, schema=table_schema)
#target = beam.io.WriteToText("output.txt")


def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)

    with beam.Pipeline(argv=pipeline_args) as p:
        raw_values = (
            p
            | "ReadTable" >> beam.io.Read(source)
            | "cleanup" >> beam.ParDo(ElementCleanup())
            | "writeTable" >> beam.io.WriteToBigQuery(
                sink_table_spec,
                # schema=table_schema,
                write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,  #WRITE_TRUNCATE
"CORE_ID_ZoneSourceId": "STRING", "CORE_DT_LastMod": "DATETIME", "CORE_DT_RecordCreationDate": "DATETIME", "CORE_DT_RecordModificationDate": "DATETIME", "CORE_FL_IsDeleted": "INTEGER", "CORE_FL_Latitude": "FLOAT", "CORE_FL_Longitude": "FLOAT" } mapping_list = [{"name": k, "type": mapping[k]} for k in mapping.keys()] return json.JSONEncoder(sort_keys=True).encode({"fields": mapping_list}) table_schema = parse_table_schema_from_json(make_sink_schema()) source = BigQuerySource( query= "SELECT ROW_NUMBER() over(order by Code) as CORE_Id_ZoneId, Parent as CORE_LB_ZoneParentCode, Level as CORE_LB_Level, Code as CORE_LB_ZoneCode, EntityEN as CORE_LB_ZoneEntityEN, EntityFR as CORE_LB_ZoneEntityFR, EntityStatus as CORE_LB_ZoneEntityStatus, `Order` as CORE_LB_Order, Id as CORE_ID_ZoneSourceId, LastMod as CORE_DT_LastMod, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted, geo.Latitude as CORE_FL_Latitude, geo.Longitude as CORE_FL_Longitude FROM `studied-client-307710.SMT_STG.SecondAxis` ax left outer join `studied-client-307710.SMT_STG.Geographic_Coordinates` geo on geo.SubZone = ax.EntityEN", use_standard_sql=True) # you can also use SQL queries #source = BigQuerySource(source_table_spec) target = BigQuerySink(sink_table_spec, schema=table_schema) #target = beam.io.WriteToText("output.txt") def run(argv=None): parser = argparse.ArgumentParser() known_args, pipeline_args = parser.parse_known_args(argv) with beam.Pipeline(argv=pipeline_args) as p: raw_values = ( p | "ReadTable" >> beam.io.Read(source) | "cleanup" >> beam.ParDo(ElementCleanup()) | "writeTable" >> beam.io.WriteToBigQuery(
google_cloud_options.staging_location = "gs://projet_smart_gcp/staging"
google_cloud_options.temp_location = "gs://projet_smart_gcp/temp"
#options.view_as(StandardOptions).runner = "DirectRunner" # use this for debugging
options.view_as(StandardOptions).runner = "DataFlowRunner"
options.view_as(SetupOptions).setup_file = "./setup.py"

# see here for bigquery docs https://beam.apache.org/documentation/io/built-in/google-bigquery/
source_table_spec = bigquery.TableReference(projectId="studied-client-307710",
                                            datasetId="SMT_STG",
                                            tableId="RecommendationCriticity")
sink_table_spec = bigquery.TableReference(projectId="studied-client-307710",
                                          datasetId="SMT_DWH",
                                          tableId="SMT_REF_RiskLevel")

source = BigQuerySource(
    query="SELECT ROW_NUMBER() over(order by Code) as CORE_ID_RiskLevelId, Code as CORE_LB_RiskLevelCode, NameEN as CORE_LB_RiskLevelNameEN, NameFR as CORE_LB_RiskLevelNameFR, DescriptionEN as CORE_LB_RiskLevelDescriptionEN, DescriptionFR as CORE_LB_RiskLevelDescriptionFR, Active as CORE_LB_Active, `Order` as CORE_LB_Order, TextColor as CORE_LB_TextColor, BackgroundColor as CORE_LB_BackgroundColor, CreatedBy as CORE_LB_CreatedBy, CreatedOn as CORE_DT_CreatedOn, ModifiedBy as CORE_LB_ModifiedBy, ModifiedOn as CORE_DT_ModifiedOn, TrackingDisplay as CORE_LB_TrackingDisplay, Id as CORE_ID_RisklLevelSourceId, current_date() as CORE_DT_RecordCreationDate, current_date() as CORE_DT_RecordModificationDate, 0 as CORE_FL_IsDeleted FROM `studied-client-307710.SMT_STG.RecommendationCriticity`",
    use_standard_sql=True)  # you can also use SQL queries
#source = BigQuerySource(source_table_spec)

#target = BigQuerySink(sink_table_spec, schema=table_schema)
#target = beam.io.WriteToText("output.txt")


def run(argv=None):
    parser = argparse.ArgumentParser()
    known_args, pipeline_args = parser.parse_known_args(argv)

    with beam.Pipeline(argv=pipeline_args) as p:
        raw_values = (
            p
            | "ReadTable" >> beam.io.Read(source)
            | "cleanup" >> beam.ParDo(ElementCleanup())
            | "writeTable" >> beam.io.WriteToBigQuery(