Code Example #1
def run():
    PROJECT_ID = 'electric-spark-266716'  # change to your project id
    BUCKET = 'gs://global_surface_temperatures'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'country-beam-dataflow'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    #create query to select all elements for cleansing
    sql = 'SELECT dt, AverageTemperature, AverageTemperatureUncertainty, Country \
    FROM kaggle_modeled.Country as x'

    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    #read desired table from BigQuery
    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # write the raw input records to input_country.txt
    query_results | 'Write input' >> WriteToText(DIR_PATH +
                                                 'input_country.txt')

    # apply ParDo to filter out dates
    formatted_country_pcoll = query_results | 'Filter Dates' >> beam.ParDo(
        FilterDateFn())

    # display filtered countries
    formatted_country_pcoll | 'Write filtered dates' >> WriteToText(
        DIR_PATH + 'output_country.txt')

    #create new table in BigQuery
    dataset_id = 'kaggle_modeled'
    table_id = 'Country_Beam_DF'
    schema_id = 'dt:DATE,AverageTemperature:FLOAT,AverageTemperatureUncertainty:FLOAT,Country:STRING'

    # write PCollection to new BQ table
    formatted_country_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

    result = p.run()
    result.wait_until_finish()
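The FilterDateFn DoFn used above is defined elsewhere in the project. A minimal sketch of what such a date-filtering DoFn might look like, assuming the same `import apache_beam as beam` and that each BigQuery row arrives as a dict with a 'dt' field:

class FilterDateFn(beam.DoFn):
    def process(self, element):
        # keep only rows that carry a usable date value (assumed behavior)
        dt = element.get('dt')
        if dt is not None and str(dt).strip() != '':
            yield element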
Code Example #2
def run():
    PROJECT_ID = 'cs327e-sp2020'  # change to your project id
    BUCKET = 'gs://beam-output-data'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'takes-df'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    p = Pipeline(options=options)

    takes_sql = 'SELECT sid, cno, grade FROM college_workflow_modeled.Takes'
    class_sql = 'SELECT cid, cno FROM college_workflow_modeled.Class'

    takes_pcoll = p | 'Read from BQ Takes' >> beam.io.Read(
        beam.io.BigQuerySource(query=takes_sql, use_standard_sql=True))
    class_pcoll = p | 'Read from BQ Class' >> beam.io.Read(
        beam.io.BigQuerySource(query=class_sql, use_standard_sql=True))

    # write PCollections to log files
    takes_pcoll | 'Write log 1' >> WriteToText(DIR_PATH +
                                               'takes_query_results.txt')
    class_pcoll | 'Write log 2' >> WriteToText(DIR_PATH +
                                               'class_query_results.txt')

    # ParDo with side-input
    norm_takes_pcoll = takes_pcoll | 'Normalize Record' >> beam.ParDo(
        NormalizeDoFn(), beam.pvalue.AsList(class_pcoll))

    # write PCollection to log file
    norm_takes_pcoll | 'Write log 3' >> WriteToText(DIR_PATH +
                                                    'norm_takes_pcoll.txt')

    dataset_id = 'college_workflow_modeled'
    table_id = 'Takes_Beam_DF'
    schema_id = 'sid:STRING,cid:STRING,grade:STRING'

    # write PCollection to new BQ table
    norm_takes_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

    result = p.run()
    result.wait_until_finish()
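NormalizeDoFn is not shown in this snippet. A hedged sketch of a side-input DoFn that swaps cno for cid using the Class rows, consistent with the sid/cid/grade schema above but otherwise an assumption:

class NormalizeDoFn(beam.DoFn):
    def process(self, element, class_rows):
        # class_rows is the Class table delivered as a list side input
        cno_to_cid = {row['cno']: row['cid'] for row in class_rows}
        cid = cno_to_cid.get(element['cno'])
        if cid is not None:
            yield {'sid': element['sid'], 'cid': cid, 'grade': element['grade']}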
Code Example #3
def run():         
    PROJECT_ID = 'cs327e-sp2020' # change to your project id
    BUCKET = 'gs://beam-output-data' # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'student-df2'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    sql = 'SELECT sid, fname, lname, dob, status FROM college_workflow_modeled.Student'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # standardize the students' date of birth  
    formatted_dob_pcoll = query_results | 'Format DOB' >> beam.ParDo(FormatDOBFn())

    # write PCollection to log file
    formatted_dob_pcoll | 'Write log 1' >> WriteToText(DIR_PATH + 'formatted_dob_pcoll.txt')

    # group students by sid
    grouped_student_pcoll = formatted_dob_pcoll | 'Group by sid' >> beam.GroupByKey()

    # write PCollection to log file
    #grouped_student_pcoll | 'Write log 2' >> WriteToText(DIR_PATH + 'grouped_student_pcoll.txt')

    # remove duplicate student records
    distinct_student_pcoll = grouped_student_pcoll | 'Dedup student records' >> beam.ParDo(DedupStudentRecordsFn())

    # write PCollection to log file
    distinct_student_pcoll | 'Write log 3' >> WriteToText(DIR_PATH + 'distinct_student_pcoll.txt')

    dataset_id = 'college_workflow_modeled'
    table_id = 'Student_Beam_DF'
    schema_id = 'sid:STRING,fname:STRING,lname:STRING,dob:DATE,status:STRING'

    # write PCollection to new BQ table
    distinct_student_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(dataset=dataset_id, 
                                                  table=table_id, 
                                                  schema=schema_id,
                                                  project=PROJECT_ID,
                                                  create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                                                  write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
         
    result = p.run()
    result.wait_until_finish()      
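FormatDOBFn and DedupStudentRecordsFn are defined outside this snippet. A sketch of the key/group/dedup pattern they imply, assuming FormatDOBFn keys each row by sid so that GroupByKey can collapse duplicates (the source dob format is a guess):

class FormatDOBFn(beam.DoFn):
    def process(self, element):
        # assumed source format 'MM/DD/YYYY'; adjust to the actual data
        dob = datetime.datetime.strptime(element['dob'], '%m/%d/%Y')
        element['dob'] = dob.strftime('%Y-%m-%d')
        yield (element['sid'], element)

class DedupStudentRecordsFn(beam.DoFn):
    def process(self, element):
        sid, records = element  # records is an iterable of rows sharing one sid
        yield next(iter(records))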
Code Example #4
def run():
    PROJECT_ID = 'cs327e-sp2020'  # change to your project id
    BUCKET = 'gs://beam-output-data'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'teacher-df'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    sql = 'SELECT tid, instructor, dept FROM college_workflow_modeled.Teacher'
    query_results = p | 'Read from BigQuery' >> beam.io.Read(
        beam.io.BigQuerySource(query=sql, use_standard_sql=True))

    query_results | 'Write log 1' >> WriteToText('query_results.txt')

    teacher_pcoll = query_results | 'Standardize' >> beam.ParDo(
        StandardizeDoFn())

    teacher_pcoll | 'Write log 2' >> WriteToText('formatted_teacher_pcoll.txt')

    # group records by tid
    grouped_pcoll = teacher_pcoll | 'Group by tid' >> beam.GroupByKey()

    grouped_pcoll | 'Write log 3' >> WriteToText('grouped_teacher.txt')

    # remove duplicates
    distinct_pcoll = grouped_pcoll | 'Dedup' >> beam.ParDo(DedupRecordsDoFn())

    distinct_pcoll | 'Write log 4' >> WriteToText('distinct_teacher.txt')

    dataset_id = 'college_workflow_modeled'
    table_id = 'Teacher_Beam_DF'
    schema_id = 'tid:STRING,fname:STRING,lname:STRING,dept:STRING'

    distinct_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

    result = p.run()
    result.wait_until_finish()
Code Example #5
def run():
    PROJECT_ID = 'studied-brand-266702'  # change to your project id
    BUCKET = 'gs://beam_cs327e_project'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'vaccination-df'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    # run BigQuery query on dataset
    sql = 'SELECT * FROM vaers_modeled.Vaccination'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    input_pcoll = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # write input PCollection to input.txt
    input_pcoll | 'Write input_pcoll log 1' >> WriteToText(
        DIR_PATH + 'input_vaccination.txt')

    # standardize vaccination V_FUNDBY, VAX_ROUTE and VAX_SITE unknown/empty attribute
    formatted_vaccination_pcoll = input_pcoll | 'Format Unknown Values' >> beam.ParDo(
        FormatUnknownFn())

    # write PCollection to log file
    formatted_vaccination_pcoll | 'Write log 2' >> WriteToText(
        DIR_PATH + 'formatted_unknown_pcoll.txt')

    # specify id and schema
    dataset_id = 'vaers_modeled'
    table_id = 'Vaccination_Beam_DF'
    schema_id = 'VACCINATION_ID:INTEGER, VAERS_ID:INTEGER, VAX_DATE:DATE, VAX_ID:INTEGER, MANU_ID:INTEGER, V_ADMINBY:STRING, V_FUNDBY:STRING, VAX_ROUTE:STRING, VAX_SITE:STRING'

    # write output PCollection to new BQ table
    formatted_vaccination_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
        batch_size=int(100))

    result = p.run()
    result.wait_until_finish()
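FormatUnknownFn is referenced but not included here. A sketch that maps empty or missing V_FUNDBY, VAX_ROUTE and VAX_SITE values onto a single placeholder; the 'UNK' value is an assumption chosen for illustration:

class FormatUnknownFn(beam.DoFn):
    def process(self, element):
        for field in ('V_FUNDBY', 'VAX_ROUTE', 'VAX_SITE'):
            value = element.get(field)
            if value is None or str(value).strip() == '':
                element[field] = 'UNK'  # placeholder for unknown/empty values
        yield element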
Code Example #6
def run():
    PROJECT_ID = 'starry-center-266501'  # change to your project id
    BUCKET = 'gs://imdb-beam'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'format-date-df'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    sql = 'SELECT * FROM bollywood_modeled.bollywoodTitles'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # write PCollection to log file
    query_results | 'Write log 1' >> WriteToText(DIR_PATH +
                                                 'query_results.txt')

    # apply ParDo to format the release dates
    formatDate_pcoll = query_results | 'Format the dates' >> beam.ParDo(
        FormatDateFn())

    # write PCollection to log file
    formatDate_pcoll | 'Write log 2' >> WriteToText(DIR_PATH +
                                                    'formatDate_pcoll.txt')

    dataset_id = 'bollywood_modeled'
    table_id = 'bollywoodTitles_Beam_DF'
    schema_id = 'title:STRING,releaseDate:DATE,croresGrossed:NUMERIC'

    # write PCollection to new BQ table
    formatDate_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
    result = p.run()
    result.wait_until_finish()
Code Example #7
def run():
    PROJECT_ID = 'swift-area-266618'  # change to your project id
    BUCKET = 'gs://nullbusters_data'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'student-df5'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    sql = 'SELECT imdb_title_id, title, original_title, year, genre, duration, country, language, director, writer, production_company, actors, description, avg_vote, votes, budget, usa_gross_income, worlwide_gross_income, metascore, reviews_from_users, reviews_from_critics FROM imdb_modeled.Movies WHERE usa_gross_income IS NOT NULL and worlwide_gross_income IS NOT NULL'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    query_results | 'Write log Input' >> WriteToText('input.txt')

    # apply ParDo to cast the year fields to ints
    formatted_year_pcoll = query_results | 'Format Years' >> beam.ParDo(
        FormatYearsFn())

    # write PCollection to log file
    formatted_year_pcoll | 'Write log Output' >> WriteToText(DIR_PATH +
                                                             'output.txt')

    dataset_id = 'imdb_modeled'
    table_id = 'Movies_Beam_DF'
    schema_id = 'imdb_title_id:STRING, title:STRING, original_title:STRING, year:INTEGER, genre:STRING, duration:INTEGER, country:STRING, language:STRING, director:STRING, writer:STRING, production_company:STRING, actors:STRING, description:STRING, avg_votes:FLOAT, votes:INTEGER, budget_currency:STRING, budget:INTEGER, usa_gross_income:INTEGER, worlwide_gross_income_currency:STRING, worlwide_gross_income:INTEGER, metascore:FLOAT, reviews_from_users:FLOAT, reviews_from_critics:FLOAT'

    # write PCollection to new BQ table
    formatted_year_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
        batch_size=int(100))

    result = p.run()
    result.wait_until_finish()
Code Example #8
def run():
    PROJECT_ID = 'responsive-cab-267123'  # change to your project id
    BUCKET = 'gs://bmease_cs327e'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'
    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'foodmap-df'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'
    # Create beam pipeline using local runner
    p = Pipeline(options=options)

    # select Instacart products to be matched against USDA foods
    sql = "SELECT LOWER(product_name) AS product_name, product_id,p.aisle_id FROM instacart_modeled.Products p WHERE p.product_name not like '%Filters%' and p.aisle_id NOT IN (11,20,22,25,44,55,73,80,109,118,126,127,132,133,10,54,60,74,75,85,87,101,111,114,56,82,102)"
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)
    # write PCollection to input file
    query_results | 'Write to input.txt' >> WriteToText(DIR_PATH + 'input.txt')

    # apply ParDo to match each Instacart product to a USDA food_id
    # (the output rows pair food_id with product_id, per the schema below)
    nom_match_pcoll = query_results | 'Food and matches from nom' >> beam.ParDo(
        MatchProductFn())

    # write PCollection to output file
    nom_match_pcoll | 'Write to output.txt' >> WriteToText(DIR_PATH +
                                                           'output.txt')

    dataset_id = 'USDA_ERS_modeled'
    table_id = 'Food_Map_Beam_DF'
    schema_id = 'food_id:INTEGER,product_id:INTEGER'

    # write PCollection to new BQ table
    nom_match_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
        batch_size=int(100))

    result = p.run()
    result.wait_until_finish()
Code Example #9
def run():
    PROJECT_ID = 'swift-area-266618'  # change to your project id
    BUCKET = 'gs://nullbusters_data'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'directors'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    sql = 'SELECT name, birth_name, height, bio, birth_details, birth_year, place_of_birth, death_details, death_year, spouses, divorces, children, known_for_titles, imdb_title_id, director_name_id, category, reason_of_death FROM imdb_modeled.Directors WHERE birth_year IS NOT NULL AND death_year IS NOT NULL'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    query_results | 'Write log Input' >> WriteToText('input.txt')

    # apply ParDo to format directors' birth and death years to be ints
    formatted_year_pcoll = query_results | 'Format Years' >> beam.ParDo(
        FormatYearsFn())

    # write PCollection to log file
    formatted_year_pcoll | 'Write log Output' >> WriteToText(DIR_PATH +
                                                             'output.txt')

    dataset_id = 'imdb_modeled'
    table_id = 'Directors_Beam_DF'
    schema_id = 'name:STRING,birth_name:STRING,height:FLOAT,bio:STRING,birth_details:STRING,birth_year:INTEGER,place_of_birth:STRING,death_details:STRING,death_year:INTEGER,spouses:INTEGER,divorces:INTEGER,children:STRING,known_for_titles:STRING,imdb_title_id:STRING,category:STRING,reason_of_death:STRING,director_name_id:STRING'

    # write PCollection to new BQ table
    formatted_year_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
        batch_size=int(100))

    result = p.run()
    result.wait_until_finish()
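FormatYearsFn is defined elsewhere in the project. A sketch consistent with the comment above, casting the directors' birth_year and death_year values to ints; the defensive string cleanup is an assumption:

class FormatYearsFn(beam.DoFn):
    def process(self, element):
        for field in ('birth_year', 'death_year'):
            value = element.get(field)
            if value is not None:
                # values may arrive as strings such as '1956'; keep digits and cast
                digits = ''.join(ch for ch in str(value) if ch.isdigit())
                element[field] = int(digits) if digits else None
        yield element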
Code Example #10
def run():

    PROJECT_ID = 'cs327e-sp2020'  # change to your project id
    BUCKET = 'gs://beam-output-data'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'location'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    p = Pipeline(options=options)

    sql = 'SELECT * FROM covid_19_modeled.Location_SQL_1'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # extract city from state
    state_pcoll = query_results | 'Format State' >> beam.ParDo(FormatStateFn())

    grouped_pcoll = state_pcoll | 'Group Locations' >> beam.GroupByKey()

    unique_pcoll = grouped_pcoll | 'Remove Duplicates' >> beam.ParDo(
        RemoveDuplicatesFn())

    dataset_id = 'covid_19_modeled'
    table_id = 'Location_Beam_DF'
    schema_id = 'id:INTEGER,city:STRING,state:STRING,country:STRING,latitude:NUMERIC,longitude:NUMERIC,fips:INTEGER,admin2:STRING,combined_key:STRING'

    # write PCollection to BQ table
    unique_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

    result = p.run()
    result.wait_until_finish()
Code Example #11
def run():
    PROJECT_ID = 'cs327e-sp2020' # change to your project id
    BUCKET = 'gs://beam-output-data' # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'event'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    p = Pipeline(options=options)

    sql = 'SELECT * FROM covid_19_modeled.Event_SQL_1'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # format timestamp   
    ts_pcoll = query_results | 'Format Timestamp' >> beam.ParDo(FormatTimestampFn())
         
    # group by primary key
    grouped_pcoll = ts_pcoll | 'Group by PK' >> beam.GroupByKey()
         
    # remove duplicate records
    unique_pcoll = grouped_pcoll | 'Remove Duplicates' >> beam.ParDo(RemoveDuplicatesFn())

    # write new PCollection to log file
    unique_pcoll | 'Write log' >> WriteToText(DIR_PATH + 'unique_pcoll.txt')
        
    dataset_id = 'covid_19_modeled'
    table_id = 'Event_Beam_DF'
    schema_id = '''location_id:INTEGER,last_update:DATETIME,confirmed:INTEGER,deaths:INTEGER,recovered:INTEGER,active:INTEGER'''

    # write PCollection to BQ table
    unique_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(dataset=dataset_id, 
                                                  table=table_id, 
                                                  schema=schema_id,
                                                  project=PROJECT_ID,
                                                  create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                                                  write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
    result = p.run()
    result.wait_until_finish()      
Code Example #12
def run():
    PROJECT_ID = 'spry-cosine-266801'
    BUCKET = 'gs://icyhot-pack_beam'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'location-df'  # dataflow does not like '_' or special characters
    google_cloud_options.staging_location = BUCKET + '/staging'  #req*
    google_cloud_options.temp_location = BUCKET + '/temp'  #req*
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    sql = 'SELECT id, province_state, country_region FROM covid19_jhu_csse_modeled.location_id'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # format US
    formatted_us_pcoll = query_results | 'Format US' >> beam.ParDo(
        FormatUSFn())

    # write PCollection to log file
    formatted_us_pcoll | 'Write log 1' >> WriteToText(DIR_PATH +
                                                      'formatted_us_pcoll.txt')

    dataset_id = 'covid19_jhu_csse_modeled'
    table_id = 'location_id_Beam_DF'
    schema_id = 'id:INTEGER, province_state:STRING, country_region:STRING'

    # write PCollection to new BQ table
    formatted_us_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

    result = p.run()
    result.wait_until_finish()
Code Example #13
def run():
    PROJECT_ID = 'starry-center-266501'  # change to your project id
    BUCKET = 'gs://imdb-beam'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'split-characters-df'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    sql = 'SELECT DISTINCT tConst, nConst, job, characters FROM imdb_modeled.Characters'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # apply ParDo to split the characters field into individual records
    split_characters_pcoll = query_results | 'Split characters into dictionaries' >> beam.ParDo(
        SplitCharactersFn())

    # write PCollection to log file
    split_characters_pcoll | 'Write log 1' >> WriteToText(
        DIR_PATH + 'split_characters_pcoll.txt')

    dataset_id = 'imdb_modeled'
    table_id = 'Characters_Beam_DF'
    schema_id = 'tConst:STRING, nConst:STRING, job:STRING, characters:STRING'

    # write PCollection to new BQ table
    split_characters_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
    result = p.run()
    result.wait_until_finish()
Code Example #14
def run():
    pipeline_options = PipelineOptions()
    google_cloud_options = pipeline_options.view_as(GoogleCloudOptions)
    google_cloud_options.project = settings['gcp']['project']
    google_cloud_options.job_name = settings['job']['name']
    google_cloud_options.staging_location = f"gs://{settings['gcp']['bucket']}/staging"
    google_cloud_options.temp_location = f"gs://{settings['gcp']['bucket']}/temp"
    pipeline_options.view_as(StandardOptions).runner = 'DataflowRunner'
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as pipe:

        text = pipe | 'Read from text file' >> beam.io.ReadFromText(
            f"gs://twitter_data_etl/stuff/{settings['job']['input']}")

        tokens = (
            text
            | 'Remove RT from text' >> beam.Map(strip_extra, chars='RT')
            | 'Remove twitter handles' >> beam.Regex.replace_all(
                r'@[a-zA-Z0-9_]+', "")
            | 'Remove all url links' >> beam.Regex.replace_all(
                r'http[s]*://[a-zA-Z0-9_\.\/]+', '')
            | 'Remove punctuation' >> beam.Map(lambda text: text.translate(
                str.maketrans("", "", string.punctuation)))
            |
            'Remove all tabs' >> beam.Map(lambda text: text.replace("\t", ""))
            | 'Remove periods (not removed in punctuation step?)' >>
            beam.Map(lambda text: text.replace(".", ""))
            | 'Make all lowercase' >> beam.Map(lambda text: text.lower())
            | 'Split tweets into words' >> beam.ParDo(ExtractWordFromTweets()))

        words = (
            tokens
            | 'Prepare word tuples' >> beam.Map(lambda word: (word, 1))
            |
            'Group and sum the words to get counts' >> beam.CombinePerKey(sum)
            | 'Save to file' >> beam.io.WriteToText(
                settings['job']['output_words']))

        # This worked locally ... was having issues with getting the worker package installation in dataflow
        # emojis = (tokens | 'Filter to keep emojis only' >> beam.Filter(lambda token: token if token in emoji.UNICODE_EMOJI else False)
        # | 'Prepare emoji tuples' >> beam.Map(lambda emoji: (emoji, 1))
        # | 'Group and sum the emojis to get counts' >> beam.CombinePerKey(sum)
        # | 'Save emojis to text' >> beam.io.WriteToText(settings['job']['output_emojis'])
        # )

    # The `with` block above already runs the pipeline and waits for it to
    # finish on exit, so no explicit run()/wait_until_finish() is needed here.
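strip_extra and ExtractWordFromTweets come from the surrounding module. Hedged sketches consistent with how they are used above (signatures inferred from the calls, not taken from the source):

def strip_extra(text, chars=''):
    # drop a leading marker such as 'RT' plus surrounding whitespace (assumed behavior)
    return text.replace(chars, '', 1).strip() if chars else text.strip()

class ExtractWordFromTweets(beam.DoFn):
    def process(self, element):
        # emit each whitespace-separated token as its own element
        for word in element.split():
            yield word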
Code Example #15
def main(unused_argv):
    PyPIArtifactRegistry.register_artifact('beautifulsoup4', '>=4.9,<5.0')
    parser = argparse.ArgumentParser()
    parser.add_argument('-p',
                        '--port',
                        type=int,
                        help='port on which to serve the job api')
    parser.add_argument('--fully_qualified_name_glob', default=None)
    options = parser.parse_args()

    global server
    with fully_qualified_named_transform.FullyQualifiedNamedTransform.with_filter(
            options.fully_qualified_name_glob):
        server = grpc.server(thread_pool_executor.shared_unbounded_instance())
        beam_expansion_api_pb2_grpc.add_ExpansionServiceServicer_to_server(
            expansion_service.ExpansionServiceServicer(
                PipelineOptions([
                    "--experiments", "beam_fn_api", "--sdk_location",
                    "container"
                ])), server)
        beam_artifact_api_pb2_grpc.add_ArtifactRetrievalServiceServicer_to_server(
            artifact_service.ArtifactRetrievalService(
                artifact_service.BeamFilesystemHandler(None).file_reader),
            server)
        server.add_insecure_port('localhost:{}'.format(options.port))
        server.start()
        _LOGGER.info('Listening for expansion requests at %d', options.port)

        signal.signal(signal.SIGTERM, cleanup)
        signal.signal(signal.SIGINT, cleanup)
        # blocking main thread forever.
        signal.pause()
Code Example #16
def main(argv):
    parser = argparse.ArgumentParser()
    parser.add_argument('-p',
                        '--port',
                        type=int,
                        help='port on which to serve the job api')
    parser.add_argument('--fully_qualified_name_glob', default=None)
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(
        pipeline_args +
        ["--experiments=beam_fn_api", "--sdk_location=container"])

    with fully_qualified_named_transform.FullyQualifiedNamedTransform.with_filter(
            known_args.fully_qualified_name_glob):

        server = grpc.server(thread_pool_executor.shared_unbounded_instance())
        beam_expansion_api_pb2_grpc.add_ExpansionServiceServicer_to_server(
            expansion_service.ExpansionServiceServicer(pipeline_options),
            server)
        beam_artifact_api_pb2_grpc.add_ArtifactRetrievalServiceServicer_to_server(
            artifact_service.ArtifactRetrievalService(
                artifact_service.BeamFilesystemHandler(None).file_reader),
            server)
        server.add_insecure_port('localhost:{}'.format(known_args.port))
        server.start()
        _LOGGER.info('Listening for expansion requests at %d', known_args.port)

        def cleanup(unused_signum, unused_frame):
            _LOGGER.info('Shutting down expansion service.')
            server.stop(None)

        signal.signal(signal.SIGTERM, cleanup)
        signal.signal(signal.SIGINT, cleanup)
        # blocking main thread forever.
        signal.pause()
Code Example #17
File: pipeline_test.py Project: sarahwalters/beam
  def test_retry_fork_graph(self):
    pipeline_options = PipelineOptions(['--direct_runner_bundle_retry'])
    p = beam.Pipeline(options=pipeline_options)

    # TODO(mariagh): Remove the use of globals from the test.
    global count_b, count_c # pylint: disable=global-variable-undefined
    count_b, count_c = 0, 0

    def f_b(x):
      global count_b  # pylint: disable=global-variable-undefined
      count_b += 1
      raise Exception('exception in f_b')

    def f_c(x):
      global count_c  # pylint: disable=global-variable-undefined
      count_c += 1
      raise Exception('exception in f_c')

    names = p | 'CreateNodeA' >> beam.Create(['Ann', 'Joe'])

    fork_b = names | 'SendToB' >> beam.Map(f_b) # pylint: disable=unused-variable
    fork_c = names | 'SendToC' >> beam.Map(f_c) # pylint: disable=unused-variable

    with self.assertRaises(Exception):
      p.run().wait_until_finish()
    assert count_b == count_c == 4
Code Example #18
def main():
    args = parse_args()
    pipeline_options = PipelineOptions(**vars(args))
    pipeline = beam.Pipeline(options=pipeline_options)
    train_files = glob.glob("./mnist_images/train" + os.sep + "*.jpg")
    eval_files = glob.glob("./mnist_images/eval" + os.sep + "*.jpg")
    _ = (
        pipeline
        | 'ListTrainFiles' >> beam.Create(train_files)
        | 'TrainReadFiles' >> beam.Map(lambda path: (read_from_path(path)))
        | 'WriteToTrainTfrecord' >> beam.io.tfrecordio.WriteToTFRecord(
            file_path_prefix=path.join("mnist_tfrecords", "train", "train"),
            compression_type=beam.io.filesystems.CompressionTypes.UNCOMPRESSED,
            coder=coders.ExampleProtoCoder(tfrecord_schema()),
            file_name_suffix='.tfrecord'))
    _ = (
        pipeline
        | 'ListEvalFiles' >> beam.Create(eval_files)
        | 'EvalReadFiles' >> beam.Map(lambda path: (read_from_path(path)))
        | 'WriteToEvalTfrecord' >> beam.io.tfrecordio.WriteToTFRecord(
            file_path_prefix=path.join("mnist_tfrecords", "eval", "eval"),
            compression_type=beam.io.filesystems.CompressionTypes.UNCOMPRESSED,
            coder=coders.ExampleProtoCoder(tfrecord_schema()),
            file_name_suffix='.tfrecord'))
    pipeline.run().wait_until_finish()
Code Example #19
def run():         
    PROJECT_ID = 'corvid-276516'
    BUCKET = 'gs://covid-bucket19' # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'format-codes--df'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)
    
    sql = 'SELECT * FROM covid_staging.googleMobility ORDER BY date, country_region' # passing a query. Shouldn't process more than 1000 records w DR
   
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True) # direct runner is not running in parallel on several workers. DR is local

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source) # read results and assign them to a new p-collection

     # call pardo, pipe query results to pardo
    format_alphaCode_pcoll = query_results | 'Change the country code for Greece, the UK, and Hong Kong. Drop Reunion' >> beam.ParDo(format_alphaCodeFn()) 

     # write PCollection to log file
    format_alphaCode_pcoll | 'Write log 1' >> WriteToText('geodist_beam.txt') 

    dataset_id = 'covid_modeled'
    table_id = 'mobility_beam'
    schema_id = 'code:STRING, country:STRING, date:DATE, average_change:INTEGER, retail_and_recreation:INTEGER, grocery_and_pharmacy:INTEGER, parks:INTEGER, transit_stations:INTEGER, workplaces:INTEGER,residential:INTEGER'

     # write PCollection to new BQ table
    format_alphaCode_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(dataset=dataset_id, 
                                                table=table_id, 
                                                schema=schema_id, 
                                                project=PROJECT_ID,
                                                create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                                                write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE, 
                                                batch_size=int(100))
     
    result = p.run()
    result.wait_until_finish()      
Code Example #20
File: pipeline_test.py Project: jamesnicolas/beam
    def test_view_as(self):
        generic_options = PipelineOptions(['--slices=3'])
        self.assertEqual(3, generic_options.view_as(Bacon).slices)
        self.assertEqual(3, generic_options.view_as(Breakfast).slices)

        generic_options.view_as(Breakfast).slices = 10
        self.assertEqual(10, generic_options.view_as(Bacon).slices)

        with self.assertRaises(AttributeError):
            generic_options.slices  # pylint: disable=pointless-statement

        with self.assertRaises(AttributeError):
            generic_options.view_as(Eggs).slices  # pylint: disable=expression-not-assigned
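The Bacon, Eggs and Breakfast option classes belong to the surrounding test module. A sketch of definitions consistent with the assertions above, where only Bacon (and, through inheritance, Breakfast) declares --slices:

class Bacon(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_argument('--slices', type=int)

class Eggs(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_argument('--style', default='scrambled')  # illustrative option only

class Breakfast(Bacon, Eggs):
    pass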
Code Example #21
def run():

    PROJECT_ID = 'data-lake-290221'
    BUCKET = 'gs://dataflow-log-data'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    options = PipelineOptions(flags=None,
                              runner='DirectRunner',
                              project=PROJECT_ID,
                              job_name='transpose',
                              temp_location=BUCKET + '/temp',
                              region='us-central1')

    options.view_as(SetupOptions).save_main_session = True

    p = beam.pipeline.Pipeline(options=options)

    sql = '''select farm_fingerprint(concat(cast(latitude as string), cast(longitude as string))) as location_id, * from covid19_confirmed.raw_cases'''

    #bq_source = ReadFromBigQuery(query=sql, use_standard_sql=True, gcs_location=BUCKET)
    bq_source = BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Transpose' >> beam.ParDo(Transpose())

    #out_pcoll | 'Write to log' >> WriteToText('records.txt')

    dataset_id = 'covid19_confirmed'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'daily_cases'
    schema_id = 'location_id:INTEGER,date:DATE,cases:INTEGER'

    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
Code Example #22
  def test_view_as(self):
    generic_options = PipelineOptions(['--slices=3'])
    self.assertEqual(3, generic_options.view_as(Bacon).slices)
    self.assertEqual(3, generic_options.view_as(Breakfast).slices)

    generic_options.view_as(Breakfast).slices = 10
    self.assertEqual(10, generic_options.view_as(Bacon).slices)

    with self.assertRaises(AttributeError):
      generic_options.slices  # pylint: disable=pointless-statement

    with self.assertRaises(AttributeError):
      generic_options.view_as(Eggs).slices  # pylint: disable=expression-not-assigned
Code Example #23
def main(unused_argv):
    parser = argparse.ArgumentParser()
    parser.add_argument('-p',
                        '--port',
                        type=int,
                        help='port on which to serve the job api')
    options = parser.parse_args()
    global server
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=2))
    beam_expansion_api_pb2_grpc.add_ExpansionServiceServicer_to_server(
        expansion_service.ExpansionServiceServicer(PipelineOptions()), server)
    server.add_insecure_port('localhost:{}'.format(options.port))
    server.start()
    logging.info('Listening for expansion requests at %d', options.port)

    # blocking main thread forever.
    signal.pause()
Code Example #24
def main(unused_argv):
    parser = argparse.ArgumentParser()
    parser.add_argument('-p',
                        '--port',
                        type=int,
                        help='port on which to serve the job api')
    options = parser.parse_args()
    global server
    server = grpc.server(UnboundedThreadPoolExecutor())
    beam_expansion_api_pb2_grpc.add_ExpansionServiceServicer_to_server(
        expansion_service.ExpansionServiceServicer(
            PipelineOptions(["--experiments", "beam_fn_api"])), server)
    server.add_insecure_port('localhost:{}'.format(options.port))
    server.start()
    _LOGGER.info('Listening for expansion requests at %d', options.port)

    signal.signal(signal.SIGTERM, cleanup)
    signal.signal(signal.SIGINT, cleanup)
    # blocking main thread forever.
    signal.pause()
Code Example #25
def main(unused_argv):
    parser = argparse.ArgumentParser()
    parser.add_argument('-p',
                        '--port',
                        type=int,
                        help='port on which to serve the job api')
    options = parser.parse_args()
    global server
    server = grpc.server(UnboundedThreadPoolExecutor())

    # DOCKER SDK Harness
    beam_expansion_api_pb2_grpc.add_ExpansionServiceServicer_to_server(
        expansion_service.ExpansionServiceServicer(
            PipelineOptions([
                "--experiments", "beam_fn_api", "--sdk_location", "container"
            ])), server)

    # PROCESS SDK Harness
    # beam_expansion_api_pb2_grpc.add_ExpansionServiceServicer_to_server(
    #     expansion_service.ExpansionServiceServicer(
    #         PipelineOptions.from_dictionary({
    #             'environment_type': 'PROCESS',
    #             'environment_config': '{"command": "sdks/python/container/build/target/launcher/darwin_amd64/boot"}',
    #             'experiments': 'beam_fn_api',
    #             'sdk_location': 'container',
    #         })
    #     ), server
    # )

    server.add_insecure_port('localhost:{}'.format(options.port))
    server.start()
    _LOGGER.info('Listening for expansion requests at %d', options.port)

    signal.signal(signal.SIGTERM, cleanup)
    signal.signal(signal.SIGINT, cleanup)
    # blocking main thread forever.
    signal.pause()
Code Example #26
def run():
    PROJECT_ID = 'electric-spark-266716'  # change to your project id
    BUCKET = 'gs://global_surface_temperatures'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'population-statistics-beam-dataflow-2'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    #create query to select all elements for cleansing
    sql = 'SELECT * FROM kaggle2_modeled.Population_Statistics'

    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    #read desired table from BigQuery
    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    #write inputs to input.txt
    query_results | 'Write input' >> WriteToText(DIR_PATH + 'input.txt')

    # apply ParDo to transpose the date columns into rows
    transposed_date_pcoll = query_results | 'Transpose Dates' >> beam.ParDo(
        TransposeDateFn())

    # write transposed records to transposed.txt
    transposed_date_pcoll | 'Write transpose Dates' >> WriteToText(
        DIR_PATH + 'transposed.txt')

    #flatten list to get individual records
    flatten_pcoll = transposed_date_pcoll | 'Flatten lists' >> beam.FlatMap(
        generate_elements)

    # write the resulting PCollection to output_final_dates.txt
    flatten_pcoll | 'Write output' >> WriteToText(DIR_PATH +
                                                  'output_final_dates.txt')

    #create new table in BigQuery
    dataset_id = 'kaggle2_modeled'
    table_id = 'Population_Statistics_Beam_DF'
    schema_id = 'dt:DATE,countryName:STRING,countryCode:STRING, \
    metric:STRING,metricCode:STRING,statistic:FLOAT'

    # write PCollection to new BQ table
    flatten_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

    result = p.run()
    result.wait_until_finish()
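generate_elements is not included in this snippet. A minimal sketch of the flattening helper it appears to be, yielding each record from the list produced by TransposeDateFn:

def generate_elements(elements):
    # emit each transposed record so downstream transforms see one row at a time
    for element in elements:
        yield element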
Code Example #27
def run():
     PROJECT_ID = 'studied-brand-266702' # change to your project id
     BUCKET = 'gs://beam_cs327e_project' # change to your bucket name
     DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '/'

     # Create and set your PipelineOptions.
     options = PipelineOptions(flags=None)

     # For Dataflow execution, set the project, job_name,
     # staging location, temp_location and specify DataflowRunner.
     google_cloud_options = options.view_as(GoogleCloudOptions)
     google_cloud_options.project = PROJECT_ID
     google_cloud_options.job_name = 'adverse-event-df'
     google_cloud_options.staging_location = BUCKET + '/staging'
     google_cloud_options.temp_location = BUCKET + '/temp'
     options.view_as(StandardOptions).runner = 'DataflowRunner'

     # Create the Pipeline with the specified options.
     p = Pipeline(options=options)
     
     # run BigQuery query on dataset
     sql = 'SELECT * FROM vaers_modeled.Adverse_Event'
     bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

     input_pcoll = p | 'Read from BigQuery' >> beam.io.Read(bq_source)
        
     # write input PCollection to input.txt
     input_pcoll | 'Write input_pcoll log 1' >> WriteToText(DIR_PATH + 'input.txt')
        
     # standardize adverse_event RECOVD attribute values into true or false (boolean)
     formatted_recovd_pcoll = input_pcoll | 'Format RECOVD' >> beam.ParDo(FormatRECOVDFn())
        
     # write PCollection to log file
     formatted_recovd_pcoll | 'Write log 2' >> WriteToText(DIR_PATH + 'formatted_recovd_pcoll.txt')
        
     # standardize adverse_event BIRTH_DEFECT attribute values into true or false (boolean)
     formatted_defect_pcoll = formatted_recovd_pcoll | 'Format BIRTH_DEFECT' >> beam.ParDo(FormatBIRTH_DEFECTFn())
        
     # write PCollection to log file
     formatted_defect_pcoll | 'Write log 3' >> WriteToText(DIR_PATH + 'formatted_defect_pcoll.txt')
        
     # standardize boolean attribute values which are null into false 
     output_pcoll = formatted_defect_pcoll | 'Format boolean attributes' >> beam.ParDo(FormatBooleanAttributesFn())
        
     # write output PCollection to output.txt
     output_pcoll | 'Write output_pcoll log 4' >> WriteToText(DIR_PATH + 'output.txt')
     
     # specify id and schema
     dataset_id = 'vaers_modeled'
     table_id = 'Adverse_Event_Beam_DF'
     # change RECOVD and BIRTH_DEFECT attributes into BOOLEANS
     schema_id = 'VAERS_ID:INTEGER, ONSET_DATE:DATE, RECOVD:BOOLEAN, DIED:BOOLEAN, DATEDIED:DATE, L_THREAT:BOOLEAN, OFC_VISIT:BOOLEAN, ER_VISIT:BOOLEAN, ER_ED_VISIT:BOOLEAN, HOSPITAL:BOOLEAN, HOSPDAYS:INTEGER, X_STAY:BOOLEAN, DISABLE:BOOLEAN, BIRTH_DEFECT:BOOLEAN, OTHER_MEDS:STRING, CUR_ILL:STRING, HISTORY:STRING, PRIOR_VAX:STRING' 

     # write output PCollection to new BQ table
     output_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(dataset=dataset_id, 
                                                  table=table_id, 
                                                  schema=schema_id,
                                                  project=PROJECT_ID,
                                                  create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                                                  write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
                                                  batch_size=int(100))
         
     result = p.run()
     result.wait_until_finish()      
Code Example #28
import apache_beam as beam
import re

from apache_beam.options.pipeline_options import PipelineOptions

pipeline_args = [
    #1. DataflowRunner runs the pipeline on Google Cloud Dataflow
    '--runner=DataflowRunner',
    #2. Google Cloud Project ID
    '--project=your-project-ID',
    #3. Google Cloud Storage path is required for staging local files
    '--staging_location=gs://your-bucket-name/staging',
    #4. Google Cloud Storage path is required for temporary files
    '--temp_location=gs://your-bucket-name/temp',
    #5. (Optional) Job name to be displayed in the logs
    '--job_name=word-count-job'
]
pipeline_options = PipelineOptions(pipeline_args)
pipeline = beam.Pipeline(options=pipeline_options)

# Data Transforms
(pipeline
 | 'read file' >>
 beam.io.ReadFromText('gs://dataflow-samples/shakespeare/kinglear.txt')
 | 'get words' >> beam.FlatMap(lambda x: re.findall(r'\w+', x))
 | 'count words' >> beam.combiners.Count.PerElement()
 | 'save' >> beam.io.WriteToText('gs://your-bucket-name/wordcount-output.txt'))
pipeline.run()
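For quick local testing, the same transforms can be run with the DirectRunner by swapping the runner flag; a sketch with a local output path chosen for illustration:

local_pipeline = beam.Pipeline(options=PipelineOptions(['--runner=DirectRunner']))
(local_pipeline
 | 'read file' >>
 beam.io.ReadFromText('gs://dataflow-samples/shakespeare/kinglear.txt')
 | 'get words' >> beam.FlatMap(lambda x: re.findall(r'\w+', x))
 | 'count words' >> beam.combiners.Count.PerElement()
 | 'save' >> beam.io.WriteToText('wordcount-local-output'))
local_pipeline.run().wait_until_finish()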
Code Example #29
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

options = PipelineOptions()


class MyOptions(PipelineOptions):
    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_argument('--input',
                            help='Input for the pipeline',
                            default='./data/')
        parser.add_argument('--output',
                            help='Output for the pipeline',
                            default='./output/')


class Split(beam.DoFn):
    def process(self, element):
        try:
            Date, Open, High, Low, Close, Volume = element.split(',')
            return [{'Open': float(Open), 'Close': float(Close)}]
        except ValueError:
            # malformed rows (wrong field count or non-numeric values) fall back to zeros
            return [{'Open': 0.0, 'Close': 0.0}]


class CollectOpen(beam.DoFn):
    def process(self, element):
        result = [(1, element['Open'])]
        return result
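The snippet defines the DoFns but does not wire them into a pipeline. A hedged sketch of how they might be composed to compute a mean opening price, assuming a headerless CSV whose path is made up for illustration:

with beam.Pipeline(options=options) as p:
    (p
     | 'Read rows' >> beam.io.ReadFromText('./data/stocks.csv')  # hypothetical input file
     | 'Parse rows' >> beam.ParDo(Split())
     | 'Key the Open field' >> beam.ParDo(CollectOpen())
     | 'Mean per key' >> beam.CombinePerKey(beam.combiners.MeanCombineFn())
     | 'Write result' >> beam.io.WriteToText('./output/mean_open'))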
Code Example #30
 def __init__(self):
   super(EventsPipeline, self).__init__(options=PipelineOptions())
Code Example #31
def run(in_args=None):
    """Runs the pre-processing pipeline."""

    pipeline_options = PipelineOptions.from_dictionary(vars(in_args))
    with beam.Pipeline(options=pipeline_options) as p:
        configure_pipeline(p, in_args)