Example #1
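The examples below are standalone run() functions lifted out of their modules, so none of them show their imports. They assume roughly the following (a sketch; individual examples additionally reference module-level names such as settings and the DoFn classes defined elsewhere in their source files):

import datetime
import string

import apache_beam as beam
from apache_beam import Pipeline
from apache_beam.io import BigQuerySource, WriteToBigQuery, WriteToText
from apache_beam.options.pipeline_options import (
    GoogleCloudOptions,
    PipelineOptions,
    SetupOptions,
    StandardOptions,
)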
def run():
    PROJECT_ID = 'electric-spark-266716'  # change to your project id
    BUCKET = 'gs://global_surface_temperatures'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'country-beam-dataflow'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    #create query to select all elements for cleansing
    sql = 'SELECT dt, AverageTemperature, AverageTemperatureUncertainty, Country \
    FROM kaggle_modeled.Country as x'

    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    #read desired table from BigQuery
    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    #write inputs to input.txt
    query_results | 'Write input' >> WriteToText(DIR_PATH +
                                                 'input_country.txt')

    # apply ParDo to filter out dates
    formatted_country_pcoll = query_results | 'Filter Dates' >> beam.ParDo(
        FilterDateFn())

    # display filtered countries
    formatted_country_pcoll | 'Write filtered dates' >> WriteToText(
        DIR_PATH + 'output_country.txt')

    #create new table in BigQuery
    dataset_id = 'kaggle_modeled'
    table_id = 'Country_Beam_DF'
    schema_id = 'dt:DATE,AverageTemperature:FLOAT,AverageTemperatureUncertainty:FLOAT,Country:STRING'

    # write PCollection to new BQ table
    formatted_country_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

    result = p.run()
    result.wait_until_finish()
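FilterDateFn is referenced above but not defined in this listing. The exact filtering rule isn't shown; a minimal sketch, assuming it simply drops rows with a missing date or temperature (elements arrive as dicts from the BigQuery read):

class FilterDateFn(beam.DoFn):
    # Hypothetical sketch: keep only rows that have both a date and a temperature.
    def process(self, element):
        if element.get('dt') is not None and element.get('AverageTemperature') is not None:
            yield element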
Example #2
def run():
    PROJECT_ID = 'cs327e-sp2020' # change to your project id
    BUCKET = 'gs://beam-output-data' # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'student-df2'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    sql = 'SELECT sid, fname, lname, dob, status FROM college_workflow_modeled.Student'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # standardize the students' date of birth  
    formatted_dob_pcoll = query_results | 'Format DOB' >> beam.ParDo(FormatDOBFn())

    # write PCollection to log file
    formatted_dob_pcoll | 'Write log 1' >> WriteToText(DIR_PATH + 'formatted_dob_pcoll.txt')

    # group students by sid
    grouped_student_pcoll = formatted_dob_pcoll | 'Group by sid' >> beam.GroupByKey()

    # write PCollection to log file
    #grouped_student_pcoll | 'Write log 2' >> WriteToText(DIR_PATH + 'grouped_student_pcoll.txt')

    # remove duplicate student records
    distinct_student_pcoll = grouped_student_pcoll | 'Dedup student records' >> beam.ParDo(DedupStudentRecordsFn())

    # write PCollection to log file
    distinct_student_pcoll | 'Write log 3' >> WriteToText(DIR_PATH + 'distinct_student_pcoll.txt')

    dataset_id = 'college_workflow_modeled'
    table_id = 'Student_Beam_DF'
    schema_id = 'sid:STRING,fname:STRING,lname:STRING,dob:DATE,status:STRING'

    # write PCollection to new BQ table
    distinct_student_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(dataset=dataset_id, 
                                                  table=table_id, 
                                                  schema=schema_id,
                                                  project=PROJECT_ID,
                                                  create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                                                  write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
         
    result = p.run()
    result.wait_until_finish()      
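FormatDOBFn and DedupStudentRecordsFn are not shown either. A sketch of the pattern the pipeline implies: normalize the dob (here assumed to arrive as MM/DD/YYYY) and key each record by sid, then, after GroupByKey, keep one record per key:

class FormatDOBFn(beam.DoFn):
    # Hypothetical sketch: normalize dob and key the record by sid for GroupByKey.
    def process(self, element):
        dob = element.get('dob')
        if dob and '/' in str(dob):
            # assumed raw format: MM/DD/YYYY
            month, day, year = str(dob).split('/')
            element['dob'] = '{}-{}-{}'.format(year, month.zfill(2), day.zfill(2))
        yield (element['sid'], element)


class DedupStudentRecordsFn(beam.DoFn):
    # Hypothetical sketch: after GroupByKey the element is (sid, iterable of
    # records); emit a single record per sid.
    def process(self, element):
        sid, records = element
        yield next(iter(records))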
Example #3
def run():
    PROJECT_ID = 'cs327e-sp2020'  # change to your project id
    BUCKET = 'gs://beam-output-data'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'takes-df'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    p = Pipeline(options=options)

    takes_sql = 'SELECT sid, cno, grade FROM college_workflow_modeled.Takes'
    class_sql = 'SELECT cid, cno FROM college_workflow_modeled.Class'

    takes_pcoll = p | 'Read from BQ Takes' >> beam.io.Read(
        beam.io.BigQuerySource(query=takes_sql, use_standard_sql=True))
    class_pcoll = p | 'Read from BQ Class' >> beam.io.Read(
        beam.io.BigQuerySource(query=class_sql, use_standard_sql=True))

    # write PCollections to log files
    takes_pcoll | 'Write log 1' >> WriteToText(DIR_PATH +
                                               'takes_query_results.txt')
    class_pcoll | 'Write log 2' >> WriteToText(DIR_PATH +
                                               'class_query_results.txt')

    # ParDo with side-input
    norm_takes_pcoll = takes_pcoll | 'Normalize Record' >> beam.ParDo(
        NormalizeDoFn(), beam.pvalue.AsList(class_pcoll))

    # write PCollection to log file
    norm_takes_pcoll | 'Write log 3' >> WriteToText(DIR_PATH +
                                                    'norm_takes_pcoll.txt')

    dataset_id = 'college_workflow_modeled'
    table_id = 'Takes_Beam_DF'
    schema_id = 'sid:STRING,cid:STRING,grade:STRING'

    # write PCollection to new BQ table
    norm_takes_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

    result = p.run()
    result.wait_until_finish()
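NormalizeDoFn is the interesting piece here: it takes the Class rows as a side input (beam.pvalue.AsList) and, judging by the output schema, swaps each cno for its cid. A sketch under that assumption:

class NormalizeDoFn(beam.DoFn):
    # Hypothetical sketch: use the Class rows passed as a side input to look up
    # the cid that corresponds to each Takes record's cno.
    # (Building the lookup per element keeps the sketch simple; a real DoFn
    # might cache it.)
    def process(self, element, class_rows):
        cno_to_cid = {row['cno']: row['cid'] for row in class_rows}
        yield {
            'sid': element['sid'],
            'cid': cno_to_cid.get(element['cno']),
            'grade': element['grade'],
        }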
Example #4
def run():
    PROJECT_ID = 'studied-brand-266702'  # change to your project id
    BUCKET = 'gs://beam_cs327e_project'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'vaccination-df'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    # run BigQuery query on dataset
    sql = 'SELECT * FROM vaers_modeled.Vaccination'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    input_pcoll = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # write input PCollection to input.txt
    input_pcoll | 'Write input_pcoll log 1' >> WriteToText(
        DIR_PATH + 'input_vaccination.txt')

    # standardize unknown/empty values in the vaccination V_FUNDBY, VAX_ROUTE and VAX_SITE attributes
    formatted_vaccination_pcoll = input_pcoll | 'Format Unknown Values' >> beam.ParDo(
        FormatUnknownFn())

    # write PCollection to log file
    formatted_vaccination_pcoll | 'Write log 2' >> WriteToText(
        DIR_PATH + 'formatted_unknown_pcoll.txt')

    # specify id and schema
    dataset_id = 'vaers_modeled'
    table_id = 'Vaccination_Beam_DF'
    schema_id = 'VACCINATION_ID:INTEGER, VAERS_ID:INTEGER, VAX_DATE:DATE, VAX_ID:INTEGER, MANU_ID:INTEGER, V_ADMINBY:STRING, V_FUNDBY:STRING, VAX_ROUTE:STRING, VAX_SITE:STRING'

    # write output PCollection to new BQ table
    formatted_vaccination_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
        batch_size=int(100))

    result = p.run()
    result.wait_until_finish()
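FormatUnknownFn is not defined in this listing. Based on the comment above, a sketch that maps empty or placeholder values in the three attributes to None (the real cleansing rules may differ):

class FormatUnknownFn(beam.DoFn):
    # Hypothetical sketch: turn empty strings and 'UNK' markers into None so that
    # BigQuery stores NULL rather than placeholder text.
    def process(self, element):
        for field in ('V_FUNDBY', 'VAX_ROUTE', 'VAX_SITE'):
            value = element.get(field)
            if value is None or str(value).strip() in ('', 'UNK'):
                element[field] = None
        yield element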
Example #5
def run():
    PROJECT_ID = 'cs327e-sp2020'  # change to your project id
    BUCKET = 'gs://beam-output-data'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'teacher-df'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    sql = 'SELECT tid, instructor, dept FROM college_workflow_modeled.Teacher'
    query_results = p | 'Read from BigQuery' >> beam.io.Read(
        beam.io.BigQuerySource(query=sql, use_standard_sql=True))

    query_results | 'Write log 1' >> WriteToText('query_results.txt')

    teacher_pcoll = query_results | 'Standardize' >> beam.ParDo(
        StandardizeDoFn())

    teacher_pcoll | 'Write log 2' >> WriteToText('formatted_teacher_pcoll.txt')

    # group records by tid
    grouped_pcoll = teacher_pcoll | 'Group by tid' >> beam.GroupByKey()

    grouped_pcoll | 'Write log 3' >> WriteToText('grouped_teacher.txt')

    # remove duplicates
    distinct_pcoll = grouped_pcoll | 'Dedup' >> beam.ParDo(DedupRecordsDoFn())

    distinct_pcoll | 'Write log 4' >> WriteToText('distinct_teacher.txt')

    dataset_id = 'college_workflow_modeled'
    table_id = 'Teacher_Beam_DF'
    schema_id = 'tid:STRING,fname:STRING,lname:STRING,dept:STRING'

    distinct_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

    result = p.run()
    result.wait_until_finish()
Example #6
def run():
    PROJECT_ID = 'starry-center-266501'  # change to your project id
    BUCKET = 'gs://imdb-beam'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'format-date-df'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    sql = 'SELECT * FROM bollywood_modeled.bollywoodTitles'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # write PCollection to log file
    query_results | 'Write log 1' >> WriteToText(DIR_PATH +
                                                 'query_results.txt')

    # apply ParDo to format the release dates
    formatDate_pcoll = query_results | 'Format the dates' >> beam.ParDo(
        FormatDateFn())

    # write PCollection to log file
    formatDate_pcoll | 'Write log 2' >> WriteToText(DIR_PATH +
                                                    'formatDate_pcoll.txt')

    dataset_id = 'bollywood_modeled'
    table_id = 'bollywoodTitles_Beam_DF'
    schema_id = 'title:STRING,releaseDate:DATE,croresGrossed:NUMERIC'

    # write PCollection to new BQ table
    formatDate_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
    result = p.run()
    result.wait_until_finish()
Example #7
def run():
    PROJECT_ID = 'swift-area-266618'  # change to your project id
    BUCKET = 'gs://nullbusters_data'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'directors'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    sql = 'SELECT name, birth_name, height, bio, birth_details, birth_year, place_of_birth, death_details, death_year, spouses, divorces, children, known_for_titles, imdb_title_id, director_name_id, category, reason_of_death FROM imdb_modeled.Directors WHERE birth_year IS NOT NULL AND death_year IS NOT NULL'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    query_results | 'Write log Input' >> WriteToText('input.txt')

    # apply ParDo to format the directors' birth and death years as integers
    formatted_year_pcoll = query_results | 'Format Years' >> beam.ParDo(
        FormatYearsFn())

    # write PCollection to log file
    formatted_year_pcoll | 'Write log Output' >> WriteToText(DIR_PATH +
                                                             'output.txt')

    dataset_id = 'imdb_modeled'
    table_id = 'Directors_Beam_DF'
    schema_id = 'name:STRING,birth_name:STRING,height:FLOAT,bio:STRING,birth_details:STRING,birth_year:INTEGER,place_of_birth:STRING,death_details:STRING,death_year:INTEGER,spouses:INTEGER,divorces:INTEGER,children:STRING,known_for_titles:STRING,imdb_title_id:STRING,category:STRING,reason_of_death:STRING,director_name_id:STRING'

    # write PCollection to new BQ table
    formatted_year_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
        batch_size=int(100))

    result = p.run()
    result.wait_until_finish()
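FormatYearsFn (also reused in a later Movies example) is not shown. Assuming it simply coerces the year fields to integers:

class FormatYearsFn(beam.DoFn):
    # Hypothetical sketch: cast birth_year and death_year to int, or None on failure.
    def process(self, element):
        for field in ('birth_year', 'death_year'):
            try:
                element[field] = int(element.get(field))
            except (TypeError, ValueError):
                element[field] = None
        yield element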
Example #8
def run():
    PROJECT_ID = 'responsive-cab-267123'  # change to your project id
    BUCKET = 'gs://bmease_cs327e'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'
    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'foodmap-df'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'
    # Create beam pipeline using local runner
    p = Pipeline(options=options)

    # select product names from Instacart Products, excluding non-food aisles
    sql = "SELECT LOWER(product_name) AS product_name, product_id,p.aisle_id FROM instacart_modeled.Products p WHERE p.product_name not like '%Filters%' and p.aisle_id NOT IN (11,20,22,25,44,55,73,80,109,118,126,127,132,133,10,54,60,74,75,85,87,101,111,114,56,82,102)"
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)
    # write PCollection to input file
    query_results | 'Write to input.txt' >> WriteToText(DIR_PATH + 'input.txt')

    # apply ParDo to match each Instacart product to a food_id
    # (output records contain food_id and product_id, per the schema below)
    nom_match_pcoll = query_results | 'Food and matches from nom' >> beam.ParDo(
        MatchProductFn())

    # write PCollection to output file
    nom_match_pcoll | 'Write to output.txt' >> WriteToText(DIR_PATH +
                                                           'output.txt')

    dataset_id = 'USDA_ERS_modeled'
    table_id = 'Food_Map_Beam_DF'
    schema_id = 'food_id:INTEGER,product_id:INTEGER'

    # write PCollection to new BQ table
    nom_match_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
        batch_size=int(100))

    result = p.run()
    result.wait_until_finish()
Example #9
def run():
    PROJECT_ID = 'swift-area-266618'  # change to your project id
    BUCKET = 'gs://nullbusters_data'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'student-df5'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    sql = 'SELECT imdb_title_id, title, original_title, year, genre, duration, country, language, director, writer, production_company, actors, description, avg_vote, votes, budget, usa_gross_income, worlwide_gross_income, metascore, reviews_from_users, reviews_from_critics FROM imdb_modeled.Movies WHERE usa_gross_income IS NOT NULL and worlwide_gross_income IS NOT NULL'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    query_results | 'Write log Input' >> WriteToText('input.txt')

    # apply ParDo to format the movie years as integers
    formatted_year_pcoll = query_results | 'Format Years' >> beam.ParDo(
        FormatYearsFn())

    # write PCollection to log file
    formatted_year_pcoll | 'Write log Output' >> WriteToText(DIR_PATH +
                                                             'output.txt')

    dataset_id = 'imdb_modeled'
    table_id = 'Movies_Beam_DF'
    schema_id = 'imdb_title_id:STRING, title:STRING, original_title:STRING, year:INTEGER, genre:STRING, duration:INTEGER, country:STRING, language:STRING, director:STRING, writer:STRING, production_company:STRING, actors:STRING, description:STRING, avg_votes:FLOAT, votes:INTEGER, budget_currency:STRING, budget:INTEGER, usa_gross_income:INTEGER, worlwide_gross_income_currency:STRING, worlwide_gross_income:INTEGER, metascore:FLOAT, reviews_from_users:FLOAT, reviews_from_critics:FLOAT'

    # write PCollection to new BQ table
    formatted_year_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
        batch_size=int(100))

    result = p.run()
    result.wait_until_finish()
Example #10
def run():
    PROJECT_ID = 'cs327e-sp2020' # change to your project id
    BUCKET = 'gs://beam-output-data' # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'event'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    p = Pipeline(options=options)

    sql = 'SELECT * FROM covid_19_modeled.Event_SQL_1'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # format timestamp   
    ts_pcoll = query_results | 'Format Timestamp' >> beam.ParDo(FormatTimestampFn())
         
    # group by primary key
    grouped_pcoll = ts_pcoll | 'Group by PK' >> beam.GroupByKey()
         
    # remove duplicate records
    unique_pcoll = grouped_pcoll | 'Remove Duplicates' >> beam.ParDo(RemoveDuplicatesFn())

    # write new PCollection to log file
    unique_pcoll | 'Write log' >> WriteToText(DIR_PATH + 'unique_pcoll.txt')
        
    dataset_id = 'covid_19_modeled'
    table_id = 'Event_Beam_DF'
    schema_id = '''location_id:INTEGER,last_update:DATETIME,confirmed:INTEGER,deaths:INTEGER,recovered:INTEGER,active:INTEGER'''

    # write PCollection to BQ table
    unique_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(dataset=dataset_id, 
                                                  table=table_id, 
                                                  schema=schema_id,
                                                  project=PROJECT_ID,
                                                  create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                                                  write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
    result = p.run()
    result.wait_until_finish()      
Example #11
def run():
    PROJECT_ID = 'starry-center-266501'  # change to your project id
    BUCKET = 'gs://imdb-beam'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'split-characters-df'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    sql = 'SELECT DISTINCT tConst, nConst, job, characters FROM imdb_modeled.Characters'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # apply ParDo to split the characters field into individual records
    split_characters_pcoll = query_results | 'Split characters into dictionaries' >> beam.ParDo(
        SplitCharactersFn())

    # write PCollection to log file
    split_characters_pcoll | 'Write log 1' >> WriteToText(
        DIR_PATH + 'split_characters_pcoll.txt')

    dataset_id = 'imdb_modeled'
    table_id = 'Characters_Beam_DF'
    schema_id = 'tConst:STRING, nConst:STRING, job:STRING, characters:STRING'

    # write PCollection to new BQ table
    split_characters_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
    result = p.run()
    result.wait_until_finish()
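SplitCharactersFn is not defined here. In the IMDb data the characters column typically holds a list-like string such as '["Herself","Narrator"]'; a sketch assuming that format, emitting one record per character name:

class SplitCharactersFn(beam.DoFn):
    # Hypothetical sketch: split the list-like characters string into one record
    # per character; emit the record with characters=None if it is empty.
    def process(self, element):
        raw = element.get('characters') or ''
        names = [n.strip(' "') for n in raw.strip('[]').split(',') if n.strip(' "')]
        for name in names or [None]:
            yield {
                'tConst': element['tConst'],
                'nConst': element['nConst'],
                'job': element.get('job'),
                'characters': name,
            }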
Example #12
def run():
    PROJECT_ID = 'spry-cosine-266801'
    BUCKET = 'gs://icyhot-pack_beam'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'location-df'  # dataflow does not like '_' or special characters
    google_cloud_options.staging_location = BUCKET + '/staging'  #req*
    google_cloud_options.temp_location = BUCKET + '/temp'  #req*
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    sql = 'SELECT id, province_state, country_region FROM covid19_jhu_csse_modeled.location_id'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # format US
    formatted_us_pcoll = query_results | 'Format US' >> beam.ParDo(
        FormatUSFn())

    # write PCollection to log file
    formatted_us_pcoll | 'Write log 1' >> WriteToText(DIR_PATH +
                                                      'formatted_us_pcoll.txt')

    dataset_id = 'covid19_jhu_csse_modeled'
    table_id = 'location_id_Beam_DF'
    schema_id = 'id:INTEGER, province_state:STRING, country_region:STRING'

    # write PCollection to new BQ table
    formatted_us_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

    result = p.run()
    result.wait_until_finish()
Example #13
def run():

    PROJECT_ID = 'cs327e-sp2020'  # change to your project id
    BUCKET = 'gs://beam-output-data'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'location'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    p = Pipeline(options=options)

    sql = 'SELECT * FROM covid_19_modeled.Location_SQL_1'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # extract city from state
    state_pcoll = query_results | 'Format State' >> beam.ParDo(FormatStateFn())

    grouped_pcoll = state_pcoll | 'Group Locations' >> beam.GroupByKey()

    unique_pcoll = grouped_pcoll | 'Remove Duplicates' >> beam.ParDo(
        RemoveDuplicatesFn())

    dataset_id = 'covid_19_modeled'
    table_id = 'Location_Beam_DF'
    schema_id = 'id:INTEGER,city:STRING,state:STRING,country:STRING,latitude:NUMERIC,longitude:NUMERIC,fips:INTEGER,admin2:STRING,combined_key:STRING'

    # write PCollection to BQ table
    unique_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

    result = p.run()
    result.wait_until_finish()
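FormatStateFn and RemoveDuplicatesFn are not shown (the same RemoveDuplicatesFn name also appears in the earlier Event example). A sketch, assuming some rows store 'City, State' in the state column and that deduplication keeps one record per id:

class FormatStateFn(beam.DoFn):
    # Hypothetical sketch: split a combined 'City, State' value and key the
    # record by id for the GroupByKey that follows.
    def process(self, element):
        state = element.get('state') or ''
        if ',' in state:
            city, state_name = [part.strip() for part in state.split(',', 1)]
            element['city'] = city
            element['state'] = state_name
        yield (element['id'], element)


class RemoveDuplicatesFn(beam.DoFn):
    # Hypothetical sketch: after GroupByKey, keep a single record per id.
    def process(self, element):
        _id, records = element
        yield next(iter(records))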
Example #14
def run():
    pipeline_options = PipelineOptions()
    google_cloud_options = pipeline_options.view_as(GoogleCloudOptions)
    google_cloud_options.project = settings['gcp']['project']
    google_cloud_options.job_name = settings['job']['name']
    google_cloud_options.staging_location = f"gs://{settings['gcp']['bucket']}/staging"
    google_cloud_options.temp_location = f"gs://{settings['gcp']['bucket']}/temp"
    pipeline_options.view_as(StandardOptions).runner = 'DataflowRunner'
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as pipe:

        text = pipe | 'Read from text file' >> beam.io.ReadFromText(
            f"gs://twitter_data_etl/stuff/{settings['job']['input']}")

        tokens = (
            text
            | 'Remove RT from text' >> beam.Map(strip_extra, chars='RT')
            | 'Remove twitter handles' >> beam.Regex.replace_all(
                r'@[a-zA-Z0-9_]+', "")
            | 'Remove all url links' >> beam.Regex.replace_all(
                r'http[s]*://[a-zA-Z0-9_\.\/]+', '')
            | 'Remove punctuation' >> beam.Map(lambda text: text.translate(
                str.maketrans("", "", string.punctuation)))
            |
            'Remove all tabs' >> beam.Map(lambda text: text.replace("\t", ""))
            | 'Remove periods (not removed in punctuation step?)' >>
            beam.Map(lambda text: text.replace(".", ""))
            | 'Make all lowercase' >> beam.Map(lambda text: text.lower())
            | 'Split tweets into words' >> beam.ParDo(ExtractWordFromTweets()))

        words = (
            tokens
            | 'Prepare word tuples' >> beam.Map(lambda word: (word, 1))
            |
            'Group and sum the words to get counts' >> beam.CombinePerKey(sum)
            | 'Save to file' >> beam.io.WriteToText(
                settings['job']['output_words']))

        # This worked locally ... was having issues with getting the worker package installation in dataflow
        # emojis = (tokens | 'Filter to keep emojis only' >> beam.Filter(lambda token: token if token in emoji.UNICODE_EMOJI else False)
        # | 'Prepare emoji tuples' >> beam.Map(lambda emoji: (emoji, 1))
        # | 'Group and sum the emojis to get counts' >> beam.CombinePerKey(sum)
        # | 'Save emojis to text' >> beam.io.WriteToText(settings['job']['output_emojis'])
        # )

    # Note: the "with" block above already runs the pipeline and waits for it to
    # finish on exit, so an explicit pipe.run() / wait_until_finish() here is
    # redundant.
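strip_extra and ExtractWordFromTweets are not included in this listing. A sketch consistent with how they are used above (beam.Map(strip_extra, chars='RT') and a ParDo that yields one word per tweet):

def strip_extra(text, chars=''):
    # Hypothetical sketch: remove a leading marker such as 'RT' and trim whitespace.
    return text.replace(chars, '', 1).strip() if chars else text.strip()


class ExtractWordFromTweets(beam.DoFn):
    # Hypothetical sketch: emit each whitespace-separated token of the cleaned tweet.
    def process(self, text):
        for word in text.split():
            yield word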
Example #15
def run():         
    PROJECT_ID = 'corvid-276516'
    BUCKET = 'gs://covid-bucket19' # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'format-codes--df'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)
    
    # query the staging table; note that the DirectRunner runs locally (not in
    # parallel across workers), so it should not be used for more than ~1000 records
    sql = 'SELECT * FROM covid_staging.googleMobility ORDER BY date, country_region'

    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    # read query results into a new PCollection
    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # call ParDo, piping the query results to it
    format_alphaCode_pcoll = query_results | 'Change the country code for Greece, the UK, and Hong Kong. Drop Reunion' >> beam.ParDo(format_alphaCodeFn())

    # write PCollection to log file
    format_alphaCode_pcoll | 'Write log 1' >> WriteToText('geodist_beam.txt')

    dataset_id = 'covid_modeled'
    table_id = 'mobility_beam'
    schema_id = 'code:STRING, country:STRING, date:DATE, average_change:INTEGER, retail_and_recreation:INTEGER, grocery_and_pharmacy:INTEGER, parks:INTEGER, transit_stations:INTEGER, workplaces:INTEGER,residential:INTEGER'

     # write PCollection to new BQ table
    format_alphaCode_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(dataset=dataset_id, 
                                                table=table_id, 
                                                schema=schema_id, 
                                                project=PROJECT_ID,
                                                create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                                                write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE, 
                                                batch_size=int(100))
     
    result = p.run()
    result.wait_until_finish()      
Example #16
    def test_view_as(self):
        generic_options = PipelineOptions(['--slices=3'])
        self.assertEqual(3, generic_options.view_as(Bacon).slices)
        self.assertEqual(3, generic_options.view_as(Breakfast).slices)

        generic_options.view_as(Breakfast).slices = 10
        self.assertEqual(10, generic_options.view_as(Bacon).slices)

        with self.assertRaises(AttributeError):
            generic_options.slices  # pylint: disable=pointless-statement

        with self.assertRaises(AttributeError):
            generic_options.view_as(Eggs).slices  # pylint: disable=expression-not-assigned
Example #17
  def test_view_as(self):
    generic_options = PipelineOptions(['--slices=3'])
    self.assertEqual(3, generic_options.view_as(Bacon).slices)
    self.assertEqual(3, generic_options.view_as(Breakfast).slices)

    generic_options.view_as(Breakfast).slices = 10
    self.assertEqual(10, generic_options.view_as(Bacon).slices)

    with self.assertRaises(AttributeError):
      generic_options.slices  # pylint: disable=pointless-statement

    with self.assertRaises(AttributeError):
      generic_options.view_as(Eggs).slices  # pylint: disable=expression-not-assigned
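Both test variants above exercise PipelineOptions.view_as() against option subclasses that are not shown. Following Beam's standard pattern for custom options, they would look roughly like this (Bacon and Eggs each contribute their own flag and Breakfast combines them, which is why slices is shared between Bacon and Breakfast but missing from Eggs):

class Bacon(PipelineOptions):
    # Sketch: contributes a --slices option.
    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_argument('--slices', type=int)


class Eggs(PipelineOptions):
    # Sketch: contributes a --style option only, so .slices raises AttributeError.
    @classmethod
    def _add_argparse_args(cls, parser):
        parser.add_argument('--style', default='scrambled')


class Breakfast(Bacon, Eggs):
    # Sketch: a combined view over both option sets.
    pass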
Example #18
def run():

    PROJECT_ID = 'data-lake-290221'
    BUCKET = 'gs://dataflow-log-data'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    options = PipelineOptions(flags=None,
                              runner='DirectRunner',
                              project=PROJECT_ID,
                              job_name='transpose',
                              temp_location=BUCKET + '/temp',
                              region='us-central1')

    options.view_as(SetupOptions).save_main_session = True

    p = beam.pipeline.Pipeline(options=options)

    sql = '''select farm_fingerprint(concat(cast(latitude as string), cast(longitude as string))) as location_id, * from covid19_confirmed.raw_cases'''

    #bq_source = ReadFromBigQuery(query=sql, use_standard_sql=True, gcs_location=BUCKET)
    bq_source = BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Transpose' >> beam.ParDo(Transpose())

    #out_pcoll | 'Write to log' >> WriteToText('records.txt')

    dataset_id = 'covid19_confirmed'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'daily_cases'
    schema_id = 'location_id:INTEGER,date:DATE,cases:INTEGER'

    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
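Transpose is not shown. The query reads the wide JHU raw_cases table (one column per day) and the output schema is (location_id, date, cases), so a sketch of the transposition, assuming date columns named like '_1_22_20' and the fixed columns listed below:

class Transpose(beam.DoFn):
    # Hypothetical sketch: emit one (location_id, date, cases) record per date column.
    def process(self, element):
        # assumed set of non-date columns in raw_cases
        fixed = {'location_id', 'province_state', 'country_region',
                 'latitude', 'longitude'}
        for column, value in element.items():
            if column in fixed:
                continue
            month, day, year = column.lstrip('_').split('_')  # assumed column naming
            yield {
                'location_id': element['location_id'],
                'date': '20{}-{}-{}'.format(year, month.zfill(2), day.zfill(2)),
                'cases': int(value) if value is not None else None,
            }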
Example #19
def run():
     PROJECT_ID = 'studied-brand-266702' # change to your project id
     BUCKET = 'gs://beam_cs327e_project' # change to your bucket name
     DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '/'

     # Create and set your PipelineOptions.
     options = PipelineOptions(flags=None)

     # For Dataflow execution, set the project, job_name,
     # staging location, temp_location and specify DataflowRunner.
     google_cloud_options = options.view_as(GoogleCloudOptions)
     google_cloud_options.project = PROJECT_ID
     google_cloud_options.job_name = 'adverse-event-df'
     google_cloud_options.staging_location = BUCKET + '/staging'
     google_cloud_options.temp_location = BUCKET + '/temp'
     options.view_as(StandardOptions).runner = 'DataflowRunner'

     # Create the Pipeline with the specified options.
     p = Pipeline(options=options)
     
     # run BigQuery query on dataset
     sql = 'SELECT * FROM vaers_modeled.Adverse_Event'
     bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

     input_pcoll = p | 'Read from BigQuery' >> beam.io.Read(bq_source)
        
     # write input PCollection to input.txt
     input_pcoll | 'Write input_pcoll log 1' >> WriteToText(DIR_PATH + 'input.txt')
        
     # standardize adverse_event RECOVD attribute values into true or false (boolean)
     formatted_recovd_pcoll = input_pcoll | 'Format RECOVD' >> beam.ParDo(FormatRECOVDFn())
        
     # write PCollection to log file
     formatted_recovd_pcoll | 'Write log 2' >> WriteToText(DIR_PATH + 'formatted_recovd_pcoll.txt')
        
     # standardize adverse_event BIRTH_DEFECT attribute values into true or false (boolean)
     formatted_defect_pcoll = formatted_recovd_pcoll | 'Format BIRTH_DEFECT' >> beam.ParDo(FormatBIRTH_DEFECTFn())
        
     # write PCollection to log file
     formatted_defect_pcoll | 'Write log 3' >> WriteToText(DIR_PATH + 'formatted_defect_pcoll.txt')
        
     # standardize boolean attribute values which are null into false 
     output_pcoll = formatted_defect_pcoll | 'Format boolean attributes' >> beam.ParDo(FormatBooleanAttributesFn())
        
     # write output PCollection to output.txt
     output_pcoll | 'Write output_pcoll log 4' >> WriteToText(DIR_PATH + 'output.txt')
     
     # specify id and schema
     dataset_id = 'vaers_modeled'
     table_id = 'Adverse_Event_Beam_DF'
     # change RECOVD and BIRTH_DEFECT attributes into BOOLEANS
     schema_id = 'VAERS_ID:INTEGER, ONSET_DATE:DATE, RECOVD:BOOLEAN, DIED:BOOLEAN, DATEDIED:DATE, L_THREAT:BOOLEAN, OFC_VISIT:BOOLEAN, ER_VISIT:BOOLEAN, ER_ED_VISIT:BOOLEAN, HOSPITAL:BOOLEAN, HOSPDAYS:INTEGER, X_STAY:BOOLEAN, DISABLE:BOOLEAN, BIRTH_DEFECT:BOOLEAN, OTHER_MEDS:STRING, CUR_ILL:STRING, HISTORY:STRING, PRIOR_VAX:STRING' 

     # write output PCollection to new BQ table
     output_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(dataset=dataset_id, 
                                                  table=table_id, 
                                                  schema=schema_id,
                                                  project=PROJECT_ID,
                                                  create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                                                  write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
                                                  batch_size=int(100))
         
     result = p.run()
     result.wait_until_finish()      
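FormatRECOVDFn, FormatBIRTH_DEFECTFn and FormatBooleanAttributesFn are not shown. VAERS encodes these flags as 'Y'/'N'/'U' strings; a sketch of the first one (the other two would presumably apply the same conversion to their fields):

class FormatRECOVDFn(beam.DoFn):
    # Hypothetical sketch: convert the RECOVD flag to a boolean to match the
    # BOOLEAN column in the output schema ('Y' -> True, anything else -> False).
    def process(self, element):
        element['RECOVD'] = (str(element.get('RECOVD') or '').upper() == 'Y')
        yield element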
Example #20
def run():
    PROJECT_ID = 'electric-spark-266716'  # change to your project id
    BUCKET = 'gs://global_surface_temperatures'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'population-statistics-beam-dataflow-2'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    #create query to select all elements for cleansing
    sql = 'SELECT * FROM kaggle2_modeled.Population_Statistics'

    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    #read desired table from BigQuery
    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    #write inputs to input.txt
    query_results | 'Write input' >> WriteToText(DIR_PATH + 'input.txt')

    # apply ParDo to filter out dates
    transposed_date_pcoll = query_results | 'Transpose Dates' >> beam.ParDo(
        TransposeDateFn())

    #write filtered dates to filtered.txt
    transposed_date_pcoll | 'Write transpose Dates' >> WriteToText(
        DIR_PATH + 'transposed.txt')

    #flatten list to get individual records
    flatten_pcoll = transposed_date_pcoll | 'Flatten lists' >> beam.FlatMap(
        generate_elements)

    #write resulting PCollection to output.txt
    flatten_pcoll | 'Write output' >> WriteToText(DIR_PATH +
                                                  'output_final_dates.txt')

    #create new table in BigQuery
    dataset_id = 'kaggle2_modeled'
    table_id = 'Population_Statistics_Beam_DF'
    schema_id = 'dt:DATE,countryName:STRING,countryCode:STRING, \
    metric:STRING,metricCode:STRING,statistic:FLOAT'

    # write PCollection to new BQ table
    flatten_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

    result = p.run()
    result.wait_until_finish()
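TransposeDateFn and generate_elements are not included in this listing. TransposeDateFn apparently yields a list of records per wide input row, and generate_elements is used with beam.FlatMap to unpack those lists. A sketch, assuming the wide table has one column per year:

class TransposeDateFn(beam.DoFn):
    # Hypothetical sketch: turn one wide Population_Statistics row into a list of
    # narrow records, one per year column.
    def process(self, element):
        fixed = {'countryName', 'countryCode', 'metric', 'metricCode'}
        records = []
        for column, value in element.items():
            if column in fixed or value is None:
                continue
            records.append({
                'dt': '{}-01-01'.format(column.lstrip('_')),  # assumed year columns
                'countryName': element['countryName'],
                'countryCode': element['countryCode'],
                'metric': element['metric'],
                'metricCode': element['metricCode'],
                'statistic': float(value),
            })
        yield records


def generate_elements(records):
    # Unpack the list produced by TransposeDateFn into individual elements
    # (used with beam.FlatMap above).
    for record in records:
        yield record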