Example #1
def run():
    import time
    gcs_path = "gs://marcin-playground/dataflow"
    pipeline = beam.Pipeline(runner="DataflowRunner",
                             argv=[
                                 "--project", "project-name",
                                 "--staging_location",
                                 ("%s/staging_location" % gcs_path),
                                 "--temp_location",
                                 ("%s/temp" % gcs_path), "--output",
                                 ("%s/output" % gcs_path), "--setup_file",
                                 "./setup.py"
                             ])
    (pipeline
     | "Load" >> ReadFromText("gs://marcin-playground/books/*.txt")
     | "Count Words" >> CountWordsTransform()
     | "FormatOutput" >>
     beam.Map(lambda (word, count): "{0}: {1}".format(word, count))
     | "Save" >> WriteToText("{0}/output/wordcount{1}".format(
         gcs_path, int(time.time()))))
    pipeline.run()
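
# Example #1 depends on a CountWordsTransform that is not shown. Below is a
# minimal sketch of such a composite transform, assuming a plain word count;
# the step names and the regex are assumptions, not the author's code.
import re
import apache_beam as beam

class CountWordsTransform(beam.PTransform):
    """Hypothetical composite transform: lines -> (word, count) pairs."""
    def expand(self, lines):
        return (lines
                | "ExtractWords" >> beam.FlatMap(
                    lambda line: re.findall(r"[A-Za-z']+", line))
                | "PairWithOne" >> beam.Map(lambda word: (word, 1))
                | "SumPerWord" >> beam.CombinePerKey(sum))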
Example #2
def run(p, input_file, output_file):
  #pylint: disable=expression-not-assigned
  (
      p
      # Read the lines from a text file.
      | 'Read' >> ReadFromText(input_file)
      # Split the line into individual words.
      | 'Split' >> beam.FlatMap(lambda line: re.split(r'\W+', line))
      # Map each word to an instance of MyRow.
      | 'ToRow' >> beam.Map(MyRow).with_output_types(MyRow)
      # SqlTransform yields a PCollection containing elements with attributes
      # based on the output of the query.
      | 'Sql!!' >> SqlTransform(
          """
                   SELECT
                     word as key,
                     COUNT(*) as `count`
                   FROM PCOLLECTION
                   GROUP BY word""")
      | 'Format' >> beam.Map(lambda row: '{}: {}'.format(row.key, row.count))
      | 'Write' >> WriteToText(output_file))
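
# The SqlTransform word count above relies on a MyRow schema type defined
# elsewhere. A plausible definition, assuming a single-column schema
# registered with RowCoder so SqlTransform can see the `word` field:
import typing

import apache_beam as beam
from apache_beam import coders

MyRow = typing.NamedTuple('MyRow', [('word', str)])
coders.registry.register_coder(MyRow, coders.RowCoder)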
Example #3
def run(argv=None):
    """Main entry point; defines and runs the tfidf pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--uris', required=True, help='URIs to process.')
    parser.add_argument('--output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=pipeline_options) as p:

        # Read documents specified by the uris command line option.
        pcoll = read_documents(p, glob.glob(known_args.uris))
        # Compute TF-IDF information for each word.
        output = pcoll | TfIdf()
        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output | 'write' >> WriteToText(known_args.output)
Example #4
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        required=True,
                        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as p:
        lines = p | 'ReadFromText' >> ReadFromText(known_args.input)
        results = (lines
                   | 'Split' >> beam.Map(split)
                   | 'Group' >> beam.GroupByKey()
                   | 'FormatResult' >> beam.Map(format_result))
        results | 'WriteToText' >> WriteToText(known_args.output)
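
# Example #4 assumes `split` and `format_result` helpers that are not shown.
# A minimal sketch, assuming comma-separated key/value input lines; the
# input format is an assumption.
def split(line):
    # Hypothetical: turn "key,value" into a (key, value) pair for GroupByKey.
    key, value = line.split(',', 1)
    return key.strip(), value.strip()

def format_result(element):
    # element is (key, iterable_of_values) after GroupByKey.
    key, values = element
    return '%s: %s' % (key, ', '.join(values))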
Example #5
def run():
     # set up location
     PROJECT_ID = 'trim-cistern-288221'
     BUCKET = 'gs://bhnk-milestone1-data'
     DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S') + '/'

     # use DataflowRunner instead of DirectRunner
     options = PipelineOptions(flags=None,
                               runner='DataflowRunner',
                               project=PROJECT_ID,
                               job_name='kagglegenre',
                               temp_location=BUCKET + '/temp',
                               region='us-central1')

     p = beam.pipeline.Pipeline(options=options)

     # retrieve the data from the kaggle_refined dataset and save this information (location)
     sql = 'SELECT * FROM kaggle_refined.Title_genre'
     bq_source = ReadFromBigQuery(query=sql, use_standard_sql=True, gcs_location=BUCKET)

     # use the previously saved information (location) and read from BigQuery
     # query results is now input P collection
     query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

     # Use ParDo to call function on query results
     out_pcoll = query_results | 'Split genres' >> beam.ParDo(SplitGenre())

     out_pcoll | 'Log output' >> WriteToText(DIR_PATH + 'output.txt')

     dataset_id = 'kaggle_refined'
     table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Title_genre_Dataflow'
     schema_id = 'Title:STRING,Genre:STRING'

     # write to BigQuery using the location set above
     out_pcoll | 'Write to BQ' >> WriteToBigQuery(table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)
     
     # run and display results after everything is finished
     result = p.run()
     result.wait_until_finish()      
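
# Example #5 uses a SplitGenre DoFn that is not shown. A minimal sketch,
# assuming each input row carries a Title and a comma-separated Genre string
# matching the schema declared above; the field names are assumptions.
import apache_beam as beam

class SplitGenre(beam.DoFn):
    def process(self, element):
        # Hypothetical: fan one row out into one {'Title', 'Genre'} dict per genre.
        title = element['Title']
        for genre in (element.get('Genre') or '').split(','):
            genre = genre.strip()
            if genre:
                yield {'Title': title, 'Genre': genre}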
Example #6
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='../data/spikey_sales_weekly.txt',
        # required=True,
        help='Input File to Process')

    parser.add_argument(
        '--output',
        dest='output',
        default='../output/products',
        # required=True,
        help='output processed file')

    known_args, pipeline_args = parser.parse_known_args(argv)

    pipeline_option = PipelineOptions(pipeline_args)
    pipeline_option.view_as(SetupOptions).save_main_session = True
    logging.getLogger().setLevel(logging.INFO)
    with beam.Pipeline(options=pipeline_option) as p:
        # read input file
        items = p | ReadFromText(known_args.input)
        logging.info("Print the items %s", items)

        split_item_result = (
            items
            | 'Split Result' >> beam.ParDo(
                SplitItemBasedOnSalesFn()).with_outputs(
                    SplitItemBasedOnSalesFn.OUTPUT_TAG_TOP_SELLER,
                    SplitItemBasedOnSalesFn.OUTPUT_TAG_POOR_SELLER))

        top_seller = split_item_result[
            SplitItemBasedOnSalesFn.OUTPUT_TAG_TOP_SELLER]
        poor_seller = split_item_result[
            SplitItemBasedOnSalesFn.OUTPUT_TAG_POOR_SELLER]

        # (top_seller | WriteToText(known_args.output + '_top_seller'))
        (poor_seller | WriteToText(known_args.output + '_poor_seller'))
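
# Example #6 relies on a SplitItemBasedOnSalesFn with two tagged outputs that
# is not shown. A minimal sketch, assuming tab-separated "item<TAB>units_sold"
# lines and an arbitrary threshold; both are assumptions.
import apache_beam as beam
from apache_beam import pvalue

class SplitItemBasedOnSalesFn(beam.DoFn):
    OUTPUT_TAG_TOP_SELLER = 'top_seller'
    OUTPUT_TAG_POOR_SELLER = 'poor_seller'

    def process(self, element):
        # Hypothetical input format: "item\tunits_sold".
        item, units_sold = element.rsplit('\t', 1)
        if int(units_sold) >= 100:  # assumed threshold
            yield pvalue.TaggedOutput(self.OUTPUT_TAG_TOP_SELLER, item)
        else:
            yield pvalue.TaggedOutput(self.OUTPUT_TAG_POOR_SELLER, item)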
Example #7
def run(argv=None):
    """Run the workflow."""

    parser = argparse.ArgumentParser()
    parser.add_argument('--output')
    parser.add_argument('--ignore_corpus', default='')
    parser.add_argument('--ignore_word', default='')
    parser.add_argument('--num_groups')

    known_args, pipeline_args = parser.parse_known_args(argv)
    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=pipeline_options) as p:

        group_ids = []
        for i in range(0, int(known_args.num_groups)):
            group_ids.append('id' + str(i))

        query_corpus = 'select UNIQUE(corpus) from publicdata:samples.shakespeare'
        query_word = 'select UNIQUE(word) from publicdata:samples.shakespeare'
        ignore_corpus = known_args.ignore_corpus
        ignore_word = known_args.ignore_word

        pcoll_corpus = p | 'read corpus' >> beam.io.ReadFromBigQuery(
            query=query_corpus)
        pcoll_word = p | 'read_words' >> beam.io.ReadFromBigQuery(
            query=query_word)
        pcoll_ignore_corpus = p | 'create_ignore_corpus' >> beam.Create(
            [ignore_corpus])
        pcoll_ignore_word = p | 'create_ignore_word' >> beam.Create(
            [ignore_word])
        pcoll_group_ids = p | 'create groups' >> beam.Create(group_ids)

        pcoll_groups = create_groups(pcoll_group_ids, pcoll_corpus, pcoll_word,
                                     pcoll_ignore_corpus, pcoll_ignore_word)

        # pylint:disable=expression-not-assigned
        pcoll_groups | WriteToText(known_args.output)
Example #8
    def process_entity(self, entity_name, pk):
        with beam.Pipeline(options=self.pipeline_options) as p:
            # First set up a stream for the data
            data = read_file(
                p,
                entity_name,
                self.get_staging('public.{0}'.format(entity_name)) + '*',
                pk)

            index = None
            try:
                # Also set up a stream for the index
                index = read_file(
                    p,
                    '{0}index'.format(entity_name),
                    self.get_source_index('entity_{0}*'.format(entity_name)),
                    pk)
            except IOError:
                logging.info("Could not open index, maybe doesn't exist")
                # create an empty pcollection, so we can at least run
                index = p | beam.Create([])

            # Generate business keys, checksum, dv_source, load_dtm
            preproc_data = data | 'preprocess_' + entity_name >> \
                beam.Map(add_cksum)

            # Group with index to be able to identify new, updated, deleted
            merge = ({'data': preproc_data, 'index': index}) | \
                'grouped_by_' + pk >> beam.CoGroupByKey()

            # Extract the modified data out of the records
            extract = merge \
                | 'filter_' + entity_name >> beam.Filter(unchanged_rows) \
                | 'extract_' + entity_name >> beam.Map(extract_data)

            # Write them out to disk in staging
            extract | 'Write_' + entity_name >> \
                WriteToText(
                    self.get_psa_location('public.{0}'.format(entity_name)),
                    coder=JsonCoder())
Example #9
def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the wordcount pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        default='./kinglear.txt',
                        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session

    # The pipeline will be run on exiting the with block.
    with beam.Pipeline(options=pipeline_options) as p:

        # Read the text file[pattern] into a PCollection.
        lines = p | 'Read' >> ReadFromText(known_args.input)

        counts = (lines
                  | 'Split' >> (beam.FlatMap(lambda x: re.findall(
                      r"[A-Za-z']+", x)).with_output_types(str))
                  | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
                  | 'GroupAndSum' >> beam.CombinePerKey(sum))

        # Format the counts into a PCollection of strings.
        def format_result(word, count):
            return '%s: %d' % (word, count)

        output = counts | 'Format' >> beam.MapTuple(format_result)

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output | 'Write' >> WriteToText(known_args.output)
Example #10
def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the wordcount pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/data/kinglear.txt',
        help='Input file to process.')

    parser.add_argument(
        '--output',
        dest='output',
        # CHANGE 1/6: The Google Cloud Storage path is required
        # for outputting the results.
        default='gs://YOUR_OUTPUT_BUCKET/AND_OUTPUT_PREFIX',
        help='Output file to write results to.')
    parser.add_argument(
        '--pid',
        dest='pid',
        help='project id')

    parser.add_argument(
        '--mbucket',
        dest='mbucket',
        help='model bucket name')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = save_main_session

    # The pipeline will be run on exiting the with block.
    with beam.Pipeline(options=pipeline_options) as p:
        # Read the text file[pattern] into a PCollection.
        prediction_data = (p | 'CreatePCollection' >> beam.Create([known_args.input])
                           | 'ReadCSVFile' >> beam.FlatMap(get_csv_reader))
        output = (prediction_data | 'Predict' >> beam.ParDo(MyPredictDoFn(project_id=known_args.pid,
                                                                          bucket_name=known_args.mbucket)))
        output | 'WritePR' >> WriteToText(known_args.output)
Example #11
def run():
     PROJECT_ID = 'starry-center-266501'  # change to your project id

     # Project ID is required when using the BQ source
     options = {
     'project': PROJECT_ID
     }
     opts = beam.pipeline.PipelineOptions(flags=[], **options)

     # Create beam pipeline using local runner
     p = beam.Pipeline('DirectRunner', options=opts)

     # Pass a query; the direct runner is local, so avoid processing more than ~1000 records with it.
     # writers is an array of strings, tConst is a string.
     sql = 'SELECT writers, tConst FROM imdb_modeled.Writes limit 100'
     bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)  # the direct runner does not parallelize across workers

     query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source) # read results and assign them to a new p-collection

     # apply ParDo to split the writers of each title
     # pipe the query results into the ParDo
     split_directors_pcoll = query_results | 'Return title: writer dictionaries' >> beam.ParDo(SplitWritersFn())

     # write PCollection to log file
     split_directors_pcoll | 'Write log 1' >> WriteToText('formatted_titles_pcoll.txt') 

     dataset_id = 'imdb_modeled'
     table_id = 'Writes_Beam'
     schema_id = 'writer:STRING, tConst:STRING'

     # write PCollection to new BQ table
     split_directors_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(dataset=dataset_id, 
                                                table=table_id, 
                                                schema=schema_id, 
                                                project=PROJECT_ID,
                                                create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                                                write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE, 
                                                batch_size=int(100))
     
     result = p.run()
     result.wait_until_finish()      
Example #12
def run():
    PROJECT_ID = 'acquired-rarity-288205'
    BUCKET = 'gs://ykdb_beam_us'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    options = PipelineOptions(flags=None,
                              runner='DataflowRunner',
                              project=PROJECT_ID,
                              job_name='application',
                              temp_location=BUCKET + '/temp',
                              region='us-central1')

    p = beam.pipeline.Pipeline(options=options)

    # ***************************************** REMOVE DUPLICATES ****************************************************
    sql = "SELECT FARM_FINGERPRINT(TO_JSON_STRING(t)) AS case_id, * FROM (SELECT emp.employer_id, CASE_NUMBER, CASE_STATUS, CASE_SUBMITTED, DECESION_DATE AS DECISION_DATE, VISA_CLASS FROM (SELECT  *, COUNT(*) AS count FROM H_1B_refined.Application GROUP BY CASE_NUMBER, CASE_STATUS, CASE_SUBMITTED, DECESION_DATE, VISA_CLASS, employer_name, employer_city HAVING count = 1) AS app JOIN H_1B_refined.Employer_Dataflow AS emp ON emp.employer_name = app.employer_name AND emp.employer_city = app.employer_city) AS t"
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Application Transformation' >> beam.ParDo(
        NoDuplicates())

    out_pcoll | 'Log output' >> WriteToText(DIR_PATH +
                                            'output_application.txt')

    # ***************************************** INSERT INTO BQ ****************************************************
    dataset_id = 'H_1B_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Application_Dataflow'

    schema_id = 'case_id:INTEGER, employer_id:INTEGER, CASE_NUMBER:STRING, CASE_STATUS:STRING, CASE_SUBMITTED:DATE, DECISION_DATE:DATE, VISA_CLASS:STRING'

    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
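
# The H-1B dedup snippets (this one and Examples #16, #18 and #25) call a
# NoDuplicates DoFn that is not shown. Since the SQL already keeps only rows
# with HAVING count = 1, a minimal sketch just forwards each row, dropping
# the helper `count` column; the row shape is an assumption.
import apache_beam as beam

class NoDuplicates(beam.DoFn):
    def process(self, element):
        # The query already filtered duplicates; drop the helper column and pass through.
        element.pop('count', None)
        yield element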
Example #13
def run(argv=None):
    """Main entry point; defines and runs the wordcount pipeline."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=pipeline_options) as p:

        # Read the text file[pattern] into a PCollection.
        lines = p | ReadFromText(known_args.input)

        # Count the occurrences of each word.
        counts = (lines
                  | 'Split' >> (beam.FlatMap(lambda x: re.findall(
                      r'[A-Za-z\']+', x)).with_output_types(str))
                  | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
                  | 'GroupAndSum' >> beam.CombinePerKey(sum))

        # Format the counts into a PCollection of strings.
        def format_result(word_count):
            (word, count) = word_count
            return '%s: %s' % (word, count)

        output = counts | 'Format' >> beam.Map(format_result)

        # Write the output using a "Write" transform that has side effects.
        # pylint: disable=expression-not-assigned
        output | WriteToText(known_args.output, shard_name_template='')
Example #14
def run(argv=None):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument('--input',
                      dest='input',
                      default='gs://dataflow-samples/shakespeare/kinglear.txt',
                      help='Input file to process.')
  parser.add_argument('--output',
                      dest='output',
                      required=True,
                      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)
  # We use the save_main_session option because one or more DoFn's in this
  # workflow rely on global context (e.g., a module imported at module level).
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = True
  p = beam.Pipeline(options=pipeline_options)

  # Read the text file[pattern] into a PCollection.
  lines = p | 'read' >> ReadFromText(known_args.input)

  # Count the occurrences of each word.
  counts = (lines
            | 'split' >> (beam.ParDo(WordExtractingDoFn())
                          .with_output_types(str))
            | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
            | 'group' >> beam.GroupByKey()
            | 'count' >> beam.MapTuple(lambda word, ones: (word, sum(ones))))

  # Format the counts into a PCollection of strings.
  output = counts | 'format' >> beam.MapTuple(lambda word, c: '%s: %s' % (word, c))

  # Write the output using a "Write" transform that has side effects.
  # pylint: disable=expression-not-assigned
  output | 'write' >> WriteToText(known_args.output)

  # Actually run the pipeline (all operations above are deferred).
  result = p.run()
  result.wait_until_finish()
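
# Several word-count snippets here (Examples #14, #19 and #30) use a
# WordExtractingDoFn that is not shown. A minimal sketch in the spirit of the
# standard Beam wordcount sample; the regex is an assumption.
import re
import apache_beam as beam

class WordExtractingDoFn(beam.DoFn):
    def process(self, element):
        # Emit every word of the line as a separate element.
        return re.findall(r"[A-Za-z']+", element)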
Example #15
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        default='gs://loganalysis/error_log.txt',
                        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        default='gs://loganalysis/output',
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_args.extend([
        '--runner=DirectRunner',
        '--project=springmldemoproject',
        '--staging_location=gs://loganalysis/staging',
        '--temp_location=gs://loganalysis/temp',
        '--job_name=log-job',
    ])

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=pipeline_options) as p:
        lines = p | ReadFromText(known_args.input)
        counts = (
            lines
            | 'window' >> beam.WindowInto(window.GlobalWindows())
            | 'Split' >> (beam.FlatMap(lambda x: re.findall(
                r'((?:(?:[0-1][0-9])|(?:[2][0-3])|(?:[0-9])):(?:[0-5][0-9])(?::[0-5][0-9])?(?:\s?(?:am|AM|pm|PM))?)',
                x)).with_output_types(str))
            | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
            | 'GroupAndSum' >> beam.CombinePerKey(sum))

        def format_result(time_count):
            (time, count) = time_count
            return '%s: %s' % (time, count)

        output = counts | 'Format' >> beam.Map(format_result)

        output | WriteToText(known_args.output)
Example #16
def run():
    PROJECT_ID = 'acquired-rarity-288205'
    BUCKET = 'gs://ykdb_beam_us'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    options = PipelineOptions(flags=None,
                              runner='DataflowRunner',
                              project=PROJECT_ID,
                              job_name='ownership',
                              temp_location=BUCKET + '/temp',
                              region='us-central1')

    p = beam.pipeline.Pipeline(options=options)

    # ***************************************** REMOVE DUPLICATES ****************************************************
    sql = "SELECT FARM_FINGERPRINT(TO_JSON_STRING(t)) as ownership_id, * FROM (SELECT * FROM (SELECT  *, COUNT(*) AS count FROM H_1B_refined.Ownership GROUP BY occ_code, occ_title, ownership, naics_title, grp, tot_emp, emp_prse, h_mean, a_mean, mean_prse, a_pct10, a_pct25, a_median, a_pct75, a_pct90 HAVING count = 1)) as  t"
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Remove Dups Ownership' >> beam.ParDo(
        NoDuplicates())

    out_pcoll | 'Log output' >> WriteToText(DIR_PATH + 'output_ownership.txt')

    # ***************************************** INSERT INTO BQ ****************************************************
    dataset_id = 'H_1B_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Ownership_Dataflow'

    schema_id = 'ownership_id:INTEGER, occ_code:STRING, occ_title:STRING, ownership:STRING, naics_title:STRING, grp:STRING, tot_emp:INTEGER, emp_prse:FLOAT, h_mean:FLOAT, a_mean:INTEGER, mean_prse:FLOAT, a_pct10:INTEGER, a_pct25:INTEGER, a_median:INTEGER, a_pct75:INTEGER, a_pct90:INTEGER'

    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
Example #17
def run_pipeline(source, target):
    header = get_header(source)
    fields = header.split(CSV_DELIMITER)

    (bq_schema, schema) = get_schema(target)

    input_path = 'gs://dotz-hiring-datalake/raw/{}.csv'.format(source)
    output_path = 'gs://dotz-hiring-datalake/processed/{}.json/part'.format(
        target)

    pipeline_args = [
        '--job_name={}-{}'.format(target,
                                  str(time.time()).replace('.', '-')),
        '--input={}'.format(input_path), '--output={}'.format(output_path)
    ]

    pipeline_args.extend(BASE_PIPELINE_ARGS)

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as pipeline:
        lines = pipeline | ReadFromText(input_path)

        # not so bright way to remove a CSV header
        lines = lines | 'RemoveHeader' >> beam.Filter(
            lambda line: line != header)
        objs = lines | 'CSV2JSON' >> beam.Map(csv2json(fields))
        proc_objs = objs | 'ProcessJSONs' >> beam.Map(process(schema))
        filtered_proc_objs = proc_objs | 'FilterEmpties' >> beam.Filter(
            lambda x: x)

        dumped_objs = filtered_proc_objs | 'DumpJSONs' >> beam.Map(json.dumps)
        dumped_objs | WriteToText(output_path)

        filtered_proc_objs | WriteToBigQuery(
            'dotz-hiring:tubulation.{}'.format(target),
            write_disposition=BigQueryDisposition.WRITE_TRUNCATE,
            create_disposition=BigQueryDisposition.CREATE_NEVER)
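
# Example #17 maps each line through csv2json(fields) and process(schema),
# which are defined elsewhere. A minimal sketch of the csv2json closure,
# assuming it simply zips the header fields with the line's values and that
# CSV_DELIMITER is the module-level delimiter used above.
def csv2json(fields):
    def to_json(line):
        # Hypothetical: pair each header field with the corresponding CSV value.
        values = line.split(CSV_DELIMITER)
        return dict(zip(fields, values))
    return to_json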
Example #18
def run():
    PROJECT_ID = 'acquired-rarity-288205'
    BUCKET = 'gs://ykdb_beam_us'
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    options = PipelineOptions(flags=None,
                              runner='DataflowRunner',
                              project=PROJECT_ID,
                              job_name='employer',
                              temp_location=BUCKET + '/temp',
                              region='us-central1')

    p = beam.pipeline.Pipeline(options=options)

    # ***************************************** REMOVE DUPLICATES ****************************************************
    sql = "SELECT FARM_FINGERPRINT(TO_JSON_STRING(t)) AS employer_id, * FROM (SELECT employer_name, employer_address, employer_city, employer_state, employer_postal_code, employer_country, employer_province, h_1b_dependent, willful_violator FROM (SELECT  *, COUNT(*) AS count FROM H_1B_refined.Employer GROUP BY employer_name, employer_address, employer_city, employer_state, employer_postal_code, employer_country, employer_province, h_1b_dependent, willful_violator HAVING count = 1)) AS t"
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll = query_results | 'Remove Dups Employer' >> beam.ParDo(
        NoDuplicates())

    out_pcoll | 'Log output' >> WriteToText(DIR_PATH + 'output_employer.txt')

    # ***************************************** INSERT INTO BQ ****************************************************
    dataset_id = 'H_1B_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Employer_Dataflow'

    schema_id = 'employer_id:INTEGER, employer_name:STRING, employer_address:STRING, employer_city:STRING, employer_state:STRING, employer_postal_code:STRING, employer_country:STRING, employer_province:STRING, h_1b_dependent:BOOLEAN, willful_violator:BOOLEAN'

    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
Example #19
def run(argv=None, save_main_session=True):

  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/shakespeare/kinglear.txt',
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      required=True,
      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session


  with beam.Pipeline(options=pipeline_options) as p:

    # Read the text file
    lines = p | 'Read' >> ReadFromText(known_args.input)

    counts = (
        lines
        | 'Split' >>
        (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
        | 'PairWithOne' >> beam.Map(lambda x: (x, 1))
        | 'GroupAndSum' >> beam.CombinePerKey(sum))


    def format_result(word, count):
      return '%s: %d' % (word, count)

    output = counts | 'Format' >> beam.MapTuple(format_result)

    #Write the word count in output
    output | 'Write' >> WriteToText(known_args.output)
Example #20
def main():
    options = PipelineOptions()
    options.view_as(SetupOptions).save_main_session = True

    opt = options.view_as(_Options)
    inputs = opt.inputs
    output_prefix = opt.output_prefix or os.path.join(
        options.view_as(GoogleCloudOptions).temp_location, 'output')
    shards = opt.shards

    p = Pipeline(options=options)

    def generate(n):
        yield from range(n * _ELEMENTS_PER_INPUT,
                         (n + 1) * _ELEMENTS_PER_INPUT)

    (p
     | Create(range(inputs))
     | ParDo(generate).with_output_types(int)
     | WriteToText(output_prefix, num_shards=shards))

    p.run()
Example #21
def examples_wordcount_templated(renames):
    """Templated WordCount example snippet."""
    import re

    import apache_beam as beam
    from apache_beam.io import ReadFromText
    from apache_beam.io import WriteToText
    from apache_beam.options.pipeline_options import PipelineOptions

    # [START example_wordcount_templated]
    class WordcountTemplatedOptions(PipelineOptions):
        @classmethod
        def _add_argparse_args(cls, parser):
            # Use add_value_provider_argument for arguments to be templatable
            # Use add_argument as usual for non-templatable arguments
            parser.add_value_provider_argument(
                '--input', help='Path of the file to read from')
            parser.add_argument('--output',
                                required=True,
                                help='Output file to write results to.')

    pipeline_options = PipelineOptions(['--output', 'some/output_path'])
    p = beam.Pipeline(options=pipeline_options)

    wordcount_options = pipeline_options.view_as(WordcountTemplatedOptions)
    lines = p | 'Read' >> ReadFromText(wordcount_options.input)
    # [END example_wordcount_templated]

    (lines
     | 'ExtractWords' >> beam.FlatMap(lambda x: re.findall(r'[A-Za-z\']+', x))
     | 'PairWithOnes' >> beam.Map(lambda x: (x, 1))
     | 'Group' >> beam.GroupByKey()
     | 'Sum' >> beam.MapTuple(lambda word, ones: (word, sum(ones)))
     | 'Format' >> beam.MapTuple(lambda word, c: '%s: %s' % (word, c))
     | 'Write' >> WriteToText(wordcount_options.output))

    p.visit(SnippetUtils.RenameFiles(renames))
    result = p.run()
    result.wait_until_finish()
Example #22
def run(args=None, save_main_session=True):
    """Runs the workflow computing total points from a collection of matches."""

    if args is None:
        args = sys.argv[1:]
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        required=True,
                        help='Input file to process.')
    parser.add_argument('--output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(args)
    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session
    with beam.Pipeline(options=pipeline_options) as p:

        # Register the custom coder for the Player class, so that it will be used in
        # the computation.
        coders.registry.register_coder(Player, PlayerCoder)

        (p  # pylint: disable=expression-not-assigned
         | ReadFromText(known_args.input)
         # The get_players function is annotated with a type hint above, so the type
         # system knows the output type of the following operation is a key-value
         # pair of a Player and an int. Please see the documentation for details on
         # types that are inferred automatically as well as other ways to specify
         # type hints.
         | beam.Map(get_players)
         # The output type hint of the previous step is used to infer that the key
         # type of the following operation is the Player type. Since a custom coder
         # is registered for the Player class above, a PlayerCoder will be used to
         # encode Player objects as keys for this combine operation.
         | beam.CombinePerKey(sum)
         | beam.Map(lambda k_v: '%s,%d' % (k_v[0].name, k_v[1]))
         | WriteToText(known_args.output))
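
# Example #22 assumes a Player class, a PlayerCoder and a type-hinted
# get_players parser defined elsewhere (Beam's group_with_coder sample uses
# this pattern). A minimal sketch, assuming "name,points" input lines; the
# encoding scheme and input format are assumptions.
import apache_beam as beam
from apache_beam import coders
from apache_beam.typehints import typehints

class Player(object):
    def __init__(self, name):
        self.name = name

class PlayerCoder(coders.Coder):
    def encode(self, player):
        return player.name.encode('utf-8')

    def decode(self, encoded):
        return Player(encoded.decode('utf-8'))

    def is_deterministic(self):
        return True

@beam.typehints.with_output_types(typehints.KV[Player, int])
def get_players(line):
    name, points = line.split(',')
    return Player(name), int(points)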
Example #23
def examples_ptransforms_templated(renames):
    # [START examples_ptransforms_templated]
    import apache_beam as beam
    from apache_beam.io import WriteToText
    from apache_beam.options.pipeline_options import PipelineOptions
    from apache_beam.options.value_provider import StaticValueProvider

    class TemplatedUserOptions(PipelineOptions):
        @classmethod
        def _add_argparse_args(cls, parser):
            parser.add_value_provider_argument('--templated_int', type=int)

    class MySumFn(beam.DoFn):
        def __init__(self, templated_int):
            self.templated_int = templated_int

        def process(self, an_int):
            yield self.templated_int.get() + an_int

    pipeline_options = PipelineOptions()
    p = beam.Pipeline(options=pipeline_options)

    user_options = pipeline_options.view_as(TemplatedUserOptions)
    my_sum_fn = MySumFn(user_options.templated_int)
    sum = (p
           | 'ReadCollection' >>
           beam.io.ReadFromText('gs://some/integer_collection')
           | 'StringToInt' >> beam.Map(lambda w: int(w))
           | 'AddGivenInt' >> beam.ParDo(my_sum_fn)
           | 'WriteResultingCollection' >> WriteToText('some/output_path'))
    # [END examples_ptransforms_templated]

    # Templates are not supported by DirectRunner (only by DataflowRunner)
    # so a value must be provided at graph-construction time
    my_sum_fn.templated_int = StaticValueProvider(int, 10)

    p.visit(SnippetUtils.RenameFiles(renames))
    result = p.run()
    result.wait_until_finish()
Example #24
def run(argv=None):
    """Runs the workflow computing total points from a collection of matches."""

    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        required=True,
                        help='Input file to process.')
    parser.add_argument('--output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)
    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as p:
        (p  # pylint: disable=expression-not-assigned
         | 'read' >> ReadFromText(known_args.input, coder=JsonCoder())
         | 'points' >> beam.FlatMap(compute_points)
         | beam.CombinePerKey(sum)
         | 'write' >> WriteToText(known_args.output, coder=JsonCoder()))
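
# Examples #8 and #24 read and write with a JsonCoder that is not shown. A
# minimal sketch of a coder that treats every element as one JSON document;
# not necessarily the author's implementation.
import json
import apache_beam as beam

class JsonCoder(beam.coders.Coder):
    """Hypothetical coder: encode/decode each element as a JSON document."""
    def encode(self, value):
        return json.dumps(value).encode('utf-8')

    def decode(self, encoded):
        return json.loads(encoded)

    def is_deterministic(self):
        return True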
Example #25
def run():
    PROJECT_ID = 'acquired-rarity-288205'
    BUCKET = 'gs://ykdb_beam/temp'

    options = {'project': PROJECT_ID}
    opts = beam.pipeline.PipelineOptions(flags=[], **options)

    p = beam.Pipeline('DirectRunner', options=opts)

    # ***************************************** REMOVE DUPLICATES ****************************************************
    sql = "SELECT job_title, employer_name, employer_city, employment_start_date, employment_end_date, soc_code, soc_title, prevailing_wage_YR, pw_wage_level, pw_wage_source, pw_wage_source_year, pw_wage_source_other, worksite_city, worksite_country, worksite_state, worksite_postal_code FROM (SELECT  *, COUNT(*) AS count FROM H_1B_refined.Occupation WHERE prevailing_wage_YR > 5000 AND length(soc_code) > 5 AND soc_code NOT LIKE '%-%' GROUP BY job_title, employer_name, employer_city, employment_start_date, employment_end_date, soc_code, soc_title, prevailing_wage_YR, pw_wage_level, pw_wage_source, pw_wage_source_year, pw_wage_source_other, worksite_city, worksite_country, worksite_state, worksite_postal_code HAVING count = 1) LIMIT 50"
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    out_pcoll_no_dup = query_results | 'Format prevailing_wage_YR and Remove dups' >> beam.ParDo(
        NoDuplicates())

    out_pcoll_fix_date = out_pcoll_no_dup | 'Format Date' >> beam.ParDo(
        FormatDate())

    out_pcoll = out_pcoll_fix_date | 'Format Soc' >> beam.ParDo(
        FormatSocCode())

    out_pcoll | 'Log output' >> WriteToText('output_occ_codeTest.txt')

    # ***************************************** INSERT INTO BQ ****************************************************
    dataset_id = 'H_1B_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Occ_CodeTest'
    schema_id = 'job_title:STRING, employer_name:STRING, employer_city:STRING, employment_start_date:Date, employment_end_date:Date, soc_code:STRING, soc_title:STRING, prevailing_wage_YR:FLOAT, pw_wage_level:STRING, pw_wage_source:STRING, pw_wage_source_year:INTEGER, pw_wage_source_other:STRING, worksite_city:STRING, worksite_country:STRING, worksite_state:STRING, worksite_postal_code:STRING'

    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    result = p.run()
    result.wait_until_finish()
Example #26
    def run(argv=None):
        import os
        d = os.path.dirname(os.path.realpath(__file__))
        parser = argparse.ArgumentParser()
        parser.add_argument("--input",
                            dest="input",
                            default=os.path.join(d, "data", "auctions.txt"))
        parser.add_argument("--output", dest="output", required=True)
        known_args, pipeline_args = parser.parse_known_args(argv)
        pipeline_args.extend([
            "--runner=DirectRunner",
            "--temp_location=/tmp/beam_tmp",
            "--job_name=test-job",
        ])

        pipeline_options = PipelineOptions(pipeline_args)
        pipeline_options.view_as(SetupOptions).save_main_session = True
        with beam.Pipeline(options=pipeline_options) as p:

            lines = p | ReadFromText(known_args.input)

            data = (
                lines
                | "Split" >>
                (beam.Map(lambda x: tuple(x.split("\t"))).with_output_types(
                    beam.typehints.Tuple[str, str]))
                | "Clean" >>
                beam.Map(lambda x:
                         (AvgPrice.format_artis(x[0]), int(x[1].strip()))))

            counts = AvgPrice.computeAvgPerKey(data)

            def format_result(word_count):
                (word, count) = word_count
                return "%s: %s" % (word, count)

            output = counts | "Format" >> beam.Map(format_result)
            output | WriteToText(known_args.output)
Example #27
def run():
    # set up location
    PROJECT_ID = 'trim-cistern-288221'
    BUCKET = 'gs://bhnk-milestone1-data'

    options = {'project': PROJECT_ID}
    opts = beam.pipeline.PipelineOptions(flags=[], **options)

    # executed with DirectRunner
    p = beam.Pipeline('DirectRunner', options=opts)

    # retrieve the data from imdb_refined dataset and save this information (location)
    sql = 'SELECT * FROM imdb_refined.Primary_Professions limit 250'
    bq_source = ReadFromBigQuery(query=sql,
                                 use_standard_sql=True,
                                 gcs_location=BUCKET)

    # use the previously saved information (location) and read from BigQuery
    # query results is now input P collection
    query_results = p | 'Read from BQ' >> beam.io.Read(bq_source)

    # Use ParDo to call function on query results
    out_pcoll = query_results | 'Split Primary Professions' >> beam.ParDo(
        SplitPrimaryProfessions())

    out_pcoll | 'Log output' >> WriteToText('output.txt')

    dataset_id = 'imdb_refined'
    table_id = PROJECT_ID + ':' + dataset_id + '.' + 'Primary_Professions_Beam'
    schema_id = 'nconst:STRING,primaryProfession:STRING'

    # write to BigQuery using the location set above
    out_pcoll | 'Write to BQ' >> WriteToBigQuery(
        table=table_id, schema=schema_id, custom_gcs_temp_location=BUCKET)

    # run and display results after everything is finished
    result = p.run()
    result.wait_until_finish()
Example #28
def run(argv=None):  # pylint: disable=missing-docstring

    parser = argparse.ArgumentParser()
    parser.add_argument('--grid_size',
                        dest='grid_size',
                        default=1000,
                        help='Size of the NxN matrix')
    parser.add_argument(
        '--coordinate_output',
        dest='coordinate_output',
        required=True,
        help='Output file to write the color coordinates of the image to.')
    parser.add_argument('--image_output',
                        dest='image_output',
                        default=None,
                        help='Output file to write the resulting image to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    with beam.Pipeline(argv=pipeline_args) as p:
        n = int(known_args.grid_size)

        coordinates = generate_julia_set_colors(p, complex(-.62772, .42193), n,
                                                100)

        def x_coord_key(x_y_i):
            (x, y, i) = x_y_i
            return (x, (x, y, i))

        # Group each coordinate triplet by its x value, then write the coordinates
        # to the output file with an x-coordinate grouping per line.
        # pylint: disable=expression-not-assigned
        (coordinates
         | 'x coord key' >> beam.Map(x_coord_key)
         | 'x coord' >> beam.GroupByKey()
         | 'format' >> beam.Map(
             lambda k_coords: ' '.join('(%s, %s, %s)' % c for c in k_coords[1]))
         | WriteToText(known_args.coordinate_output))
Example #29
def run():
    PROJECT_ID = 'cs327e-sp2020'  # change to your project id

    # Project ID is required when using the BQ source
    options = {'project': PROJECT_ID}
    opts = beam.pipeline.PipelineOptions(flags=[], **options)

    # Create beam pipeline using local runner
    p = beam.Pipeline('DirectRunner', options=opts)

    sql = 'SELECT * FROM covid_19_modeled.Event'
    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    # format timestamp
    ts_pcoll = query_results | 'Format Timestamp' >> beam.ParDo(
        FormatTimestampFn())

    # write new PCollection to log file
    ts_pcoll | 'Write log' >> WriteToText('ts_pcoll.txt')

    dataset_id = 'covid_19_modeled'
    table_id = 'Event_Beam'
    schema_id = '''location_id:INTEGER,last_update:DATETIME,
                    confirmed:INTEGER,deaths:INTEGER,recovered:INTEGER'''

    # write new PCollection to BQ table
    ts_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

    result = p.run()
    result.wait_until_finish()
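
# Example #29 formats timestamps with a FormatTimestampFn that is not shown.
# A minimal sketch, assuming the raw last_update value arrives as a string
# such as '3/22/2020 23:45' and must become a BigQuery DATETIME; the input
# format is an assumption.
import datetime
import apache_beam as beam

class FormatTimestampFn(beam.DoFn):
    def process(self, element):
        # Hypothetical: normalize last_update to 'YYYY-MM-DD HH:MM:SS'.
        parsed = datetime.datetime.strptime(element['last_update'], '%m/%d/%Y %H:%M')
        element['last_update'] = parsed.strftime('%Y-%m-%d %H:%M:%S')
        yield element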
Example #30
def run(p, input_file, output_file):
    # Read the text file[pattern] into a PCollection.
    lines = p | 'read' >> ReadFromText(input_file)

    counts = (lines
              | 'split' >>
              (beam.ParDo(WordExtractingDoFn()).with_output_types(bytes))
              | 'count' >> beam.ExternalTransform(
                  'beam:transforms:xlang:count', None, EXPANSION_SERVICE_ADDR))

    # Format the counts into a PCollection of strings.
    def format_result(word_count):
        (word, count) = word_count
        return '%s: %d' % (word, count)

    output = counts | 'format' >> beam.Map(format_result)

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | 'write' >> WriteToText(output_file)

    result = p.run()
    result.wait_until_finish()