Example No. 1
def main(gcs_path, out, start=None, end=None, pipeline_args=None):
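    # Ordered list of pipeline steps; the optional start/end indices below let a
    # run resume from, or stop at, an intermediate step via a JSON dump on GCS.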
    steps = [
        apache_beam.FlatMap('Parse XML and filter', parse_xml),
        apache_beam.Map(
            'Coerce "wikitext" key to string type',
            force_string_function('wikitext')),
        apache_beam.FlatMap('Parse markdown into plaintext', parse_wikitext),
        apache_beam.Map(
            'Coerce "text" key to string type', force_string_function('text')),
        apache_beam.Map(
            'Filter out any vestigial HTML', html_to_text),

        core.ParDo('batch', BatchFn(10)),
        apache_beam.FlatMap(
            'Entities (batch)', analyze_entities_batch),
    ]

    p = apache_beam.Pipeline(argv=pipeline_args)

    if start:
        value = p | apache_beam.Read(
            'Pick up at step {}'.format(start), apache_beam.io.TextFileSource(
                gcs_path)) | \
            apache_beam.Map('Parse JSON', json.loads)
    else:
        value = p | apache_beam.Read(
            'Read XML', custom_sources.XmlFileSource('page', gcs_path))

    for step in steps[start:end]:
        value = value | step

    if end:
        if not out.startswith('gs://'):
            raise ValueError('Output must be GCS path if an end is specified.')
        value = value | apache_beam.Map('to JSON', json.dumps) | \
            apache_beam.Write('Dump to GCS', apache_beam.io.TextFileSink(out))
    else:
        value = value | apache_beam.Write(
            'Dump metadata to BigQuery', apache_beam.io.BigQuerySink(
                out,
                schema=', '.join([
                    'article_id:STRING',
                    'article_title:STRING',
                    'article_sentiment_polarity:FLOAT',
                    'article_sentiment_magnitude:FLOAT',
                    'entity_name:STRING',
                    'entity_type:STRING',
                    'entity_wikipedia_url:STRING',
                    'entity_salience:FLOAT',
                    'entity_num_mentions:INTEGER',
                ]),
                create_disposition=(
                    apache_beam.io.BigQueryDisposition.CREATE_IF_NEEDED),
                write_disposition=(
                    apache_beam.io.BigQueryDisposition.WRITE_APPEND)))

    p.run()
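
Note: this example uses the label-as-first-argument form of the pre-1.0 Cloud Dataflow SDK. In current Apache Beam releases the label is attached with >>, as the later examples here do; a minimal sketch of the first two steps in that form (reusing the function names from the example above, assuming a 2.x SDK) would be:

value = (value
         | 'Parse XML and filter' >> apache_beam.FlatMap(parse_xml)
         | 'Coerce "wikitext" key to string type' >> apache_beam.Map(
             force_string_function('wikitext')))
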
Example No. 2
def __run_ingestion(self, storage_input_path, columns, output_table):
    (self.pipeline
     | output_table + ': read table' >> ReadFromAvro(storage_input_path)
     | output_table + ': filter columns' >> beam.Map(self.__filter_columns, columns=columns)
     | output_table + ': write to BigQuery' >> beam.Write(
         beam.io.BigQuerySink(output_table,
                              # CREATE_NEVER assumes the destination table already exists.
                              create_disposition=beam.io.BigQueryDisposition.CREATE_NEVER,
                              write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND)))
Example No. 3
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        required=True,
                        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    parser.add_argument('--model',
                        dest='model',
                        required=True,
                        help='Checkpoint file of the model.')
    parser.add_argument('--source',
                        dest='source',
                        required=True,
                        help='Data source location (cs|bq).')
    known_args, pipeline_args = parser.parse_known_args(argv)

    if known_args.source == 'cs':
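        # 'cs' (text file) branch: each input line is expected to look like "key:image".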

        def _to_dictionary(line):
            result = {}
            result['key'], result['image'] = line.split(':')
            return result

        p = beam.Pipeline(argv=pipeline_args)
        images = (p | 'ReadFromText' >> beam.io.ReadFromText(known_args.input)
                  | 'ConvertToDict' >> beam.Map(_to_dictionary))
        predictions = images | 'Prediction' >> beam.ParDo(
            PredictDoFn(), known_args.model)
        predictions | 'WriteToText' >> beam.io.WriteToText(known_args.output)

    else:
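        # BigQuery branch: the output schema has one FLOAT column per prediction index (pred0..pred9).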
        schema = 'key:INTEGER'
        for i in range(10):
            schema += (', pred%d:FLOAT' % i)
        p = beam.Pipeline(argv=pipeline_args)
        images = p | 'ReadFromBQ' >> beam.Read(
            beam.io.BigQuerySource(known_args.input))
        predictions = images | 'Prediction' >> beam.ParDo(
            PredictDoFn(), known_args.model)
        predictions | 'WriteToBQ' >> beam.Write(
            beam.io.BigQuerySink(
                known_args.output,
                schema=schema,
                create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

    logging.getLogger().setLevel(logging.INFO)
    p.run()
Example No. 4
def main(src_path, dest_table, pipeline_args):
    p = apache_beam.Pipeline(argv=pipeline_args)

    value = p | 'Read JSON' >> apache_beam.Read(JsonFileSource(src_path))

    value |= (
        'Remove records that lack location or year data' >>
        apache_beam.FlatMap(discard_incomplete))
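    # FlatMap (rather than Map) lets these filtering steps drop a record by yielding nothing.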

    value |= (
        'Convert string values to their types' >>
        apache_beam.Map(convert_types))

    value |= (
        'Filter bad data' >>
        apache_beam.FlatMap(filter_suspicious))

    value |= (
        'Massage fields with "rec" prefix' >>
        apache_beam.Map(massage_rec))

    value |= (
        'Dump data to BigQuery' >>
        apache_beam.Write(apache_beam.io.BigQuerySink(
            dest_table,
            schema=', '.join([
                'fall:STRING',
                'year:INTEGER',
                'nametype:STRING',
                'mass:FLOAT',
                'name:STRING',
                'class:STRING',
                'latitude:FLOAT',
                'longitude:FLOAT',
                'id:STRING',
            ]),
            create_disposition=(
                apache_beam.io.BigQueryDisposition.CREATE_IF_NEEDED),
            write_disposition=(
                apache_beam.io.BigQueryDisposition.WRITE_TRUNCATE))))

    p.run()
Example No. 5
                    (SELECT __key__.id as accnt_id
                    FROM [lead-pages:leadpages.Account_cleansed] LIMIT 100)"""

options = PipelineOptions(flags=sys.argv)

# For Cloud execution, set the Cloud Platform project, job_name,
# staging location, temp_location and specify DataflowRunner.
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.project = project
google_cloud_options.job_name = 'lp-analysis'
google_cloud_options.staging_location = 'gs://lp_activity_transform/staging'
google_cloud_options.temp_location = 'gs://lp_activity_transform/temp'
options.view_as(StandardOptions).runner = 'DataflowRunner'

p = beam.Pipeline(options=options)
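# Sum view counts per account and emit one {'account_id', 'total_views'} row per key.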
(p
 | 'read' >> beam.Read(beam.io.BigQuerySource(query=input_query))
 | 'cast ints' >> beam.Map(lambda row: (row['account_id'], int(row['views'])))
 | beam.CombinePerKey(sum)
 | 'format for gbq' >> beam.Map(lambda kv: {
     'account_id': kv[0],
     'total_views': kv[1]
 })
 | 'save' >> beam.Write(
     beam.io.BigQuerySink(
         output_table,
         schema='account_id:INTEGER, total_views:INTEGER',
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))
p.run()
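
In current Beam releases, beam.Write(beam.io.BigQuerySink(...)) has been superseded by beam.io.WriteToBigQuery; a rough sketch of the 'save' step under that API (same table, schema and dispositions; the name "rows" stands in for the upstream PCollection and is not part of the original) would be:

rows | 'save' >> beam.io.WriteToBigQuery(
    output_table,
    schema='account_id:INTEGER, total_views:INTEGER',
    create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
    write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)
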
Example No. 6
            'gamma': 1.2
        }

        model = xgb.train(best_params,
                          dtrain,
                          num_boost_round=1000,
                          evals=watchlist,
                          evals_result=evals_result,
                          verbose_eval=True)

        test.loc[:, "predict"] = model.predict(dtest)

        return test[["shop_id", "date", "predict",
                     "sales"]].to_dict(orient='records')

    (pipeline
     | "Query data" >> beam.Read(beam.io.BigQuerySource(query=query))
     | "Assign time" >> beam.Map(assign_timevalue)
     | "Set window" >> beam.WindowInto(window.SlidingWindows(size=3, period=1))
     | "Set group key" >> beam.Map(lambda v: ('shop_id', v))
     | beam.GroupByKey()
     | "Learn and predict" >> beam.FlatMap(learn_predict)
     | "Write data" >> beam.Write(
         beam.io.BigQuerySink(
             'dataset.table',
             schema="shop_id:STRING, date:STRING, predict:FLOAT, sales:INTEGER",
             write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)))

    pipeline.run()
Example No. 7
import apache_beam as beam
project = 'teak-proton-148317'
input_table = 'clouddataflow-readonly:samples.weather_stations'
output_table = 'mydataset.weather_copy_from_dataflow1'

p = beam.Pipeline(argv=['--project', project])

read = beam.Read(beam.io.BigQuerySource(input_table))

tornadoesMonths = beam.FlatMap(lambda row: [(int(row['month']), 1)]
                               if row['tornado'] else [])

monthlyCount = beam.CombinePerKey(sum)
frmat = beam.Map(lambda kv: {'month': kv[0], 'tornado_count': kv[1]})
sve = beam.Write(
    beam.io.BigQuerySink(
        output_table,
        schema='month:INTEGER, tornado_count:INTEGER',
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

(p | read | tornadoesMonths | monthlyCount | frmat | sve)

p.run()
Example No. 8
import apache_beam as beam
p = beam.Pipeline('DirectPipelineRunner')
# Read a file containing names, add a greeting to each name, and write to a file.
"""(p
 | 'load names' >> beam.Read(beam.io.TextFileSource('gs://dataflow-samples/shakespeare/kinglear.txt'))
 | 'add greeting' >> beam.Map(lambda name, msg: '%s, %s!' % (msg, name), 'Hello')
 | 'save' >> beam.Write(beam.io.TextFileSink('./greetings')))
"""

read = beam.Read(beam.io.TextFileSource('./names'))
maps = beam.Map(lambda a, b: "Hii:" + b, "Last Params")
wr = beam.Write(beam.io.TextFileSink('./namesoutput4'))

p | read | maps | wr

p.run()
Example No. 9
def write_text_file(pcollection, label, output_name,
                    coder=beam.coders.ToStringCoder()):
  return pcollection | label >> beam.Write(beam.io.TextFileSink(
      os.path.join(args.output_dir, output_name),
      shard_name_template='',
      coder=coder))
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.project = 'teak-proton-148317'
google_cloud_options.job_name = 'myjob'
google_cloud_options.staging_location = 'gs://sanjay-mybucket/binaries'
google_cloud_options.temp_location = 'gs://sanjay-mybucket/temp'
options.view_as(StandardOptions).runner = 'DataflowPipelineRunner'

p = beam.Pipeline('DataflowPipelineRunner', options)

(p
 | 'read' >> beam.Read(
     beam.io.TextFileSource('gs://dataflow-samples/shakespeare/kinglear.txt'))
 | 'split' >> beam.FlatMap(lambda x: re.findall(r'\w+', x))
 | 'count words' >> beam.combiners.Count.PerElement()
 | 'save' >> beam.Write(beam.io.TextFileSink('./word_count')))

p.run()
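
For comparison, TextFileSource/TextFileSink and the runner-name Pipeline constructor belong to the old Cloud Dataflow SDK; a rough current-Beam equivalent of the same word count (assuming a 2.x SDK, with the runner selected through PipelineOptions) is:

import re
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

with beam.Pipeline(options=PipelineOptions()) as p:
    (p
     | 'read' >> beam.io.ReadFromText('gs://dataflow-samples/shakespeare/kinglear.txt')
     | 'split' >> beam.FlatMap(lambda x: re.findall(r'\w+', x))
     | 'count words' >> beam.combiners.Count.PerElement()
     | 'save' >> beam.io.WriteToText('./word_count'))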
