Example #1
def main(gcs_path, out, start=None, end=None, pipeline_args=None):
    steps = [
        apache_beam.FlatMap('Parse XML and filter', parse_xml),
        apache_beam.Map(
            'Coerce "wikitext" key to string type',
            force_string_function('wikitext')),
        apache_beam.FlatMap('Parse markdown into plaintext', parse_wikitext),
        apache_beam.Map(
            'Coerce "text" key to string type', force_string_function('text')),
        apache_beam.Map(
            'Filter out any vestigial HTML', html_to_text),

        core.ParDo('batch', BatchFn(10)),
        apache_beam.FlatMap(
            'Entities (batch)', analyze_entities_batch),
    ]

    p = apache_beam.Pipeline(argv=pipeline_args)

    if start:
        value = p | apache_beam.Read(
            'Pick up at step {}'.format(start), apache_beam.io.TextFileSource(
                gcs_path)) | \
            apache_beam.Map('Parse JSON', json.loads)
    else:
        value = p | apache_beam.Read(
            'Read XML', custom_sources.XmlFileSource('page', gcs_path))

    for step in steps[start:end]:
        value = value | step

    if end:
        if not out.startswith('gs://'):
            raise ValueError('Output must be GCS path if an end is specified.')
        value = value | apache_beam.Map('to JSON', json.dumps) | \
            apache_beam.Write('Dump to GCS', apache_beam.io.TextFileSink(out))
    else:
        value = value | apache_beam.Write(
            'Dump metadata to BigQuery', apache_beam.io.BigQuerySink(
                out,
                schema=', '.join([
                    'article_id:STRING',
                    'article_title:STRING',
                    'article_sentiment_polarity:FLOAT',
                    'article_sentiment_magnitude:FLOAT',
                    'entity_name:STRING',
                    'entity_type:STRING',
                    'entity_wikipedia_url:STRING',
                    'entity_salience:FLOAT',
                    'entity_num_mentions:INTEGER',
                ]),
                create_disposition=(
                    apache_beam.io.BigQueryDisposition.CREATE_IF_NEEDED),
                write_disposition=(
                    apache_beam.io.BigQueryDisposition.WRITE_APPEND)))

    p.run()
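
The BatchFn helper referenced in the 'batch' step above is not defined in this snippet. A minimal sketch, assuming it only buffers elements into fixed-size lists and that the legacy SDK's context-based DoFn signature (matching the label-first transforms used here) applies:

class BatchFn(apache_beam.DoFn):
    """Hypothetical batching DoFn: emits elements in lists of batch_size."""

    def __init__(self, batch_size):
        super(BatchFn, self).__init__()
        self._batch_size = batch_size
        self._batch = []

    def process(self, context):
        self._batch.append(context.element)
        if len(self._batch) >= self._batch_size:
            yield self._batch
            self._batch = []

    def finish_bundle(self, context):
        # Flush any leftover elements at the end of the bundle.
        if self._batch:
            yield self._batch
            self._batch = []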
Example #2
def preprocess(pipeline, training_data, eval_data, predict_data, output_dir):
    feature_set = iris.IrisFeatures()

    training_data_source = beam.io.TextFileSource(
        training_data,
        strip_trailing_newlines=True,
        coder=io.CsvCoder.from_feature_set(feature_set,
                                           feature_set.csv_columns))

    eval_data_source = beam.io.TextFileSource(
        eval_data,
        strip_trailing_newlines=True,
        coder=io.CsvCoder.from_feature_set(feature_set,
                                           feature_set.csv_columns))

    predict_data_source = beam.io.TextFileSource(
        predict_data,
        strip_trailing_newlines=True,
        coder=io.CsvCoder.from_feature_set(feature_set,
                                           feature_set.csv_columns,
                                           has_target_columns=False))

    train = pipeline | beam.Read('ReadTrainingData', training_data_source)
    evaluate = pipeline | beam.Read('ReadEvalData', eval_data_source)
    predict = pipeline | beam.Read('ReadPredictData', predict_data_source)

    # TODO(b/32726166) Update input_format and format_metadata to read these
    # values directly from the coder.
    (metadata, train_features, eval_features, predict_features) = (
        (train, evaluate, predict)
        | 'Preprocess' >> ml.Preprocess(
            feature_set,
            input_format='csv',
            format_metadata={'headers': feature_set.csv_columns}))

    # Writes the metadata.yaml, features_train, features_eval, and
    # features_predict files.
    # pylint: disable=expression-not-assigned
    (metadata | 'SaveMetadata' >> io.SaveMetadata(
        os.path.join(output_dir, 'metadata.yaml')))

    # We turn off sharding of these feature files because the dataset is
    # very small.
    (train_features | 'SaveTrain' >> io.SaveFeatures(
        os.path.join(output_dir, 'features_train')))
    (eval_features | 'SaveEval' >> io.SaveFeatures(
        os.path.join(output_dir, 'features_eval')))
    (predict_features | 'SavePredict' >> io.SaveFeatures(
        os.path.join(output_dir, 'features_predict')))
    # pylint: enable=expression-not-assigned

    return metadata, train_features, eval_features, predict_features
Example #3
def run(project, bucket):
   argv = [
      '--project={0}'.format(project),
      '--job_name=ch03timecorr',
      '--save_main_session',
      '--staging_location=gs://{0}/flights/staging/'.format(bucket),
      '--temp_location=gs://{0}/flights/temp/'.format(bucket),
      '--setup_file=./setup.py',
      '--max_num_workers=10',
      '--autoscaling_algorithm=THROUGHPUT_BASED',
      '--runner=DataflowPipelineRunner'
   ]
   airports_filename = 'gs://{}/flights/airports/airports.csv.gz'.format(bucket)
   flights_raw_files = 'gs://{}/flights/raw/*.csv'.format(bucket)
   flights_output = 'gs://{}/flights/tzcorr/all_flights'.format(bucket)
   events_output = '{}:flights.simevents'.format(project)

   pipeline = beam.Pipeline(argv=argv)
   
   airports = (pipeline 
      | 'airports:read' >> beam.Read(beam.io.TextFileSource(airports_filename))
      | 'airports:fields' >> beam.Map(lambda line: next(csv.reader([line])))
      | 'airports:tz' >> beam.Map(lambda fields: (fields[0], addtimezone(fields[21], fields[26])))
   )

   flights = (pipeline 
      | 'flights:read' >> beam.Read(beam.io.TextFileSource(flights_raw_files))
      | 'flights:tzcorr' >> beam.FlatMap(tz_correct, beam.pvalue.AsDict(airports))
   )

   (flights 
      | 'flights:tostring' >> beam.Map(lambda fields: ','.join(fields)) 
      | 'flights:out' >> beam.io.textio.WriteToText(flights_output)
   )

   events = flights | beam.FlatMap(get_next_event)

   schema = (
      'FL_DATE:date,UNIQUE_CARRIER:string,AIRLINE_ID:string,CARRIER:string,'
      'FL_NUM:string,ORIGIN_AIRPORT_ID:string,ORIGIN_AIRPORT_SEQ_ID:integer,'
      'ORIGIN_CITY_MARKET_ID:string,ORIGIN:string,DEST_AIRPORT_ID:string,'
      'DEST_AIRPORT_SEQ_ID:integer,DEST_CITY_MARKET_ID:string,DEST:string,'
      'CRS_DEP_TIME:timestamp,DEP_TIME:timestamp,DEP_DELAY:float,'
      'TAXI_OUT:float,WHEELS_OFF:timestamp,WHEELS_ON:timestamp,TAXI_IN:float,'
      'CRS_ARR_TIME:timestamp,ARR_TIME:timestamp,ARR_DELAY:float,'
      'CANCELLED:string,CANCELLATION_CODE:string,DIVERTED:string,'
      'DISTANCE:float,DEP_AIRPORT_LAT:float,DEP_AIRPORT_LON:float,'
      'DEP_AIRPORT_TZOFFSET:float,ARR_AIRPORT_LAT:float,ARR_AIRPORT_LON:float,'
      'ARR_AIRPORT_TZOFFSET:float,EVENT:string,NOTIFY_TIME:timestamp,'
      'EVENT_DATA:string')

   (events 
      | 'events:totablerow' >> beam.Map(lambda fields: create_row(fields)) 
      | 'events:out' >> beam.io.Write(beam.io.BigQuerySink(
                              events_output, schema=schema,
                              write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE,
                              create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED))
   )

   pipeline.run()
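
The addtimezone and tz_correct helpers come from elsewhere in this project. As one hedged illustration, addtimezone could be implemented with the timezonefinder package (an assumption, not necessarily what the original uses):

def addtimezone(lat, lon):
    try:
        import timezonefinder  # imported locally so workers resolve the dependency
        tf = timezonefinder.TimezoneFinder()
        # Return the original coordinates plus the IANA timezone name.
        return (lat, lon, tf.timezone_at(lng=float(lon), lat=float(lat)))
    except ValueError:
        # Header rows or malformed coordinates fall through to a placeholder.
        return (lat, lon, 'TIMEZONE')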
Example #4
def run():
   parser = argparse.ArgumentParser(description='Compute monthly NDVI')
   parser.add_argument('--index_file', default='2015index.txt.gz', help='default=2015index.txt.gz  Use gs://gcp-public-data-landsat/index.csv.gz to process full dataset')
   parser.add_argument('--output_file', default='output.txt', help='default=output.txt Supply a location on GCS when running on cloud')
   parser.add_argument('--output_dir', required=True, help='Where should the ndvi images be stored? Supply a GCS location when running on cloud')
   known_args, pipeline_args = parser.parse_known_args()
 
   p = beam.Pipeline(argv=pipeline_args)
   index_file = known_args.index_file
   output_file = known_args.output_file
   output_dir = known_args.output_dir

   # lat =   4.37; lon =  -7.71  # Cape Palmas
   lat = -21.1; lon = 55.50      # Reunion Island

   # Read the index file and find the best look
   scenes = (p
      | 'read_index' >> beam.Read(beam.io.TextFileSource(index_file))
      | 'filter_scenes' >> beam.FlatMap(lambda line: filterScenes(line, lat, lon) )
      | 'least_cloudy' >> beam.CombinePerKey(clearest)
   )

   # write out info about scene
   scenes | beam.Map(lambda (yrmon, scene): scene.__dict__) | 'scene_info' >> beam.io.textio.WriteToText(output_file)

   # compute ndvi on scene
   scenes | 'compute_ndvi' >> beam.Map(lambda (yrmon, scene): ndvi.computeNdvi(scene.BASE_URL, output_dir))

   p.run()
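
The clearest combine function is not shown. A plausible sketch, assuming each scene object exposes a CLOUD_COVER attribute and that the least cloudy scene per key is the one to keep:

def clearest(scenes):
    # Hypothetical: of all scenes grouped under a key, keep the least cloudy one.
    scenes = list(scenes)
    return min(scenes, key=lambda s: float(s.CLOUD_COVER)) if scenes else None

Because min is associative, this also behaves correctly if the runner applies the combine function to partial groups before the final merge.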
Example #5
def configure_pipeline(p, opt):
    """Specify PCollection and transformations in pipeline."""
    input_source = beam.io.TextFileSource(opt.input_path,
                                          strip_trailing_newlines=True)
    label_source = beam.io.TextFileSource(opt.input_dict,
                                          strip_trailing_newlines=True)
    labels = (p | 'Read dictionary' >> beam.Read(label_source))
    _ = (p
         | 'Read input' >> beam.Read(input_source)
         | 'Parse input' >> beam.Map(lambda line: csv.reader([line]).next())
         | 'Extract label ids' >> beam.ParDo(ExtractLabelIdsDoFn(),
                                             beam.pvalue.AsIter(labels))
         | 'Read and convert to JPEG' >> beam.ParDo(
             ReadImageAndConvertToJpegDoFn())
         | 'Embed and make TFExample' >> beam.ParDo(TFExampleFromImageDoFn())
         | 'Save to disk' >> SaveFeatures(opt.output_path))
Example #6
    def test_run_direct(self):
        file_name = self._create_temp_file('aaaa\nbbbb\ncccc\ndddd')
        pipeline = TestPipeline()
        pcoll = pipeline | beam.Read(LineSource(file_name))
        assert_that(pcoll, equal_to(['aaaa', 'bbbb', 'cccc', 'dddd']))

        pipeline.run()
Example #7
def run(argv=None):
    """Constructs and runs the example filtering pipeline."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        help='BigQuery table to read from.',
        default='clouddataflow-readonly:samples.weather_stations')
    parser.add_argument('--output',
                        required=True,
                        help='BigQuery table to write to.')
    parser.add_argument('--month_filter',
                        default=7,
                        help='Numeric value of month to filter on.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    p = beam.Pipeline(argv=pipeline_args)

    input_data = p | beam.Read(beam.io.BigQuerySource(known_args.input))

    # pylint: disable=expression-not-assigned
    (filter_cold_days(input_data, known_args.month_filter)
     | 'SaveToBQ' >> beam.io.Write(
         beam.io.BigQuerySink(
             known_args.output,
             schema='year:INTEGER,month:INTEGER,day:INTEGER,mean_temp:FLOAT',
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))

    # Actually run the pipeline (all operations above are deferred).
    p.run()
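
filter_cold_days is defined elsewhere in the original example. A hedged reconstruction of what it plausibly does: project the fields of interest, compute the global mean temperature as a singleton side input, and keep readings from the requested month whose mean_temp falls below that mean (the real transform may differ in detail):

def filter_cold_days(input_data, month_filter):
    # Keep only the fields the output table needs.
    projected = input_data | 'Project' >> beam.Map(
        lambda row: {f: row[f] for f in ('year', 'month', 'day', 'mean_temp')})

    # Global mean temperature, used as a singleton side input below.
    global_mean = beam.pvalue.AsSingleton(
        projected
        | 'ExtractMean' >> beam.Map(lambda row: row['mean_temp'])
        | 'GlobalMean' >> beam.combiners.Mean.Globally())

    return (projected
            | 'FilterColdDays' >> beam.Filter(
                lambda row, mean: int(row['month']) == int(month_filter) and
                row['mean_temp'] < mean,
                global_mean))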
Example #8
def run():
    import os
    parser = argparse.ArgumentParser(description='Compute monthly NDVI')
    parser.add_argument(
        '--index_file',
        default='2015index.txt.gz',
        help='default=2015index.txt.gz ... gs://cloud-training-demos/landsat/2015index.txt.gz'
        '  Use gs://gcp-public-data-landsat/index.csv.gz to process full dataset')
    parser.add_argument(
        '--output_file',
        default='output.txt',
        help='default=output.txt Supply a location on GCS when running on cloud'
    )
    parser.add_argument(
        '--output_dir',
        required=True,
        help=
        'Where should the ndvi images be stored? Supply a GCS location when running on cloud'
    )
    known_args, pipeline_args = parser.parse_known_args()

    p = beam.Pipeline(argv=pipeline_args)
    index_file = known_args.index_file
    output_file = known_args.output_file
    output_dir = known_args.output_dir

    lat = -21.1
    lon = 55.50  # center of Reunion Island
    dlat = 0.4
    dlon = 0.4

    # Read the index file and find all scenes that cover this area
    allscenes = (
        p
        | 'read_index' >> beam.Read(beam.io.TextFileSource(index_file))
        | 'to_scene' >> beam.Map(lambda line: SceneInfo(line))
        | 'by_area' >> beam.FlatMap(lambda scene: filterByArea(
            scene, lat + dlat, lon - dlon, lat - dlat, lon + dlon)))

    # For each month and spacecraft coverage pattern (given by the path and
    # row), find the clearest scene.
    scenes = (
        allscenes
        | 'cov_month' >> beam.Map(lambda scene: (scene.month_path_row(), scene))
        | 'least_cloudy' >> beam.CombinePerKey(clearest)
        | 'yrmon-scene' >> beam.Map(lambda (key, scene): (scene.yrmon(), scene)))

    # write out info about scene
    scenes | beam.Map(
        lambda (yrmon, scene): '{}: {}'.format(yrmon, scene.SCENE_ID)
    ) | 'scene_info' >> beam.io.textio.WriteToText(output_file)

    # compute ndvi on scene
    scenes | 'compute_ndvi' >> beam.Map(lambda (yrmon, scene): ndvi.computeNdvi(
        scene.BASE_URL, os.path.join(output_dir, yrmon), scene.SPACECRAFT_ID))

    p.run()
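
filterByArea and SceneInfo are project-specific helpers not shown here. A minimal sketch of filterByArea, assuming the scene carries its footprint as NORTH_LAT, SOUTH_LAT, WEST_LON, and EAST_LON attributes (names assumed) and should be kept only when it covers the whole requested box:

def filterByArea(scene, north, west, south, east):
    # Hypothetical: emit the scene only if its footprint covers the bounding box.
    if (float(scene.NORTH_LAT) >= north and float(scene.SOUTH_LAT) <= south and
            float(scene.WEST_LON) <= west and float(scene.EAST_LON) >= east):
        yield scene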
Example #9
def preprocess(pipeline):
  feature_set = iris.IrisFeatures()

  training_data = beam.io.TextFileSource(
      args.training_data, strip_trailing_newlines=True,
      coder=io.CsvCoder.from_feature_set(feature_set, feature_set.csv_columns))

  eval_data = beam.io.TextFileSource(
      args.eval_data, strip_trailing_newlines=True,
      coder=io.CsvCoder.from_feature_set(feature_set, feature_set.csv_columns))

  predict_data = beam.io.TextFileSource(
      args.predict_data, strip_trailing_newlines=True,
      coder=io.CsvCoder.from_feature_set(feature_set, feature_set.csv_columns,
                                         has_target_columns=False))

  train = pipeline | beam.Read('ReadTrainingData', training_data)
  evaluate = pipeline | beam.Read('ReadEvalData', eval_data)
  predict = pipeline | beam.Read('ReadPredictData', predict_data)

  (metadata, train_features, eval_features, predict_features) = (
      (train, evaluate, predict)
      | 'Preprocess'
      >> ml.Preprocess(feature_set, input_format='csv',
                       format_metadata={'headers': feature_set.csv_columns}))

  # Writes metadata.yaml (text file), features_train, features_eval, and
  # features_predict (TFRecord files)
  (metadata | 'SaveMetadata'
            >> io.SaveMetadata(os.path.join(args.output_dir, 'metadata.yaml')))

  # We turn off sharding of the feature files because the dataset is very small.
  (train_features | 'SaveTrain'
                  >> io.SaveFeatures(
                      os.path.join(args.output_dir, 'features_train')))
  (eval_features | 'SaveEval'
                 >> io.SaveFeatures(
                     os.path.join(args.output_dir, 'features_eval')))
  (predict_features | 'SavePredict'
                    >> io.SaveFeatures(
                        os.path.join(args.output_dir, 'features_predict')))

  return metadata, train_features, eval_features, predict_features
Example #10
  def test_run_concat_direct(self):
    source = ConcatSource([RangeSource(0, 10),
                           RangeSource(10, 100),
                           RangeSource(100, 1000),
                          ])
    pipeline = TestPipeline()
    pcoll = pipeline | beam.Read(source)
    assert_that(pcoll, equal_to(range(1000)))

    pipeline.run()
Example #11
 def test_process_auto(self):
     path = os.path.join(self._new_tempdir(), 'result.gz')
     self._write_file_gzip(path, FOO_BAR_RECORD_BASE64)
     with beam.Pipeline(DirectRunner()) as p:
         result = (p
                   | beam.Read(
                       _TFRecordSource(
                           path,
                           coder=coders.BytesCoder(),
                           compression_type=fileio.CompressionTypes.AUTO)))
         beam.assert_that(result, beam.equal_to(['foo', 'bar']))
Example #12
 def test_process_single(self):
     path = os.path.join(self._new_tempdir(), 'result')
     self._write_file(path, FOO_RECORD_BASE64)
     with TestPipeline() as p:
         result = (p
                   | beam.Read(
                       _TFRecordSource(
                           path,
                           coder=coders.BytesCoder(),
                           compression_type=fileio.CompressionTypes.AUTO)))
         beam.assert_that(result, beam.equal_to(['foo']))
Example #13
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
                        required=True,
                        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    parser.add_argument('--model',
                        dest='model',
                        required=True,
                        help='Checkpoint file of the model.')
    parser.add_argument('--source',
                        dest='source',
                        required=True,
                        help='Data source location (cs|bq).')
    known_args, pipeline_args = parser.parse_known_args(argv)

    if known_args.source == 'cs':

        def _to_dictionary(line):
            result = {}
            result['key'], result['image'] = line.split(':')
            return result

        p = beam.Pipeline(argv=pipeline_args)
        images = (p | 'ReadFromText' >> beam.io.ReadFromText(known_args.input)
                  | 'ConvertToDict' >> beam.Map(_to_dictionary))
        predictions = images | 'Prediction' >> beam.ParDo(
            PredictDoFn(), known_args.model)
        predictions | 'WriteToText' >> beam.io.WriteToText(known_args.output)

    else:
        schema = 'key:INTEGER'
        for i in range(10):
            schema += (', pred%d:FLOAT' % i)
        p = beam.Pipeline(argv=pipeline_args)
        images = p | 'ReadFromBQ' >> beam.Read(
            beam.io.BigQuerySource(known_args.input))
        predictions = images | 'Prediction' >> beam.ParDo(
            PredictDoFn(), known_args.model)
        predictions | 'WriteToBQ' >> beam.Write(
            beam.io.BigQuerySink(
                known_args.output,
                schema=schema,
                create_disposition=beam.io.BigQueryDisposition.
                CREATE_IF_NEEDED,
                write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

    logging.getLogger().setLevel(logging.INFO)
    p.run()
Example #14
 def test_process_gzip(self):
     path = os.path.join(self._new_tempdir(), 'result')
     self._write_file_gzip(path, FOO_BAR_RECORD_BASE64)
     with TestPipeline() as p:
         result = (p
                   | beam.Read(
                       _TFRecordSource(
                           path,
                           coder=coders.BytesCoder(),
                           compression_type=CompressionTypes.GZIP,
                           validate=True)))
         beam.assert_that(result, beam.equal_to(['foo', 'bar']))
Example #15
  def test_read_auto_single_file_gzip(self):
    _, lines = write_data(10)
    filename = tempfile.NamedTemporaryFile(
        delete=False, prefix=tempfile.template, suffix='.gz').name
    with gzip.GzipFile(filename, 'wb') as f:
      f.write('\n'.join(lines))

    pipeline = TestPipeline()
    pcoll = pipeline | 'Read' >> beam.Read(LineSource(
        filename,
        compression_type=CompressionTypes.AUTO))
    assert_that(pcoll, equal_to(lines))
    pipeline.run()
Example #16
    def test_read_file_bzip2(self):
        _, lines = write_data(10)
        filename = tempfile.NamedTemporaryFile(delete=False,
                                               prefix=tempfile.template).name
        with bz2.BZ2File(filename, 'wb') as f:
            f.write('\n'.join(lines))

        pipeline = TestPipeline()
        pcoll = pipeline | 'Read' >> beam.Read(
            LineSource(filename,
                       splittable=False,
                       compression_type=fileio.CompressionTypes.BZIP2))
        assert_that(pcoll, equal_to(lines))
        pipeline.run()
Example #17
 def test_read_pattern_bzip2(self):
   _, lines = write_data(200)
   splits = [0, 34, 100, 140, 164, 188, 200]
   chunks = [lines[splits[i-1]:splits[i]] for i in xrange(1, len(splits))]
   compressed_chunks = []
   for c in chunks:
     compressobj = bz2.BZ2Compressor()
     compressed_chunks.append(
         compressobj.compress('\n'.join(c)) + compressobj.flush())
   file_pattern = write_prepared_pattern(compressed_chunks)
   pipeline = TestPipeline()
   pcoll = pipeline | 'Read' >> beam.Read(LineSource(
       file_pattern,
       splittable=False,
       compression_type=CompressionTypes.BZIP2))
   assert_that(pcoll, equal_to(lines))
   pipeline.run()
Example #18
 def test_read_auto_pattern(self):
   _, lines = write_data(200)
   splits = [0, 34, 100, 140, 164, 188, 200]
   chunks = [lines[splits[i - 1]:splits[i]] for i in xrange(1, len(splits))]
   compressed_chunks = []
   for c in chunks:
     out = cStringIO.StringIO()
     with gzip.GzipFile(fileobj=out, mode="w") as f:
       f.write('\n'.join(c))
     compressed_chunks.append(out.getvalue())
   file_pattern = write_prepared_pattern(
       compressed_chunks, suffixes=['.gz']*len(chunks))
   pipeline = TestPipeline()
   pcoll = pipeline | 'Read' >> beam.Read(LineSource(
       file_pattern,
       compression_type=CompressionTypes.AUTO))
   assert_that(pcoll, equal_to(lines))
   pipeline.run()
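
write_data and write_prepared_pattern are helpers from the surrounding test module. A rough sketch of write_prepared_pattern under the assumption that it writes each prepared chunk to its own temporary file (with an optional suffix such as '.gz') and returns a glob matching all of them:

def write_prepared_pattern(chunks, suffixes=None):
    import os
    import tempfile
    suffixes = suffixes or [''] * len(chunks)
    tmpdir = tempfile.mkdtemp()
    for i, (chunk, suffix) in enumerate(zip(chunks, suffixes)):
        with open(os.path.join(tmpdir, 'chunk_%05d%s' % (i, suffix)), 'wb') as f:
            f.write(chunk)
    return os.path.join(tmpdir, 'chunk_*')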
Example #19
def main(src_path, dest_table, pipeline_args):
    p = apache_beam.Pipeline(argv=pipeline_args)

    value = p | 'Read JSON' >> apache_beam.Read(JsonFileSource(src_path))

    value |= (
        'Remove records that lack location or year data' >>
        apache_beam.FlatMap(discard_incomplete))

    value |= (
        'Convert string values to their types' >>
        apache_beam.Map(convert_types))

    value |= (
        'Filter bad data' >>
        apache_beam.FlatMap(filter_suspicious))

    value |= (
        'Massage fields with "rec" prefix' >>
        apache_beam.Map(massage_rec))

    value |= (
        'Dump data to BigQuery' >>
        apache_beam.Write(apache_beam.io.BigQuerySink(
            dest_table,
            schema=', '.join([
                'fall:STRING',
                'year:INTEGER',
                'nametype:STRING',
                'mass:FLOAT',
                'name:STRING',
                'class:STRING',
                'latitude:FLOAT',
                'longitude:FLOAT',
                'id:STRING',
            ]),
            create_disposition=(
                apache_beam.io.BigQueryDisposition.CREATE_IF_NEEDED),
            write_disposition=(
                apache_beam.io.BigQueryDisposition.WRITE_TRUNCATE))))

    p.run()
Example #20
def run():
    argv = [
        '--project={0}'.format(PROJECT), '--job_name=examplejob2',
        '--save_main_session',
        '--staging_location=gs://{0}/staging/'.format(BUCKET),
        '--temp_location=gs://{0}/staging/'.format(BUCKET),
        '--runner=BlockingDataflowPipelineRunner'
    ]

    p = beam.Pipeline(argv=argv)
    input = 'gs://{0}/javahelp/*.java'.format(BUCKET)
    output_prefix = 'gs://{0}/javahelp/output'.format(BUCKET)
    searchTerm = 'import'

    # find all lines that contain the searchTerm
    (p
     | 'GetJava' >> beam.Read(beam.io.TextFileSource(input))
     | 'Grep' >> beam.FlatMap(lambda line: my_grep(line, searchTerm))
     | 'write' >> beam.io.textio.WriteToText(output_prefix))

    p.run()
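
my_grep is defined elsewhere in the original script. A one-line sketch, assuming it emits any line containing the search term (the original may instead match only lines that start with it):

def my_grep(line, term):
    if term in line:
        yield line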
Example #21
def run(argv=None):
    """Run the workflow."""

    parser = argparse.ArgumentParser()
    parser.add_argument('--output')
    parser.add_argument('--ignore_corpus', default='')
    parser.add_argument('--ignore_word', default='')
    parser.add_argument('--num_groups')

    known_args, pipeline_args = parser.parse_known_args(argv)
    # We use the save_main_session option because one or more DoFns in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    p = beam.Pipeline(options=pipeline_options)

    group_ids = []
    for i in xrange(0, int(known_args.num_groups)):
        group_ids.append('id' + str(i))

    query_corpus = 'select UNIQUE(corpus) from publicdata:samples.shakespeare'
    query_word = 'select UNIQUE(word) from publicdata:samples.shakespeare'
    ignore_corpus = known_args.ignore_corpus
    ignore_word = known_args.ignore_word

    pcoll_corpus = p | 'read corpus' >> beam.io.Read(
        beam.io.BigQuerySource(query=query_corpus))
    pcoll_word = p | 'read_words' >> beam.Read(
        beam.io.BigQuerySource(query=query_word))
    pcoll_ignore_corpus = p | 'create_ignore_corpus' >> beam.Create(
        [ignore_corpus])
    pcoll_ignore_word = p | 'create_ignore_word' >> beam.Create([ignore_word])
    pcoll_group_ids = p | 'create groups' >> beam.Create(group_ids)

    pcoll_groups = create_groups(pcoll_group_ids, pcoll_corpus, pcoll_word,
                                 pcoll_ignore_corpus, pcoll_ignore_word)

    # pylint:disable=expression-not-assigned
    pcoll_groups | WriteToText(known_args.output)
    p.run()
Example #22
 def test_read_auto_pattern_compressed_and_uncompressed(self):
     _, lines = write_data(200)
     splits = [0, 34, 100, 140, 164, 188, 200]
     chunks = [
         lines[splits[i - 1]:splits[i]] for i in xrange(1, len(splits))
     ]
     chunks_to_write = []
     for i, c in enumerate(chunks):
         if i % 2 == 0:
             out = cStringIO.StringIO()
             with gzip.GzipFile(fileobj=out, mode="w") as f:
                 f.write('\n'.join(c))
             chunks_to_write.append(out.getvalue())
         else:
             chunks_to_write.append('\n'.join(c))
     file_pattern = write_prepared_pattern(chunks_to_write,
                                           suffixes=(['.gz', ''] * 3))
     pipeline = TestPipeline()
     pcoll = pipeline | 'Read' >> beam.Read(
         LineSource(file_pattern,
                    compression_type=fileio.CompressionTypes.AUTO))
     assert_that(pcoll, equal_to(lines))
     pipeline.run()
Example #23
 def _run_source_test(self, pattern, expected_data, splittable=True):
     pipeline = TestPipeline()
     pcoll = pipeline | 'Read' >> beam.Read(
         LineSource(pattern, splittable=splittable))
     assert_that(pcoll, equal_to(expected_data))
     pipeline.run()
Example #24
            'gamma': 1.2
        }

        model = xgb.train(best_params,
                          dtrain,
                          num_boost_round=1000,
                          evals=watchlist,
                          evals_result=evals_result,
                          verbose_eval=True)

        test.loc[:, "predict"] = model.predict(dtest)

        return test[["shop_id", "date", "predict",
                     "sales"]].to_dict(orient='records')

    (pipeline
     | "Query data" >> beam.Read(beam.io.BigQuerySource(query=query))
     | "Assign time" >> beam.Map(assign_timevalue)
     | "Set window" >> beam.WindowInto(window.SlidingWindows(size=3, period=1))
     | "Set group key" >> beam.Map(lambda v: ('shop_id', v))
     | beam.GroupByKey()
     | "Learn and predict" >> beam.FlatMap(learn_predict)
     | "Write data" >> beam.Write(
         beam.io.BigQuerySink(
             'dataset.table',
             schema="shop_id:STRING, date:STRING, predict:FLOAT, sales:INTEGER",
             write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED)))

    pipeline.run()
Example #25
def run(argv=None):
    """Runs the workflow."""

    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        required=True,
                        help='Input file to process.')
    parser.add_argument('--output',
                        required=True,
                        help='Output BigQuery table: PROJECT:DATASET.TABLE')
    known_args, pipeline_args = parser.parse_known_args(argv)

    schema = bigquery.TableSchema()
    schema.fields.append(field('Alexa_rank', 'integer'))
    schema.fields.append(field('Alexa_domain'))

    schema.fields.append(field('DMOZ_title'))
    schema.fields.append(field('DMOZ_description'))
    schema.fields.append(field('DMOZ_url'))
    schema.fields.append(field('DMOZ_topic', 'string', 'repeated'))

    schema.fields.append(field('Host'))
    schema.fields.append(field('FinalLocation'))
    schema.fields.append(field('HTTPOk', 'boolean'))
    schema.fields.append(field('HTTPSOk', 'boolean'))
    schema.fields.append(field('HTTPSOnly', 'boolean'))

    schema.fields.append(build_response_schema('HTTPResponses'))
    schema.fields.append(build_response_schema('HTTPSResponses'))
    schema.fields.append(field('Error'))

    options = PipelineOptions(pipeline_args)
    # We use the save_main_session option because one or more DoFns in this
    # workflow rely on global context (e.g., a module imported at module level).
    options.view_as(SetupOptions).save_main_session = True

    # https://cloud.google.com/dataflow/pipelines/specifying-exec-params
    gc_options = options.view_as(GoogleCloudOptions)
    gc_options.project = 'httparchive'
    gc_options.job_name = 'host-scan-import-' + str(datetime.date.today())
    gc_options.staging_location = 'gs://httparchive/dataflow-binaries'
    gc_options.temp_location = 'gs://httparchive/dataflow-tmp'

    wk_options = options.view_as(WorkerOptions)
    wk_options.num_workers = 10

    # options.view_as(StandardOptions).runner = 'DirectPipelineRunner'
    options.view_as(StandardOptions).runner = 'DataflowPipelineRunner'

    p = beam.Pipeline(options=options)
    (p
     | 'read' >> beam.Read(
         beam.io.TextFileSource(known_args.input, coder=JsonCoder()))
     | 'process' >> beam.FlatMap(process_record)
     # | 'local-write' >> beam.Write(beam.io.TextFileSink('./results')))
     | 'bq-write' >> beam.io.Write(
         beam.io.BigQuerySink(
             known_args.output,
             schema=schema,
             create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
             write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))
    p.run()
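
The field and build_response_schema helpers are not shown. A plausible sketch of field, assuming it wraps bigquery.TableFieldSchema with a name, a type defaulting to string, and a mode defaulting to nullable:

def field(name, field_type='string', mode='nullable'):
    f = bigquery.TableFieldSchema()
    f.name = name
    f.type = field_type
    f.mode = mode
    return f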
Example #26
        for f in [13, 14, 17]:  #crsdeptime, deptime, wheelsoff
            fields[f] = as_utc(fields[0], fields[f], dep_timezone)
        for f in [18, 20, 21]:  #wheelson, crsarrtime, arrtime
            fields[f] = as_utc(fields[0], fields[f], arr_timezone)

        yield ','.join(fields)


if __name__ == '__main__':
    pipeline = beam.Pipeline('DirectPipelineRunner')

    airports = (
        pipeline
        | 'airports:read' >> beam.Read(
            beam.io.TextFileSource('airports.csv.gz'))
        | 'airports:fields' >> beam.Map(lambda line: next(csv.reader([line])))
        | 'airports:tz' >> beam.Map(
            lambda fields: (fields[0], addtimezone(fields[21], fields[26]))))

    flights = (
        pipeline
        | 'flights:read' >> beam.Read(
            beam.io.TextFileSource('201501_part.csv'))
        | 'flights:tzcorr' >> beam.FlatMap(tz_correct,
                                           beam.pvalue.AsDict(airports)))

    flights | beam.io.textio.WriteToText('all_flights')

    pipeline.run()
Example #27
                    (SELECT __key__.id as accnt_id
                    FROM [lead-pages:leadpages.Account_cleansed] LIMIT 100)"""

options = PipelineOptions(flags=sys.argv)

# For Cloud execution, set the Cloud Platform project, job_name,
# staging location, temp_location and specify DataflowRunner.
google_cloud_options = options.view_as(GoogleCloudOptions)
google_cloud_options.project = project
google_cloud_options.job_name = 'lp-analysis'
google_cloud_options.staging_location = 'gs://lp_activity_transform/staging'
google_cloud_options.temp_location = 'gs://lp_activity_transform/temp'
options.view_as(StandardOptions).runner = 'DataflowRunner'

p = beam.Pipeline(options=options)
(p
 | 'read' >> beam.Read(beam.io.BigQuerySource(query=input_query))
 | 'cast ints' >> beam.Map(lambda row: (row['account_id'], int(row['views'])))
 | beam.CombinePerKey(sum)
 | 'format for gbq' >> beam.Map(lambda (k, v): {
     'account_id': k,
     'total_views': v
 })
 | 'save' >> beam.Write(
     beam.io.BigQuerySink(
         output_table,
         schema='account_id:INTEGER, total_views:INTEGER',
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))
p.run()
Example #28
import apache_beam as beam
project = 'teak-proton-148317'
input_table = 'clouddataflow-readonly:samples.weather_stations'
output_table = 'mydataset.weather_copy_from_dataflow1'

p = beam.Pipeline(argv=['--project', project])

read = beam.Read(beam.io.BigQuerySource(input_table))

tornadoesMonths = beam.FlatMap(lambda row: [(int(row['month']), 1)]
                               if row['tornado'] else [])

monthlyCount = beam.CombinePerKey(sum)
frmat = beam.Map(lambda (k, v): {'month': k, 'tornado_count': v})
sve = beam.Write(
    beam.io.BigQuerySink(
        output_table,
        schema='month:INTEGER, tornado_count:INTEGER',
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE))

(p | read | tornadoesMonths | monthlyCount | frmat | sve)

p.run()
Example #29
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

options = PipelineOptions()
pipeline = beam.Pipeline('DirectPipelineRunner')
airports = (pipeline
            | beam.Read(beam.io.TextFileSource('airports.csv.gz'))
            | beam.Map(lambda line: next(csv.reader([line])))
            | beam.Map(lambda fields: (fields[0], (fields[21], fields[26]))))
Example #30
 def expand(self, pvalue):
     return pvalue.pipeline | beam.Read(_TFRecordSource(*self._args))
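
For context, the expand method above is the body of a Read-style composite PTransform that wraps the custom _TFRecordSource. A minimal sketch of how such a wrapper could be declared and used (the class name here is illustrative, not from the original):

class ReadFromTFRecordLike(beam.PTransform):

    def __init__(self, *args):
        super(ReadFromTFRecordLike, self).__init__()
        self._args = args

    def expand(self, pvalue):
        return pvalue.pipeline | beam.Read(_TFRecordSource(*self._args))


# Usage:
#   records = p | 'ReadTFRecords' >> ReadFromTFRecordLike(file_pattern)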