def expand(self,
               lifts: beam.pvalue.PCollection) -> beam.pvalue.PCollection:
        """Takes top k and bottom k x values (sorted by lift) per slice and y value.

    Args:
      lifts: A PCollection of tuples of the form: (
        _SlicedFeatureKey(slice_key, x_path),
        _LiftInfo(x, y, lift, xy_count, x_count, y_count)).

    Returns:
      A PCollection resulting from a group by with the keys of the form
      (slice_key, x_path) and a stream of values of the form
      (y, y_count, [(x, lift, xy_count, x_count)]), in which the stream of values
      has been limited to the top k and bottom k elements per key.
    """
        def move_y_info_to_key(key, value):
            slice_key, x_path = key
            return (_LiftSeriesKey(slice_key=slice_key,
                                   x_path=x_path,
                                   y=value.y,
                                   y_count=value.y_count),
                    _LiftValue(x=value.x,
                               lift=value.lift,
                               xy_count=value.xy_count,
                               x_count=value.x_count))

        # Push y_* into key so that we get per-slice, per-x-path, per-y top and
        # bottom k when calling {Largest,Smallest}PerKey.
        # (_LiftSeriesKey(slice, x_path, y, y_count),
        #      _LiftValue(x, lift, xy_count, x_count))
        lifts = lifts | 'MoveYToKey' >> beam.MapTuple(move_y_info_to_key)

        top_key = operator.attrgetter('lift', 'x')
        if self._top_k_per_y:
            # (_LiftSeriesKey(slice, x_path, y, y_count),
            #      [_LiftValue(x, lift, xy_count, x_count)])
            top_k = (lifts
                     | 'TopK' >> beam.transforms.combiners.Top.PerKey(
                         n=self._top_k_per_y, key=top_key))
        if self._bottom_k_per_y:
            # (_LiftSeriesKey(slice, x_path, y, y_count),
            #      [_LiftValue(x, lift, xy_count, x_count)])
            bottom_k = (lifts
                        | 'BottomK' >> beam.transforms.combiners.Top.PerKey(
                            n=self._bottom_k_per_y, reverse=True, key=top_key))

        if self._top_k_per_y and self._bottom_k_per_y:
            # (_LiftSeriesKey(slice, x_path, y, y_count),
            #      [_LiftValue(x, lift, xy_count, x_count)])
            grouped_lifts = (
                (top_k, bottom_k)
                | 'MergeTopAndBottom' >> beam.Flatten()
                | 'FlattenTopAndBottomLifts' >>
                beam.FlatMapTuple(lambda k, vs: ((k, v) for v in vs))
                | 'ReGroupTopAndBottom' >> beam.GroupByKey())
        elif self._top_k_per_y:
            grouped_lifts = top_k
        elif self._bottom_k_per_y:
            grouped_lifts = bottom_k
        else:
            grouped_lifts = lifts | 'GroupByYs' >> beam.GroupByKey()

        def move_y_info_to_value(key, lift_values):
            return (_SlicedFeatureKey(key.slice_key, key.x_path),
                    _LiftSeries(y=key.y,
                                y_count=key.y_count,
                                lift_values=lift_values))

        # (_SlicedFeatureKey(slice, x_path),
        #      _LiftSeries(y, y_count, [_LiftValue(x, lift, xy_count, x_count)]))
        return (grouped_lifts
                | 'MoveYInfoToValue' >> beam.MapTuple(move_y_info_to_value))
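
# The named tuples referenced above are not included in this snippet; a minimal
# sketch consistent with the fields used here (a plausible reconstruction, not
# necessarily the library's exact definitions):
import collections

_SlicedFeatureKey = collections.namedtuple('_SlicedFeatureKey',
                                           ['slice_key', 'x_path'])
_LiftInfo = collections.namedtuple(
    '_LiftInfo', ['x', 'y', 'lift', 'xy_count', 'x_count', 'y_count'])
_LiftSeriesKey = collections.namedtuple(
    '_LiftSeriesKey', ['slice_key', 'x_path', 'y', 'y_count'])
_LiftValue = collections.namedtuple('_LiftValue',
                                    ['x', 'lift', 'xy_count', 'x_count'])
_LiftSeries = collections.namedtuple('_LiftSeries',
                                     ['y', 'y_count', 'lift_values'])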
Example #2
    def _load_data(self, partitions_using_temp_tables,
                   partitions_direct_to_destination, load_job_name_pcv,
                   singleton_pc):
        """Load data to BigQuery

    Data is loaded into BigQuery in the following two ways:
      1. Single partition:
         When there is a single partition of files destined to a single
         destination, a single load job is triggered.
      2. Multiple partitions and/or Dynamic Destinations:
         When there are multiple partitions of files destined for a single
         destination or when Dynamic Destinations are used, multiple load jobs
         need to be triggered for each partition/destination. Load Jobs are
         triggered to temporary tables, and those are later copied to the actual
         appropriate destination table. This ensures atomicity when only some
         of the load jobs fail but not others: if any of them fails, the copy
         jobs are not triggered.
    """
        # Load data using temp tables
        trigger_loads_outputs = (
            partitions_using_temp_tables
            | "TriggerLoadJobsWithTempTables" >> beam.ParDo(
                TriggerLoadJobs(
                    schema=self.schema,
                    write_disposition=self.write_disposition,
                    create_disposition=self.create_disposition,
                    test_client=self.test_client,
                    temporary_tables=True,
                    additional_bq_parameters=self.additional_bq_parameters,
                    source_format=self._temp_file_format), load_job_name_pcv, *
                self.schema_side_inputs).with_outputs(
                    TriggerLoadJobs.TEMP_TABLES, main='main'))

        temp_tables_load_job_ids_pc = trigger_loads_outputs['main']
        temp_tables_pc = trigger_loads_outputs[TriggerLoadJobs.TEMP_TABLES]

        destination_copy_job_ids_pc = (
            singleton_pc
            | "WaitForTempTableLoadJobs" >> beam.ParDo(
                WaitForBQJobs(self.test_client),
                beam.pvalue.AsList(temp_tables_load_job_ids_pc))
            | beam.ParDo(
                TriggerCopyJobs(create_disposition=self.create_disposition,
                                write_disposition=self.write_disposition,
                                test_client=self.test_client),
                load_job_name_pcv))

        finished_copy_jobs_pc = (
            singleton_pc
            | "WaitForCopyJobs" >> beam.ParDo(
                WaitForBQJobs(self.test_client),
                beam.pvalue.AsList(destination_copy_job_ids_pc)))

        _ = (
            finished_copy_jobs_pc
            | "RemoveTempTables/PassTables" >> beam.FlatMap(
                lambda x, deleting_tables: deleting_tables,
                pvalue.AsIter(temp_tables_pc))
            |
            "RemoveTempTables/AddUselessValue" >> beam.Map(lambda x: (x, None))
            | "RemoveTempTables/DeduplicateTables" >> beam.GroupByKey()
            | "RemoveTempTables/GetTableNames" >> beam.Map(lambda elm: elm[0])
            | "RemoveTempTables/Delete" >> beam.ParDo(
                DeleteTablesFn(self.test_client)))

        # Load data directly to destination table
        destination_load_job_ids_pc = (
            partitions_direct_to_destination
            | "TriggerLoadJobsWithoutTempTables" >> beam.ParDo(
                TriggerLoadJobs(
                    schema=self.schema,
                    write_disposition=self.write_disposition,
                    create_disposition=self.create_disposition,
                    test_client=self.test_client,
                    temporary_tables=False,
                    additional_bq_parameters=self.additional_bq_parameters,
                    source_format=self._temp_file_format), load_job_name_pcv, *
                self.schema_side_inputs))

        _ = (singleton_pc
             | "WaitForDestinationLoadJobs" >> beam.ParDo(
                 WaitForBQJobs(self.test_client),
                 beam.pvalue.AsList(destination_load_job_ids_pc)))

        destination_load_job_ids_pc = (
            (temp_tables_load_job_ids_pc, destination_load_job_ids_pc)
            | beam.Flatten())

        return destination_load_job_ids_pc, destination_copy_job_ids_pc
Example #3
def run():
    PROJECT_ID = 'electric-spark-266716'  # change to your project id
    BUCKET = 'gs://global_surface_temperatures'  # change to your bucket name
    DIR_PATH = BUCKET + '/output/' + datetime.datetime.now().strftime(
        '%Y_%m_%d_%H_%M_%S') + '/'

    # Create and set your PipelineOptions.
    options = PipelineOptions(flags=None)

    # For Dataflow execution, set the project, job_name,
    # staging location, temp_location and specify DataflowRunner.
    google_cloud_options = options.view_as(GoogleCloudOptions)
    google_cloud_options.project = PROJECT_ID
    google_cloud_options.job_name = 'city-beam-dataflow'
    google_cloud_options.staging_location = BUCKET + '/staging'
    google_cloud_options.temp_location = BUCKET + '/temp'
    options.view_as(StandardOptions).runner = 'DataflowRunner'

    # Create the Pipeline with the specified options.
    p = Pipeline(options=options)

    #create query to select all elements for cleansing
    sql = 'SELECT dt, AverageTemperature, AverageTemperatureUncertainty, City, Country, Latitude,\
     Longitude, major_city FROM kaggle_modeled.City as x'

    bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

    #read desired table from BigQuery
    query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)

    #write inputs to input.txt
    query_results | 'Write input' >> WriteToText(DIR_PATH + 'input.txt')

    # apply ParDo to filter out dates
    formatted_date_pcoll = query_results | 'Filter Dates' >> beam.ParDo(
        FilterDateFn())

    #write filtered dates to filtered.txt
    formatted_date_pcoll | 'Write filtered dates' >> WriteToText(
        DIR_PATH + 'filtered.txt')

    # group city records by (dt,city) tuple created
    grouped_city_pcoll = formatted_date_pcoll | 'Group by city, dt' >> beam.GroupByKey(
    )

    # display grouped city records
    grouped_city_pcoll | 'Write group by' >> WriteToText(DIR_PATH +
                                                         'grouped.txt')

    #remove duplicate city records
    distinct_city_pcoll = grouped_city_pcoll | 'Delete duplicate records' >> beam.ParDo(
        DedupCityRecordsFn())

    # write resulting PCollection to output.txt
    distinct_city_pcoll | 'Write output' >> WriteToText(DIR_PATH +
                                                        'output.txt')

    #create new table in BigQuery
    dataset_id = 'kaggle_modeled'
    table_id = 'City_Beam_DF'
    schema_id = ('dt:DATE,AverageTemperature:FLOAT,AverageTemperatureUncertainty:FLOAT,'
                 'City:STRING,Country:STRING,Latitude:STRING,Longitude:STRING,'
                 'major_city:INTEGER')

    # write PCollection to new BQ table
    distinct_city_pcoll | 'Write BQ table' >> beam.io.WriteToBigQuery(
        dataset=dataset_id,
        table=table_id,
        schema=schema_id,
        project=PROJECT_ID,
        create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
        write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)

    result = p.run()
    result.wait_until_finish()
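
# FilterDateFn and DedupCityRecordsFn are not shown in this snippet; a rough
# sketch of the shapes they would need, keying each row by (dt, City) for the
# GroupByKey above (the exact cleansing logic is an assumption):
import apache_beam as beam

class FilterDateFn(beam.DoFn):
    def process(self, element):
        # element is a dict row from BigQuery; drop rows without a date and key
        # the record by (dt, City) for the downstream GroupByKey.
        if element.get('dt'):
            yield ((element['dt'], element['City']), element)

class DedupCityRecordsFn(beam.DoFn):
    def process(self, element):
        # element is ((dt, City), iterable of duplicate rows); keep the first.
        _, records = element
        yield list(records)[0]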
Example #4
def Reshard(pcoll):  # pylint: disable=invalid-name
  return (
      pcoll
      | 'PairWithNone' >> beam.Map(lambda x: (None, x))
      | 'GroupByNone' >> beam.GroupByKey()
      | 'ExtractValues' >> beam.FlatMap(lambda x: x[1]))
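
# A usage sketch (the input collection here is hypothetical): Reshard returns
# the same elements, redistributed across workers via the GroupByKey above.
import apache_beam as beam

with beam.Pipeline() as p:
    numbers = p | 'CreateInput' >> beam.Create(range(100))
    resharded = Reshard(numbers)
    _ = resharded | 'PrintResharded' >> beam.Map(print)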
Example #5
def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the wordcount pipeline."""

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument('--output',
                        dest='output',
                        required=True,
                        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(
        SetupOptions).save_main_session = save_main_session

    p = beam.Pipeline(options=pipeline_options)

    # Read the text file[pattern] into a PCollection.
    lines = p | 'read' >> ReadFromText(known_args.input)

    # Count the occurrences of each word.
    def count_ones(word_ones):
        (word, ones) = word_ones
        return (word, sum(ones))

    counts = (lines
              | 'split' >>
              (beam.ParDo(WordExtractingDoFn()).with_output_types(str))
              | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
              | 'group' >> beam.GroupByKey()
              | 'count' >> beam.Map(count_ones))

    # Format the counts into a PCollection of strings.
    def format_result(word_count):
        (word, count) = word_count
        return '%s: %d' % (word, count)

    output = counts | 'format' >> beam.Map(format_result)

    # Write the output using a "Write" transform that has side effects.
    # pylint: disable=expression-not-assigned
    output | 'write' >> WriteToText(known_args.output)

    result = p.run()
    result.wait_until_finish()

    # Do not query metrics when creating a template which doesn't run
    if (not hasattr(result, 'has_job')  # direct runner
            or result.has_job):  # not just a template creation
        empty_lines_filter = MetricsFilter().with_name('empty_lines')
        query_result = result.metrics().query(empty_lines_filter)
        if query_result['counters']:
            empty_lines_counter = query_result['counters'][0]
            logging.info('number of empty lines: %d',
                         empty_lines_counter.result)

        word_lengths_filter = MetricsFilter().with_name('word_len_dist')
        query_result = result.metrics().query(word_lengths_filter)
        if query_result['distributions']:
            word_lengths_dist = query_result['distributions'][0]
            logging.info('average word length: %d',
                         word_lengths_dist.result.mean)
Example #6
    def test_wordcount(self):
        class WordExtractingDoFn(beam.DoFn):
            def process(self, element):
                text_line = element.strip()
                words = text_line.split()
                return words

        p = beam.Pipeline(runner=interactive_runner.InteractiveRunner(
            direct_runner.DirectRunner()))

        # Count the occurrences of each word.
        counts = (p
                  | beam.Create(['to be or not to be that is the question'])
                  | 'split' >> beam.ParDo(WordExtractingDoFn())
                  | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
                  | 'group' >> beam.GroupByKey()
                  | 'count' >> beam.Map(lambda wordones:
                                        (wordones[0], sum(wordones[1]))))

        # Watch the local scope for Interactive Beam so that counts will be cached.
        ib.watch(locals())

        result = p.run()
        result.wait_until_finish()

        actual = list(result.get(counts))
        self.assertSetEqual(
            set(actual),
            set([
                ('or', 1),
                ('that', 1),
                ('be', 2),
                ('is', 1),
                ('question', 1),
                ('to', 2),
                ('the', 1),
                ('not', 1),
            ]))

        # Truncate the precision to millis because the window coder uses millis
        # as units then gets upcast to micros.
        end_of_window = (GlobalWindow().max_timestamp().micros // 1000) * 1000
        df_counts = ib.collect(counts, include_window_info=True)
        df_expected = pd.DataFrame(
            {
                0: [e[0] for e in actual],
                1: [e[1] for e in actual],
                'event_time': [end_of_window for _ in actual],
                'windows': [[GlobalWindow()] for _ in actual],
                'pane_info': [
                    PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0)
                    for _ in actual
                ]
            },
            columns=[0, 1, 'event_time', 'windows', 'pane_info'])

        pd.testing.assert_frame_equal(df_expected, df_counts)

        actual_reified = result.get(counts, include_window_info=True)
        expected_reified = [
            WindowedValue(e, Timestamp(micros=end_of_window), [GlobalWindow()],
                          PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0))
            for e in actual
        ]
        self.assertEqual(actual_reified, expected_reified)
Example #7
    events = (csv_formatted_data
              | "FormatToBigquery" >> beam.Map(format_csv_data_bq)
              | "WriteAllDataToBigQuery" >> beam.io.WriteToBigQuery(
                  "samhitha-data228-project:Vaccine_Dataset.vaccine_format",
                  write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    (csv_formatted_data
     | "FormatToBigquery old" >> beam.Map(format_data_old)
     | "Write all data to BigQuery asd" >> beam.io.WriteToBigQuery(
         "samhitha-data228-project:Vaccine_Dataset.Vaccination_Data_new",
         write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))

    grouped_by_country = (
        csv_formatted_data
        | "CollectingCountryKey" >> beam.ParDo(CollectLocationKey())
        | "GroupingByCountry" >> beam.GroupByKey()
        #|beam.Map(print)
    )

    # Which country is vaccinating more people?

    Total_vaccinations = (
        grouped_by_country
        | "ExtractTotalVaccinations" >> beam.Map(lambda x: {
            'country': x[0],
            'total_vaccines': sum(x[1])
        }))

    Top_10_country_vaccinations = (
        Total_vaccinations
        | 'AddKey' >> beam.Map(addKey)
Example #8
# setting input and output files
input_filename = "./data/sp500.csv"
output_filename = "./output/result.txt"

# instantiate the pipeline
options = PipelineOptions()

with beam.Pipeline(options=options) as p:
    # reading the csv and splitting lines by elements we want to retain
    csv_lines = (p | beam.io.ReadFromText(input_filename, skip_header_lines=1)
                 | beam.ParDo(Split()))

    # calculate the mean for Open values
    mean_open = (csv_lines | beam.ParDo(CollectOpen())
                 | "Grouping keys Open" >> beam.GroupByKey()
                 | "Calculating mean for Open" >> beam.CombineValues(
                     beam.combiners.MeanCombineFn()))

    # calculate the mean for Close values
    mean_close = (csv_lines | beam.ParDo(CollectClose())
                  | "Grouping keys Close" >> beam.GroupByKey()
                  | "Calculating mean for Close" >> beam.CombineValues(
                      beam.combiners.MeanCombineFn()))

    # writing results to file
    output = ({
        'Mean Open': mean_open,
        'Mean Close': mean_close
    } | beam.CoGroupByKey() | beam.io.WriteToText(output_filename))
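
# The Split, CollectOpen and CollectClose DoFns used above are not part of this
# snippet; a minimal sketch of what they might look like, assuming the CSV
# columns start with Date,Open,High,Low,Close in that order:
import apache_beam as beam

class Split(beam.DoFn):
    def process(self, element):
        # Parse one CSV line into just the fields this pipeline needs.
        date, open_price, high, low, close = element.split(',')[:5]
        return [{'date': date, 'open': float(open_price), 'close': float(close)}]

class CollectOpen(beam.DoFn):
    def process(self, element):
        # Emit (constant key, open value) so a single global mean can be computed.
        return [(1, element['open'])]

class CollectClose(beam.DoFn):
    def process(self, element):
        # Emit (constant key, close value) so a single global mean can be computed.
        return [(1, element['close'])]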
Example #9
    def test_multiple_outputs_with_watermark_advancement(self):
        """Tests that the TestStream can independently control output watermarks."""

        # Purposely set the watermark of numbers to 20 then letters to 5 to test
        # that the watermark advancement is per PCollection.
        #
        # This creates two PCollections, (a, b, c) and (1, 2, 3). These will be
        # emitted at different times so that they will have different windows. The
        # watermark advancement is checked by checking their windows. If the
        # watermark does not advance, then the windows will be [-inf, -inf). If the
        # windows do not advance separately, then the PCollections will both be
        # windowed in [15, 30).
        letters_elements = [
            TimestampedValue('a', 6),
            TimestampedValue('b', 7),
            TimestampedValue('c', 8),
        ]
        numbers_elements = [
            TimestampedValue('1', 21),
            TimestampedValue('2', 22),
            TimestampedValue('3', 23),
        ]
        test_stream = (TestStream()
                       .advance_watermark_to(0, tag='letters')
                       .advance_watermark_to(0, tag='numbers')
                       .advance_watermark_to(20, tag='numbers')
                       .advance_watermark_to(5, tag='letters')
                       .add_elements(letters_elements, tag='letters')
                       .advance_watermark_to(10, tag='letters')
                       .add_elements(numbers_elements, tag='numbers')
                       .advance_watermark_to(30, tag='numbers')) # yapf: disable

        options = StandardOptions(streaming=True)
        options.view_as(DebugOptions).add_experiment(
            'passthrough_pcollection_output_ids')
        p = TestPipeline(options=options)

        main = p | test_stream

        # Use an AfterWatermark trigger with an early firing to test that the
        # watermark is advancing properly and that the element is being emitted in
        # the correct window.
        letters = (
            main['letters']
            | 'letter windows' >> beam.WindowInto(
                FixedWindows(15),
                trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
                accumulation_mode=trigger.AccumulationMode.DISCARDING)
            | 'letter with key' >> beam.Map(lambda x: ('k', x))
            | 'letter gbk' >> beam.GroupByKey())

        numbers = (
            main['numbers']
            | 'number windows' >> beam.WindowInto(
                FixedWindows(15),
                trigger=trigger.AfterWatermark(early=trigger.AfterCount(1)),
                accumulation_mode=trigger.AccumulationMode.DISCARDING)
            | 'number with key' >> beam.Map(lambda x: ('k', x))
            | 'number gbk' >> beam.GroupByKey())

        # The letters were emitted when the watermark was at 5, thus we expect to
        # see the elements in the [0, 15) window. We used an early trigger to make
        # sure that the ON_TIME empty pane was also emitted with a TestStream.
        # This pane has no data because the early trigger causes the elements to
        # fire before the end of the window and because the accumulation mode
        # discards any data after the trigger fired.
        expected_letters = {
            window.IntervalWindow(0, 15): [
                ('k', ['a', 'b', 'c']),
                ('k', []),
            ],
        }

        # Same here, except the numbers were emitted at watermark = 20, thus they
        # are in the [15, 30) window.
        expected_numbers = {
            window.IntervalWindow(15, 30): [
                ('k', ['1', '2', '3']),
                ('k', []),
            ],
        }
        assert_that(letters,
                    equal_to_per_window(expected_letters),
                    label='letters assert per window')
        assert_that(numbers,
                    equal_to_per_window(expected_numbers),
                    label='numbers assert per window')

        p.run()
Example #10
def pipeline(root):
  """Beam pipeline.

  Args:
    root: the root of the pipeline.
  """
  stage1_matched_conformers = dat_input_and_parsing_pipeline(root, 'stage1')
  stage2_matched_conformers = dat_input_and_parsing_pipeline(root, 'stage2')

  # Create a collection of conformers with duplicate information
  equivalent_files = gfile.glob(FLAGS.input_equivalent_glob)
  equivalent_conformers = (
      root
      | 'CreateEquivInputs' >> beam.Create(equivalent_files)
      | 'ParseEquiv' >> beam.FlatMap(parse_equivalent_file))

  # Merge by bond_topology_id
  merged_results = (
      (stage1_matched_conformers, stage2_matched_conformers,
       equivalent_conformers)
      | 'FlattenAllConformers' >> beam.Flatten()
      | 'GroupByCID' >> beam.GroupBy(lambda c: c.conformer_id)
      | 'MergeConformers' >> beam.ParDo(MergeConformersFn()).with_outputs(
          MergeConformersFn.OUTPUT_TAG_MERGE_CONFLICT, main='conformers'))
  merged_conformers = merged_results['conformers']

  # Write out the merge conflicts
  _ = (
      merged_results[MergeConformersFn.OUTPUT_TAG_MERGE_CONFLICT]
      | 'ConflictsCSVFormat' >> beam.Map(csv_format)
      | 'ConflictsReshuffle' >> beam.Reshuffle()
      | 'WriteConflictsCSV' >> beam.io.WriteToText(
          FLAGS.output_stem + '_conflicts',
          header=csv_format(smu_utils_lib.MERGE_CONFLICT_FIELDS),
          num_shards=1,
          file_name_suffix='.csv'))

  cleaned_conformers = (
      merged_conformers
      | 'CleanUpConformers' >> beam.Map(clean_up_conformer))

  # Get the bond length distributions
  bond_length_dists_pcoll = (
      cleaned_conformers
      | 'ExtractBondLengths' >> beam.FlatMap(
          extract_bond_lengths,
          dist_sig_digits=_BOND_LENGTHS_SIG_DIGITS,
          unbonded_max=_BOND_LENGTHS_UNBONDED_MAX)
      | 'CountBondLengths' >> beam.combiners.Count.PerElement()
      | 'ToListBondLengths' >> beam.combiners.ToList())
  _ = (
      bond_length_dists_pcoll
      | 'WriteBondLengths' >> beam.ParDo(
          write_bond_lengths, filename=f'{FLAGS.output_stem}_bond_lengths.csv'))

  # Get the SMILES to id mapping needed for UpdateConformerFn
  smiles_id_pcoll = (
      root
      | 'BTInputForSmiles' >> beam.Create([FLAGS.input_bond_topology_csv])
      | 'GenerateSmilesToID' >> beam.FlatMap(smiles_to_id))
  smiles_id_dict = beam.pvalue.AsDict(smiles_id_pcoll)

  # Various per conformer processing
  update_results = (
      cleaned_conformers
      | 'UpdateConformers' >> beam.ParDo(
          UpdateConformerFn(), beam.pvalue.AsSingleton(bond_length_dists_pcoll),
          smiles_id_dict).with_outputs(
              UpdateConformerFn.OUTPUT_TAG_SMILES_MISMATCH, main='conformers'))
  updated_conformers = update_results['conformers']

  # Output SMILES mismatches
  _ = (
      update_results[UpdateConformerFn.OUTPUT_TAG_SMILES_MISMATCH]
      | 'ReshuffleSmilesOutput' >> beam.Reshuffle()
      | 'SmilesCSVFormat' >> beam.Map(csv_format)
      | 'WriteSmilesCSV' >> beam.io.WriteToText(
          FLAGS.output_stem + '_smiles_compare',
          header='conformer_id,compare,smiles_given,smiles_with_h,smiles_without_h',
          num_shards=1,
          file_name_suffix='.csv'))

  # Process duplicate information
  final_conformers = (
      updated_conformers
      | 'KeyedForDuplicates' >>
      beam.FlatMap(generate_keyed_conformers_for_duplicates)
      | 'DupGroupByKey' >> beam.GroupByKey()
      | 'MergeDupInfo' >> beam.MapTuple(merge_duplicate_information))

  # Pull stats of various sorts and write them to a file
  _ = (
      final_conformers
      | 'ExtractStats' >> beam.FlatMap(conformer_to_stat_values)
      | 'CountStats' >> beam.combiners.Count.PerElement()
      | 'StatsCSVFormat' >> beam.MapTuple(lambda x, c: f'{x[0]},{x[1]},{c}')
      | 'WriteStatsCSV' >> beam.io.WriteToText(
          FLAGS.output_stem + '_stats',
          header='primary_key,secondary_key,count',
          num_shards=1,
          file_name_suffix='.csv'))

  # Generate the summary by bond topology.
  bare_bt_summaries = (
      root
      | 'BondTopologyInput' >> beam.Create([FLAGS.input_bond_topology_csv])
      | 'GenerateBareBTSummaries' >>
      beam.FlatMap(bond_topology_summaries_from_csv))
  real_bt_summaries = (
      final_conformers
      | 'GenerateBTSummaries' >> beam.FlatMap(to_keyed_bond_topology_summary))
  _ = ((bare_bt_summaries, real_bt_summaries)
       | 'FlattenAllBTSummaries' >> beam.Flatten()
       | 'FinishBTSummary' >> CombineAndWriteBondTopologySummary())

  # Make the filtered versions of the dataset
  complete_conformers = (
      final_conformers
      | 'MakeComplete' >> beam.Map(make_complete_conformer))

  standard_conformers = (
      final_conformers
      | 'MakeStandard' >> beam.FlatMap(make_standard_conformer))

  # Write the complete and standard conformers as binary protobuf in TFRecord.
  for id_str, collection in [['complete', complete_conformers],
                             ['standard', standard_conformers]]:
    _ = (
        collection
        | ('TFRecordReshuffle_' + id_str) >> beam.Reshuffle()
        | ('WriteTFRecord_' + id_str) >> beam.io.tfrecordio.WriteToTFRecord(
            f'{FLAGS.output_stem}_{id_str}_tfrecord',
            coder=beam.coders.ProtoCoder(dataset_pb2.Conformer),
            num_shards=FLAGS.output_shards))


  # Write the complete and standard conformers as JSON.
  # Bit of a hack here: the slowest part of the whole pipeline is writing out
  # the JSON for the complete conformers. So we just hard code a tripling of the
  # shards to get more parallelism.
  for id_str, collection, num_shards in [[
      'complete', complete_conformers, FLAGS.output_shards * 3
  ], ['standard', standard_conformers, FLAGS.output_shards]]:
    _ = (
        collection
        | ('JSONReshuffle_' + id_str) >> beam.Reshuffle()
        | ('ToJSON_' + id_str) >> beam.Map(conformer_to_json)
        | ('WriteJSON_' + id_str) >> beam.io.WriteToText(
            f'{FLAGS.output_stem}_{id_str}_json',
            compression_type='gzip',
            num_shards=num_shards,
            file_name_suffix='.json.gz'))
Example #11
def Shuffle(examples):  # pylint: disable=invalid-name
  return (examples
          | 'PairWithRandom' >> beam.Map(lambda x: (random.random(), x))
          | 'GroupByRandom' >> beam.GroupByKey()
          | 'DropRandom' >> beam.FlatMap(lambda kv: kv[1]))
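
# A usage sketch (the input elements are hypothetical). Beam also ships a
# built-in beam.Reshuffle() transform that redistributes elements without
# keying them by an explicit random number.
import apache_beam as beam

with beam.Pipeline() as p:
    examples = p | 'CreateExamples' >> beam.Create(['ex-%d' % i for i in range(10)])
    shuffled = Shuffle(examples)
    _ = shuffled | 'PrintShuffled' >> beam.Map(print)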
Example #12
  def test_progress_metrics(self):
    p = self.create_pipeline()
    if not isinstance(p.runner, fn_api_runner.FnApiRunner):
      # This test is inherited by others that may not support the same
      # internal way of accessing progress metrics.
      self.skipTest('Progress metrics not supported.')
      return

    _ = (p
         | beam.Create([0, 0, 0, 5e-3 * DEFAULT_SAMPLING_PERIOD_MS])
         | beam.Map(time.sleep)
         | beam.Map(lambda x: ('key', x))
         | beam.GroupByKey()
         | 'm_out' >> beam.FlatMap(lambda x: [
             1, 2, 3, 4, 5,
             beam.pvalue.TaggedOutput('once', x),
             beam.pvalue.TaggedOutput('twice', x),
             beam.pvalue.TaggedOutput('twice', x)]))

    res = p.run()
    res.wait_until_finish()

    def has_mi_for_ptransform(monitoring_infos, ptransform):
      for mi in monitoring_infos:
        if ptransform in mi.labels['PTRANSFORM']:
          return True
      return False

    try:
      # TODO(ajamato): Delete this block after deleting the legacy metrics code.
      # Test the DEPRECATED legacy metrics
      pregbk_metrics, postgbk_metrics = list(
          res._metrics_by_stage.values())
      if 'Create/Read' not in pregbk_metrics.ptransforms:
        # The metrics above are actually unordered. Swap.
        pregbk_metrics, postgbk_metrics = postgbk_metrics, pregbk_metrics
      self.assertEqual(
          4,
          pregbk_metrics.ptransforms['Create/Read']
          .processed_elements.measured.output_element_counts['out'])
      self.assertEqual(
          4,
          pregbk_metrics.ptransforms['Map(sleep)']
          .processed_elements.measured.output_element_counts['None'])
      self.assertLessEqual(
          4e-3 * DEFAULT_SAMPLING_PERIOD_MS,
          pregbk_metrics.ptransforms['Map(sleep)']
          .processed_elements.measured.total_time_spent)
      self.assertEqual(
          1,
          postgbk_metrics.ptransforms['GroupByKey/Read']
          .processed_elements.measured.output_element_counts['None'])

      # The actual stage name ends up being something like 'm_out/lambda...'
      m_out, = [
          metrics for name, metrics in list(postgbk_metrics.ptransforms.items())
          if name.startswith('m_out')]
      self.assertEqual(
          5,
          m_out.processed_elements.measured.output_element_counts['None'])
      self.assertEqual(
          1,
          m_out.processed_elements.measured.output_element_counts['once'])
      self.assertEqual(
          2,
          m_out.processed_elements.measured.output_element_counts['twice'])

      # Test the new MonitoringInfo monitoring format.
      self.assertEqual(2, len(res._monitoring_infos_by_stage))
      pregbk_mis, postgbk_mis = list(res._monitoring_infos_by_stage.values())

      if not has_mi_for_ptransform(pregbk_mis, 'Create/Read'):
        # The monitoring infos above are actually unordered. Swap.
        pregbk_mis, postgbk_mis = postgbk_mis, pregbk_mis

      def assert_has_monitoring_info(
          monitoring_infos, urn, labels, value=None, ge_value=None):
        def contains_labels(monitoring_info, labels):
          return len([x for x in labels.items() if
                      x[0] in monitoring_info.labels and monitoring_info.labels[
                          x[0]] == x[1]]) == len(labels)

        # TODO(ajamato): Consider adding a matcher framework
        found = 0
        for mi in monitoring_infos:
          if contains_labels(mi, labels) and mi.urn == urn:
            if (ge_value is not None and
                mi.metric.counter_data.int64_value >= ge_value):
              found = found + 1
            elif (value is not None and
                  mi.metric.counter_data.int64_value == value):
              found = found + 1
        ge_value_str = {'ge_value' : ge_value} if ge_value else ''
        value_str = {'value' : value} if value else ''
        self.assertEqual(
            1, found, "Found (%s) Expected only 1 monitoring_info for %s." %
            (found, (urn, labels, value_str, ge_value_str),))

      # pregbk monitoring infos
      labels = {'PCOLLECTION' : 'ref_PCollection_PCollection_1'}
      assert_has_monitoring_info(
          pregbk_mis, monitoring_infos.ELEMENT_COUNT_URN, labels, value=4)
      labels = {'PCOLLECTION' : 'ref_PCollection_PCollection_2'}
      assert_has_monitoring_info(
          pregbk_mis, monitoring_infos.ELEMENT_COUNT_URN, labels, value=4)
      labels = {'PTRANSFORM' : 'Map(sleep)'}
      assert_has_monitoring_info(
          pregbk_mis, monitoring_infos.TOTAL_MSECS_URN,
          labels, ge_value=4 * DEFAULT_SAMPLING_PERIOD_MS)

      # postgbk monitoring infos
      labels = {'PCOLLECTION' : 'ref_PCollection_PCollection_6'}
      assert_has_monitoring_info(
          postgbk_mis, monitoring_infos.ELEMENT_COUNT_URN, labels, value=1)
      labels = {'PCOLLECTION' : 'ref_PCollection_PCollection_7'}
      assert_has_monitoring_info(
          postgbk_mis, monitoring_infos.ELEMENT_COUNT_URN, labels, value=5)
    except:
      print(res._monitoring_infos_by_stage)
      raise
Example #13
def run_pipeline(root, input_note_events, input_ratings, output_path, vocab,
                 section_markers, cur_augmentation_config):
    """Create beam pipeline to generate TF examples.

  Args:
    root: beam.Pipeline root.
    input_note_events: Path to csv of notes.
    input_ratings: Path to csv of ratings.
    output_path: Directory path to write output to.
    vocab: List of tokens in the vocabulary.
    section_markers: Dict of markers as accepted by note sectioning.
    cur_augmentation_config: AugmentationConfig dataclass instance, defines the
      kinds of augmentations to apply.
  """

    # Load and process ratings:
    raw_ratings = data_lib.read_raw_ratings(root, input_ratings)

    ratings = (raw_ratings
               | "GetLabels" >> beam.Map(data_lib.convert_ratings)
               | "GroupRatingsByNoteId" >> beam.GroupByKey()
               | "UnpackRatings" >> beam.Map(lambda x: (x[0], list(x[1]))))

    # Load and process notes:
    notes = data_lib.read_filter_notes(root, input_note_events)

    note_partitions = (
        raw_ratings
        | "PartitionMap" >>
        (beam.Map(lambda x: (str(x.note_id), x.partition))).with_output_types(
            Tuple[str, str])
        | "DedupPartitionMap" >> beam.Distinct())

    # Join.
    non_rated_notes, rated_notes = (
        ({
            "ratings": ratings,
            "notes": notes,
            "note_partition": note_partitions
        })
        | "Join" >> beam.CoGroupByKey().with_output_types(
            Tuple[str, Dict[str, Any]])
        | "SplitRated" >> beam.Partition(
            lambda x, n_part: int(bool(x[1]["ratings"])), 2))

    # Downsample non-rated.
    non_rated_notes = data_lib.downsample(non_rated_notes, _N_DOWNSAMPLE.value,
                                          _RANDOM_SEED.value)

    # Process notes.
    features_and_labels = (
        (non_rated_notes, rated_notes)
        | beam.Flatten()
        | "ReshuffleJoin" >> beam.Reshuffle()
        | "ProcessAPData" >> beam.ParDo(data_lib.ProcessAPData(),
                                        section_markers)
        | "FilterAPData" >> beam.Filter(data_lib.filter_by_labels)
        | "ReshuffleForSubjectId" >> beam.Reshuffle()
        | "RekeyBySubjectId" >> beam.Map(lambda x: (x[1].subject_id, x[1]))
        | "GroupBySubjectId" >> beam.GroupByKey()
        | "OneNoteIdPerRatedSubjectId" >> beam.ParDo(
            data_lib.OneNoteIdPerRatedSubjectId(), seed=_RANDOM_SEED.value)
        | "RekeyByNoteId" >> beam.Map(lambda x: (x.note_id, x))
        | "ApplyAugmentations" >> beam.ParDo(data_lib.ApplyAugmentations(),
                                             cur_augmentation_config,
                                             _RANDOM_SEED.value)
        | "GetFeaturesAndLabels" >> beam.ParDo(
            data_lib.ProcessFeaturesAndLabels(vocab, _MAX_SEQ_LENGTH.value))
        | "ReshuffleFeaturesAndLabels" >> beam.Reshuffle())

    # Convert and save tf examples:
    data_lib.convert_and_save_tf_examples(features_and_labels, output_path,
                                          _DEBUG_OUTPUT.value)
def run(argv=None, save_main_session=True):
  """Main entry point to pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument('--corpus_home',
                      dest='corpus_home',
                      help='The directory or bucket of the corpus home')
  parser.add_argument('--input',
                      dest='input',
                      help='A single input file')
  parser.add_argument('--corpus_prefix',
                      dest='corpus_prefix',
                      help='Prefix after corpus home where the files are')
  parser.add_argument('--ignorelines',
                      dest='ignorelines',
                      help='Ignore lines containing these words')
  parser.add_argument('--output',
                      dest='output',
                      required=True,
                      help='Output file')
  known_args, pipeline_args = parser.parse_known_args(argv)
  pipeline_options = PipelineOptions(pipeline_args)
  pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
  p = beam.Pipeline(options=pipeline_options)
  ignorepatterns = []
  if known_args.ignorelines:
    ignorepatterns = load_ignore(known_args.ignorelines)
  if known_args.corpus_home:
    logging.info('corpus_home: %s', known_args.corpus_home)
    corpus_data_dir = '{}/data/corpus'.format(known_args.corpus_home)
    corpus_index = '{}/collections.csv'.format(corpus_data_dir)
    corpus_dir = known_args.corpus_home
    if known_args.corpus_prefix:
      corpus_dir = '{}/{}'.format(known_args.corpus_home,
                                  known_args.corpus_prefix)
    lines = (p | 'read_top_index' >> ReadFromText(corpus_index)
              | 'split_top_index' >> beam.ParDo(ExtractIndexEntry())
              | 'add_prefix_corpus_data' >> beam.FlatMap(add_prefix,
                                                         corpus_data_dir)
              | 'read_secondary_index' >> ReadAllFromText()
              | 'split_secondary_index' >> beam.ParDo(ExtractIndexEntry())
              | 'add_prefix_corpus_dir' >> beam.FlatMap(add_prefix, corpus_dir)
              | 'read_files' >> ReadAllFromText())
  else:
    lines = p | 'read' >> ReadFromText(known_args.input)

  # Count the occurrences of each character.
  def count_ones(char_ones):
    (c, ones) = char_ones
    return (c, sum(ones))

  # Ignore counts for lines that are boilerplate (copyright notices, etc)
  re_patterns = []
  for val in ignorepatterns:
    pat = '.*{}.*'.format(val)
    re_patterns.append(re.compile(pat, re.IGNORECASE))

  def not_boilerplate(line):
    """true if the line does not match a boilerplate pattern """
    for re_pattern in re_patterns:
      if re_pattern.match(line) != None:
        return False
    return True

  counts = (lines
            | 'filter' >> beam.Filter(not_boilerplate)
            | 'split' >> (beam.ParDo(CharBigramExtractingDoFn())
                          .with_output_types(str))
            | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
            | 'group' >> beam.GroupByKey()
            | 'count' >> beam.Map(count_ones))

  # Format the result
  def format_result(char_bigram_count):
    (char_bigram, count) = char_bigram_count
    return '%s\t%d' % (char_bigram, count)

  output = counts | 'format' >> beam.Map(format_result)

  output | 'write' >> WriteToText(known_args.output)
  result = p.run()
  result.wait_until_finish()
  if (not hasattr(result, 'has_job') or result.has_job):
    char_bigram_filter = MetricsFilter().with_name('char_bigrams')
    query_result = result.metrics().query(char_bigram_filter)
    if query_result['counters']:
      char_bigram_counter = query_result['counters'][0]
      logging.info('Total char bigrams: %d', char_bigram_counter.result)
Example #15
def ComputeWithConfidenceIntervals(  # pylint: disable=invalid-name
        sliced_extracts: beam.pvalue.PCollection,
        compute_per_slice_metrics_cls: Type[beam.PTransform],
        num_bootstrap_samples: Optional[int] = DEFAULT_NUM_BOOTSTRAP_SAMPLES,
        random_seed_for_testing: Optional[int] = None,
        **kwargs) -> beam.pvalue.PCollection:
    """PTransform for computing metrics using T-Distribution values.

  Args:
    sliced_extracts: Incoming PCollection consisting of slice key and extracts.
    compute_per_slice_metrics_cls: PTransform class that takes a PCollection of
      (slice key, extracts) as input and returns (slice key, dict of metrics) as
      output. The class will be instantiated multiple times to compute metrics
      both with and without sampling. The class will be initialized using kwargs
      'compute_with_sampling' and 'random_seed_for_testing' along with any
      kwargs passed in **kwargs.
    num_bootstrap_samples: Number of replicas to use in calculating uncertainty
      using bootstrapping. If 1 is provided (default), aggregate metrics will be
    calculated with no uncertainty. If num_bootstrap_samples is > 1, multiple
      samples of each slice will be calculated using the Poisson bootstrap
      method. To calculate standard errors, num_bootstrap_samples should be 20
      or more in order to provide useful data. More is better, but you pay a
      performance cost.
    random_seed_for_testing: Seed to use for unit testing, because
      nondeterministic tests stink. Each partition will use this value + i.
    **kwargs: Additional args to pass to compute_per_slice_metrics_cls init.

  Returns:
    PCollection of (slice key, dict of metrics)
  """
    if not num_bootstrap_samples:
        num_bootstrap_samples = 1
    # TODO(ckuhn): Cap the number of bootstrap samples at 20.
    if num_bootstrap_samples < 1:
        raise ValueError('num_bootstrap_samples should be > 0, got %d' %
                         num_bootstrap_samples)

    output_results = (
        sliced_extracts
        | 'ComputeUnsampledMetrics' >> compute_per_slice_metrics_cls(
            compute_with_sampling=False,
            random_seed_for_testing=None,
            **kwargs))

    if num_bootstrap_samples > 1:
        multicombine = []
        for i in range(num_bootstrap_samples):
            seed = (None if random_seed_for_testing is None else
                    random_seed_for_testing + i)
            multicombine.append(
                sliced_extracts
                | 'ComputeSampledMetrics%d' % i >>
                compute_per_slice_metrics_cls(compute_with_sampling=True,
                                              random_seed_for_testing=seed,
                                              **kwargs))
        output_results = (
            multicombine
            | 'FlattenBootstrapPartitions' >> beam.Flatten()
            | 'GroupBySlice' >> beam.GroupByKey()
            | 'MergeBootstrap' >> beam.ParDo(
                _MergeBootstrap(), beam.pvalue.AsDict(output_results)))
    return output_results
Example #16
    def test_windowing(self):
        test_stream = (TestStream()
                       .advance_watermark_to(0)
                       .add_elements(['a', 'b', 'c'])
                       .advance_processing_time(1)
                       .advance_processing_time(1)
                       .advance_processing_time(1)
                       .advance_processing_time(1)
                       .advance_processing_time(1)
                       .advance_watermark_to(5)
                       .add_elements(['1', '2', '3'])
                       .advance_processing_time(1)
                       .advance_watermark_to(6)
                       .advance_processing_time(1)
                       .advance_watermark_to(7)
                       .advance_processing_time(1)
                       .advance_watermark_to(8)
                       .advance_processing_time(1)
                       .advance_watermark_to(9)
                       .advance_processing_time(1)
                       .advance_watermark_to(10)
                       .advance_processing_time(1)
                       .advance_watermark_to(11)
                       .advance_processing_time(1)
                       .advance_watermark_to(12)
                       .advance_processing_time(1)
                       .advance_watermark_to(13)
                       .advance_processing_time(1)
                       .advance_watermark_to(14)
                       .advance_processing_time(1)
                       .advance_watermark_to(15)
                       .advance_processing_time(1)
                       )  # yapf: disable

        options = StandardOptions(streaming=True)
        p = TestPipeline(options=options)

        records = (p
                   | test_stream
                   | 'letter windows' >> beam.WindowInto(
                       FixedWindows(5),
                       accumulation_mode=trigger.AccumulationMode.DISCARDING)
                   | 'letter with key' >> beam.Map(lambda x: ('k', x))
                   | 'letter gbk' >> beam.GroupByKey()
                   | ReverseTestStream(sample_resolution_sec=1,
                                       output_tag=None))

        assert_that(
            records,
            equal_to_per_window({
                beam.window.GlobalWindow(): [
                    [ProcessingTimeEvent(5),
                     WatermarkEvent(4999998)],
                    [
                        ElementEvent([
                            TimestampedValue(('k', ['a', 'b', 'c']), 4.999999)
                        ])
                    ],
                    [ProcessingTimeEvent(1),
                     WatermarkEvent(5000000)],
                    [ProcessingTimeEvent(1),
                     WatermarkEvent(6000000)],
                    [ProcessingTimeEvent(1),
                     WatermarkEvent(7000000)],
                    [ProcessingTimeEvent(1),
                     WatermarkEvent(8000000)],
                    [ProcessingTimeEvent(1),
                     WatermarkEvent(9000000)],
                    [
                        ElementEvent([
                            TimestampedValue(('k', ['1', '2', '3']), 9.999999)
                        ])
                    ],
                    [ProcessingTimeEvent(1),
                     WatermarkEvent(10000000)],
                    [ProcessingTimeEvent(1),
                     WatermarkEvent(11000000)],
                    [ProcessingTimeEvent(1),
                     WatermarkEvent(12000000)],
                    [ProcessingTimeEvent(1),
                     WatermarkEvent(13000000)],
                    [ProcessingTimeEvent(1),
                     WatermarkEvent(14000000)],
                    [ProcessingTimeEvent(1),
                     WatermarkEvent(15000000)],
                ],
            }))

        p.run()
Example #17
    def test_streaming_wordcount(self):
        class WordExtractingDoFn(beam.DoFn):
            def process(self, element):
                text_line = element.strip()
                words = text_line.split()
                return words

        # Add the TestStream so that it can be cached.
        ib.options.capturable_sources.add(TestStream)
        ib.options.capture_duration = timedelta(seconds=1)

        p = beam.Pipeline(runner=interactive_runner.InteractiveRunner(),
                          options=StandardOptions(streaming=True))

        data = (
            p
            | TestStream()
                .advance_watermark_to(0)
                .advance_processing_time(1)
                .add_elements(['to', 'be', 'or', 'not', 'to', 'be'])
                .advance_watermark_to(20)
                .advance_processing_time(1)
                .add_elements(['that', 'is', 'the', 'question'])
            | beam.WindowInto(beam.window.FixedWindows(10))) # yapf: disable

        counts = (data
                  | 'split' >> beam.ParDo(WordExtractingDoFn())
                  | 'pair_with_one' >> beam.Map(lambda x: (x, 1))
                  | 'group' >> beam.GroupByKey()
                  | 'count' >> beam.Map(lambda wordones:
                                        (wordones[0], sum(wordones[1]))))

        # Watch the local scope for Interactive Beam so that referenced PCollections
        # will be cached.
        ib.watch(locals())

        # This is normally done in the interactive_utils when a transform is
        # applied but needs an IPython environment. So we manually run this here.
        ie.current_env().track_user_pipelines()

        # This tests that the data was correctly cached.
        pane_info = PaneInfo(True, True, PaneInfoTiming.UNKNOWN, 0, 0)
        expected_data_df = pd.DataFrame([
            ('to', 0, [IntervalWindow(0, 10)], pane_info),
            ('be', 0, [IntervalWindow(0, 10)], pane_info),
            ('or', 0, [IntervalWindow(0, 10)], pane_info),
            ('not', 0, [IntervalWindow(0, 10)], pane_info),
            ('to', 0, [IntervalWindow(0, 10)], pane_info),
            ('be', 0, [IntervalWindow(0, 10)], pane_info),
            ('that', 20000000, [IntervalWindow(20, 30)], pane_info),
            ('is', 20000000, [IntervalWindow(20, 30)], pane_info),
            ('the', 20000000, [IntervalWindow(20, 30)], pane_info),
            ('question', 20000000, [IntervalWindow(20, 30)], pane_info)
        ], columns=[0, 'event_time', 'windows', 'pane_info']) # yapf: disable

        data_df = ib.collect(data, include_window_info=True)
        pd.testing.assert_frame_equal(expected_data_df, data_df)

        # This tests that the windowing was passed correctly so that all the data
        # is aggregated also correctly.
        pane_info = PaneInfo(True, False, PaneInfoTiming.ON_TIME, 0, 0)
        expected_counts_df = pd.DataFrame([
            ('be', 2, 9999999, [IntervalWindow(0, 10)], pane_info),
            ('not', 1, 9999999, [IntervalWindow(0, 10)], pane_info),
            ('or', 1, 9999999, [IntervalWindow(0, 10)], pane_info),
            ('to', 2, 9999999, [IntervalWindow(0, 10)], pane_info),
            ('is', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
            ('question', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
            ('that', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
            ('the', 1, 29999999, [IntervalWindow(20, 30)], pane_info),
        ], columns=[0, 1, 'event_time', 'windows', 'pane_info']) # yapf: disable

        counts_df = ib.collect(counts, include_window_info=True)

        # The group by key has no guarantee of order. So we post-process the DF by
        # sorting so we can test equality.
        sorted_counts_df = (counts_df
                            .sort_values(['event_time', 0], ascending=True)
                            .reset_index(drop=True)) # yapf: disable
        pd.testing.assert_frame_equal(expected_counts_df, sorted_counts_df)
Example #18
class Order(object):
    def __init__(self, date, time, transaction, item):
        self.date = date
        self.time = time
        self.transaction = transaction
        self.item = item


class parse_item(beam.DoFn):
    def process(self, element):
        if element:
            date, time, transaction, item = element.split(',')
            return [Order(date, time, transaction, item)]

class GetTotal(beam.DoFn):
    def process(self, element):
        # get the total transactions for one item
        return [(str(element[0]), sum(element[1]))]
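
# The pipeline object p used below is not created in this excerpt; a minimal
# setup sketch (the runner and options are assumptions):
import apache_beam as beam
from apache_beam.io import ReadFromText, WriteToText
from apache_beam.options.pipeline_options import PipelineOptions

p = beam.Pipeline(options=PipelineOptions())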


data_from_source = (p
                    | 'ReadMyFile' >> ReadFromText('gs://play_with_data/BreadBasket_DMS.csv')
                    | 'Splitter using beam.Map' >> beam.Map(lambda record: (record.split(','))[0])
                    | 'Map record to 1' >> beam.Map(lambda record: (record, 1))
                    | 'GroupBy the data' >> beam.GroupByKey()
                    | 'Get the total in each day' >> beam.ParDo(GetTotal())
                    | 'Export results to new file' >> WriteToText('gs://play_with_data/output/day-list', '.txt')
                    )

result = p.run()
Example #19
  def test_combine_values(self):
    occurences = [('cat', 1), ('cat', 5), ('cat', 9), ('dog', 5), ('dog', 2)]
    # [START combine_values]
    first_occurences = occurences | beam.GroupByKey() | beam.CombineValues(min)
    # [END combine_values]
    self.assertEqual({('cat', 1), ('dog', 2)}, set(first_occurences))
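
# For comparison, the same combine expressed against an explicit pipeline (a
# minimal sketch using Beam's testing utilities):
import apache_beam as beam
from apache_beam.testing.util import assert_that, equal_to

with beam.Pipeline() as p:
    first = (p
             | beam.Create([('cat', 1), ('cat', 5), ('cat', 9),
                            ('dog', 5), ('dog', 2)])
             | beam.GroupByKey()
             | beam.CombineValues(min))
    assert_that(first, equal_to([('cat', 1), ('dog', 2)]))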
Example #20
def assign_timevalue(v):
    # Assign an event timestamp of roughly "now", jittered by up to one second,
    # so that elements fall into (slightly) different fixed windows.
    return window.TimestampedValue(v, int(time.time()) + random.randint(0, 1))


def modify_data3(kvpair):
    # GroupByKey passes a tuple of (key, list of values sharing that key).
    # Because the data has been split into windows, each group holds fewer elements.
    # e.g. kvpair = (u'word only', [4, 4, 6, 6, 7])
    return {'count_type': kvpair[0], 'sum': sum(kvpair[1])}
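
# Assumed setup for the pipeline below (a sketch; the options are placeholders,
# and on Dataflow you would also configure GoogleCloudOptions such as project
# and temp_location):
import random
import time

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.transforms import window

options = PipelineOptions()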


p7 = beam.Pipeline(options=options)

query = 'SELECT * FROM [PROJECTID:testdataset.testtable3] LIMIT 20'
(p7 | 'read' >> beam.io.Read(
    beam.io.BigQuerySource(
        project='PROJECTID', use_standard_sql=False, query=query))
 | "assign tv" >> beam.Map(assign_timevalue)
 | 'window' >> beam.WindowInto(beam.window.FixedWindows(1))
 | 'pair' >> beam.Map(lambda x: (x['count_type'], x['word_count']))
 | "groupby" >> beam.GroupByKey()
 | 'modify' >> beam.Map(modify_data3)
 | 'write' >> beam.io.Write(
     beam.io.BigQuerySink(
         'testdataset.testtable5',
         schema='count_type:STRING, sum:INTEGER',
         create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
         write_disposition=beam.io.BigQueryDisposition.WRITE_TRUNCATE)))

p7.run()  # .wait_until_finish()
Example #21
    def test_multiple_destinations_transform(self):
        output_table_1 = '%s%s' % (self.output_table, 1)
        output_table_2 = '%s%s' % (self.output_table, 2)
        output_table_3 = '%s%s' % (self.output_table, 3)
        output_table_4 = '%s%s' % (self.output_table, 4)
        schema1 = bigquery.WriteToBigQuery.get_dict_table_schema(
            bigquery_tools.parse_table_schema_from_json(self.BIG_QUERY_SCHEMA))
        schema2 = bigquery.WriteToBigQuery.get_dict_table_schema(
            bigquery_tools.parse_table_schema_from_json(
                self.BIG_QUERY_SCHEMA_2))

        schema_kv_pairs = [
            (output_table_1, schema1), (output_table_2, schema2),
            (output_table_3, schema1), (output_table_4, schema2)
        ]
        pipeline_verifiers = [
            BigqueryFullResultMatcher(
                project=self.project,
                query="SELECT * FROM %s" % output_table_1,
                data=[(d['name'], d['language']) for d in _ELEMENTS
                      if 'language' in d]),
            BigqueryFullResultMatcher(
                project=self.project,
                query="SELECT * FROM %s" % output_table_2,
                data=[(d['name'], d['foundation']) for d in _ELEMENTS
                      if 'foundation' in d]),
            BigqueryFullResultMatcher(
                project=self.project,
                query="SELECT * FROM %s" % output_table_3,
                data=[(d['name'], d['language']) for d in _ELEMENTS
                      if 'language' in d]),
            BigqueryFullResultMatcher(
                project=self.project,
                query="SELECT * FROM %s" % output_table_4,
                data=[(d['name'], d['foundation']) for d in _ELEMENTS
                      if 'foundation' in d])
        ]

        args = self.test_pipeline.get_full_options_as_args(
            on_success_matcher=all_of(*pipeline_verifiers),
            experiments='use_beam_bq_sink')

        with beam.Pipeline(argv=args) as p:
            input = p | beam.Create(_ELEMENTS)

            schema_map_pcv = beam.pvalue.AsDict(
                p | "MakeSchemas" >> beam.Create(schema_kv_pairs))

            table_record_pcv = beam.pvalue.AsDict(
                p | "MakeTables" >> beam.Create([('table1', output_table_1),
                                                 ('table2', output_table_2)]))

            # Get all input onto the same machine
            input = (input
                     | beam.Map(lambda x: (None, x))
                     | beam.GroupByKey()
                     | beam.FlatMap(lambda elm: elm[1]))
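            # (Keying every element by None and grouping sends the whole input
            # through a single key, so all records land in one bundle before
            # the FlatMap fans them back out.)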

            _ = (
                input
                | "WriteWithMultipleDestsFreely" >> bigquery.WriteToBigQuery(
                    table=lambda x, tables:
                    (tables['table1']
                     if 'language' in x else tables['table2']),
                    table_side_inputs=(table_record_pcv, ),
                    schema=lambda dest, schema_map: schema_map.get(dest, None),
                    schema_side_inputs=(schema_map_pcv, ),
                    create_disposition=beam.io.BigQueryDisposition.
                    CREATE_IF_NEEDED,
                    write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY))

            _ = (input | "WriteWithMultipleDests" >> bigquery.WriteToBigQuery(
                table=lambda x:
                (output_table_3 if 'language' in x else output_table_4),
                schema=lambda dest, schema_map: schema_map.get(dest, None),
                schema_side_inputs=(schema_map_pcv, ),
                create_disposition=beam.io.BigQueryDisposition.
                CREATE_IF_NEEDED,
                write_disposition=beam.io.BigQueryDisposition.WRITE_EMPTY,
                max_file_size=20,
                max_files_per_bundle=-1))
Ejemplo n.º 22
    out_pcoll = in_pcoll | 'Extract Actor and Actress' >> beam.ParDo(
        ActorActressCountFn()).with_outputs(
            ActorActressCountFn.OUTPUT_TAG_ACTOR_COUNT,
            ActorActressCountFn.OUTPUT_TAG_ACTRESS_COUNT)

    actor_pcoll = out_pcoll[ActorActressCountFn.OUTPUT_TAG_ACTOR_COUNT]
    actress_pcoll = out_pcoll[ActorActressCountFn.OUTPUT_TAG_ACTRESS_COUNT]

    # write PCollections to files
    actor_pcoll | 'Write Actor File 1' >> WriteToText(DIR_PATH_OUT +
                                                      'actor_output.txt')
    actress_pcoll | 'Write Actress File 1' >> WriteToText(DIR_PATH_OUT +
                                                          'actress_output.txt')

    # apply GroupByKey
    grouped_actor_pcoll = actor_pcoll | 'Group by Actor' >> beam.GroupByKey()
    grouped_actress_pcoll = (
        actress_pcoll | 'Group by Actress' >> beam.GroupByKey())

    # write PCollections to files
    grouped_actor_pcoll | 'Write Actor File 2' >> WriteToText(
        DIR_PATH_OUT + 'grouped_actor_output.txt')
    grouped_actress_pcoll | 'Write Actress File 2' >> WriteToText(
        DIR_PATH_OUT + 'grouped_actress_output.txt')

    # apply ParDo
    summed_actor_pcoll = grouped_actor_pcoll | 'Sum up Actor Nominations' >> beam.ParDo(
        SumNominationsFn())
    summed_actress_pcoll = grouped_actress_pcoll | 'Sum up Actress Nominations' >> beam.ParDo(
        SumNominationsFn())
Ejemplo n.º 23
File: c4.py Project: vinayya/nlp
    def _get_page_content(self, pipeline, file_paths, dl_manager):
        """Build PCollection of un-split page content."""
        import apache_beam as beam

        wet_file_paths = pipeline | "create_wet_files" >> beam.Create(
            file_paths["wet_files"])
        if "wet_urls" in file_paths:

            def download_url(url, downloader, pipeline):
                path = downloader.download(url)
                if not pipeline.is_local():
                    path = downloader.ship_files_with_pipeline(path, pipeline)
                return path

            dl_wet_file_paths = (
                pipeline
                | "create_wet_urls" >> beam.Create(file_paths["wet_urls"])
                | beam.Map(
                    download_url, downloader=dl_manager, pipeline=pipeline))
            wet_file_paths = (wet_file_paths,
                              dl_wet_file_paths) | beam.Flatten()

        # Parse WET files and filter by length.
        # Output: url, text
        page_content = wet_file_paths | beam.FlatMap(
            split_wet_file) | beam.Filter(is_valid_length)

        # Optionally filter for RealNews domains.
        # Output: url, text
        if self.config.realnewslike:
            with open(file_paths["realnews_domains"], "r",
                      encoding="utf-8") as f:
                realnews_domains = json.load(f)
            page_content = page_content | beam.Filter(is_realnews_domain,
                                                      realnews_domains)

        # Normalize and deduplicate by URL.
        # Output: url, text
        page_content = (page_content
                        | "normalize_url" >> beam.Map(normalize_url)
                        | "group_url" >> beam.GroupByKey()
                        | beam.Map(dedupe_urls))
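        # (Grouping on the normalized URL collects duplicate pages under one
        # key so that dedupe_urls can keep a single page per URL.)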

        # Optionally filter for WebText-like URLs.
        # Output: url, text
        if self.config.webtextlike:
            webtextlike_urls = (
                pipeline
                | "read_webtextlike_urls" >> beam.io.ReadFromText(
                    os.path.join(file_paths["openwebtext_urls_zip"],
                                 _OPENWEBTEXT_URLS_FILE_PATTERN))
                | "add_dummy_page" >> beam.Map(lambda x: (x, ""))
                | "normal_webtext_url" >> beam.Map(normalize_url))
            page_content = ({
                "text": page_content,
                "webtextlike_urls": webtextlike_urls
            }
                            | "group_webtextlike_urls" >> beam.CoGroupByKey()
                            | beam.FlatMap(filter_by_webtextlike))

        # Optionally clean pages of badwords, boilerplate text, and duplicate
        # spans of sentences.
        # Output: url, text
        if self.config.clean:
            with open(file_paths["badwords"], "r", encoding="utf-8") as f:
                badwords = [l.strip() for l in f]
            page_content = page_content | "clean_pages" >> beam.FlatMap(
                get_clean_page_fn(badwords))
            page_content = remove_duplicate_text(page_content)

        # Optionally filter out non-`language` pages. We do this after cleaning
        # since it may change the predominant language.
        if self.config.lang != "all":
            page_content |= beam.Filter(is_language, language=self.config.lang)

        return page_content
Ejemplo n.º 24
    def _execute(self, window_fn, trigger_fn, accumulation_mode,
                 timestamp_combiner, transcript, spec):

        runner_name = TestPipeline().runner.__class__.__name__
        if runner_name in spec.get('broken_on', ()):
            self.skipTest('Known to be broken on %s' % runner_name)

        test_stream = TestStream()
        for action, params in transcript:
            if action == 'expect':
                test_stream.add_elements([('expect', params)])
            else:
                test_stream.add_elements([('expect', [])])
                if action == 'input':
                    test_stream.add_elements([('input', e) for e in params])
                elif action == 'watermark':
                    test_stream.advance_watermark_to(params)
                elif action == 'clock':
                    test_stream.advance_processing_time(params)
                elif action == 'state':
                    pass  # Requires inspection of implementation details.
                else:
                    raise ValueError('Unexpected action: %s' % action)
        test_stream.add_elements([('expect', [])])

        class Check(beam.DoFn):
            """A StatefulDoFn that verifies outputs are produced as expected.

      This DoFn takes in two kinds of inputs, actual outputs and
      expected outputs.  When an actual output is received, it is buffered
      into state, and when an expected output is received, this buffered
      state is retrieved and compared against the expected value(s) to ensure
      they match.

      The key is ignored, but all items must be on the same key to share state.
      """
            def process(self,
                        element,
                        seen=beam.DoFn.StateParam(
                            beam.transforms.userstate.BagStateSpec(
                                'seen', beam.coders.FastPrimitivesCoder()))):
                _, (action, data) = element
                if action == 'actual':
                    seen.add(data)

                elif action == 'expect':
                    actual = list(seen.read())
                    seen.clear()

                    if len(actual) > len(data):
                        raise AssertionError(
                            'Unexpected output: expected %s but got %s' %
                            (data, actual))
                    elif len(data) > len(actual):
                        raise AssertionError(
                            'Unmatched output: expected %s but got %s' %
                            (data, actual))
                    else:

                        def diff(actual, expected):
                            for key in sorted(expected.keys(), reverse=True):
                                if key in actual:
                                    if actual[key] != expected[key]:
                                        return key

                        for output in actual:
                            diffs = [
                                diff(output, expected) for expected in data
                            ]
                            if all(diffs):
                                raise AssertionError(
                                    'Unmatched output: %s not found in %s (diffs in %s)'
                                    % (output, data, diffs))

                else:
                    raise ValueError('Unexpected action: %s' % action)

        with TestPipeline(options=PipelineOptions(streaming=True)) as p:
            # Split the test stream into a branch of to-be-processed elements, and
            # a branch of expected results.
            inputs, expected = (
                p
                | test_stream
                | beam.MapTuple(lambda tag, value: beam.pvalue.TaggedOutput(
                    tag, ('key', value))).with_outputs('input', 'expect'))
            # Process the inputs with the given windowing to produce actual outputs.
            outputs = (
                inputs
                | beam.MapTuple(lambda key, value: TimestampedValue(
                    (key, value), value))
                | beam.WindowInto(window_fn,
                                  trigger=trigger_fn,
                                  accumulation_mode=accumulation_mode,
                                  timestamp_combiner=timestamp_combiner)
                | beam.GroupByKey()
                | beam.MapTuple(
                    lambda k, vs, window=beam.DoFn.WindowParam, t=beam.DoFn.
                    TimestampParam:
                    (k,
                     self._windowed_value_info(
                         WindowedValue(vs, windows=[window], timestamp=t))))
                # Place outputs back into the global window to allow flattening
                # and share a single state in Check.
                | 'Global' >> beam.WindowInto(
                    beam.transforms.window.GlobalWindows()))
            # Feed both the expected and actual outputs to Check() for comparison.
            tagged_expected = (
                expected
                | beam.MapTuple(lambda key, value: (key, ('expect', value))))
            tagged_outputs = (
                outputs
                | beam.MapTuple(lambda key, value: (key, ('actual', value))))
            # pylint: disable=expression-not-assigned
            (tagged_expected, tagged_outputs) | beam.Flatten() | beam.ParDo(
                Check())
Ejemplo n.º 25
options.view_as(StandardOptions).runner = 'DirectRunner'

# Set pipeline options
p = beam.Pipeline(options=options)
# The "read" transform reads the text from the input file to create a
# PCollection containing all the text lines.
lines = p | "read" >> ReadFromText(inFile)
# "counts" applies a ParDo that tokenizes each line into individual words,
# pairs each word with 1, then groups by word and sums the counts to emit
# the output.
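# For example, GroupByKey turns the ('word', 1) pairs into ('word', [1, 1, 1]),
# which the final Map reduces to ('word', 3). A minimal single-step alternative
# to the group-and-sum pair, assuming the same ('word', 1) input, would be
# beam.CombinePerKey(sum).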
debug = 2
if (debug == 0):
    counts = (lines
              | "split" >> beam.ParDo(classExample).with_output_types(str)
              | "pair_with_1" >> beam.Map(lambda x: (x, 1))
              | "group" >> beam.GroupByKey()
              | "count" >> beam.Map(lambda x: (x[0], sum(x[1]))))
    output = counts | "format" >> beam.Map(lambda x: "%s: %s" % (x[0], x[1]))
    output | "write" >> WriteToText(outFile)
elif (debug == 1):
    totals = (lines
              | "get_counts" >> beam.ParDo(countWordsOfLine2)
              | "pair_with_1" >> beam.Map(lambda x: (x, 1))
              | "group" >> beam.GroupByKey()
              | "total" >> beam.Map(lambda x: (x[0], sum(x[1]))))

    output2 = totals | "format" >> beam.Map(lambda x: "%s: %s" % (x[0], x[1]))
    output2 | "write" >> WriteToText(outFile2)
else:
    counts = (lines
              | "split" >> beam.ParDo(classExample).with_output_types(str)
Ejemplo n.º 26
def run(argv=None):
    """Main entry point; defines and runs the tfidf pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_baseline',
                        required=False,
                        help='baseline URIs to process.')
    parser.add_argument('--input_updates',
                        required=False,
                        help='updates URIs to process.')
    parser.add_argument('--input_enriched',
                        required=False,
                        help='enriched JSON URIs to process.')
    parser.add_argument('--output',
                        required=False,
                        help='Output file to write results to.')
    parser.add_argument('--output_enriched',
                        required=False,
                        help='Output file to write enriched articles to.')
    parser.add_argument('--output_splitted',
                        required=False,
                        help='Output file prefix for the split outputs (concepts, '
                             'bioentities, tagged text, small articles).')
    known_args, pipeline_args = parser.parse_known_args(argv)
    # bq_table_schema = parse_bq_json_schema(json.load(open('schemas/medline.papers.json')))
    bq_table_schema = parse_bq_json_schema(json.loads(BQ_SCHEMA))
    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=pipeline_options) as p:

        if known_args.input_baseline or known_args.input_updates:

            if known_args.input_baseline and known_args.input_updates:
                medline_articles_base = p | 'BaselineEmitXML' >> ReadMedlineFiles(known_args.input_baseline)
                medline_articles_updates = p | 'UpdatesEmitXML' >> ReadMedlineFiles(known_args.input_updates)

                medline_articles = (
                    (medline_articles_base, medline_articles_updates)
                    | beam.Flatten())
            elif known_args.input_baseline:
                medline_articles = p | 'BaselineEmitXML' >> ReadMedlineFiles(known_args.input_baseline)
            elif known_args.input_updates:
                medline_articles = p | 'UpdatesEmitXML' >> ReadMedlineFiles(known_args.input_updates)
            else:
                raise AttributeError('at least one XML input is required')

            parsed_medline_articles = medline_articles | 'ParseXMLtoDict' >> beam.ParDo(MedlineXMLParser())

            medline_articles_grouped_by_id = parsed_medline_articles | 'GroupByPMID' >> beam.GroupByKey()

            unique_medline_articles = medline_articles_grouped_by_id | 'SortByFilename' >> beam.ParDo(
                GetLatestVersion())

            enriched_articles = unique_medline_articles | 'NLPAnalysis' >> beam.ParDo(NLPAnalysis())

            json_enriched_medline_articles = enriched_articles | 'EnrichedMedlineToJSON' >> beam.ParDo(ToJSON())

            json_enriched_medline_articles | 'WriteEnrichedJSONToGS' >> WriteToText(known_args.output_enriched,
                                                                                    file_name_suffix='_enriched.json.gz')

        elif known_args.input_enriched:

            json_enriched_medline_articles = p | 'GetEnrichedArticles' >> ReadFromText(known_args.input_enriched)

        else:
            raise AttributeError('missing JSON enriched data input')

        if known_args.output_splitted:

            concepts = json_enriched_medline_articles | 'ArticleToConcepts' >> beam.ParDo(ExtractConcepts())
            concepts | 'WriteConceptJSONToGS' >> WriteToText(known_args.output_splitted,
                                                             file_name_suffix='_concepts.json.gz')

            bioentities = json_enriched_medline_articles | 'ArticleToBioentities' >> beam.ParDo(ExtractBioentities())
            bioentities | 'WriteBioentityJSONToGS' >> WriteToText(known_args.output_splitted,
                                                                  file_name_suffix='_bioentities.json.gz')

            taggedtext = json_enriched_medline_articles | 'ArticleToTaggedText' >> beam.ParDo(ExtractTaggedText())
            taggedtext | 'WriteTaggedTextJSONToGS' >> WriteToText(known_args.output_splitted,
                                                                  file_name_suffix='_taggedtext.json.gz')

            smallarticles = json_enriched_medline_articles | 'ArticleToSmallArticles' >> beam.ParDo(CleanPublication())
            smallarticles | 'WriteSmallArticleJSONToGS' >> WriteToText(known_args.output_splitted,
                                                                       file_name_suffix='_small.json.gz')
Ejemplo n.º 27
    def expand(self, pcoll):
        p = pcoll.pipeline

        temp_location = p.options.view_as(GoogleCloudOptions).temp_location

        empty_pc = p | "ImpulseEmptyPC" >> beam.Create([])
        singleton_pc = p | "ImpulseSingleElementPC" >> beam.Create([None])

        load_job_name_pcv = pvalue.AsSingleton(
            singleton_pc
            | beam.Map(lambda _: _generate_load_job_name()))

        file_prefix_pcv = pvalue.AsSingleton(
            singleton_pc
            | "GenerateFilePrefix" >> beam.Map(
                file_prefix_generator(self._validate,
                                      self._custom_gcs_temp_location,
                                      temp_location)))

        destination_data_kv_pc = (
            pcoll
            | "RewindowIntoGlobal" >> self._window_fn()
            | "AppendDestination" >> beam.ParDo(
                bigquery_tools.AppendDestinationsFn(self.destination), *
                self.table_side_inputs))

        all_destination_file_pairs_pc = self._write_files(
            destination_data_kv_pc, file_prefix_pcv)

        grouped_files_pc = (
            all_destination_file_pairs_pc
            | "GroupFilesByTableDestinations" >> beam.GroupByKey())

        partitions = (
            grouped_files_pc
            | beam.ParDo(
                PartitionFiles(self.max_partition_size,
                               self.max_files_per_partition)).with_outputs(
                                   PartitionFiles.MULTIPLE_PARTITIONS_TAG,
                                   PartitionFiles.SINGLE_PARTITION_TAG))

        multiple_partitions_per_destination_pc = partitions[
            PartitionFiles.MULTIPLE_PARTITIONS_TAG]
        single_partition_per_destination_pc = partitions[
            PartitionFiles.SINGLE_PARTITION_TAG]

        # When using dynamic destinations, elements with both single and
        # multiple partitions are loaded into BigQuery using temporary tables
        # to ensure atomicity.
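        # In the non-dynamic branch below, only multi-partition destinations
        # are presumably routed through temporary tables, while
        # single-partition destinations are handed to _load_data for a direct
        # load into their target tables.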
        if self.dynamic_destinations:
            all_partitions = ((multiple_partitions_per_destination_pc,
                               single_partition_per_destination_pc)
                              | "FlattenPartitions" >> beam.Flatten())
            destination_load_job_ids_pc, destination_copy_job_ids_pc = self.\
              _load_data(all_partitions, empty_pc, load_job_name_pcv,
                         singleton_pc)
        else:
            destination_load_job_ids_pc, destination_copy_job_ids_pc = self.\
              _load_data(multiple_partitions_per_destination_pc,
                         single_partition_per_destination_pc,
                         load_job_name_pcv, singleton_pc)

        return {
            self.DESTINATION_JOBID_PAIRS: destination_load_job_ids_pc,
            self.DESTINATION_FILE_PAIRS: all_destination_file_pairs_pc,
            self.DESTINATION_COPY_JOBID_PAIRS: destination_copy_job_ids_pc,
        }
Ejemplo n.º 28
    def expand(self, pcoll):
        return pcoll | 'TestLabel' >> beam.GroupByKey()
Ejemplo n.º 29
# Get the BQ table I want to manipulate
sql = 'SELECT * FROM seanlahman_modeled.players2'

bq_source = beam.io.BigQuerySource(query=sql, use_standard_sql=True)

query_results = p | 'Read from BigQuery' >> beam.io.Read(bq_source)


# write PCollection to log file
query_results | 'Write to log 1' >> WriteToText(DIR_PATH + 'input.txt')

# apply a ParDo to the PCollection 
t1_pcoll = query_results | 'Extract Players' >> beam.ParDo(CombineDOBFn())

# apply GroupByKey to the PCollection
intermediate_pcoll = t1_pcoll | 'Group by players' >> beam.GroupByKey()

# write PCollection to a file 
intermediate_pcoll | 'Write File Intermediately' >> WriteToText(DIR_PATH + 'unwindowed.txt')

# Manipulate the file to send to BQ
done = intermediate_pcoll | 'Make BQ Record' >> beam.ParDo(MakeRecordFn())

# Make the clean data a txt file
done | 'Write File' >> WriteToText(DIR_PATH + 'output.txt')
    
# make the BQ table
qualified_table_name = PROJECT_ID + ':seanlahman_modeled.players2_Beam_DF'
    
dataset_id = 'seanlahman_modeled'
table_id = 'players2_Beam_DF'
Ejemplo n.º 30
#   Licensed to the Apache Software Foundation (ASF) under one
#   or more contributor license agreements.  See the NOTICE file
#   distributed with this work for additional information
#   regarding copyright ownership.  The ASF licenses this file
#   to you under the Apache License, Version 2.0 (the
#   "License"); you may not use this file except in compliance
#   with the License.  You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
#   Unless required by applicable law or agreed to in writing, software
#   distributed under the License is distributed on an "AS IS" BASIS,
#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#   See the License for the specific language governing permissions and
#   limitations under the License.

import apache_beam as beam

from log_elements import LogElements

p = beam.Pipeline()

(p | beam.Create(['apple', 'ball', 'car', 'bear', 'cheetah', 'ant'])
 | beam.Map(lambda word: (word[0], word))
 | beam.GroupByKey()
 | LogElements())
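# Groups the words by their first letter, e.g. ('a', ['apple', 'ant']),
# ('b', ['ball', 'bear']), ('c', ['car', 'cheetah']) (ordering not guaranteed).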

p.run()