def testRow(self):
  self.assertReturnType(
      row_type.RowTypeConstraint([('x', int), ('y', str)]),
      lambda x, y: beam.Row(x=x + 1, y=y), [int, str])
  self.assertReturnType(
      row_type.RowTypeConstraint([('x', int), ('y', str)]),
      lambda x: beam.Row(x=x, y=str(x)), [int])
def main():
  logging.basicConfig(level=logging.INFO, format='%(levelname)-8s: %(message)s')
  parser = argparse.ArgumentParser(description=__doc__.strip())
  parser.add_argument('filename', help='Beancount ledger filename')
  args, pipeline_args = parser.parse_known_args()

  # Read the ledger.
  logging.info("Reading ledger.")
  t1 = time.time()
  entries, errors, options_map = loader.load_file(args.filename)
  # beam.Row accepts keyword arguments only; the field names here follow the
  # posting attributes (the SQL below relies on `account`).
  postings = (
      beam.Row(
          account=posting.account,
          number=posting.units.number,
          currency=posting.units.currency)
      for entry in data.filter_txns(entries)
      for posting in entry.postings)
  price_map = prices.build_price_map(entries)
  t2 = time.time()
  logging.info("Read ledger in %.1f secs.", t2 - t1)

  with CreatePipeline(pipeline_args) as pipeline:
    _ = (
        pipeline
        | beam.Create(postings)
        | SqlTransform(
            """
              SELECT account FROM PCOLLECTION
            """,
            dialect="zetasql")
        | beam.Map(print))
def reformat_record(field):
  post_id = str(field.get("id"))
  return beam.Row(
      post_id=post_id,
      title=field.get("title"),
      tags=field.get("tags"),
  )
def Stats(samples):
  # First, convert the Python list of floats to a numpy array, to save
  # scipy from repeatedly converting it implicitly.
  arr = np.array(samples, dtype=np.float64)
  # Immediately clear the Python list to minimise memory pressure.
  samples.clear()
  # Calculate the various summary statistics.
  stats = scipy.stats.describe(arr)
  quartiles = scipy.stats.mstats.mquantiles(arr)
  return beam.Row(
      num_samples=stats.nobs,
      kurtosis=stats.kurtosis,
      skewness=stats.skewness,
      iqr=scipy.stats.iqr(arr),
      variance=stats.variance,
      min=stats.minmax[0],
      max=stats.minmax[1],
      mean=stats.mean,
      first_quartile=quartiles[0],
      median=quartiles[1],
      third_quartile=quartiles[2],
      cv=scipy.stats.mstats.variation(arr),
      std_dev=math.sqrt(stats.variance),
      std_err=scipy.stats.sem(arr),
  )
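A minimal sketch of how Stats could be fed from a pipeline. Since it consumes a plain Python list, one option (assuming the samples fit in memory on a single worker) is to collect them with beam.combiners.ToList() first; the pipeline `p` and the literal samples are illustrative.

_ = (
    p
    | beam.Create([1.0, 2.0, 3.0, 4.0])
    # Collect the whole PCollection into a single Python list.
    | beam.combiners.ToList()
    # Stats then emits one beam.Row of summary statistics.
    | beam.Map(Stats))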
def process(self, element, *args, **kwargs):
  res = beam.Row(
      user=element['actor'],
      action=element['action'],
      created_at=datetime.fromisoformat(
          element['created_at']).strftime('%Y-%m-%d %H:%M:%S.%f'))
  yield res
def run(argv=None):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/shakespeare/kinglear.txt',
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      required=True,
      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # Import this here to avoid pickling the main session.
  import re

  # The pipeline will be run on exiting the with block.
  with beam.Pipeline(options=PipelineOptions(pipeline_args)) as p:
    # Read the text file[pattern] into a PCollection.
    lines = p | 'Read' >> ReadFromText(known_args.input)

    words = (
        lines
        | 'Split' >> beam.FlatMap(
            lambda line: re.findall(r'[\w]+', line)).with_output_types(str)
        # Map to Row objects to generate a schema suitable for conversion
        # to a dataframe.
        | 'ToRows' >> beam.Map(lambda word: beam.Row(word=word)))

    df = to_dataframe(words)
    df['count'] = 1
    counted = df.groupby('word').sum()
    counted.to_csv(known_args.output)
def test_as_external_transform(self):
  with FullyQualifiedNamedTransform.with_filter('*'):
    with beam.Pipeline() as p:
      assert_that(
          p
          | beam.Create(['a', 'b', 'c'])
          | beam.ExternalTransform(
              PYTHON_FULLY_QUALIFIED_NAMED_TRANSFORM_URN,
              ImplicitSchemaPayloadBuilder({
                  'constructor': 'apache_beam.transforms'
                  '.fully_qualified_named_transform_test._TestTransform',
                  'args': beam.Row(arg0='x'),
                  'kwargs': beam.Row(suffix='y'),
              }),
              expansion_service.ExpansionServiceServicer()),
          equal_to(['xay', 'xby', 'xcy']))
def test_row(self):
  with TestPipeline() as p:
    out = (
        p
        | beam.Create([1, 2, 10])
        | beam.Map(lambda x: beam.Row(a=x, b=str(x)))
        | SqlTransform("SELECT a*a as s, LENGTH(b) AS c FROM PCOLLECTION"))
    assert_that(out, equal_to([(1, 1), (4, 1), (100, 2)]))
def ToRowAirport(values):
  return beam.Row(
      i94port=str(values["i94port"]) if "i94port" in values else "NaN",
      municipality=str(values["municipality"])
      if "municipality" in values else "NaN",
  )
def DateTransform(yyyymmdd):
  if yyyymmdd == 'yesterday':
    d = (datetime.datetime.utcnow() - datetime.timedelta(days=1)).date()
  else:
    d = datetime.date(
        int(yyyymmdd[:4]), int(yyyymmdd[4:6]), int(yyyymmdd[6:8]))
  return beam.Row(date=d)
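A quick illustration of the parsing branch; the literal date below is hypothetical, and the comparison relies on beam.Row's field-wise equality.

# 'YYYYMMDD' strings are sliced into year/month/day components.
assert DateTransform('20230102') == beam.Row(date=datetime.date(2023, 1, 2))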
def normalize(x):
  if isinstance(x, tuple) and hasattr(x, '_fields'):
    # A named tuple.
    return beam.Row(**dict(zip(x._fields, x)))
  elif isinstance(x, typing.Iterable) and not isinstance(x, (str, beam.Row)):
    return UnorderedList(normalize(e) for e in x)
  else:
    return x
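A small sketch of what normalize does to the two simplest cases; the Point namedtuple is purely illustrative, and neither case touches the UnorderedList helper.

import collections

Point = collections.namedtuple('Point', ['x', 'y'])

# Named tuples are converted field-by-field into a beam.Row.
assert normalize(Point(1, 2)) == beam.Row(x=1, y=2)
# Strings are iterable but are deliberately passed through unchanged.
assert normalize('abc') == 'abc'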
def test_instance_check_windowed_value_holder(self):
  windowed_value = WindowedValue(
      'a',
      Timestamp(5), [beam.window.IntervalWindow(5, 10)],
      PaneInfo(True, True, PaneInfoTiming.ON_TIME, 0, 0))
  self.assertTrue(
      isinstance(WindowedValueHolder(windowed_value), WindowedValueHolder))
  self.assertTrue(
      isinstance(
          beam.Row(
              windowed_value=windowed_value, urn=common_urns.coders.ROW.urn),
          WindowedValueHolder))
  self.assertFalse(
      isinstance(beam.Row(windowed_value=windowed_value), WindowedValueHolder))
  self.assertFalse(isinstance(windowed_value, WindowedValueHolder))
  self.assertFalse(
      isinstance(beam.Row(x=windowed_value), WindowedValueHolder))
  self.assertFalse(
      isinstance(beam.Row(windowed_value=1), WindowedValueHolder))
def test_unbatching_series(self):
  with beam.Pipeline() as p:
    result = (
        p
        | beam.Create([(u'Falcon', 380.), (u'Falcon', 370.),
                       (u'Parrot', 24.), (u'Parrot', 26.)])
        | beam.Map(lambda tpl: beam.Row(Animal=tpl[0], Speed=tpl[1]))
        | transforms.DataframeTransform(lambda df: df.Animal))

    assert_that(result, equal_to(['Falcon', 'Falcon', 'Parrot', 'Parrot']))
def process_weather_entries(el):
  key = el[0]
  weather = el[1]
  return beam.Row(
      country_code=key.country_code,
      obsdate=key.obsdate,
      temp=mean([x.temp for x in weather if x.temp < 9999.9]),
      windspeed=mean([x.windspeed for x in weather if x.windspeed < 999.9]),
      tornadoes=any([x.tornado_or_funnel for x in weather]),
  )
def test_batching_beam_row_input(self):
  with beam.Pipeline() as p:
    result = (
        p
        | beam.Create([(u'Falcon', 380.), (u'Falcon', 370.),
                       (u'Parrot', 24.), (u'Parrot', 26.)])
        | beam.Map(lambda tpl: beam.Row(Animal=tpl[0], Speed=tpl[1]))
        | transforms.DataframeTransform(
            lambda df: df.groupby('Animal').mean(), include_indexes=True))

    assert_that(result, equal_to([('Falcon', 375.), ('Parrot', 25.)]))
def run(argv=None):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/shakespeare/kinglear.txt',
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      required=True,
      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # Import this here to avoid pickling the main session.
  import re

  # The pipeline will be run on exiting the with block.
  with beam.Pipeline(options=PipelineOptions(pipeline_args)) as p:
    # Read the text file[pattern] into a PCollection.
    lines = p | 'Read' >> ReadFromText(known_args.input)

    words = (
        lines
        | 'Split' >> beam.FlatMap(
            lambda line: re.findall(r'[\w]+', line)).with_output_types(str)
        # Map to Row objects to generate a schema suitable for conversion
        # to a dataframe.
        | 'ToRows' >> beam.Map(lambda word: beam.Row(word=word)))

    df = to_dataframe(words)
    df['count'] = 1
    counted = df.groupby('word').sum()
    counted.to_csv(known_args.output)

    # Deferred DataFrames can also be converted back to schema'd PCollections
    counted_pc = to_pcollection(counted, include_indexes=True)

    # Print out every word that occurred >50 times
    _ = (
        counted_pc
        | beam.Filter(lambda row: row.count > 50)
        | beam.Map(lambda row: f'{row.word}: {row.count}')
        | beam.Map(print))


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()
def calc_max_consecutive_tornado_days(el):
  key, entries = el
  entries = sorted(entries, key=lambda x: x.obsdate, reverse=True)
  return (
      key.year,
      beam.Row(
          country_code=key.country_code,
          max_consecutive_tornado_days=max_consec_sequence_len(
              [x.tornadoes for x in entries], True),
      ),
  )
def test_batching_beam_row_to_dataframe(self):
  with beam.Pipeline() as p:
    df = convert.to_dataframe(
        p
        | beam.Create([(u'Falcon', 380.), (u'Falcon', 370.),
                       (u'Parrot', 24.), (u'Parrot', 26.)])
        | beam.Map(lambda tpl: beam.Row(Animal=tpl[0], Speed=tpl[1])))

    result = convert.to_pcollection(
        df.groupby('Animal').mean(), include_indexes=True)

    assert_that(result, equal_to([('Falcon', 375.), ('Parrot', 25.)]))
def test_batching_beam_row_to_dataframe(self):
  with beam.Pipeline() as p:
    df = convert.to_dataframe(
        p
        | beam.Create([(u'Falcon', 380.), (u'Falcon', 370.),
                       (u'Parrot', 24.), (u'Parrot', 26.)])
        | beam.Map(lambda tpl: beam.Row(Animal=tpl[0], Speed=tpl[1])))

    result = convert.to_pcollection(df.groupby('Animal').mean())

    assert_that(
        result,
        df_equal_to(
            pd.DataFrame({
                'Animal': ['Falcon', 'Parrot'],
                'Speed': [375., 25.]
            }).set_index('Animal')))
def test_batching_beam_row_input(self):
  with beam.Pipeline() as p:
    result = (
        p
        | beam.Create([(u'Falcon', 380.), (u'Falcon', 370.),
                       (u'Parrot', 24.), (u'Parrot', 26.)])
        | beam.Map(lambda tpl: beam.Row(Animal=tpl[0], Speed=tpl[1]))
        | transforms.DataframeTransform(
            lambda df: df.groupby('Animal').mean()))

    assert_that(
        result,
        df_equal_to(
            pd.DataFrame({
                'Animal': ['Falcon', 'Parrot'],
                'Speed': [375., 25.]
            }).set_index('Animal')))
def process(self, element):
  if (element["dt"] > datetime.datetime(2011, 12, 31, 23, 59, 0, 0)) and (
      element["dt"] < datetime.datetime(2013, 1, 1, 0, 0, 0, 0)):
    return [
        beam.Row(
            dt=element["dt"] if "dt" in element else "NaN",
            avg_temp=float(element["AverageTemperature"])
            if "AverageTemperature" in element else "NaN",
            municipality=str(element["City"]) if "City" in element else "NaN",
            country=str(element["Country"])
            if "Country" in element else "NaN",
            month=element["dt"].month if "dt" in element else "NaN",
        )
    ]
def ToRowImmigration(values):
  return beam.Row(
      arrdate=int(values["arrdate"]) if values["arrdate"] is not None else 0,
      depdate=int(values["depdate"]) if values["depdate"] is not None else 0,
      # int("NaN") would raise ValueError at runtime, so missing integer
      # fields fall back to 0, matching the arrdate/depdate defaults above.
      i94mon=int(values["i94mon"]) if "i94mon" in values else 0,
      i94visa=int(values["i94visa"]) if "i94visa" in values else 0,
      i94port=str(values["i94port"]) if "i94port" in values else "NaN",
      i94addr=str(values["i94addr"]) if "i94addr" in values else "NaN",
      # The original chained comparison `!= None in values.keys()` did not
      # test what was intended; check the value directly instead.
      biryear=int(values["biryear"])
      if values.get("biryear") is not None else 0,
      gender=str(values["gender"]) if "gender" in values else "NaN",
  )
def test_dataframes(self):
  p = beam.Pipeline(
      runner=interactive_runner.InteractiveRunner(
          direct_runner.DirectRunner()))
  data = p | beam.Create(
      [1, 2, 3]) | beam.Map(lambda x: beam.Row(square=x * x, cube=x * x * x))
  df = to_dataframe(data)

  # Watch the local scope for Interactive Beam so that values will be cached.
  ib.watch(locals())

  # This is normally done in the interactive_utils when a transform is
  # applied but needs an IPython environment. So we manually run this here.
  ie.current_env().track_user_pipelines()

  df_expected = pd.DataFrame({'square': [1, 4, 9], 'cube': [1, 8, 27]})
  pd.testing.assert_frame_equal(
      df_expected, ib.collect(df, n=10).reset_index(drop=True))
def ToRowCity(values):
  return beam.Row(
      avg_median_age=float(values["avg_median_age"])
      if "avg_median_age" in values else float("NaN"),
      avg_male_population=float(values["avg_male_population"])
      if "avg_male_population" in values else float("NaN"),
      avg_female_population=float(values["avg_female_population"])
      if "avg_female_population" in values else float("NaN"),
      avg_total_population=float(values["avg_total_population"])
      if "avg_total_population" in values else float("NaN"),
      avg_number_veterans=float(values["avg_number_veterans"])
      if "avg_number_veterans" in values else float("NaN"),
      avg_foreign_born=float(values["avg_foreign_born"])
      if "avg_foreign_born" in values else float("NaN"),
      avg_average_household_size=float(values["avg_average_household_size"])
      if "avg_average_household_size" in values else float("NaN"),
      i94addr=str(values["i94addr"]),
  )
def run(output_topic, pipeline_args):
  pipeline_options = PipelineOptions(
      pipeline_args, save_main_session=True, streaming=True)
  with beam.Pipeline(options=pipeline_options) as pipeline:
    _ = (
        pipeline
        | beam.io.ReadFromPubSub(
            topic='projects/pubsub-public-data/topics/taxirides-realtime',
            timestamp_attribute="ts").with_output_types(bytes)
        | "Parse JSON payload" >> beam.Map(json.loads)
        # Use beam.Row to create a schema-aware PCollection
        | "Create beam Row" >> beam.Map(
            lambda x: beam.Row(
                ride_status=str(x['ride_status']),
                passenger_count=int(x['passenger_count'])))
        # SqlTransform computes its result within an existing window
        | "15s fixed windows" >> beam.WindowInto(beam.window.FixedWindows(15))
        # Aggregate drop offs and pick ups that occur within each 15s window
        | SqlTransform(
            """
             SELECT
               ride_status,
               COUNT(*) AS num_rides,
               SUM(passenger_count) AS total_passengers
             FROM PCOLLECTION
             WHERE NOT ride_status = 'enroute'
             GROUP BY ride_status""")
        # SqlTransform yields python objects with attributes corresponding to
        # the outputs of the query.
        # Collect those attributes, as well as window information, into a dict
        | "Assemble Dictionary" >> beam.Map(
            lambda row,
            window=beam.DoFn.WindowParam: {
                "ride_status": row.ride_status,
                "num_rides": row.num_rides,
                "total_passengers": row.total_passengers,
                "window_start": window.start.to_rfc3339(),
                "window_end": window.end.to_rfc3339()
            })
        | "Convert to JSON" >> beam.Map(json.dumps)
        | "UTF-8 encode" >> beam.Map(lambda s: s.encode("utf-8"))
        | beam.io.WriteToPubSub(topic=output_topic))
def testInstanceToType(self):
  class MyClass(object):
    def method(self):
      pass

  test_cases = [
      (typehints.Dict[str, int], {'a': 1}),
      (typehints.Dict[str, typehints.Union[str, int]], {'a': 1, 'b': 'c'}),
      (typehints.Dict[typehints.Any, typehints.Any], {}),
      (typehints.Set[str], {'a'}),
      (typehints.Set[typehints.Union[str, float]], {'a', 0.4}),
      (typehints.Set[typehints.Any], set()),
      (typehints.FrozenSet[str], frozenset(['a'])),
      (typehints.FrozenSet[typehints.Union[str, float]],
       frozenset(['a', 0.4])),
      (typehints.FrozenSet[typehints.Any], frozenset()),
      (typehints.Tuple[int], (1, )),
      (typehints.Tuple[int, int, str], (1, 2, '3')),
      (typehints.Tuple[()], ()),
      (typehints.List[int], [1]),
      (typehints.List[typehints.Union[int, str]], [1, 'a']),
      (typehints.List[typehints.Any], []),
      (type(None), None),
      (type(MyClass), MyClass),
      (MyClass, MyClass()),
      (type(MyClass.method), MyClass.method),
      (types.MethodType, MyClass().method),
      (row_type.RowTypeConstraint([('x', int)]), beam.Row(x=37)),
  ]
  for expected_type, instance in test_cases:
    self.assertEqual(
        expected_type,
        trivial_inference.instance_to_type(instance),
        msg=instance)
def test_group_by_attr_expr(self):
  # [START groupby_attr_expr]
  with beam.Pipeline() as p:
    grouped = (
        p
        | beam.Create(GROCERY_LIST)
        | beam.GroupBy('recipe', is_berry=lambda x: 'berry' in x.fruit))
    # [END groupby_attr_expr]

    expected = [
        # [START groupby_attr_expr_result]
        (NamedTuple(recipe='pie', is_berry=True),
         [
             beam.Row(
                 recipe='pie', fruit='strawberry', quantity=3,
                 unit_price=1.50),
             beam.Row(
                 recipe='pie', fruit='raspberry', quantity=1,
                 unit_price=3.50),
             beam.Row(
                 recipe='pie', fruit='blackberry', quantity=1,
                 unit_price=4.00),
             beam.Row(
                 recipe='pie', fruit='blueberry', quantity=1,
                 unit_price=2.00),
         ]),
        (NamedTuple(recipe='muffin', is_berry=True),
         [
             beam.Row(
                 recipe='muffin', fruit='blueberry', quantity=2,
                 unit_price=2.00),
         ]),
        (NamedTuple(recipe='muffin', is_berry=False),
         [
             beam.Row(
                 recipe='muffin', fruit='banana', quantity=3,
                 unit_price=1.00),
         ]),
        # [END groupby_attr_expr_result]
    ]
    assert_that(grouped | beam.MapTuple(normalize_kv), equal_to(expected))
def test_dataframe_caching(self, cell):
  # Create a pipeline that exercises the DataFrame API. This will also use
  # caching in the background.
  with cell:  # Cell 1
    p = beam.Pipeline(interactive_runner.InteractiveRunner())
    ib.watch({'p': p})

  with cell:  # Cell 2
    data = p | beam.Create(
        [1, 2, 3]) | beam.Map(lambda x: beam.Row(square=x * x, cube=x * x * x))

    with beam.dataframe.allow_non_parallel_operations():
      df = to_dataframe(data).reset_index(drop=True)

    ib.collect(df)

  with cell:  # Cell 3
    df['output'] = df['square'] * df['cube']
    ib.collect(df)

  with cell:  # Cell 4
    df['output'] = 0
    ib.collect(df)

  # We use a trace through the graph to perform an isomorphism test. The end
  # output should look like a linear graph. This indicates that the dataframe
  # transform was correctly broken into separate pieces to cache. If caching
  # isn't enabled, all the dataframe computation nodes are connected to a
  # single shared node.
  trace = []

  # Only look at the top-level transforms for the isomorphism. The test
  # doesn't care about the transform implementations, just the overall shape.
  class TopLevelTracer(beam.pipeline.PipelineVisitor):
    def _find_root_producer(self, node: beam.pipeline.AppliedPTransform):
      if node is None or not node.full_label:
        return None

      parent = self._find_root_producer(node.parent)
      if parent is None:
        return node

      return parent

    def _add_to_trace(self, node, trace):
      if '/' not in str(node):
        if node.inputs:
          producer = self._find_root_producer(node.inputs[0].producer)
          producer_name = producer.full_label if producer else ''
          trace.append((producer_name, node.full_label))

    def visit_transform(self, node: beam.pipeline.AppliedPTransform):
      self._add_to_trace(node, trace)

    def enter_composite_transform(
        self, node: beam.pipeline.AppliedPTransform):
      self._add_to_trace(node, trace)

  p.visit(TopLevelTracer())

  # Do the isomorphism test which states that the topological sort of the
  # graph yields a linear graph.
  trace_string = '\n'.join(str(t) for t in trace)
  prev_producer = ''
  for producer, consumer in trace:
    self.assertEqual(producer, prev_producer, trace_string)
    prev_producer = consumer
def normalize(x):
  if isinstance(x, tuple) and hasattr(x, '_fields'):
    # A named tuple.
    return beam.Row(**dict(zip(x._fields, x)))
  elif isinstance(x, typing.Iterable) and not isinstance(x, (str, beam.Row)):
    return UnorderedList(normalize(e) for e in x)
  else:
    return x


def normalize_kv(k, v):
  return normalize(k), normalize(v)


# For documentation.
NamedTuple = beam.Row

# [START groupby_table]
GROCERY_LIST = [
    beam.Row(recipe='pie', fruit='strawberry', quantity=3, unit_price=1.50),
    beam.Row(recipe='pie', fruit='raspberry', quantity=1, unit_price=3.50),
    beam.Row(recipe='pie', fruit='blackberry', quantity=1, unit_price=4.00),
    beam.Row(recipe='pie', fruit='blueberry', quantity=1, unit_price=2.00),
    beam.Row(recipe='muffin', fruit='blueberry', quantity=2, unit_price=2.00),
    beam.Row(recipe='muffin', fruit='banana', quantity=3, unit_price=1.00),
]
# [END groupby_table]


class GroupByTest(unittest.TestCase):
  def test_groupby_expr(self):
    # [START groupby_expr]
    with beam.Pipeline() as p:
      grouped = (
          p
          | beam.Create([
def SQLDictToRow(pcoll):
  """Convert a dict (from ReadFromBigQuery) to a beam.Row."""
  return pcoll | beam.Map(lambda d: beam.Row(**d))
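A minimal usage sketch. Since SQLDictToRow is a plain function over a PCollection, it is applied by calling it directly; the dict fields below stand in for whatever columns ReadFromBigQuery would return.

with beam.Pipeline() as p:
  # Illustrative stand-in for the dicts produced by ReadFromBigQuery.
  dicts = p | beam.Create([{'word': 'king', 'count': 311}])
  rows = SQLDictToRow(dicts)
  # Each dict key is now a named attribute on the resulting beam.Row.
  _ = rows | beam.Map(lambda row: print(row.word, row.count))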