def test_convert_yield_pandas(self):
  with beam.Pipeline() as p:
    a = pd.Series([1, 2, 3])
    b = pd.Series([100, 200, 300])

    pc_a = p | 'A' >> beam.Create([a])
    pc_b = p | 'B' >> beam.Create([b])

    df_a = convert.to_dataframe(pc_a, proxy=a[:0])
    df_b = convert.to_dataframe(pc_b, proxy=b[:0])

    df_2a = 2 * df_a
    df_3a = 3 * df_a
    df_ab = df_a * df_b

    # Converting multiple results at a time can be more efficient.
    pc_2a, pc_ab = convert.to_pcollection(
        df_2a, df_ab, yield_elements='pandas')
    # But separate conversions can be done as well.
    pc_3a = convert.to_pcollection(df_3a, yield_elements='pandas')

    assert_that(pc_2a, equal_to_unordered_series(2 * a), label='Check2a')
    assert_that(pc_3a, equal_to_unordered_series(3 * a), label='Check3a')
    assert_that(pc_ab, equal_to_unordered_series(a * b), label='Checkab')
def test_convert_yield_pandas(self):
  def equal_to_unordered_series(expected):
    def check(actual):
      actual = pd.concat(actual)
      if sorted(expected) != sorted(actual):
        raise AssertionError(
            'Series not equal: \n%s\n%s\n' % (expected, actual))

    return check

  with beam.Pipeline() as p:
    a = pd.Series([1, 2, 3])
    b = pd.Series([100, 200, 300])

    pc_a = p | 'A' >> beam.Create([a])
    pc_b = p | 'B' >> beam.Create([b])

    df_a = convert.to_dataframe(pc_a, proxy=a[:0])
    df_b = convert.to_dataframe(pc_b, proxy=b[:0])

    df_2a = 2 * df_a
    df_3a = 3 * df_a
    df_ab = df_a * df_b

    # Converting multiple results at a time can be more efficient.
    pc_2a, pc_ab = convert.to_pcollection(
        df_2a, df_ab, yield_elements='pandas')
    # But separate conversions can be done as well.
    pc_3a = convert.to_pcollection(df_3a, yield_elements='pandas')

    assert_that(pc_2a, equal_to_unordered_series(2 * a), label='Check2a')
    assert_that(pc_3a, equal_to_unordered_series(3 * a), label='Check3a')
    assert_that(pc_ab, equal_to_unordered_series(a * b), label='Checkab')
def test_dataframes_with_multi_index_get_result(self):
  p = beam.Pipeline(
      runner=interactive_runner.InteractiveRunner(
          direct_runner.DirectRunner()))
  data = [
      Record('a', 20, 170),
      Record('a', 30, 170),
      Record('b', 22, 180),
      Record('c', 18, 150)
  ]
  aggregate = lambda df: df.groupby(['name', 'height']).mean()['age']
  deferred_df = aggregate(to_dataframe(p | beam.Create(data)))
  df_expected = aggregate(pd.DataFrame(data))

  # Watch the local scope for Interactive Beam so that values will be cached.
  ib.watch(locals())

  # This is normally done in the interactive_utils when a transform is
  # applied but needs an IPython environment. So we manually run this here.
  ie.current_env().track_user_pipelines()

  pd.testing.assert_series_equal(df_expected, ib.collect(deferred_df, n=10))
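# Note: Record is not defined in the snippet above. Given Record('a', 20, 170)
# and the groupby on ['name', 'height'] with an 'age' column, it is presumably
# a schema'd NamedTuple along these lines (a hedged sketch, not the test
# module's actual definition):
from typing import NamedTuple


class Record(NamedTuple):
  name: str
  age: int
  height: int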
def run(argv=None):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/shakespeare/kinglear.txt',
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      required=True,
      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # Import this here to avoid pickling the main session.
  import re

  # The pipeline will be run on exiting the with block.
  with beam.Pipeline(options=PipelineOptions(pipeline_args)) as p:
    # Read the text file[pattern] into a PCollection.
    lines = p | 'Read' >> ReadFromText(known_args.input)

    words = (
        lines
        | 'Split' >> beam.FlatMap(
            lambda line: re.findall(r'[\w]+', line)).with_output_types(str)
        # Map to Row objects to generate a schema suitable for conversion
        # to a dataframe.
        | 'ToRows' >> beam.Map(lambda word: beam.Row(word=word)))

    df = to_dataframe(words)
    df['count'] = 1
    counted = df.groupby('word').sum()
    counted.to_csv(known_args.output)
def expand(self, input_pcolls):
  # Avoid circular import.
  from apache_beam.dataframe import convert

  # Convert inputs to a flat dict.
  input_dict = _flatten(input_pcolls)  # type: Dict[Any, PCollection]
  proxies = _flatten(self._proxy)
  input_frames = {
      k: convert.to_dataframe(pc, proxies[k])
      for k, pc in input_dict.items()
  }  # type: Dict[Any, DeferredFrame]

  # Apply the function.
  frames_input = _substitute(input_pcolls, input_frames)
  if isinstance(frames_input, dict):
    result_frames = self._func(**frames_input)
  elif isinstance(frames_input, tuple):
    result_frames = self._func(*frames_input)
  else:
    result_frames = self._func(frames_input)

  # Compute results as a tuple.
  result_frames_dict = _flatten(result_frames)
  keys = list(result_frames_dict.keys())
  result_frames_tuple = tuple(result_frames_dict[key] for key in keys)

  result_pcolls_tuple = convert.to_pcollection(
      *result_frames_tuple, label='Eval', always_return_tuple=True)

  # Convert back to the structure returned by self._func.
  result_pcolls_dict = dict(zip(keys, result_pcolls_tuple))
  return _substitute(result_frames, result_pcolls_dict)
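# _flatten and _substitute are private helpers of the module this expand()
# comes from (apache_beam/dataframe/transforms.py). A hedged sketch of their
# likely behavior, consistent with the usage above but not necessarily the
# real implementation: _flatten maps a nested dict/tuple/list structure to a
# flat {path: leaf} dict, and _substitute rebuilds the same shape with each
# leaf replaced by the value stored under its path.
def _flatten(valueish, root=()):
  def _walk(value, path):
    if isinstance(value, dict):
      for k, v in value.items():
        yield from _walk(v, path + (k, ))
    elif isinstance(value, (tuple, list)):
      for ix, v in enumerate(value):
        yield from _walk(v, path + (ix, ))
    else:
      yield path, value

  return dict(_walk(valueish, root))


def _substitute(valueish, replacements, root=()):
  if isinstance(valueish, dict):
    return {
        k: _substitute(v, replacements, root + (k, ))
        for k, v in valueish.items()
    }
  elif isinstance(valueish, (tuple, list)):
    return type(valueish)(
        _substitute(v, replacements, root + (ix, ))
        for ix, v in enumerate(valueish))
  else:
    return replacements[root]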
def run_scenario(self, input, func):
  expected = func(input)

  empty = input.iloc[0:0]
  input_placeholder = expressions.PlaceholderExpression(empty)
  input_deferred = frame_base.DeferredFrame.wrap(input_placeholder)
  actual_deferred = func(input_deferred)._expr.evaluate_at(
      expressions.Session({input_placeholder: input}))

  check_correct(expected, actual_deferred)

  with beam.Pipeline() as p:
    input_pcoll = p | beam.Create([input.iloc[::2], input.iloc[1::2]])
    input_df = convert.to_dataframe(input_pcoll, proxy=empty)
    output_df = func(input_df)

    output_proxy = output_df._expr.proxy()
    if isinstance(output_proxy, pd.core.generic.NDFrame):
      self.assertTrue(
          output_proxy.iloc[:0].equals(expected.iloc[:0]),
          (
              'Output proxy is incorrect:\n'
              f'Expected:\n{expected.iloc[:0]}\n\n'
              f'Actual:\n{output_proxy.iloc[:0]}'))
    else:
      self.assertEqual(type(output_proxy), type(expected))

    output_pcoll = convert.to_pcollection(output_df, yield_elements='pandas')

    assert_that(
        output_pcoll, lambda actual: check_correct(expected, concat(actual)))
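# check_correct and concat are helpers from the surrounding test module, not
# shown here. A hedged sketch of what they plausibly do: concat reassembles
# the partitioned outputs into one frame, and check_correct compares results
# while ignoring partition-induced row order (the real helpers may normalize
# more aggressively).
def concat(parts):
  parts = list(parts)
  if len(parts) > 1:
    return pd.concat(parts)
  elif len(parts) == 1:
    return parts[0]
  else:
    return None


def check_correct(expected, actual):
  if actual is None:
    raise AssertionError('Empty frame but expected: \n\n%s' % expected)
  if isinstance(expected, (pd.Series, pd.DataFrame)):
    expected = expected.sort_index()
    actual = actual.sort_index()
    if isinstance(expected, pd.Series):
      pd.testing.assert_series_equal(expected, actual)
    else:
      pd.testing.assert_frame_equal(expected, actual)
  else:
    # Scalar results, e.g. from aggregations.
    assert expected == actual, (expected, actual)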
def expand(self, root):
  # TODO(robertwb): Handle streaming (with explicit schema).
  paths_pcoll = root | beam.Create([self.path])
  first = io.filesystems.FileSystems.match(
      [self.path], limits=[1])[0].metadata_list[0].path
  with io.filesystems.FileSystems.open(first) as handle:
    if not self.binary:
      handle = TextIOWrapper(handle)
    if self.incremental:
      sample = next(
          self.reader(handle, *self.args, **dict(self.kwargs, chunksize=100)))
    else:
      sample = self.reader(handle, *self.args, **self.kwargs)

  pcoll = (
      paths_pcoll
      | fileio.MatchFiles(self.path)
      | beam.Reshuffle()
      | fileio.ReadMatches()
      | beam.ParDo(
          _ReadFromPandasDoFn(
              self.reader,
              self.args,
              self.kwargs,
              self.binary,
              self.incremental,
              self.splitter)))
  from apache_beam.dataframe import convert
  return convert.to_dataframe(
      pcoll, proxy=_prefix_range_index_with(':', sample[:0]))
def expand(self, root):
  paths_pcoll = root | beam.Create([self.path])
  match = io.filesystems.FileSystems.match([self.path], limits=[1])[0]
  if not match.metadata_list:
    # TODO(BEAM-12031): This should be allowed for streaming pipelines if
    # user provides an explicit schema.
    raise FileNotFoundError(f"Found no files that match {self.path!r}")

  first_path = match.metadata_list[0].path
  with io.filesystems.FileSystems.open(first_path) as handle:
    if not self.binary:
      handle = TextIOWrapper(handle)
    if self.incremental:
      sample = next(
          self.reader(handle, *self.args, **dict(self.kwargs, chunksize=100)))
    else:
      sample = self.reader(handle, *self.args, **self.kwargs)

  pcoll = (
      paths_pcoll
      | fileio.MatchFiles(self.path)
      | beam.Reshuffle()
      | fileio.ReadMatches()
      | beam.ParDo(
          _ReadFromPandasDoFn(
              self.reader,
              self.args,
              self.kwargs,
              self.binary,
              self.incremental,
              self.splitter)))
  from apache_beam.dataframe import convert
  return convert.to_dataframe(
      pcoll, proxy=_prefix_range_index_with(':', sample[:0]))
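# _prefix_range_index_with is a private helper in apache_beam.dataframe.io.
# A hedged sketch of its likely behavior, inferred from its use on the proxy
# above: tag a default RangeIndex with a prefix so indices coming from
# different files/partitions cannot be confused with a meaningful index.
def _prefix_range_index_with(prefix, df):
  if isinstance(df.index, pd.RangeIndex):
    return df.set_index(prefix + df.index.map(str))
  else:
    return df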
def test_windowed_write(self):
  output = self.temp_dir()
  with beam.Pipeline() as p:
    pc = (
        p
        | beam.Create([MyRow(timestamp=i, value=i % 3) for i in range(20)])
        | beam.Map(lambda v: beam.window.TimestampedValue(
            v, v.timestamp)).with_output_types(MyRow)
        | beam.WindowInto(
            beam.window.FixedWindows(10)).with_output_types(MyRow))
    deferred_df = convert.to_dataframe(pc)
    deferred_df.to_csv(output + 'out.csv', index=False)

  first_window_files = (
      f'{output}out.csv-'
      f'{datetime.utcfromtimestamp(0).isoformat()}*')
  self.assertCountEqual(
      ['timestamp,value'] + [f'{i},{i%3}' for i in range(10)],
      set(self.read_all_lines(first_window_files, delete=True)))

  second_window_files = (
      f'{output}out.csv-'
      f'{datetime.utcfromtimestamp(10).isoformat()}*')
  self.assertCountEqual(
      ['timestamp,value'] + [f'{i},{i%3}' for i in range(10, 20)],
      set(self.read_all_lines(second_window_files, delete=True)))

  # Check that we've read (and removed) every output file
  self.assertEqual(len(glob.glob(f'{output}out.csv*')), 0)
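# MyRow is defined elsewhere in the test module; to_dataframe needs typed
# fields to infer a schema. A minimal sketch matching its usage above:
from typing import NamedTuple


class MyRow(NamedTuple):
  timestamp: int
  value: int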
def test_convert_non_deferred(self):
  with beam.Pipeline() as p:
    s1 = pd.Series([1, 2, 3])
    s2 = convert.to_dataframe(p | beam.Create([100, 200, 300]))

    pc1, pc2 = convert.to_pcollection(s1, s2, pipeline=p)
    assert_that(pc1, equal_to([1, 2, 3]), label='CheckNonDeferred')
    assert_that(pc2, equal_to([100, 200, 300]), label='CheckDeferred')
def expand(self, pcoll):
  from apache_beam.dataframe import convert  # avoid circular import
  return {
      'files_written': self._writer_func(
          convert.to_dataframe(pcoll), *self._args, **self._kwargs)
      | beam.Map(lambda file_result: file_result.file_name).with_output_types(
          str)
  }
def test_parse_dataframes(self):
  """Tests that it correctly parses a DataFrame."""
  deferred = to_dataframe(beam.Pipeline() | beam.Create([Record(0, 0, 0)]))
  els = [windowed_value(pd.DataFrame(Record(n, 0, 0))) for n in range(10)]

  actual_df = utils.elements_to_df(
      els, element_type=deferred._expr.proxy()).reset_index(drop=True)
  expected_df = pd.concat([e.value for e in els], ignore_index=True)

  pd.testing.assert_frame_equal(actual_df, expected_df)
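# windowed_value is a small test helper, not shown above. A hedged sketch
# that wraps a value in Beam's WindowedValue in the global window (the real
# helper may use a different timestamp or window):
from apache_beam.transforms.window import GlobalWindow
from apache_beam.utils.windowed_value import WindowedValue


def windowed_value(value, timestamp=0):
  return WindowedValue(value, timestamp, [GlobalWindow()])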
def run(argv=None):
  """Main entry point; defines and runs the wordcount pipeline."""
  parser = argparse.ArgumentParser()
  parser.add_argument(
      '--input',
      dest='input',
      default='gs://dataflow-samples/shakespeare/kinglear.txt',
      help='Input file to process.')
  parser.add_argument(
      '--output',
      dest='output',
      required=True,
      help='Output file to write results to.')
  known_args, pipeline_args = parser.parse_known_args(argv)

  # Import this here to avoid pickling the main session.
  import re

  # The pipeline will be run on exiting the with block.
  with beam.Pipeline(options=PipelineOptions(pipeline_args)) as p:
    # Read the text file[pattern] into a PCollection.
    lines = p | 'Read' >> ReadFromText(known_args.input)

    words = (
        lines
        | 'Split' >> beam.FlatMap(
            lambda line: re.findall(r'[\w]+', line)).with_output_types(str)
        # Map to Row objects to generate a schema suitable for conversion
        # to a dataframe.
        | 'ToRows' >> beam.Map(lambda word: beam.Row(word=word)))

    df = to_dataframe(words)
    df['count'] = 1
    counted = df.groupby('word').sum()
    counted.to_csv(known_args.output)

    # Deferred DataFrames can also be converted back to schema'd PCollections
    counted_pc = to_pcollection(counted, include_indexes=True)

    # Print out every word that occurred >50 times
    _ = (
        counted_pc
        | beam.Filter(lambda row: row.count > 50)
        | beam.Map(lambda row: f'{row.word}: {row.count}')
        | beam.Map(print))


if __name__ == '__main__':
  logging.getLogger().setLevel(logging.INFO)
  run()
def test_convert_memoization(self):
  with beam.Pipeline() as p:
    a = pd.Series([1, 2, 3])
    b = pd.Series([100, 200, 300])

    pc_a = p | 'A' >> beam.Create([a])
    pc_b = p | 'B' >> beam.Create([b])

    df_a = convert.to_dataframe(pc_a, proxy=a[:0])
    df_b = convert.to_dataframe(pc_b, proxy=b[:0])

    df_2a = 2 * df_a
    df_3a = 3 * df_a
    df_ab = df_a * df_b

    # Two calls to to_pcollection with the same Dataframe should produce the
    # same PCollection(s)
    pc_2a_, pc_ab_ = convert.to_pcollection(df_2a, df_ab)
    pc_3a, pc_2a, pc_ab = convert.to_pcollection(df_3a, df_2a, df_ab)

    self.assertIs(pc_2a, pc_2a_)
    self.assertIs(pc_ab, pc_ab_)
    self.assertIsNot(pc_3a, pc_2a)
    self.assertIsNot(pc_3a, pc_ab)

    # The same conversions without the unbatching transform should also cache
    # PCollections
    pc_2a_pandas_, pc_ab_pandas_ = convert.to_pcollection(
        df_2a, df_ab, yield_elements='pandas')
    pc_3a_pandas, pc_2a_pandas, pc_ab_pandas = convert.to_pcollection(
        df_3a, df_2a, df_ab, yield_elements='pandas')

    self.assertIs(pc_2a_pandas, pc_2a_pandas_)
    self.assertIs(pc_ab_pandas, pc_ab_pandas_)
    self.assertIsNot(pc_3a_pandas, pc_2a_pandas)
    self.assertIsNot(pc_3a_pandas, pc_ab_pandas)

    # .. but the cached PCollections should be different
    self.assertIsNot(pc_2a_pandas, pc_2a)
    self.assertIsNot(pc_ab_pandas, pc_ab)
    self.assertIsNot(pc_3a_pandas, pc_3a)
def test_batching_beam_row_to_dataframe(self):
  with beam.Pipeline() as p:
    df = convert.to_dataframe(
        p
        | beam.Create([(u'Falcon', 380.), (u'Falcon', 370.), (u'Parrot', 24.),
                       (u'Parrot', 26.)])
        | beam.Map(lambda tpl: beam.Row(Animal=tpl[0], Speed=tpl[1])))

    result = convert.to_pcollection(
        df.groupby('Animal').mean(), include_indexes=True)

    assert_that(result, equal_to([('Falcon', 375.), ('Parrot', 25.)]))
def test_double_write(self):
  output = self.temp_dir()
  with beam.Pipeline() as p:
    pc1 = p | 'create pc1' >> beam.Create(
        [SimpleRow(value=i) for i in [1, 2]])
    pc2 = p | 'create pc2' >> beam.Create(
        [SimpleRow(value=i) for i in [3, 4]])

    deferred_df1 = convert.to_dataframe(pc1)
    deferred_df2 = convert.to_dataframe(pc2)

    deferred_df1.to_csv(
        f'{output}out1.csv',
        transform_label="Writing to csv PC1",
        index=False)
    deferred_df2.to_csv(
        f'{output}out2.csv',
        transform_label="Writing to csv PC2",
        index=False)

  self.assertCountEqual(
      ['value', '1', '2'], set(self.read_all_lines(output + 'out1.csv*')))
  self.assertCountEqual(
      ['value', '3', '4'], set(self.read_all_lines(output + 'out2.csv*')))
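# SimpleRow is defined elsewhere in the test module; a minimal sketch that
# matches its usage above:
from typing import NamedTuple


class SimpleRow(NamedTuple):
  value: int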
def run_flight_delay_pipeline(
    pipeline, start_date=None, end_date=None, output=None):
  query = f"""
  SELECT
    FlightDate AS date,
    IATA_CODE_Reporting_Airline AS airline,
    Origin AS departure_airport,
    Dest AS arrival_airport,
    DepDelay AS departure_delay,
    ArrDelay AS arrival_delay
  FROM `apache-beam-testing.airline_ontime_data.flights`
  WHERE
    FlightDate >= '{start_date}' AND FlightDate <= '{end_date}' AND
    DepDelay IS NOT NULL AND ArrDelay IS NOT NULL
  """

  # Import this here to avoid pickling the main session.
  import time
  from apache_beam import window

  def to_unixtime(s):
    return time.mktime(s.timetuple())

  # The pipeline will be run on exiting the with block.
  with pipeline as p:
    tbl = (
        p
        | 'read table' >> beam.io.ReadFromBigQuery(
            query=query, use_standard_sql=True)
        | 'assign timestamp' >> beam.Map(
            lambda x: window.TimestampedValue(x, to_unixtime(x['date'])))
        # Use beam.Select to make sure data has a schema
        # The casts in lambdas ensure data types are properly inferred
        | 'set schema' >> beam.Select(
            date=lambda x: str(x['date']),
            airline=lambda x: str(x['airline']),
            departure_airport=lambda x: str(x['departure_airport']),
            arrival_airport=lambda x: str(x['arrival_airport']),
            departure_delay=lambda x: float(x['departure_delay']),
            arrival_delay=lambda x: float(x['arrival_delay'])))

    daily = tbl | 'daily windows' >> beam.WindowInto(
        beam.window.FixedWindows(60 * 60 * 24))

    # group the flights data by carrier
    df = to_dataframe(daily)
    result = df.groupby('airline').apply(get_mean_delay_at_top_airports)
    result.to_csv(output)
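# get_mean_delay_at_top_airports is defined elsewhere in this example. A
# hedged sketch of the idea only; the airport cutoff (10) and column choices
# are assumptions, not the example's real implementation. Because it runs
# inside groupby(...).apply(...), it receives a plain pandas DataFrame per
# airline at execution time.
def get_mean_delay_at_top_airports(airline_df):
  # Restrict this airline's flights to its busiest arrival airports...
  top_airports = airline_df.arrival_airport.value_counts().nlargest(10).index
  at_top = airline_df[airline_df.arrival_airport.isin(top_airports)]
  # ...and report mean delays per airport.
  return at_top.groupby('arrival_airport')[['departure_delay',
                                            'arrival_delay']].mean()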
def test_convert(self):
  with beam.Pipeline() as p:
    a = pd.Series([1, 2, 3])
    b = pd.Series([100, 200, 300])

    pc_a = p | 'A' >> beam.Create(a)
    pc_b = p | 'B' >> beam.Create(b)

    df_a = convert.to_dataframe(pc_a)
    df_b = convert.to_dataframe(pc_b)

    df_2a = 2 * df_a
    df_3a = 3 * df_a
    df_ab = df_a * df_b

    # Converting multiple results at a time can be more efficient.
    pc_2a, pc_ab = convert.to_pcollection(df_2a, df_ab)
    # But separate conversions can be done as well.
    pc_3a = convert.to_pcollection(df_3a)

    assert_that(pc_2a, equal_to(list(2 * a)), label='Check2a')
    assert_that(pc_3a, equal_to(list(3 * a)), label='Check3a')
    assert_that(pc_ab, equal_to(list(a * b)), label='Checkab')
def test_batching_beam_row_to_dataframe(self):
  with beam.Pipeline() as p:
    df = convert.to_dataframe(
        p
        | beam.Create([(u'Falcon', 380.), (u'Falcon', 370.), (u'Parrot', 24.),
                       (u'Parrot', 26.)])
        | beam.Map(lambda tpl: beam.Row(Animal=tpl[0], Speed=tpl[1])))

    result = convert.to_pcollection(df.groupby('Animal').mean())

    assert_that(
        result,
        df_equal_to(
            pd.DataFrame({
                'Animal': ['Falcon', 'Parrot'],
                'Speed': [375., 25.]
            }).set_index('Animal')))
def expand(self, root):
  # TODO(robertwb): Handle streaming (with explicit schema).
  paths_pcoll = root | beam.Create([self.path])
  first = io.filesystems.FileSystems.match(
      [self.path], limits=[1])[0].metadata_list[0].path
  with io.filesystems.FileSystems.open(first) as handle:
    df = next(self.reader(handle, *self.args, chunksize=100, **self.kwargs))

  pcoll = (
      paths_pcoll
      | fileio.MatchFiles(self.path)
      | fileio.ReadMatches()
      | beam.ParDo(_ReadFromPandasDoFn(self.reader, self.args, self.kwargs)))
  from apache_beam.dataframe import convert
  return convert.to_dataframe(
      pcoll, proxy=_prefix_range_index_with(':', df[:0]))
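# _ReadFromPandasDoFn is not shown. A hedged sketch matching this early,
# chunked variant of the transform: open each matched file and emit the
# DataFrame chunks produced by the pandas reader. The chunksize of 100 is
# copied from the sampling call above; the real DoFn (see the later variants
# in this collection) also handles binary mode, incremental reads, and
# splitting.
class _ReadFromPandasDoFn(beam.DoFn):
  def __init__(self, reader, args, kwargs):
    self.reader = reader
    self.args = args
    self.kwargs = kwargs

  def process(self, readable_file):
    with readable_file.open() as handle:
      for chunk in self.reader(
          handle, *self.args, chunksize=100, **self.kwargs):
        yield chunk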
def test_dataframes(self):
  p = beam.Pipeline(
      runner=interactive_runner.InteractiveRunner(
          direct_runner.DirectRunner()))
  data = p | beam.Create(
      [1, 2, 3]) | beam.Map(lambda x: beam.Row(square=x * x, cube=x * x * x))
  df = to_dataframe(data)

  # Watch the local scope for Interactive Beam so that values will be cached.
  ib.watch(locals())

  # This is normally done in the interactive_utils when a transform is
  # applied but needs an IPython environment. So we manually run this here.
  ie.current_env().track_user_pipelines()

  df_expected = pd.DataFrame({'square': [1, 4, 9], 'cube': [1, 8, 27]})
  pd.testing.assert_frame_equal(
      df_expected, ib.collect(df, n=10).reset_index(drop=True))
def expand(self, root): paths_pcoll = root | beam.Create([self.path]) match = io.filesystems.FileSystems.match([self.path], limits=[1])[0] if not match.metadata_list: # TODO(BEAM-12031): This should be allowed for streaming pipelines if # user provides an explicit schema. raise FileNotFoundError(f"Found no files that match {self.path!r}") first_path = match.metadata_list[0].path with io.filesystems.FileSystems.open(first_path) as handle: if not self.binary: handle = TextIOWrapper(handle) if self.incremental: sample = next( self.reader(handle, *self.args, **dict(self.kwargs, chunksize=100))) else: sample = self.reader(handle, *self.args, **self.kwargs) matches_pcoll = paths_pcoll | fileio.MatchAll() indices_pcoll = ( matches_pcoll.pipeline | 'DoOnce' >> beam.Create([None]) | beam.Map( lambda _, paths: {path: ix for ix, path in enumerate(sorted(paths))}, paths=beam.pvalue.AsList(matches_pcoll | beam.Map(lambda match: match.path))) ) pcoll = (matches_pcoll | beam.Reshuffle() | fileio.ReadMatches() | beam.ParDo( _ReadFromPandasDoFn(self.reader, self.args, self.kwargs, self.binary, self.incremental, self.splitter), path_indices=beam.pvalue.AsSingleton(indices_pcoll))) from apache_beam.dataframe import convert return convert.to_dataframe(pcoll, proxy=sample[:0])
def test_dataframe_caching(self, cell):
  # Create a pipeline that exercises the DataFrame API. This will also use
  # caching in the background.
  with cell:  # Cell 1
    p = beam.Pipeline(interactive_runner.InteractiveRunner())
    ib.watch({'p': p})

  with cell:  # Cell 2
    data = p | beam.Create([
        1, 2, 3
    ]) | beam.Map(lambda x: beam.Row(square=x * x, cube=x * x * x))

    with beam.dataframe.allow_non_parallel_operations():
      df = to_dataframe(data).reset_index(drop=True)

    ib.collect(df)

  with cell:  # Cell 3
    df['output'] = df['square'] * df['cube']
    ib.collect(df)

  with cell:  # Cell 4
    df['output'] = 0
    ib.collect(df)

  # We use a trace through the graph to perform an isomorphism test. The end
  # output should look like a linear graph. This indicates that the dataframe
  # transform was correctly broken into separate pieces to cache. If caching
  # isn't enabled, all the dataframe computation nodes are connected to a
  # single shared node.
  trace = []

  # Only look at the top-level transforms for the isomorphism. The test
  # doesn't care about the transform implementations, just the overall shape.
  class TopLevelTracer(beam.pipeline.PipelineVisitor):
    def _find_root_producer(self, node: beam.pipeline.AppliedPTransform):
      if node is None or not node.full_label:
        return None

      parent = self._find_root_producer(node.parent)
      if parent is None:
        return node

      return parent

    def _add_to_trace(self, node, trace):
      if '/' not in str(node):
        if node.inputs:
          producer = self._find_root_producer(node.inputs[0].producer)
          producer_name = producer.full_label if producer else ''
          trace.append((producer_name, node.full_label))

    def visit_transform(self, node: beam.pipeline.AppliedPTransform):
      self._add_to_trace(node, trace)

    def enter_composite_transform(
        self, node: beam.pipeline.AppliedPTransform):
      self._add_to_trace(node, trace)

  p.visit(TopLevelTracer())

  # Do the isomorphism test which states that the topological sort of the
  # graph yields a linear graph.
  trace_string = '\n'.join(str(t) for t in trace)
  prev_producer = ''
  for producer, consumer in trace:
    self.assertEqual(producer, prev_producer, trace_string)
    prev_producer = consumer
def test_convert_scalar(self):
  with beam.Pipeline() as p:
    pc = p | 'A' >> beam.Create([1, 2, 3])
    s = convert.to_dataframe(pc)
    pc_sum = convert.to_pcollection(s.sum())
    assert_that(pc_sum, equal_to([6]))
def run():
  options = MyOptions()
  with beam.Pipeline(options=options) as p:
    immigration_data = (
        p
        | "Read Immigration Data" >> beam.io.parquetio.ReadFromParquet(
            p.options.input_dir.get() + "data.parquet*").with_output_types(
                ImmigrationData)
        | "Immigration dictionary collection to row" >> beam.Map(
            ToRowImmigration))
    df_immigration = to_dataframe(immigration_data)

    cities_data = (
        p
        | "Read city data" >> beam.io.ReadFromText(
            p.options.input_dir.get() + "us-cities-demographics.csv",
            skip_header_lines=1,
        )
        | "Parse city data" >> beam.ParDo(SplitCityData())
        | "Get average demographics per State" >> beam.CombinePerKey(
            AverageDictFn())
        | "Key to Column" >> beam.ParDo(
            OrganizeCityData()).with_output_types(CityData)
        | "City dictionary collection to row" >> beam.Map(ToRowCity))
    df_cities = to_dataframe(cities_data)

    df_immigration = df_immigration[df_immigration["i94addr"].notna()]
    df_immigration = df_immigration.join(df_cities, rsuffix="_city")

    airport_data = (
        p
        | "Read airport data" >> beam.io.ReadFromText(
            p.options.input_dir.get() + "airport-codes_csv_2.csv",
            skip_header_lines=1,
        )
        | "Parse airport data" >> beam.ParDo(
            SplitAirportData()).with_output_types(AirportData)
        | "Airport dictionary collection to row" >> beam.Map(ToRowAirport))
    df_airport = to_dataframe(airport_data)

    df_immigration = df_immigration[df_immigration["i94port"].notna()]
    join_data = df_immigration.join(df_airport, rsuffix="_airport")
    join_data = to_pcollection(join_data, include_indexes=False)

    # ---#---#---# TODO #---#---#---#
    # -------- Join temperature data to dataset --------
    # temperature_data = (
    #     p
    #     | "Read city data" >> beam.io.ReadFromText(
    #         'GlobalLandTemperaturesByCity.csv', skip_header_lines=1)
    #     | "Parse temperature data" >> beam.ParDo(
    #         SplitTempData()).with_output_types(TemperatureData)
    #     | 'Temperature dictionary filtered and to row' >> beam.ParDo(
    #         FilterAndToRowTemperature()))

    # -------- NotImplementedError: groupby(as_index=False) and
    # drop_duplicates() --------
    # df_immigration_2 = df_immigration.filter(
    #     items=['arrdate', 'i94mon', 'municipality', 'i94port']
    # ).groupby(
    #     by=['arrdate', 'i94mon', 'municipality', 'i94port'], as_index=False)
    # df_temperature = to_dataframe(temperature_data)
    # df_temperature_2 = df_immigration_2.join(
    #     df_temperature,
    #     (df_immigration_2.municipality == df_temperature.municipality)
    #     & (df_immigration_2.i94mon == df_temperature.month), 'left')

    output = join_data | "Save data to file" >> beam.io.WriteToText(
        p.options.output_dir.get())
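# The ToRow* mappers above are not shown; they presumably turn each parsed
# record into a schema'd beam.Row so that to_dataframe can infer column names
# and types. A hedged sketch for the city data, assuming CityData is a
# NamedTuple-like type (the real mapper may name fields explicitly):
def ToRowCity(city):
  return beam.Row(**city._asdict())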
def _run_read_write_test(
    self,
    format,
    read_kwargs={},
    write_kwargs={},
    check_options={},
    requires=()):
  for module in requires:
    try:
      importlib.import_module(module)
    except ImportError:
      raise unittest.SkipTest('Missing dependency: %s' % module)

  small = pd.DataFrame({'label': ['11a', '37a', '389a'], 'rank': [0, 1, 2]})
  big = pd.DataFrame({'number': list(range(1000))})
  big['float'] = big.number.map(math.sqrt)
  big['text'] = big.number.map(lambda n: 'f' + 'o' * n)

  def frame_equal_to(expected_, check_index=True, check_names=True):
    def check(actual):
      expected = expected_
      try:
        actual = pd.concat(actual)
        if not check_index:
          expected = expected.sort_values(list(
              expected.columns)).reset_index(drop=True)
          actual = actual.sort_values(list(
              actual.columns)).reset_index(drop=True)
        if not check_names:
          actual = actual.rename(
              columns=dict(zip(actual.columns, expected.columns)))
        return assert_frame_equal(expected, actual, check_like=True)
      except:
        print("EXPECTED")
        print(expected)
        print("ACTUAL")
        print(actual)
        raise

    return check

  for df in (small, big):
    with tempfile.TemporaryDirectory() as dir:
      dest = os.path.join(dir, 'out')
      try:
        with beam.Pipeline() as p:
          deferred_df = convert.to_dataframe(
              p | beam.Create([df[::3], df[1::3], df[2::3]]), proxy=df[:0])
          # This does the write.
          getattr(deferred_df, 'to_%s' % format)(dest, **write_kwargs)
        with beam.Pipeline() as p:
          # Now do the read.
          # TODO(robertwb): Allow reading from pcoll of paths to do it all in
          # one pipeline.
          result = convert.to_pcollection(
              p | getattr(io, 'read_%s' % format)(dest + '*', **read_kwargs),
              yield_elements='pandas')
          assert_that(result, frame_equal_to(df, **check_options))
      except:
        os.system('head -n 100 ' + dest + '*')
        raise