コード例 #1
0
def ReadCSVToPandas(
    p: beam.Pipeline,
    *args,
    **kwargs,
) -> PCollection[pd.DataFrame]:
    """Read CSV input on pipeline ``p`` and return it as a PCollection.

    Every positional and keyword argument after ``p`` is forwarded
    unchanged to ``df_io.read_csv``. The value produced by that read is then
    handed to ``df_convert.to_pcollection`` with ``yield_elements='pandas'``.
    """
    csv_source = df_io.read_csv(*args, **kwargs)
    deferred_frame = p | "Read CSV" >> csv_source
    return df_convert.to_pcollection(deferred_frame, yield_elements='pandas')
コード例 #2
0
ファイル: taxiride.py プロジェクト: ashishramtri/Apache-Beam
def run_aggregation_pipeline(pipeline_args, input_path, output_path):
  """Sum ``passenger_count`` per ``DOLocationID`` and write the result as CSV.

  Args:
    pipeline_args: Arguments passed to ``PipelineOptions``.
    input_path: Location handed to ``read_csv``.
    output_path: Location handed to ``to_csv`` for the aggregated result.
  """
  options = PipelineOptions(pipeline_args)
  # Leaving the with block is what actually runs the pipeline.
  with beam.Pipeline(options=options) as p:
    rides = p | read_csv(input_path)

    # Total passengers dropped off at each LocationID.
    passengers_per_dropoff = rides.groupby('DOLocationID').passenger_count.sum()
    passengers_per_dropoff.to_csv(output_path)
コード例 #3
0
def run_aggregation_pipeline(pipeline, input_path, output_path):
    """Sum ``passenger_count`` per ``DOLocationID`` and write the result as CSV.

    Args:
        pipeline: Object used as a context manager; the reads and writes
            are attached to the value it yields.
        input_path: Location handed to ``read_csv``.
        output_path: Location handed to ``to_csv`` for the aggregated result.
    """
    # Leaving the with block is what actually runs the pipeline.
    # [START DataFrame_taxiride_aggregation]
    with pipeline as p:
        rides = p | read_csv(input_path)

        # Total passengers dropped off at each LocationID.
        by_dropoff = rides.groupby('DOLocationID')
        passengers_per_dropoff = by_dropoff.passenger_count.sum()
        passengers_per_dropoff.to_csv(output_path)
コード例 #4
0
 def test_read_write_csv(self):
     """Round-trip two CSV files through read_csv/to_csv with a derived column.

     Column ``c`` is computed as ``a + b``. The written lines are collapsed
     into a set before the comparison, so any repeated line counts once.
     """
     source_dir = self.temp_dir({'1.csv': 'a,b\n1,2\n', '2.csv': 'a,b\n3,4\n'})
     sink_dir = self.temp_dir()
     with beam.Pipeline() as p:
         frame = p | io.read_csv(source_dir + '*.csv')
         frame['c'] = frame.a + frame.b
         frame.to_csv(sink_dir + 'out.csv', index=False)
     written_lines = set(self.read_all_lines(sink_dir + 'out.csv*'))
     self.assertCountEqual(['a,b,c', '1,2,3', '3,4,7'], written_lines)
コード例 #5
0
ファイル: taxiride.py プロジェクト: ashishramtri/Apache-Beam
def run_enrich_pipeline(
    pipeline_args, input_path, output_path, zone_lookup_path):
  """Join taxi rides with a zone lookup table and sum passengers per Borough.

  Args:
    pipeline_args: Arguments passed to ``PipelineOptions``.
    input_path: Location of the taxi ride CSV data.
    output_path: Location handed to ``to_csv`` for the aggregated result.
    zone_lookup_path: Location of the zone lookup CSV data.
  """
  options = PipelineOptions(pipeline_args)
  # Leaving the with block is what actually runs the pipeline.
  with beam.Pipeline(options=options) as p:
    rides = p | "Read taxi rides" >> read_csv(input_path)
    zones = p | "Read zone lookup" >> read_csv(zone_lookup_path)

    # Index the lookup by LocationID so rides.DOLocationID can be joined
    # against that index to pick up each ride's Borough.
    borough_by_location = zones.set_index('LocationID').Borough
    enriched_rides = rides.merge(
        borough_by_location,
        how='left',
        left_on='DOLocationID',
        right_index=True)

    # Total passengers dropped off in each Borough.
    passengers_per_borough = enriched_rides.groupby(
        'Borough').passenger_count.sum()
    passengers_per_borough.to_csv(output_path)
コード例 #6
0
ファイル: io_test.py プロジェクト: justsh/incubator-beam
 def test_file_not_found(self):
     """Expect FileNotFoundError mentioning the glob when reading it fails.

     NOTE(review): presumably relies on /tmp/fake_dir not existing on the
     machine running the test -- confirm.
     """
     expect_missing = self.assertRaisesRegex(
         FileNotFoundError, r'/tmp/fake_dir/\*\*')
     with expect_missing, beam.Pipeline() as p:
         _ = p | io.read_csv('/tmp/fake_dir/**')
コード例 #7
0
ファイル: data_lib.py プロジェクト: kokizzu/google-research
def read_csv_as_pcoll(pipeline, path):
    """Read the CSV at ``path`` on ``pipeline`` and convert it to a PCollection.

    The base name of ``path`` is appended to both step labels
    (``ReadCSV...`` and ``ToPColl...``).
    """
    name = os.path.basename(path)
    read_step = f"ReadCSV{name}" >> df_io.read_csv(path)
    deferred_frame = pipeline | read_step
    return df_convert.to_pcollection(
        deferred_frame, pipeline=pipeline, label=f"ToPColl{name}")