def run(argv=None):
    """Copy team rows from BigQuery into Postgres together with category counts.

    Rows are read with a fixed BigQuery query, passed through
    ``count_categories``, and both the raw rows and the counted result are
    written to Postgres tables via ``relational_db.Write``.

    Args:
        argv: Command-line arguments. Flags not declared on the local parser
            are forwarded to the Beam pipeline.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        '--input',
        default='PROJECT_ID:demos.small_teams',
        help=('Input BigQuery table to process specified as: '
              'PROJECT:DATASET.TABLE or DATASET.TABLE.'))
    arg_parser.add_argument(
        '--output',
        required=False,
        help=('Output BigQuery table for results specified as: '
              'PROJECT:DATASET.TABLE or DATASET.TABLE.'))
    arg_parser.add_argument(
        '--gcs_location',
        required=False,
        help=('GCS Location to store files to load '
              'data into Bigquery'))
    # NOTE(review): the flags declared above are parsed but never read below;
    # the query text is hard-coded. Only the leftover args reach Beam.
    _, beam_args = arg_parser.parse_known_args(argv)

    # Connection settings for the target Postgres instance.
    postgres_settings = {
        'drivername': 'postgresql+pg8000',
        'host': 'localhost',
        'port': 5432,
        'username': '******',
        'password': '******',
        'database': 'postgres',
    }
    source_config = relational_db.SourceConfiguration(**postgres_settings)

    # Both target tables are created on demand; each declares its own
    # primary key column.
    table_config_teams = relational_db.TableConfiguration(
        name='teams',
        create_if_missing=True,
        primary_key_columns=['id'])
    table_config_category = relational_db.TableConfiguration(
        name='category',
        create_if_missing=True,
        primary_key_columns=['category_ts'])

    query = """ SELECT id, category FROM `PROJECT_ID.demos.small_teams` limit 1500"""

    with beam.Pipeline(argv=beam_args) as pipeline:
        # Source: the BigQuery query result as a PCollection of rows.
        team_rows = pipeline | 'read' >> beam.io.ReadFromBigQuery(
            query=query, use_standard_sql=True)
        category_counts = count_categories(team_rows)

        # Sinks: one write per target table.
        team_rows | 'Write Teams' >> relational_db.Write(
            source_config=source_config,
            table_config=table_config_teams)
        category_counts | 'Write Counts' >> relational_db.Write(
            source_config=source_config,
            table_config=table_config_category)
def execute_pipeline(self, source_config, table_config, records):
    """Write ``records`` to the configured table and return what the table holds.

    Args:
        source_config: Database configuration passed to ``relational_db.Write``.
        table_config: Table configuration passed to ``relational_db.Write``;
            its ``name`` is also used to read the rows back.
        records: Items fed into the pipeline through ``beam.Create``.

    Returns:
        Whatever ``self.db.read_rows`` returns for ``table_config.name``.
    """
    db_writer = relational_db.Write(
        source_config=source_config,
        table_config=table_config)

    with TestPipeline() as pipeline:
        created = pipeline | "Reading records" >> beam.Create(records)
        created | 'Writing to table' >> db_writer

    # The read happens only after the ``with`` block has exited, i.e. once the
    # test pipeline is done with the write.
    return self.db.read_rows(table_config.name)
def main():
    """Write six month records into a ``months`` database table.

    Database connection settings and pipeline flags come from ``get_args()``.
    The table is created when missing, with ``num`` as its primary key.
    """
    db_args, pipeline_args = get_args()

    # Target database instance, described by the parsed command-line values.
    connection_fields = (
        'drivername', 'host', 'port', 'database',
        'username', 'password', 'create_if_missing',
    )
    source_config = relational_db.SourceConfiguration(
        **{field: getattr(db_args, field) for field in connection_fields})

    # The data to be written: Jan..Jun numbered from 1.
    month_names = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun')
    records = [
        {'name': month_name, 'num': month_num}
        for month_num, month_name in enumerate(month_names, start=1)
    ]

    # Target database table.
    table_config = relational_db.TableConfiguration(
        name='months',
        create_if_missing=True,
        primary_key_columns=['num'])

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as pipeline:
        month_rows = pipeline | "Reading records" >> beam.Create(
            records, reshuffle=False)
        month_rows | 'Writing to DB' >> relational_db.Write(
            source_config=source_config,
            table_config=table_config)
def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the pipeline.

    Reads game data through ``get_csv_reader`` and writes every produced
    record into a PostgreSQL table keyed on ``gameId``.

    Args:
        argv: Command-line arguments. Flags not declared here are forwarded
            to Beam as pipeline options.
        save_main_session: Value assigned to Beam's
            ``SetupOptions.save_main_session`` for this pipeline.
    """
    # Local import keeps this function self-contained regardless of which
    # option classes the module imports at the top.
    from apache_beam.options.pipeline_options import SetupOptions

    logging.info("HERE")

    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        type=str,
                        default='dataset/league_of_legends_dataset.csv',
                        help='Path to the data file(s) containing game data.')
    parser.add_argument('--database',
                        type=str,
                        required=True,
                        help='Database Name')
    parser.add_argument('--database_host',
                        type=str,
                        required=True,
                        help='Database Host')
    # The port used to be hard-coded; the default keeps the previous value.
    parser.add_argument('--database_port',
                        type=int,
                        default=5432,
                        help='Database Port')
    parser.add_argument('--table_name',
                        default='leader_board',
                        help='table where to store the data')
    # Help texts below previously repeated the --table_name description.
    parser.add_argument('--database_user',
                        default='postgres',
                        help='Database User')
    parser.add_argument('--database_password',
                        default='postgres',
                        help='Database Password')
    args, pipeline_args = parser.parse_known_args(argv)

    options = PipelineOptions(pipeline_args)
    # Honour the caller's choice; the parameter was previously ignored.
    options.view_as(SetupOptions).save_main_session = save_main_session
    logging.info(pipeline_args)

    source_config = relational_db.SourceConfiguration(
        drivername='postgresql',
        host=args.database_host,
        port=args.database_port,
        create_if_missing=True,
        username=args.database_user,
        password=args.database_password,
        database=args.database)

    table_config = relational_db.TableConfiguration(
        name=args.table_name,
        create_if_missing=True,
        primary_key_columns=['gameId'])

    with beam.Pipeline(options=options) as p:
        (  # pylint: disable=expression-not-assigned
            p
            | 'Setting Up File' >> beam.Create([args.input])
            | 'Reading Input Data' >> beam.FlatMap(get_csv_reader)
            | 'Writing to DB table' >> relational_db.Write(
                source_config=source_config,
                table_config=table_config))
def run_main(path_arguments, pipeline_arguments):
    """Stream Pub/Sub messages through ``mainProcess`` into the production DB.

    Args:
        path_arguments: Not read by this function.
        pipeline_arguments: Flags used to build the Beam ``PipelineOptions``.
    """
    streaming_options = PipelineOptions(pipeline_arguments)
    streaming_options.view_as(StandardOptions).streaming = True
    pipeline = beam.Pipeline(options=streaming_options)

    raw_messages = pipeline | "Read data from pub sub" >> beam.io.ReadFromPubSub(
        subscription=INPUT_SUBSCRIPTION)
    # Trim surrounding whitespace (including the trailing newline).
    trimmed = raw_messages | "Stripping newline character" >> beam.Map(
        lambda data: data.strip())
    unnested = trimmed | "Applying our main unnesting function" >> beam.FlatMap(
        mainProcess)

    # The unnested records fan out to two consumers: a debug print and the
    # database sink.
    unnested | "Printing for debugging" >> beam.Map(print)
    unnested | "Writing final data to production db" >> relational_db.Write(
        source_config=SOURCE_CONFIG_PROD,
        table_config=TABLE_CONFIG)

    # Run and block until the job finishes.
    pipeline.run().wait_until_finish()
def run(pipeline_args, **db_args):
    """Build and run the pipeline.

    Reads messages from a Pub/Sub subscription, decodes them as UTF-8, parses
    them with ``parse_json_message`` and writes the results to a relational
    table via ``relational_db.Write``.

    Args:
        pipeline_args: Flags passed to ``PipelineOptions``.
        **db_args: Keyword settings. The keys read here are ``drivername``,
            ``host``, ``port``, ``database``, ``username``, ``password``,
            ``create_if_missing`` and ``input_subscription``.

    Raises:
        KeyError: If any of the keys listed above is missing.
    """
    # ``**db_args`` must be the last parameter; the previous ordering
    # (``**db_args, pipeline_args``) was a syntax error.
    source_config = relational_db.SourceConfiguration(
        drivername=db_args['drivername'],
        host=db_args['host'],
        port=db_args['port'],
        database=db_args['database'],
        username=db_args['username'],
        password=db_args['password'],
        create_if_missing=db_args['create_if_missing'],
    )

    table_config = relational_db.TableConfiguration(
        name='YOUR_TABLE_NAME',  # table name
        create_if_missing=True,  # automatically create the table if not there
        primary_key_columns=['id'],
    )

    options = PipelineOptions(
        pipeline_args,
        save_main_session=True,
        streaming=True,
        runner='DataflowRunner',
        project='YOUR_PROJECT',
        job_name='YOUR_JOB',
        temp_location='YOUR_BUCKET',
        region='YOUR_REGION',
    )

    with beam.Pipeline(options=options) as pipeline:
        messages = (
            pipeline
            # The subscription is read from ``db_args``; the earlier code
            # referenced an undefined ``kwargs`` name here.
            | 'Read from Pub/Sub' >> beam.io.ReadFromPubSub(
                subscription=db_args['input_subscription']
            ).with_output_types(bytes)
            | 'UTF-8 bytes to string' >> beam.Map(
                lambda msg: msg.decode('utf-8'))
            | 'Parse JSON messages' >> beam.Map(parse_json_message))

        # Output the results into Cloud SQL table.
        _ = messages | 'Write to Cloud SQL' >> relational_db.Write(
            source_config=source_config,
            table_config=table_config,
        )
def run_main(pipeline_arguments):
    """Stream movie records from Pub/Sub into the production database.

    Messages are stripped of surrounding whitespace, filtered with
    ``filter_movie`` and ``filter_out_nones``, transformed with ``low`` and
    written via ``relational_db.Write``.

    Args:
        pipeline_arguments: Extra flags passed to ``PipelineOptions``.
    """
    options = PipelineOptions(
        flags=pipeline_arguments,
        runner='DataflowRunner',
        project='big-data-292604',
        temp_location='gs://data_flow-movie-bucket/',
        region='us-central1')
    options.view_as(StandardOptions).streaming = True
    p = beam.Pipeline(options=options)  # initializing Pipeline object

    main_pipeline = (
        p
        | "Read data from pub sub" >> beam.io.ReadFromPubSub(
            subscription=INPUT_SUBSCRIPTION)
        | "Stripping newline character" >> beam.Map(
            lambda data: data.strip())
        # The transform is ``beam.Filter``; ``beam.filter`` does not exist
        # and fails with AttributeError while the pipeline is being built.
        | "Filter other type only keep movie" >> beam.Filter(
            lambda data: filter_movie(data))
        | "Filter NaN data" >> beam.Filter(
            lambda data: filter_out_nones(data))
        | "lower" >> beam.Map(lambda data: low(data))
        | "Writing final data to production db" >> relational_db.Write(
            source_config=SOURCE_CONFIG_PROD,
            table_config=TABLE_CONFIG))

    # Run and block until the job finishes.
    result = p.run()
    result.wait_until_finish()
from __future__ import division, print_function

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from beam_nuggets.io import relational_db

# Minimal example: write two month records into a SQLite table named
# ``months``, creating both the database and the table when missing.
with beam.Pipeline(options=PipelineOptions()) as pipeline:
    month_records = [
        {'name': 'Jan', 'num': 1},
        {'name': 'Feb', 'num': 2},
    ]
    months = pipeline | "Reading month records" >> beam.Create(month_records)

    # Target SQLite database file and table.
    sqlite_source = relational_db.SourceConfiguration(
        drivername='sqlite',
        database='/tmp/months_db.sqlite',
        create_if_missing=True)
    months_table = relational_db.TableConfiguration(
        name='months',
        create_if_missing=True)

    months | 'Writing to Sqlite table' >> relational_db.Write(
        source_config=sqlite_source,
        table_config=months_table)
#renombro el camo de actualizacion element['updated_at'] = element['paid_at'] if element['updated_at'] == '': element['updated_at'] = None del element['paid_at'] #retorno async yield element main = ( p | 'data source ' >> beam.io.ReadFromMongoDB(uri='mongodb://localhost:27017', db='conekta', coll='data_stagin', projection={ 'company_name': 1, 'company_id': 1 })) prov = (main | 'filtro por identificador de compania' >> beam.Filter(lambda row: len(row['company_id']) > 24) | 'prepara informacion' >> beam.ParDo(PrepareData()) | 'verificar que el monto no sea infinito' >> beam.Filter(lambda row: row['amount'] != float('inf')) | 'Writing to DB table' >> relational_db.Write( source_config=source_config, table_config=table_config)) p.run().wait_until_finish()