def main():
    """Write six month records into the ``months`` table of the target database.

    Database connection settings are taken from the command line through
    ``get_args``; the remaining arguments become the Beam pipeline options.
    """
    db_args, pipeline_args = get_args()

    # Connection details of the database that receives the rows.
    source_config = relational_db.SourceConfiguration(
        drivername=db_args.drivername,
        host=db_args.host,
        port=db_args.port,
        database=db_args.database,
        username=db_args.username,
        password=db_args.password,
        create_if_missing=db_args.create_if_missing,
    )

    # Destination table: created when absent, with 'num' as its primary key.
    table_config = relational_db.TableConfiguration(
        name='months',
        create_if_missing=True,
        primary_key_columns=['num'],
    )

    # Rows to insert: the first six months, numbered from 1.
    month_names = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun')
    records = [
        {'name': month, 'num': position}
        for position, month in enumerate(month_names, start=1)
    ]

    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=options) as p:
        months = p | "Reading records" >> beam.Create(records, reshuffle=False)
        months | 'Writing to DB' >> relational_db.Write(
            source_config=source_config,
            table_config=table_config,
        )
def run(argv=None):
    """Copy team rows from BigQuery into PostgreSQL together with category counts.

    The BigQuery result is written to the ``teams`` table, and the output of
    ``count_categories`` applied to the same rows is written to ``category``.

    Args:
        argv: Command-line arguments. ``--input``, ``--output`` and
            ``--gcs_location`` are parsed here but their values are not used
            by this function; all other arguments go to the Beam pipeline.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        default='PROJECT_ID:demos.small_teams',
        help=('Input BigQuery table to process specified as: '
              'PROJECT:DATASET.TABLE or DATASET.TABLE.'),
    )
    parser.add_argument(
        '--output',
        required=False,
        help=('Output BigQuery table for results specified as: '
              'PROJECT:DATASET.TABLE or DATASET.TABLE.'),
    )
    parser.add_argument(
        '--gcs_location',
        required=False,
        help=('GCS Location to store files to load '
              'data into Bigquery'),
    )
    known_args, pipeline_args = parser.parse_known_args(argv)

    # Both destination tables live in the same local PostgreSQL database.
    source_config = relational_db.SourceConfiguration(
        drivername='postgresql+pg8000',
        host='localhost',
        port=5432,
        username='******',
        password='******',
        database='postgres',
    )

    # 'teams' is keyed by 'id'; created automatically when missing.
    table_config_teams = relational_db.TableConfiguration(
        name='teams',
        create_if_missing=True,
        primary_key_columns=['id'],
    )
    # 'category' is keyed by 'category_ts'; created automatically when missing.
    table_config_category = relational_db.TableConfiguration(
        name='category',
        create_if_missing=True,
        primary_key_columns=['category_ts'],
    )

    with beam.Pipeline(argv=pipeline_args) as p:
        # Load the query result into a PCollection.
        rows = p | 'read' >> beam.io.ReadFromBigQuery(
            query=""" SELECT id, category FROM `PROJECT_ID.demos.small_teams` limit 1500""",
            use_standard_sql=True,
        )
        counted = count_categories(rows)

        # Persist the raw rows and the derived counts to their own tables.
        rows | 'Write Teams' >> relational_db.Write(
            source_config=source_config,
            table_config=table_config_teams,
        )
        counted | 'Write Counts' >> relational_db.Write(
            source_config=source_config,
            table_config=table_config_category,
        )
def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the pipeline.

    The pipeline starts from the ``--input`` path, expands it with
    ``get_csv_reader`` and writes the resulting elements to a PostgreSQL
    table whose primary key is ``gameId``.

    Args:
        argv: Command-line arguments. The flags declared below are consumed
            here; anything else is forwarded to Beam as pipeline options.
        save_main_session: Value assigned to Beam's ``save_main_session``
            setup option, which controls whether the main module's global
            state is pickled for the workers.
    """
    # Imported here so this function does not depend on the module already
    # importing SetupOptions.
    from apache_beam.options.pipeline_options import SetupOptions

    logging.info("HERE")
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        type=str,
        default='dataset/league_of_legends_dataset.csv',
        help='Path to the data file(s) containing game data.')
    parser.add_argument(
        '--database',
        type=str,
        required=True,
        help='Database Name')
    parser.add_argument(
        '--database_host',
        type=str,
        required=True,
        help='Database Host')
    parser.add_argument(
        '--table_name',
        default='leader_board',
        help='table where to store the data')
    parser.add_argument(
        '--database_user',
        default='postgres',
        help='User name used to connect to the database')
    parser.add_argument(
        '--database_password',
        default='postgres',
        help='Password used to connect to the database')
    args, pipeline_args = parser.parse_known_args(argv)

    options = PipelineOptions(pipeline_args)
    # Honor the caller's choice; previously this parameter was ignored.
    options.view_as(SetupOptions).save_main_session = save_main_session
    logging.info(pipeline_args)

    source_config = relational_db.SourceConfiguration(
        drivername='postgresql',
        host=args.database_host,
        port=5432,
        create_if_missing=True,
        username=args.database_user,
        password=args.database_password,
        database=args.database)

    table_config = relational_db.TableConfiguration(
        name=args.table_name,
        create_if_missing=True,
        primary_key_columns=['gameId'])

    with beam.Pipeline(options=options) as p:
        (  # pylint: disable=expression-not-assigned
            p
            | 'Setting Up File' >> beam.Create([args.input])
            | 'Reading Input Data' >> beam.FlatMap(get_csv_reader)
            | 'Writing to DB table' >> relational_db.Write(
                source_config=source_config,
                table_config=table_config))
def ReadFromPostgres(
        p: beam.Pipeline,
        username: Text,
        password: Text,
        database: Text,
        table: Text,
        host: Text = 'localhost',
        port: int = 5432,
        query_limit: int = None,
        schema: Dict = None,
) -> beam.pvalue.PCollection:
    """
    Read the rows of a PostgreSQL table into a Beam PCollection.

    The function issues ``SELECT * FROM <table>`` (optionally followed by a
    ``LIMIT`` clause) through the ``postgresql+pg8000`` driver.

    Args:
        p: Input beam.Pipeline object the read step is attached to.
        username: Username of database user.
        password: Password to connect to database.
        database: Name of the target database.
        table: Name of the target table.
        host: Host of database (default 'localhost').
        port: Port to connect to with database (default 5432).
        query_limit: Max number of rows to fetch; no limit when None.
        schema: Dict specifying schema. Accepted for interface compatibility
            but not used by this function.

    Returns:
        The beam.PCollection produced by ``relational_db.ReadFromDB`` for the
        query above.
    """
    # NOTE(review): `table` and `query_limit` are interpolated directly into
    # the SQL text, so they must come from trusted configuration only.
    query = f'SELECT * FROM {table}'
    if query_limit is not None:
        query += f'\nLIMIT {query_limit}'

    source_config = relational_db.SourceConfiguration(
        drivername='postgresql+pg8000',
        host=host,
        port=port,
        username=username,
        password=password,
        database=database,
    )

    return p | "Reading records from db" >> relational_db.ReadFromDB(
        source_config=source_config,
        table_name=table,
        query=query,
    )
def main():
    """Read every record of the configured table and print each one to stdout.

    Database settings and the table name are taken from the command line
    through ``get_args``; the remaining arguments become Beam pipeline options.
    """
    db_args, pipeline_args = get_args()

    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as p:
        # Connection details of the database to read from.
        connection = relational_db.SourceConfiguration(
            drivername=db_args.drivername,
            host=db_args.host,
            port=db_args.port,
            database=db_args.database,
            username=db_args.username,
            password=db_args.password,
        )
        read_table = relational_db.ReadFromDB(
            source_config=connection,
            table_name=db_args.table,
        )

        months = p | "Reading records from db" >> read_table
        months | 'Writing to stdout' >> beam.Map(print)
def __get_database__(self, config):
    """Attach a database-read step to ``self.pipeline`` and return its output.

    Args:
        config: Mapping providing the connection settings ('drivername',
            'host', 'port', 'username', 'password', 'database') plus the
            'table' and 'query' to read.

    Returns:
        The result of applying ``relational_db.ReadFromDB`` to the pipeline.
    """
    connection_fields = (
        'drivername', 'host', 'port', 'username', 'password', 'database',
    )
    connection = relational_db.SourceConfiguration(
        **{field: config[field] for field in connection_fields}
    )

    read_rows = relational_db.ReadFromDB(
        source_config=connection,
        table_name=config['table'],
        query=config['query'],
    )
    # The step label is a runtime string and is kept exactly as it was.
    return self.pipeline | "Leyendo filas de la db" >> read_rows
def run(pipeline_args, **db_args):
    """Build and run the pipeline.

    Messages are read from a Pub/Sub subscription, decoded from UTF-8, parsed
    with ``parse_json_message`` and written to a relational database table.

    Args:
        pipeline_args: Extra command-line style flags passed to
            ``PipelineOptions``.
        **db_args: Keyword settings. Required keys: 'drivername', 'host',
            'port', 'database', 'username', 'password', 'create_if_missing'
            and 'input_subscription'.

    Raises:
        KeyError: If one of the required keys is missing from ``db_args``.
    """
    # `**db_args` has to be the last parameter in a Python signature, so
    # `pipeline_args` now precedes it.
    source_config = relational_db.SourceConfiguration(
        drivername=db_args['drivername'],
        host=db_args['host'],
        port=db_args['port'],
        database=db_args['database'],
        username=db_args['username'],
        password=db_args['password'],
        create_if_missing=db_args['create_if_missing'],
    )

    table_config = relational_db.TableConfiguration(
        name='YOUR_TABLE_NAME',  # table name
        create_if_missing=True,  # automatically create the table if not there
        primary_key_columns=['id'],
    )

    options = PipelineOptions(
        pipeline_args,
        save_main_session=True,
        streaming=True,
        runner='DataflowRunner',
        project='YOUR_PROJECT',
        job_name='YOUR_JOB',
        temp_location='YOUR_BUCKET',
        region='YOUR_REGION',
    )

    with beam.Pipeline(options=options) as pipeline:
        messages = (
            pipeline
            # The subscription is read from the keyword settings; the
            # original referenced an undefined name `kwargs` here.
            | 'Read from Pub/Sub' >> beam.io.ReadFromPubSub(
                subscription=db_args['input_subscription']
            ).with_output_types(bytes)
            | 'UTF-8 bytes to string' >> beam.Map(lambda msg: msg.decode('utf-8'))
            | 'Parse JSON messages' >> beam.Map(parse_json_message))

        # Output the results into Cloud SQL table.
        _ = messages | 'Write to Cloud SQL' >> relational_db.Write(
            source_config=source_config,
            table_config=table_config,
        )
from __future__ import division, print_function

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from beam_nuggets.io import relational_db

# Script: insert two month records into the 'months' table of a SQLite file.
with beam.Pipeline(options=PipelineOptions()) as p:
    months = p | "Reading month records" >> beam.Create([
        {'name': 'Jan', 'num': 1},
        {'name': 'Feb', 'num': 2},
    ])

    # SQLite database file; created when it does not exist yet.
    sqlite_config = relational_db.SourceConfiguration(
        drivername='sqlite',
        database='/tmp/months_db.sqlite',
        create_if_missing=True,
    )
    # Destination table; created when it does not exist yet.
    months_table = relational_db.TableConfiguration(
        name='months',
        create_if_missing=True,
    )

    months | 'Writing to Sqlite table' >> relational_db.Write(
        source_config=sqlite_config,
        table_config=months_table,
    )
from __future__ import division, print_function

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from beam_nuggets.io import relational_db

# Script: read the 'months' table of a SQLite file and print every record.
with beam.Pipeline(options=PipelineOptions()) as p:
    sqlite_config = relational_db.SourceConfiguration(
        drivername='sqlite',
        database='/tmp/months_db.sqlite',
    )

    # NOTE(review): other call sites use `relational_db.ReadFromDB`; confirm
    # that `relational_db.Read` exists in the installed beam_nuggets version.
    read_months = relational_db.Read(
        source_config=sqlite_config,
        table_name='months',
    )

    months = p | "Reading records from db" >> read_months
    months | 'Writing to stdout' >> beam.Map(print)
# Change the subscription details as per your GCP project
##
INPUT_SUBSCRIPTION = "projects/hadooptest-223316/subscriptions/vitualStoreSubscriber"

##
# Change the path to the dir where your service account private key file is kept
##
SERVICE_ACCOUNT_PATH = "/home/aakash/credentials/pubsubtest.json"

##
# Change the details as per your MYSQL config
##
SOURCE_CONFIG_PROD = relational_db.SourceConfiguration(
    drivername="mysql+pymysql",
    host="35.200.253.253",
    port=3306,
    username="******",
    password="******",
    database="virtual_store",
    create_if_missing=False,  # do not create the database if it is not there
)

##
# Change the details as per your table name
##
TABLE_CONFIG = relational_db.TableConfiguration(
    name="transaction_data",
    create_if_missing=True,  # automatically create the table if not there
    primary_key_columns=["id"],  # and use 'id' column as primary key
)
##
# Change the subscription details as per your GCP project
##
INPUT_SUBSCRIPTION = "projects/big-data-292604/subscriptions/moviesubscription"

##
# Change the path to the dir where your service account private key file is kept
##
SERVICE_ACCOUNT_PATH = "./cre.json"

##
# Change the details as per your MYSQL config
##
SOURCE_CONFIG_PROD = relational_db.SourceConfiguration(
    drivername="mysql+pymysql",
    host="XXX.XXX.XXX.XX",
    port=3306,
    username="******",
    password="******",
    database="movie_recommendation",
    create_if_missing=False,  # do not create the database if it is not there
)

##
# Change the details as per your table name
##
TABLE_CONFIG = relational_db.TableConfiguration(
    name="movie_data",
    create_if_missing=True,  # automatically create the table if not there
    primary_key_columns=["ImdbID"],  # and use 'ImdbID' column as primary key
)
import apache_beam as beam from beam_nuggets.io import relational_db #configuracion de target source_config = relational_db.SourceConfiguration( drivername='postgresql', host='localhost', port=5432, username='******', password='******', database='conekta', ) table_config = relational_db.TableConfiguration(name='charges', create_if_missing=True, primary_key_columns=['id']) #inicio apache beam p = beam.Pipeline() class PrepareData(beam.DoFn): def process(self, element): #proceso de id del element['_id'] #elimino campo compania del element['name'] #verifico los montos que sean flotantes try: