def test_ReadFromRelationalDB(self):
    """Read every row of the test table through ReadFromDB and verify the
    resulting PCollection matches the fixture rows exactly."""
    with TestPipeline() as pipeline:
        read_rows = pipeline | "Reading records from db" >> relational_db.ReadFromDB(
            source_config=self.source_config,
            table_name=self.table_name,
        )
        # equal_to is order-insensitive, so row ordering from the DB does not matter.
        assert_that(read_rows, equal_to(self.table_rows))
def ReadFromPostgres(
    p: beam.Pipeline,
    username: Text,
    password: Text,
    database: Text,
    table: Text,
    host: Text = 'localhost',
    port: int = 5432,
    query_limit: int = None,
    schema: Dict = None,
) -> beam.pvalue.PCollection:
    """The Beam PTransform used to read data from a PostgreSQL table.

    Builds a ``SELECT * FROM <table>`` query (optionally limited) and reads
    it through beam-nuggets' ``relational_db.ReadFromDB`` using the
    ``postgresql+pg8000`` driver.

    Args:
        p: Input beam.Pipeline object coming from a TFX Executor.
        username: Username of database user.
        password: Password to connect to database.
        database: Name of the target database.
        table: Name of the target table.
        host: Host of database (default 'localhost').
        port: Port to connect to with database (default 5432).
        query_limit: Max number of rows to fetch; None fetches all rows.
        schema: Dict specifying schema.
            NOTE(review): accepted but never used in this function body —
            confirm whether it should be forwarded or removed.

    Returns:
        A beam.PCollection of data points. Each row in the database table
        represents a single data point.
    """
    # NOTE(review): `table` is interpolated directly into the SQL text; this
    # is only safe when `table` comes from trusted configuration, never from
    # untrusted user input (table names cannot be bound as query parameters).
    query = f'SELECT * FROM {table}'
    if query_limit is not None:
        query += f'\nLIMIT {query_limit}'

    source_config = relational_db.SourceConfiguration(
        drivername='postgresql+pg8000',
        host=host,
        port=port,
        username=username,
        password=password,
        database=database,
    )

    records = p | "Reading records from db" >> relational_db.ReadFromDB(
        source_config=source_config,
        table_name=table,
        query=query,
    )
    return records
def main():
    """Read every row of the configured table and print each one to stdout."""
    # Command-line arguments are split into DB connection args and Beam args.
    db_args, pipeline_args = get_args()

    options = PipelineOptions(pipeline_args)
    # Pickle the main session so the DoFns can see module-level context.
    options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=options) as pipeline:
        connection = relational_db.SourceConfiguration(
            drivername=db_args.drivername,
            host=db_args.host,
            port=db_args.port,
            database=db_args.database,
            username=db_args.username,
            password=db_args.password,
        )
        rows = pipeline | "Reading records from db" >> relational_db.ReadFromDB(
            source_config=connection,
            table_name=db_args.table,
        )
        rows | 'Writing to stdout' >> beam.Map(print)
def __get_database__(self, config):
    """Read rows from the database described by *config*.

    *config* is a dict carrying connection details ('drivername', 'host',
    'port', 'username', 'password', 'database') plus the 'table' and 'query'
    to read. Returns the PCollection of rows produced by ReadFromDB.
    """
    connection = relational_db.SourceConfiguration(
        drivername=config['drivername'],
        host=config['host'],
        port=config['port'],
        username=config['username'],
        password=config['password'],
        database=config['database'],
    )
    return (
        self.pipeline
        | "Leyendo filas de la db" >> relational_db.ReadFromDB(
            source_config=connection,
            table_name=config['table'],
            query=config['query'],
        )
    )
from __future__ import division, print_function

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions
from beam_nuggets.io import relational_db

# Minimal example: read every row of the `months` table from a local
# SQLite database and print each record to stdout.
with beam.Pipeline(options=PipelineOptions()) as pipeline:
    sqlite_source = relational_db.SourceConfiguration(
        drivername='sqlite',
        database='/tmp/months_db.sqlite',
    )
    records = pipeline | "Reading records from db" >> relational_db.ReadFromDB(
        source_config=sqlite_source,
        table_name='months',
    )
    records | 'Writing to stdout' >> beam.Map(print)