# Example #1
 def test_ReadFromRelationalDB(self):
     """Read the table back through ReadFromDB and check the retrieved
     records match the rows that were written to the database."""
     with TestPipeline() as pipeline:
         read_records = (
             pipeline
             | "Reading records from db" >> relational_db.ReadFromDB(
                 source_config=self.source_config,
                 table_name=self.table_name))
         assert_that(read_records, equal_to(self.table_rows))
# Example #2
def ReadFromPostgres(
        p: beam.Pipeline,
        username: Text,
        password: Text,
        database: Text,
        table: Text,
        host: Text = 'localhost',
        port: int = 5432,
        query_limit: int = None,
        schema: Dict = None,
    ) -> beam.pvalue.PCollection:
    """
    The Beam PTransform used to read data from a specific PostgreSQL table.

    Args:
        p: Input beam.Pipeline object coming from a TFX Executor.
        username: Username of database user.
        password: Password to connect to database.
        database: Name of the target database.
        table: Name of the target table.
        host: Host of database (default 'localhost').
        port: Port to connect to with database (default 5432).
        query_limit: Max number of rows to fetch; None fetches all rows.
        schema: Dict specifying schema. Currently unused in this function;
            kept for interface compatibility with callers.

    Returns:
        A beam.PCollection of data points. Each row in the PostgreSQL table
        represents a single data point.
    """
    # NOTE(review): `table` is interpolated directly into the SQL text.
    # SQL identifiers cannot be bound as query parameters, so callers must
    # guarantee `table` is a trusted identifier, never user input.
    query = f'SELECT * FROM {table}'

    if query_limit is not None:
        query += f'\nLIMIT {query_limit}'

    # pg8000 is a pure-Python PostgreSQL driver, used here via SQLAlchemy.
    source_config = relational_db.SourceConfiguration(
        drivername='postgresql+pg8000',
        host=host,
        port=port,
        username=username,
        password=password,
        database=database,
    )
    records = p | "Reading records from db" >> relational_db.ReadFromDB(
        source_config=source_config,
        table_name=table,
        query=query,
    )
    return records
def main():
    """Entry point: read every row of the configured table and print it."""
    # Split the command line into database settings and Beam pipeline args.
    db_args, pipeline_args = get_args()

    pipeline_options = PipelineOptions(pipeline_args)
    # Pickle the main session so transforms can reference module-level names.
    pipeline_options.view_as(SetupOptions).save_main_session = True

    with beam.Pipeline(options=pipeline_options) as pipeline:
        db_source = relational_db.SourceConfiguration(
            drivername=db_args.drivername,
            host=db_args.host,
            port=db_args.port,
            database=db_args.database,
            username=db_args.username,
            password=db_args.password,
        )

        records = (
            pipeline
            | "Reading records from db" >> relational_db.ReadFromDB(
                source_config=db_source, table_name=db_args.table))
        records | 'Writing to stdout' >> beam.Map(print)
# Example #4
    def __get_database__(self, config):
        """Build a SourceConfiguration from *config* and return the
        PCollection of rows produced by the configured query.

        Args:
            config: Mapping with keys 'drivername', 'host', 'port',
                'username', 'password', 'database', 'table', 'query'.

        Returns:
            A PCollection of records read from the database.
        """
        db_config = relational_db.SourceConfiguration(
            drivername=config['drivername'],
            host=config['host'],
            port=config['port'],
            username=config['username'],
            password=config['password'],
            database=config['database'],
        )

        rows = (
            self.pipeline
            | "Leyendo filas de la db" >> relational_db.ReadFromDB(
                source_config=db_config,
                table_name=config['table'],
                query=config['query']))

        return rows
from __future__ import division, print_function

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

from beam_nuggets.io import relational_db

# Read the `months` table from a local SQLite database and print each row.
with beam.Pipeline(options=PipelineOptions()) as p:
    sqlite_source = relational_db.SourceConfiguration(
        drivername='sqlite',
        database='/tmp/months_db.sqlite',
    )
    months = p | "Reading records from db" >> relational_db.ReadFromDB(
        source_config=sqlite_source,
        table_name='months')
    months | 'Writing to stdout' >> beam.Map(print)