Example #1
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        default='PROJECT_ID:demos.small_teams',
                        help=('Input BigQuery table to process specified as: '
                              'PROJECT:DATASET.TABLE or DATASET.TABLE.'))
    parser.add_argument(
        '--output',
        #      required=True,
        required=False,
        help=('Output BigQuery table for results specified as: '
              'PROJECT:DATASET.TABLE or DATASET.TABLE.'))

    parser.add_argument('--gcs_location',
                        required=False,
                        help=('GCS Location to store files to load '
                              'data into Bigquery'))

    known_args, pipeline_args = parser.parse_known_args(argv)

    source_config = relational_db.SourceConfiguration(
        drivername='postgresql+pg8000',
        host='localhost',
        port=5432,
        username='******',
        password='******',
        database='postgres')

    table_config_teams = relational_db.TableConfiguration(
        name='teams',
        create_if_missing=True,  # automatically create the table if not there
        primary_key_columns=['id']  # and use 'id' column as primary key
    )

    table_config_category = relational_db.TableConfiguration(
        name='category',
        create_if_missing=True,  # automatically create the table if not there
        primary_key_columns=['category_ts']  # and use 'category_ts' column as primary key
    )

    with beam.Pipeline(argv=pipeline_args) as p:
        # Read the table rows into a PCollection.
        rows = p | 'read' >> beam.io.ReadFromBigQuery(
            query="""
                SELECT id, category FROM `PROJECT_ID.demos.small_teams` LIMIT 1500""",
            use_standard_sql=True)
        counted = count_categories(rows)

        # Write the output using a "Write" transform that has side effects.

        rows | 'Write Teams' >> relational_db.Write(
            source_config=source_config, table_config=table_config_teams)
        counted | 'Write Counts' >> relational_db.Write(
            source_config=source_config, table_config=table_config_category)
Example #2
    # Writes the given records with relational_db.Write inside a TestPipeline,
    # then reads the target table back (via self.db) for assertions.
    def execute_pipeline(self, source_config, table_config, records):
        with TestPipeline() as p:
            months = p | "Reading records" >> beam.Create(records)
            months | 'Writing to table' >> relational_db.Write(
                source_config=source_config, table_config=table_config)

        # retrieve the table rows
        return self.db.read_rows(table_config.name)
Example #3
def main():
    # get the cmd args
    db_args, pipeline_args = get_args()

    # Target database instance
    source_config = relational_db.SourceConfiguration(
        drivername=db_args.drivername,
        host=db_args.host,
        port=db_args.port,
        database=db_args.database,
        username=db_args.username,
        password=db_args.password,
        create_if_missing=db_args.create_if_missing)

    # The data to be written
    records = [
        {'name': 'Jan', 'num': 1},
        {'name': 'Feb', 'num': 2},
        {'name': 'Mar', 'num': 3},
        {'name': 'Apr', 'num': 4},
        {'name': 'May', 'num': 5},
        {'name': 'Jun', 'num': 6},
    ]

    # Target database table
    table_config = relational_db.TableConfiguration(
        name='months',
        create_if_missing=True,  # automatically create the table if not there
        primary_key_columns=['num']  # and use 'num' column as a primary key
    )

    # Create the pipeline
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=options) as p:
        months = p | "Reading records" >> beam.Create(records, reshuffle=False)
        months | 'Writing to DB' >> relational_db.Write(
            source_config=source_config, table_config=table_config)
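get_args is not defined in this snippet. A minimal sketch, under the assumption that it is a thin argparse wrapper whose attribute names match the ones accessed above (the defaults shown are placeholders, not from the original):

def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--drivername', default='postgresql+pg8000')
    parser.add_argument('--host', default='localhost')
    parser.add_argument('--port', type=int, default=5432)
    parser.add_argument('--database', default='postgres')
    parser.add_argument('--username', default='postgres')
    parser.add_argument('--password', default='postgres')
    parser.add_argument('--create_if_missing', action='store_true')
    # parse_known_args returns (db args, remaining args for the pipeline)
    return parser.parse_known_args()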
Example #4
def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the pipeline."""
    logging.info("HERE")
    parser = argparse.ArgumentParser()

    parser.add_argument('--input',
                        type=str,
                        default='dataset/league_of_legends_dataset.csv',
                        help='Path to the data file(s) containing game data.')
    parser.add_argument('--database',
                        type=str,
                        required=True,
                        help='Database Name')
    parser.add_argument('--database_host',
                        type=str,
                        required=True,
                        help='Database Host')
    parser.add_argument('--table_name',
                        default='leader_board',
                        help='table where to store the data')
    parser.add_argument('--database_user',
                        default='postgres',
                        help='Database user')
    parser.add_argument('--database_password',
                        default='postgres',
                        help='Database password')

    args, pipeline_args = parser.parse_known_args(argv)

    options = PipelineOptions(pipeline_args)
    logging.info(pipeline_args)
    source_config = relational_db.SourceConfiguration(
        drivername='postgresql',
        host=args.database_host,
        port=5432,
        create_if_missing=True,
        username=args.database_user,
        password=args.database_password,
        database=args.database)

    table_config = relational_db.TableConfiguration(
        name=args.table_name,
        create_if_missing=True,
        primary_key_columns=['gameId'])

    with beam.Pipeline(options=options) as p:
        (  # pylint: disable=expression-not-assigned
            p
            | 'Setting Up File' >> beam.Create([args.input])
            | 'Reading Input Data' >> beam.FlatMap(get_csv_reader)
            | 'Writing to DB table' >> relational_db.Write(
                source_config=source_config, table_config=table_config))
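get_csv_reader is not shown here. A minimal sketch, assuming it opens the CSV path emitted by the 'Setting Up File' step and yields one dict per row so relational_db.Write can map keys to table columns:

def get_csv_reader(csv_path):
    import csv
    with open(csv_path, newline='') as f:
        for row in csv.DictReader(f):
            yield dict(row)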
Example #5
def run_main(path_arguments, pipeline_arguments):
    options = PipelineOptions(pipeline_arguments)
    options.view_as(StandardOptions).streaming = True
    p = beam.Pipeline(options=options)  # initializing Pipeline object

    main_pipeline = (
        p
        | "Read data from pub sub"
        >> beam.io.ReadFromPubSub(subscription=INPUT_SUBSCRIPTION)
        | "Stripping newline character" >> beam.Map(lambda data: data.rstrip().lstrip())
        | "Applying our main unnesting function" >> beam.FlatMap(mainProcess)
    )

    main_pipeline | "Printing for debugging" >> beam.Map(print)

    main_pipeline | "Writing final data to production db" >> relational_db.Write(
        source_config=SOURCE_CONFIG_PROD, table_config=TABLE_CONFIG
    )

    result = p.run()
    result.wait_until_finish()
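INPUT_SUBSCRIPTION, SOURCE_CONFIG_PROD and TABLE_CONFIG are module-level constants not included in this snippet. A minimal sketch of how they could be defined (all values below are placeholders):

INPUT_SUBSCRIPTION = 'projects/YOUR_PROJECT/subscriptions/YOUR_SUBSCRIPTION'

SOURCE_CONFIG_PROD = relational_db.SourceConfiguration(
    drivername='postgresql+pg8000',
    host='YOUR_DB_HOST',
    port=5432,
    username='YOUR_USER',
    password='YOUR_PASSWORD',
    database='YOUR_DATABASE')

TABLE_CONFIG = relational_db.TableConfiguration(
    name='YOUR_TABLE',
    create_if_missing=True,
    primary_key_columns=['id'])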
Example #6
def run(db_args, pipeline_args):
    # db_args: dict of database connection settings; pipeline_args: Beam options

    source_config = relational_db.SourceConfiguration(
        drivername=db_args['drivername'], 
        host=db_args['host'], 
        port=db_args['port'], 
        database=db_args['database'], 
        username=db_args['username'], 
        password=db_args['password'], 
        create_if_missing=db_args['create_if_missing']
    )

    table_config = relational_db.TableConfiguration(
        name='YOUR_TABLE_NAME', # table name
        create_if_missing=True,  # automatically create the table if not there
        primary_key_columns=['id']
    )


    """Build and run the pipeline."""
    options = PipelineOptions(
        pipeline_args, save_main_session=True, streaming=True, runner='DataflowRunner',
        project='YOUR_PROJECT', job_name='YOUR_JOB', temp_location='YOUR_BUCKET', 
        region='YOUR_REGION'
    )


    with beam.Pipeline(options=options) as pipeline:
        messages = (
            pipeline
            | 'Read from Pub/Sub' >> beam.io.ReadFromPubSub(
                subscription=db_args['input_subscription']).with_output_types(bytes)
            | 'UTF-8 bytes to string' >> beam.Map(lambda msg: msg.decode('utf-8'))
            | 'Parse JSON messages' >> beam.Map(parse_json_message))

        # Output the results into Cloud SQL table.
        _ = messages | 'Write to Cloud SQL' >> relational_db.Write(
            source_config=source_config,
            table_config=table_config
        )
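parse_json_message is not shown in this snippet. A minimal sketch, assuming each Pub/Sub message body is a JSON object whose keys match the target table's columns:

import json

def parse_json_message(message):
    # message is the UTF-8 decoded string produced by the previous step
    return json.loads(message)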
Example #7
def run_main(pipeline_arguments):
    options = PipelineOptions(flags=pipeline_arguments,
                              runner='DataflowRunner',
                              project='big-data-292604',
                              temp_location='gs://data_flow-movie-bucket/',
                              region='us-central1')
    options.view_as(StandardOptions).streaming = True
    p = beam.Pipeline(options=options)  # initializing Pipeline object

    main_pipeline = (
        p
        | "Read data from pub sub" >>
        beam.io.ReadFromPubSub(subscription=INPUT_SUBSCRIPTION)
        | "Stripping newline character" >>
        beam.Map(lambda data: data.rstrip().lstrip())
        | "Filter other type only keep movie" >>
        beam.filter(lambda data: filter_movie(data))
        | "Filter NaN data" >> beam.filter(lambda data: filter_out_nones(data))
        | "lower" >> beam.Map(lambda data: low(data))
        | "Writing final data to production db" >> relational_db.Write(
            source_config=SOURCE_CONFIG_PROD, table_config=TABLE_CONFIG))

    result = p.run()
    result.wait_until_finish()
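filter_movie, filter_out_nones and low are not included in this snippet. A minimal sketch of plausible implementations, assuming each element has already been parsed into a dict (the snippet itself does not show a parsing step):

def filter_movie(record):
    # keep only records whose 'type' field marks them as movies
    return record.get('type') == 'movie'

def filter_out_nones(record):
    # drop records containing empty or missing values
    return all(value not in (None, '') for value in record.values())

def low(record):
    # lower-case all string values so the table contents are normalized
    return {k: v.lower() if isinstance(v, str) else v
            for k, v in record.items()}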
Example #8
from __future__ import division, print_function

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

from beam_nuggets.io import relational_db

with beam.Pipeline(options=PipelineOptions()) as p:
    months = p | "Reading month records" >> beam.Create([
        {
            'name': 'Jan',
            'num': 1
        },
        {
            'name': 'Feb',
            'num': 2
        },
    ])
    months | 'Writing to Sqlite table' >> relational_db.Write(
        source_config=relational_db.SourceConfiguration(
            drivername='sqlite',
            database='/tmp/months_db.sqlite',
            create_if_missing=True),
        table_config=relational_db.TableConfiguration(name='months',
                                                      create_if_missing=True))
Example #9
class PrepareData(beam.DoFn):
    # The surrounding class and method lines are not part of the original
    # excerpt; they are inferred from the beam.ParDo(PrepareData()) call below.
    def process(self, element):
        # rename the update field
        element['updated_at'] = element['paid_at']
        if element['updated_at'] == '':
            element['updated_at'] = None
        del element['paid_at']

        # yield the element downstream
        yield element


main = (
    p
    | 'data source' >> beam.io.ReadFromMongoDB(
        uri='mongodb://localhost:27017',
        db='conekta',
        coll='data_stagin',
        projection={
            'company_name': 1,
            'company_id': 1
        }))

prov = (main
        | 'filter by company identifier' >>
        beam.Filter(lambda row: len(row['company_id']) > 24)
        | 'prepare data' >> beam.ParDo(PrepareData())
        | 'check that the amount is not infinite' >>
        beam.Filter(lambda row: row['amount'] != float('inf'))
        | 'Writing to DB table' >> relational_db.Write(
            source_config=source_config, table_config=table_config))

p.run().wait_until_finish()
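This last snippet assumes p, source_config and table_config are defined earlier in the file (along with the usual apache_beam / beam_nuggets imports). A minimal sketch of those definitions; driver, credentials, table name and primary key below are placeholders:

p = beam.Pipeline(options=PipelineOptions())

source_config = relational_db.SourceConfiguration(
    drivername='postgresql+pg8000',
    host='localhost',
    port=5432,
    username='YOUR_USER',
    password='YOUR_PASSWORD',
    database='YOUR_DATABASE')

table_config = relational_db.TableConfiguration(
    name='YOUR_TABLE',
    create_if_missing=True,
    primary_key_columns=['company_id'])  # assumed primary key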