Example #1
def main():
    # get the cmd args
    db_args, pipeline_args = get_args()

    # Target database instance
    source_config = relational_db.SourceConfiguration(
        drivername=db_args.drivername,
        host=db_args.host,
        port=db_args.port,
        database=db_args.database,
        username=db_args.username,
        password=db_args.password,
        create_if_missing=db_args.create_if_missing)

    # The data to be written
    records = [
        {
            'name': 'Jan',
            'num': 1
        },
        {
            'name': 'Feb',
            'num': 2
        },
        {
            'name': 'Mar',
            'num': 3
        },
        {
            'name': 'Apr',
            'num': 4
        },
        {
            'name': 'May',
            'num': 5
        },
        {
            'name': 'Jun',
            'num': 6
        },
    ]

    # Target database table
    table_config = relational_db.TableConfiguration(
        name='months',
        create_if_missing=True,  # automatically create the table if not there
        primary_key_columns=['num']  # and use 'num' column as a primary key
    )

    # Create the pipeline
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=options) as p:
        months = p | "Reading records" >> beam.Create(records, reshuffle=False)
        months | 'Writing to DB' >> relational_db.Write(
            source_config=source_config, table_config=table_config)
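
The example assumes a get_args() helper that splits the command line into database options and Beam pipeline options. It is not shown above; a minimal sketch of what it might look like, assuming argparse and the flag names used in the snippet:

# Hypothetical get_args() helper assumed by Example #1.
import argparse

def get_args(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--drivername', default='postgresql+pg8000',
                        help='SQLAlchemy driver name')
    parser.add_argument('--host', default='localhost')
    parser.add_argument('--port', type=int, default=5432)
    parser.add_argument('--database', required=True)
    parser.add_argument('--username', required=True)
    parser.add_argument('--password', required=True)
    parser.add_argument('--create_if_missing', action='store_true',
                        help='Create the database if it does not exist')
    # Everything not recognised here is handed to Beam as pipeline options.
    db_args, pipeline_args = parser.parse_known_args(argv)
    return db_args, pipeline_args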
Example #2
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        default='PROJECT_ID:demos.small_teams',
                        help=('Input BigQuery table to process specified as: '
                              'PROJECT:DATASET.TABLE or DATASET.TABLE.'))
    parser.add_argument(
        '--output',
        required=False,
        help=('Output BigQuery table for results specified as: '
              'PROJECT:DATASET.TABLE or DATASET.TABLE.'))

    parser.add_argument('--gcs_location',
                        required=False,
                        help=('GCS Location to store files to load '
                              'data into Bigquery'))

    known_args, pipeline_args = parser.parse_known_args(argv)

    source_config = relational_db.SourceConfiguration(
        drivername='postgresql+pg8000',
        host='localhost',
        port=5432,
        username='******',
        password='******',
        database='postgres')

    table_config_teams = relational_db.TableConfiguration(
        name='teams',
        create_if_missing=True,  # automatically create the table if not there
        primary_key_columns=['id']  # and use 'id' column as primary key
    )

    table_config_category = relational_db.TableConfiguration(
        name='category',
        create_if_missing=True,  # automatically create the table if not there
        primary_key_columns=['category_ts']  # and use 'category_ts' column as primary key
    )

    with beam.Pipeline(argv=pipeline_args) as p:
        # Read the table rows into a PCollection.
        rows = p | 'read' >> beam.io.ReadFromBigQuery(
            query="""
                SELECT id, category FROM `PROJECT_ID.demos.small_teams` limit 1500""",
            use_standard_sql=True)
        counted = count_categories(rows)

        # Write the output using a "Write" transform that has side effects.

        rows | 'Write Teams' >> relational_db.Write(
            source_config=source_config, table_config=table_config_teams)
        counted | 'Write Counts' >> relational_db.Write(
            source_config=source_config, table_config=table_config_category)
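
count_categories() is not defined in this snippet; a plausible sketch, assuming it counts rows per category and emits one record per category keyed on the 'category_ts' column expected by table_config_category (the output field names are assumptions):

# Hypothetical count_categories() helper assumed by Example #2.
def count_categories(rows):
    return (
        rows
        | 'Extract category' >> beam.Map(lambda row: (row['category'], 1))
        | 'Count per category' >> beam.CombinePerKey(sum)
        | 'To records' >> beam.Map(
            lambda kv: {'category_ts': kv[0], 'team_count': kv[1]})
    )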
Example #3
def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the pipeline."""
    logging.info("HERE")
    parser = argparse.ArgumentParser()

    parser.add_argument('--input',
                        type=str,
                        default='dataset/league_of_legends_dataset.csv',
                        help='Path to the data file(s) containing game data.')
    parser.add_argument('--database',
                        type=str,
                        required=True,
                        help='Database Name')
    parser.add_argument('--database_host',
                        type=str,
                        required=True,
                        help='Database Host')
    parser.add_argument('--table_name',
                        default='leader_board',
                        help='table where to store the data')
    parser.add_argument('--database_user',
                        default='postgres',
                        help='Database user name')
    parser.add_argument('--database_password',
                        default='postgres',
                        help='Database password')

    args, pipeline_args = parser.parse_known_args(argv)

    options = PipelineOptions(pipeline_args)
    logging.info(pipeline_args)
    source_config = relational_db.SourceConfiguration(
        drivername='postgresql',
        host=args.database_host,
        port=5432,
        create_if_missing=True,
        username=args.database_user,
        password=args.database_password,
        database=args.database)

    table_config = relational_db.TableConfiguration(
        name=args.table_name,
        create_if_missing=True,
        primary_key_columns=['gameId'])

    with beam.Pipeline(options=options) as p:
        (  # pylint: disable=expression-not-assigned
            p
            | 'Setting Up File' >> beam.Create([args.input])
            | 'Reading Input Data' >> beam.FlatMap(get_csv_reader)
            | 'Writing to DB table' >> relational_db.Write(
                source_config=source_config, table_config=table_config))
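
get_csv_reader() is not shown either; a minimal sketch, assuming it receives the file path emitted by beam.Create and yields one dict per CSV row so that 'gameId' is available as the primary key column:

# Hypothetical get_csv_reader() helper assumed by Example #3.
import csv

def get_csv_reader(file_path):
    with open(file_path, newline='') as f:
        for row in csv.DictReader(f):
            yield dict(row)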
Example #4
def ReadFromPostgres(
        p: beam.Pipeline,
        username: Text,
        password: Text,
        database: Text,
        table: Text,
        host: Text = 'localhost',
        port: int = 5432,
        query_limit: int = None,
        schema: Dict = None,
    ) -> beam.pvalue.PCollection:
    """
    The Beam PTransform used to read data from a specific BQ table.

    Args:
        p: Input beam.Pipeline object coming from a TFX Executor.
        host: Host of database.
        username: Username of database user.
        password: Password to connect to database.
        port: Port to connect to with database (default 5432)
        database: Name of the target database.
        table: Name of the target table.
        query_limit: Max number of rows to fetch.
        schema: Dict specifying schema.

    Returns:
        A beam.PCollection of data points. Each row in the BigQuery table
         represents a single data point.
    """
    query = f'SELECT * FROM {table}'

    if query_limit is not None:
        query += f'\nLIMIT {query_limit}'

    source_config = relational_db.SourceConfiguration(
        drivername='postgresql+pg8000',
        host=host,
        port=port,
        username=username,
        password=password,
        database=database,
    )
    records = p | "Reading records from db" >> relational_db.ReadFromDB(
        source_config=source_config,
        table_name=table,
        query=query,
    )
    return records
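
Since ReadFromPostgres() returns a PCollection, it can be applied inside any pipeline; a minimal usage sketch (the connection values below are placeholders, not from the original example):

# Hypothetical usage of the ReadFromPostgres() helper defined above.
import apache_beam as beam

with beam.Pipeline() as pipeline:
    rows = ReadFromPostgres(
        pipeline,
        username='postgres',   # placeholder credentials
        password='postgres',
        database='mydb',       # placeholder database name
        table='events',        # placeholder table name
        query_limit=100,
    )
    rows | 'Print rows' >> beam.Map(print)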
Example #5
def main():
    # get the cmd args
    db_args, pipeline_args = get_args()

    # Create the pipeline
    options = PipelineOptions(pipeline_args)
    options.view_as(SetupOptions).save_main_session = True
    with beam.Pipeline(options=options) as p:
        source_config = relational_db.SourceConfiguration(
            drivername=db_args.drivername,
            host=db_args.host,
            port=db_args.port,
            database=db_args.database,
            username=db_args.username,
            password=db_args.password,
        )

        months = p | "Reading records from db" >> relational_db.ReadFromDB(
            source_config=source_config, table_name=db_args.table)
        months | 'Writing to stdout' >> beam.Map(print)
Example #6
File: main.py Project: stevematos/demo1_BI
    def __get_database__(self, config):
        config_database = relational_db.SourceConfiguration(
            drivername=config['drivername'],
            host=config['host'],
            port=config['port'],
            username=config['username'],
            password=config['password'],
            database=config['database'],
        )

        data_read = (
            self.pipeline
            | "Leyendo filas de la db" >> relational_db.ReadFromDB(
                source_config=config_database,
                table_name=config['table'],
                query=config['query']
            )
        )

        return data_read
Example #7
File: sub2sql.py Project: jasonwu0908/beam
def run(db_args, pipeline_args, **kwargs):
    """Build and run the pipeline."""

    source_config = relational_db.SourceConfiguration(
        drivername=db_args['drivername'], 
        host=db_args['host'], 
        port=db_args['port'], 
        database=db_args['database'], 
        username=db_args['username'], 
        password=db_args['password'], 
        create_if_missing=db_args['create_if_missing']
    )

    table_config = relational_db.TableConfiguration(
        name='YOUR_TABLE_NAME', # table name
        create_if_missing=True,  # automatically create the table if not there
        primary_key_columns=['id']
    )


    """Build and run the pipeline."""
    options = PipelineOptions(
        pipeline_args, save_main_session=True, streaming=True, runner='DataflowRunner',
        project='YOUR_PROJECT', job_name='YOUR_JOB', temp_location='YOUR_BUCKET', 
        region='YOUR_REGION'
    )


    with beam.Pipeline(options=options) as pipeline:
        messages = (
            pipeline
            | 'Read from Pub/Sub' >> beam.io.ReadFromPubSub(
                subscription=kwargs['input_subscription']).with_output_types(bytes)
            | 'UTF-8 bytes to string' >> beam.Map(lambda msg: msg.decode('utf-8'))
            | 'Parse JSON messages' >> beam.Map(parse_json_message))

        # Output the results into Cloud SQL table.
        _ = messages | 'Write to Cloud SQL' >> relational_db.Write(
            source_config=source_config,
            table_config=table_config
        )
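
parse_json_message() is not shown in this snippet; a minimal sketch, assuming each Pub/Sub message body is a JSON object whose keys match the target table's columns:

# Hypothetical parse_json_message() helper assumed by Example #7.
import json

def parse_json_message(message):
    return json.loads(message)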
Example #8
from __future__ import division, print_function

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

from beam_nuggets.io import relational_db

with beam.Pipeline(options=PipelineOptions()) as p:
    months = p | "Reading month records" >> beam.Create([
        {
            'name': 'Jan',
            'num': 1
        },
        {
            'name': 'Feb',
            'num': 2
        },
    ])
    months | 'Writing to Sqlite table' >> relational_db.Write(
        source_config=relational_db.SourceConfiguration(
            drivername='sqlite',
            database='/tmp/months_db.sqlite',
            create_if_missing=True),
        table_config=relational_db.TableConfiguration(name='months',
                                                      create_if_missing=True))
Example #9
from __future__ import division, print_function

import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions

from beam_nuggets.io import relational_db

with beam.Pipeline(options=PipelineOptions()) as p:
    months = p | "Reading records from db" >> relational_db.Read(
        source_config=relational_db.SourceConfiguration(
            drivername='sqlite', database='/tmp/months_db.sqlite'),
        table_name='months')
    months | 'Writing to stdout' >> beam.Map(print)
Example #10
##
# Change the subscription details as per your GCP project
##
INPUT_SUBSCRIPTION = "projects/hadooptest-223316/subscriptions/vitualStoreSubscriber"

##
# Change the path to the dir where your service account private key file is kept
##
SERVICE_ACCOUNT_PATH = "/home/aakash/credentials/pubsubtest.json"

##
# Change the details as per your MYSQL config
##
SOURCE_CONFIG_PROD = relational_db.SourceConfiguration(
    drivername="mysql+pymysql",
    host="35.200.253.253",
    port=3306,
    username="******",
    password="******",
    database="virtual_store",
    create_if_missing=False,  # do not auto-create the database
)

##
# Change the details as per your table name
##
TABLE_CONFIG = relational_db.TableConfiguration(
    name="transaction_data",
    create_if_missing=True,  # automatically create the table if not there
    primary_key_columns=["id"],  # and use 'num' column as primary key
)
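
The constants above only configure the source and target; a sketch of how they might be wired into a streaming pipeline, assuming each Pub/Sub message is a JSON row (the decode step and option names are assumptions, and GOOGLE_APPLICATION_CREDENTIALS would typically be pointed at SERVICE_ACCOUNT_PATH before running):

# Hypothetical pipeline wiring for the configuration constants above.
import json
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions, StandardOptions

options = PipelineOptions()
options.view_as(StandardOptions).streaming = True

with beam.Pipeline(options=options) as p:
    (
        p
        | 'Read from Pub/Sub' >> beam.io.ReadFromPubSub(
            subscription=INPUT_SUBSCRIPTION)
        | 'Decode JSON' >> beam.Map(lambda msg: json.loads(msg.decode('utf-8')))
        | 'Write to MySQL' >> relational_db.Write(
            source_config=SOURCE_CONFIG_PROD, table_config=TABLE_CONFIG)
    )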
Example #11
##
# Change the subscription details as per your GCP project
##
INPUT_SUBSCRIPTION = "projects/big-data-292604/subscriptions/moviesubscription"

##
# Change the path to the dir where your service account private key file is kept
##
SERVICE_ACCOUNT_PATH = "./cre.json"

##
# Change the details as per your MYSQL config
##
SOURCE_CONFIG_PROD = relational_db.SourceConfiguration(
    drivername="mysql+pymysql",
    host="XXX.XXX.XXX.XX",
    port=3306,
    username="******",
    password="******",
    database="movie_recommendation",
    create_if_missing=False,  # do not auto-create the database
)

##
# Change the details as per your table name
##
TABLE_CONFIG = relational_db.TableConfiguration(
    name="movie_data",
    create_if_missing=True,  # automatically create the table if not there
    primary_key_columns=["ImdbID"],  # and use 'num' column as primary key
)
Example #12
import apache_beam as beam
from beam_nuggets.io import relational_db

# target database configuration
source_config = relational_db.SourceConfiguration(
    drivername='postgresql',
    host='localhost',
    port=5432,
    username='******',
    password='******',
    database='conekta',
)

table_config = relational_db.TableConfiguration(name='charges',
                                                create_if_missing=True,
                                                primary_key_columns=['id'])

# start the Apache Beam pipeline
p = beam.Pipeline()


class PrepareData(beam.DoFn):
    def process(self, element):
        # remove the '_id' field
        del element['_id']

        # remove the company name field
        del element['name']

        # make sure the amounts are floats
        try: