Example #1
    def test_pipeline_partitions_splitter(self):
        expected = [
            {
                "id": 1,
                "name": "test data1",
                "date": date(2020, 1, 1)
            },
            {
                "id": 2,
                "name": "test data2",
                "date": date(2020, 2, 2)
            },
        ]

        with TestPipeline() as p:
            # Read from MySQL running on Docker, splitting the read by the
            # partitions named in the query
            read_from_mysql = ReadFromMySQL(
                query=
                "SELECT * FROM test_db.tests PARTITION (p202002,p202003);",
                host="0.0.0.0",
                database="test_db",
                user="******",
                password="******",
                port=3307,
                splitter=splitters.PartitionSplitter(),
            )

            actual = p | read_from_mysql

            assert_that(actual, equal_to(expected))
Example #2
    def test_pipeline_ids_splitter(self):
        expected = [
            {
                "id": 1,
                "name": "test data1",
                "date": date(2020, 1, 1)
            },
            {
                "id": 2,
                "name": "test data2",
                "date": date(2020, 2, 2)
            },
        ]

        with TestPipeline() as p:
            # Read from MySQL running on Docker, splitting the read over the
            # ids from generate_ids_fn, which fill the {ids} placeholder
            read_from_mysql = ReadFromMySQL(
                query="SELECT * FROM test_db.tests WHERE id IN ({ids});",
                host="0.0.0.0",
                database="test_db",
                user="******",
                password="******",
                port=3307,
                splitter=splitters.IdsSplitter(generate_ids_fn=lambda: [1, 2]),
            )

            actual = p | read_from_mysql

            assert_that(actual, equal_to(expected))
Example #3
    def test_pipeline_date_splitter(self):
        expected = [
            {
                "id": 1,
                "name": "test data1",
                "date": date(2020, 1, 1)
            },
            {
                "id": 2,
                "name": "test data2",
                "date": date(2020, 2, 2)
            },
            {
                "id": 3,
                "name": "test data3",
                "date": date(2020, 3, 3)
            },
        ]

        with TestPipeline() as p:
            # Read from MySQL running on Docker, splitting the read across the
            # date range in the query
            read_from_mysql = ReadFromMySQL(
                query=
                "SELECT * FROM test_db.tests WHERE date BETWEEN '2020-01-01' AND '2020-04-03';",
                host="0.0.0.0",
                database="test_db",
                user="******",
                password="******",
                port=3307,
                splitter=splitters.DateSplitter(),
            )

            actual = p | read_from_mysql

            assert_that(actual, equal_to(expected))
Example #4
    def test_pipeline_limit_offset_splitter(self):
        expected = [
            {
                "id": 1,
                "name": "test data1",
                "date": date(2020, 1, 1),
                "memo": "memo1"
            },
            {
                "id": 2,
                "name": "test data2",
                "date": date(2020, 2, 2),
                "memo": None
            },
            {
                "id": 3,
                "name": "test data3",
                "date": date(2020, 3, 3),
                "memo": "memo3"
            },
            {
                "id": 4,
                "name": "test data4",
                "date": date(2020, 4, 4),
                "memo": None
            },
            {
                "id": 5,
                "name": "test data5",
                "date": date(2020, 5, 5),
                "memo": None
            },
        ]

        with TestPipeline() as p:
            # Read from MySQL running on Docker, splitting the read into
            # LIMIT/OFFSET chunks
            read_from_mysql = ReadFromMySQL(
                query="SELECT * FROM test_db.tests;",
                host="0.0.0.0",
                database="test_db",
                user="******",
                password="******",
                port=3307,
                splitter=splitters.LimitOffsetSplitter(),
            )

            actual = p | read_from_mysql

            assert_that(actual, equal_to(expected))
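
The four test snippets above are methods of a test case and omit their module-level imports. A minimal sketch of what they presumably rely on, assuming the beam_mysql.connector package layout of beam-mysql-connector, is:

from datetime import date

from apache_beam.testing.test_pipeline import TestPipeline
from apache_beam.testing.util import assert_that, equal_to

# Assumed module paths for the MySQL source and its splitters
from beam_mysql.connector import splitters
from beam_mysql.connector.io import ReadFromMySQL
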
Example #5
def run(argv=None):
    parser = argparse.ArgumentParser()
    parser.add_argument('--core_host', dest='core_host', type=str)
    parser.add_argument('--core_port', dest='core_port', type=int)
    parser.add_argument('--core_username', dest='core_username', type=str)
    parser.add_argument('--core_password', dest='core_password', type=str)
    parser.add_argument('--core_database', dest='core_database', type=str)
    parser.add_argument('--remittances_host',
                        dest='remittances_host',
                        type=str)
    parser.add_argument('--remittances_port',
                        dest='remittances_port',
                        type=int)
    parser.add_argument('--remittances_username',
                        dest='remittances_username',
                        type=str)
    parser.add_argument('--remittances_password',
                        dest='remittances_password',
                        type=str)
    parser.add_argument('--remittances_database',
                        dest='remittances_database',
                        type=str)
    parser.add_argument('--auth_uri', dest='auth_uri', type=str)
    parser.add_argument('--auth_database', dest='auth_database', type=str)
    known_args, pipeline_args = parser.parse_known_args(argv)
    pipeline_options = PipelineOptions(pipeline_args)
    # The BigQuery project id is taken from the --project pipeline option
    project_id = pipeline_options.display_data()['project']

    with beam.Pipeline(options=pipeline_options) as p:
        # Reads from data sources

        read_core = (
            p
            | 'ReadCore' >> ReadFromDB(source_config=SourceConfiguration(
                drivername='postgresql+psycopg2',
                host=known_args.core_host,
                port=known_args.core_port,
                username=known_args.core_username,
                password=known_args.core_password,
                database=known_args.core_database,
                create_if_missing=False),
                                       table_name='transaction',
                                       query='SELECT * FROM transaction')
            | 'FilterTransactions' >> beam.Filter(filter_transaction))

        read_auth = (
            p
            | 'Read Users' >> ReadFromMongoDB(uri=known_args.auth_uri,
                                              db=known_args.auth_database,
                                              coll='users')
            | 'TransformUsers' >> beam.ParDo(TransformUser())
            | 'FilterUsers' >> beam.Filter(filter_user))

        read_remittances = (
            p
            | 'ReadRemittances' >> ReadFromMySQL(
                query='SELECT * FROM valiu_remittances.remittance',
                host=known_args.remittances_host,
                database=known_args.remittances_database,
                user=known_args.remittances_username,
                password=known_args.remittances_password,
                port=known_args.remittances_port,
                splitter=splitters.NoSplitter())
            | 'TransformRemittances' >> beam.ParDo(TransformRemittance())
            | 'FilterRemittances' >> beam.Filter(filter_remittance))

        # Merges

        merged_transactions = (
            (read_core, read_auth)
            | 'MergeTransactionsUsers' >> LeftJoin('id2', 'id', 'user_'))

        merged_remittances = (
            (read_remittances, read_auth)
            | 'MergeRemittanceUsers' >> MergeRemittancesUsers())

        # Writes to BigQuery

        write_users = (
            read_auth
            | 'WriteUsers' >> beam.io.WriteToBigQuery(
                table='auth_users',
                dataset='auth',
                project=project_id,
                create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                schema=users_table_schema,
                additional_bq_parameters=users_table_partitioning))

        write_remittances = (
            merged_remittances
            | 'WriteRemittances' >> beam.io.WriteToBigQuery(
                table='remittances_movements',
                dataset='remittances',
                project=project_id,
                create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                schema=remittances_table_schema,
                additional_bq_parameters=remittances_table_partitioning))

        write_transactions_cash_in = (
            merged_transactions
            | 'FilterCashIn' >> beam.Filter(filter_currency_operation, 'COP',
                                            'USDv')
            | 'CleanTransactionsCashIn' >> beam.ParDo(
                TransformTransaction('cash_in'))
            | 'WriteCashIn' >> beam.io.WriteToBigQuery(
                table='core_cash_in',
                dataset='core',
                project=project_id,
                create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                schema=transactions_table_schema,
                additional_bq_parameters=transactions_table_partitioning))

        write_transactions_cash_out = (
            merged_transactions
            | 'FilterCashOut' >> beam.Filter(filter_currency_operation, 'USDv',
                                             'VES')
            | 'CleanTransactionsCashOut' >> beam.ParDo(
                TransformTransaction('cash_out'))
            | 'WriteCashOut' >> beam.io.WriteToBigQuery(
                table='core_cash_out',
                dataset='core',
                project=project_id,
                create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                schema=transactions_table_schema,
                additional_bq_parameters=transactions_table_partitioning))

        write_transactions_p2p = (
            merged_transactions
            | 'FilterP2P' >> beam.Filter(filter_currency_operation, 'USDv',
                                         'USDv')
            | 'CleanTransactionsP2P' >> beam.ParDo(TransformTransaction('p2p'))
            | 'WriteP2P' >> beam.io.WriteToBigQuery(
                table='core_p2p',
                dataset='core',
                project=project_id,
                create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
                write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
                schema=transactions_table_schema,
                additional_bq_parameters=transactions_table_partitioning))
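
Example #5 likewise omits its imports; the project-specific helpers (TransformUser, TransformRemittance, TransformTransaction, LeftJoin, MergeRemittancesUsers, the filter_* functions, and the *_table_schema / *_table_partitioning objects) are defined elsewhere in that project and are not shown here. A sketch of the third-party imports it appears to use, with module paths assumed, is:

import argparse

import apache_beam as beam
from apache_beam.io.mongodbio import ReadFromMongoDB
from apache_beam.options.pipeline_options import PipelineOptions

# PostgreSQL source from beam-nuggets (module path assumed)
from beam_nuggets.io.relational_db import ReadFromDB, SourceConfiguration

# MySQL source from beam-mysql-connector (module path assumed)
from beam_mysql.connector import splitters
from beam_mysql.connector.io import ReadFromMySQL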