def test_replicate_s3_to_sf(self):
        """
        Replicate csv files from s3 to Snowflake, check if return code is zero and success log file created
        """

        # 1. Run tap first time - both fastsync and singer should be triggered
        assertions.assert_run_tap_success(self.tap_id, self.target_id,
                                          ['fastsync', 'singer'])
        self.assert_columns_exist()

        # 2. Run tap second time - both fastsync and singer should be triggered
        assertions.assert_run_tap_success(self.tap_id, self.target_id,
                                          ['fastsync', 'singer'])
        self.assert_columns_exist()
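
assert_run_tap_success is the suite's central helper: it runs a tap against a target and fails unless the process exits cleanly and leaves a success log behind. Below is a rough sketch of that contract, assuming a `pipelinewise run_tap` CLI invocation and a `*.log.success` file naming convention (both are assumptions, not the helper's actual implementation):

import glob
import subprocess

def assert_run_tap_success_sketch(tap_id, target_id, log_dir):
    # Run the tap; a non-zero return code fails the test
    completed = subprocess.run(
        ['pipelinewise', 'run_tap', '--tap', tap_id, '--target', target_id],
        capture_output=True, text=True,
    )
    assert completed.returncode == 0, completed.stderr

    # A completed run is expected to leave a success log file behind
    # (the '*.log.success' pattern here is an assumption for illustration)
    success_logs = glob.glob(f'{log_dir}/*.log.success')
    assert success_logs, f'no success log found in {log_dir}'
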
Example #2
    def test_replicate_mariadb_replica_to_sf(self):
        """
        Replicate data from a MariaDB replica to Snowflake
        """

        assertions.assert_run_tap_success(
            self.tap_id, self.target_id, ['fastsync', 'singer']
        )
        assertions.assert_row_counts_equal(
            self.e2e_env.run_query_tap_mysql_2,
            self.e2e_env.run_query_target_snowflake,
            self.e2e_env.sf_schema_postfix,
        )
        assertions.assert_all_columns_exist(
            self.e2e_env.run_query_tap_mysql_2,
            self.e2e_env.run_query_target_snowflake,
            mysql_to_snowflake.tap_type_to_target_type,
            schema_postfix=self.e2e_env.sf_schema_postfix,
        )
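
assert_row_counts_equal takes two query runners and verifies that every replicated table holds the same number of rows on both sides. A minimal sketch of that pattern follows; the explicit table list and schema argument are simplifications, since the real helper discovers the tables itself:

def assert_row_counts_equal_sketch(run_query_source, run_query_target,
                                   tables, target_schema):
    for table in tables:
        source_count = run_query_source(f'SELECT COUNT(1) FROM {table}')[0][0]
        target_count = run_query_target(
            f'SELECT COUNT(1) FROM {target_schema}.{table}')[0][0]
        # Any mismatch means replication dropped or duplicated rows
        assert source_count == target_count, (
            f'{table}: {source_count} rows in source, {target_count} in target')
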
Example #3
    def test_replicate_mariadb_to_sf(self):
        """
        Replicate data from MariaDB to Snowflake
        """

        # 1. Run tap first time - both fastsync and singer should be triggered
        assertions.assert_run_tap_success(
            self.tap_id, self.target_id, ['fastsync', 'singer']
        )
        assertions.assert_row_counts_equal(
            self.e2e_env.run_query_tap_mysql,
            self.e2e_env.run_query_target_snowflake,
            self.e2e_env.sf_schema_postfix,
        )
        assertions.assert_all_columns_exist(
            self.e2e_env.run_query_tap_mysql,
            self.e2e_env.run_query_target_snowflake,
            mysql_to_snowflake.tap_type_to_target_type,
            schema_postfix=self.e2e_env.sf_schema_postfix,
        )

        # 2. Make changes in MariaDB source database
        #  LOG_BASED
        self.e2e_env.run_query_tap_mysql(
            'UPDATE weight_unit SET isactive = 0 WHERE weight_unit_id IN (2, 3, 4)'
        )
        self.e2e_env.run_query_tap_mysql(
            'INSERT INTO edgydata (c_varchar, `group`, `case`, cjson, c_time) VALUES'
            "('Lorem ipsum dolor sit amet', 10, 'A', '[]', '00:00:00'),"
            "('Thai: แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช', 20, 'A', '{}', '12:00:59'),"
            "('Chinese: 和毛泽东 <<重上井冈山>>. 严永欣, 一九八八年.', null,'B', "
            '\'[{"key": "ValueOne", "actions": []}, {"key": "ValueTwo", "actions": []}]\','
            " '9:1:00'),"
            "('Special Characters: [\"\\,"
            "!@£$%^&*()]\\\\', null, 'B', "
            "null, '12:00:00'),"
            "('	', 20, 'B', null, '15:36:10'),"
            "(CONCAT(CHAR(0x0000 using utf16), '<- null char'), 20, 'B', null, '15:36:10')"
        )

        self.e2e_env.run_query_tap_mysql('UPDATE all_datatypes SET c_point = NULL')

        #  INCREMENTAL
        self.e2e_env.run_query_tap_mysql(
            'INSERT INTO address(isactive, street_number, date_created, date_updated,'
            ' supplier_supplier_id, zip_code_zip_code_id)'
            'VALUES (1, 1234, NOW(), NOW(), 0, 1234)'
        )
        self.e2e_env.run_query_tap_mysql(
            'UPDATE address SET street_number = 9999, date_updated = NOW()'
            ' WHERE address_id = 1'
        )
        #  FULL_TABLE
        self.e2e_env.run_query_tap_mysql('DELETE FROM no_pk_table WHERE id > 10')

        # 3. Run tap second time - both fastsync and singer should be triggered; FULL_TABLE tables are always resynced
        assertions.assert_run_tap_success(
            self.tap_id, self.target_id, ['fastsync', 'singer']
        )
        assertions.assert_row_counts_equal(
            self.e2e_env.run_query_tap_mysql,
            self.e2e_env.run_query_target_snowflake,
            schema_postfix=self.e2e_env.sf_schema_postfix,
        )
        assertions.assert_all_columns_exist(
            self.e2e_env.run_query_tap_mysql,
            self.e2e_env.run_query_target_snowflake,
            mysql_to_snowflake.tap_type_to_target_type,
            {'blob_col'},
            schema_postfix=self.e2e_env.sf_schema_postfix,
        )

        # Checking if the MASK-DATE transformation is working
        result = self.e2e_env.run_query_target_snowflake(
            f'SELECT count(1) FROM ppw_e2e_tap_mysql{self.e2e_env.sf_schema_postfix}.address '
            f'where MONTH(date_created) != 1 or DAY(date_created)::int != 1;'
        )[0][0]

        self.assertEqual(result, 0)

        # Checking if conditional MASK-NUMBER transformation is working
        result = self.e2e_env.run_query_target_snowflake(
            f'SELECT count(1) FROM ppw_e2e_tap_mysql{self.e2e_env.sf_schema_postfix}.address '
            f"where zip_code_zip_code_id != 0 and street_number REGEXP '[801]';"
        )[0][0]

        self.assertEqual(result, 0)

        # Checking if conditional SET-NULL transformation is working
        result = self.e2e_env.run_query_target_snowflake(
            f'SELECT count(1) FROM ppw_e2e_tap_mysql{self.e2e_env.sf_schema_postfix}.edgydata '
            f'where "GROUP" is not null and "CASE" = \'B\';'
        )[0][0]

        self.assertEqual(result, 0)
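
The three trailing checks verify transformations configured on the tap (MASK-DATE, conditional MASK-NUMBER, conditional SET-NULL) by asserting that no row in Snowflake violates them. As a rough illustration of the conditional SET-NULL semantics being asserted (not the actual transform-field implementation):

def set_null_if_sketch(record, field, when_field, when_value):
    # Null out `field` whenever the condition column matches
    if record.get(when_field) == when_value:
        record[field] = None
    return record

# Mirrors the last assertion above: no row may keep "GROUP" populated
# while "CASE" = 'B'
row = set_null_if_sketch({'group': 10, 'case': 'B'}, 'group', 'case', 'B')
assert row['group'] is None
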
Example #4
    def test_replicate_pg_to_sf_with_archive_load_files(self):
        """
        Fastsync tables from Postgres to Snowflake with archive load files enabled
        """

        self.delete_dangling_files_from_archive()

        assertions.assert_run_tap_success(self.tap_id, self.target_id,
                                          ['fastsync', 'singer'])

        expected_archive_files_count = {
            'public.city': 2,  # INCREMENTAL: fastsync and singer
            'public.country': 1,  # FULL_TABLE : fastsync only
            'public2.wearehere': 1,  # FULL_TABLE : fastsync only
        }

        # Assert expected files in archive folder
        for schema_table, expected_archive_files in expected_archive_files_count.items():
            schema, table = schema_table.split('.')
            files_in_s3_archive = self.get_files_from_s3_for_table(table)

            if (files_in_s3_archive is None
                    or len(files_in_s3_archive) != expected_archive_files):
                raise Exception(
                    f'files_in_archive for {table} is {files_in_s3_archive}. '
                    f'Expected archive files count: {expected_archive_files}')

            # Assert expected metadata
            archive_metadata = self.s3_client.head_object(
                Bucket=self.s3_bucket,
                Key=(files_in_s3_archive[0]['Key']))['Metadata']

            expected_metadata = {
                'tap': 'postgres_to_sf_archive_load_files',
                'schema': schema,
                'table': table,
                'archived-by': 'pipelinewise_fastsync_postgres_to_snowflake',
            }

            if archive_metadata != expected_metadata:
                raise Exception(
                    f'archive_metadata for {table} is {archive_metadata}')

            # Assert expected file contents
            with tempfile.NamedTemporaryFile() as tmpfile:
                with open(tmpfile.name, 'wb') as tmpf:
                    self.s3_client.download_fileobj(
                        self.s3_bucket, files_in_s3_archive[0]['Key'], tmpf)
                with gzip.open(tmpfile.name, 'rt') as gzipfile:
                    rows_in_csv = len(gzipfile.readlines())

            rows_in_table = self.e2e_env.run_query_tap_postgres(
                f'SELECT COUNT(1) FROM {schema_table}')[0][0]

            if rows_in_csv != rows_in_table:
                raise Exception(
                    f'Rows in csv and db differ: {rows_in_csv} vs {rows_in_table}'
                )
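
get_files_from_s3_for_table is another suite helper; here is a minimal boto3 sketch of the same lookup, where the 'archive/' key prefix is an assumption about the bucket layout:

import boto3

def get_files_from_s3_for_table_sketch(bucket, table):
    s3 = boto3.client('s3')
    response = s3.list_objects_v2(Bucket=bucket, Prefix=f'archive/{table}')
    # 'Contents' is missing from the response when no keys match,
    # which maps to the `files_in_s3_archive is None` check above
    return response.get('Contents')
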
Example #5
    def test_replicate_mongodb_to_sf(self):
        """
        Replicate data from MongoDB to Snowflake
        """

        # Run tap first time - fastsync and singer should be triggered
        assertions.assert_run_tap_success(self.tap_id, self.target_id,
                                          ['fastsync', 'singer'])
        self.assert_columns_exist('listings')
        self.assert_columns_exist('my_collection')
        self.assert_columns_exist('all_datatypes')

        listing_count = self.mongodb_con['listings'].count_documents({})
        my_coll_count = self.mongodb_con['my_collection'].count_documents({})
        all_datatypes_count = self.mongodb_con['all_datatypes'].count_documents({})

        self.assert_row_counts_equal(
            f'ppw_e2e_tap_mongodb{self.e2e_env.sf_schema_postfix}',
            'listings',
            listing_count,
        )
        self.assert_row_counts_equal(
            f'ppw_e2e_tap_mongodb{self.e2e_env.sf_schema_postfix}',
            'my_collection',
            my_coll_count,
        )
        self.assert_row_counts_equal(
            f'ppw_e2e_tap_mongodb{self.e2e_env.sf_schema_postfix}',
            'all_datatypes',
            all_datatypes_count,
        )

        result_insert = self.mongodb_con.my_collection.insert_many([
            {
                'age': randint(10, 30),
                'id': 1001,
                'uuid': uuid.uuid4(),
                'ts': bson.Timestamp(12030, 500),
            },
            {
                'date': datetime.utcnow(),
                'id': 1002,
                'uuid': uuid.uuid4(),
                'regex': bson.Regex(r'^[A-Z]\w\d{2,6}.*$'),
            },
            {
                'uuid': uuid.uuid4(),
                'id': 1003,
                'decimal': bson.Decimal128(decimal.Decimal('5.64547548425446546546644')),
                'nested_json': {
                    'a': 1,
                    'b': 3,
                    'c': {'key': datetime(2020, 5, 3, 10, 0, 0)},
                },
            },
        ])
        my_coll_count += len(result_insert.inserted_ids)

        result_del = self.mongodb_con.my_collection.delete_one(
            {'_id': result_insert.inserted_ids[0]})
        my_coll_count -= result_del.deleted_count

        result_update = self.mongodb_con.my_collection.update_many(
            {}, {'$set': {'id': 0}})

        assertions.assert_run_tap_success(self.tap_id, self.target_id,
                                          ['singer'])

        self.assertEqual(
            result_update.modified_count,
            self.e2e_env.run_query_target_snowflake(
                f'select count(_id) from ppw_e2e_tap_mongodb{self.e2e_env.sf_schema_postfix}.my_collection'
                f' where document:id = 0')[0][0],
        )

        self.assert_row_counts_equal(
            f'ppw_e2e_tap_mongodb{self.e2e_env.sf_schema_postfix}',
            'my_collection',
            my_coll_count,
        )
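
Here assert_row_counts_equal is a method on the test class rather than the assertions module: it checks a locally tracked document count against the replicated Snowflake table. A sketch of that shape, reusing the run_query_target_snowflake runner seen above:

def assert_mongo_row_count_sketch(run_query_target_snowflake, schema,
                                  table, expected_count):
    actual = run_query_target_snowflake(
        f'SELECT COUNT(_id) FROM {schema}.{table}')[0][0]
    # The insert/delete/update bookkeeping above predicts this count
    assert actual == expected_count, (
        f'{schema}.{table}: expected {expected_count} rows, got {actual}')
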
Example #6
    def test_replicate_pg_to_sf(self):
        """
        Resync tables from Postgres to Snowflake using the split-large-files option.
        """

        # 1. Run tap first time - both fastsync and singer should be triggered
        assertions.assert_run_tap_success(self.tap_id, self.target_id,
                                          ['fastsync', 'singer'])

        assertions.assert_row_counts_equal(
            self.e2e_env.run_query_tap_postgres,
            self.e2e_env.run_query_target_snowflake,
            schema_postfix=self.e2e_env.sf_schema_postfix,
        )

        assertions.assert_all_columns_exist(
            self.e2e_env.run_query_tap_postgres,
            self.e2e_env.run_query_target_snowflake,
            postgres_to_snowflake.tap_type_to_target_type,
            schema_postfix=self.e2e_env.sf_schema_postfix,
        )

        assertions.assert_date_column_naive_in_target(
            self.e2e_env.run_query_target_snowflake,
            'updated_at',
            f'ppw_e2e_tap_postgres{self.e2e_env.sf_schema_postfix}."TABLE_WITH_SPACE AND UPPERCASE"',
        )

        result = self.e2e_env.run_query_target_snowflake(
            f'SELECT updated_at FROM '
            f'ppw_e2e_tap_postgres{self.e2e_env.sf_schema_postfix}."TABLE_WITH_SPACE AND UPPERCASE" '
            f"where cvarchar='H';")[0][0]

        self.assertEqual(result, datetime(9999, 12, 31, 23, 59, 59, 998993))

        result = self.e2e_env.run_query_target_snowflake(
            f'SELECT updated_at FROM '
            f'ppw_e2e_tap_postgres{self.e2e_env.sf_schema_postfix}."TABLE_WITH_SPACE AND UPPERCASE" '
            f"where cvarchar='I';")[0][0]

        self.assertEqual(result, datetime(9999, 12, 31, 23, 59, 59, 998993))

        # 2. Make changes in PG source database
        #  LOG_BASED
        self.e2e_env.run_query_tap_postgres(
            'insert into public."table_with_space and UPPERCase" (cvarchar, updated_at) values '
            "('X', '2020-01-01 08:53:56.8+10'),"
            "('Y', '2020-12-31 12:59:00.148+00'),"
            "('faaaar future', '15000-05-23 12:40:00.148'),"
            "('BC', '2020-01-23 01:40:00 BC'),"
            "('Z', null),"
            "('W', '2020-03-03 12:30:00');")

        #  INCREMENTAL
        last_id = self.e2e_env.run_query_tap_postgres(
            'SELECT max(id) from public.city')[0][0]
        self.e2e_env.run_query_tap_postgres(
            'INSERT INTO public.city (id, name, countrycode, district, population) '
            f"VALUES ({last_id+1}, 'Bath', 'GBR', 'England', 88859)")

        self.e2e_env.run_query_tap_postgres(
            'UPDATE public.edgydata SET '
            "cjson = json '{\"data\": 1234}', "
            "cjsonb = jsonb '{\"data\": 2345}', "
            "cvarchar = 'Liewe Maatjies UPDATED' WHERE cid = 23")

        #  FULL_TABLE
        self.e2e_env.run_query_tap_postgres(
            "DELETE FROM public.country WHERE code = 'UMI'")

        # 3. Run tap second time - both fastsync and singer should be triggered; FULL_TABLE tables are always resynced
        assertions.assert_run_tap_success(self.tap_id,
                                          self.target_id,
                                          ['fastsync', 'singer'],
                                          profiling=True)

        assertions.assert_row_counts_equal(
            self.e2e_env.run_query_tap_postgres,
            self.e2e_env.run_query_target_snowflake,
            schema_postfix=self.e2e_env.sf_schema_postfix,
        )

        assertions.assert_all_columns_exist(
            self.e2e_env.run_query_tap_postgres,
            self.e2e_env.run_query_target_snowflake,
            postgres_to_snowflake.tap_type_to_target_type,
            schema_postfix=self.e2e_env.sf_schema_postfix,
        )

        assertions.assert_date_column_naive_in_target(
            self.e2e_env.run_query_target_snowflake,
            'updated_at',
            f'ppw_e2e_tap_postgres{self.e2e_env.sf_schema_postfix}."TABLE_WITH_SPACE AND UPPERCASE"',
        )

        result = self.e2e_env.run_query_target_snowflake(
            f'SELECT updated_at FROM '
            f'ppw_e2e_tap_postgres{self.e2e_env.sf_schema_postfix}."TABLE_WITH_SPACE AND UPPERCASE"'
            f" where cvarchar='X';")[0][0]

        self.assertEqual(result, datetime(2019, 12, 31, 22, 53, 56, 800000))

        result = self.e2e_env.run_query_target_snowflake(
            f'SELECT updated_at FROM '
            f'ppw_e2e_tap_postgres{self.e2e_env.sf_schema_postfix}."TABLE_WITH_SPACE AND UPPERCASE" '
            f"where cvarchar='faaaar future';")[0][0]

        self.assertEqual(result, datetime(9999, 12, 31, 23, 59, 59, 998993))

        result = self.e2e_env.run_query_target_snowflake(
            f'SELECT updated_at FROM '
            f'ppw_e2e_tap_postgres{self.e2e_env.sf_schema_postfix}."TABLE_WITH_SPACE AND UPPERCASE" '
            f"where cvarchar='BC';")[0][0]

        self.assertEqual(result, datetime(9999, 12, 31, 23, 59, 59, 998993))
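
The repeated expected value datetime(9999, 12, 31, 23, 59, 59, 998993) covers the rows whose source timestamps Snowflake cannot represent (the year-15000 and BC inserts above): FastSync replaces them with a sentinel maximum. A rough sketch of that idea, where the exact parsing and boundary handling are assumptions about FastSync's internals:

from datetime import datetime

MAX_TS = datetime(9999, 12, 31, 23, 59, 59, 998993)

def clamp_timestamp_sketch(raw):
    try:
        return datetime.fromisoformat(raw)
    except ValueError:
        # Unparseable or out-of-range values such as '15000-05-23 12:40:00.148'
        # or '2020-01-23 01:40:00 BC' fall back to the sentinel maximum
        return MAX_TS

assert clamp_timestamp_sketch('15000-05-23 12:40:00.148') == MAX_TS
assert clamp_timestamp_sketch('2020-03-03 12:30:00') == datetime(2020, 3, 3, 12, 30)
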