def test_replicate_s3_to_sf(self):
    """Replicate csv files from s3 to Snowflake, check if return code is zero
    and success log file created."""
    # The initial run and the repeat run behave identically here: both must
    # trigger fastsync plus the singer component, and after each run the
    # expected columns must be present in the target.
    for _ in range(2):
        assertions.assert_run_tap_success(
            self.tap_id, self.target_id, ['fastsync', 'singer']
        )
        self.assert_columns_exist()
def test_replicate_mariadb_replica_to_sf(self):
    """Test Replicate data from MariaDB to Snowflake."""
    # Hoist the repeated attribute chains once; the assertions below compare
    # the MariaDB replica source against the Snowflake target.
    query_source = self.e2e_env.run_query_tap_mysql_2
    query_target = self.e2e_env.run_query_target_snowflake
    postfix = self.e2e_env.sf_schema_postfix

    # A single run must trigger both fastsync and the singer component.
    assertions.assert_run_tap_success(
        self.tap_id, self.target_id, ['fastsync', 'singer']
    )

    # Row counts and column sets must match between source and target.
    assertions.assert_row_counts_equal(query_source, query_target, postfix)
    assertions.assert_all_columns_exist(
        query_source,
        query_target,
        mysql_to_snowflake.tap_type_to_target_type,
        schema_postfix=postfix,
    )
def test_replicate_mariadb_to_sf(self):
    """ Replicate data from MariaDB to Snowflake

    Runs the tap twice: first for the initial sync, then again after
    mutating the source so that LOG_BASED, INCREMENTAL and FULL_TABLE
    replication methods all have changes to pick up. Finally verifies
    that the configured transformations were applied in the target.
    """
    # 1. Run tap first time - both fastsync and a singer should be triggered
    assertions.assert_run_tap_success(
        self.tap_id, self.target_id, ['fastsync', 'singer']
    )
    assertions.assert_row_counts_equal(
        self.e2e_env.run_query_tap_mysql,
        self.e2e_env.run_query_target_snowflake,
        self.e2e_env.sf_schema_postfix,
    )
    assertions.assert_all_columns_exist(
        self.e2e_env.run_query_tap_mysql,
        self.e2e_env.run_query_target_snowflake,
        mysql_to_snowflake.tap_type_to_target_type,
        schema_postfix=self.e2e_env.sf_schema_postfix,
    )

    # 2. Make changes in MariaDB source database
    # LOG_BASED
    self.e2e_env.run_query_tap_mysql(
        'UPDATE weight_unit SET isactive = 0 WHERE weight_unit_id IN (2, 3, 4)'
    )
    # Insert rows containing edge-case payloads (multi-byte Unicode, JSON,
    # special/escaped characters, an embedded NUL character) to exercise
    # encoding handling end to end.
    self.e2e_env.run_query_tap_mysql(
        'INSERT INTO edgydata (c_varchar, `group`, `case`, cjson, c_time) VALUES'
        "('Lorem ipsum dolor sit amet', 10, 'A', '[]', '00:00:00'),"
        "('Thai: แผ่นดินฮั่นเสื่อมโทรมแสนสังเวช', 20, 'A', '{}', '12:00:59'),"
        "('Chinese: 和毛泽东 <<重上井冈山>>. 严永欣, 一九八八年.', null,'B', "
        '\'[{"key": "ValueOne", "actions": []}, {"key": "ValueTwo", "actions": []}]\','
        " '9:1:00'),"
        "('Special Characters: [\"\\,"
        "!@£$%^&*()]\\\\', null, 'B', "
        "null, '12:00:00'),"
        "(' ', 20, 'B', null, '15:36:10'),"
        "(CONCAT(CHAR(0x0000 using utf16), '<- null char'), 20, 'B', null, '15:36:10')"
    )
    self.e2e_env.run_query_tap_mysql('UPDATE all_datatypes SET c_point = NULL')

    # INCREMENTAL
    self.e2e_env.run_query_tap_mysql(
        'INSERT INTO address(isactive, street_number, date_created, date_updated,'
        ' supplier_supplier_id, zip_code_zip_code_id)'
        'VALUES (1, 1234, NOW(), NOW(), 0, 1234)'
    )
    self.e2e_env.run_query_tap_mysql(
        'UPDATE address SET street_number = 9999, date_updated = NOW()'
        ' WHERE address_id = 1'
    )
    # FULL_TABLE
    self.e2e_env.run_query_tap_mysql('DELETE FROM no_pk_table WHERE id > 10')

    # 3. Run tap second time - both fastsync and a singer should be triggered,
    # there are some FULL_TABLE
    assertions.assert_run_tap_success(
        self.tap_id, self.target_id, ['fastsync', 'singer']
    )
    assertions.assert_row_counts_equal(
        self.e2e_env.run_query_tap_mysql,
        self.e2e_env.run_query_target_snowflake,
        schema_postfix=self.e2e_env.sf_schema_postfix,
    )
    # blob_col is excluded from the column comparison — presumably it is not
    # replicated 1:1 into Snowflake; confirm against the assertion helper.
    assertions.assert_all_columns_exist(
        self.e2e_env.run_query_tap_mysql,
        self.e2e_env.run_query_target_snowflake,
        mysql_to_snowflake.tap_type_to_target_type,
        {'blob_col'},
        schema_postfix=self.e2e_env.sf_schema_postfix,
    )

    # Checking if mask-date transformation is working: every masked date
    # must have been normalized to January 1st.
    result = self.e2e_env.run_query_target_snowflake(
        f'SELECT count(1) FROM ppw_e2e_tap_mysql{self.e2e_env.sf_schema_postfix}.address '
        f'where MONTH(date_created) != 1 or DAY(date_created)::int != 1;'
    )[0][0]
    self.assertEqual(result, 0)

    # Checking if conditional MASK-NUMBER transformation is working
    result = self.e2e_env.run_query_target_snowflake(
        f'SELECT count(1) FROM ppw_e2e_tap_mysql{self.e2e_env.sf_schema_postfix}.address '
        f"where zip_code_zip_code_id != 0 and street_number REGEXP '[801]';"
    )[0][0]
    self.assertEqual(result, 0)

    # Checking if conditional SET-NULL transformation is working
    result = self.e2e_env.run_query_target_snowflake(
        f'SELECT count(1) FROM ppw_e2e_tap_mysql{self.e2e_env.sf_schema_postfix}.edgydata '
        f'where "GROUP" is not null and "CASE" = \'B\';'
    )[0][0]
    self.assertEqual(result, 0)
def test_replicate_pg_to_sf_with_archive_load_files(self):
    """ Fastsync tables from Postgres to Snowflake with archive load files enabled

    After a successful run, verifies for each replicated table that the
    expected number of load files landed in the S3 archive folder, that
    each archived object carries the expected metadata, and that the
    archived (gzipped) csv row count matches the source table.
    """
    # Start from a clean archive so file counts below are deterministic.
    self.delete_dangling_files_from_archive()
    assertions.assert_run_tap_success(self.tap_id, self.target_id, ['fastsync', 'singer'])

    # Expected number of archived load files per table, driven by the
    # table's replication method.
    expected_archive_files_count = {
        'public.city': 2,  # INCREMENTAL: fastsync and singer
        'public.country': 1,  # FULL_TABLE : fastsync only
        'public2.wearehere': 1,  # FULL_TABLE : fastsync only
    }
    # Assert expected files in archive folder
    for (
        schema_table,
        expected_archive_files,
    ) in expected_archive_files_count.items():
        schema, table = schema_table.split('.')
        files_in_s3_archive = self.get_files_from_s3_for_table(table)
        if (files_in_s3_archive is None
                or len(files_in_s3_archive) != expected_archive_files):
            raise Exception(
                f'files_in_archive for {table} is {files_in_s3_archive}.'
                f'Expected archive files count: {expected_archive_files}')

        # Assert expected metadata on the first archived object.
        archive_metadata = self.s3_client.head_object(
            Bucket=self.s3_bucket,
            Key=(files_in_s3_archive[0]['Key']))['Metadata']
        expected_metadata = {
            'tap': 'postgres_to_sf_archive_load_files',
            'schema': schema,
            'table': table,
            'archived-by': 'pipelinewise_fastsync_postgres_to_snowflake',
        }
        if archive_metadata != expected_metadata:
            raise Exception(
                f'archive_metadata for {table} is {archive_metadata}')

        # Assert expected file contents: download the gzipped csv and
        # compare its line count against the source table's row count.
        with tempfile.NamedTemporaryFile() as tmpfile:
            # Write via a separate handle on the same path; the original
            # tmpfile handle stays open at position 0, so gzip.open(tmpfile)
            # below reads the downloaded bytes from the start.
            with open(tmpfile.name, 'wb') as tmpf:
                self.s3_client.download_fileobj(
                    self.s3_bucket, files_in_s3_archive[0]['Key'], tmpf)
            with gzip.open(tmpfile, 'rt') as gzipfile:
                rows_in_csv = len(gzipfile.readlines())

        rows_in_table = self.e2e_env.run_query_tap_postgres(
            f'SELECT COUNT(1) FROM {schema_table}')[0][0]
        if rows_in_csv != rows_in_table:
            raise Exception(
                f'Rows in csv and db differ: {rows_in_csv} vs {rows_in_table}'
            )
def test_replicate_mongodb_to_sf(self): """ Test replicate MongoDB to Snowflake """ # Run tap first time - fastsync and singer should be triggered assertions.assert_run_tap_success(self.tap_id, self.target_id, ['fastsync', 'singer']) self.assert_columns_exist('listings') self.assert_columns_exist('my_collection') self.assert_columns_exist('all_datatypes') listing_count = self.mongodb_con['listings'].count_documents({}) my_coll_count = self.mongodb_con['my_collection'].count_documents({}) all_datatypes_count = self.mongodb_con[ 'all_datatypes'].count_documents({}) self.assert_row_counts_equal( f'ppw_e2e_tap_mongodb{self.e2e_env.sf_schema_postfix}', 'listings', listing_count, ) self.assert_row_counts_equal( f'ppw_e2e_tap_mongodb{self.e2e_env.sf_schema_postfix}', 'my_collection', my_coll_count, ) self.assert_row_counts_equal( f'ppw_e2e_tap_mongodb{self.e2e_env.sf_schema_postfix}', 'all_datatypes', all_datatypes_count, ) result_insert = self.mongodb_con.my_collection.insert_many([ { 'age': randint(10, 30), 'id': 1001, 'uuid': uuid.uuid4(), 'ts': bson.Timestamp(12030, 500), }, { 'date': datetime.utcnow(), 'id': 1002, 'uuid': uuid.uuid4(), 'regex': bson.Regex(r'^[A-Z]\\w\\d{2,6}.*$'), }, { 'uuid': uuid.uuid4(), 'id': 1003, 'decimal': bson.Decimal128(decimal.Decimal('5.64547548425446546546644')), 'nested_json': { 'a': 1, 'b': 3, 'c': { 'key': bson.datetime.datetime(2020, 5, 3, 10, 0, 0) }, }, }, ]) my_coll_count += len(result_insert.inserted_ids) result_del = self.mongodb_con.my_collection.delete_one( {'_id': result_insert.inserted_ids[0]}) my_coll_count -= result_del.deleted_count result_update = self.mongodb_con.my_collection.update_many( {}, {'$set': { 'id': 0 }}) assertions.assert_run_tap_success(self.tap_id, self.target_id, ['singer']) self.assertEqual( result_update.modified_count, self.e2e_env.run_query_target_snowflake( f'select count(_id) from ppw_e2e_tap_mongodb{self.e2e_env.sf_schema_postfix}.my_collection' f' where document:id = 0')[0][0], ) 
self.assert_row_counts_equal( f'ppw_e2e_tap_mongodb{self.e2e_env.sf_schema_postfix}', 'my_collection', my_coll_count, )
def test_replicate_pg_to_sf(self):
    """ Resync tables from Postgres to Snowflake using splitting large files option.

    Runs the tap twice: first for the initial sync, then again after
    mutating the source so LOG_BASED, INCREMENTAL and FULL_TABLE
    replication methods all have changes to pick up. Also checks how
    out-of-range / timezone-shifted timestamps land in the target.
    """
    assertions.assert_run_tap_success(self.tap_id, self.target_id, ['fastsync', 'singer'])
    assertions.assert_row_counts_equal(
        self.e2e_env.run_query_tap_postgres,
        self.e2e_env.run_query_target_snowflake,
        schema_postfix=self.e2e_env.sf_schema_postfix,
    )
    assertions.assert_all_columns_exist(
        self.e2e_env.run_query_tap_postgres,
        self.e2e_env.run_query_target_snowflake,
        postgres_to_snowflake.tap_type_to_target_type,
        schema_postfix=self.e2e_env.sf_schema_postfix,
    )
    assertions.assert_date_column_naive_in_target(
        self.e2e_env.run_query_target_snowflake,
        'updated_at',
        f'ppw_e2e_tap_postgres{self.e2e_env.sf_schema_postfix}."TABLE_WITH_SPACE AND UPPERCASE"',
    )
    # NOTE(review): datetime(9999, 12, 31, 23, 59, 59, 998993) appears to be
    # the sentinel the pipeline substitutes for timestamps it cannot
    # represent — confirm against the fastsync/singer timestamp handling.
    result = self.e2e_env.run_query_target_snowflake(
        f'SELECT updated_at FROM '
        f'ppw_e2e_tap_postgres{self.e2e_env.sf_schema_postfix}."TABLE_WITH_SPACE AND UPPERCASE" '
        f"where cvarchar='H';")[0][0]
    self.assertEqual(result, datetime(9999, 12, 31, 23, 59, 59, 998993))
    result = self.e2e_env.run_query_target_snowflake(
        f'SELECT updated_at FROM '
        f'ppw_e2e_tap_postgres{self.e2e_env.sf_schema_postfix}."TABLE_WITH_SPACE AND UPPERCASE" '
        f"where cvarchar='I';")[0][0]
    self.assertEqual(result, datetime(9999, 12, 31, 23, 59, 59, 998993))

    # 2. Make changes in PG source database
    # LOG_BASED: timestamps cover timezone offsets, a far-future year, a
    # BC date and NULL to exercise timestamp edge cases.
    self.e2e_env.run_query_tap_postgres(
        'insert into public."table_with_space and UPPERCase" (cvarchar, updated_at) values '
        "('X', '2020-01-01 08:53:56.8+10'),"
        "('Y', '2020-12-31 12:59:00.148+00'),"
        "('faaaar future', '15000-05-23 12:40:00.148'),"
        "('BC', '2020-01-23 01:40:00 BC'),"
        "('Z', null),"
        "('W', '2020-03-03 12:30:00');")
    # INCREMENTAL
    last_id = self.e2e_env.run_query_tap_postgres(
        'SELECT max(id) from public.city')[0][0]
    self.e2e_env.run_query_tap_postgres(
        'INSERT INTO public.city (id, name, countrycode, district, population) '
        f"VALUES ({last_id+1}, 'Bath', 'GBR', 'England', 88859)")
    self.e2e_env.run_query_tap_postgres(
        'UPDATE public.edgydata SET '
        "cjson = json '{\"data\": 1234}', "
        "cjsonb = jsonb '{\"data\": 2345}', "
        "cvarchar = 'Liewe Maatjies UPDATED' WHERE cid = 23")
    # FULL_TABLE
    self.e2e_env.run_query_tap_postgres(
        "DELETE FROM public.country WHERE code = 'UMI'")

    # 3. Run tap second time - both fastsync and a singer should be
    # triggered, there are some FULL_TABLE
    assertions.assert_run_tap_success(self.tap_id, self.target_id, ['fastsync', 'singer'], profiling=True)
    assertions.assert_row_counts_equal(
        self.e2e_env.run_query_tap_postgres,
        self.e2e_env.run_query_target_snowflake,
        schema_postfix=self.e2e_env.sf_schema_postfix,
    )
    assertions.assert_all_columns_exist(
        self.e2e_env.run_query_tap_postgres,
        self.e2e_env.run_query_target_snowflake,
        postgres_to_snowflake.tap_type_to_target_type,
        schema_postfix=self.e2e_env.sf_schema_postfix,
    )
    assertions.assert_date_column_naive_in_target(
        self.e2e_env.run_query_target_snowflake,
        'updated_at',
        f'ppw_e2e_tap_postgres{self.e2e_env.sf_schema_postfix}."TABLE_WITH_SPACE AND UPPERCASE"',
    )
    # '2020-01-01 08:53:56.8+10' converted to a naive UTC value.
    result = self.e2e_env.run_query_target_snowflake(
        f'SELECT updated_at FROM '
        f'ppw_e2e_tap_postgres{self.e2e_env.sf_schema_postfix}."TABLE_WITH_SPACE AND UPPERCASE"'
        f" where cvarchar='X';")[0][0]
    self.assertEqual(result, datetime(2019, 12, 31, 22, 53, 56, 800000))
    # Far-future and BC dates both collapse to the sentinel value above.
    result = self.e2e_env.run_query_target_snowflake(
        f'SELECT updated_at FROM '
        f'ppw_e2e_tap_postgres{self.e2e_env.sf_schema_postfix}."TABLE_WITH_SPACE AND UPPERCASE" '
        f"where cvarchar='faaaar future';")[0][0]
    self.assertEqual(result, datetime(9999, 12, 31, 23, 59, 59, 998993))
    result = self.e2e_env.run_query_target_snowflake(
        f'SELECT updated_at FROM '
        f'ppw_e2e_tap_postgres{self.e2e_env.sf_schema_postfix}."TABLE_WITH_SPACE AND UPPERCASE" '
        f"where cvarchar='BC';")[0][0]
    self.assertEqual(result, datetime(9999, 12, 31, 23, 59, 59, 998993))