Example #1
0
    def test__should__be_able_to_get_all_ads__when__run_multiple_times(self):
        """Page through ad group ads twice, then reset the query and re-read.

        Integration test against the live AdWords account configured via
        the environment. Verifies that consecutive pages differ and that
        re-issuing the fetch-all query restarts from the first page.
        """
        # Load secrets via env vars
        execfile("../../secrets.py")
        connection_settings = GoogleAdWordsConnectionSettings(
            client_id=os.getenv("adwords_client_id"),
            user_agent="Tester",
            client_customer_id=os.getenv("adwords_client_customer_id"),
            secrets_manager=GoogleAdWordsSecretsManager())
        ad_util = AdWordsAdGroupAdUtil(
            GoogleAdWordsConnectionManager(connection_settings))
        page_size = 10

        ad_util.set_query_to_fetch_all(page_size=page_size)
        first_page = ad_util.download_next_page_as_dict()
        print(first_page[0])
        self.assertEqual(page_size, len(first_page))

        second_page = ad_util.download_next_page_as_dict()
        print(second_page[0])
        self.assertEqual(page_size, len(second_page))
        # A new page must not start with the same record as the previous one.
        self.assertNotEqual(first_page[0], second_page[0])

        # Setting the query again resets the page position, so the next
        # download is expected to begin with the very first record again.
        ad_util.set_query_to_fetch_all(page_size=page_size)
        replayed_page = ad_util.download_next_page_as_dict()
        print(replayed_page[0])
        self.assertEqual(page_size, len(replayed_page))
        self.assertEqual(first_page[0], replayed_page[0])
    def test_the_transformation_works(self):
        """Check that a data frame is renamed/filtered per the column mapping.

        Only the columns listed in ``transformation_column_mapping`` are
        expected in the output (``def3`` and ``def5`` are unmapped), each
        under its AdWords field name. The state manager connection is a
        ``Mock`` because this test only exercises the in-memory
        transformation.
        """
        aws_conn = AwsConnectionSettings(region="us-east-1",
                                         secrets_manager=None,
                                         profile="default")
        execfile("../../secrets.py")

        state_manager_connection = Mock()
        column_mapping = {
            'abc': 'googleClickId',
            'def1': 'conversionName',
            'def2': 'conversionTime',
            'def4': 'conversionValue'
        }

        settings = AthenaToAdWordsOfflineConversionSettings(
            source_database=os.getenv("dummy_athena_database"),
            source_table=os.getenv("dummy_athena_table"),
            source_connection_settings=aws_conn,
            etl_identifier="xxxx",
            destination_batch_size=100,
            etl_state_manager_connection=state_manager_connection,
            etl_state_manager_keyspace="test",
            transformation_column_mapping=column_mapping,
            destination_connection_settings=GoogleAdWordsConnectionSettings(
                client_id=os.getenv("adwords_client_id"),
                user_agent="Tester",
                client_customer_id=os.getenv("adwords_client_customer_id"),
                secrets_manager=GoogleAdWordsSecretsManager()),
        )
        etl = AthenaToAdWordsOfflineConversion(settings)

        first_row = {"abc": "123", "def1": "123", "def2": "123",
                     "def3": "123", "def4": "123", "def5": "123"}
        second_row = {"abc": "222", "def1": "333", "def2": "444",
                      "def3": "333", "def4": "333", "def5": "333"}
        source_frame = DataFrame([first_row, second_row])

        transformed = etl._data_frame_to_destination_dict(source_frame)

        expected_first = {'conversionName': '123', 'conversionTime': '123',
                          'conversionValue': '123', 'googleClickId': '123'}
        expected_second = {'conversionName': '333', 'conversionTime': '444',
                           'conversionValue': '333', 'googleClickId': '222'}
        self.assertListEqual(transformed, [expected_first, expected_second])
    def test__should__get_correct_estimations__with__etl_get_parallel_payloads(
            self):
        """Check AdWordsToAthena worker payload estimation, then create the table.

        Integration test: asks the ETL to split the ``AdGroupAdService``
        result set across three workers with a page size of 1000, compares
        the payloads with the expected split, creates the Athena table and
        prints its Glue metadata.
        """
        # Load secrets via env vars. This happens first so that every
        # environment read below (S3_TEST_BUCKET, the secrets managers)
        # sees whatever secrets.py provides, matching the sibling tests.
        execfile("../../secrets.py")

        aws_setting = AwsConnectionSettings(
            region="us-east-1",
            secrets_manager=AwsSecretsManager(),
            profile=None)
        target_bucket = os.getenv('S3_TEST_BUCKET')
        target_key_prefix = "something/test"

        adwords_settings = GoogleAdWordsConnectionSettings(
            client_id=os.getenv("adwords_client_id"),
            user_agent="Tester",
            client_customer_id=os.getenv("adwords_client_customer_id"),
            secrets_manager=GoogleAdWordsSecretsManager())
        target_table = "test_adwords_to_athena_table_creation"
        etl_settings = AdWordsToAthenaSettings(
            source_query_fragment=ServiceQueryBuilder().Select('Id').OrderBy(
                'Id'),
            source_service="AdGroupAdService",
            source_service_version="v201809",
            source_connection_settings=adwords_settings,
            target_bucket=target_bucket,
            target_key_prefix=target_key_prefix,
            target_connection_settings=aws_setting,
            target_database="dev",
            target_table=target_table,
            target_table_ddl_progress=True,
            is_partitioned_table=True,
            partition_values=[("abc", "def"), ("pqr", 123)])
        etl = AdWordsToAthena(etl_settings)

        actual_payloads = etl.get_parallel_payloads(page_size=1000,
                                                    number_of_workers=3)
        # NOTE(review): 393 pages per worker presumably reflects the size of
        # the live account when the test was written -- confirm before
        # relying on it.
        expected_payloads = [{
            'number_of_pages': 393,
            'page_size': 1000,
            'start_index': 0,
            'worker': 0
        }, {
            'number_of_pages': 393,
            'page_size': 1000,
            'start_index': 393000,
            'worker': 1
        }, {
            'number_of_pages': 393,
            'page_size': 1000,
            'start_index': 786000,
            'worker': 2
        }]
        self.assertListEqual(expected_payloads, actual_payloads)

        etl.create_athena_table()
        conn = AwsConnectionManager(aws_setting)
        au = AthenaUtil("dev", conn)
        # The metadata is only printed for manual inspection; nothing is
        # asserted about it.
        actual = au.get_glue_table_metadata(target_table)
        print(actual)
    def test_full_integration_with_local_cassandra(self):
        """Run the Athena-to-AdWords offline conversion upload end to end.

        Starts the ``cassandra`` service through docker-compose (port 9042)
        to act as the ETL state store, builds the ETL from environment
        driven settings, and asserts that ``upload_all`` returns an empty
        list.
        """
        aws_conn = AwsConnectionSettings(region="us-east-1",
                                         secrets_manager=None,
                                         profile="default")
        execfile("../../secrets.py")

        compose = DockerCompose(filepath=os.path.dirname(base.__file__))
        with compose:
            cassandra_host = compose.get_service_host("cassandra", 9042)
            cassandra_port = int(compose.get_service_port("cassandra", 9042))

            # The local container is addressed with empty credentials.
            cassandra_secrets = CassandraSecretsManager(
                source=DictKeyValueSource({
                    "CASSANDRA_USERNAME": "",
                    "CASSANDRA_PASSWORD": "",
                }))
            cassandra_conn_setting = CassandraConnectionSettings(
                cluster_ips=[cassandra_host],
                port=cassandra_port,
                load_balancing_policy=DCAwareRoundRobinPolicy(),
                secrets_manager=cassandra_secrets,
            )

            # Presumably waits until Cassandra accepts connections; the
            # returned connection is not used further in this test.
            conn = verify_container_is_up(cassandra_conn_setting)

            column_mapping = {
                'google_click_id': 'googleClickId',
                'conversion_name': 'conversionName',
                'conversion_time': 'conversionTime',
                'conversion_value': 'conversionValue',
                'conversion_currency_code': 'conversionCurrencyCode'
            }
            settings = AthenaToAdWordsOfflineConversionSettings(
                source_database=os.getenv("dummy_athena_database"),
                source_table=os.getenv("dummy_athena_table"),
                source_connection_settings=aws_conn,
                etl_identifier="test",
                destination_batch_size=100,
                etl_state_manager_connection=cassandra_conn_setting,
                etl_state_manager_keyspace="test",
                transformation_column_mapping=column_mapping,
                destination_connection_settings=GoogleAdWordsConnectionSettings(
                    client_id=os.getenv("adwords_client_id"),
                    user_agent="Tester",
                    client_customer_id=os.getenv("adwords_client_customer_id"),
                    secrets_manager=GoogleAdWordsSecretsManager()),
            )
            etl = AthenaToAdWordsOfflineConversion(settings)

            # The listing is requested but its content is not asserted.
            files_actual = etl.list_source_files()

            upload_issues = etl.upload_all()
            self.assertListEqual(upload_issues, [])
Example #5
0
 def test__should__be_able_to_estimate_splits__when__run_with_subclass(
         self):
     """Build an AdWords connection and an ad-group-ad util from env settings.

     NOTE(review): despite the name, no split estimation is performed and
     nothing is asserted; the test only checks that construction does not
     raise.
     """
     # Load secrets via env vars
     execfile("../../secrets.py")
     connection_settings = GoogleAdWordsConnectionSettings(
         client_id=os.getenv("adwords_client_id"),
         user_agent="Tester",
         client_customer_id=os.getenv("adwords_client_customer_id"),
         secrets_manager=GoogleAdWordsSecretsManager())
     conn = GoogleAdWordsConnectionManager(connection_settings)
     print(conn)
     ad_util = AdWordsAdGroupAdUtil(conn)
Example #6
0
    def test__should__create_s3_file_for_the_given_indices(self):
        """Transfer one page of ad groups to S3 and check the resulting key.

        Integration test: clears ``target_key_prefix`` in the test bucket,
        transfers a single page of 1000 ``AdGroupService`` records starting
        at index 35000, and expects exactly one parquet object named after
        the index range under that prefix.
        """
        # Load secrets via env vars
        execfile("../../secrets.py")

        aws_setting = AwsConnectionSettings(
            region="us-east-1",
            secrets_manager=AwsSecretsManager(),
            profile=None)
        target_bucket = os.getenv('S3_TEST_BUCKET')
        target_key_prefix = "something/test"
        conn = AwsConnectionManager(aws_setting)
        s3u = S3Util(conn=conn, bucket=target_bucket)
        # Start from a clean prefix so the key listing only reflects this run.
        s3u.delete_recursive(target_key_prefix)
        adwords_settings = GoogleAdWordsConnectionSettings(
            client_id=os.getenv("adwords_client_id"),
            user_agent="Tester",
            client_customer_id=os.getenv("adwords_client_customer_id"),
            secrets_manager=GoogleAdWordsSecretsManager())

        adword_to_s3_util = AdWordsToS3(settings=AdWordsToS3Settings(
            source_query_fragment=ServiceQueryBuilder().Select(
                # Attributes
                'BaseAdGroupId',
                'Id',
                'CampaignId',
                'CampaignName',
                'Name',
                'Status',
                'ContentBidCriterionTypeGroup',
                'BaseCampaignId',
                'TrackingUrlTemplate',
                'FinalUrlSuffix',
                'UrlCustomParameters',
                'AdGroupType').OrderBy('Id'),
            source_service="AdGroupService",
            source_service_version="v201809",
            source_connection_settings=adwords_settings,
            target_bucket=target_bucket,
            target_key_prefix=target_key_prefix,
            target_file_prefix=None,
            target_connection_settings=aws_setting))
        adword_to_s3_util.build_query(start_index=35000,
                                      page_size=1000,
                                      num_iterations=1)
        adword_to_s3_util.transfer_all()
        actual = s3u.get_keys(target_key_prefix)
        # The expected key is derived from the prefix the data was written
        # under; a hard-coded key under a different prefix can never match
        # the listing of ``target_key_prefix``.
        expected = [target_key_prefix + '/index_35000__35999.parquet']

        self.assertListEqual(expected, actual)
Example #7
0
    def test_adwords_data_upload_for_offline_conversion(self):
        """Upload three offline conversions and check the success/failure split.

        Integration test against the live AdWords account. Two conversions
        share one click id (with different times) and one uses the
        placeholder click id ``'xxx'``; the test expects two uploads to
        succeed and one to fail.
        """
        # Load secrets via env vars
        execfile("../../secrets.py")
        connection_settings = GoogleAdWordsConnectionSettings(
            client_id=os.getenv("adwords_client_id"),
            user_agent="Tester",
            client_customer_id=os.getenv("adwords_client_customer_id"),
            secrets_manager=GoogleAdWordsSecretsManager())
        adwords_util = AdWordsOfflineConversionUtil(
            GoogleAdWordsConnectionManager(connection_settings))

        shared_click_id = (
            'Cj0KCQiAqY3zBRDQARIsAJeCVxOIyZ8avQ0he3WIpHPwV6hRn'
            '-8Y2gDrUBJcc95tDdLcE35TK1mhhmIaAgZGEALw_wcB')

        def conversion(click_id, conversion_time):
            # Every conversion in this test shares name, value and currency.
            return {
                'googleClickId': click_id,
                'conversionName': 'claim_attempts_testing',
                'conversionTime': conversion_time,
                'conversionValue': 17.0,
                'conversionCurrencyCode': 'AUD',
            }

        conversions = [
            conversion('xxx', '20200309 074357 UTC'),
            conversion(shared_click_id, '20200309 074353 UTC'),
            conversion(shared_click_id, '20200309 023001 UTC'),
        ]
        uploaded, failed = adwords_util.upload_conversions(conversions)

        print(uploaded, failed)
        self.assertEqual(len(uploaded), 2)
        self.assertEqual(len(failed), 1)
Example #8
0
    def test__should__get_correct_estimations__with__etl_get_parallel_payloads(
            self):
        """Check how AdWordsToS3 splits the source across three workers.

        Integration test: with a page size of 1000 and three workers, each
        worker is expected to receive the same number of pages and a start
        index offset by that many pages.
        """
        # Load secrets via env vars
        execfile("../../secrets.py")

        aws_setting = AwsConnectionSettings(
            region="us-east-1",
            secrets_manager=AwsSecretsManager(),
            profile=None)
        bucket = os.getenv('S3_TEST_BUCKET')
        key_prefix = "something/test"

        adwords_settings = GoogleAdWordsConnectionSettings(
            client_id=os.getenv("adwords_client_id"),
            user_agent="Tester",
            client_customer_id=os.getenv("adwords_client_customer_id"),
            secrets_manager=GoogleAdWordsSecretsManager())

        id_query = ServiceQueryBuilder().Select('Id').OrderBy('Id')
        etl = AdWordsToS3(AdWordsToS3Settings(
            source_query_fragment=id_query,
            source_service="AdGroupAdService",
            source_service_version="v201809",
            source_connection_settings=adwords_settings,
            target_bucket=bucket,
            target_key_prefix=key_prefix,
            target_file_prefix=None,
            target_connection_settings=aws_setting))

        page_size = 1000
        worker_count = 3
        # NOTE(review): 435 pages per worker presumably reflects the size of
        # the live account when the test was written -- confirm.
        pages_per_worker = 435

        actual_payloads = etl.get_parallel_payloads(
            page_size=page_size, number_of_workers=worker_count)
        expected_payloads = [
            {
                'worker': worker,
                'start_index': worker * pages_per_worker * page_size,
                'number_of_pages': pages_per_worker,
                'page_size': page_size,
            }
            for worker in range(worker_count)
        ]

        self.assertListEqual(expected_payloads, actual_payloads)
    def test__should__create_table__with__a_general_report(self):
        """Transfer a negative-keyword report to Athena and query it back.

        Integration test: runs the report ETL (transfer, table creation,
        partition registration) and then selects up to 10 rows from the
        created table, expecting 11 entries in the result set.
        """
        # Load secrets via env vars. This happens first so that every
        # environment read below (S3_TEST_BUCKET, the secrets managers)
        # sees whatever secrets.py provides, matching the sibling tests.
        execfile("../../secrets.py")

        aws_setting = AwsConnectionSettings(
            region="us-east-1",
            secrets_manager=AwsSecretsManager(),
            profile=None)
        target_bucket = os.getenv('S3_TEST_BUCKET')
        target_key_prefix = "something/test"

        adwords_settings = GoogleAdWordsConnectionSettings(
            client_id=os.getenv("adwords_client_id"),
            user_agent="Tester",
            client_customer_id=os.getenv("adwords_client_customer_id"),
            secrets_manager=GoogleAdWordsSecretsManager())
        target_table = "test_adwords_negative_report"
        etl_settings = AdWordsReportsToAthenaSettings(
            source_query=(ReportQueryBuilder().Select(
                'AccountDescriptiveName', 'CampaignId', 'CampaignName',
                'CampaignStatus', 'Id', 'KeywordMatchType', 'Criteria').From(
                    'CAMPAIGN_NEGATIVE_KEYWORDS_PERFORMANCE_REPORT').Build()),
            source_include_zero_impressions=True,
            source_connection_settings=adwords_settings,
            target_bucket=target_bucket,
            target_key_prefix=target_key_prefix,
            target_connection_settings=aws_setting,
            target_database="dev",
            target_table=target_table,
            target_table_ddl_progress=True,
            is_partitioned_table=True,
            partition_values=[("abc", "def"), ("pqr", 123)],
            target_file_prefix="data",
            transformation_field_type_mask=None)
        etl = AdWordsReportsToAthena(etl_settings)
        etl.transfer()
        etl.create_athena_table()
        etl.add_partitions()

        au = AthenaUtil(database="dev",
                        conn=AwsConnectionManager(aws_setting),
                        output_bucket=os.environ["S3_TEST_BUCKET"])
        actual = au.run_query(query_string="""
        select * from dev.test_adwords_negative_report limit 10
        """,
                              return_result=True)
        print(actual)
        # The query is limited to 10 rows; the extra entry is presumably the
        # header row Athena includes in the result set -- confirm.
        expected = 11

        self.assertEqual(expected, len(actual["ResultSet"]["Rows"]))
Example #10
0
 def test__should__be_able_to_get_all_campaigns__with_one_account(self):
     """Download every campaign of the configured account and count them.

     Integration test against the live AdWords account configured via the
     environment.
     """
     # Load secrets via env vars
     execfile("../../secrets.py")
     connection_settings = GoogleAdWordsConnectionSettings(
         client_id=os.getenv("adwords_client_id"),
         user_agent="Tester",
         client_customer_id=os.getenv("adwords_client_customer_id"),
         secrets_manager=GoogleAdWordsSecretsManager())
     campaign_util = AdWordsCampaignUtil(
         GoogleAdWordsConnectionManager(connection_settings))

     campaign_util.set_query_to_fetch_all()
     campaigns = campaign_util.download_all_as_dict()
     print(campaigns)
     # NOTE(review): 2581 presumably reflects the live account at the time
     # the test was written -- confirm.
     self.assertEqual(2581, len(campaigns))
Example #11
0
    def test_local_credentials_are_able_to_connect_to_adwords(self):
        """Connect without a client customer id and list reachable customers.

        Integration test: the connection is built with
        ``client_customer_id=None`` and the customer listing is expected to
        contain three entries.
        """
        # Load secrets via env vars
        execfile("../../secrets.py")
        connection_settings = GoogleAdWordsConnectionSettings(
            client_id=os.getenv("adwords_client_id"),
            user_agent="Tester",
            client_customer_id=None,
            secrets_manager=GoogleAdWordsSecretsManager())
        customer_util = AdWordsCustomerUtil(
            GoogleAdWordsConnectionManager(connection_settings))

        customers = customer_util.get_customers()
        print(customers)
        # NOTE(review): 3 presumably reflects the credentials used when the
        # test was written -- confirm.
        self.assertEqual(len(customers), 3)
Example #12
0
 def test__should__be_able_to_get_report_fields__when__choosing_one_report_type(
         self):
     """Fetch the field definitions of the negative-keywords report type.

     Integration test: expects 13 fields for
     ``CAMPAIGN_NEGATIVE_KEYWORDS_PERFORMANCE_REPORT``.
     """
     # Load secrets via env vars
     execfile("../../secrets.py")
     connection_settings = GoogleAdWordsConnectionSettings(
         client_id=os.getenv("adwords_client_id"),
         user_agent="Tester",
         client_customer_id=os.getenv("adwords_client_customer_id"),
         secrets_manager=GoogleAdWordsSecretsManager())
     definition_reader = AdWordsReportDefinitionReader(
         conn=GoogleAdWordsConnectionManager(connection_settings))

     report_fields = definition_reader.get_report_fields(
         "CAMPAIGN_NEGATIVE_KEYWORDS_PERFORMANCE_REPORT")
     self.assertEqual(13, len(report_fields))
Example #13
0
 def test__should__be_able_to_get_report_stream__when__choosing_one_query(
         self):
     """Run an AWQL campaign performance query and check the frame shape.

     Integration test: selects three columns from
     ``CAMPAIGN_PERFORMANCE_REPORT`` for ``YESTERDAY`` and compares the
     resulting data frame shape with a fixed value.
     """
     # Load secrets via env vars
     execfile("../../secrets.py")
     connection_settings = GoogleAdWordsConnectionSettings(
         client_id=os.getenv("adwords_client_id"),
         user_agent="Tester",
         client_customer_id=os.getenv("adwords_client_customer_id"),
         secrets_manager=GoogleAdWordsSecretsManager())
     report_reader = AdWordsReportReader(
         conn=GoogleAdWordsConnectionManager(connection_settings))

     selected_columns = adwords.ReportQueryBuilder().Select(
         'AdNetworkType1', 'Impressions', 'Clicks')
     report_query = selected_columns.From(
         'CAMPAIGN_PERFORMANCE_REPORT').During('YESTERDAY').Build()

     report_frame = report_reader.awql_to_dataframe(query=report_query)
     print(report_frame)
     # NOTE(review): the row count is tied to one particular day of live
     # data and will presumably drift -- confirm.
     self.assertEqual((17046, 3), report_frame.shape)
Example #14
0
 def test__negative_keyword_reports(self):
     """Download the campaign negative-keywords report as a data frame.

     Integration test: selects seven columns from
     ``CAMPAIGN_NEGATIVE_KEYWORDS_PERFORMANCE_REPORT`` and compares the
     resulting data frame shape with a fixed value.
     """
     # Load secrets via env vars
     execfile("../../secrets.py")
     connection_settings = GoogleAdWordsConnectionSettings(
         client_id=os.getenv("adwords_client_id"),
         user_agent="Tester",
         client_customer_id=os.getenv("adwords_client_customer_id"),
         secrets_manager=GoogleAdWordsSecretsManager())
     report_reader = AdWordsReportReader(
         GoogleAdWordsConnectionManager(connection_settings))

     selected_columns = adwords.ReportQueryBuilder().Select(
         'AccountDescriptiveName', 'CampaignId', 'CampaignName',
         'CampaignStatus', 'Id', 'KeywordMatchType', 'Criteria')
     report_query = selected_columns.From(
         'CAMPAIGN_NEGATIVE_KEYWORDS_PERFORMANCE_REPORT').Build()

     report_frame = report_reader.awql_to_dataframe(query=report_query)
     print(report_frame)
     # NOTE(review): the row count presumably reflects the live account at
     # the time the test was written -- confirm.
     self.assertEqual((125493, 7), report_frame.shape)
Example #15
0
    def test__should__read_all_accounts__with__parent_id(self):
        """List all managed accounts under the root customer id.

        Integration test: connects with ``adwords_client_root_customer_id``
        and checks both the plain account listing and its data frame form
        against fixed sizes.
        """
        # Load secrets via env vars
        execfile("../../secrets.py")
        root_settings = GoogleAdWordsConnectionSettings(
            client_id=os.getenv("adwords_client_id"),
            user_agent="Tester",
            client_customer_id=os.getenv("adwords_client_root_customer_id"),
            secrets_manager=GoogleAdWordsSecretsManager())
        account_util = AdWordsManagedCustomerUtil(
            GoogleAdWordsConnectionManager(root_settings))
        # NOTE(review): 58 accounts presumably reflects the live hierarchy
        # at the time the test was written -- confirm.
        account_count = 58

        accounts = account_util.get_all_accounts()
        print(accounts)
        self.assertEqual(account_count, len(accounts))

        accounts_frame = account_util.get_all_accounts_as_dataframe()
        print(accounts_frame)
        self.assertEqual((account_count, 8), accounts_frame.shape)
Example #16
0
    def test__should__transfer_correct_amount_of_files__with__one_parallel_fragment(
            self):
        """Transfer two pages of five records and check the S3 keys written.

        Integration test: clears the target prefix, runs the AdWords-to-S3
        ETL for two iterations of page size 5 starting at index 0, and
        expects one parquet object per page, named after its index range.
        """
        # Load secrets via env vars
        execfile("../../secrets.py")
        aws_setting = AwsConnectionSettings(
            region="us-east-1",
            secrets_manager=AwsSecretsManager(),
            profile=None)
        bucket = os.getenv('S3_TEST_BUCKET')
        key_prefix = "tmp/test/hip_data_tools/adwords_to_s3/test"
        s3u = S3Util(conn=AwsConnectionManager(aws_setting), bucket=bucket)
        # Start from a clean prefix so the key listing only reflects this run.
        s3u.delete_recursive(key_prefix)
        adwords_settings = GoogleAdWordsConnectionSettings(
            client_id=os.getenv("adwords_client_id"),
            user_agent="Tester",
            client_customer_id=os.getenv("adwords_client_customer_id"),
            secrets_manager=GoogleAdWordsSecretsManager())

        id_query = ServiceQueryBuilder().Select('Id').OrderBy('Id')
        etl = AdWordsToS3(AdWordsToS3Settings(
            source_query_fragment=id_query,
            source_service="AdGroupAdService",
            source_service_version="v201809",
            source_connection_settings=adwords_settings,
            target_bucket=bucket,
            target_key_prefix=key_prefix,
            target_file_prefix=None,
            target_connection_settings=aws_setting))
        etl.build_query(start_index=0, page_size=5, num_iterations=2)
        etl.transfer_all()

        written_keys = s3u.get_keys(key_prefix)
        print(written_keys)
        self.assertListEqual(
            [
                'tmp/test/hip_data_tools/adwords_to_s3/test/index_0__4.parquet',
                'tmp/test/hip_data_tools/adwords_to_s3/test/index_5__9.parquet'
            ],
            written_keys)
    def test_adwords_upload_with_duplicates_in_same_batch(self):
        """Process a batch containing duplicate conversions and check the issue list.

        Uses a Cassandra container started through docker-compose as the
        ETL state store. The batch holds five rows, three of which share a
        click id; the row that repeats both the click id and the conversion
        time of an earlier row is expected to be reported back with a
        state-transition error, and nothing else.
        """
        aws_conn = AwsConnectionSettings(region="us-east-1",
                                         secrets_manager=None,
                                         profile="default")
        execfile("../../secrets.py")

        repeated_click_id = (
            "Cj0KCQiAqY3zBRDQARIsAJeCVxOIyZ8avQ0he3WIpHPwV6hRn"
            "-8Y2gDrUBJcc95tDdLcE35TK1mhhmIaAgZGEALw_wcB")

        def conversion_row(click_id, conversion_time, conversion_value):
            # Name and currency are the same for every row in this test.
            return {
                'googleClickId': click_id,
                'conversionName': 'claim_attempts_testing',
                'conversionTime': conversion_time,
                'conversionValue': conversion_value,
                'conversionCurrencyCode': 'AUD',
            }

        compose = DockerCompose(filepath=os.path.dirname(base.__file__))
        with compose:
            cassandra_host = compose.get_service_host("cassandra", 9042)
            cassandra_port = int(compose.get_service_port("cassandra", 9042))

            # The local container is addressed with empty credentials.
            cassandra_secrets = CassandraSecretsManager(
                source=DictKeyValueSource({
                    "CASSANDRA_USERNAME": "",
                    "CASSANDRA_PASSWORD": "",
                }))
            cassandra_conn_setting = CassandraConnectionSettings(
                cluster_ips=[cassandra_host],
                port=cassandra_port,
                load_balancing_policy=DCAwareRoundRobinPolicy(),
                secrets_manager=cassandra_secrets,
            )

            verify_container_is_up(cassandra_conn_setting)

            # Source columns already carry the AdWords field names, so the
            # mapping is the identity.
            destination_fields = (
                'googleClickId',
                'conversionName',
                'conversionTime',
                'conversionValue',
                'conversionCurrencyCode',
            )
            settings = AthenaToAdWordsOfflineConversionSettings(
                source_database=os.getenv("dummy_athena_database"),
                source_table=os.getenv("dummy_athena_table"),
                source_connection_settings=aws_conn,
                etl_identifier="xxxx",
                destination_batch_size=100,
                etl_state_manager_connection=cassandra_conn_setting,
                etl_state_manager_keyspace="test",
                transformation_column_mapping={
                    field: field for field in destination_fields
                },
                destination_connection_settings=GoogleAdWordsConnectionSettings(
                    client_id=os.getenv("adwords_client_id"),
                    user_agent="Tester",
                    client_customer_id=os.getenv("adwords_client_customer_id"),
                    secrets_manager=GoogleAdWordsSecretsManager()),
            )
            etl = AthenaToAdWordsOfflineConversion(settings)

            batch = DataFrame([
                conversion_row('xxx', '20200309 074357 UTC', 17.0),
                conversion_row(repeated_click_id, '20200309 074353 UTC', 17.0),
                # Duplicate with same time
                conversion_row(repeated_click_id, '20200309 074353 UTC', 14.0),
                # Duplicate with diff time
                conversion_row(repeated_click_id, '20200309 084353 UTC', 14.0),
                conversion_row(
                    "EAIaIQobChMI6oiGy_vz5wIVkjUrCh3IcgAuEAAYASAAEgLRk_D_BwE",
                    '20200309 023001 UTC', 17.0),
            ])

            issues = etl._process_data_frame(batch)

            # The duplicate with same time has been picked out as an issue
            expected_issues = [{
                'error':
                "Current State 'EtlStates.Processing' cannot transition to "
                "'EtlStates.Processing'",
                'data':
                conversion_row(repeated_click_id, '20200309 074353 UTC', 14.0),
            }]
            self.assertListEqual(issues, expected_issues)
    def test_multiple_runs_of_same_data_and_verify_deduplication(self):
        aws_conn = AwsConnectionSettings(region="us-east-1",
                                         secrets_manager=None,
                                         profile="default")
        execfile("../../secrets.py")

        compose = DockerCompose(filepath=os.path.dirname(base.__file__))
        with compose:
            host = compose.get_service_host("cassandra", 9042)
            port = int(compose.get_service_port("cassandra", 9042))

            cassandra_conn_setting = CassandraConnectionSettings(
                cluster_ips=[host],
                port=port,
                load_balancing_policy=DCAwareRoundRobinPolicy(),
                secrets_manager=CassandraSecretsManager(
                    source=DictKeyValueSource({
                        "CASSANDRA_USERNAME": "",
                        "CASSANDRA_PASSWORD": "",
                    })),
            )

            conn = verify_container_is_up(cassandra_conn_setting)
            # conn.get_session('system').execute(""" DROP TABLE test.etl_sink_record_state""")

            settings = AthenaToAdWordsOfflineConversionSettings(
                source_database=os.getenv("dummy_athena_database"),
                source_table=os.getenv("dummy_athena_table"),
                source_connection_settings=aws_conn,
                etl_identifier="test",
                destination_batch_size=100,
                etl_state_manager_connection=cassandra_conn_setting,
                etl_state_manager_keyspace="test",
                transformation_column_mapping={
                    'google_click_id': 'googleClickId',
                    'conversion_name': 'conversionName',
                    'conversion_time': 'conversionTime',
                    'conversion_value': 'conversionValue',
                    'conversion_currency_code': 'conversionCurrencyCode'
                },
                destination_connection_settings=GoogleAdWordsConnectionSettings(
                    client_id=os.getenv("adwords_client_id"),
                    user_agent="Tester",
                    client_customer_id=os.getenv("adwords_client_customer_id"),
                    secrets_manager=GoogleAdWordsSecretsManager()),
            )
            etl = AthenaToAdWordsOfflineConversion(settings)
            source_data = [
                {
                    'google_click_id': 'theFirst',
                    'conversion_name': 'claim_attempts_testing',
                    'conversion_time': '20200309 074357 UTC',
                    'conversion_value': 17.0,
                    'conversion_currency_code': 'AUD',
                },
                {
                    'google_click_id': 'failedSecond',
                    'conversion_name': 'claim_attempts_testing',
                    'conversion_time': '20200309 074357 UTC',
                    'conversion_value': 17.0,
                    'conversion_currency_code': 'AUD',
                },
            ]
            test_df = DataFrame(source_data)
            #  Mock upload_conversions in AdWordsUtil so no actual data is transmitted
            etl._upload_conversions = MagicMock(return_value=([
                {
                    'googleClickId': 'theFirst',
                    'conversionName': 'claim_attempts_testing',
                    'conversionTime': '20200309 074357 UTC',
                    'conversionValue': 17.0,
                    'conversionCurrencyCode': 'AUD',
                },
            ], [
                {
                    'fieldPath':
                    'operations[0].operand',
                    'fieldPathElements': [{
                        'field': 'operations',
                        'index': 0
                    }, {
                        'field': 'operand',
                        'index': None
                    }],
                    'trigger':
                    None,
                    'errorString':
                    'OfflineConversionError.UNPARSEABLE_GCLID',
                    'ApiError.Type':
                    'OfflineConversionError',
                    'reason':
                    'UNPARSEABLE_GCLID',
                    'data': {
                        'googleClickId': 'failedSecond',
                        'conversionName': 'claim_attempts_testing',
                        'conversionTime': '20200309 074357 UTC',
                        'conversionValue': 17.0,
                        'conversionCurrencyCode': 'AUD',
                    },
                },
            ]))
            # etl._process_data_frame(test_df)
            first_actual = etl._process_data_frame(test_df)
            self.assertListEqual(first_actual, [])

            # Repeat process to cause Duplicates
            actual = etl._process_data_frame(test_df)
            # actual = etl.upload_next()
            expected = [{
                'data': {
                    'conversionCurrencyCode': 'AUD',
                    'conversionName': 'claim_attempts_testing',
                    'conversionTime': '20200309 074357 UTC',
                    'conversionValue': 17.0,
                    'googleClickId': 'theFirst'
                },
                'error': 'Current state is not Ready'
            }, {
                'data': {
                    'conversionCurrencyCode': 'AUD',
                    'conversionName': 'claim_attempts_testing',
                    'conversionTime': '20200309 074357 UTC',
                    'conversionValue': 17.0,
                    'googleClickId': 'failedSecond'
                },
                'error': 'Current state is not Ready'
            }]

            self.assertListEqual(actual, expected)
    def test__should__create_table__with__geo_performance_report(self):
        """Transfer an AdWords geo performance report to Athena and query it.

        Builds a ``GEO_PERFORMANCE_REPORT`` query for 20200601-20200701, runs
        the ``AdWordsReportsToAthena`` ETL (transfer, Athena table creation,
        partition registration) and then checks how many rows a ``limit 10``
        select against the resulting table returns.

        NOTE(review): this looks like an integration test -- it reads the
        ``adwords_client_id`` and ``S3_TEST_BUCKET`` environment variables and
        presumably needs live AdWords/AWS credentials; confirm before running
        it in CI.
        """
        aws_setting = AwsConnectionSettings(
            region="ap-southeast-2",
            secrets_manager=AwsSecretsManager(
                access_key_id_var="SOME_CUSTOM_AWS_ACCESS_KEY_ID",
                secret_access_key_var="SOME_CUSTOM_AWS_SECRET_ACCESS_KEY",
                use_session_token=True,
                aws_session_token_var="SOME_CUSTOM_AWS_SESSION_TOKEN"),
            profile=None)
        target_bucket = "test-bucket"
        target_key_prefix = "something/test"

        # Load secrets via env vars
        execfile("../../secrets.py")
        adwords_settings = GoogleAdWordsConnectionSettings(
            client_id=os.getenv("adwords_client_id"),
            user_agent="Tester",
            client_customer_id="1111111111",
            secrets_manager=GoogleAdWordsSecretsManager())
        target_table = "test_adwords_geo_performance_report"
        etl_settings = AdWordsReportsToAthenaSettings(
            source_query=(
                ReportQueryBuilder().Select(
                    # Attributes
                    'AccountDescriptiveName',
                    'CampaignId',
                    'CityCriteriaId',
                    'CountryCriteriaId',
                    'CustomerDescriptiveName',
                    'ExternalCustomerId',
                    'IsTargetingLocation',
                    'MetroCriteriaId',
                    'MostSpecificCriteriaId',
                    'RegionCriteriaId',

                    # Segments
                    'Date',

                    # Metrics
                    'Impressions',
                    'Clicks',
                    'ConversionRate',
                    'Conversions',
                    'ConversionValue',
                    'Cost',
                    'CostPerConversion').From('GEO_PERFORMANCE_REPORT').During(
                        start_date="20200601", end_date="20200701").Build()),
            source_include_zero_impressions=False,
            source_connection_settings=adwords_settings,
            target_bucket=target_bucket,
            target_key_prefix=target_key_prefix,
            target_connection_settings=aws_setting,
            target_database="dev",
            target_table=target_table,
            target_table_ddl_progress=True,
            is_partitioned_table=True,
            partition_values=[("abc", "def"), ("pqr", 123)],
            target_file_prefix="data",
            # Builtin ``int`` replaces ``np.int``: that name was only an alias
            # for ``int``, deprecated in NumPy 1.20 and removed in 1.24, so
            # the old spelling raises AttributeError on current NumPy.
            transformation_field_type_mask={
                "country__territory": int,
                "region": int,
                "most_specific_location": int
            })
        etl = AdWordsReportsToAthena(etl_settings)
        etl.transfer()
        etl.create_athena_table()
        etl.add_partitions()

        au = AthenaUtil(database="dev",
                        conn=AwsConnectionManager(aws_setting),
                        output_bucket=os.environ["S3_TEST_BUCKET"])
        actual = au.run_query(query_string="""
            select * from dev.test_adwords_geo_performance_report limit 10
            """,
                              return_result=True)
        print(actual)
        # NOTE(review): 11 rather than 10 -- presumably the Athena result set
        # carries a header row ahead of the 10 selected rows; confirm.
        expected = 11

        self.assertEqual(expected, len(actual["ResultSet"]["Rows"]))