Code example #1
    def test_nested_create_gather_updates_query(self, project_id):
        """Tests QueryCreator's ability to create a query with nested fields
        to gather updates.

        Args:
            project_id(str): ID of the project that holds the test BQ tables.

        Returns:
            True if test passes, else False.
        """
        if not project_id:
            raise Exception(
                'Test needs project_id to pass. '
                'Add --project_id={your project ID} to test command')
        abs_path = os.path.abspath(os.path.dirname(__file__))
        schema_path = os.path.join(abs_path,
                                   'test_schemas/test_nested_schema.json')
        test_query_creator = query_creator.QueryCreator(
            schema_path=schema_path,
            user_id_field_name=self.user_id_field_name,
            ingest_timestamp_field_name=self.ingest_timestamp_field_name,
            project_id=project_id,
            dataset_id=self.dataset_id,
            updates_table_id=self.user_info_updates_id,
            temp_updates_table_id=self.temp_user_info_updates_id,
            final_table_id=self.user_info_final_id)

        gather_updates_query = test_query_creator.create_gather_updates_query()
        test_gather_updates_query_path = os.path.join(
            abs_path, 'test_queries/test_nested_gather_updates_query.txt')
        with open(test_gather_updates_query_path, 'r') as input_file:
            # Fill in the project ID; passing '{0:s}' as the second argument
            # re-emits that placeholder literally, so the expected query keeps
            # it for later formatting.
            expected_gather_updates_query = input_file.read().format(
                project_id, '{0:s}')
        assert gather_updates_query == expected_gather_updates_query
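
The project_id argument is expected to arrive as a pytest fixture backed by a --project_id command-line flag, per the error message above. A minimal conftest.py sketch of that wiring (an assumption; the actual suite may set this up differently):

import pytest

def pytest_addoption(parser):
    # Registers the --project_id flag that the tests' error message refers to.
    parser.addoption('--project_id', action='store', default=None,
                     help='ID of the project that holds the test BQ tables')

@pytest.fixture
def project_id(request):
    # Hands the flag's value to any test that declares a project_id argument.
    return request.config.getoption('--project_id')

With this in place, the suite would be invoked as, for example, pytest --project_id=my-test-project.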
Code example #2
    def test_second_update(self, project_id):
        """Tests UserInfoUpdater's ability to run an update after at least
        one update has already been run.

        Args:
            project_id(str): ID of the project that holds the test BQ tables.

        Returns:
            True if test passes, else False.
        """
        if not project_id:
            raise Exception(
                'Test needs project_id to pass. '
                'Add --project_id={your project ID} to test command')
        # Load a second set of user updates to the user_info_updates table to
        # simulate a second iteration.
        self.load_csv_to_bq(filename='test_data/user_info_updates_data_2.csv',
                            table=self.dataset_ref.table(
                                self.user_info_updates_id))

        # Load data into the temp table and final table so that they will
        # contain the same data they did at the end of the
        # test_initial_update() test.
        self.load_csv_to_bq(
            filename='test_data/temp_user_info_updates_expected_1.csv',
            table=self.dataset_ref.table(self.temp_user_info_updates_id))

        self.load_csv_to_bq(
            filename='test_data/user_info_final_expected_1.csv',
            table=self.dataset_ref.table(self.user_info_final_id),
        )

        # Run the UserInfoUpdater on the second set of updates.
        test_updater = user_info_updater.UserInfoUpdater(
            project_id, self.dataset_id, self.user_info_updates_id,
            self.temp_user_info_updates_id, self.user_info_final_id)
        update_query_creator = query_creator.QueryCreator(
            schema_path=self.schema_path,
            user_id_field_name='userId',
            ingest_timestamp_field_name='ingestTimestamp',
            project_id=project_id,
            dataset_id=self.dataset_id,
            updates_table_id=self.user_info_updates_id,
            temp_updates_table_id=self.temp_user_info_updates_id,
            final_table_id=self.user_info_final_id)

        gather_updates_query = (
            update_query_creator.create_gather_updates_query())
        test_updater.gather_updates(gather_updates_query)
        merge_updates_query = update_query_creator.create_merge_query()
        test_updater.merge_updates(merge_updates_query)

        # Query the temp table to test that the gather_updates() function worked
        temp_table_query_config = bigquery.QueryJobConfig()
        temp_table_query_config.use_legacy_sql = False
        temp_table_query = self.bq_client.query(
            query='SELECT * FROM `{0:s}.{1:s}.{2:s}`'.format(
                project_id, self.dataset_id, self.temp_user_info_updates_id),
            job_config=temp_table_query_config,
            location='US')
        temp_table_query.result()
        temp_table_results_df = temp_table_query.to_dataframe() \
            .sort_values(by=['userId']).reset_index(drop=True)
        # Gather expected results for comparison
        abs_path = os.path.abspath(os.path.dirname(__file__))
        expected_temp_data_file = os.path.join(
            abs_path, 'test_data/temp_user_info_updates_expected_2.csv')
        expected_temp_table_df = pd.read_csv(expected_temp_data_file)
        # convert ingestTimestamp to datetime
        expected_temp_table_df['ingestTimestamp'] = pd.to_datetime(
            expected_temp_table_df['ingestTimestamp'])

        # Compare results
        pd.testing.assert_frame_equal(temp_table_results_df,
                                      expected_temp_table_df)

        # Query the final table to test that the merge_updates() function worked
        final_table_query_config = bigquery.QueryJobConfig()
        final_table_query_config.use_legacy_sql = False
        final_table_query = self.bq_client.query(
            query='SELECT * FROM `{0:s}.{1:s}.{2:s}`'.format(
                project_id, self.dataset_id, self.user_info_final_id),
            job_config=final_table_query_config,
            location='US')
        final_table_query.result()
        final_table_results_df = final_table_query.to_dataframe() \
            .sort_values(by=['userId']).reset_index(drop=True)
        # Gather expected results for comparison
        expected_final_data_file = os.path.join(
            abs_path, 'test_data/user_info_final_expected_2.csv')
        expected_final_table_df = pd.read_csv(expected_final_data_file)
        # convert ingestTimestamp to datetime
        expected_final_table_df['ingestTimestamp'] = pd.to_datetime(
            expected_final_table_df['ingestTimestamp'])

        # Compare results
        pd.testing.assert_frame_equal(final_table_results_df,
                                      expected_final_table_df)
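
These tests lean on a load_csv_to_bq helper that is not shown in the excerpts. A plausible minimal version, assuming the test class holds a bq_client, that the CSV paths are relative to the test file, and that each test CSV carries a header row:

import os

from google.cloud import bigquery

def load_csv_to_bq(self, filename, table):
    # Sketch only; the suite's actual helper may differ.
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.skip_leading_rows = 1  # assumes a header row in the test CSVs
    abs_path = os.path.abspath(os.path.dirname(__file__))
    with open(os.path.join(abs_path, filename), 'rb') as csv_file:
        load_job = self.bq_client.load_table_from_file(
            csv_file, table, job_config=job_config)
    load_job.result()  # block until the load job finishes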
Code example #3
    def test_initial_update(self, project_id):
        """Tests UserInfoUpdater's ability to run an initial update.

        Args:
            project_id(str): ID of the project that holds the test BQ tables.

        Returns:
            True if test passes, else False.
        """
        if not project_id:
            raise Exception(
                'Test needs project_id to pass. '
                'Add --project_id={your project ID} to test command')
        # Load the first set of user updates to user_info_updates table.
        # All other tables should be empty at this point.

        self.load_csv_to_bq(filename='test_data/user_info_updates_data_1.csv',
                            table=self.dataset_ref.table(
                                self.user_info_updates_id))
        # Run the UserInfoUpdater on the first set of updates.
        test_updater = user_info_updater.UserInfoUpdater(
            project_id, self.dataset_id, self.user_info_updates_id,
            self.temp_user_info_updates_id, self.user_info_final_id)

        # Get Queries to Run
        update_query_creator = query_creator.QueryCreator(
            schema_path=self.schema_path,
            user_id_field_name='userId',
            ingest_timestamp_field_name='ingestTimestamp',
            project_id=project_id,
            dataset_id=self.dataset_id,
            updates_table_id=self.user_info_updates_id,
            temp_updates_table_id=self.temp_user_info_updates_id,
            final_table_id=self.user_info_final_id)

        gather_updates_query = (
            update_query_creator.create_gather_updates_query())
        test_updater.gather_updates(gather_updates_query)
        merge_updates_query = update_query_creator.create_merge_query()
        test_updater.merge_updates(merge_updates_query)

        # Query the temp table to test that the gather_updates() function worked
        temp_table_query_config = bigquery.QueryJobConfig()
        temp_table_query_config.use_legacy_sql = False
        temp_table_query = self.bq_client.query(
            query='SELECT * FROM `{0:s}.{1:s}.{2:s}`'.format(
                project_id, self.dataset_id, self.temp_user_info_updates_id),
            job_config=temp_table_query_config,
            location='US')
        temp_table_query.result()
        temp_table_results_df = temp_table_query.to_dataframe()\
            .sort_values(by=['userId']).reset_index(drop=True)

        # Gather expected results for comparison
        abs_path = os.path.abspath(os.path.dirname(__file__))
        expected_temp_data_file = os.path.join(
            abs_path, 'test_data/temp_user_info_updates_expected_1.csv')
        expected_temp_table_df = pd.read_csv(expected_temp_data_file)
        # convert ingestTimestamp to datetime
        expected_temp_table_df['ingestTimestamp'] = pd.to_datetime(
            expected_temp_table_df['ingestTimestamp'])

        # Compare results
        pd.testing.assert_frame_equal(temp_table_results_df,
                                      expected_temp_table_df)

        # Query the final table to test that the merge_updates() function worked
        final_table_query_config = bigquery.QueryJobConfig()
        final_table_query_config.use_legacy_sql = False
        final_table_query = self.bq_client.query(
            query='SELECT * FROM `{0:s}.{1:s}.{2:s}`'.format(
                project_id, self.dataset_id, self.user_info_final_id),
            job_config=final_table_query_config,
            location='US')
        final_table_query.result()
        final_table_results_df = final_table_query.to_dataframe() \
            .sort_values(by=['userId']).reset_index(drop=True)
        # Gather expected results for comparison
        expected_final_data_file = os.path.join(
            abs_path, 'test_data/user_info_final_expected_1.csv')
        expected_final_table_df = pd.read_csv(expected_final_data_file)
        expected_final_table_df['ingestTimestamp'] = pd.to_datetime(
            expected_final_table_df['ingestTimestamp'])

        # Compare results
        pd.testing.assert_frame_equal(final_table_results_df,
                                      expected_final_table_df)
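
The two generated queries follow a common change-capture pattern: the gather step keeps only the newest row per user, and the merge step upserts those rows into the final table. Purely illustrative sketches of the shape such queries might take, assuming the flat userId/ingestTimestamp schema these tests use (QueryCreator builds the real queries from the schema file, so the actual SQL may differ):

# Hypothetical shapes of the generated queries -- not QueryCreator's output.
GATHER_UPDATES_SQL = '''
    SELECT * EXCEPT (row_num)
    FROM (
        SELECT *, ROW_NUMBER() OVER (
            PARTITION BY userId ORDER BY ingestTimestamp DESC) AS row_num
        FROM `{project}.{dataset}.user_info_updates`)
    WHERE row_num = 1
'''

MERGE_UPDATES_SQL = '''
    MERGE `{project}.{dataset}.user_info_final` AS final
    USING `{project}.{dataset}.temp_user_info_updates` AS temp
    ON final.userId = temp.userId
    WHEN MATCHED THEN
        UPDATE SET ingestTimestamp = temp.ingestTimestamp
    WHEN NOT MATCHED THEN
        INSERT ROW
'''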
Code example #4
    def test_nested_data_user_update(self, project_id):
        """Tests UserInfoUpdater's ability to run an update on nested user
        data.

        Args:
            project_id(str): ID of the project that holds the test BQ tables.

        Returns:
            True if test passes, else False.
        """
        if not project_id:
            raise Exception(
                'Test needs project_id to pass. '
                'Add --project_id={your project ID} to test command')
        # Load a set of user updates to the nested user_info_updates table.

        self.load_json_to_bq(
            filename='test_data/nested_data/user_info_updates_data.json',
            table=self.dataset_ref.table(self.user_info_updates_id))

        # Load data into the temp table and final table to simulate a
        # previous run.
        self.load_json_to_bq(filename='test_data/nested_data/'
                             'temp_user_info_updates_initial.json',
                             table=self.dataset_ref.table(
                                 self.temp_user_info_updates_id))
        self.load_json_to_bq(
            filename='test_data/nested_data/user_info_final_initial.json',
            table=self.dataset_ref.table(self.user_info_final_id))

        # Run the UserInfoUpdater on the nested set of updates.
        test_updater = user_info_updater.UserInfoUpdater(
            project_id, self.dataset_id, self.user_info_updates_id,
            self.temp_user_info_updates_id, self.user_info_final_id)
        update_query_creator = query_creator.QueryCreator(
            schema_path=self.schema_path,
            user_id_field_name='userId',
            ingest_timestamp_field_name='ingestTimestamp',
            project_id=project_id,
            dataset_id=self.dataset_id,
            updates_table_id=self.user_info_updates_id,
            temp_updates_table_id=self.temp_user_info_updates_id,
            final_table_id=self.user_info_final_id)

        gather_updates_query = (
            update_query_creator.create_gather_updates_query())
        test_updater.gather_updates(gather_updates_query)
        merge_updates_query = update_query_creator.create_merge_query()
        test_updater.merge_updates(merge_updates_query)

        # Query the temp table to test that the gather_updates() function worked
        temp_table_query_config = bigquery.QueryJobConfig()
        temp_table_query_config.use_legacy_sql = False
        temp_table_query = self.bq_client.query(
            query='SELECT * FROM `{0:s}.{1:s}.{2:s}`'.format(
                project_id, self.dataset_id, self.temp_user_info_updates_id),
            job_config=temp_table_query_config,
            location='US')
        temp_table_query.result()
        temp_table_results_df = temp_table_query.to_dataframe() \
            .sort_values(by=['userId']).reset_index(drop=True)

        # Gather expected results for comparison
        abs_path = os.path.abspath(os.path.dirname(__file__))
        expected_temp_data_file = os.path.join(
            abs_path,
            'test_data/nested_data/temp_user_info_updates_expected.json')
        expected_temp_table_df = pd.read_json(expected_temp_data_file)
        # Reorder columns since read_json() reads them alphabetically
        with open(self.schema_path, 'r') as f:
            json_schema = json.loads(f.read())
        col_list = [str(col['name']) for col in json_schema['fields']]
        expected_temp_table_df = expected_temp_table_df[col_list]

        # convert ingestTimestamp to datetime
        expected_temp_table_df['ingestTimestamp'] = pd.to_datetime(
            expected_temp_table_df['ingestTimestamp'])

        # Compare results
        pd.testing.assert_frame_equal(temp_table_results_df,
                                      expected_temp_table_df)

        # Query the final table to test that the merge_updates() function worked
        final_table_query_config = bigquery.QueryJobConfig()
        final_table_query_config.use_legacy_sql = False
        final_table_query = self.bq_client.query(
            query='SELECT * FROM `{0:s}.{1:s}.{2:s}`'.format(
                project_id, self.dataset_id, self.user_info_final_id),
            job_config=final_table_query_config,
            location='US')
        final_table_query.result()
        final_table_results_df = final_table_query.to_dataframe() \
            .sort_values(by=['userId']).reset_index(drop=True)
        # Gather expected results for comparison
        expected_final_data_file = os.path.join(
            abs_path, 'test_data/nested_data/user_info_final_expected.json')
        expected_final_table_df = pd.read_json(expected_final_data_file)

        # Reorder columns using the schema-derived col_list computed above
        expected_final_table_df = expected_final_table_df[col_list]

        # convert ingestTimestamp to datetime
        expected_final_table_df['ingestTimestamp'] = pd.to_datetime(
            expected_final_table_df['ingestTimestamp'])

        # Compare results
        pd.testing.assert_frame_equal(final_table_results_df,
                                      expected_final_table_df)
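
Because pd.read_json() does not preserve the on-disk column order, the nested test derives the expected order from the BigQuery schema file. That lookup is a natural candidate for a small helper; a sketch, assuming the schema JSON uses the {'fields': [{'name': ...}, ...]} layout the test already reads:

import json

def schema_column_order(schema_path):
    # Returns column names in the order declared in a BigQuery schema file.
    # Sketch of a possible helper; the test above inlines this logic instead.
    with open(schema_path, 'r') as schema_file:
        json_schema = json.load(schema_file)
    return [str(col['name']) for col in json_schema['fields']]

A caller would then write expected_df = expected_df[schema_column_order(self.schema_path)].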