def _calc_label_stats_cat_feature(self) -> pd.DataFrame:
    """Queries label statistics broken down by categorical feature values.

    Fills the `calc_num_label_stats` SQL template with the features table
    path, the label column and the struct column list built from the
    categorical features, then runs the resulting query.

    Returns:
      label_stats: Output of the executed query.
    """
    logging.info('Calculating statistics from label.')

    # Step 1: assemble the query text from the template.
    logging.info('Creating the sql code.')
    struct_columns_sql = self._create_struct_column_list_sql(
        self._categorical_feature_list)
    template_values = dict(
        bq_features_table=self._features_table_path,
        label_column=self._label_column,
        sql_code_segment=struct_columns_sql,
    )
    template_path = _NUMERICAL_LABEL_SQL_FILES['calc_num_label_stats']
    query_text = viz_utils.patch_sql(template_path, template_values)
    logging.info('Finished creating the sql code.')

    # Step 2: run it through the BigQuery client held by this instance.
    logging.info('Executing the sql code.')
    label_stats = viz_utils.execute_sql(self._bq_client, query_text)
    logging.info('Finished executing the sql code.')
    return label_stats
def _extract_numerical_feature_sample(self) -> pd.DataFrame:
    """Pulls a sample of values for the selected numerical features.

    The SQL file at `_EXTRACT_NUM_FEATURE_SAMPLE_SQL_PATH` is configured
    with the features table, label column, both class labels, the number of
    positive/negative instances and the numerical column list, and is then
    executed.

    Returns:
      sample: Output of the executed query.
    """
    logging.info('Extracting a random sample of numerical features.')

    # Build the query text.
    logging.info('Creating the sql code.')
    column_list_sql = self._create_column_list_sql(
        self._numerical_feature_list)
    template_values = dict(
        bq_features_table=self._features_table_path,
        label_column=self._label_column,
        positive_class_label=self._positive_class_label,
        negative_class_label=self._negative_class_label,
        num_pos_instances=self._num_pos_instances,
        num_neg_instances=self._num_neg_instances,
        sql_code_segment=column_list_sql,
    )
    query_text = utils.configure_sql(
        _EXTRACT_NUM_FEATURE_SAMPLE_SQL_PATH, template_values)
    logging.info('Finished creating the sql code.')

    # Run the query.
    logging.info('Executing the sql code.')
    sample = viz_utils.execute_sql(self._bq_client, query_text)
    logging.info('Finished executing the sql code.')
    return sample
def _calc_categorical_feature_stats(self) -> pd.DataFrame:
    """Queries statistics for the selected categorical features.

    The `calc_cat_feature_stats` template is taken from the binary-label
    SQL files when the label type is 'binary' and from the numerical-label
    SQL files otherwise.

    Returns:
      feature_stats: Output of the executed query.
    """
    logging.info('Calculating statistics from categorical features.')

    logging.info('Creating the sql code.')
    struct_columns_sql = self._create_struct_column_list_sql(
        self._categorical_feature_list)
    template_values = dict(
        bq_features_table=self._features_table_path,
        sql_code_segment=struct_columns_sql,
    )
    # Only the set of SQL files depends on the label type; the key is shared.
    if self._label_type == 'binary':
        sql_files = _BINARY_LABEL_SQL_FILES
    else:
        sql_files = _NUMERICAL_LABEL_SQL_FILES
    template_path = sql_files['calc_cat_feature_stats']
    query_text = viz_utils.patch_sql(template_path, template_values)
    logging.info('Finished creating the sql code.')

    logging.info('Executing the sql code.')
    feature_stats = viz_utils.execute_sql(self._bq_client, query_text)
    logging.info('Finished executing the sql code.')
    return feature_stats
def test_execute_sql_returns_pd_dataframe(self):
    # Arrange: the mocked query job hands back TESTDATA_1 from to_dataframe().
    query_job = self.mock_bq_client.query.return_value
    query_job.to_dataframe.return_value = TESTDATA_1
    sql_text = 'SELECT * FROM project.dataset.table;'

    # Act.
    actual = viz_utils.execute_sql(self.mock_bq_client, sql_text)

    # Assert: result() was called exactly once on the job and the frame
    # came back unchanged.
    query_job.result.assert_called_once()
    pd.testing.assert_frame_equal(actual, TESTDATA_1)
def _extract_numerical_feature_sample(self) -> pd.DataFrame:
    """Pulls a sample of values for the selected numerical features.

    For a 'binary' label type the binary-label `extract_num_feature`
    template is used and is given both class labels (string labels are
    wrapped in single quotes) plus the positive/negative instance counts.
    For any other label type the numerical-label template is used with the
    overall instance count.

    Returns:
      sample: Output of the executed query.
    """

    def _as_sql_literal(label):
        # Strings are wrapped in single quotes; other values pass through.
        return f"'{label}'" if isinstance(label, str) else label

    logging.info('Extracting a random sample of numerical features.')

    logging.info('Creating the sql code.')
    column_list_sql = self._create_column_list_sql(
        self._numerical_feature_list)
    template_values = dict(
        bq_features_table=self._features_table_path,
        label_column=self._label_column,
        column_list_sql=column_list_sql,
    )

    if self._label_type == 'binary':
        template_path = _BINARY_LABEL_SQL_FILES['extract_num_feature']
        label_specific_values = dict(
            positive_class_label=_as_sql_literal(self._positive_class_label),
            negative_class_label=_as_sql_literal(self._negative_class_label),
            num_pos_instances=self._num_pos_instances,
            num_neg_instances=self._num_neg_instances,
        )
    else:
        template_path = _NUMERICAL_LABEL_SQL_FILES['extract_num_feature']
        label_specific_values = dict(num_instances=self._num_instances)
    template_values.update(label_specific_values)

    query_text = viz_utils.patch_sql(template_path, template_values)
    logging.info('Finished creating the sql code.')

    logging.info('Executing the sql code.')
    sample = viz_utils.execute_sql(self._bq_client, query_text)
    logging.info('Finished executing the sql code.')
    return sample
def _calc_numerical_fact_stats(self) -> pd.DataFrame:
    """Queries statistics for the selected numerical fact variables.

    Configures the SQL file at `_CALC_NUM_FACT_STATS_SQL_PATH` with the
    numerical facts table path, executes it and converts the `date` column
    of the output with `pd.to_datetime`.

    Returns:
      fact_stats: Query output with the converted `date` column.
    """
    logging.info('Calculating statistics from numerical facts.')
    logging.info('Reading the sql query from the file.')
    template_values = dict(bq_facts_table=self._numerical_facts_table_path)
    query_text = utils.configure_sql(
        _CALC_NUM_FACT_STATS_SQL_PATH, template_values)
    fact_stats = viz_utils.execute_sql(self._bq_client, query_text)
    logging.info('Finished calculating statistics from numerical facts.')

    # Replace the `date` column in place with its datetime conversion.
    date_column = pd.to_datetime(fact_stats['date'])
    fact_stats['date'] = date_column
    return fact_stats
def _calc_categorical_fact_stats(self) -> pd.DataFrame:
    """Queries statistics for the selected categorical fact variables.

    Configures the SQL file at `_CALC_CAT_FACT_STATS_SQL_PATH` with the
    facts table path, the categorical facts and the number of top levels,
    then executes it.

    Returns:
      fact_stats: Output of the executed query.
    """
    logging.info('Calculating statistics from categorical facts.')
    logging.info('Reading the sql query from the file.')
    template_values = dict(
        bq_facts_table=self._facts_table_path,
        categorical_fact_list=self._categorical_facts,
        number_top_levels=self._number_top_levels,
    )
    query_text = utils.configure_sql(
        _CALC_CAT_FACT_STATS_SQL_PATH, template_values)
    fact_stats = viz_utils.execute_sql(self._bq_client, query_text)
    logging.info('Finished calculating statistics from categorical facts.')
    return fact_stats
def _calc_categorical_feature_stats(self) -> pd.DataFrame:
    """Queries statistics for the selected categorical features.

    Configures the SQL file at `_CALC_CAT_FEATURE_STATS_SQL_PATH` with the
    features table path and the struct column list built from the
    categorical features, then executes it.

    Returns:
      feature_stats: Output of the executed query.
    """
    logging.info('Calculating statistics from categorical features.')

    # Build the query text.
    logging.info('Creating the sql code.')
    struct_columns_sql = self._create_struct_column_list_sql(
        self._categorical_feature_list)
    template_values = dict(
        bq_features_table=self._features_table_path,
        sql_code_segment=struct_columns_sql,
    )
    query_text = utils.configure_sql(
        _CALC_CAT_FEATURE_STATS_SQL_PATH, template_values)
    logging.info('Finished creating the sql code.')

    # Run the query.
    logging.info('Executing the sql code.')
    feature_stats = viz_utils.execute_sql(self._bq_client, query_text)
    logging.info('Finished executing the sql code.')
    return feature_stats