def setUp(self) -> None:
    """
    Build the TableColumnStats fixture and the serialized node / relation
    dictionaries the tests compare against.

    The expected node carries the stat value under the 'stat_val:UNQUOTED'
    key as the integer 1, while the fixture itself is created with the
    string '1'.
    """
    super(TestTableStats, self).setUp()

    stat_key = 'hive://gold.base/test/col/avg/'
    column_key = 'hive://gold.base/test/col'

    self.table_stats = TableColumnStats(
        table_name='base.test',
        col_name='col',
        stat_name='avg',
        stat_val='1',
        start_epoch='1',
        end_epoch='2',
    )

    # Serialized form of the single 'Stat' node.
    self.expected_node_result = {
        NODE_KEY: stat_key,
        NODE_LABEL: 'Stat',
        'stat_val:UNQUOTED': 1,
        'stat_name': 'avg',
        'start_epoch': '1',
        'end_epoch': '2',
    }

    # Serialized form of the Stat -> Column relation (and its reverse type).
    self.expected_relation_result = {
        RELATION_START_KEY: stat_key,
        RELATION_START_LABEL: 'Stat',
        RELATION_END_KEY: column_key,
        RELATION_END_LABEL: 'Column',
        RELATION_TYPE: 'STAT_OF',
        RELATION_REVERSE_TYPE: 'STAT',
    }
def _render_column_stats(self, index_name: str, spec: Dict[str, Any]) -> List[TableColumnStats]:
    """
    Convert one column's statistics mapping into TableColumnStats records.

    :param index_name: value used as ``table_name`` of every produced record
    :param spec: mapping that holds the column name under the ``'name'`` key
        and one entry per statistic; it is only read, never modified
    :return: one TableColumnStats per statistic whose value is neither a
        dict, a list, nor the string ``'NaN'``
    :raises KeyError: if ``spec`` has no ``'name'`` entry
    """
    # Read the name without popping it so the caller's dict stays intact.
    col_name = spec['name']

    result: List[TableColumnStats] = []
    for stat_name, stat_val in spec.items():
        if stat_name == 'name':
            # The column name itself is not a statistic.
            continue
        if isinstance(stat_val, (dict, list)):
            # Nested structures cannot be stored as a single stat value.
            continue
        if stat_val == 'NaN':
            continue

        result.append(TableColumnStats(table_name=index_name,
                                       col_name=col_name,
                                       stat_name=stat_name,
                                       stat_val=stat_val,
                                       start_epoch='0',
                                       end_epoch='0',
                                       db=self.database,
                                       cluster=self.cluster,
                                       schema=self.schema))

    return result
def _get_extract_iter(self) -> Any:
    """
    Yield a TableColumnStats for every report statistic that has a mapping.

    The report comes from ``self._load_report()``. Its ``'variables'`` entry
    is walked column by column; a statistic is emitted only when
    ``self.stat_mappings`` holds a truthy ``(name, modifier)`` pair for it.
    Float values are passed through ``self.round_value`` before the modifier
    is applied. ``start_epoch`` is the parsed ``analysis.date_start`` of the
    report and ``end_epoch`` is always ``'0'``.
    """
    report = self._load_report()
    variables = report.get('variables', dict())
    analysis = report.get('analysis', dict())
    report_time = self.parse_date(analysis.get('date_start'))

    for column_name, column_stats in variables.items():
        for raw_name, raw_value in column_stats.items():
            mapping = self.stat_mappings.get(raw_name)
            if not mapping:
                # Statistic is not configured for extraction.
                continue

            stat_name, stat_modifier = mapping
            value = self.round_value(raw_value) if isinstance(raw_value, float) else raw_value

            yield TableColumnStats(table_name=self.table_name,
                                   col_name=column_name,
                                   stat_name=stat_name,
                                   stat_val=stat_modifier(value),
                                   start_epoch=report_time,
                                   end_epoch='0',
                                   db=self.database_name,
                                   cluster=self.cluster_name,
                                   schema=self.schema_name)
class TestTableStats(unittest.TestCase):
    """Checks key generation and neo4j serialization of one TableColumnStats."""

    def setUp(self) -> None:
        """Create the stat fixture and the serialized results it should produce."""
        super().setUp()
        self.table_stats = TableColumnStats(
            table_name='base.test',
            col_name='col',
            stat_name='avg',
            stat_val='1',
            start_epoch='1',
            end_epoch='2',
        )

        self.expected_node_result = {
            NODE_KEY: 'hive://gold.base/test/col/avg/',
            NODE_LABEL: 'Stat',
            'stat_val': '1',
            'stat_name': 'avg',
            'start_epoch': '1',
            'end_epoch': '2',
        }

        self.expected_relation_result = {
            RELATION_START_KEY: 'hive://gold.base/test/col/avg/',
            RELATION_START_LABEL: 'Stat',
            RELATION_END_KEY: 'hive://gold.base/test/col',
            RELATION_END_LABEL: 'Column',
            RELATION_TYPE: 'STAT_OF',
            RELATION_REVERSE_TYPE: 'STAT',
        }

    def test_get_table_stat_model_key(self) -> None:
        """The stat key is the column key followed by the stat name and a slash."""
        table_stats = self.table_stats.get_table_stat_model_key()
        self.assertEqual(table_stats, 'hive://gold.base/test/col/avg/')

    def test_get_col_key(self) -> None:
        """The column key is built from the default db/cluster plus schema/table/column."""
        metadata = self.table_stats.get_col_key()
        self.assertEqual(metadata, 'hive://gold.base/test/col')

    # assertEqual is used throughout: the assertEquals alias is deprecated
    # and no longer exists in Python 3.12+.
    def test_create_nodes(self) -> None:
        """create_nodes() returns exactly one node that serializes as expected."""
        nodes = self.table_stats.create_nodes()
        self.assertEqual(len(nodes), 1)
        serialized_node = neo4_serializer.serialize_node(nodes[0])
        self.assertEqual(serialized_node, self.expected_node_result)

    def test_create_relation(self) -> None:
        """create_relation() returns exactly one relation that serializes as expected."""
        relation = self.table_stats.create_relation()
        self.assertEqual(len(relation), 1)
        serialized_relation = neo4_serializer.serialize_relationship(relation[0])
        self.assertEqual(serialized_relation, self.expected_relation_result)

    def test_create_next_node(self) -> None:
        """The first create_next_node() result serializes to the expected node."""
        next_node = self.table_stats.create_next_node()
        serialized_node = neo4_serializer.serialize_node(next_node)
        self.assertEqual(serialized_node, self.expected_node_result)

    def test_create_next_relation(self) -> None:
        """The first create_next_relation() result serializes to the expected relation."""
        next_relation = self.table_stats.create_next_relation()
        serialized_relation = neo4_serializer.serialize_relationship(next_relation)
        self.assertEqual(serialized_relation, self.expected_relation_result)
def get_stats(self, schema: str, table: str, cluster: str = None):
    """
    Run `show stats for table`, which returns some statistics for hive tables.

    This is a generator. The first row of the query result is treated as the
    header (column names); every following row is zipped with it. For each
    row with a truthy ``column_name``, one TableColumnStats is yielded per
    remaining entry whose name and value are both truthy.

    :param schema: schema containing the table
    :param table: table name, also used as ``table_name`` of each record
    :param cluster: optional cluster; when falsy, ``self._default_cluster_name``
        is used for the produced records
    :return: yields TableColumnStats objects; yields nothing if the query
        returns no rows at all

    Any exception raised while querying or building records is logged via
    ``LOGGER.exception`` and ends the iteration (best-effort behaviour).
    """
    full_schema_address = self._get_full_schema_address(cluster, schema)
    full_table_address = full_schema_address + '.' + table
    stats_query = 'show stats for {}'.format(full_table_address)
    try:
        stats_results = self.execute(stats_query, has_header=True)

        # An empty result set is not an error: without the default, next()
        # would raise StopIteration, which the handler below would log as a
        # failure with a traceback.
        stats_column_names = next(stats_results, None)
        if stats_column_names is None:
            return

        for stats_values in stats_results:
            stats_dict = dict(zip(stats_column_names, stats_values))
            column_name = stats_dict.pop('column_name')
            if not column_name:
                # Rows without a column name are skipped.
                continue

            for stat_name, stat_value in stats_dict.items():
                # NOTE(review): this truthiness check also drops a value of
                # 0 / 0.0 - confirm that is intended.
                if not (stat_name and stat_value):
                    continue

                LOGGER.debug('Creating column stats object for %s: %s',
                             stat_name, stat_value)
                yield TableColumnStats(
                    table_name=table,
                    col_name=column_name,
                    stat_name=stat_name,
                    stat_val=stat_value,
                    start_epoch=0,
                    end_epoch=int(time.time()),
                    db=self._database,
                    cluster=cluster or self._default_cluster_name,
                    schema=schema,
                )
    except Exception as e:
        LOGGER.exception(e)
class TestTableStats(unittest.TestCase):
    """
    Checks key generation of one TableColumnStats and its serialization for
    neo4j, Neptune and MySQL.
    """

    def setUp(self) -> None:
        """Create the stat fixture and the neo4j results it should produce."""
        super(TestTableStats, self).setUp()
        self.table_stats = TableColumnStats(
            table_name='base.test',
            col_name='col',
            stat_name='avg',
            stat_val='1',
            start_epoch='1',
            end_epoch='2',
        )

        self.expected_node_results = [{
            NODE_KEY: 'hive://gold.base/test/col/avg/',
            NODE_LABEL: 'Stat',
            'stat_val': '1',
            'stat_type': 'avg',
            'start_epoch': '1',
            'end_epoch': '2',
        }]

        self.expected_relation_results = [{
            RELATION_START_KEY: 'hive://gold.base/test/col/avg/',
            RELATION_START_LABEL: 'Stat',
            RELATION_END_KEY: 'hive://gold.base/test/col',
            RELATION_END_LABEL: 'Column',
            RELATION_TYPE: 'STAT_OF',
            RELATION_REVERSE_TYPE: 'STAT',
        }]

    def test_get_column_stat_model_key(self) -> None:
        """The stat key is the column key followed by the stat name and a slash."""
        table_stats = self.table_stats.get_column_stat_model_key()
        self.assertEqual(table_stats, 'hive://gold.base/test/col/avg/')

    def test_get_col_key(self) -> None:
        """The column key is built from the default db/cluster plus schema/table/column."""
        metadata = self.table_stats.get_col_key()
        self.assertEqual(metadata, 'hive://gold.base/test/col')

    def test_create_nodes(self) -> None:
        """Every node produced until exhaustion serializes to the expected neo4j dicts."""
        actual = []
        node = self.table_stats.create_next_node()
        while node:
            actual.append(neo4_serializer.serialize_node(node))
            node = self.table_stats.create_next_node()

        self.assertEqual(actual, self.expected_node_results)

    def test_create_relation(self) -> None:
        """Every relation produced until exhaustion serializes to the expected neo4j dicts."""
        actual = []
        relation = self.table_stats.create_next_relation()
        while relation:
            actual.append(neo4_serializer.serialize_relationship(relation))
            relation = self.table_stats.create_next_relation()

        self.assertEqual(actual, self.expected_relation_results)

    def test_create_nodes_neptune(self) -> None:
        """Nodes converted for Neptune carry the label-prefixed id and typed property names."""
        actual = []
        next_node = self.table_stats.create_next_node()
        while next_node:
            actual.append(neptune_serializer.convert_node(next_node))
            next_node = self.table_stats.create_next_node()

        expected_neptune_nodes = [{
            NEPTUNE_HEADER_ID: 'Stat:hive://gold.base/test/col/avg/',
            METADATA_KEY_PROPERTY_NAME_BULK_LOADER_FORMAT: 'hive://gold.base/test/col/avg/',
            NEPTUNE_HEADER_LABEL: 'Stat',
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
            NEPTUNE_CREATION_TYPE_NODE_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB,
            'stat_val:String(single)': '1',
            'stat_type:String(single)': 'avg',
            'start_epoch:String(single)': '1',
            'end_epoch:String(single)': '2',
        }]

        self.assertEqual(actual, expected_neptune_nodes)

    def test_create_relation_neptune(self) -> None:
        """Each relation converts to a forward (STAT_OF) and a reverse (STAT) Neptune edge."""
        stat_vertex = 'Stat:hive://gold.base/test/col/avg/'
        column_vertex = 'Column:hive://gold.base/test/col'

        forward_id = "{label}:{from_vertex_id}_{to_vertex_id}".format(
            from_vertex_id=stat_vertex,
            to_vertex_id=column_vertex,
            label='STAT_OF',
        )
        reverse_id = "{label}:{from_vertex_id}_{to_vertex_id}".format(
            from_vertex_id=column_vertex,
            to_vertex_id=stat_vertex,
            label='STAT',
        )

        # One inner list per relation, holding the forward and the reverse edge.
        expected = [[
            {
                NEPTUNE_HEADER_ID: forward_id,
                METADATA_KEY_PROPERTY_NAME_BULK_LOADER_FORMAT: forward_id,
                NEPTUNE_RELATIONSHIP_HEADER_FROM: stat_vertex,
                NEPTUNE_RELATIONSHIP_HEADER_TO: column_vertex,
                NEPTUNE_HEADER_LABEL: 'STAT_OF',
                NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
                NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB,
            },
            {
                NEPTUNE_HEADER_ID: reverse_id,
                METADATA_KEY_PROPERTY_NAME_BULK_LOADER_FORMAT: reverse_id,
                NEPTUNE_RELATIONSHIP_HEADER_FROM: column_vertex,
                NEPTUNE_RELATIONSHIP_HEADER_TO: stat_vertex,
                NEPTUNE_HEADER_LABEL: 'STAT',
                NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
                NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB,
            },
        ]]

        actual = []
        next_relation = self.table_stats.create_next_relation()
        while next_relation:
            actual.append(neptune_serializer.convert_relationship(next_relation))
            next_relation = self.table_stats.create_next_relation()

        self.assertListEqual(actual, expected)

    def test_create_records(self) -> None:
        """Every record produced until exhaustion serializes to the expected MySQL row."""
        expected = [{
            'rk': 'hive://gold.base/test/col/avg/',
            'stat_val': '1',
            'stat_type': 'avg',
            'start_epoch': '1',
            'end_epoch': '2',
            'column_rk': 'hive://gold.base/test/col',
        }]

        actual = []
        record = self.table_stats.create_next_record()
        while record:
            actual.append(mysql_serializer.serialize_record(record))
            record = self.table_stats.create_next_record()

        self.assertEqual(actual, expected)
class TestTableStats(unittest.TestCase):
    """
    Checks key generation of one TableColumnStats and its serialization for
    neo4j and Neptune.
    """

    def setUp(self) -> None:
        """Create the stat fixture and the neo4j results it should produce."""
        super().setUp()
        self.table_stats = TableColumnStats(
            table_name='base.test',
            col_name='col',
            stat_name='avg',
            stat_val='1',
            start_epoch='1',
            end_epoch='2',
        )

        self.expected_node_result = {
            NODE_KEY: 'hive://gold.base/test/col/avg/',
            NODE_LABEL: 'Stat',
            'stat_val': '1',
            'stat_name': 'avg',
            'start_epoch': '1',
            'end_epoch': '2',
        }

        self.expected_relation_result = {
            RELATION_START_KEY: 'hive://gold.base/test/col/avg/',
            RELATION_START_LABEL: 'Stat',
            RELATION_END_KEY: 'hive://gold.base/test/col',
            RELATION_END_LABEL: 'Column',
            RELATION_TYPE: 'STAT_OF',
            RELATION_REVERSE_TYPE: 'STAT',
        }

    def test_get_table_stat_model_key(self) -> None:
        """The stat key is the column key followed by the stat name and a slash."""
        table_stats = self.table_stats.get_table_stat_model_key()
        self.assertEqual(table_stats, 'hive://gold.base/test/col/avg/')

    def test_get_col_key(self) -> None:
        """The column key is built from the default db/cluster plus schema/table/column."""
        metadata = self.table_stats.get_col_key()
        self.assertEqual(metadata, 'hive://gold.base/test/col')

    # assertEqual is used throughout: the assertEquals alias is deprecated
    # and no longer exists in Python 3.12+.
    def test_create_nodes(self) -> None:
        """create_nodes() returns exactly one node that serializes as expected."""
        nodes = self.table_stats.create_nodes()
        self.assertEqual(len(nodes), 1)
        serialized_node = neo4_serializer.serialize_node(nodes[0])
        self.assertEqual(serialized_node, self.expected_node_result)

    def test_create_relation(self) -> None:
        """create_relation() returns exactly one relation that serializes as expected."""
        relation = self.table_stats.create_relation()
        self.assertEqual(len(relation), 1)
        serialized_relation = neo4_serializer.serialize_relationship(relation[0])
        self.assertEqual(serialized_relation, self.expected_relation_result)

    def test_create_next_node(self) -> None:
        """The first create_next_node() result serializes to the expected neo4j node."""
        next_node = self.table_stats.create_next_node()
        serialized_node = neo4_serializer.serialize_node(next_node)
        self.assertEqual(serialized_node, self.expected_node_result)

    def test_create_next_node_neptune(self) -> None:
        """The first node converted for Neptune uses typed property names."""
        next_node = self.table_stats.create_next_node()
        serialized_node = neptune_serializer.convert_node(next_node)
        expected_neptune_node = {
            NEPTUNE_HEADER_ID: 'hive://gold.base/test/col/avg/',
            NEPTUNE_HEADER_LABEL: 'Stat',
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
            NEPTUNE_CREATION_TYPE_NODE_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB,
            'stat_val:String(single)': '1',
            'stat_name:String(single)': 'avg',
            'start_epoch:String(single)': '1',
            'end_epoch:String(single)': '2',
        }

        self.assertDictEqual(serialized_node, expected_neptune_node)

    def test_create_next_relation(self) -> None:
        """The first create_next_relation() result serializes to the expected neo4j relation."""
        next_relation = self.table_stats.create_next_relation()
        serialized_relation = neo4_serializer.serialize_relationship(next_relation)
        self.assertEqual(serialized_relation, self.expected_relation_result)

    def test_create_next_relation_neptune(self) -> None:
        """The first relation converts to a forward (STAT_OF) and a reverse (STAT) Neptune edge."""
        next_relation = self.table_stats.create_next_relation()

        stat_vertex = 'hive://gold.base/test/col/avg/'
        column_vertex = 'hive://gold.base/test/col'

        expected = [
            {
                NEPTUNE_HEADER_ID: "{from_vertex_id}_{to_vertex_id}_{label}".format(
                    from_vertex_id=stat_vertex,
                    to_vertex_id=column_vertex,
                    label='STAT_OF',
                ),
                NEPTUNE_RELATIONSHIP_HEADER_FROM: stat_vertex,
                NEPTUNE_RELATIONSHIP_HEADER_TO: column_vertex,
                NEPTUNE_HEADER_LABEL: 'STAT_OF',
                NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
                NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB,
            },
            {
                NEPTUNE_HEADER_ID: "{from_vertex_id}_{to_vertex_id}_{label}".format(
                    from_vertex_id=column_vertex,
                    to_vertex_id=stat_vertex,
                    label='STAT',
                ),
                NEPTUNE_RELATIONSHIP_HEADER_FROM: column_vertex,
                NEPTUNE_RELATIONSHIP_HEADER_TO: stat_vertex,
                NEPTUNE_HEADER_LABEL: 'STAT',
                NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
                NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB,
            },
        ]

        serialized_relation = neptune_serializer.convert_relationship(next_relation)
        self.assertListEqual(serialized_relation, expected)
def _test_extractor_without_technical_data(self, es_version: str, indices: Dict, stats: Dict) -> None:
    """
    Run the extractor against stubbed Elasticsearch responses and compare
    the extracted stats with the three expected ones for 'long_property'.

    :param es_version: value the stubbed ``_get_es_version`` returns
    :param indices: return value of the stubbed ``es.indices.get``
    :param stats: return value of the stubbed ``es.search``
    """
    extractor = self._get_extractor()

    extractor._get_es_version = lambda: es_version
    extractor.es.indices.get = MagicMock(return_value=indices)
    extractor.es.search = MagicMock(return_value=stats)

    common = {
        'db': 'elasticsearch',
        'schema': 'schema_name',
        'table_name': 'proper_index',
        'cluster': 'cluster_name',
        'start_epoch': '0',
        'end_epoch': '0'
    }

    # Only these attributes take part in the comparison; names that are not
    # present on an object are silently left out.
    compare_params = {'table', 'schema', 'db', 'col_name', 'start_epoch', 'end_epoch', 'cluster', 'stat_type',
                      'stat_val'}

    expected = []
    for name, value in (('avg', '5'), ('sum', '10'), ('count', '2')):
        kwargs = dict(stat_name=name, stat_val=value, col_name='long_property')
        kwargs.update(common)
        attributes = TableColumnStats(**kwargs).__dict__
        expected.append({key: attributes[key] for key in compare_params if key in attributes})

    # Drain the extractor until it returns a falsy value.
    result = []
    stat = extractor.extract()
    while stat:
        result.append(stat)
        stat = extractor.extract()

    result_spec = []
    for extracted in result:
        attributes = extracted.__dict__
        result_spec.append({key: attributes[key] for key in compare_params if key in attributes})

    for r in result:
        self.assertIsInstance(r, TableColumnStats)

    self.assertListEqual(expected, result_spec)
def test_extractor(self) -> None:
    """
    Run the extractor on the stubbed report (``self.report_data``) and
    compare the extracted stats with the three expected ones: Mean and
    Maximum for 'column_1' and Mean for 'column_2'.
    """
    extractor = self._get_extractor()
    extractor._load_report = MagicMock(return_value=self.report_data)

    common = {
        'db': self._common_params().get('extractor.pandas_profiling.database_name'),
        'schema': self._common_params().get('extractor.pandas_profiling.schema_name'),
        'table_name': self._common_params().get('extractor.pandas_profiling.table_name'),
        'cluster': self._common_params().get('extractor.pandas_profiling.cluster_name'),
        'start_epoch': '1621246215',
        'end_epoch': '0'
    }

    # Only these attributes take part in the comparison; names that are not
    # present on an object are silently left out.
    compare_params = {'table', 'schema', 'db', 'col_name', 'start_epoch', 'end_epoch', 'cluster', 'stat_type',
                      'stat_val'}

    expected_stats = (
        ('Mean', '5.12', 'column_1'),
        ('Maximum', '15.235', 'column_1'),
        ('Mean', '10.0', 'column_2'),
    )

    expected = []
    for name, value, column in expected_stats:
        kwargs = dict(stat_name=name, stat_val=value, col_name=column)
        kwargs.update(common)
        attributes = TableColumnStats(**kwargs).__dict__
        expected.append({key: attributes[key] for key in compare_params if key in attributes})

    # Drain the extractor until it returns a falsy value.
    result = []
    stat = extractor.extract()
    while stat:
        result.append(stat)
        stat = extractor.extract()

    result_spec = []
    for extracted in result:
        attributes = extracted.__dict__
        result_spec.append({key: attributes[key] for key in compare_params if key in attributes})

    for r in result:
        self.assertIsInstance(r, TableColumnStats)

    self.assertListEqual(expected, result_spec)