def setUp(self) -> None:
    """Create the ``TableColumnStats`` fixture and the expected node/relation dictionaries."""
    super(TestTableStats, self).setUp()

    fixture_kwargs = dict(
        table_name='base.test',
        col_name='col',
        stat_name='avg',
        stat_val='1',
        start_epoch='1',
        end_epoch='2',
    )
    self.table_stats = TableColumnStats(**fixture_kwargs)

    # Keys shared by the node and relation expectations below.
    stat_key = 'hive://gold.base/test/col/avg/'
    column_key = 'hive://gold.base/test/col'

    # Note the stat value is expected as the integer 1 under an
    # ':UNQUOTED'-suffixed property name, while the fixture passes '1'.
    expected_node = {}
    expected_node[NODE_KEY] = stat_key
    expected_node[NODE_LABEL] = 'Stat'
    expected_node['stat_val:UNQUOTED'] = 1
    expected_node['stat_name'] = 'avg'
    expected_node['start_epoch'] = '1'
    expected_node['end_epoch'] = '2'
    self.expected_node_result = expected_node

    expected_relation = {}
    expected_relation[RELATION_START_KEY] = stat_key
    expected_relation[RELATION_START_LABEL] = 'Stat'
    expected_relation[RELATION_END_KEY] = column_key
    expected_relation[RELATION_END_LABEL] = 'Column'
    expected_relation[RELATION_TYPE] = 'STAT_OF'
    expected_relation[RELATION_REVERSE_TYPE] = 'STAT'
    self.expected_relation_result = expected_relation
Example #2
0
    def _render_column_stats(self, index_name: str,
                             spec: Dict[str, Any]) -> List[TableColumnStats]:
        """
        Turn one column's statistics mapping into ``TableColumnStats`` objects.

        :param index_name: used as ``table_name`` of every produced stat.
        :param spec: mapping with the column name under ``'name'`` and one
            entry per statistic. The mapping is left unmodified.
        :return: one ``TableColumnStats`` per statistic whose value is neither
            a dict, a list, nor the string ``'NaN'``.
        :raises KeyError: when ``spec`` has no ``'name'`` entry.
        """
        # Pop from a shallow copy: popping from ``spec`` itself would silently
        # strip the 'name' key from the caller's dictionary.
        stats = dict(spec)
        col_name = stats.pop('name')

        result: List[TableColumnStats] = []

        for stat_name, stat_val in stats.items():
            # Nested structures and the 'NaN' marker are not scalar stats.
            if isinstance(stat_val, (dict, list)) or stat_val == 'NaN':
                continue

            result.append(TableColumnStats(table_name=index_name,
                                           col_name=col_name,
                                           stat_name=stat_name,
                                           stat_val=stat_val,
                                           start_epoch='0',
                                           end_epoch='0',
                                           db=self.database,
                                           cluster=self.cluster,
                                           schema=self.schema))

        return result
Example #3
0
    def _get_extract_iter(self) -> Any:
        """
        Yield a ``TableColumnStats`` for each mapped statistic in the loaded report.

        The report's ``'variables'`` entry maps column names to their
        statistics. Only statistics with a truthy entry in
        ``self.stat_mappings`` are emitted; the entry supplies the output stat
        name and a callable applied to the value. Float values are passed
        through ``self.round_value`` first. ``start_epoch`` is
        ``self.parse_date`` applied to the report's
        ``analysis.date_start`` value.
        """
        report = self._load_report()

        variables = report.get('variables', dict())
        analysis = report.get('analysis', dict())
        report_time = self.parse_date(analysis.get('date_start'))

        for column_name, column_stats in variables.items():
            for raw_name, raw_value in column_stats.items():
                mapping = self.stat_mappings.get(raw_name)
                if not mapping:
                    # Statistic is not configured for extraction.
                    continue

                stat_name, stat_modifier = mapping

                if isinstance(raw_value, float):
                    value = self.round_value(raw_value)
                else:
                    value = raw_value

                yield TableColumnStats(table_name=self.table_name,
                                       col_name=column_name,
                                       stat_name=stat_name,
                                       stat_val=stat_modifier(value),
                                       start_epoch=report_time,
                                       end_epoch='0',
                                       db=self.database_name,
                                       cluster=self.cluster_name,
                                       schema=self.schema_name)
Example #4
0
class TestTableStats(unittest.TestCase):
    """Tests for ``TableColumnStats`` key generation and ``neo4_serializer`` output."""

    def setUp(self) -> None:
        """Create the stat fixture and the dictionaries the serializer is expected to produce."""
        super(TestTableStats, self).setUp()
        self.table_stats = TableColumnStats(table_name='base.test',
                                            col_name='col',
                                            stat_name='avg',
                                            stat_val='1',
                                            start_epoch='1',
                                            end_epoch='2',)

        self.expected_node_result = {
            NODE_KEY: 'hive://gold.base/test/col/avg/',
            NODE_LABEL: 'Stat',
            'stat_val': '1',
            'stat_name': 'avg',
            'start_epoch': '1',
            'end_epoch': '2',
        }

        self.expected_relation_result = {
            RELATION_START_KEY: 'hive://gold.base/test/col/avg/',
            RELATION_START_LABEL: 'Stat',
            RELATION_END_KEY: 'hive://gold.base/test/col',
            RELATION_END_LABEL: 'Column',
            RELATION_TYPE: 'STAT_OF',
            RELATION_REVERSE_TYPE: 'STAT'
        }

    # NOTE: the assertions below use ``assertEqual``; the ``assertEquals``
    # alias is deprecated and no longer exists in Python 3.12+.

    def test_get_table_stat_model_key(self) -> None:
        """The stat key is the column key followed by the stat name and a trailing slash."""
        table_stats = self.table_stats.get_table_stat_model_key()
        self.assertEqual(table_stats, 'hive://gold.base/test/col/avg/')

    def test_get_col_key(self) -> None:
        """The column key for the fixture is ``hive://gold.base/test/col``."""
        metadata = self.table_stats.get_col_key()
        self.assertEqual(metadata, 'hive://gold.base/test/col')

    def test_create_nodes(self) -> None:
        """``create_nodes`` returns exactly one node matching the expected dictionary."""
        nodes = self.table_stats.create_nodes()
        self.assertEqual(len(nodes), 1)
        serialized_node = neo4_serializer.serialize_node(nodes[0])
        self.assertEqual(serialized_node, self.expected_node_result)

    def test_create_relation(self) -> None:
        """``create_relation`` returns exactly one relation matching the expected dictionary."""
        relation = self.table_stats.create_relation()

        self.assertEqual(len(relation), 1)
        serialized_relation = neo4_serializer.serialize_relationship(relation[0])
        self.assertEqual(serialized_relation, self.expected_relation_result)

    def test_create_next_node(self) -> None:
        """The first ``create_next_node`` call yields the expected node."""
        next_node = self.table_stats.create_next_node()
        serialized_node = neo4_serializer.serialize_node(next_node)
        self.assertEqual(serialized_node, self.expected_node_result)

    def test_create_next_relation(self) -> None:
        """The first ``create_next_relation`` call yields the expected relation."""
        next_relation = self.table_stats.create_next_relation()
        serialized_relation = neo4_serializer.serialize_relationship(next_relation)
        self.assertEqual(serialized_relation, self.expected_relation_result)
Example #5
0
    def get_stats(self, schema: str, table: str, cluster: str = None):
        """
        Run `show stats for table`, which returns some statistics for hive
        tables.

        This is a generator. The first row of the query result is taken as the
        column header; every following row is zipped with it. For each row
        with a truthy ``column_name``, one ``TableColumnStats`` is yielded per
        remaining (stat name, stat value) pair where both are truthy.

        Any ``Exception`` raised while querying or iterating is logged with
        ``LOGGER.exception`` and ends the generator; it is not re-raised.

        :param schema: schema containing the table.
        :param table: table to collect statistics for.
        :param cluster: cluster name; ``self._default_cluster_name`` is used
            for the emitted stats when this is falsy.
        """
        full_schema_address = self._get_full_schema_address(cluster, schema)
        full_table_address = full_schema_address + '.' + table
        stats_query = 'show stats for {}'.format(full_table_address)

        try:
            rows = self.execute(stats_query, has_header=True)
            header = next(rows)

            for row in rows:
                row_stats = dict(zip(header, row))
                column_name = row_stats.pop('column_name')
                if not column_name:
                    continue

                for stat_name, stat_value in row_stats.items():
                    # NOTE(review): the truthiness check also drops values
                    # such as 0 or 0.0 — confirm that is intended.
                    if not (stat_name and stat_value):
                        continue

                    LOGGER.debug(
                        'Creating column stats object for {}: {}'.
                        format(stat_name, stat_value))
                    yield TableColumnStats(
                        table_name=table,
                        col_name=column_name,
                        stat_name=stat_name,
                        stat_val=stat_value,
                        start_epoch=0,
                        end_epoch=int(time.time()),
                        db=self._database,
                        cluster=cluster or self._default_cluster_name,
                        schema=schema,
                    )

        except Exception as error:
            LOGGER.exception(error)
Example #6
0
class TestTableStats(unittest.TestCase):
    """
    Tests for ``TableColumnStats`` keys and for its node, relation and record
    output as rendered by ``neo4_serializer``, ``neptune_serializer`` and
    ``mysql_serializer``.

    Each serialization test drains the relevant ``create_next_*`` method until
    it returns a falsy value and compares everything that was produced.
    """

    def setUp(self) -> None:
        """Create the stat fixture and the expected neo4 node/relation lists."""
        super(TestTableStats, self).setUp()
        self.table_stats = TableColumnStats(
            table_name='base.test',
            col_name='col',
            stat_name='avg',
            stat_val='1',
            start_epoch='1',
            end_epoch='2',
        )

        self.expected_node_results = [{
            NODE_KEY: 'hive://gold.base/test/col/avg/',
            NODE_LABEL: 'Stat',
            'stat_val': '1',
            'stat_type': 'avg',
            'start_epoch': '1',
            'end_epoch': '2',
        }]

        self.expected_relation_results = [{
            RELATION_START_KEY: 'hive://gold.base/test/col/avg/',
            RELATION_START_LABEL: 'Stat',
            RELATION_END_KEY: 'hive://gold.base/test/col',
            RELATION_END_LABEL: 'Column',
            RELATION_TYPE: 'STAT_OF',
            RELATION_REVERSE_TYPE: 'STAT'
        }]

    def test_get_column_stat_model_key(self) -> None:
        """The stat key is the column key followed by the stat name and a trailing slash."""
        table_stats = self.table_stats.get_column_stat_model_key()
        self.assertEqual(table_stats, 'hive://gold.base/test/col/avg/')

    def test_get_col_key(self) -> None:
        """The column key for the fixture is ``hive://gold.base/test/col``."""
        metadata = self.table_stats.get_col_key()
        self.assertEqual(metadata, 'hive://gold.base/test/col')

    def test_create_nodes(self) -> None:
        """All nodes, serialized with ``neo4_serializer``, match the expected list."""
        actual = []
        node = self.table_stats.create_next_node()
        while node:
            serialized_node = neo4_serializer.serialize_node(node)
            actual.append(serialized_node)
            node = self.table_stats.create_next_node()

        self.assertEqual(actual, self.expected_node_results)

    def test_create_relation(self) -> None:
        """All relations, serialized with ``neo4_serializer``, match the expected list."""
        actual = []
        relation = self.table_stats.create_next_relation()
        while relation:
            serialized_relation = neo4_serializer.serialize_relationship(
                relation)
            actual.append(serialized_relation)
            relation = self.table_stats.create_next_relation()

        self.assertEqual(actual, self.expected_relation_results)

    def test_create_nodes_neptune(self) -> None:
        """All nodes, converted with ``neptune_serializer``, match the expected list."""
        actual = []
        next_node = self.table_stats.create_next_node()
        while next_node:
            serialized_node = neptune_serializer.convert_node(next_node)
            actual.append(serialized_node)
            next_node = self.table_stats.create_next_node()

        # The last-extracted timestamp varies per run, hence ``ANY``.
        expected_neptune_nodes = [{
            NEPTUNE_HEADER_ID:
            'Stat:hive://gold.base/test/col/avg/',
            METADATA_KEY_PROPERTY_NAME_BULK_LOADER_FORMAT:
            'hive://gold.base/test/col/avg/',
            NEPTUNE_HEADER_LABEL:
            'Stat',
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT:
            ANY,
            NEPTUNE_CREATION_TYPE_NODE_PROPERTY_NAME_BULK_LOADER_FORMAT:
            NEPTUNE_CREATION_TYPE_JOB,
            'stat_val:String(single)':
            '1',
            'stat_type:String(single)':
            'avg',
            'start_epoch:String(single)':
            '1',
            'end_epoch:String(single)':
            '2',
        }]

        self.assertEqual(actual, expected_neptune_nodes)

    def test_create_relation_neptune(self) -> None:
        """
        Each relation converts to a pair of edge dictionaries: the forward
        ``STAT_OF`` edge and the reverse ``STAT`` edge.
        """
        # The previous version also assigned ``self.expected_relation_result``
        # here; nothing in this class reads that attribute, so it was dropped.
        expected = [[{
            NEPTUNE_HEADER_ID:
            "{label}:{from_vertex_id}_{to_vertex_id}".format(
                from_vertex_id='Stat:hive://gold.base/test/col/avg/',
                to_vertex_id='Column:hive://gold.base/test/col',
                label='STAT_OF'),
            METADATA_KEY_PROPERTY_NAME_BULK_LOADER_FORMAT:
            "{label}:{from_vertex_id}_{to_vertex_id}".format(
                from_vertex_id='Stat:hive://gold.base/test/col/avg/',
                to_vertex_id='Column:hive://gold.base/test/col',
                label='STAT_OF'),
            NEPTUNE_RELATIONSHIP_HEADER_FROM:
            'Stat:hive://gold.base/test/col/avg/',
            NEPTUNE_RELATIONSHIP_HEADER_TO:
            'Column:hive://gold.base/test/col',
            NEPTUNE_HEADER_LABEL:
            'STAT_OF',
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT:
            ANY,
            NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT:
            NEPTUNE_CREATION_TYPE_JOB
        }, {
            NEPTUNE_HEADER_ID:
            "{label}:{from_vertex_id}_{to_vertex_id}".format(
                from_vertex_id='Column:hive://gold.base/test/col',
                to_vertex_id='Stat:hive://gold.base/test/col/avg/',
                label='STAT'),
            METADATA_KEY_PROPERTY_NAME_BULK_LOADER_FORMAT:
            "{label}:{from_vertex_id}_{to_vertex_id}".format(
                from_vertex_id='Column:hive://gold.base/test/col',
                to_vertex_id='Stat:hive://gold.base/test/col/avg/',
                label='STAT'),
            NEPTUNE_RELATIONSHIP_HEADER_FROM:
            'Column:hive://gold.base/test/col',
            NEPTUNE_RELATIONSHIP_HEADER_TO:
            'Stat:hive://gold.base/test/col/avg/',
            NEPTUNE_HEADER_LABEL:
            'STAT',
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT:
            ANY,
            NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT:
            NEPTUNE_CREATION_TYPE_JOB
        }]]

        actual = []
        next_relation = self.table_stats.create_next_relation()
        while next_relation:
            serialized_relation = neptune_serializer.convert_relationship(
                next_relation)
            actual.append(serialized_relation)
            next_relation = self.table_stats.create_next_relation()

        self.assertListEqual(actual, expected)

    def test_create_records(self) -> None:
        """All records, serialized with ``mysql_serializer``, match the expected list."""
        expected = [{
            'rk': 'hive://gold.base/test/col/avg/',
            'stat_val': '1',
            'stat_type': 'avg',
            'start_epoch': '1',
            'end_epoch': '2',
            'column_rk': 'hive://gold.base/test/col'
        }]

        actual = []
        record = self.table_stats.create_next_record()
        while record:
            serialized_record = mysql_serializer.serialize_record(record)
            actual.append(serialized_record)
            record = self.table_stats.create_next_record()

        self.assertEqual(actual, expected)
Example #7
0
class TestTableStats(unittest.TestCase):
    """
    Tests for ``TableColumnStats`` keys and for its node and relation output
    as rendered by ``neo4_serializer`` and ``neptune_serializer``.
    """

    def setUp(self) -> None:
        """Create the stat fixture and the expected neo4 node/relation dictionaries."""
        super(TestTableStats, self).setUp()
        self.table_stats = TableColumnStats(
            table_name='base.test',
            col_name='col',
            stat_name='avg',
            stat_val='1',
            start_epoch='1',
            end_epoch='2',
        )

        self.expected_node_result = {
            NODE_KEY: 'hive://gold.base/test/col/avg/',
            NODE_LABEL: 'Stat',
            'stat_val': '1',
            'stat_name': 'avg',
            'start_epoch': '1',
            'end_epoch': '2',
        }

        self.expected_relation_result = {
            RELATION_START_KEY: 'hive://gold.base/test/col/avg/',
            RELATION_START_LABEL: 'Stat',
            RELATION_END_KEY: 'hive://gold.base/test/col',
            RELATION_END_LABEL: 'Column',
            RELATION_TYPE: 'STAT_OF',
            RELATION_REVERSE_TYPE: 'STAT'
        }

    # NOTE: the assertions below use ``assertEqual``; the ``assertEquals``
    # alias is deprecated and no longer exists in Python 3.12+.

    def test_get_table_stat_model_key(self) -> None:
        """The stat key is the column key followed by the stat name and a trailing slash."""
        table_stats = self.table_stats.get_table_stat_model_key()
        self.assertEqual(table_stats, 'hive://gold.base/test/col/avg/')

    def test_get_col_key(self) -> None:
        """The column key for the fixture is ``hive://gold.base/test/col``."""
        metadata = self.table_stats.get_col_key()
        self.assertEqual(metadata, 'hive://gold.base/test/col')

    def test_create_nodes(self) -> None:
        """``create_nodes`` returns exactly one node matching the expected dictionary."""
        nodes = self.table_stats.create_nodes()
        self.assertEqual(len(nodes), 1)
        serialized_node = neo4_serializer.serialize_node(nodes[0])
        self.assertEqual(serialized_node, self.expected_node_result)

    def test_create_relation(self) -> None:
        """``create_relation`` returns exactly one relation matching the expected dictionary."""
        relation = self.table_stats.create_relation()

        self.assertEqual(len(relation), 1)
        serialized_relation = neo4_serializer.serialize_relationship(
            relation[0])
        self.assertEqual(serialized_relation, self.expected_relation_result)

    def test_create_next_node(self) -> None:
        """The first ``create_next_node`` call yields the expected node."""
        next_node = self.table_stats.create_next_node()
        serialized_node = neo4_serializer.serialize_node(next_node)
        self.assertEqual(serialized_node, self.expected_node_result)

    def test_create_next_node_neptune(self) -> None:
        """The first node, converted with ``neptune_serializer``, matches the expected dictionary."""
        next_node = self.table_stats.create_next_node()
        serialized_node = neptune_serializer.convert_node(next_node)
        # The last-extracted timestamp varies per run, hence ``ANY``.
        expected_neptune_node = {
            NEPTUNE_HEADER_ID:
            'hive://gold.base/test/col/avg/',
            NEPTUNE_HEADER_LABEL:
            'Stat',
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT:
            ANY,
            NEPTUNE_CREATION_TYPE_NODE_PROPERTY_NAME_BULK_LOADER_FORMAT:
            NEPTUNE_CREATION_TYPE_JOB,
            'stat_val:String(single)':
            '1',
            'stat_name:String(single)':
            'avg',
            'start_epoch:String(single)':
            '1',
            'end_epoch:String(single)':
            '2',
        }
        self.assertDictEqual(serialized_node, expected_neptune_node)

    def test_create_next_relation(self) -> None:
        """The first ``create_next_relation`` call yields the expected relation."""
        next_relation = self.table_stats.create_next_relation()
        serialized_relation = neo4_serializer.serialize_relationship(
            next_relation)
        self.assertEqual(serialized_relation, self.expected_relation_result)

    def test_create_next_relation_neptune(self) -> None:
        """
        The first relation converts to two edge dictionaries: the forward
        ``STAT_OF`` edge and the reverse ``STAT`` edge.
        """
        next_relation = self.table_stats.create_next_relation()

        # The previous version re-assigned ``self.expected_relation_result``
        # here with the same value ``setUp`` already stores, and never read it
        # in this test; the redundant assignment was dropped.
        expected = [{
            NEPTUNE_HEADER_ID:
            "{from_vertex_id}_{to_vertex_id}_{label}".format(
                from_vertex_id='hive://gold.base/test/col/avg/',
                to_vertex_id='hive://gold.base/test/col',
                label='STAT_OF'),
            NEPTUNE_RELATIONSHIP_HEADER_FROM:
            'hive://gold.base/test/col/avg/',
            NEPTUNE_RELATIONSHIP_HEADER_TO:
            'hive://gold.base/test/col',
            NEPTUNE_HEADER_LABEL:
            'STAT_OF',
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT:
            ANY,
            NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT:
            NEPTUNE_CREATION_TYPE_JOB
        }, {
            NEPTUNE_HEADER_ID:
            "{from_vertex_id}_{to_vertex_id}_{label}".format(
                from_vertex_id='hive://gold.base/test/col',
                to_vertex_id='hive://gold.base/test/col/avg/',
                label='STAT'),
            NEPTUNE_RELATIONSHIP_HEADER_FROM:
            'hive://gold.base/test/col',
            NEPTUNE_RELATIONSHIP_HEADER_TO:
            'hive://gold.base/test/col/avg/',
            NEPTUNE_HEADER_LABEL:
            'STAT',
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT:
            ANY,
            NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT:
            NEPTUNE_CREATION_TYPE_JOB
        }]

        serialized_relation = neptune_serializer.convert_relationship(
            next_relation)
        self.assertListEqual(serialized_relation, expected)
Example #8
0
    def _test_extractor_without_technical_data(self, es_version: str,
                                               indices: Dict,
                                               stats: Dict) -> None:
        """
        Run the extractor against mocked Elasticsearch responses and compare
        the extracted stats with the expected avg/sum/count stats of
        ``long_property``.

        :param es_version: value the extractor's ``_get_es_version`` is patched to return.
        :param indices: return value for the mocked ``es.indices.get``.
        :param stats: return value for the mocked ``es.search``.
        """
        extractor = self._get_extractor()

        extractor._get_es_version = lambda: es_version
        extractor.es.indices.get = MagicMock(return_value=indices)
        extractor.es.search = MagicMock(return_value=stats)

        common = dict(
            db='elasticsearch',
            schema='schema_name',
            table_name='proper_index',
            cluster='cluster_name',
            start_epoch='0',
            end_epoch='0',
        )

        # Only attributes named here (and actually present on the object)
        # take part in the comparison.
        compare_params = {
            'table', 'schema', 'db', 'col_name', 'start_epoch', 'end_epoch',
            'cluster', 'stat_type', 'stat_val'
        }

        expected = []
        for stat_name, stat_val in (('avg', '5'), ('sum', '10'), ('count', '2')):
            attributes = TableColumnStats(stat_name=stat_name,
                                          stat_val=stat_val,
                                          col_name='long_property',
                                          **common).__dict__
            expected.append({
                field: attributes[field]
                for field in compare_params if field in attributes
            })

        # Pull records until the extractor returns a falsy value.
        result = []
        stat = extractor.extract()
        while stat:
            result.append(stat)
            stat = extractor.extract()

        result_spec = []
        for extracted in result:
            attributes = extracted.__dict__
            result_spec.append({
                field: attributes[field]
                for field in compare_params if field in attributes
            })

        for extracted in result:
            self.assertIsInstance(extracted, TableColumnStats)

        self.assertListEqual(expected, result_spec)
Example #9
0
    def test_extractor(self) -> None:
        """
        Run the extractor with ``_load_report`` mocked to return
        ``self.report_data`` and compare the extracted stats with the three
        expected ones (Mean/Maximum of ``column_1``, Mean of ``column_2``).
        """
        extractor = self._get_extractor()

        extractor._load_report = MagicMock(return_value=self.report_data)

        common = dict(
            db=self._common_params().get('extractor.pandas_profiling.database_name'),
            schema=self._common_params().get('extractor.pandas_profiling.schema_name'),
            table_name=self._common_params().get('extractor.pandas_profiling.table_name'),
            cluster=self._common_params().get('extractor.pandas_profiling.cluster_name'),
            start_epoch='1621246215',
            end_epoch='0',
        )

        # Only attributes named here (and actually present on the object)
        # take part in the comparison.
        compare_params = {
            'table', 'schema', 'db', 'col_name', 'start_epoch', 'end_epoch',
            'cluster', 'stat_type', 'stat_val'
        }

        expected_stats = (
            ('Mean', '5.12', 'column_1'),
            ('Maximum', '15.235', 'column_1'),
            ('Mean', '10.0', 'column_2'),
        )
        expected = []
        for stat_name, stat_val, col_name in expected_stats:
            attributes = TableColumnStats(stat_name=stat_name,
                                          stat_val=stat_val,
                                          col_name=col_name,
                                          **common).__dict__
            expected.append({
                field: attributes[field]
                for field in compare_params if field in attributes
            })

        # Pull records until the extractor returns a falsy value.
        result = []
        stat = extractor.extract()
        while stat:
            result.append(stat)
            stat = extractor.extract()

        result_spec = []
        for extracted in result:
            attributes = extracted.__dict__
            result_spec.append({
                field: attributes[field]
                for field in compare_params if field in attributes
            })

        for extracted in result:
            self.assertIsInstance(extracted, TableColumnStats)

        self.assertListEqual(expected, result_spec)