Example #1
0
    def transform(self, record):
        # type: (Any) -> Optional[TableColumnUsage]
        SqlToTblColUsageTransformer.total_counts += 1

        stmt = getattr(record, self._sql_stmt_attr)
        email = getattr(record, self._user_email_attr)

        result = []  # type: List[ColumnReader]
        try:
            columns = ColumnUsageProvider.get_columns(query=stmt)
            # LOGGER.info('Statement: {} ---> columns: {}'.format(stmt, columns))
        except Exception:
            SqlToTblColUsageTransformer.failure_counts += 1
            LOGGER.exception(
                'Failed to get column usage from query: {}'.format(stmt))
            return None

        # Dedupe is needed to make it table level. TODO: Remove this once we are at column level
        dedupe_tuples = set()  # type: set
        for col in columns:
            sub_result = self._get_col_readers(table=col.table,
                                               stmt=stmt,
                                               email=email,
                                               dedupe_tuples=dedupe_tuples)
            result.extend(sub_result)

        if not result:
            return None

        return TableColumnUsage(col_readers=result)
Example #2
0
    def test_mysql_serialize(self) -> None:
        col_readers = [ColumnReader(database='db', cluster='gold', schema='scm', table='foo', column='*',
                                    user_email='*****@*****.**')]
        table_col_usage = TableColumnUsage(col_readers=col_readers)

        actual = []
        record = table_col_usage.next_record()
        while record:
            actual.append(mysql_serializer.serialize_record(record))
            record = table_col_usage.next_record()

        expected_user = {'rk': '*****@*****.**',
                         'first_name': '',
                         'last_name': '',
                         'full_name': '',
                         'employee_type': '',
                         'is_active': True,
                         'updated_at': 0,
                         'slack_id': '',
                         'github_username': '',
                         'team_name': '',
                         'email': '*****@*****.**',
                         'role_name': ''}
        expected_usage = {'table_rk': 'db://gold.scm/foo',
                          'user_rk': '*****@*****.**',
                          'read_count': 1}
        expected = [expected_user, expected_usage]

        self.assertEqual(expected, actual)
    def test_serialize(self):
        # type: () -> None

        col_readers = [
            ColumnReader(database='db',
                         cluster='gold',
                         schema='scm',
                         table='foo',
                         column='*',
                         user_email='*****@*****.**'),
            ColumnReader(database='db',
                         cluster='gold',
                         schema='scm',
                         table='bar',
                         column='*',
                         user_email='*****@*****.**')
        ]
        table_col_usage = TableColumnUsage(col_readers=col_readers)

        node_row = table_col_usage.next_node()
        actual = []
        while node_row:
            actual.append(node_row)
            node_row = table_col_usage.next_node()

        expected = [{
            'email': '*****@*****.**',
            'KEY': '*****@*****.**',
            'LABEL': 'User'
        }, {
            'email': '*****@*****.**',
            'KEY': '*****@*****.**',
            'LABEL': 'User'
        }]
        self.assertEqual(expected, actual)

        rel_row = table_col_usage.next_relation()
        actual = []
        while rel_row:
            actual.append(rel_row)
            rel_row = table_col_usage.next_relation()

        expected = [{
            'read_count:UNQUOTED': 1,
            'END_KEY': '*****@*****.**',
            'START_LABEL': 'Table',
            'END_LABEL': 'User',
            'START_KEY': 'db://gold.scm/foo',
            'TYPE': 'READ_BY',
            'REVERSE_TYPE': 'READ'
        }, {
            'read_count:UNQUOTED': 1,
            'END_KEY': '*****@*****.**',
            'START_LABEL': 'Table',
            'END_LABEL': 'User',
            'START_KEY': 'db://gold.scm/bar',
            'TYPE': 'READ_BY',
            'REVERSE_TYPE': 'READ'
        }]
        self.assertEqual(expected, actual)
Example #4
0
    def test_extraction(self) -> None:
        with patch.object(SQLAlchemyExtractor,
                          '_get_connection') as mock_connection:
            connection = MagicMock()
            mock_connection.return_value = connection
            sql_execute = MagicMock()
            connection.execute = sql_execute

            sql_execute.return_value = [{
                'database': 'gold',
                'schema': 'scm',
                'name': 'foo',
                'user_email': '*****@*****.**',
                'read_count': 1
            }]

            expected = TableColumnUsage(col_readers=[
                ColumnReader(database='snowflake',
                             cluster='gold',
                             schema='scm',
                             table='foo',
                             column='*',
                             user_email='*****@*****.**',
                             read_count=1)
            ])

            extractor = GenericUsageExtractor()
            extractor.init(self.conf)
            actual = extractor.extract()
            self.assertEqual(expected.__repr__(), actual.__repr__())
            self.assertIsNone(extractor.extract())
Example #5
0
    def test_mysql_serialize(self) -> None:
        col_readers = [
            ColumnReader(database='db',
                         cluster='gold',
                         schema='scm',
                         table='foo',
                         column='*',
                         user_email='*****@*****.**')
        ]
        table_col_usage = TableColumnUsage(col_readers=col_readers)

        actual = []
        record = table_col_usage.next_record()
        while record:
            actual.append(mysql_serializer.serialize_record(record))
            record = table_col_usage.next_record()

        expected_user = {'rk': '*****@*****.**', 'email': '*****@*****.**'}
        expected_usage = {
            'table_rk': 'db://gold.scm/foo',
            'user_rk': '*****@*****.**',
            'read_count': 1
        }
        expected = [expected_user, expected_usage]

        self.assertEqual(expected, actual)
 def setUp(self) -> None:
     col_readers = [
         ColumnReader(database='db',
                      cluster='gold',
                      schema='scm',
                      table='foo',
                      column='*',
                      user_email='*****@*****.**')
     ]
     self.table_col_usage = TableColumnUsage(col_readers=col_readers)
    def test_serialize(self) -> None:

        col_readers = [
            ColumnReader(database='db',
                         cluster='gold',
                         schema='scm',
                         table='foo',
                         column='*',
                         user_email='*****@*****.**')
        ]
        table_col_usage = TableColumnUsage(col_readers=col_readers)

        node_row = table_col_usage.next_node()
        actual = []
        while node_row:

            actual.append(neo4_serializer.serialize_node(node_row))
            node_row = table_col_usage.next_node()

        expected = [{
            'first_name': '',
            'last_name': '',
            'full_name': '',
            'employee_type': '',
            'is_active:UNQUOTED': True,
            'updated_at:UNQUOTED': 0,
            'LABEL': 'User',
            'slack_id': '',
            'KEY': '*****@*****.**',
            'github_username': '',
            'team_name': '',
            'email': '*****@*****.**',
            'role_name': ''
        }]
        self.assertEqual(expected, actual)

        rel_row = table_col_usage.next_relation()
        actual = []
        while rel_row:
            actual.append(neo4_serializer.serialize_relationship(rel_row))
            rel_row = table_col_usage.next_relation()

        expected = [{
            'read_count:UNQUOTED': 1,
            'END_KEY': '*****@*****.**',
            'START_LABEL': 'Table',
            'END_LABEL': 'User',
            'START_KEY': 'db://gold.scm/foo',
            'TYPE': 'READ_BY',
            'REVERSE_TYPE': 'READ'
        }]
        self.assertEqual(expected, actual)
    def extract(self):
        # type: () -> Optional[TableColumnUsage]
        """
        It aggregates all count per table and user in memory. Table level aggregation don't expect to occupy much
        memory.
        :return: Provides a record or None if no more to extract
        """
        count_map = {}  # type: Dict[TableColumnUsageTuple, int]
        record = self._extractor.extract()

        count = 0
        while record:
            count += 1
            if count % 1000 == 0:
                LOGGER.info('Aggregated {} records'.format(count))

            tbl_col_usg = self._transformer.transform(record=record)
            record = self._extractor.extract()
            # filtered case
            if not tbl_col_usg:
                continue

            for col_rdr in tbl_col_usg.col_readers:
                key = TableColumnUsageTuple(database=col_rdr.database,
                                            cluster=col_rdr.cluster,
                                            schema=col_rdr.schema,
                                            table=col_rdr.table,
                                            column=col_rdr.column,
                                            email=col_rdr.user_email)
                new_count = count_map.get(key, 0) + col_rdr.read_count
                count_map[key] = new_count

        if not len(count_map):
            return None

        col_readers = []  # type: List[ColumnReader]

        while len(count_map):
            tbl_col_rdr_tuple, count = count_map.popitem()
            col_readers.append(
                ColumnReader(database=tbl_col_rdr_tuple.database,
                             cluster=tbl_col_rdr_tuple.cluster,
                             schema=tbl_col_rdr_tuple.schema,
                             table=tbl_col_rdr_tuple.table,
                             column=tbl_col_rdr_tuple.column,
                             user_email=tbl_col_rdr_tuple.email,
                             read_count=count))

        return TableColumnUsage(col_readers=col_readers)
Example #9
0
 def _get_extract_iter(self) -> Iterator[TableColumnUsage]:
     """
     Using raw level iterator, it groups to table and yields TableColumnUsage
     :return:
     """
     for row in self._get_raw_extract_iter():
         col_readers = []
         col_readers.append(
             ColumnReader(database=self._database_key,
                          cluster=row["database"],
                          schema=row["schema"],
                          table=row["name"],
                          column="*",
                          user_email=row["user_email"],
                          read_count=row["read_count"]))
         yield TableColumnUsage(col_readers=col_readers)
    def test(self):
        # type: () -> None
        config = ConfigFactory.from_dict({
            SqlToTblColUsageTransformer.DATABASE_NAME:
            'database',
            SqlToTblColUsageTransformer.USER_EMAIL_ATTRIBUTE_NAME:
            'email',
            SqlToTblColUsageTransformer.SQL_STATEMENT_ATTRIBUTE_NAME:
            'statement'
        })

        with patch.object(HiveTableMetadataExtractor, 'extract') as mock_extract,\
                patch.object(HiveTableMetadataExtractor, 'init'):
            mock_extract.side_effect = [
                TableMetadata(
                    'hive', 'gold', 'test_schema1', 'test_table1',
                    'test_table1', [
                        ColumnMetadata('test_id1',
                                       'description of test_table1', 'bigint',
                                       0),
                        ColumnMetadata('test_id2', 'description of test_id2',
                                       'bigint', 1),
                        ColumnMetadata('is_active', None, 'boolean', 2),
                        ColumnMetadata('source', 'description of source',
                                       'varchar', 3),
                        ColumnMetadata('etl_created_at',
                                       'description of etl_created_at',
                                       'timestamp', 4),
                        ColumnMetadata('ds', None, 'varchar', 5)
                    ]), None
            ]

            transformer = SqlToTblColUsageTransformer()
            transformer.init(config)
            foo = Foo(email='*****@*****.**',
                      statement='SELECT foo, bar FROM test_table1')

            actual = transformer.transform(foo)
            expected = TableColumnUsage(col_readers=[
                ColumnReader(database=u'database',
                             cluster=u'gold',
                             schema='test_schema1',
                             table='test_table1',
                             column='*',
                             user_email='*****@*****.**')
            ])
            self.assertEqual(expected.__repr__(), actual.__repr__())
Example #11
0
    def transform(self, record):
        # type: (Any) -> Optional[TableColumnUsage]
        SqlToTblColUsageTransformer.total_counts += 1

        stmt = getattr(record, self._sql_stmt_attr)
        email = getattr(record, self._user_email_attr)

        result = []  # type: List[ColumnReader]
        try:
            columns = self._worker_pool.apply_async(
                ColumnUsageProvider.get_columns,
                (stmt, )).get(self._time_out_sec)
            # LOGGER.info('Statement: {} ---> columns: {}'.format(stmt, columns))
        except TimeoutError:
            SqlToTblColUsageTransformer.failure_counts += 1
            LOGGER.exception(
                'Timed out while getting column usage from query: {}'.format(
                    stmt))
            LOGGER.info('Killing the thread.')
            self._worker_pool.terminate()
            self._worker_pool = Pool(processes=1)
            LOGGER.info('Killed the thread.')
            return None
        except Exception:
            SqlToTblColUsageTransformer.failure_counts += 1
            if self._log_all_extraction_failures:
                LOGGER.exception(
                    'Failed to get column usage from query: {}'.format(stmt))
            return None

        # Dedupe is needed to make it table level. TODO: Remove this once we are at column level
        dedupe_tuples = set()  # type: set
        for col in columns:
            sub_result = self._get_col_readers(table=col.table,
                                               stmt=stmt,
                                               email=email,
                                               dedupe_tuples=dedupe_tuples)
            result.extend(sub_result)

        if not result:
            return None

        return TableColumnUsage(col_readers=result)
Example #12
0
    def transform(self, record: Tuple[TableColumnUsageTuple, int]) -> Optional[TableColumnUsage]:
        if not record:
            return None

        (key, count) = record

        if not isinstance(key, TableColumnUsageTuple):
            raise Exception("BigqueryUsageTransformer expects record of type TableColumnUsageTuple")

        col_readers = []
        col_readers.append(ColumnReader(database=key.database,
                                        cluster=key.cluster,
                                        schema=key.schema,
                                        table=key.table,
                                        column=key.column,
                                        user_email=key.email,
                                        read_count=count))

        return TableColumnUsage(col_readers=col_readers)
Example #13
0
    def test_aggregate(self):
        # type: () -> None
        with patch.object(RegexStrReplaceTransformer, 'init'),\
                patch.object(SqlToTblColUsageTransformer, 'init'),\
                patch.object(RegexStrReplaceTransformer, 'transform'),\
                patch.object(SqlToTblColUsageTransformer, 'transform') as mock_sql_transform:

            raw_extractor = MagicMock()
            mock_raw_extractor = MagicMock()
            raw_extractor.extract = mock_raw_extractor
            raw_extractor.get_scope.return_value = 'foo'

            # Just to iterate 5 times
            mock_raw_extractor.side_effect = ['foo', 'bar', 'foo', 'bar', None]

            conf = ConfigFactory.from_dict({RAW_EXTRACTOR: raw_extractor})

            mock_sql_transform.side_effect = [
                TableColumnUsage(col_readers=[
                    ColumnReader(database='database',
                                 cluster='gold',
                                 schema='test_schema1',
                                 table='test_table1',
                                 column='*',
                                 user_email='*****@*****.**')
                ]),
                TableColumnUsage(col_readers=[
                    ColumnReader(database='database',
                                 cluster='gold',
                                 schema='test_schema1',
                                 table='test_table1',
                                 column='*',
                                 user_email='*****@*****.**',
                                 read_count=2)
                ]),
                TableColumnUsage(col_readers=[
                    ColumnReader(database='database',
                                 cluster='gold',
                                 schema='test_schema1',
                                 table='test_table2',
                                 column='*',
                                 user_email='*****@*****.**',
                                 read_count=5)
                ]), None
            ]

            extractor = TblColUsgAggExtractor()
            extractor.init(conf)
            actual = extractor.extract()
            expected = TableColumnUsage(col_readers=[
                ColumnReader(database='database',
                             cluster='gold',
                             schema='test_schema1',
                             table='test_table1',
                             column='*',
                             user_email='*****@*****.**',
                             read_count=3),
                ColumnReader(database='database',
                             cluster='gold',
                             schema='test_schema1',
                             table='test_table2',
                             column='*',
                             user_email='*****@*****.**',
                             read_count=5)
            ])

            self.assertEqual(expected.__repr__(), actual.__repr__())