def __init__(self, ns): self._connection = None self._cache_ts = None self.catalog = ns.catalog self._include_schema = ns.include_schema self._exclude_schema = ns.exclude_schema self._include_table = ns.include_table self._exclude_table = ns.exclude_table self._database = Database( "database", include=self._include_schema, exclude=self._exclude_schema )
def setUp(self): self.explorer = MockExplorer( Namespace( host="mock_connection", include_schema=(), exclude_schema=(), include_table=(), exclude_table=(), catalog=None, )) col1 = Column("c1") col2 = Column("c2") col2._pii = [PiiTypes.LOCATION] schema = Schema("s1") table = Table(schema, "t1") table.add_child(col1) table.add_child(col2) schema = Schema("testSchema") schema.add_child(table) self.explorer._database = Database("database") self.explorer._database.add_child(schema)
def _load_catalog(self): if self._cache_ts is None or self._cache_ts < datetime.now() - timedelta( minutes=10 ): with self._get_context_manager() as cursor: logging.debug("Catalog Query: {0}".format(self._get_catalog_query())) cursor.execute(self._get_catalog_query()) self._database = Database( "database", include=self._include_schema, exclude=self._exclude_schema, ) row = cursor.fetchone() current_schema = None current_table = None if row is not None: current_schema = Schema( row[0], include=self._include_table, exclude=self._exclude_table ) current_table = Table(current_schema, row[1]) while row is not None: if current_schema.get_name() != row[0]: current_schema.add_child(current_table) self._database.add_child(current_schema) current_schema = Schema( row[0], include=self._include_table, exclude=self._exclude_table, ) current_table = Table(current_schema, row[1]) elif current_table.get_name() != row[1]: current_schema.add_child(current_table) current_table = Table(current_schema, row[1]) current_table.add_child(Column(row[2])) row = cursor.fetchone() if current_schema is not None and current_table is not None: current_schema.add_child(current_table) self._database.add_child(current_schema) self._cache_ts = datetime.now()
def setUp(self): self.explorer = MockExplorer( Namespace(host="mock_connection", include_schema=(), exclude_schema=(), include_table=(), exclude_table=(), catalog=None)) col1 = Column('c1') col2 = Column('c2') col2._pii = [PiiTypes.LOCATION] schema = Schema('s1') table = Table(schema, 't1') table.add_child(col1) table.add_child(col2) schema = Schema('testSchema') schema.add_child(table) self.explorer._database = Database('database') self.explorer._database.add_child(schema)
class Explorer(ABC): query_template = "select {column_list} from {schema_name}.{table_name}" _count_query = "select count(*) from {schema_name}.{table_name}" def __init__(self, ns): self._connection = None self._cache_ts = None self.catalog = ns.catalog self._include_schema = ns.include_schema self._exclude_schema = ns.exclude_schema self._include_table = ns.include_table self._exclude_table = ns.exclude_table self._database = Database( "database", include=self._include_schema, exclude=self._exclude_schema ) def __enter__(self): return self def __exit__(self, exc_type, exc_val, exc_tb): self.close_connection() @abstractmethod def _open_connection(self): pass @abstractmethod def _get_catalog_query(self): pass @classmethod def factory(cls, ns): pass @property def small_table_max(self): return 100 @property def database(self): return self._database @classmethod def dispatch(cls, ns): logging.debug("Dispatch of %s" % cls.__name__) explorer = cls.factory(ns) if ns.scan_type is None or ns.scan_type == "deep": explorer.scan() else: explorer.shallow_scan() cls.output(ns, explorer) @classmethod def output(cls, ns, explorer): if ns.catalog["format"] == "ascii_table": headers = ["schema", "table", "column", "has_pii"] tableprint.table(explorer.get_tabular(ns.list_all), headers) elif ns.catalog["format"] == "json": FileStore.save_schemas(explorer) elif ns.catalog["format"] == "db": DbStore.save_schemas(explorer) def get_connection(self): if self._connection is None: self._connection = self._open_connection() return self._connection def close_connection(self): if self._connection is not None: self._connection.close() self._connection = None def scan(self): for schema in self.get_schemas(): schema.scan(self._generate_rows) def shallow_scan(self): for schema in self.get_schemas(): schema.shallow_scan() def get_tabular(self, list_all): tabular = [] for schema in self.get_schemas(): for table in schema.get_children(): for column in table.get_children(): if list_all or column.has_pii(): tabular.append( [ schema.get_name(), table.get_name(), column.get_name(), column.has_pii(), ] ) return tabular def get_dict(self): schemas = [] for schema in self._database.get_children(): schemas.append(schema.get_dict()) return schemas @classmethod def _get_count_query(cls, schema_name, table_name): return cls._count_query.format( schema_name=schema_name.get_name(), table_name=table_name.get_name() ) @classmethod def _get_select_query(cls, schema_name, table_name, column_list): return cls.query_template.format( column_list='"{0}"'.format( '","'.join(col.get_name() for col in column_list) ), schema_name=schema_name.get_name(), table_name=table_name.get_name(), ) @classmethod def _get_sample_query(cls, schema_name, table_name, column_list): raise NotImplementedError def _get_table_count(self, schema_name, table_name): count = self._get_count_query(schema_name, table_name) logging.debug("Count Query: %s" % count) with self._get_context_manager() as cursor: cursor.execute(count) row = cursor.fetchone() return int(row[0]) def _get_query(self, schema_name, table_name, column_list): count = self._get_table_count(schema_name, table_name) logging.debug( "No. of rows in {}.{} is {}".format(schema_name, table_name, count) ) if count < self.small_table_max: logging.debug("Choosing a SELECT query as table size is small") query = self._get_select_query(schema_name, table_name, column_list) else: try: query = self._get_sample_query(schema_name, table_name, column_list) logging.debug("Choosing a SAMPLE query as table size is big") except NotImplementedError: logging.warning( "Sample Row is not implemented for %s" % self.__class__.__name__ ) query = self._get_select_query(schema_name, table_name, column_list) return query def _generate_rows(self, schema_name, table_name, column_list): query = self._get_query(schema_name, table_name, column_list) logging.debug(query) with self._get_context_manager() as cursor: cursor.execute(query) row = cursor.fetchone() while row is not None: yield row row = cursor.fetchone() def _get_context_manager(self): return self.get_connection().cursor() def _load_catalog(self): if self._cache_ts is None or self._cache_ts < datetime.now() - timedelta( minutes=10 ): with self._get_context_manager() as cursor: logging.debug("Catalog Query: {0}".format(self._get_catalog_query())) cursor.execute(self._get_catalog_query()) self._database = Database( "database", include=self._include_schema, exclude=self._exclude_schema, ) row = cursor.fetchone() current_schema = None current_table = None if row is not None: current_schema = Schema( row[0], include=self._include_table, exclude=self._exclude_table ) current_table = Table(current_schema, row[1]) while row is not None: if current_schema.get_name() != row[0]: current_schema.add_child(current_table) self._database.add_child(current_schema) current_schema = Schema( row[0], include=self._include_table, exclude=self._exclude_table, ) current_table = Table(current_schema, row[1]) elif current_table.get_name() != row[1]: current_schema.add_child(current_table) current_table = Table(current_schema, row[1]) current_table.add_child(Column(row[2])) row = cursor.fetchone() if current_schema is not None and current_table is not None: current_schema.add_child(current_table) self._database.add_child(current_schema) self._cache_ts = datetime.now() def get_schemas(self): self._load_catalog() return self._database.get_children() def get_tables(self, schema_name): self._load_catalog() for s in self.get_schemas(): if s.get_name() == schema_name: return s.get_children() raise ValueError("{} schema not found".format(schema_name)) def get_columns(self, schema_name, table_name): self._load_catalog() tables = self.get_tables(schema_name) for t in tables: if t.get_name() == table_name: return t.get_children() raise ValueError("{} table not found".format(table_name))