def test_deletes_expired_and_unused_users(self):
    ensure_databases_configured().handle()
    user = UserFactory(email='*****@*****.**')
    st = SourceTableFactory(
        dataset=MasterDataSetFactory.create(
            user_access_type='REQUIRES_AUTHENTICATION'
        )
    )

    source_tables = source_tables_for_user(user)
    db_role_schema_suffix = db_role_schema_suffix_for_user(user)
    user_creds_to_drop = new_private_database_credentials(
        db_role_schema_suffix,
        source_tables,
        postgres_user(user.email),
        user,
        valid_for=datetime.timedelta(days=31),
    )
    qs_creds_to_drop = new_private_database_credentials(
        db_role_schema_suffix,
        source_tables,
        postgres_user(user.email, suffix='qs'),
        user,
        valid_for=datetime.timedelta(seconds=0),
    )
    qs_creds_to_keep = new_private_database_credentials(
        db_role_schema_suffix,
        source_tables,
        postgres_user(user.email, suffix='qs'),
        user,
        valid_for=datetime.timedelta(minutes=1),
    )
    connections[st.database.memorable_name].cursor().execute('COMMIT')

    # Make sure that `qs_creds_to_drop` has definitely expired
    time.sleep(1)

    with mock.patch('dataworkspace.apps.applications.utils.gevent.sleep'):
        delete_unused_datasets_users()

    with connections[st.database.memorable_name].cursor() as cursor:
        cursor.execute(
            "SELECT usename FROM pg_catalog.pg_user WHERE usename IN %s",
            [
                (
                    user_creds_to_drop[0]['db_user'],
                    qs_creds_to_drop[0]['db_user'],
                    qs_creds_to_keep[0]['db_user'],
                )
            ],
        )
        assert cursor.fetchall() == [(qs_creds_to_keep[0]['db_user'],)]

def test_new_credentials_have_pgaudit_configuration(self):
    ensure_databases_configured().handle()
    user = UserFactory(email="*****@*****.**")
    st = SourceTableFactory(
        dataset=MasterDataSetFactory.create(
            user_access_type=UserAccessType.REQUIRES_AUTHENTICATION
        )
    )

    source_tables = source_tables_for_user(user)
    db_role_schema_suffix = db_role_schema_suffix_for_user(user)
    user_creds_to_drop = new_private_database_credentials(
        db_role_schema_suffix,
        source_tables,
        postgres_user(user.email),
        user,
        valid_for=datetime.timedelta(days=1),
    )
    connections[st.database.memorable_name].cursor().execute("COMMIT")

    rolename = user_creds_to_drop[0]["db_user"]
    query = f"SELECT rolname, rolconfig FROM pg_roles WHERE rolname = '{rolename}';"
    with connections[st.database.memorable_name].cursor() as cursor:
        cursor.execute(query)
        results = cursor.fetchall()

    assert "pgaudit.log=ALL" in results[0][1]
    assert "pgaudit.log_catalog=off" in results[0][1]

def spawn(
    name,
    user_id,
    tag,
    application_instance_id,
    spawner_options,
):
    user = get_user_model().objects.get(pk=user_id)
    application_instance = ApplicationInstance.objects.get(id=application_instance_id)

    (source_tables, db_role_schema_suffix, db_user) = (
        (
            source_tables_for_user(user),
            db_role_schema_suffix_for_user(user),
            postgres_user(user.email),
        )
        if application_instance.application_template.application_type == 'TOOL'
        else (
            source_tables_for_app(application_instance.application_template),
            db_role_schema_suffix_for_app(application_instance.application_template),
            postgres_user(application_instance.application_template.host_basename),
        )
    )

    credentials = new_private_database_credentials(
        db_role_schema_suffix,
        source_tables,
        db_user,
        user,
        valid_for=datetime.timedelta(days=31),
    )

    if application_instance.application_template.application_type == 'TOOL':
        # For AppStream to access credentials
        write_credentials_to_bucket(user, credentials)

    app_schema = f'{USER_SCHEMA_STEM}{db_role_schema_suffix}'

    get_spawner(name).spawn(
        user,
        tag,
        application_instance,
        spawner_options,
        credentials,
        app_schema,
    )

def handle(self, *args, **options):
    self.stdout.write('store_db_creds_in_s3 started')

    bucket = settings.NOTEBOOKS_BUCKET
    self.stdout.write('Will store credentials in bucket {}'.format(bucket))

    all_users = get_user_model().objects.order_by('last_name', 'first_name', 'id')
    for user in all_users:
        self.stdout.write(f'Creating credentials for {user.email}')

        source_tables = source_tables_for_user(user)
        db_role_schema_suffix = db_role_schema_suffix_for_user(user)

        creds = new_private_database_credentials(
            db_role_schema_suffix, source_tables, postgres_user(user.email)
        )
        write_credentials_to_bucket(user, creds)

        self.stdout.write(str(creds))

    self.stdout.write(self.style.SUCCESS('store_db_creds_in_s3 finished'))

def handle(self, *args, **options): self.stdout.write("store_db_creds_in_s3 started") bucket = settings.NOTEBOOKS_BUCKET self.stdout.write("Will store credentials in bucket {}".format(bucket)) all_users = get_user_model().objects.order_by("last_name", "first_name", "id") for user in all_users: self.stdout.write(f"Creating credentials for {user.email}") source_tables = source_tables_for_user(user) db_role_schema_suffix = db_role_schema_suffix_for_user(user) creds = new_private_database_credentials( db_role_schema_suffix, source_tables, postgres_user(user.email), user, valid_for=datetime.timedelta(days=31), ) write_credentials_to_bucket(user, creds) self.stdout.write(str(creds)) self.stdout.write(self.style.SUCCESS("store_db_creds_in_s3 finished"))
def test_postgres_user_is_restricted_to_63_chars(
    self, email, suffix, expected_match, expected_length
):
    username = postgres_user(email, suffix=suffix)
    assert re.match(expected_match, username)
    assert len(username) == expected_length

def test_very_long_suffix_raises_value_error(self):
    with pytest.raises(ValueError):
        postgres_user(
            "*****@*****.**",
            suffix="my-very-long-suffix-that-uses-too-many-characters",
        )

def application_api_PUT(request, public_host):
    # A transaction is unnecessary: the single_running_or_spawning_integrity
    # key prevents duplicate spawning/running applications at the same
    # public host
    try:
        application_instance = get_api_visible_application_instance_by_public_host(
            public_host
        )
    except ApplicationInstance.DoesNotExist:
        pass
    else:
        return JsonResponse(
            {'message': 'Application instance already exists'}, status=409
        )

    try:
        (
            application_template,
            tag,
            _,
            commit_id,
        ) = application_template_tag_user_commit_from_host(public_host)
    except ApplicationTemplate.DoesNotExist:
        return JsonResponse(
            {'message': 'Application template does not exist'}, status=400
        )

    app_type = application_template.application_type

    (source_tables, db_role_schema_suffix, db_user) = (
        (
            source_tables_for_user(request.user),
            db_role_schema_suffix_for_user(request.user),
            postgres_user(request.user.email),
        )
        if app_type == 'TOOL'
        else (
            source_tables_for_app(application_template),
            db_role_schema_suffix_for_app(application_template),
            postgres_user(application_template.host_basename),
        )
    )

    credentials = new_private_database_credentials(
        db_role_schema_suffix, source_tables, db_user
    )

    if app_type == 'TOOL':
        # For AppStream to access credentials
        write_credentials_to_bucket(request.user, credentials)

    try:
        memory, cpu = request.GET['__memory_cpu'].split('_')
    except KeyError:
        memory = None
        cpu = None

    spawner_options = json.dumps(application_options(application_template))

    try:
        application_instance = ApplicationInstance.objects.create(
            owner=request.user,
            application_template=application_template,
            spawner=application_template.spawner,
            spawner_application_template_options=spawner_options,
            spawner_application_instance_id=json.dumps({}),
            public_host=public_host,
            state='SPAWNING',
            single_running_or_spawning_integrity=public_host,
            cpu=cpu,
            memory=memory,
            commit_id=commit_id,
        )
    except IntegrityError:
        application_instance = get_api_visible_application_instance_by_public_host(
            public_host
        )
    else:
        # The database users are stored so when the database users are cleaned up,
        # we know _not_ to delete any users used by running or spawning apps
        for creds in credentials:
            ApplicationInstanceDbUsers.objects.create(
                application_instance=application_instance,
                db_id=creds['db_id'],
                db_username=creds['db_user'],
            )

        app_schema = f'{USER_SCHEMA_STEM}{db_role_schema_suffix}'

        spawn.delay(
            application_template.spawner,
            request.user.email,
            str(request.user.profile.sso_id),
            tag,
            application_instance.id,
            spawner_options,
            credentials,
            app_schema,
        )

    return JsonResponse(api_application_dict(application_instance), status=200)

def get_user_explorer_connection_settings(user, alias):
    from dataworkspace.apps.explorer.connections import (  # pylint: disable=import-outside-toplevel
        connections,
    )

    if not alias:
        alias = settings.EXPLORER_DEFAULT_CONNECTION

    if alias not in connections:
        raise InvalidExplorerConnectionException(
            'Attempted to access connection %s, but that is not a registered Explorer connection.'
            % alias
        )

    def get_available_user_connections(_user_credentials):
        return {data['memorable_name']: data for data in _user_credentials}

    with cache.lock(
        f'get-explorer-connection-{user.profile.sso_id}',
        blocking_timeout=30,
        timeout=180,
    ):
        cache_key = user_cached_credentials_key(user)
        user_credentials = cache.get(cache_key, None)

        # Make sure that the connection settings are still valid
        if user_credentials:
            db_aliases_to_credentials = get_available_user_connections(user_credentials)
            try:
                with user_explorer_connection(db_aliases_to_credentials[alias]):
                    pass
            except psycopg2.OperationalError:
                logger.exception(
                    "Unable to connect using existing cached explorer credentials for %s",
                    user,
                )
                user_credentials = None

        if not user_credentials:
            db_role_schema_suffix = db_role_schema_suffix_for_user(user)
            source_tables = source_tables_for_user(user)
            db_user = postgres_user(user.email, suffix='explorer')
            duration = timedelta(hours=24)
            cache_duration = (duration - timedelta(minutes=15)).total_seconds()

            user_credentials = new_private_database_credentials(
                db_role_schema_suffix,
                source_tables,
                db_user,
                valid_for=duration,
                force_create_for_databases=Database.objects.filter(
                    memorable_name__in=connections.keys()
                ).all(),
            )
            cache.set(cache_key, user_credentials, timeout=cache_duration)

    db_aliases_to_credentials = get_available_user_connections(user_credentials)
    if alias not in db_aliases_to_credentials:
        raise RuntimeError(
            f"The credentials for {user.email} did not include any for the `{alias}` database."
        )

    return db_aliases_to_credentials[alias]

def get_superset_credentials(request):
    superset_endpoint = {
        urlparse(url).netloc: name for name, url in settings.SUPERSET_DOMAINS.items()
    }[request.headers["host"]]

    cache_key = get_cached_credentials_key(
        request.headers["sso-profile-user-id"], superset_endpoint
    )

    response = cache.get(cache_key, None)
    if not response:
        dw_user = get_user_model().objects.get(
            profile__sso_id=request.headers["sso-profile-user-id"]
        )
        if not dw_user.user_permissions.filter(
            codename="start_all_applications",
            content_type=ContentType.objects.get_for_model(ApplicationInstance),
        ).exists():
            return HttpResponse("Unauthorized", status=401)

        duration = timedelta(hours=24)
        cache_duration = (duration - timedelta(minutes=15)).total_seconds()

        # Give "public" users full db credentials
        if superset_endpoint == "view":
            dashboards_user_can_access = [
                d.identifier
                for d in VisualisationLink.objects.filter(visualisation_type="SUPERSET")
                if d.visualisation_catalogue_item.user_has_access(dw_user)
            ]
            credentials = [
                {
                    "memorable_name": alias,
                    "db_name": data["NAME"],
                    "db_host": data["HOST"],
                    "db_port": data["PORT"],
                    "db_user": data["USER"],
                    "db_password": data["PASSWORD"],
                }
                for alias, data in settings.DATABASES_DATA.items()
            ]
        # Give "editor"/"admin" users temp private credentials
        else:
            dashboards_user_can_access = []
            source_tables = source_tables_for_user(dw_user)
            db_role_schema_suffix = stable_identification_suffix(
                str(dw_user.profile.sso_id), short=True
            )

            credentials = new_private_database_credentials(
                db_role_schema_suffix,
                source_tables,
                postgres_user(dw_user.email, suffix="superset"),
                dw_user,
                valid_for=duration,
            )

        response = {
            "credentials": credentials[0],
            "dashboards": dashboards_user_can_access,
        }
        cache.set(cache_key, response, timeout=cache_duration)

    return JsonResponse(response)

def sync_quicksight_users(data_client, user_client, account_id, quicksight_user_list):
    for quicksight_user in quicksight_user_list:
        user_arn = quicksight_user['Arn']
        user_email = quicksight_user['Email']
        user_role = quicksight_user['Role']
        user_username = quicksight_user['UserName']

        if user_role not in {"AUTHOR", "ADMIN"}:
            logger.info("Skipping %s with role %s.", user_email, user_role)
            continue

        try:
            # Lightly enforce that only one instance can edit permissions for a user at a time.
            with cache.lock(
                f"sync-quicksight-permissions-{user_arn}",
                blocking_timeout=60,
                timeout=360,
            ):
                try:
                    if user_role == "ADMIN":
                        user_client.update_user(
                            AwsAccountId=account_id,
                            Namespace='default',
                            Role=user_role,
                            UnapplyCustomPermissions=True,
                            UserName=user_username,
                            Email=user_email,
                        )
                    else:
                        user_client.update_user(
                            AwsAccountId=account_id,
                            Namespace="default",
                            Role=user_role,
                            CustomPermissionsName=settings.QUICKSIGHT_AUTHOR_CUSTOM_PERMISSIONS,
                            UserName=user_username,
                            Email=user_email,
                        )
                except botocore.exceptions.ClientError as e:
                    if e.response['Error']['Code'] == 'ResourceNotFoundException':
                        continue  # Can be raised if the user has been deactivated/"deleted"
                    raise e

                dw_user = get_user_model().objects.filter(email=user_email).first()
                if not dw_user:
                    logger.info(
                        "Skipping %s - cannot match with Data Workspace user.",
                        user_email,
                    )
                    continue

                # We technically ignore the case where a single email has multiple matches on DW, but I'm not
                # sure this is a case that can happen - and if it can, we don't care while prototyping.
                logger.info("Syncing QuickSight resources for %s", dw_user)
                source_tables = source_tables_for_user(dw_user)
                db_role_schema_suffix = stable_identification_suffix(
                    user_arn, short=True
                )

                # This creates a DB user for each of our datasets DBs. These users are intended to be long-lived,
                # so they might already exist. If this is the case, we still generate a new password, as at the moment
                # these user accounts only last for 31 days by default - so we need to update the password to keep them
                # from expiring.
                creds = new_private_database_credentials(
                    db_role_schema_suffix,
                    source_tables,
                    postgres_user(user_email, suffix='qs'),
                    valid_for=datetime.timedelta(
                        days=7
                    ),  # We refresh these creds every night, so they don't need to last long at all.
                )

                create_update_delete_quicksight_user_data_sources(
                    data_client, account_id, quicksight_user, creds
                )
        except redis.exceptions.LockError:
            logger.exception(
                "Unable to sync permissions for %s", quicksight_user['Arn']
            )

def sync_quicksight_permissions(
    user_sso_ids_to_update=tuple(), poll_for_user_creation=False
):
    logger.info(
        'sync_quicksight_user_datasources(%s, poll_for_user_creation=%s) started',
        user_sso_ids_to_update,
        poll_for_user_creation,
    )

    # QuickSight manages users in a single specific region
    user_client = boto3.client(
        'quicksight', region_name=settings.QUICKSIGHT_USER_REGION
    )
    # Data sources can be in other regions - so here we use the Data Workspace default from its env vars.
    data_client = boto3.client('quicksight')
    account_id = boto3.client('sts').get_caller_identity().get('Account')

    quicksight_user_list: List[Dict[str, str]]
    if len(user_sso_ids_to_update) > 0:
        quicksight_user_list = []
        for user_sso_id in user_sso_ids_to_update:
            # Poll for the user for 5 minutes
            attempts = (5 * 60) if poll_for_user_creation else 1
            for _ in range(attempts):
                attempts -= 1
                try:
                    quicksight_user_list.append(
                        user_client.describe_user(
                            AwsAccountId=account_id,
                            Namespace='default',
                            # \/ This is the format of the user name created by DIT SSO \/
                            UserName=f'quicksight_federation/{user_sso_id}',
                        )['User']
                    )
                    break
                except botocore.exceptions.ClientError as e:
                    if e.response['Error']['Code'] == 'ResourceNotFoundException':
                        if attempts > 0:
                            gevent.sleep(1)
                        elif poll_for_user_creation:
                            logger.exception(
                                "Did not find user with sso id `%s` after 5 minutes",
                                user_sso_id,
                            )
                    else:
                        raise e
    else:
        quicksight_user_list: List[Dict[str, str]] = user_client.list_users(
            AwsAccountId=account_id, Namespace='default'
        )['UserList']

    for quicksight_user in quicksight_user_list:
        user_arn = quicksight_user['Arn']
        user_email = quicksight_user['Email']
        user_role = quicksight_user['Role']
        user_username = quicksight_user['UserName']

        if user_role not in {"AUTHOR", "ADMIN"}:
            logger.info("Skipping %s with role %s.", user_email, user_role)
            continue

        try:
            # Lightly enforce that only one instance can edit permissions for a user at a time.
            with cache.lock(
                f"sync-quicksight-permissions-{user_arn}",
                blocking_timeout=60,
                timeout=360,
            ):
                try:
                    if user_role == "ADMIN":
                        user_client.update_user(
                            AwsAccountId=account_id,
                            Namespace='default',
                            Role=user_role,
                            UnapplyCustomPermissions=True,
                            UserName=user_username,
                            Email=user_email,
                        )
                    else:
                        user_client.update_user(
                            AwsAccountId=account_id,
                            Namespace="default",
                            Role=user_role,
                            CustomPermissionsName=settings.QUICKSIGHT_AUTHOR_CUSTOM_PERMISSIONS,
                            UserName=user_username,
                            Email=user_email,
                        )
                except botocore.exceptions.ClientError as e:
                    if e.response['Error']['Code'] == 'ResourceNotFoundException':
                        continue  # Can be raised if the user has been deactivated/"deleted"
                    raise e

                dw_user = get_user_model().objects.filter(email=user_email).first()
                if not dw_user:
                    logger.info(
                        "Skipping %s - cannot match with Data Workspace user.",
                        user_email,
                    )
                    continue

                # We technically ignore the case where a single email has multiple matches on DW, but I'm not
                # sure this is a case that can happen - and if it can, we don't care while prototyping.
                logger.info("Syncing QuickSight resources for %s", dw_user)
                source_tables = source_tables_for_user(dw_user)
                db_role_schema_suffix = stable_identification_suffix(
                    user_arn, short=True
                )

                # This creates a DB user for each of our datasets DBs. These users are intended to be long-lived,
                # so they might already exist. If this is the case, we still generate a new password, as at the moment
                # these user accounts only last for 31 days by default - so we need to update the password to keep them
                # from expiring.
                creds = new_private_database_credentials(
                    db_role_schema_suffix,
                    source_tables,
                    postgres_user(user_email, suffix='qs'),
                    valid_for=datetime.timedelta(
                        days=7
                    ),  # We refresh these creds every night, so they don't need to last long at all.
                )

                create_update_delete_quicksight_user_data_sources(
                    data_client, account_id, quicksight_user, creds
                )
        except redis.exceptions.LockError:
            logger.exception(
                "Unable to sync permissions for %s", quicksight_user['Arn']
            )

    logger.info(
        'sync_quicksight_user_datasources(%s, poll_for_user_creation=%s) finished',
        user_sso_ids_to_update,
        poll_for_user_creation,
    )

def sync_quicksight_permissions(user_sso_ids_to_update=tuple()):
    try:
        # Lightly enforce that only one instance is running the task at a time. The job normally takes just a few minutes.
        with cache.lock(
            "sync-quicksight-permissions", blocking_timeout=360, timeout=3600
        ):
            logger.info(
                f'sync_quicksight_user_datasources({user_sso_ids_to_update}) started'
            )

            # QuickSight manages users in a single specific region
            user_client = boto3.client(
                'quicksight', region_name=settings.QUICKSIGHT_USER_REGION
            )
            # Data sources can be in other regions - so here we use the Data Workspace default from its env vars.
            data_client = boto3.client('quicksight')
            account_id = boto3.client('sts').get_caller_identity().get('Account')

            quicksight_user_list: List[Dict[str, str]]
            if len(user_sso_ids_to_update) > 0:
                quicksight_user_list = []
                for user_sso_id in user_sso_ids_to_update:
                    try:
                        quicksight_user_list.append(
                            user_client.describe_user(
                                AwsAccountId=account_id,
                                Namespace='default',
                                # \/ This is the format of the user name created by DIT SSO \/
                                UserName=f'quicksight_federation/{user_sso_id}',
                            )['User']
                        )
                    except botocore.exceptions.ClientError as e:
                        if e.response['Error']['Code'] == 'ResourceNotFoundException':
                            pass  # If the user isn't an author on QuickSight, just move on.
                        else:
                            raise e
            else:
                quicksight_user_list: List[Dict[str, str]] = user_client.list_users(
                    AwsAccountId=account_id, Namespace='default'
                )['UserList']

            for quicksight_user in quicksight_user_list:
                user_arn = quicksight_user['Arn']
                user_email = quicksight_user['Email']
                user_role = quicksight_user['Role']

                if user_role != 'AUTHOR' and user_role != 'ADMIN':
                    logger.info(f"Skipping {user_email} with role {user_role}.")
                    continue

                dw_user = get_user_model().objects.filter(email=user_email).first()
                if not dw_user:
                    logger.info(
                        f"Skipping {user_email} - cannot match with Data Workspace user."
                    )
                    continue
                else:
                    # We technically ignore the case where a single email has multiple matches on DW, but I'm not
                    # sure this is a case that can happen - and if it can, we don't care while prototyping.
                    logger.info(f"Syncing QuickSight resources for {dw_user}")
                    source_tables = source_tables_for_user(dw_user)
                    db_role_schema_suffix = stable_identification_suffix(user_arn)

                    # This creates a DB user for each of our datasets DBs. These users are intended to be long-lived,
                    # so they might already exist. If this is the case, we still generate a new password, as at the moment
                    # these user accounts only last for 31 days by default - so we need to update the password to keep them
                    # from expiring.
                    creds = new_private_database_credentials(
                        db_role_schema_suffix,
                        source_tables,
                        postgres_user(user_email, suffix='qs'),
                        valid_for=datetime.timedelta(
                            days=7
                        ),  # We refresh these creds every night, so they don't need to last long at all.
                    )

                    create_update_delete_quicksight_user_data_sources(
                        data_client, account_id, quicksight_user, creds
                    )

            logger.info(
                f'sync_quicksight_user_datasources({user_sso_ids_to_update}) finished'
            )
    except redis.exceptions.LockError:
        pass

def form_valid(self, form):
    # call new_private_database_credentials to make sure everything is set
    config = settings.DATAFLOW_API_CONFIG
    user = self.request.user
    source_tables = source_tables_for_user(user)
    db_role_schema_suffix = db_role_schema_suffix_for_user(user)
    db_user = postgres_user(user.email)
    duration = timedelta(hours=24)

    cleaned = form.cleaned_data

    if cleaned["schema"] not in self.all_schemas:
        new_private_database_credentials(
            db_role_schema_suffix,
            source_tables,
            db_user,
            user,
            duration,
        )

    file_info = get_s3_csv_file_info(cleaned["path"])
    logger.info(file_info)

    for field in file_info["column_definitions"]:
        field["data_type"] = SCHEMA_POSTGRES_DATA_TYPE_MAP.get(
            cleaned[field["column_name"]], PostgresDataTypes.TEXT
        )

    import_path = settings.DATAFLOW_IMPORTS_BUCKET_ROOT + "/" + cleaned["path"]
    logger.debug("import_path %s", import_path)
    copy_file_to_uploads_bucket(cleaned["path"], import_path)

    filename = cleaned["path"].split("/")[-1]
    logger.debug(filename)

    conf = {
        "file_path": import_path,
        "schema_name": cleaned["schema"],
        "table_name": cleaned["table_name"],
        "column_definitions": file_info["column_definitions"],
        "encoding": file_info["encoding"],
    }
    logger.debug(conf)

    if cleaned["schema"] not in self.all_schemas:
        conf["db_role"] = cleaned["schema"]

    try:
        response = trigger_dataflow_dag(
            conf,
            config["DATAFLOW_S3_IMPORT_DAG"],
            f'{cleaned["schema"]}-{cleaned["table_name"]}-{datetime.now().isoformat()}',
        )
    except HTTPError:
        return HttpResponseRedirect(
            f'{reverse("your-files:create-table-failed")}?' f"filename={filename}"
        )

    params = {
        "filename": filename,
        "schema": cleaned["schema"],
        "table_name": cleaned["table_name"],
        "execution_date": response["execution_date"],
    }
    return HttpResponseRedirect(
        f'{reverse("your-files:create-table-validating")}?{urlencode(params)}'
    )