def table_exists(schema_name, table_name):
    """Return True if the named table exists within the given schema."""
    exists_query = sql.SQL(
        'SELECT EXISTS(SELECT 1 FROM information_schema.tables WHERE table_schema = {schemaname} AND table_name = {table_name})'
    ).format(
        schemaname=sql.Literal(schema_name),
        table_name=sql.Literal(table_name))
    with core_utils.PostgreSQLCursor(db_schema=schema_name) as (psql_conn, psql_cursor):
        psql_cursor.execute(exists_query)
        (exists,) = psql_cursor.fetchone()
        return exists
def materialized_view_exists(schemaname, matviewname):
    """Return True if a materialized view with this name exists in the schema."""
    with core_utils.PostgreSQLCursor(db_schema=schemaname) as (psql_conn, psql_cursor):
        exists_query = sql.SQL(
            'SELECT EXISTS(SELECT 1 FROM pg_matviews WHERE schemaname = {schemaname} AND matviewname = {matviewname})'
        ).format(
            schemaname=sql.Literal(str(schemaname)),
            matviewname=sql.Literal(matviewname))
        psql_cursor.execute(exists_query)
        (exists,) = psql_cursor.fetchone()
        return exists
def cron_job_exists(schemaname, job_id):
    """Return True if an active cron job with this ID exists.

    TODO: Reconcile the jobs Django model with the pg_cron table, because
    they're two separate trackers for the same thing. For now, referencing
    cron.job because it's the source of truth.
    """
    with core_utils.PostgreSQLCursor(db_schema=schemaname) as (psql_conn, psql_cursor):
        exists_query = sql.SQL(
            "SELECT EXISTS(SELECT 1 FROM cron.job WHERE jobid = {job_id} AND active = 't')"
        ).format(job_id=sql.Literal(str(job_id)))
        psql_cursor.execute(exists_query)
        (exists,) = psql_cursor.fetchone()
        return exists
def post(self, request, format='json'):
    """Create a new user, plus a dedicated PostgreSQL schema for that user.

    The schema (named after the user ID) will hold all of the user's
    subsequent database resources.
    """
    user_serializer = CustomUserSerializer(data=request.data)
    if user_serializer.is_valid():
        user = user_serializer.save()
        if user:
            # The user row saved successfully; provision a per-user schema
            # so that later resources (tables, views) are namespaced.
            with core_utils.PostgreSQLCursor() as (psql_conn, psql_cursor):
                psql_cursor.execute(
                    sql.SQL("CREATE SCHEMA IF NOT EXISTS {}").format(
                        sql.Identifier(str(user.id))))
                psql_conn.commit()
            return Response(user_serializer.data, status=status.HTTP_201_CREATED)
    # Reached when the serializer is invalid, or when save() yielded no user.
    return Response(user_serializer.errors, status=status.HTTP_400_BAD_REQUEST)
def execute_storedproc_trigger_on_refresh():
    """ Calls the SQL file 'trigger_on_refresh.sql' in order to execute a
    PostgreSQL trigger upon refresh of the underlying status table.

    NOTE: This method is separate from `apps.py` and `AppConfig().ready()`,
    since resolution for Django ticket #31658
    (https://code.djangoproject.com/ticket/31658) indicates SQL procedures
    should not be run as part of Django lifecycle methods.

    NOTE: To list all functions in PostgreSQL, open `psql` and run `\\df+`.
    """
    logger = app_logging.get_broker_logger()
    if is_database_synchronized(DEFAULT_DB_ALIAS):
        with core_utils.PostgreSQLCursor() as (psql_conn, psql_cursor):
            stored_procedure_abspath = os.path.abspath(
                os.path.join('channels_app', 'storedprocedures',
                             'trigger_on_refresh.sql'))
            logger.info(
                f'Found stored procedure abspath: {stored_procedure_abspath}')
            # BUG FIX: the original opened this file without ever closing it
            # (resource leak); a context manager closes it deterministically.
            with open(stored_procedure_abspath) as stored_procedure_fp:
                stored_procedure_sql = stored_procedure_fp.read()
            logger.info('Executing SQL file.')
            # Execute SQL file: https://stackoverflow.com/a/50080000
            psql_cursor.execute(stored_procedure_sql)
            psql_conn.commit()
            logger.info('Successfully executed SQL file.')
def broker_proc():
    """ Task definition for broker process to run in compute instance
    background.

    There should only be one process running, since as the payload is
    extremely light and refreshes are periodic, the load on the process
    should be extremely light and there shouldn't be any reason why the
    process should fall over. If liveness issues occur in production,
    consider installing ZMQ pub/sub on the database, and create per-trigger
    channels, and scale out the database and reverse proxy as needed.
    """
    logger = app_logging.get_broker_logger()
    CHANNEL_NAME = 'psql_refreshes_channel'
    with core_utils.PostgreSQLCursor() as (psql_conn, psql_cursor):
        # LISTEN requires autocommit so notifications arrive immediately.
        psql_conn.set_isolation_level(
            psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
        psql_cursor.execute(
            sql.SQL("LISTEN {channel}").format(
                channel=sql.Identifier(CHANNEL_NAME)))
        while True:
            # If Linux select() syscall returns empty, then do not poll for
            # updates using 'psycopg2'.
            if select.select([psql_conn], [], [], 5) == ([], [], []):
                logger.info("No information from handler yet.")
            else:
                psql_conn.poll()
                # NOTE: Need to pop off psql_conn.notifies to avoid an
                # infinite loop while psql_conn.notifies exists. This
                # processes multiple notifications correctly. Processing one
                # event at a time should be fine given the expected load.
                while psql_conn.notifies:
                    notify = psql_conn.notifies.pop(0)
                    payload = json.loads(notify.payload)
                    job_id = payload['job_id']
                    view_id = payload['view_id']
                    # BUG FIX: the original passed values as stray logging
                    # args with no %s placeholder in the message (e.g.
                    # logger.info("Payload: ", payload)), which raises a
                    # logging format error and drops the value. Use lazy
                    # %-style formatting instead.
                    logger.info("Payload: %s", payload)
                    view_name = view_models.MaterializedView.objects.filter(
                        id=view_id)[0].view_name
                    logger.info('Materialized view name: %s', view_name)
                    channels = models.Channel.objects.filter(job_id=job_id)
                    for channel in channels:
                        logger.info('Sending update to channel UUID: %s',
                                    channel.public_identifier)
                        django_eventstream.send_event(
                            str(channel.public_identifier), 'message', {
                                'update_available': 'true',
                                'view_name': view_name
                            })
def post(self, request, *args, **kwargs):
    """ Handles the HTTP POST request.

    Example:

    curl \
        --header "Content-Type: application/json" \
        --header "Authorization: JWT $JWT_ACCESS_TOKEN" \
        --method POST \
        --data '{"crontab_def", "* * * * *", "view_name": "sample_view"}' \
        https://api.tinydevcrm.com/jobs/create/
    """
    def _validate(request):
        """ Validates request.

        Args:
            rest_framework.request.Request

        Returns:
            (bool, dict): (Request is valid, reasons)
        """
        checks = {
            'all_required_keys_are_present': True,
            'crontab_def_is_valid': True,
            'view_exists': True
        }
        if (not request.data.get('crontab_def') or
                not request.data.get('view_name')):
            checks['all_required_keys_are_present'] = False
        # TODO: Implement more comprehensive crontab definition checking.
        # Current validation logic belongs to package 'cron-validator':
        # https://github.com/vcoder4c/cron-validator and coverage /
        # correctness are not as comprehensive as cron definition:
        # https://crontab.guru/
        crontab_def = request.data.get('crontab_def')
        try:
            CronValidator.parse(crontab_def)
        except Exception:
            checks['crontab_def_is_valid'] = False
        view_name = request.data.get('view_name')
        checks['view_exists'] = views_utils.materialized_view_exists(
            str(request.user.id), view_name)
        return (all(checks.values()), checks)

    (request_is_valid, validation_checks) = _validate(request)
    if not request_is_valid:
        return Response(
            f'Request did not pass validation. Checks: {str(validation_checks)}',
            status=status.HTTP_400_BAD_REQUEST)
    crontab_def = request.data.get('crontab_def')
    view_name = request.data.get('view_name')
    view_objects = view_models.MaterializedView.objects.filter(
        user=request.user.id, view_name=view_name)
    if view_objects.count() != 1:
        # BUG FIX: this branch is reached when the count is zero OR greater
        # than one; the old message claimed "more than one" only.
        return Response(
            'Expected exactly one materialized view with the given schema name and view name. Data corrupted.',
            status=status.HTTP_500_INTERNAL_SERVER_ERROR)
    view_id = view_objects.first().id
    with core_utils.PostgreSQLCursor(
            db_schema=request.user.id) as (psql_conn, psql_cursor):
        # NOTE: The 'INSERT' query references the EventRefreshes Django
        # model, and must be updated manually if the model is updated. See
        # 'jobs/models.py' for more information.
        #
        # TODO: Each channel needs a corresponding job ID to listen to. That
        # information must be communicated from the event refreshes table,
        # since that is what the underlying trigger listens to. The job ID
        # does not exist until the PostgreSQL function 'cron.schedule' is
        # run. Therefore, the job creation for materialized view, and
        # inserting into the event refreshes table must be split into two
        # components. This may impact the atomicity of operations (e.g. if a
        # refresh event for a materialized view doesn't work, an event may
        # still be sent over the channel).
        refresh_view_scheduled_query = sql.SQL(
            'REFRESH MATERIALIZED VIEW {view_name}').format(
                view_name=sql.Identifier(str(request.user.id), view_name))
        refresh_view_sql_statement = sql.SQL(
            "SELECT cron.schedule({crontab_def}, '{scheduled_query}')"
        ).format(crontab_def=sql.Literal(crontab_def),
                 scheduled_query=refresh_view_scheduled_query)
        psql_cursor.execute(refresh_view_sql_statement)
        # NOTE: This commit is necessary in order to get the job ID
        # necessary for insertion into the notify_channel_scheduled_query.
        psql_conn.commit()
        # Create the Django CronJob model.
        job_id = psql_cursor.fetchone()[0]
        job_serializer = serializers.CronJobSerializer(
            data={
                'job_ids': [job_id],
                'user': request.user.id,
                'view': view_id
            })
        if not job_serializer.is_valid():
            # BUG FIX: 'errors' is a property on DRF serializers, not a
            # method; the original 'job_serializer.errors()' raised
            # TypeError instead of returning the error payload.
            return Response(job_serializer.errors,
                            status=status.HTTP_500_INTERNAL_SERVER_ERROR)
        cronjob = job_serializer.save()
        # NOTE: Job ID here refers to the CronJob ID, in order to reference
        # the foreign key for the appropriate channel to send this event on.
        notify_channel_scheduled_query = sql.SQL(
            'INSERT INTO jobs_eventrefreshes(job_id, view_id, created, status) VALUES (\'{job_id}\', \'{view_id}\', NOW(), \'{status}\')'
        ).format(job_id=sql.Literal(str(cronjob.id)),
                 view_id=sql.Literal(str(cronjob.view.id)),
                 status=sql.Literal(str(models.EnumStatusTypes.NEW)))
        notify_channel_sql_query = sql.SQL(
            "SELECT cron.schedule({crontab_def}, '{scheduled_query}')"
        ).format(crontab_def=sql.Literal(crontab_def),
                 scheduled_query=notify_channel_scheduled_query)
        psql_cursor.execute(notify_channel_sql_query)
        psql_conn.commit()
        # Update existing cronjob field with the second scheduled job's ID.
        job2_id = psql_cursor.fetchone()[0]
        cronjob.job_ids = cronjob.job_ids + [job2_id]
        cronjob.save(update_fields=['job_ids'])
        job_data = model_to_dict(cronjob)
        return Response(job_data, status=status.HTTP_201_CREATED)
def post(self, request, *args, **kwargs):
    """ Handles the HTTP POST request.

    TODO: Parse the SQL statement and automatically insert the user ID as
    PostgreSQL schema into the raw SQL statement so that the user doesn't
    need to specify the user ID via the API.

    Example:

    - curl \
        --header "Content-Type: application/json" \
        --header "Authorization: JWT $JWT_ACCESS_TOKEN" \
        --method POST \
        --data '{"view_name": "\"1\".\"sample_view\"", "sql_query": "SELECT * FROM \"1\".\"sample_table\""}' \
        https://api.tinydevcrm.com/views/create/
    """
    def _validate(request):
        """ Validates request.

        Args:
            rest_framework.request.Request

        Returns:
            (bool, dict): (Request is valid, reasons)
        """
        checks = {
            'all_required_keys_are_present': True,
            'query_starts_with_select_tables_or_values': True,
            'query_does_not_contain_semicolons': True,
            'view_does_not_exist': True
        }
        if (
            not request.data.get('view_name') or
            not request.data.get('sql_query')
        ):
            checks['all_required_keys_are_present'] = False
        sql_query = request.data.get('sql_query')
        # 'sql_query' matches with internal SQL query:
        # https://www.postgresql.org/docs/12/sql-creatematerializedview.html
        if (
            not sql_query.startswith('SELECT') and
            not sql_query.startswith('TABLE') and
            not sql_query.startswith('VALUES')
        ):
            checks['query_starts_with_select_tables_or_values'] = False
        # Prevent some SQL injection attacks by ensuring SQL query does not
        # have semicolon.
        if ';' in sql_query:
            checks['query_does_not_contain_semicolons'] = False
        checks['view_does_not_exist'] = not views_utils.materialized_view_exists(
            str(request.user.id),
            request.data.get('view_name')
        )
        return (all(checks.values()), checks)

    (is_valid_request, validation_checks) = _validate(request)
    if not is_valid_request:
        return Response(
            f'Request did not pass validation. Checks: {str(validation_checks)}',
            status=status.HTTP_400_BAD_REQUEST
        )
    view_name = request.data.get('view_name')
    sql_query_request = request.data.get('sql_query')
    with core_utils.PostgreSQLCursor(db_schema=request.user.id) as (psql_conn, psql_cursor):
        # SECURITY NOTE: the user-supplied query is spliced in via raw
        # string interpolation, bypassing psycopg2's parameterization; the
        # only guards are the prefix/semicolon checks in _validate above.
        sql_statement = sql.SQL(
            'CREATE MATERIALIZED VIEW {view_name} AS %s WITH DATA'
        ).format(
            view_name=sql.Identifier(view_name)
        )
        sql_statement = sql_statement.as_string(psql_conn)
        sql_statement = sql_statement % sql_query_request
        psql_cursor.execute(
            sql.SQL(sql_statement)
        )
        psql_conn.commit()
    view_serializer = serializers.MaterializedViewSerializer(
        data={
            'view_name': view_name,
            'user': request.user.id
        }
    )
    if view_serializer.is_valid():
        view_serializer.save()
        return Response(
            view_serializer.data,
            status=status.HTTP_201_CREATED
        )
    # BUG FIX: the original fell off the end and returned None (an implicit
    # HTTP 500 in Django) when the serializer was invalid; surface the
    # serializer errors instead.
    return Response(
        view_serializer.errors,
        status=status.HTTP_400_BAD_REQUEST
    )
def post(self, request, *args, **kwargs):
    """ Handles the HTTP POST request.

    These tables are generated from data dumps backed by "concrete data",
    Parquet files available via PostgreSQL extension 'parquet_fdw' (foreign
    data wrapper) that serve as the foundational sources of truth for users.
    This is opposed to "derived data", which is data computed via
    mathematical, logical / relational, or other types of transformations.
    Making this distinction guarantees unitary application data flow by
    making sure each layer is immutable (writes are prohibited for foreign
    tables and only occur during HTTP POST), and that consequently,
    underlying data pipelines are acyclic and easier to manage.

    TODO: A prior version of this method used to create the foreign table
    as-is. If foreign tables are desired, create a separate endpoint for
    supporting only foreign tables.

    TODO: Take the complete possible PostgreSQL 'CREATE TABLE' syntax and
    translate that through a form to get the full functionality of 'CREATE
    TABLE' commands without suffering extraneous security issues.

    TODO: Create a PostgreSQL user during the 'CREATE SCHEMA IF NOT EXISTS'
    in order to apply an authorization for the schema to that user, so that
    `psql -h db.tinydevcrm.com -U $CUSTOM_USERNAME` can work properly.

    Example usage:

    - curl \
        --header "Content-Type: multipart/form-data" \
        --header "Authorization: JWT $JWT_ACCESS_TOKEN" \
        --method POST \
        -F [email protected] \
        -F table_name=sample_table \
        -F columns='[{"column_name": "SomeNumber", "column_type":"int"},{"column_name":"SomeString","column_type":"varchar(256)"}]' \
        https://api.tinydevcrm.com/tables/create/

    NOTE: These keys, such as 'data' and 'file', are very particular to the
    underlying models and serializers. Do not change without testing in
    development.
    """
    def _validate(request):
        """ Validates request data.

        Args:
            rest_framework.request.Request

        Returns:
            (bool, dict): (Request is valid, reasons)
        """
        # TODO: Add check for 'column_types_are_valid'.
        checks = {
            'all_required_keys_are_present': True,
            'column_schema_is_valid': True,
            'table_does_not_exist': True
        }
        if (
            not request.data.get('file') or
            not request.data.get('table_name') or
            not request.data.get('columns')
        ):
            checks['all_required_keys_are_present'] = False
        columns = request.data.get('columns')
        try:
            column_data = json.loads(columns)
            assert type(column_data) is list
            for item in column_data:
                assert type(item) is dict
                assert sorted(item.keys()) == ['column_name', 'column_type']
                # TODO: Add check for column types and column names
        except (Exception, AssertionError):
            checks['column_schema_is_valid'] = False
        checks['table_does_not_exist'] = not table_utils.table_exists(
            str(request.user.id),
            request.data.get('table_name')
        )
        return (
            all(checks.values()),
            checks
        )

    (is_valid, validation_checks) = _validate(request)
    if not is_valid:
        return Response(
            f'Request is not valid: {str(validation_checks)}',
            status=status.HTTP_400_BAD_REQUEST
        )
    file_serializer = serializers.DataFileSerializer(
        # Use the form key 'file=@$FILENAME' in order to send binary files
        # as part of a multipart/form-data request.
        data={
            'file': request.data['file']
        }
    )
    if not file_serializer.is_valid():
        return Response(
            file_serializer.errors,
            status=status.HTTP_400_BAD_REQUEST
        )
    datafile = file_serializer.save()
    table_name = request.data.get('table_name')
    file_abspath = os.path.join(
        settings.MEDIA_ROOT,
        datafile.file.name
    )
    # A uniquely named foreign table stages the Parquet data; it is copied
    # into a concrete table and then dropped.
    temp_table_name = f'temp_{str(request.user.id)}_created_{int(datetime.datetime.now().timestamp())}'
    copy_table_sql_query = sql.SQL(
        'CREATE TABLE {table_name} AS TABLE {temp_table_name} WITH DATA'
    ).format(
        table_name=sql.Identifier(table_name),
        temp_table_name=sql.Identifier(temp_table_name)
    )
    drop_temp_table_sql_query = sql.SQL(
        'DROP FOREIGN TABLE {temp_table_name}'
    ).format(
        temp_table_name=sql.Identifier(temp_table_name)
    )
    # Add error handling logic within this with block if there are numerous
    # HTTP 500 errors that appear in logs.
    with core_utils.PostgreSQLCursor(db_schema=request.user.id) as (psql_conn, psql_cursor):
        # Dynamic column creation makes table creation query much more
        # tricky.
        columns = json.loads(request.data.get('columns'))
        column_names = [
            column_def['column_name']
            for column_def in columns
        ]
        column_types = [
            column_def['column_type'].upper()
            for column_def in columns
        ]
        column_query = sql.SQL(',').join([
            sql.SQL('{} {}').format(
                sql.Identifier(column_name),
                sql.Placeholder()
            )
            for column_name in column_names
        ])
        create_foreign_table_sql_query = sql.SQL(
            'CREATE FOREIGN TABLE {temp_table_name} ({columns}) SERVER parquet_srv OPTIONS (filename {file_abspath});'
        ).format(
            temp_table_name=sql.Identifier(temp_table_name),
            columns=column_query,
            file_abspath=sql.Literal(file_abspath)
        )
        create_foreign_table_sql_query = create_foreign_table_sql_query.as_string(psql_conn)
        # NOTE: Column types are templated via raw string interpolation
        # because psycopg2.sql wraps elements within singly or doubly quoted
        # strings, but column types are not strings. TODO: validate column
        # types against an enumeration of recognized types.
        create_foreign_table_sql_query = create_foreign_table_sql_query % tuple(column_types)
        psql_cursor.execute(create_foreign_table_sql_query)
        psql_cursor.execute(copy_table_sql_query)
        psql_cursor.execute(drop_temp_table_sql_query)
        psql_conn.commit()
    table_serializer = serializers.TableSerializer(
        data={
            'table_name': table_name,
            'user': request.user.id
        }
    )
    table_is_valid = table_serializer.is_valid()
    if table_is_valid:
        table_serializer.save()
    # Clean up the uploaded file regardless of serializer outcome; the data
    # has already been copied into a concrete table above. Deleting the data
    # model does not delete the file. Do that separately.
    datafile.delete()
    os.remove(os.path.abspath(os.path.join(
        settings.MEDIA_ROOT,
        str(datafile.file)
    )))
    if not table_is_valid:
        # BUG FIX: the original fell off the end and returned None (an
        # implicit HTTP 500 in Django) when the serializer was invalid, and
        # leaked the uploaded file in that path.
        return Response(
            table_serializer.errors,
            status=status.HTTP_400_BAD_REQUEST
        )
    return Response(
        table_serializer.data,
        status=status.HTTP_201_CREATED
    )