Example 1
def table_exists(schema_name, table_name):
    """
    Checks whether the table exists.
    """
    with core_utils.PostgreSQLCursor(db_schema=schema_name) as (psql_conn,
                                                                psql_cursor):
        psql_cursor.execute(
            sql.SQL(
                'SELECT EXISTS(SELECT 1 FROM information_schema.tables WHERE table_schema = {schemaname} AND table_name = {table_name})'
            ).format(schemaname=sql.Literal(schema_name),
                     table_name=sql.Literal(table_name)))
        table_exists = psql_cursor.fetchone()[0]
        return table_exists
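
These helpers all lean on core_utils.PostgreSQLCursor, a context manager that yields a (connection, cursor) pair, optionally scoped to a per-user schema. Its implementation is not shown in these examples; the following is a minimal sketch of what such a context manager might look like, assuming psycopg2 and Django's settings.DATABASES for connection parameters. The function name, settings keys, and search_path handling are illustrative assumptions, not the project's actual code.

import contextlib

import psycopg2
from django.conf import settings


@contextlib.contextmanager
def postgresql_cursor(db_schema=None):
    """Yields a (connection, cursor) pair, optionally scoped to one schema."""
    db_settings = settings.DATABASES['default']
    conn = psycopg2.connect(
        dbname=db_settings['NAME'],
        user=db_settings['USER'],
        password=db_settings['PASSWORD'],
        host=db_settings['HOST'],
        port=db_settings['PORT'])
    cursor = conn.cursor()
    try:
        if db_schema is not None:
            # Scope unqualified names in subsequent queries to this schema.
            cursor.execute('SET search_path TO %s', (str(db_schema),))
        yield (conn, cursor)
    finally:
        cursor.close()
        conn.close()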
Example 2
def materialized_view_exists(schemaname, matviewname):
    """
    Checks whether materialized view exists.
    """
    with core_utils.PostgreSQLCursor(db_schema=schemaname) as (psql_conn,
                                                               psql_cursor):
        sql_statement = sql.SQL(
            'SELECT EXISTS(SELECT 1 FROM pg_matviews WHERE schemaname = {schemaname} AND matviewname = {matviewname})'
        ).format(schemaname=sql.Literal(str(schemaname)),
                 matviewname=sql.Literal(matviewname))

        psql_cursor.execute(sql_statement)
        view_exists = psql_cursor.fetchone()[0]

        return view_exists
Example 3
def cron_job_exists(schemaname, job_id):
    """
    Checks whether cron job exists.

    TODO: Reconcile the jobs Django model with the pg_cron table, because
    they're two separate trackers for the same thing. For now, referencing
    cron.job because it's the source of truth.
    """
    with core_utils.PostgreSQLCursor(db_schema=schemaname) as (psql_conn,
                                                               psql_cursor):
        sql_statement = sql.SQL(
            "SELECT EXISTS(SELECT 1 FROM cron.job WHERE jobid = {job_id} AND active = 't')"
        ).format(job_id=sql.Literal(str(job_id)))

        psql_cursor.execute(sql_statement)
        job_exists = psql_cursor.fetchone()[0]

        return job_exists
Example 4
    def post(self, request, format='json'):
        user_serializer = CustomUserSerializer(data=request.data)
        if user_serializer.is_valid():
            user = user_serializer.save()
            if user:
                # If the user is successfully created and saved to the
                # database, create a schema for that user to hold all of that
                # user's subsequent database resources.
                with core_utils.PostgreSQLCursor() as (psql_conn, psql_cursor):
                    create_schema_sql_statement = sql.SQL(
                        "CREATE SCHEMA IF NOT EXISTS {}").format(
                            sql.Identifier(str(user.id)))
                    psql_cursor.execute(create_schema_sql_statement)
                    psql_conn.commit()

                return Response(user_serializer.data,
                                status=status.HTTP_201_CREATED)
        return Response(user_serializer.errors,
                        status=status.HTTP_400_BAD_REQUEST)
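
As a companion to the existence checks in Examples 1 through 3, a schema_exists helper in the same style could confirm that the per-user schema created here is actually in place. No such helper appears in the original examples; this sketch simply queries information_schema.schemata the same way Example 1 queries information_schema.tables.

def schema_exists(schema_name):
    """
    Checks whether the PostgreSQL schema exists.
    """
    with core_utils.PostgreSQLCursor() as (psql_conn, psql_cursor):
        psql_cursor.execute(
            sql.SQL(
                'SELECT EXISTS(SELECT 1 FROM information_schema.schemata WHERE schema_name = {schema_name})'
            ).format(schema_name=sql.Literal(str(schema_name))))
        return psql_cursor.fetchone()[0]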
Example 5
def execute_storedproc_trigger_on_refresh():
    """
    Executes the SQL file 'trigger_on_refresh.sql', which installs a PostgreSQL
    trigger that fires upon refresh of the underlying status table.

    NOTE: This method is separate from `apps.py` and `AppConfig().ready()`,
    since resolution for Django ticket #31658
    (https://code.djangoproject.com/ticket/31658) indicates SQL procedures
    should not be run as part of Django lifecycle methods.

    NOTE: To list all functions in PostgreSQL, open `psql` and run `\df+`.

    NOTE: Should use `logging.info`, but `logging` does not print to stdout
    properly in `broker.py`. This issue may have already been addressed, in
    which case switch `print()` statements to `logging.info()` statements.
    """
    logger = app_logging.get_broker_logger()

    if is_database_synchronized(DEFAULT_DB_ALIAS):
        with core_utils.PostgreSQLCursor() as (psql_conn, psql_cursor):
            stored_procedure_abspath = os.path.abspath(
                os.path.join('channels_app', 'storedprocedures',
                             'trigger_on_refresh.sql'))

            logger.info(
                f'Found stored procedure abspath: {stored_procedure_abspath}')

            logger.info('Executing SQL file.')

            # Execute SQL file: https://stackoverflow.com/a/50080000
            with open(stored_procedure_abspath) as stored_procedure_fp:
                psql_cursor.execute(stored_procedure_fp.read())
            psql_conn.commit()

            logger.info('Successfully executed SQL file.')
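
The contents of 'trigger_on_refresh.sql' are not shown in these examples. Given the LISTEN channel in Example 6 below and the jobs_eventrefreshes INSERT in Example 7, a plausible shape for that file (an assumption, not the project's actual SQL) is a trigger function that calls pg_notify() with the job and view IDs as a JSON payload whenever a refresh event row is inserted:

# Illustrative sketch only; the real 'trigger_on_refresh.sql' may differ.
TRIGGER_ON_REFRESH_SQL = """
CREATE OR REPLACE FUNCTION notify_psql_refreshes() RETURNS trigger AS $$
BEGIN
    PERFORM pg_notify(
        'psql_refreshes_channel',
        json_build_object('job_id', NEW.job_id, 'view_id', NEW.view_id)::text);
    RETURN NEW;
END;
$$ LANGUAGE plpgsql;

DROP TRIGGER IF EXISTS trigger_on_refresh ON jobs_eventrefreshes;
CREATE TRIGGER trigger_on_refresh
    AFTER INSERT ON jobs_eventrefreshes
    FOR EACH ROW EXECUTE FUNCTION notify_psql_refreshes();
"""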
Example 6
def broker_proc():
    """
    Task definition for broker process to run in compute instance background.
    There should only be one such process running: the payload is extremely
    light and refreshes are periodic, so the load on the process stays low and
    there is no reason for it to fall over.

    If liveness issues occur in production, consider installing ZMQ pub/sub on
    the database, creating per-trigger channels, and scaling out the database
    and reverse proxy as needed.
    """
    logger = app_logging.get_broker_logger()

    CHANNEL_NAME = 'psql_refreshes_channel'

    with core_utils.PostgreSQLCursor() as (psql_conn, psql_cursor):
        psql_conn.set_isolation_level(
            psycopg2.extensions.ISOLATION_LEVEL_AUTOCOMMIT)
        sql_statement = sql.SQL("LISTEN {channel}").format(
            channel=sql.Identifier(CHANNEL_NAME))

        psql_cursor.execute(sql_statement)

        while True:
            # If Linux select() syscall returns empty, then do not poll for
            # updates using 'psycopg2'.
            if select.select([psql_conn], [], [], 5) == ([], [], []):
                logger.info("No information from handler yet.")
            else:
                psql_conn.poll()
                while psql_conn.notifies:
                    # NOTE: Processing one event at a time should be fine;
                    # there should be no need for something like
                    # 'multiprocessing', since the load shouldn't warrant it.
                    # Update if production proves different.
                    #
                    # NOTE: Pop off psql_conn.notifies to avoid an infinite
                    # loop while psql_conn.notifies is non-empty. This
                    # processes multiple notifications correctly.
                    notify = psql_conn.notifies.pop(0)
                    payload = json.loads(notify.payload)
                    job_id = payload['job_id']
                    view_id = payload['view_id']

                    logger.info("Payload: ", payload)

                    view_name = view_models.MaterializedView.objects.filter(
                        id=view_id)[0].view_name

                    logger.info('Materialized view name: %s', view_name)

                    channels = models.Channel.objects.filter(job_id=job_id)

                    for channel in channels:
                        logger.info('Sending update to channel UUID: %s',
                                    channel.public_identifier)
                        django_eventstream.send_event(
                            str(channel.public_identifier), 'message', {
                                'update_available': 'true',
                                'view_name': view_name
                            })
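
For manual testing of this loop, a NOTIFY with the same payload shape the handler expects can be sent through the same cursor utilities and module imports used above. This is an illustrative fragment only; the IDs are placeholders and must reference an existing MaterializedView row and associated Channel rows for the full code path to succeed.

with core_utils.PostgreSQLCursor() as (psql_conn, psql_cursor):
    # Send a hand-crafted payload to the channel the broker LISTENs on.
    psql_cursor.execute(
        sql.SQL('NOTIFY {channel}, {payload}').format(
            channel=sql.Identifier('psql_refreshes_channel'),
            payload=sql.Literal(json.dumps({'job_id': 1, 'view_id': 1}))))
    psql_conn.commit()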
Example 7
    def post(self, request, *args, **kwargs):
        """
        Handles the HTTP POST request.

        Example:

        curl \
            --header "Content-Type: application/json" \
            --header "Authorization: JWT $JWT_ACCESS_TOKEN" \
            --request POST \
            --data '{"crontab_def": "* * * * *", "view_name": "sample_view"}' \
            https://api.tinydevcrm.com/jobs/create/
        """
        def _validate(request):
            """
            Validates request.

            Args:
                rest_framework.request.Request

            Returns:
                (bool, dict): (Request is valid, reasons)
            """
            checks = {
                'all_required_keys_are_present': True,
                'crontab_def_is_valid': True,
                'view_exists': True
            }

            if (not request.data.get('crontab_def')
                    or not request.data.get('view_name')):
                checks['all_required_keys_are_present'] = False

            # TODO: Implement more comprehensive crontab definition checking.
            # Current validation logic belongs to package 'cron-validator':
            # https://github.com/vcoder4c/cron-validator and coverage /
            # correctness are not as comprehensive as cron definition:
            # https://crontab.guru/
            crontab_def = request.data.get('crontab_def')

            try:
                CronValidator.parse(crontab_def)
            except Exception:
                checks['crontab_def_is_valid'] = False

            view_name = request.data.get('view_name')

            checks['view_exists'] = views_utils.materialized_view_exists(
                str(request.user.id), view_name)

            return (all(checks.values()), checks)

        (request_is_valid, validation_checks) = _validate(request)

        if not request_is_valid:
            return Response(
                f'Request did not pass validation. Checks: {str(validation_checks)}',
                status=status.HTTP_400_BAD_REQUEST)

        crontab_def = request.data.get('crontab_def')
        view_name = request.data.get('view_name')

        view_objects = view_models.MaterializedView.objects.filter(
            user=request.user.id, view_name=view_name)
        if view_objects.count() != 1:
            return Response(
                'Expected exactly one materialized view with the given schema name and view name; found a different count. Data corrupted.',
                status=status.HTTP_500_INTERNAL_SERVER_ERROR)
        view_id = view_objects.first().id

        with core_utils.PostgreSQLCursor(
                db_schema=request.user.id) as (psql_conn, psql_cursor):
            # NOTE: The 'INSERT' query references the EventRefreshes Django
            # model, and must be updated manually if the model is updated. See
            # 'jobs/models.py' for more information.
            #
            # TODO: Each channel needs a corresponding job ID to listen to. That
            # information must be communicated from the event refreshes table,
            # since that is what the underlying trigger listens to. The job ID
            # does not exist until the PostgreSQL function 'cron.schedule' is
            # run. Therefore, the job creation for materialized view, and
            # inserting into the event refreshes table must be split into two
            # components. This may impact the atomicity of operations (e.g. if a
            # refresh event for a materialized view doesn't work, an event may
            # still be sent over the channel), which may impact the amount of
            # work necessary in order to ensure correctness. This might be
            # prevented if job ID was mapped to a list of cron jobs for the
            # CronJob Django model.
            refresh_view_scheduled_query = sql.SQL(
                'REFRESH MATERIALIZED VIEW {view_name}').format(
                    view_name=sql.Identifier(str(request.user.id), view_name))
            refresh_view_sql_statement = sql.SQL(
                "SELECT cron.schedule({crontab_def}, '{scheduled_query}')"
            ).format(crontab_def=sql.Literal(crontab_def),
                     scheduled_query=refresh_view_scheduled_query)
            psql_cursor.execute(refresh_view_sql_statement)
            # NOTE: This commit is necessary in order to get the job ID used
            # for insertion into the notify_channel_scheduled_query.
            psql_conn.commit()

            # Create the Django CronJob model.
            job_id = psql_cursor.fetchone()[0]
            job_serializer = serializers.CronJobSerializer(
                data={
                    'job_ids': [job_id],
                    'user': request.user.id,
                    'view': view_id
                })
            if not job_serializer.is_valid():
                return Response(job_serializer.errors,
                                status=status.HTTP_500_INTERNAL_SERVER_ERROR)

            cronjob = job_serializer.save()

            # NOTE: Job ID here refers to the CronJob ID, in order to reference
            # the foreign key for the appropriate channel to send this event on.
            notify_channel_scheduled_query = sql.SQL(
                'INSERT INTO jobs_eventrefreshes(job_id, view_id, created, status) VALUES (\'{job_id}\', \'{view_id}\', NOW(), \'{status}\')'
            ).format(job_id=sql.Literal(str(cronjob.id)),
                     view_id=sql.Literal(str(cronjob.view.id)),
                     status=sql.Literal(str(models.EnumStatusTypes.NEW)))
            notify_channel_sql_query = sql.SQL(
                "SELECT cron.schedule({crontab_def}, '{scheduled_query}')"
            ).format(crontab_def=sql.Literal(crontab_def),
                     scheduled_query=notify_channel_scheduled_query)
            psql_cursor.execute(notify_channel_sql_query)
            psql_conn.commit()

            # Update existing cronjob field.
            job2_id = psql_cursor.fetchone()[0]
            cronjob.job_ids = cronjob.job_ids + [job2_id]
            cronjob.save(update_fields=['job_ids'])

        job_data = model_to_dict(cronjob)

        return Response(job_data, status=status.HTTP_201_CREATED)
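
Because each CronJob row ends up owning two pg_cron entries (one for the materialized view refresh and one for the jobs_eventrefreshes insert), any teardown path has to unschedule both. No such helper appears in these examples; the sketch below is an assumption that the CronJob model exposes user and job_ids as used above, and it relies on pg_cron's cron.unschedule().

def unschedule_cron_job(cronjob):
    """Removes every pg_cron job tracked by a CronJob row (illustrative)."""
    with core_utils.PostgreSQLCursor(
            db_schema=cronjob.user.id) as (psql_conn, psql_cursor):
        for pg_job_id in cronjob.job_ids:
            # cron.unschedule() removes the scheduled job by its pg_cron ID.
            psql_cursor.execute(
                sql.SQL('SELECT cron.unschedule({job_id})').format(
                    job_id=sql.Literal(pg_job_id)))
        psql_conn.commit()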
Example 8
    def post(self, request, *args, **kwargs):
        """
        Handles the HTTP POST request.

        TODO: Parse the SQL statement and automatically insert the user ID as
        PostgreSQL schema into the raw SQL statement so that the user doesn't
        need to specify the user ID via the API.

        Example:

        - curl \
            --header "Content-Type: application/json" \
            --header "Authorization: JWT $JWT_ACCESS_TOKEN" \
            --request POST \
            --data '{"view_name": "\"1\".\"sample_view\"", "sql_query": "SELECT * FROM \"1\".\"sample_table\""}' \
            https://api.tinydevcrm.com/views/create/
        """
        def _validate(request):
            """
            Validates request.

            Args:
                rest_framework.request.Request

            Returns:
                (bool, dict): (Request is valid, reasons)
            """
            checks = {
                'all_required_keys_are_present': True,
                'query_starts_with_select_tables_or_values': True,
                'query_does_not_contain_semicolons': True,
                'view_does_not_exist': True
            }

            if (
                not request.data.get('view_name') or
                not request.data.get('sql_query')
            ):
                checks['all_required_keys_are_present'] = False
                # Bail out early: the checks below assume both keys are present.
                return (False, checks)

            sql_query = request.data.get('sql_query')

            # 'sql_query' matches with internal SQL query:
            # https://www.postgresql.org/docs/12/sql-creatematerializedview.html
            if (
                not sql_query.startswith('SELECT') and
                not sql_query.startswith('TABLE') and
                not sql_query.startswith('VALUES')
            ):
                checks['query_starts_with_select_tables_or_values'] = False

            # Prevent some SQL injection attacks by ensuring the SQL query does
            # not contain a semicolon.

            if ';' in sql_query:
                checks['query_does_not_contain_semicolons'] = False

            checks['view_does_not_exist'] = not views_utils.materialized_view_exists(
                str(request.user.id),
                request.data.get('view_name')
            )

            return (all(checks.values()), checks)

        (is_valid_request, validation_checks) = _validate(request)

        if not is_valid_request:
            return Response(
                f'Request did not pass validation. Checks: {str(validation_checks)}',
                status=status.HTTP_400_BAD_REQUEST
            )

        view_name = request.data.get('view_name')
        sql_query_request = request.data.get('sql_query')

        with core_utils.PostgreSQLCursor(db_schema=request.user.id) as (psql_conn, psql_cursor):
            sql_statement = sql.SQL(
                'CREATE MATERIALIZED VIEW {view_name} AS %s WITH DATA'
            ).format(
                view_name=sql.Identifier(view_name)
            )
            sql_statement = sql_statement.as_string(psql_conn)
            sql_statement = sql_statement % sql_query_request

            psql_cursor.execute(
                sql.SQL(sql_statement)
            )
            psql_conn.commit()

            view_serializer = serializers.MaterializedViewSerializer(
                data={
                    'view_name': view_name,
                    'user': request.user.id
                }
            )
            if view_serializer.is_valid():
                view_serializer.save()

        return Response(
            view_serializer.data,
            status=status.HTTP_201_CREATED
        )
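
The as_string()/% substitution above splices the validated query text into the statement after the view identifier has been quoted. An equivalent formulation that stays entirely within psycopg2's sql composition is sketched below as a drop-in for those lines (same indentation, same local names, same trust placed in sql_query_request after the _validate() checks); it is an alternative illustration, not the project's stated approach.

            create_view_statement = sql.SQL(
                'CREATE MATERIALIZED VIEW {view_name} AS {query} WITH DATA'
            ).format(
                view_name=sql.Identifier(view_name),
                # sql.SQL() performs no escaping; the query text is trusted
                # only to the extent the _validate() checks above allow.
                query=sql.SQL(sql_query_request)
            )
            psql_cursor.execute(create_view_statement)
            psql_conn.commit()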
Example 9
    def post(self, request, *args, **kwargs):
        """
        Handles the HTTP POST request.

        These tables are generated from data dumps backed by "concrete data":
        Parquet files available via the PostgreSQL extension 'parquet_fdw'
        (foreign data wrapper) that serve as the foundational sources of truth
        for users.
        This is opposed to "derived data", which is data computed via
        mathematical, logical / relational, or other types of transformations.
        For example, a materialized view would be considered "derived data",
        while a CSV upload would be considered "concrete data".

        Making this distinction guarantees unitary application data flow by
        making sure each layer is immutable (writes are prohibited for foreign
        tables and only occur during HTTP POST), and that consequently,
        underlying data pipelines are acyclic and easier to manage. Since there
        is only one version of the data, versioning and backup of foreign tables
        is fairly trivial.

        TODO: A prior version of this method used to create the foreign table
        as-is. While this would work for creating materialized views and
        triggers, the inability of writes to the underlying data, as well as the
        difficulties managing the persist layer precludes this method as too
        inflexible for OLTP workloads. If foreign tables are desired, create a
        separate endpoint for supporting only foreign tables.

        TODO: Take the complete possible PostgreSQL 'CREATE TABLE' syntax and
        translate that through a form to get the full functionality of 'CREATE
        TABLE' commands without suffering extraneous security issues of doing
        so.

        TODO: In addition, each PostgreSQL table is nested under a user schema
        defined by the custom user, in order to deconflict data resources
        underneath the hood, and to help facilitate schema-based multitenancy.
        Create a PostgreSQL user during the 'CREATE SCHEMA IF NOT EXISTS' in
        order to apply an authorization for the schema to that user, so that
        `psql -h db.tinydevcrm.com -U $CUSTOM_USERNAME` can work properly.

        Example usage:

        - curl \
            --header "Content-Type: multipart/form-data" \
            --header "Authorization: JWT $JWT_ACCESS_TOKEN" \
            --request POST \
            -F file=@$FILENAME \
            -F table_name=sample_table \
            -F columns='[{"column_name": "SomeNumber", "column_type":"int"},{"column_name":"SomeString","column_type":"varchar(256)"}]' \
            https://api.tinydevcrm.com/tables/create/

        NOTE: These keys, such as 'data' and 'file', are very particular to the
        underlying models and serializers. Do not change without testing in
        development.
        """
        def _validate(request):
            """
            Validates request data.

            Args:
                rest_framework.request.Request

            Returns:
                (bool, dict): (Request is valid, reasons)
            """
            # TODO: Add check for 'column_types_are_valid'.
            checks = {
                'all_required_keys_are_present': True,
                'column_schema_is_valid': True,
                'table_does_not_exist': True
            }

            if (
                not request.data.get('file') or
                not request.data.get('table_name') or
                not request.data.get('columns')
            ):
                checks['all_required_keys_are_present'] = False

            columns = request.data.get('columns')
            try:
                column_data = json.loads(columns)
                assert type(column_data) is list
                for item in column_data:
                    assert type(item) is dict
                    assert sorted(item.keys()) == ['column_name', 'column_type']
                    # TODO: Add check for column types and column names
            except Exception:  # Covers json errors and the assertions above.
                checks['column_schema_is_valid'] = False

            checks['table_does_not_exist'] = not table_utils.table_exists(
                str(request.user.id),
                request.data.get('table_name')
            )

            return (
                all(checks.values()),
                checks
            )

        (is_valid, validation_checks) = _validate(request)
        if not is_valid:
            return Response(
                f'Request is not valid: {str(validation_checks)}',
                status=status.HTTP_400_BAD_REQUEST
            )

        file_serializer = serializers.DataFileSerializer(
            # Use the form key 'file=@$FILENAME' in order to send binary files
            # as part of a multipart/form-data request.
            data={
                'file': request.data['file']
            }
        )

        if not file_serializer.is_valid():
            return Response(
                file_serializer.errors,
                status=status.HTTP_400_BAD_REQUEST
            )

        datafile = file_serializer.save()

        table_name = request.data.get('table_name')

        file_abspath = os.path.join(
            settings.MEDIA_ROOT,
            datafile.file.name
        )

        temp_table_name = f'temp_{str(request.user.id)}_created_{int(datetime.datetime.now().timestamp())}'

        copy_table_sql_query = sql.SQL(
            'CREATE TABLE {table_name} AS TABLE {temp_table_name} WITH DATA'
        ).format(
            table_name=sql.Identifier(table_name),
            temp_table_name=sql.Identifier(temp_table_name)
        )

        drop_temp_table_sql_query = sql.SQL(
            'DROP FOREIGN TABLE {temp_table_name}'
        ).format(
            temp_table_name=sql.Identifier(temp_table_name)
        )

        # Add error handling logic within this with block if numerous HTTP 500
        # errors appear in the logs.
        with core_utils.PostgreSQLCursor(db_schema=request.user.id) as (psql_conn, psql_cursor):
            # Dynamic column creation makes table creation query much more
            # tricky.
            columns = json.loads(request.data.get('columns'))
            column_names = [
                column_def['column_name']
                for column_def
                in columns
            ]
            column_types = [
                column_def['column_type'].upper()
                for column_def
                in columns
            ]
            column_query = sql.SQL(',').join([
                sql.SQL('{} {}').format(
                    sql.Identifier(column_name),
                    sql.Placeholder()
                )
                for column_name
                in column_names
            ])

            create_foreign_table_sql_query = sql.SQL(
                'CREATE FOREIGN TABLE {temp_table_name} ({columns}) SERVER parquet_srv OPTIONS (filename {file_abspath});'
            ).format(
                temp_table_name=sql.Identifier(temp_table_name),
                columns=column_query,
                file_abspath=sql.Literal(file_abspath)
            )
            create_foreign_table_sql_query = create_foreign_table_sql_query.as_string(psql_conn)

            # TODO: I am concerned this may be a little insecure, since I am not
            # referencing the DB API when templating this string. This has to be
            # done because psycopg2.sql wraps elements within singly or doubly
            # quoted strings, but column types are not strings. I think this
            # should be fine because I can validate recognized column types as
            # enumerations. That hasn't been done yet.
            create_foreign_table_sql_query = create_foreign_table_sql_query % tuple(column_types)

            psql_cursor.execute(create_foreign_table_sql_query)

            psql_cursor.execute(copy_table_sql_query)
            psql_cursor.execute(drop_temp_table_sql_query)
            psql_conn.commit()

            table_serializer = serializers.TableSerializer(
                data={
                    'table_name': table_name,
                    'user': request.user.id
                }
            )
            if table_serializer.is_valid():
                table_serializer.save()

            datafile.delete()
            # Deleting the data model does not delete the file. Do that
            # separately.
            os.remove(os.path.abspath(os.path.join(
                settings.MEDIA_ROOT,
                str(datafile.file)
            )))

        return Response(
            table_serializer.data,
            status=status.HTTP_201_CREATED
        )
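
The column-type concern flagged in the TODO above could be addressed by validating each 'column_type' against an explicit whitelist before it is ever interpolated into the CREATE FOREIGN TABLE statement. A minimal sketch follows, assuming only a handful of types need to be supported; the list below is illustrative, not exhaustive.

import re

# Accepted column types; extend deliberately rather than loosening the pattern.
ALLOWED_COLUMN_TYPE_PATTERN = re.compile(
    r'^(INT|INTEGER|SMALLINT|BIGINT|REAL|DOUBLE PRECISION'
    r'|NUMERIC(\(\d+(,\s*\d+)?\))?|TEXT|VARCHAR\(\d+\)'
    r'|BOOLEAN|DATE|TIMESTAMP|TIMESTAMPTZ)$',
    re.IGNORECASE)


def column_types_are_valid(column_data):
    """Returns True if every 'column_type' matches the whitelist above."""
    return all(
        ALLOWED_COLUMN_TYPE_PATTERN.match(item['column_type'].strip())
        for item in column_data)

With a check like this wired into _validate(), a 'column_types_are_valid' entry could be added to the checks dict and flipped to False before any SQL is built.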