Esempio n. 1
0
    def test_load_invalid_manifest(self):
        """Test that invalid manifest files fail validation with the expected errors.

        Each entry pairs a fixture file in the invalid_manifest directory with a
        regex that must match the ValidationError message it produces.
        """

        invalid_dir = os_path.join(_TEST_DIR, 'invalid_manifest')

        # NOTE: 'msg' values are regexes (used with assertRaisesRegex), so any
        # pattern containing backslash escapes must be a raw string.
        error_list = [
            {
                # no file list provided
                'file': 'no_file_list',
                'msg': "'file_list' is a required property",
            },
            {
                # a cluster file entry should have a prefix
                'file': 'cluster_no_prefix',
                'msg': r"{'data_type': 'cluster', 'path': 'I2_named.tsv'} is not valid under any of the given schemas",
            },
            {
                # each file_list entry has to have a path
                'file': 'missing_path',
                'msg': "'path' is a required property",
            },
            {
                # if the date is not quoted, pyyaml will turn it into a date object. Doh!
                # raw string: the parens are regex-escaped literals
                'file': 'date_not_in_quotes',
                'msg': r"datetime.date\(2020, 12, 25\) is not of type 'string'",
            },
            {
                # file format is invalid
                # raw string: the brackets are regex-escaped literals
                'file': 'invalid_format',
                'msg': r"'txt' is not one of \['tsv', 'csv'\]"
            },
            {
                # there must be an indicator of file format
                'file': 'no_file_format',
                'msg': r"{'data_type': 'edge', 'date': '2020-12-25', 'path': 'edge_data'}"
                        + " is not valid under any of the given schemas",
            },
        ]

        for entry in error_list:
            data_file = os_path.join(invalid_dir, entry['file'] + '.yaml')
            print('looking at ' + data_file)

            with self.assertRaisesRegex(ValidationError, entry['msg']):
                run_validator(
                    schema_file=schema_file,
                    data_file=data_file,
                    nicer_errors=True
                )
Esempio n. 2
0
    def _get_manifest(self, configuration):
        """
        Read the manifest file, which contains path and file type info, and validate it.
        The manifest is expected to be at ROOT_DATA_PATH/manifest.yaml

        :param configuration: mapping containing a 'ROOT_DATA_PATH' entry
        :return: the validated manifest data structure
        :raises RuntimeError: if the manifest file is missing or fails validation
        """

        schema_file = self._get_manifest_schema_file()

        # load the manifest and validate it against the schema
        manifest_file = os.path.join(configuration['ROOT_DATA_PATH'],
                                     'manifest.yaml')

        try:
            with open(manifest_file) as fd:
                manifest = yaml.safe_load(fd)
        except FileNotFoundError as err:
            # chain the cause so the original error is visible in the traceback
            raise RuntimeError(
                f"No manifest file found at {manifest_file}.\n"
                "Please ensure that you have created a manifest that lists the files "
                "in the release") from err

        try:
            validated_manifest = run_validator(schema_file=schema_file,
                                               data=manifest)
        except Exception as err:
            print(err)
            # chain the cause so the validation details are not lost
            raise RuntimeError(
                "The manifest file failed validation. Please recheck the file and try again."
            ) from err

        return validated_manifest
Esempio n. 3
0
    def test_date_format_validation(self,
                                    schema_arg=None,
                                    schema_file_arg=None):
        '''ensure that fancy date formats are correctly validated'''

        # skip if the test is not being called from test_json_validation
        if schema_arg is None and schema_file_arg is None:
            self.assertTrue(True)
            return

        tests = [{
            'input': {
                'date': '20200606'
            },
            'file': 'invalid_date',
            'err_str': "'20200606' is not a 'date'",
        }, {
            'input': {
                'date': 20200606
            },
            'file': 'invalid_date_type',
            'err_str': "20200606 is not of type 'string'"
        }, {
            'input': {
                "name": "valid_date",
                "date": "2020-06-06",
                "distance": 3
            },
            'file': 'valid_date',
            'output': {
                **schema_defaults,
                "name": "valid_date",
                "date": "2020-06-06",
                "distance": 3,
            }
        }]

        self.execute_tests(schema_arg, schema_file_arg, tests)

        # pyyaml-specific issue: dates get automatically parsed into datetime objects (doh!)
        file_path = os_path.join(json_validation_dir, 'unquoted_date.yaml')
        # raw string: the parens are regex-escaped literals for assertRaisesRegex
        err_str = r"datetime.date\(2020, 6, 6\) is not of type 'string'"
        with self.assertRaisesRegex(ValidationError, err_str):
            run_validator(schema=schema_arg,
                          schema_file=schema_file_arg,
                          data_file=file_path,
                          validate_at=valid_json_loc)
Esempio n. 4
0
def validate_data_source(path):
    """Validate a data source file against the data_source JSON schema."""
    print(f'  validating {path}..')

    # JSON schema for data source files in /data_sources
    schema_path = _VALID_SCHEMA_TYPES['data_source']['file']
    validated = run_validator(data_file=path, schema_file=schema_path)
    namecheck_schema(path, validated)

    print(f'✓ {path} is valid.')
    return validated
Esempio n. 5
0
def validate_view(path):
    """Validate the structure and syntax of an arangodb view"""
    print(f'  validating {path}..')

    # JSON schema for /views
    schema_path = _VALID_SCHEMA_TYPES['view']['file']
    validated = run_validator(schema_file=schema_path, data_file=path)
    namecheck_schema(path, validated)

    print(f'✓ {path} is valid.')
    return validated
Esempio n. 6
0
def validate_stored_query(path):
    """Validate a stored query file: schema, naming, params schema, and AQL syntax."""
    print(f'  validating {path}..')

    schema_path = _VALID_SCHEMA_TYPES['stored_query']['file']
    data = run_validator(data_file=path, schema_file=schema_path)
    namecheck_schema(path, data)

    # Make sure `params` can be used as a JSON schema
    params = data.get('params')
    if params:
        # If the schema is invalid, a SchemaError will get raised
        # Otherwise, the schema will work and a ValidationError will get raised
        try:
            run_validator(schema=params, data={})
        except ValidationError:
            pass

    # check that the query is valid AQL
    validate_aql_on_arango(data)

    print(f'✓ {path} is valid.')
    return data
Esempio n. 7
0
    def test_complex_schema_references(self):
        """test validation with complex references that reference other references"""

        def build_payload(edge):
            # Each call returns a fresh structure so the two payloads share nothing.
            return {
                'node': {
                    'id': 'TAIR:19830',
                    'type': 'gene',
                },
                'edge': edge,
                'marks_out_of_ten': 5,
            }

        valid_data = build_payload(valid_edge_data)
        invalid_data = build_payload(invalid_edge_data)

        err_msg = "'whatever' is not valid under any of the given schemas"
        for file_ext in ['json', 'yaml']:
            with self.subTest(file_ext=file_ext):
                schema_path = os_path.join(
                    *(test_data_dirs + ['schema_refs', 'level_1']),
                    'test_object.' + file_ext)

                # data fails validation
                with self.assertRaisesRegex(ValidationError, err_msg):
                    run_validator(schema_file=schema_path, data=invalid_data)

                # valid data passes through unchanged
                result = run_validator(schema_file=schema_path, data=valid_data)
                self.assertEqual(result, valid_data)
Esempio n. 8
0
def validate_collection(path):
    """Validate a vertex/edge collection schema file and its required attributes."""
    print(f'  validating {path}..')

    # JSON schema for vertex and edge collection schemas found in /schema
    schema_path = _VALID_SCHEMA_TYPES['collection']['file']
    data = run_validator(schema_file=schema_path, data_file=path)
    namecheck_schema(path, data)

    # Make sure it can be used as a JSON schema
    # If the schema is invalid, a SchemaError will get raised
    # Otherwise, the schema will work and a ValidationError will get raised (what we want)
    try:
        run_validator(schema=data['schema'], data={})
    except ValidationError:
        pass
    except Exception as err:
        print('=' * 80)
        print('Unable to load schema in ' + path)
        raise err

    required = data['schema'].get('required', [])
    is_delta = bool(data.get('delta'))

    # Edges must require _from and _to while vertices must require _key;
    # time-travel (delta) collections use the unprefixed field names.
    if data['type'] == 'edge':
        if is_delta:
            if not ('from' in required and 'to' in required):
                raise ValidationError('Time-travel edge schemas must require "from" and "to" attributes in ' + path)
        elif not ('_from' in required and '_to' in required):
            raise ValidationError('Edge schemas must require "_from" and "_to" attributes in ' + path)
    elif data['type'] == 'vertex':
        if is_delta:
            if 'id' not in required:
                raise ValidationError('Time-travel vertex schemas must require the "id" attribute in ' + path)
        elif '_key' not in required:
            raise ValidationError('Vertex schemas must require the "_key" attribute in ' + path)

    print(f'✓ {path} is valid.')
    return data
Esempio n. 9
0
    def execute_tests(self,
                      schema_arg,
                      schema_file_arg,
                      tests,
                      file_types=(None, 'json', 'yaml')):
        """Run a batch of validation tests, each against several input sources.

        :param schema_arg: schema data structure to validate against (or None)
        :param schema_file_arg: path to a schema file to validate against (or None)
        :param tests: list of dicts with keys 'input', 'file', and either
            'err_str' (expected ValidationError regex) or 'output' (expected result)
        :param file_types: file extensions to exercise; None means pass the
            data structure directly instead of a file.
            (Fixed: was a mutable list default; now an immutable tuple.)
        """

        for t in tests:
            for file_ext in file_types:
                # Supply exactly one of data / data_file per run
                if file_ext is None:
                    data = t['input']
                    data_file = None
                else:
                    data = None
                    data_file = os_path.join(json_validation_dir,
                                             f"{t['file']}.{file_ext}")

                with self.subTest(input=t['input'], file_type=file_ext):
                    if 'err_str' in t:
                        with self.assertRaisesRegex(ValidationError,
                                                    t['err_str']):
                            run_validator(schema=schema_arg,
                                          schema_file=schema_file_arg,
                                          data=data,
                                          data_file=data_file,
                                          validate_at=valid_json_loc)

                    else:
                        output = run_validator(schema=schema_arg,
                                               schema_file=schema_file_arg,
                                               data=data,
                                               data_file=data_file,
                                               validate_at=valid_json_loc)
                        self.assertEqual(output, {
                            **schema_defaults,
                            **t['output']
                        })
Esempio n. 10
0
    def test_load_valid_manifests(self):
        """Ensure known-good manifest fixture files all pass validation."""

        valid_dir = os_path.join(_TEST_DIR, 'valid_manifest')

        for name in ('with_descriptions', 'no_file_ext', 'no_file_format'):
            data_file = os_path.join(valid_dir, name + '.yaml')
            print('looking at ' + data_file)

            result = run_validator(
                schema_file=schema_file,
                data_file=data_file,
                nicer_errors=True
            )
            self.assertTrue(result)
Esempio n. 11
0
    def test_schema_references(self):
        """Ensure referenced schemas, including those written in yaml, can be accessed."""

        # same schema in different places
        nested_dirs = [[], ['level_1'], ['level_1', 'level_2']]
        err_msg = "'whatever' is not valid under any of the given schemas"

        for subdirs in nested_dirs:
            for ext in ('json', 'yaml'):
                with self.subTest(file_ext=ext):
                    schema_path = os_path.join(
                        *(test_data_dirs + ['schema_refs'] + subdirs),
                        f'edge.{ext}')

                    # fails due to invalid data
                    with self.assertRaisesRegex(ValidationError, err_msg):
                        run_validator(schema_file=schema_path,
                                      data=invalid_edge_data)

                    # valid data
                    result = run_validator(schema_file=schema_path,
                                           data=valid_edge_data)
                    self.assertEqual(result, valid_edge_data)

                    # validate using the schema instead of the schema_file
                    with open(schema_path) as fd:
                        if ext == 'yaml':
                            contents = yaml.safe_load(fd)
                        else:
                            contents = json.load(fd)

                    # if there is no $id in the schema, the ref resolver won't know
                    # where the schema file is located and will not resolve relative references
                    with self.assertRaisesRegex(RefResolutionError,
                                                'No such file or directory'):
                        run_validator(schema=contents, data=valid_edge_data)

                    # inject an $id with the current file path
                    contents['$id'] = schema_path
                    self.assertEqual(
                        run_validator(schema=contents, data=valid_edge_data),
                        valid_edge_data)
Esempio n. 12
0
def run_query():
    """
    Run a stored query as a query against the database.
    Auth:
     - only kbase re admins for ad-hoc queries
     - public stored queries (these have access controls within them based on params)
    """
    json_body = parse_json.get_json_body() or {}
    # fetch number of documents to return
    batch_size = int(flask.request.args.get('batch_size', 10000))
    # NOTE(review): this is the raw query-string value, so any non-empty string
    # (including 'false') will be truthy downstream -- confirm this is intended
    full_count = flask.request.args.get('full_count', False)

    if 'query' in json_body:
        # Run an adhoc query for a sysadmin
        return _run_adhoc_query(json_body, batch_size, full_count)

    if 'stored_query' in flask.request.args or 'view' in flask.request.args:
        # Run a query from a query name
        return _run_stored_query_by_name(json_body, batch_size, full_count)

    if 'cursor_id' in flask.request.args:
        # Continue fetching results from an existing cursor
        cursor_id = flask.request.args['cursor_id']
        resp_body = arango_client.run_query(cursor_id=cursor_id)
        return flask.jsonify(resp_body)

    # No valid options were passed
    raise InvalidParameters('Pass in a query name or a cursor_id')


def _bind_workspace_ids(query_text, json_body):
    """If the query references 'ws_ids', bind the caller's authorized workspace IDs."""
    if 'ws_ids' in query_text:
        # Fetch any authorized workspace IDs using a KBase auth token, if present
        auth_token = auth.get_auth_header()
        json_body['ws_ids'] = auth.get_workspace_ids(auth_token)


def _run_adhoc_query(json_body, batch_size, full_count):
    """Run an ad-hoc AQL query supplied in the request body; RE_ADMIN only."""
    auth.require_auth_token(roles=['RE_ADMIN'])
    query_text = _preprocess_stored_query(json_body['query'], json_body)
    # 'query' must not remain in the bind vars passed to arango
    del json_body['query']
    _bind_workspace_ids(query_text, json_body)

    resp_body = arango_client.run_query(query_text=query_text,
                                        bind_vars=json_body,
                                        batch_size=batch_size,
                                        full_count=full_count)
    return flask.jsonify(resp_body)


def _run_stored_query_by_name(json_body, batch_size, full_count):
    """Run a named stored query, validating user params against its schema."""
    # Note: we are maintaining backwards compatibility here with the "view" arg.
    # "stored_query" is the more accurate name
    query_name = flask.request.args.get(
        'stored_query') or flask.request.args.get('view')
    stored_query = spec_loader.get_stored_query(query_name)

    if 'params' in stored_query:
        # Validate the user params for the query
        stored_query_path = spec_loader.get_stored_query(query_name,
                                                         path_only=True)
        run_validator(schema_file=stored_query_path,
                      data=json_body,
                      validate_at='/params')

    stored_query_source = _preprocess_stored_query(stored_query['query'],
                                                   stored_query)
    _bind_workspace_ids(stored_query_source, json_body)

    resp_body = arango_client.run_query(query_text=stored_query_source,
                                        bind_vars=json_body,
                                        batch_size=batch_size,
                                        full_count=full_count)
    return flask.jsonify(resp_body)
Esempio n. 13
0
    def test_array_validation(self, schema_arg=None, schema_file_arg=None):
        """
        check array validation and default population works correctly when refs are used

        The current implementation of the population of defaults does not allow defaults to be
        populated if the property is a reference, i.e.

        'properties': {
            'fruits': {
                '$ref': '...'
            }
        }

        """

        # skip if the test is not being called from test_json_validation
        if schema_arg is None and schema_file_arg is None:
            self.assertTrue(True)
            return

        # test the use of refs when populating defaults
        tests = [
            {
                'fruits': fruit_ref,
                'name': 'using fruit.yaml -- array item is a ref',
                'output': {
                    'params': {
                        'name': 'name',
                        'distance': 1,
                        'fruits': []
                    }
                }
            },
            {
                # N.b. the default does not get populated in this case!
                # This is a change from the expected functionality
                'fruits': fruits_array_ref,
                'name': 'using fruits_array.yaml -- the array is a ref',
                'output': {
                    'params': {
                        'name': 'name',
                        'distance': 1,
                    }
                }
            },
            {
                'fruits': fruits_explicit,
                'name': 'with no references',
                'output': {
                    'params': {
                        'name': 'name',
                        'distance': 1,
                        'fruits': []
                    }
                }
            }
        ]

        # Fix: the module-level test_schema is mutated per test case; restore it
        # in a finally block so a failing assertion cannot leak the mutation
        # into unrelated tests.
        try:
            for t in tests:
                with self.subTest(desc=t['name']):
                    test_schema['properties']['params']['properties'][
                        'fruits'] = t['fruits']
                    output = run_validator(schema=test_schema,
                                           data={'params': {
                                               'name': 'name'
                                           }})
                    self.assertEqual(output, t['output'])
        finally:
            # restore the original value
            test_schema['properties']['params']['properties'][
                'fruits'] = fruits_explicit
Esempio n. 14
0
    def test_non_validation_validator_errors(self):
        '''test errors in the validator that are unrelated to the validation functionality'''

        schema_err = "Please supply either a schema or a schema file path"
        # no schema at all
        with self.assertRaisesRegex(ValueError, schema_err):
            run_validator()

        with self.assertRaisesRegex(ValueError, schema_err):
            run_validator(data={})

        # only supply one of schema or schema_file
        with self.assertRaisesRegex(ValueError, schema_err):
            run_validator(schema={}, schema_file='/path/to/file')

        data_err = "Please supply either a data structure or a data file path"
        with self.assertRaisesRegex(ValueError, data_err):
            run_validator(schema={})

        with self.assertRaisesRegex(ValueError, data_err):
            run_validator(schema={}, data={}, data_file='')

        with self.assertRaisesRegex(ValueError, data_err):
            run_validator(schema={}, data=None, data_file=None)

        # invalid file type
        test_file = os_path.join(*(test_data_dirs + ['test_file.md']))
        with self.assertRaisesRegex(TypeError,
                                    f'Unknown file type encountered: {test_file}'):
            run_validator(schema_file=test_file, data={})

        # invalid jsonpointer string - note the grammar error is from jsonpointer
        with self.assertRaisesRegex(JsonPointerException,
                                    'location must starts with /'):
            run_validator(schema=test_schema, data={},
                          validate_at='start validating here')

        # invalid jsonpointer ref
        with self.assertRaisesRegex(JsonPointerException,
                                    "member 'property' not found in"):
            run_validator(schema=test_schema, data={},
                          validate_at='/properties/params/property')

        # finally!!
        result = run_validator(schema=test_schema,
                               data={'name': 'name', 'distance': 3},
                               validate_at=valid_json_loc)
        self.assertEqual(result, {
            **schema_defaults,
            'name': 'name',
            'distance': 3
        })