def __init__(self, schemaformat):
     self.schema = schemaformat
     checker = FormatChecker()
     self.validator = Draft4Validator(self.schema, format_checker=checker)
Example #2
def uri_blank(value):
    return value == "" or FormatChecker().conforms(value, "uri")
Example #3
 def test_it_can_validate_no_formats(self):
     checker = FormatChecker(formats=())
     self.assertFalse(checker.checkers)
Example #4
def persist_lines(config, lines):
    state = None
    schemas = {}
    key_properties = {}
    headers = {}
    validators = {}
    csv_files_to_load = {}
    row_count = {}
    stream_to_sync = {}
    primary_key_exists = {}
    batch_size = config.get('batch_size', 100000)

    now = datetime.now().strftime('%Y%m%dT%H%M%S')

    # Loop over lines from stdin
    for line in lines:
        try:
            o = json.loads(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if 'type' not in o:
            raise Exception(
                "Line is missing required key 'type': {}".format(line))
        t = o['type']

        if t == 'RECORD':
            if 'stream' not in o:
                raise Exception(
                    "Line is missing required key 'stream': {}".format(line))
            if o['stream'] not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema"
                    .format(o['stream']))

            # Get schema for this record's stream
            stream = o['stream']

            # Validate record
            validators[stream].validate(float_to_decimal(o['record']))

            sync = stream_to_sync[stream]

            primary_key_string = sync.record_primary_key_string(o['record'])
            if stream not in primary_key_exists:
                primary_key_exists[stream] = {}
            if primary_key_string and primary_key_string in primary_key_exists[
                    stream]:
                flush_records(o, csv_files_to_load, row_count,
                              primary_key_exists, sync)

            csv_line = sync.record_to_csv_line(o['record'])
            csv_files_to_load[o['stream']].write(
                bytes(csv_line + '\n', 'UTF-8'))
            row_count[o['stream']] += 1
            if primary_key_string:
                primary_key_exists[stream][primary_key_string] = True

            if row_count[o['stream']] >= batch_size:
                flush_records(o, csv_files_to_load, row_count,
                              primary_key_exists, sync)

            state = None
        elif t == 'STATE':
            logger.debug('Setting state to {}'.format(o['value']))
            state = o['value']
        elif t == 'SCHEMA':
            if 'stream' not in o:
                raise Exception(
                    "Line is missing required key 'stream': {}".format(line))
            stream = o['stream']
            schema = float_to_decimal(o['schema'])
            if 'properties' not in schema:
                logger.debug(
                    f"Schema for stream '{stream}' misses properties. Waiting for properly-formed schema."
                )
                continue
            schemas[stream] = o
            walk_schema_for_numeric_precision(schema)
            validators[stream] = Draft4Validator(
                schema, format_checker=FormatChecker())
            if 'key_properties' not in o:
                raise Exception("key_properties field is required")
            key_properties[stream] = o['key_properties']
            stream_to_sync[stream] = DbSync(config, o)
            stream_to_sync[stream].create_schema_if_not_exists()
            stream_to_sync[stream].sync_table()
            row_count[stream] = 0
            csv_files_to_load[stream] = NamedTemporaryFile(mode='w+b')
        elif t == 'ACTIVATE_VERSION':
            logger.debug('ACTIVATE_VERSION message')
        else:
            raise Exception("Unknown message type {} in message {}".format(
                o['type'], o))

    for (stream_name, count) in row_count.items():
        if count > 0:
            stream_to_sync[stream_name].load_csv(
                csv_files_to_load[stream_name], count)

    return state
Example #5
 def _validate_schema(self, items, json_name):
     schema = self.json_schemas[json_name]
     # May raise jsonschema.ValidationError
     validate(items, schema, format_checker=FormatChecker())
Example #6
from flask_restplus import Api
from jsonschema import FormatChecker

import logging

log = logging.getLogger(__name__)

#  Instantiate a Flask-RESTPlus API
api = Api(version='1.0',
          title='iter8 analytics REST API',
          description='API to perform analytics to support canary releases '
          'and A/B tests',
          format_checker=FormatChecker(formats=("date-time", )))


def build_http_error(msg, http_code):
    '''Returns a specific error message and HTTP code that can be used by
    the REST API'''
    return {'message': msg}, http_code
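
# Usage sketch (an illustrative addition, not part of the original module):
# the (payload, status) tuple can be returned directly from a resource method.
_example_error = build_http_error('Invalid payload', 400)
assert _example_error == ({'message': 'Invalid payload'}, 400)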


@api.errorhandler
def default_error_handler(e):
    '''Error handler for uncaught exceptions'''
    message = 'An unexpected error occurred'
    log.exception(message)
    return {'message': message}, 500
Example #7
 def schema(self, value):
     self._schema = Draft7Validator(value, format_checker=FormatChecker())
Example #8
 def test_it_can_register_checkers(self):
     checker = FormatChecker()
     checker.checks("new")(self.fn)
     self.assertEqual(checker.checkers,
                      dict(FormatChecker.checkers, new=(self.fn, ())))
Example #9
    def test_format_checker_object_on_constructor(self, app, client):
        from jsonschema import FormatChecker
        self._setup_api_format_checker_tests(app, format_checker=FormatChecker())

        out = client.post_json('/format_checker/', {'ip': '192.168.1'}, status=400)
        assert 'ipv4' in out['errors']['ip']
Example #10
 def _validator(self):
     Draft4Validator.check_schema(self.request)
     return Draft4Validator(self.request, format_checker=FormatChecker())
Example #11
 def _update_validator(self):
     Draft4Validator.check_schema(self.update)
     return Draft4Validator(self.update, format_checker=FormatChecker())
Example #12
def tracker_sync():
    request_data = request.get_json(silent=True)
    requester_ip = request.remote_addr

    if request_data is None:
        return jsonify({
            "error": "Request is not JSON",
            "success": False,
        })

    if not models.tracker_ip_exists(requester_ip):
        # Return an error if the tracker is not in the tracker list
        return jsonify({
            "success": False,
            "dead_tracker": True,
            "error": "Tracker not in tracker list",
        })

    try:
        validate(request_data,
                 schemas.TRACKER_SYNC_SCHEMA,
                 format_checker=FormatChecker())
    except ValidationError as e:
        return jsonify({
            "error": str(e),
            "success": False,
        })

    event = request_data["event"]
    event_ip = request_data["event_ip"]
    event_data = request_data["data"]

    # By default, don't rebroadcast and respond with success
    rebroadcast = False
    sync_response = {"success": True}

    try:
        if event == "new_tracker":
            # If the tracker doesn't exist, rebroadcast and add it
            if not models.tracker_ip_exists(event_ip):
                tracker = models.add_tracker(event_ip)

                # Can't just set rebroadcast here since we need to broadcast before adding the tracker
                broadcaster.new_event(event, event_ip, event_data)
                broadcaster.new_tracker(tracker)

        elif event == "add_file":
            peer_guid = event_data["guid"]
            models.ensure_peer_exists(event_ip, peer_guid,
                                      event_data["seq_number"])

            # If the sequence number is new, apply and rebroadcast
            if event_data["seq_number"] >= models.peer_expected_seq(peer_guid):
                models.add_file(event_data, event_ip)
                rebroadcast = True

        elif event == "keep_alive":
            peer_guid = event_data["guid"]
            models.ensure_peer_exists(event_ip, peer_guid)

            # If the keepalive sequence number is new, apply and rebroadcast
            if event_data["ka_seq_number"] >= models.peer_expected_ka_seq(
                    peer_guid):
                models.keep_alive(event_data, event_ip)
                rebroadcast = True

        elif event == "deregister_file_by_hash":
            peer_guid = event_data["guid"]
            models.ensure_peer_exists(event_ip, peer_guid)

            # If the sequence number is new, apply and rebroadcast
            if event_data["seq_number"] >= models.peer_expected_seq(peer_guid):
                models.deregister_file_by_hash(event_data, event_ip)
                rebroadcast = True

        # Only rebroadcast if specified
        if rebroadcast:
            broadcaster.new_event(event, event_ip, event_data)
    except Exception:
        print("Recieved exception during tracker sync", sys.stderr)
        print_exc()

        return jsonify({
            "error": "Unexpected error",
            "success": False,
        })

    return jsonify(sync_response)
Example #13
from flask import Flask, Blueprint
from flask_restplus import Api
from app.core.log_wrapper import log
from app.core.settings import LocalhostConfig, ProductionConfig, TestConfig
from app.core.env_var_wrapper import EnvironmentVariableWrapper
from werkzeug.contrib.fixers import ProxyFix
from jsonschema import FormatChecker

bp = Blueprint('api', __name__)
api = Api(bp,
          version='1.0',
          title='Calls API',
          description='An Olist test',
          validate=True,
          format_checker=FormatChecker(formats=('date-time', )),
          doc='/docs')


def init_app():
    app = Flask(__name__)
    app.wsgi_app = ProxyFix(app.wsgi_app)
    env = EnvironmentVariableWrapper().env()
    if env == 'development':
        LocalhostConfig(app).config()
    elif env == 'production':
        ProductionConfig(app).config()
    elif env == 'test':
        TestConfig(app).config()
    else:
        raise ValueError('Please set FLASK_ENV. Possible values: ' +
                         '[development, production, test]')
Example #14
 def test(self, format=format):
     v = validators.Draft4Validator(
         {"format": format},
         format_checker=FormatChecker(),
     )
     v.validate(123)
Example #15
import logging
from flask_restplus import Api
from src import settings
from jsonschema import FormatChecker

api = Api(version='1.0',
          title='Groceries API',
          description='For creating a shopping list for the groceries.',
          format_checker=FormatChecker(
              formats=['date-time']))  # add the jsonschema format checkers here


@api.errorhandler
def default_error_handler(e):
    message = 'An unhandled exception occurred.'
    logging.exception(message)

    if not settings.FLASK_DEBUG:
        return {'message': message}, 500
Example #16
def parse(filename):
    try:
        schema = json.loads(open("pl.schema").read())
        schema = Draft4Validator(schema, format_checker=FormatChecker())
    except ValueError as e:
        post_error("pl.schema - " + str(e))
        return

    try:
        pl = json.loads(open(filename).read())
    except ValueError as e:
        post_error(filename + " - " + str(e))
        return

    for error in schema.iter_errors(pl):
        pprint(error.absolute_path)
        post_error(error.message)

    foldernames = []
    displaynames = []
    repositories = []

    path = Path(bitness_from_input)
    if path.exists():
        shutil.rmtree(path)
    os.mkdir(bitness_from_input)
    for plugin in pl["npp-plugins"]:
        print(plugin["display-name"])

        try:
            response = requests.get(plugin["repository"])
        except requests.exceptions.RequestException as e:
            post_error(str(e))
            continue

        if response.status_code != 200:
            post_error(
                f'{plugin["display-name"]}: failed to download plugin. Returned code {response.status_code}'
            )
            continue

        # Hash it and make sure it's what is expected
        hash = sha256(response.content).hexdigest()
        if plugin["id"].lower() != hash.lower():
            post_error(
                f'{plugin["display-name"]}: Invalid hash. Got {hash.lower()} but expected {plugin["id"]}'
            )
            continue

        # Make sure it's a valid zip file
        try:
            zip = zipfile.ZipFile(io.BytesIO(response.content))
        except zipfile.BadZipFile as e:
            post_error(f'{plugin["display-name"]}: Invalid zip file')
            continue

        # The expected DLL name
        dll_name = f'{plugin["folder-name"]}.dll'.lower()

        # Notepad++ is not case sensitive, but extracting files from the zip is,
        # so find the exact file name to use
        for file in zip.namelist():
            if dll_name == file.lower():
                dll_name = file
                break
        else:
            post_error(
                f'{plugin["display-name"]}: Zip file does not contain {plugin["folder-name"]}.dll'
            )
            continue

        with zip.open(dll_name) as dll_file, open(
                "./" + bitness_from_input + "/" + dll_name, 'wb') as f:
            f.write(dll_file.read())

        version = plugin["version"]

        # Fill in any of the missing numbers as zeros
        version = version + (3 - version.count('.')) * ".0"

        try:
            dll_version = get_version_number("./" + bitness_from_input + "/" +
                                             dll_name)
        except win32api.error:
            post_error(
                f'{plugin["display-name"]}: Does not contain any version information'
            )
            continue

        if dll_version != version:
            post_error(
                f'{plugin["display-name"]}: Unexpected DLL version. DLL is {dll_version} but expected {version}'
            )
            continue

        # Check uniqueness of the JSON folder-name, display-name and repository entries
        found = False
        for name in displaynames:
            if plugin["display-name"] == name:
                post_error(
                    f'{plugin["display-name"]}: non-unique display-name entry')
                found = True
        if not found:
            displaynames.append(plugin["display-name"])

        found = False
        for folder in foldernames:
            if plugin["folder-name"] == folder:
                post_error(
                    f'{plugin["folder-name"]}: non-unique folder-name entry')
                found = True
        if not found:
            foldernames.append(plugin["folder-name"])

        found = False
        for repo in repositories:
            if plugin["repository"] == repo:
                post_error(
                    f'{plugin["repository"]}: non-unique repository entry')
                found = True
        if not found:
            repositories.append(plugin["repository"])
Example #17
def persist_messages(messages, config, s3_client):
    state = None
    schemas = {}
    key_properties = {}
    headers = {}
    validators = {}

    delimiter = config.get('delimiter', ',')
    quotechar = config.get('quotechar', '"')

    # Use the system specific temp directory if no custom temp_dir provided
    temp_dir = os.path.expanduser(config.get('temp_dir',
                                             tempfile.gettempdir()))

    # Create temp_dir if not exists
    if temp_dir:
        os.makedirs(temp_dir, exist_ok=True)

    filenames = []
    now = datetime.now().strftime('%Y%m%dT%H%M%S')

    for message in messages:
        try:
            o = singer.parse_message(message).asdict()
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(message))
            raise
        message_type = o['type']
        if message_type == 'RECORD':
            if o['stream'] not in schemas:
                raise Exception(
                    "A record for stream {}"
                    "was encountered before a corresponding schema".format(
                        o['stream']))

            # Validate record
            try:
                validators[o['stream']].validate(
                    utils.float_to_decimal(o['record']))
            except Exception as ex:
                if type(ex).__name__ == "InvalidOperation":
                    logger.error(
                        "Data validation failed and cannot load to destination. RECORD: {}\n"
                        "'multipleOf' validations that allows long precisions are not supported"
                        " (i.e. with 15 digits or more). Try removing 'multipleOf' methods from JSON schema."
                        .format(o['record']))
                    raise ex

            record_to_load = o['record']
            if config.get('add_metadata_columns'):
                record_to_load = utils.add_metadata_values_to_record(o, {})
            else:
                record_to_load = utils.remove_metadata_values_from_record(o)

            filename = o['stream'] + '-' + now + '.csv'
            filename = os.path.expanduser(os.path.join(temp_dir, filename))
            target_key = utils.get_target_key(
                o,
                prefix=config.get('s3_key_prefix', ''),
                timestamp=now,
                naming_convention=config.get('naming_convention'))
            if not (filename, target_key) in filenames:
                filenames.append((filename, target_key))

            file_is_empty = (
                not os.path.isfile(filename)) or os.stat(filename).st_size == 0

            flattened_record = utils.flatten_record(record_to_load)

            if o['stream'] not in headers and not file_is_empty:
                with open(filename, 'r') as csvfile:
                    reader = csv.reader(csvfile,
                                        delimiter=delimiter,
                                        quotechar=quotechar)
                    first_line = next(reader)
                    headers[o[
                        'stream']] = first_line if first_line else flattened_record.keys(
                        )
            else:
                headers[o['stream']] = flattened_record.keys()

            with open(filename, 'a') as csvfile:
                writer = csv.DictWriter(csvfile,
                                        headers[o['stream']],
                                        extrasaction='ignore',
                                        delimiter=delimiter,
                                        quotechar=quotechar)
                if file_is_empty:
                    writer.writeheader()

                writer.writerow(flattened_record)

            state = None
        elif message_type == 'STATE':
            logger.debug('Setting state to {}'.format(o['value']))
            state = o['value']
        elif message_type == 'SCHEMA':
            stream = o['stream']
            schemas[stream] = o['schema']
            if config.get('add_metadata_columns'):
                schemas[stream] = utils.add_metadata_columns_to_schema(o)

            schema = utils.float_to_decimal(o['schema'])
            validators[stream] = Draft7Validator(
                schema, format_checker=FormatChecker())
            key_properties[stream] = o['key_properties']
        elif message_type == 'ACTIVATE_VERSION':
            logger.debug('ACTIVATE_VERSION message')
        else:
            logger.warning("Unknown message type {} in message {}".format(
                o['type'], o))

    # Upload created CSV files to S3
    for filename, target_key in filenames:
        compressed_file = None
        if config.get("compression") is None or config["compression"].lower(
        ) == "none":
            pass  # no compression
        else:
            if config["compression"] == "gzip":
                compressed_file = f"{filename}.gz"
                with open(filename, 'rb') as f_in:
                    with gzip.open(compressed_file, 'wb') as f_out:
                        logger.info(f"Compressing file as '{compressed_file}'")
                        shutil.copyfileobj(f_in, f_out)
            else:
                raise NotImplementedError(
                    "Compression type '{}' is not supported. "
                    "Expected: 'none' or 'gzip'".format(config["compression"]))
        s3.upload_file(compressed_file or filename,
                       s3_client,
                       config.get('s3_bucket'),
                       target_key,
                       encryption_type=config.get('encryption_type'),
                       encryption_key=config.get('encryption_key'))

        # Remove the local file(s)
        os.remove(filename)
        if compressed_file:
            os.remove(compressed_file)

    return state
Example #18
"""
OpenC2 Specific Format Checkers
"""
from jsonschema import FormatChecker

# Format Checker
ExtendedFormatChecker = FormatChecker()
"""
Default formats
email/idn-email - checks if string and `@` in string
ip-address/ipv4 - checks if valid ipv4 address (ip-address only valid on draft3)
host-name/hostname - checks if valid hostname (host-name only valid on draft3)
uri - checks if valid Resource Identifier (only valid if rfc3987 pkg installed)
date-time - checks if valid datetime (only valid if strict_rfc3339 pkg installed)
regex - checks for valid regex

# Draft 3 Only
time - checks if valid time
color - checks if valid webcolor (only valid if webcolors pkg installed)

# Draft 7 Only
idn-hostname - checks if valid Internationalized Internet host name (only if idna pkg installed)
iri - checks if valid Internationalized Resource Identifier (only valid if rfc3987 pkg installed)
iri-reference - checks if valid Internationalized Resource Identifier Reference (only valid if rfc3987 pkg installed)
time - checks if valid time (only valid if strict_rfc3339 pkg installed)
relative-json-pointer - TBD (only valid if jsonpointer pkg installed)

# Mixed
date - checks for valid date (only for draft3 and draft7)
json-pointer - TBD (only for draft6 and draft7 with jsonpointer pkg installed)
uri-template - checks for valid uri template (only for draft6 and draft7 with uritemplate pkg installed)
"""
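
# A minimal sketch (an illustrative addition, not part of the original
# module): registering an extra, hypothetical "openc2-command-id" format on
# ExtendedFormatChecker and querying it with conforms().
import uuid


@ExtendedFormatChecker.checks("openc2-command-id", raises=ValueError)
def is_command_id(instance):
    # Per jsonschema convention, non-string instances are not checked.
    if isinstance(instance, str):
        uuid.UUID(instance)
    return True


# ExtendedFormatChecker.conforms("not-a-uuid", "openc2-command-id") -> False
# ExtendedFormatChecker.conforms(str(uuid.uuid4()), "openc2-command-id") -> True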
Example #19
def schema_validate(request, schema):
    try:
        validate(instance=request, schema=schemas[schema], format_checker=FormatChecker())
    except ValidationError as e:
        return 'ValidationError: %s' % (e,)
    return None
Example #20
def validate_json(instance, schema):
    """Validate a dictionary using the provided json schema."""
    Validator(schema, format_checker=FormatChecker()).validate(instance)
Example #21
# -*- coding: utf-8 -*-
import json
import logging
from jsonschema import validate, ValidationError, FormatChecker
from werkzeug.routing import Map, Rule, NotFound

__validate_kwargs = {"format_checker": FormatChecker()}
__required_keys = ["httpMethod", "resource"]


class Response(object):
    """Class to conceptualize a response with default attributes

    if no body is specified, empty string is returned
    if no status_code is specified, 200 is returned
    if no headers are specified, empty dict is returned
    """
    def __init__(self, body=None, status_code=None, headers=None):
        self.body = body
        self.status_code = status_code
        self.headers = headers

    def to_json(self):
        return {
            "body": json.dumps(self.body) if self.body else None,
            "statusCode": self.status_code or 200,
            "headers": self.headers or {}
        }
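
# Usage sketch (an illustrative addition, not part of the original module):
# to_json() applies the defaults described in the docstring when the
# attributes are left unset.
assert Response().to_json() == {"body": None, "statusCode": 200, "headers": {}}
assert Response(body={"ok": True}, status_code=201).to_json() == {
    "body": '{"ok": true}', "statusCode": 201, "headers": {}
}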


def _float_cast(value):
Example #22
def persist_lines(config, lines) -> None:
    state = None
    flushed_state = None
    schemas = {}
    key_properties = {}
    validators = {}
    records_to_load = {}
    csv_files_to_load = {}
    row_count = {}
    stream_to_sync = {}
    total_row_count = {}
    table_columns_cache = None
    batch_size_rows = config.get('batch_size_rows', DEFAULT_BATCH_SIZE_ROWS)

    # Cache the available schemas, tables and columns from redshift if not disabled in config
    # The cache will be used later use to avoid lot of small queries hitting redshift
    if not config.get('disable_table_cache', False):
        logger.info("Caching available catalog objects in redshift...")
        filter_schemas = get_schema_names_from_config(config)
        table_columns_cache = DbSync(config).get_table_columns(
            filter_schemas=filter_schemas)

    # Loop over lines from stdin
    for line in lines:
        try:
            o = json.loads(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if 'type' not in o:
            raise Exception(
                "Line is missing required key 'type': {}".format(line))

        t = o['type']

        if t == 'RECORD':
            if 'stream' not in o:
                raise Exception(
                    "Line is missing required key 'stream': {}".format(line))
            if o['stream'] not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema"
                    .format(o['stream']))

            # Get schema for this record's stream
            stream = o['stream']

            # Validate record
            try:
                validators[stream].validate(float_to_decimal(o['record']))
            except Exception as ex:
                if type(ex).__name__ == "InvalidOperation":
                    logger.error(
                        "Data validation failed and cannot load to destination. RECORD: {}\n'multipleOf' validations "
                        "that allows long precisions are not supported (i.e. with 15 digits or more). Try removing "
                        "'multipleOf' methods from JSON schema. ".format(
                            o['record']))
                    raise ex

            primary_key_string = stream_to_sync[
                stream].record_primary_key_string(o['record'])
            if not primary_key_string:
                primary_key_string = 'RID-{}'.format(total_row_count[stream])

            if stream not in records_to_load:
                records_to_load[stream] = {}

            # increment row count only when a new PK is encountered in the current batch
            if primary_key_string not in records_to_load[stream]:
                row_count[stream] += 1
                total_row_count[stream] += 1

            # append record
            if config.get('add_metadata_columns') or config.get('hard_delete'):
                records_to_load[stream][
                    primary_key_string] = add_metadata_values_to_record(
                        o, stream_to_sync[stream])
            else:
                records_to_load[stream][primary_key_string] = o['record']

            if row_count[stream] >= batch_size_rows:
                # flush all streams, delete records if needed, reset counts and then emit current state
                if config.get('flush_all_streams'):
                    filter_streams = None
                else:
                    filter_streams = [stream]

                # Flush and return a new state dict with new positions only for the flushed streams
                flushed_state = flush_streams(records_to_load,
                                              row_count,
                                              stream_to_sync,
                                              config,
                                              state,
                                              flushed_state,
                                              filter_streams=filter_streams)

                # emit last encountered state
                emit_state(copy.deepcopy(flushed_state))

        elif t == 'SCHEMA':
            if 'stream' not in o:
                raise Exception(
                    "Line is missing required key 'stream': {}".format(line))

            stream = o['stream']

            schemas[stream] = o
            schema = float_to_decimal(o['schema'])
            validators[stream] = Draft4Validator(
                schema, format_checker=FormatChecker())

            # flush records from previous stream SCHEMA
            # if same stream has been encountered again, it means the schema might have been altered
            # so previous records need to be flushed
            if row_count.get(stream, 0) > 0:
                flushed_state = flush_streams(records_to_load, row_count,
                                              stream_to_sync, config, state,
                                              flushed_state)

                # emit latest encountered state
                emit_state(flushed_state)

            # key_properties key must be available in the SCHEMA message.
            if 'key_properties' not in o:
                raise Exception("key_properties field is required")

            # Log based and Incremental replications on tables with no Primary Key
            # cause duplicates when merging UPDATE events.
            # Stop loading data by default if no Primary Key.
            #
            # If you want to load tables with no Primary Key:
            #  1) Set ` 'primary_key_required': false ` in the target-redshift config.json
            #  or
            #  2) Use fastsync [postgres-to-redshift, mysql-to-redshift, etc.]
            if config.get('primary_key_required', True) and len(
                    o['key_properties']) == 0:
                logger.critical(
                    "Primary key is set to mandatory but not defined in the [{}] stream"
                    .format(stream))
                raise Exception("key_properties field is required")

            key_properties[stream] = o['key_properties']

            if config.get('add_metadata_columns') or config.get('hard_delete'):
                stream_to_sync[stream] = DbSync(
                    config, add_metadata_columns_to_schema(o))
            else:
                stream_to_sync[stream] = DbSync(config, o)

            stream_to_sync[stream].create_schema_if_not_exists(
                table_columns_cache)
            stream_to_sync[stream].sync_table(table_columns_cache)

            row_count[stream] = 0
            total_row_count[stream] = 0
            csv_files_to_load[stream] = NamedTemporaryFile(mode='w+b')

        elif t == 'ACTIVATE_VERSION':
            logger.debug('ACTIVATE_VERSION message')

        elif t == 'STATE':
            logger.debug('Setting state to {}'.format(o['value']))
            state = o['value']

            # Initially set flushed state
            if not flushed_state:
                flushed_state = copy.deepcopy(state)

        else:
            raise Exception("Unknown message type {} in message {}".format(
                o['type'], o))

    # if some bucket has records that need to be flushed but haven't reached batch size
    # then flush all buckets.
    if sum(row_count.values()) > 0:
        # flush all streams one last time, delete records if needed, reset counts and then emit current state
        flushed_state = flush_streams(records_to_load, row_count,
                                      stream_to_sync, config, state,
                                      flushed_state)

    # emit latest state
    emit_state(copy.deepcopy(flushed_state))
Example #23
from datetime import datetime, timedelta
from uuid import UUID

from flask import current_app
from iso8601 import ParseError, iso8601
from jsonschema import Draft7Validator, FormatChecker, ValidationError
from notifications_utils.recipients import (
    InvalidEmailError,
    InvalidPhoneError,
    validate_email_address,
    validate_phone_number,
)

from app.notifications.validators import decode_personalisation_files

format_checker = FormatChecker()


@format_checker.checks("validate_uuid", raises=Exception)
def validate_uuid(instance):
    if isinstance(instance, str):
        UUID(instance)
    return True


@format_checker.checks("phone_number", raises=InvalidPhoneError)
def validate_schema_phone_number(instance):
    if isinstance(instance, str):
        validate_phone_number(instance, international=True)
    return True
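
# A minimal sketch (an illustrative addition, not part of the original
# module): attaching the module-level format_checker to a Draft7Validator so
# the custom "validate_uuid" format is enforced during validation.
_example_schema = {"type": "string", "format": "validate_uuid"}
_example_validator = Draft7Validator(_example_schema,
                                     format_checker=format_checker)
# _example_validator.validate("not-a-uuid") would raise ValidationError here,
# while a well-formed UUID string passes.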
Example #24
def validate(instance, schema, cls=None, *args, **kwargs):
    """
    Calls jsonschema.validate() with the arguments.
    """
    format_checker = FormatChecker()
    _validate(instance, schema, cls, *args, format_checker=format_checker, **kwargs)
Example #25
def persist_messages(messages, config, s3_client, do_timestamp_file=True):
    logger.info('persist_messages')
    state = None
    schemas = {}
    key_properties = {}
    validators = {}

    filenames = []
    filename = None
    timestamp_file_part = '-' + datetime.now().strftime(
        '%Y%m%dT%H%M%S') if do_timestamp_file else ''
    max_file_size_mb = config.get('max_temp_file_size_mb', 50)
    stream = None

    if config.get('record_unique_field'):
        a = set()
        write_temp_pickle()

    for message in messages:
        try:
            o = singer.parse_message(message).asdict()
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(message))
            raise
        message_type = o['type']

        if message_type == 'RECORD':
            if o['stream'] not in schemas:
                raise Exception(
                    "A record for stream {}"
                    "was encountered before a corresponding schema".format(
                        o['stream']))

            # Validate record
            try:
                validators[o['stream']].validate(
                    utils.float_to_decimal(o['record']))
            except Exception as ex:
                if type(ex).__name__ == "InvalidOperation":
                    logger.error(
                        """Data validation failed and cannot load to destination. RECORD: {}\n
                    'multipleOf' validations that allows long precisions are not supported 
                    (i.e. with 15 digits or more). Try removing 'multipleOf' methods from JSON schema.
                    """.format(o['record']))
                    raise ex

            record_to_load = o['record']
            if config.get('add_metadata_columns'):
                record_to_load = utils.add_metadata_values_to_record(o, {})
            else:
                record_to_load = utils.remove_metadata_values_from_record(o)

            flattened_record = utils.flatten(record_to_load)
            filename = o['stream'] + timestamp_file_part + '.jsonl'
            filename = os.path.join(tempfile.gettempdir(), filename)
            filename = os.path.expanduser(filename)

            if not (filename, o['stream']) in filenames:
                filenames.append((filename, o['stream']))

            with open(filename, 'a') as f:
                f.write(json.dumps(flattened_record, cls=DecimalEncoder))
                f.write('\n')

            file_size = os.path.getsize(filename) if os.path.isfile(
                filename) else 0
            if file_size >> 20 > max_file_size_mb:
                logger.info('file_size: {} MB, filename: {}'.format(
                    round(file_size / (1024 * 1024), 2), filename))
                upload_to_s3(s3_client, config.get("s3_bucket"),
                             os.environ["TARGET_S3_SOURCE_NAME"], filename,
                             o['stream'],
                             config.get('field_to_partition_by_time'),
                             config.get('record_unique_field'),
                             config.get("compression"),
                             config.get('encryption_type'),
                             config.get('encryption_key'))
                filenames.remove((filename, o['stream']))
            state = None
        elif message_type == 'STATE':
            logger.info('Setting state to {}'.format(o['value']))
            state = o['value']
        elif message_type == 'SCHEMA':
            stream = o['stream']
            schemas[stream] = o['schema']
            if config.get('add_metadata_columns'):
                schemas[stream] = utils.add_metadata_columns_to_schema(o)

            schema = utils.float_to_decimal(o['schema'])
            validators[stream] = Draft4Validator(
                schema, format_checker=FormatChecker())
            key_properties[stream] = o['key_properties']
        elif message_type == 'ACTIVATE_VERSION':
            logger.debug('ACTIVATE_VERSION message')
        else:
            logger.warning("Unknown message type {} in message {}".format(
                o['type'], o))

    # Upload the created JSONL files to S3
    for filename, stream in filenames:
        upload_to_s3(s3_client, config.get("s3_bucket"),
                     os.environ["TARGET_S3_SOURCE_NAME"], filename, stream,
                     config.get('field_to_partition_by_time'),
                     config.get('record_unique_field'),
                     config.get("compression"), config.get('encryption_type'),
                     config.get('encryption_key'))

    return state
Example #26
 def __init__(self, validator_class, schema_factory, module, resolver=None, format_checker=None):
     self.schema_factory = schema_factory
     self.validator_class = validator_class
     self.resolver = resolver
     self.format_checker = format_checker or FormatChecker()
     self.module = module
Example #27
 def test_it_returns_true_for_formats_it_does_not_know_about(self):
     validator = self.validator_class(
         {"format" : "carrot"}, format_checker=FormatChecker(),
     )
     validator.validate("bugs")
Example #28
def save_upload():
    """ Ajax route to create a manifest for each uploaded file  
	and insert it in the database.
	"""
    if request.method == 'POST':
        print('Working session folder: ' + session['IMPORT_DIR'])
        errors = []
        # Make sure the collection exists before handling form data
        try:
            result = list(corpus_db.find({'name': request.json['collection']}))
            assert result != []
            # Handle the form data
            exclude = ['branch', 'category', 'collection']
            node_metadata = {}
            for key, value in request.json.items():
                if key not in exclude and value != '' and value != []:
                    node_metadata[key] = value
            # Set the name and metapath
            if request.json['collection'].startswith('Corpus,'):
                collection = request.json['collection']
            else:
                collection = 'Corpus,' + request.json['collection']
        except:
            errors.append(
                'The specified collection does not exist in the database. Check your entry or <a href="/corpus/create">create a collection</a> before importing data.'
            )

        if len(errors) == 0:
            # Set the name and path for the new manifest
            node_metadata = {}
            if request.json['branch'] != '':
                node_metadata['name'] = request.json['branch']
                node_metadata[
                    'metapath'] = collection + ',' + request.json['category']
            else:
                node_metadata['name'] = request.json['category']
                node_metadata['metapath'] = collection

        # If the specified metapath does not exist, create it
        if len(errors) == 0:
            parent = list(
                corpus_db.find({
                    'name': node_metadata['name'],
                    'metapath': node_metadata['metapath']
                }))
            if len(parent) == 0:
                try:
                    corpus_db.insert_one(node_metadata)
                except:
                    errors.append(
                        '<p>The specified metapath does not exist and could not be created.</p>'
                    )

        # Now create a data manifest for each file and insert it
        if len(errors) == 0:
            for filename in os.listdir(session['IMPORT_DIR']):
                print('Creating manifest for ' + filename)
                if filename.endswith('.json'):
                    filepath = os.path.join(session['IMPORT_DIR'], filename)
                    metapath = node_metadata['metapath'] + ',' + node_metadata[
                        'name']
                    manifest = {
                        'name': os.path.splitext(filename)[0],
                        'namespace': 'we1sv2.0',
                        'metapath': metapath
                    }
                    try:
                        with open(filepath, 'rb') as f:
                            doc = json.loads(f.read())
                            for key, value in doc.items():
                                if key not in [
                                        'name', 'namespace', 'metapath'
                                ]:
                                    manifest[key] = value
                    except:
                        errors.append(
                            '<p>The file <code>' + filename +
                            '</code> could not be loaded or it did not have a <code>content</code> property.</p>'
                        )
                    # Validate the manifest before inserting
                    schema_file = 'https://raw.githubusercontent.com/whatevery1says/manifest/master/schema/v2.0/Corpus/Data.json'
                    schema = json.loads(requests.get(schema_file).text)
                    print(manifest['name'])
                    print(manifest['metapath'])
                    try:
                        methods.validate(manifest,
                                         schema,
                                         format_checker=FormatChecker())
                        result = methods.create_record(manifest)
                        print('Is this my error')
                        errors = errors + result
                        print(errors)
                    except:
                        errors.append(
                            '<p>A valid manifest could not be created from the file <code>'
                            + filename +
                            '</code> or the manifest could not be added to the database due to an unknown error.</p>'
                        )
                else:
                    errors.append('<p>The file <code>' + filename +
                                  '</code> is an invalid format.</p>')

        # We're done. Delete the import directory
        shutil.rmtree(session['IMPORT_DIR'])

        # Refresh the session
        token = datetime.now().strftime('%Y%m%d_') + str(randint(0, 99))
        session['IMPORT_DIR'] = os.path.join(TEMP_DIR,
                                             token).replace('\\', '/')

        if len(errors) == 0:
            return json.dumps({'result': 'success', 'session_token': token})
        else:
            return json.dumps({'errors': errors})
Example #29
 def test_it_raises_a_key_error_for_unknown_formats(self):
     with self.assertRaises(KeyError):
         FormatChecker(formats=["o noes"])
Example #30
def parse(filename):
    try:
        schema = json.loads(open("udl.schema").read())
        schema = Draft7Validator(schema, format_checker=FormatChecker())
    except ValueError as e:
        post_error("udl.schema - " + str(e))
        return

    try:
        udlfile = json.loads(open(filename, encoding="utf8").read())
    except ValueError as e:
        post_error(filename + " - " + str(e))
        return

    for error in schema.iter_errors(udlfile):
        post_error(error.message)

    idnames = []
    displaynames = []
    repositories = []
    response = []

    for udl in udlfile["UDLs"]:
        print(udl["display-name"])

        try:
            if udl["repository"] != "":
                response = requests.get(udl["repository"])
        except requests.exceptions.RequestException as e:
            post_error(str(e))
            continue

        if udl["repository"] != "" and response.status_code != 200:
            post_error(
                f'{udl["display-name"]}: failed to download udl. Returned code {response.status_code}'
            )
            continue

        # Hash it and make sure it's what is expected
        #hash = sha256(response.content).hexdigest()
        #if udl["id"].lower() != hash.lower():
        #    post_error(f'{udl["display-name"]}: Invalid hash. Got {hash.lower()} but expected {udl["id"]}')
        #    continue

        # Check uniqueness of the JSON id-name, display-name and repository entries
        found = False
        for name in displaynames:
            if udl["display-name"] == name:
                post_error(
                    f'{udl["display-name"]}: non-unique display-name entry')
                found = True
        if not found:
            displaynames.append(udl["display-name"])

        found = False
        for idname in idnames:
            if udl["id-name"] == idname:
                post_error(f'{udl["id-name"]}: non unique id-name entry')
                found = True
        if found == False:
            idnames.append(udl["id-name"])

        found = False
        for repo in repositories:
            if udl["repository"] != "" and udl["repository"] == repo:
                post_error(f'{udl["repository"]}: non unique repository entry')
                found = True
        if found == False:
            repositories.append(udl["repository"])