def __init__(self, schemaformat):
    self.schema = schemaformat
    checker = FormatChecker()
    self.validator = Draft4Validator(self.schema, format_checker=checker)
def uri_blank(value):
    return value == "" or FormatChecker().conforms(value, "uri")
def test_it_can_validate_no_formats(self):
    checker = FormatChecker(formats=())
    self.assertFalse(checker.checkers)
def persist_lines(config, lines):
    state = None
    schemas = {}
    key_properties = {}
    headers = {}
    validators = {}
    csv_files_to_load = {}
    row_count = {}
    stream_to_sync = {}
    primary_key_exists = {}
    batch_size = config['batch_size'] if 'batch_size' in config else 100000
    now = datetime.now().strftime('%Y%m%dT%H%M%S')

    # Loop over lines from stdin
    for line in lines:
        try:
            o = json.loads(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if 'type' not in o:
            raise Exception("Line is missing required key 'type': {}".format(line))
        t = o['type']

        if t == 'RECORD':
            if 'stream' not in o:
                raise Exception("Line is missing required key 'stream': {}".format(line))
            if o['stream'] not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema".format(o['stream']))

            # Get schema for this record's stream
            stream = o['stream']

            # Validate record
            validators[stream].validate(float_to_decimal(o['record']))

            sync = stream_to_sync[stream]

            primary_key_string = sync.record_primary_key_string(o['record'])
            if stream not in primary_key_exists:
                primary_key_exists[stream] = {}
            if primary_key_string and primary_key_string in primary_key_exists[stream]:
                flush_records(o, csv_files_to_load, row_count, primary_key_exists, sync)

            csv_line = sync.record_to_csv_line(o['record'])
            csv_files_to_load[o['stream']].write(bytes(csv_line + '\n', 'UTF-8'))
            row_count[o['stream']] += 1
            if primary_key_string:
                primary_key_exists[stream][primary_key_string] = True

            if row_count[o['stream']] >= batch_size:
                flush_records(o, csv_files_to_load, row_count, primary_key_exists, sync)

            state = None
        elif t == 'STATE':
            logger.debug('Setting state to {}'.format(o['value']))
            state = o['value']
        elif t == 'SCHEMA':
            if 'stream' not in o:
                raise Exception("Line is missing required key 'stream': {}".format(line))
            stream = o['stream']

            schema = float_to_decimal(o['schema'])
            if 'properties' not in schema:
                logger.debug(
                    f"Schema for stream '{stream}' is missing properties. Waiting for a properly-formed schema.")
                continue

            schemas[stream] = o
            walk_schema_for_numeric_precision(schema)

            validators[stream] = Draft4Validator(schema, format_checker=FormatChecker())

            if 'key_properties' not in o:
                raise Exception("key_properties field is required")
            key_properties[stream] = o['key_properties']

            stream_to_sync[stream] = DbSync(config, o)
            stream_to_sync[stream].create_schema_if_not_exists()
            stream_to_sync[stream].sync_table()

            row_count[stream] = 0
            csv_files_to_load[stream] = NamedTemporaryFile(mode='w+b')
        elif t == 'ACTIVATE_VERSION':
            logger.debug('ACTIVATE_VERSION message')
        else:
            raise Exception("Unknown message type {} in message {}".format(o['type'], o))

    for (stream_name, count) in row_count.items():
        if count > 0:
            stream_to_sync[stream_name].load_csv(csv_files_to_load[stream_name], count)

    return state
def _validate_schema(self, items, json_name):
    schema = self.json_schemas[json_name]
    # May raise jsonschema.ValidationError
    validate(items, schema, format_checker=FormatChecker())
from flask_restplus import Api
from jsonschema import FormatChecker
import logging

log = logging.getLogger(__name__)

# Instantiate a Flask-RESTPlus API
api = Api(version='1.0',
          title='iter8 analytics REST API',
          description='API to perform analytics to support canary releases '
                      'and A/B tests',
          format_checker=FormatChecker(formats=("date-time", )))


def build_http_error(msg, http_code):
    '''Returns a specific error message and HTTP code that can be used by
    the REST API'''
    return {'message': msg}, http_code


@api.errorhandler
def default_error_handler(e):
    '''Error handler for uncaught exceptions'''
    message = 'An unexpected error occurred'
    log.exception(message)
    return {'message': message}, 500
def schema(self, value):
    self._schema = Draft7Validator(value, format_checker=FormatChecker())
def test_it_can_register_checkers(self):
    checker = FormatChecker()
    checker.checks("new")(self.fn)
    self.assertEqual(
        checker.checkers,
        dict(FormatChecker.checkers, new=(self.fn, ())),
    )
def test_format_checker_object_on_constructor(self, app, client):
    from jsonschema import FormatChecker

    self._setup_api_format_checker_tests(app, format_checker=FormatChecker())

    out = client.post_json('/format_checker/', {'ip': '192.168.1'}, status=400)
    assert 'ipv4' in out['errors']['ip']
def _validator(self):
    Draft4Validator.check_schema(self.request)
    return Draft4Validator(self.request, format_checker=FormatChecker())
def _update_validator(self):
    Draft4Validator.check_schema(self.update)
    return Draft4Validator(self.update, format_checker=FormatChecker())
def tracker_sync():
    request_data = request.get_json(silent=True)
    requester_ip = request.remote_addr

    if request_data is None:
        return jsonify({
            "error": "Request is not JSON",
            "success": False,
        })

    if not models.tracker_ip_exists(requester_ip):
        # Return an error if the tracker is not in the tracker list
        return jsonify({
            "success": False,
            "dead_tracker": True,
            "error": "Tracker not in tracker list",
        })

    try:
        validate(request_data, schemas.TRACKER_SYNC_SCHEMA,
                 format_checker=FormatChecker())
    except ValidationError as e:
        return jsonify({
            "error": str(e),
            "success": False,
        })

    event = request_data["event"]
    event_ip = request_data["event_ip"]
    event_data = request_data["data"]

    # By default, don't rebroadcast and respond with success
    rebroadcast = False
    sync_response = {"success": True}

    try:
        if event == "new_tracker":
            # If the tracker doesn't exist, rebroadcast and add it
            if not models.tracker_ip_exists(event_ip):
                tracker = models.add_tracker(event_ip)
                # Can't just set rebroadcast here since we need to broadcast
                # before adding the tracker
                broadcaster.new_event(event, event_ip, event_data)
                broadcaster.new_tracker(tracker)
        elif event == "add_file":
            peer_guid = event_data["guid"]
            models.ensure_peer_exists(event_ip, peer_guid, event_data["seq_number"])
            # If the sequence number is new, apply and rebroadcast
            if event_data["seq_number"] >= models.peer_expected_seq(peer_guid):
                models.add_file(event_data, event_ip)
                rebroadcast = True
        elif event == "keep_alive":
            peer_guid = event_data["guid"]
            models.ensure_peer_exists(event_ip, peer_guid)
            # If the keepalive sequence number is new, apply and rebroadcast
            if event_data["ka_seq_number"] >= models.peer_expected_ka_seq(peer_guid):
                models.keep_alive(event_data, event_ip)
                rebroadcast = True
        elif event == "deregister_file_by_hash":
            peer_guid = event_data["guid"]
            models.ensure_peer_exists(event_ip, peer_guid)
            # If the sequence number is new, apply and rebroadcast
            if event_data["seq_number"] >= models.peer_expected_seq(peer_guid):
                models.deregister_file_by_hash(event_data, event_ip)
                rebroadcast = True

        # Only rebroadcast if specified
        if rebroadcast:
            broadcaster.new_event(event, event_ip, event_data)
    except Exception:
        print("Received exception during tracker sync", file=sys.stderr)
        print_exc()
        return jsonify({
            "error": "Unexpected error",
            "success": False,
        })

    return jsonify(sync_response)
from flask import Flask, Blueprint
from flask_restplus import Api
from app.core.log_wrapper import log
from app.core.settings import LocalhostConfig, ProductionConfig, TestConfig
from app.core.env_var_wrapper import EnvironmentVariableWrapper
from werkzeug.contrib.fixers import ProxyFix
from jsonschema import FormatChecker

bp = Blueprint('api', __name__)
api = Api(bp,
          version='1.0',
          title='Calls API',
          description='An Olist test',
          validate=True,
          format_checker=FormatChecker(formats=('date-time', )),
          doc='/docs')


def init_app():
    app = Flask(__name__)
    app.wsgi_app = ProxyFix(app.wsgi_app)
    env = EnvironmentVariableWrapper().env()

    if env == 'development':
        LocalhostConfig(app).config()
    elif env == 'production':
        ProductionConfig(app).config()
    elif env == 'test':
        TestConfig(app).config()
    else:
        raise ValueError('Please, inform FLASK_ENV. Possible values = '
                         '[development, qa, production]')
def test(self, format=format):
    v = validators.Draft4Validator(
        {"format": format},
        format_checker=FormatChecker(),
    )
    v.validate(123)
import logging

from flask_restplus import Api
from src import settings
from jsonschema import FormatChecker

api = Api(version='1.0',
          title='Groceries API',
          description='For creating a shopping list for the groceries.',
          format_checker=FormatChecker(formats=['date-time']))  # add the jsonschema formatcheckers here


@api.errorhandler
def default_error_handler(e):
    message = 'An unhandled exception occurred.'
    logging.exception(message)

    if not settings.FLASK_DEBUG:
        return {'message': message}, 500
def parse(filename): try: schema = json.loads(open("pl.schema").read()) schema = Draft4Validator(schema, format_checker=FormatChecker()) except ValueError as e: post_error("pl.schema - " + str(e)) return try: pl = json.loads(open(filename).read()) except ValueError as e: post_error(filename + " - " + str(e)) return for error in schema.iter_errors(pl): pprint(error.absolute_path) post_error(error.message) foldernames = [] displaynames = [] repositories = [] path = Path(bitness_from_input) if path.exists(): shutil.rmtree(path) os.mkdir(bitness_from_input) for plugin in pl["npp-plugins"]: print(plugin["display-name"]) try: response = requests.get(plugin["repository"]) except requests.exceptions.RequestException as e: post_error(str(e)) continue if response.status_code != 200: post_error( f'{plugin["display-name"]}: failed to download plugin. Returned code {response.status_code}' ) continue # Hash it and make sure its what is expected hash = sha256(response.content).hexdigest() if plugin["id"].lower() != hash.lower(): post_error( f'{plugin["display-name"]}: Invalid hash. Got {hash.lower()} but expected {plugin["id"]}' ) continue # Make sure its a valid zip file try: zip = zipfile.ZipFile(io.BytesIO(response.content)) except zipfile.BadZipFile as e: post_error(f'{plugin["display-name"]}: Invalid zip file') continue # The expected DLL name dll_name = f'{plugin["folder-name"]}.dll'.lower() # Notepad++ is not case sensitive, but extracting files from the zip is, # so find the exactfile name to use for file in zip.namelist(): if dll_name == file.lower(): dll_name = file break else: post_error( f'{plugin["display-name"]}: Zip file does not contain {plugin["folder-name"]}.dll' ) continue with zip.open(dll_name) as dll_file, open( "./" + bitness_from_input + "/" + dll_name, 'wb') as f: f.write(dll_file.read()) version = plugin["version"] # Fill in any of the missing numbers as zeros version = version + (3 - version.count('.')) * ".0" try: dll_version = get_version_number("./" + bitness_from_input + "/" + dll_name) except win32api.error: post_error( f'{plugin["display-name"]}: Does not contain any version information' ) continue if dll_version != version: post_error( f'{plugin["display-name"]}: Unexpected DLL version. DLL is {dll_version} but expected {version}' ) continue #check uniqueness of json folder-name, display-name and repository found = False for name in displaynames: if plugin["display-name"] == name: post_error( f'{plugin["display-name"]}: non unique display-name entry') found = True if found == False: displaynames.append(plugin["display-name"]) found = False for folder in foldernames: if plugin["folder-name"] == folder: post_error( f'{plugin["folder-name"]}: non unique folder-name entry') found = True if found == False: foldernames.append(plugin["folder-name"]) found = False for repo in repositories: if plugin["repository"] == repo: post_error( f'{plugin["repository"]}: non unique repository entry') found = True if found == False: repositories.append(plugin["repository"])
def persist_messages(messages, config, s3_client):
    state = None
    schemas = {}
    key_properties = {}
    headers = {}
    validators = {}

    delimiter = config.get('delimiter', ',')
    quotechar = config.get('quotechar', '"')

    # Use the system specific temp directory if no custom temp_dir provided
    temp_dir = os.path.expanduser(config.get('temp_dir', tempfile.gettempdir()))

    # Create temp_dir if not exists
    if temp_dir:
        os.makedirs(temp_dir, exist_ok=True)

    filenames = []
    now = datetime.now().strftime('%Y%m%dT%H%M%S')

    for message in messages:
        try:
            o = singer.parse_message(message).asdict()
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(message))
            raise
        message_type = o['type']
        if message_type == 'RECORD':
            if o['stream'] not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema".format(o['stream']))

            # Validate record
            try:
                validators[o['stream']].validate(utils.float_to_decimal(o['record']))
            except Exception as ex:
                if type(ex).__name__ == "InvalidOperation":
                    logger.error(
                        "Data validation failed and cannot load to destination. RECORD: {}\n"
                        "'multipleOf' validations that allow long precisions are not supported"
                        " (i.e. with 15 digits or more). Try removing 'multipleOf' methods from JSON schema."
                        .format(o['record']))
                raise ex

            record_to_load = o['record']
            if config.get('add_metadata_columns'):
                record_to_load = utils.add_metadata_values_to_record(o, {})
            else:
                record_to_load = utils.remove_metadata_values_from_record(o)

            filename = o['stream'] + '-' + now + '.csv'
            filename = os.path.expanduser(os.path.join(temp_dir, filename))
            target_key = utils.get_target_key(
                o,
                prefix=config.get('s3_key_prefix', ''),
                timestamp=now,
                naming_convention=config.get('naming_convention'))
            if not (filename, target_key) in filenames:
                filenames.append((filename, target_key))

            file_is_empty = (not os.path.isfile(filename)) or os.stat(filename).st_size == 0

            flattened_record = utils.flatten_record(record_to_load)

            if o['stream'] not in headers and not file_is_empty:
                with open(filename, 'r') as csvfile:
                    reader = csv.reader(csvfile,
                                        delimiter=delimiter,
                                        quotechar=quotechar)
                    first_line = next(reader)
                    headers[o['stream']] = first_line if first_line else flattened_record.keys()
            else:
                headers[o['stream']] = flattened_record.keys()

            with open(filename, 'a') as csvfile:
                writer = csv.DictWriter(csvfile,
                                        headers[o['stream']],
                                        extrasaction='ignore',
                                        delimiter=delimiter,
                                        quotechar=quotechar)
                if file_is_empty:
                    writer.writeheader()

                writer.writerow(flattened_record)

            state = None
        elif message_type == 'STATE':
            logger.debug('Setting state to {}'.format(o['value']))
            state = o['value']
        elif message_type == 'SCHEMA':
            stream = o['stream']
            schemas[stream] = o['schema']
            if config.get('add_metadata_columns'):
                schemas[stream] = utils.add_metadata_columns_to_schema(o)

            schema = utils.float_to_decimal(o['schema'])
            validators[stream] = Draft7Validator(schema, format_checker=FormatChecker())
            key_properties[stream] = o['key_properties']
        elif message_type == 'ACTIVATE_VERSION':
            logger.debug('ACTIVATE_VERSION message')
        else:
            logger.warning("Unknown message type {} in message {}".format(o['type'], o))

    # Upload created CSV files to S3
    for filename, target_key in filenames:
        compressed_file = None
        if config.get("compression") is None or config["compression"].lower() == "none":
            pass  # no compression
        else:
            if config["compression"] == "gzip":
                compressed_file = f"{filename}.gz"
                with open(filename, 'rb') as f_in:
                    with gzip.open(compressed_file, 'wb') as f_out:
                        logger.info(f"Compressing file as '{compressed_file}'")
                        shutil.copyfileobj(f_in, f_out)
            else:
                raise NotImplementedError(
                    "Compression type '{}' is not supported. "
                    "Expected: 'none' or 'gzip'".format(config["compression"]))

        s3.upload_file(compressed_file or filename,
                       s3_client,
                       config.get('s3_bucket'),
                       target_key,
                       encryption_type=config.get('encryption_type'),
                       encryption_key=config.get('encryption_key'))

        # Remove the local file(s)
        os.remove(filename)
        if compressed_file:
            os.remove(compressed_file)

    return state
""" OpenC2 Specific Format Checkers """ from jsonschema import FormatChecker # Format Checker ExtendedFormatChecker = FormatChecker() """ Default formats email/idn-email - checks if string and `@` in string ip-address/ipv4 - checks if valid ipv4 address (ip-address only valid on draft3) host-name/hostname - checks if valid hostname (host-name only valid on draft3) uri - checks if valid Resource Identifier (only valid if rfc3987 pk installed) date-time - checks if valid datetime (only valid if strict_rfc3339 pkg installed) regex - checks for valid regex # Draft 3 Only time - checks if valid time color - checks if valid webcolor (only valid if webcolors pkg installed) # Draft 7 Only idn-hostname - checks if valid Internationalized Internet host name (only if idna pkg installed) iri - checks if valid Internationalized Resource Identifier (only valid if rfc3987 pk installed) iri-reference - checks if valid Internationalized Resource Identifier Reference (only valid if rfc3987 pk installed) time - checks if valid time (only valid if strict_rfc3339 pkg installed) relative-json-pointer - TBD (only valid if jsonpointer pkg installed) # Mixed date - checks for valid date (only for draft3 and draft7) json-pointer - TBD (only for draft6 and draft7 with jsonpointer pkg installed) uri-template - checks for valid uri template (only for draft6 and draft7 with uritemplate pkg installed)
def schema_validate(request, schema):
    try:
        validate(instance=request, schema=schemas[schema],
                 format_checker=FormatChecker())
    except ValidationError as e:
        return 'ValidationError: %s' % (e,)
    return None
def validate_json(instance, schema):
    """Validate a dictionary using the provided json schema."""
    Validator(schema, format_checker=FormatChecker()).validate(instance)
# -*- coding: utf-8 -*-
import json
import logging

from jsonschema import validate, ValidationError, FormatChecker
from werkzeug.routing import Map, Rule, NotFound

__validate_kwargs = {"format_checker": FormatChecker()}
__required_keys = ["httpMethod", "resource"]


class Response(object):
    """Class to conceptualize a response with default attributes

    if no body is specified, empty string is returned
    if no status_code is specified, 200 is returned
    if no headers are specified, empty dict is returned
    """

    def __init__(self, body=None, status_code=None, headers=None):
        self.body = body
        self.status_code = status_code
        self.headers = headers

    def to_json(self):
        return {
            "body": json.dumps(self.body) if self.body else None,
            "statusCode": self.status_code or 200,
            "headers": self.headers or {}
        }


def _float_cast(value):
def persist_lines(config, lines) -> None:
    state = None
    flushed_state = None
    schemas = {}
    key_properties = {}
    validators = {}
    records_to_load = {}
    csv_files_to_load = {}
    row_count = {}
    stream_to_sync = {}
    total_row_count = {}
    table_columns_cache = None
    batch_size_rows = config.get('batch_size_rows', DEFAULT_BATCH_SIZE_ROWS)

    # Cache the available schemas, tables and columns from redshift if not disabled in config
    # The cache will be used later to avoid lots of small queries hitting redshift
    if not ('disable_table_cache' in config and config['disable_table_cache'] == True):
        logger.info("Caching available catalog objects in redshift...")
        filter_schemas = get_schema_names_from_config(config)
        table_columns_cache = DbSync(config).get_table_columns(
            filter_schemas=filter_schemas)

    # Loop over lines from stdin
    for line in lines:
        try:
            o = json.loads(line)
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(line))
            raise

        if 'type' not in o:
            raise Exception("Line is missing required key 'type': {}".format(line))
        t = o['type']

        if t == 'RECORD':
            if 'stream' not in o:
                raise Exception("Line is missing required key 'stream': {}".format(line))
            if o['stream'] not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema".format(o['stream']))

            # Get schema for this record's stream
            stream = o['stream']

            # Validate record
            try:
                validators[stream].validate(float_to_decimal(o['record']))
            except Exception as ex:
                if type(ex).__name__ == "InvalidOperation":
                    logger.error(
                        "Data validation failed and cannot load to destination. RECORD: {}\n"
                        "'multipleOf' validations that allow long precisions are not supported "
                        "(i.e. with 15 digits or more). Try removing 'multipleOf' methods from JSON schema."
                        .format(o['record']))
                raise ex

            primary_key_string = stream_to_sync[stream].record_primary_key_string(o['record'])
            if not primary_key_string:
                primary_key_string = 'RID-{}'.format(total_row_count[stream])

            if stream not in records_to_load:
                records_to_load[stream] = {}

            # Increment row count only when a new PK is encountered in the current batch
            if primary_key_string not in records_to_load[stream]:
                row_count[stream] += 1
                total_row_count[stream] += 1

            # Append record
            if config.get('add_metadata_columns') or config.get('hard_delete'):
                records_to_load[stream][primary_key_string] = add_metadata_values_to_record(
                    o, stream_to_sync[stream])
            else:
                records_to_load[stream][primary_key_string] = o['record']

            if row_count[stream] >= batch_size_rows:
                # Flush all streams, delete records if needed, reset counts and then emit current state
                if config.get('flush_all_streams'):
                    filter_streams = None
                else:
                    filter_streams = [stream]

                # Flush and return a new state dict with new positions only for the flushed streams
                flushed_state = flush_streams(records_to_load,
                                              row_count,
                                              stream_to_sync,
                                              config,
                                              state,
                                              flushed_state,
                                              filter_streams=filter_streams)

                # Emit last encountered state
                emit_state(copy.deepcopy(flushed_state))

        elif t == 'SCHEMA':
            if 'stream' not in o:
                raise Exception("Line is missing required key 'stream': {}".format(line))

            stream = o['stream']
            schemas[stream] = o
            schema = float_to_decimal(o['schema'])
            validators[stream] = Draft4Validator(schema, format_checker=FormatChecker())

            # Flush records from previous stream SCHEMA
            # if same stream has been encountered again, it means the schema might have been altered
            # so previous records need to be flushed
            if row_count.get(stream, 0) > 0:
                flushed_state = flush_streams(records_to_load, row_count,
                                              stream_to_sync, config, state,
                                              flushed_state)

                # Emit latest encountered state
                emit_state(flushed_state)

            # key_properties key must be available in the SCHEMA message.
            if 'key_properties' not in o:
                raise Exception("key_properties field is required")

            # Log based and Incremental replications on tables with no Primary Key
            # cause duplicates when merging UPDATE events.
            # Stop loading data by default if no Primary Key.
            #
            # If you want to load tables with no Primary Key:
            #  1) Set ` 'primary_key_required': false ` in the target-redshift config.json
            #  or
            #  2) Use fastsync [postgres-to-redshift, mysql-to-redshift, etc.]
            if config.get('primary_key_required', True) and len(o['key_properties']) == 0:
                logger.critical(
                    "Primary key is set to mandatory but not defined in the [{}] stream".format(stream))
                raise Exception("key_properties field is required")

            key_properties[stream] = o['key_properties']

            if config.get('add_metadata_columns') or config.get('hard_delete'):
                stream_to_sync[stream] = DbSync(config, add_metadata_columns_to_schema(o))
            else:
                stream_to_sync[stream] = DbSync(config, o)

            stream_to_sync[stream].create_schema_if_not_exists(table_columns_cache)
            stream_to_sync[stream].sync_table(table_columns_cache)

            row_count[stream] = 0
            total_row_count[stream] = 0
            csv_files_to_load[stream] = NamedTemporaryFile(mode='w+b')

        elif t == 'ACTIVATE_VERSION':
            logger.debug('ACTIVATE_VERSION message')

        elif t == 'STATE':
            logger.debug('Setting state to {}'.format(o['value']))
            state = o['value']

            # Initially set flushed state
            if not flushed_state:
                flushed_state = copy.deepcopy(state)

        else:
            raise Exception("Unknown message type {} in message {}".format(o['type'], o))

    # If some bucket has records that need to be flushed but haven't reached batch size
    # then flush all buckets.
    if sum(row_count.values()) > 0:
        # Flush all streams one last time, delete records if needed, reset counts and then emit current state
        flushed_state = flush_streams(records_to_load, row_count, stream_to_sync,
                                      config, state, flushed_state)

    # Emit latest state
    emit_state(copy.deepcopy(flushed_state))
from datetime import datetime, timedelta
from uuid import UUID

from flask import current_app
from iso8601 import ParseError, iso8601
from jsonschema import Draft7Validator, FormatChecker, ValidationError
from notifications_utils.recipients import (
    InvalidEmailError,
    InvalidPhoneError,
    validate_email_address,
    validate_phone_number,
)

from app.notifications.validators import decode_personalisation_files

format_checker = FormatChecker()


@format_checker.checks("validate_uuid", raises=Exception)
def validate_uuid(instance):
    if isinstance(instance, str):
        UUID(instance)
    return True


@format_checker.checks("phone_number", raises=InvalidPhoneError)
def validate_schema_phone_number(instance):
    if isinstance(instance, str):
        validate_phone_number(instance, international=True)
    return True
def validate(instance, schema, cls=None, *args, **kwargs):
    """Calls jsonschema.validate() with the arguments."""
    format_checker = FormatChecker()
    _validate(instance, schema, cls, *args, format_checker=format_checker, **kwargs)
def persist_messages(messages, config, s3_client, do_timestamp_file=True):
    logger.info('persist_messages')
    state = None
    schemas = {}
    key_properties = {}
    validators = {}

    filenames = []
    filename = None
    timestamp_file_part = ('-' + datetime.now().strftime('%Y%m%dT%H%M%S')
                           if do_timestamp_file else '')
    max_file_size_mb = config.get('max_temp_file_size_mb', 50)
    stream = None

    if config.get('record_unique_field'):
        a = set()
        write_temp_pickle()

    for message in messages:
        try:
            o = singer.parse_message(message).asdict()
        except json.decoder.JSONDecodeError:
            logger.error("Unable to parse:\n{}".format(message))
            raise
        message_type = o['type']
        if message_type == 'RECORD':
            if o['stream'] not in schemas:
                raise Exception(
                    "A record for stream {} was encountered before a corresponding schema".format(o['stream']))

            # Validate record
            try:
                validators[o['stream']].validate(utils.float_to_decimal(o['record']))
            except Exception as ex:
                if type(ex).__name__ == "InvalidOperation":
                    logger.error(
                        """Data validation failed and cannot load to destination. RECORD: {}\n
                        'multipleOf' validations that allow long precisions are not supported
                        (i.e. with 15 digits or more). Try removing 'multipleOf' methods from JSON schema.
                        """.format(o['record']))
                raise ex

            record_to_load = o['record']
            if config.get('add_metadata_columns'):
                record_to_load = utils.add_metadata_values_to_record(o, {})
            else:
                record_to_load = utils.remove_metadata_values_from_record(o)

            flattened_record = utils.flatten(record_to_load)

            filename = o['stream'] + timestamp_file_part + '.jsonl'
            filename = os.path.join(tempfile.gettempdir(), filename)
            filename = os.path.expanduser(filename)

            if not (filename, o['stream']) in filenames:
                filenames.append((filename, o['stream']))

            with open(filename, 'a') as f:
                f.write(json.dumps(flattened_record, cls=DecimalEncoder))
                f.write('\n')

            file_size = os.path.getsize(filename) if os.path.isfile(filename) else 0
            # file_size >> 20 converts bytes to whole MiB
            if file_size >> 20 > max_file_size_mb:
                logger.info('file_size: {} MB, filename: {}'.format(
                    round(file_size >> 20, 2), filename))
                upload_to_s3(s3_client, config.get("s3_bucket"),
                             os.environ["TARGET_S3_SOURCE_NAME"], filename,
                             o['stream'],
                             config.get('field_to_partition_by_time'),
                             config.get('record_unique_field'),
                             config.get("compression"),
                             config.get('encryption_type'),
                             config.get('encryption_key'))
                filenames.remove((filename, o['stream']))

            state = None
        elif message_type == 'STATE':
            logger.info('Setting state to {}'.format(o['value']))
            state = o['value']
        elif message_type == 'SCHEMA':
            stream = o['stream']
            schemas[stream] = o['schema']
            if config.get('add_metadata_columns'):
                schemas[stream] = utils.add_metadata_columns_to_schema(o)

            schema = utils.float_to_decimal(o['schema'])
            validators[stream] = Draft4Validator(schema, format_checker=FormatChecker())
            key_properties[stream] = o['key_properties']
        elif message_type == 'ACTIVATE_VERSION':
            logger.debug('ACTIVATE_VERSION message')
        else:
            logger.warning("Unknown message type {} in message {}".format(o['type'], o))

    # Upload the remaining files to S3
    for filename, stream in filenames:
        upload_to_s3(s3_client, config.get("s3_bucket"),
                     os.environ["TARGET_S3_SOURCE_NAME"], filename, stream,
                     config.get('field_to_partition_by_time'),
                     config.get('record_unique_field'),
                     config.get("compression"),
                     config.get('encryption_type'),
                     config.get('encryption_key'))

    return state
def __init__(self, validator_class, schema_factory, module, resolver=None, format_checker=None):
    self.schema_factory = schema_factory
    self.validator_class = validator_class
    self.resolver = resolver
    self.format_checker = format_checker or FormatChecker()
    self.module = module
def test_it_returns_true_for_formats_it_does_not_know_about(self):
    validator = self.validator_class(
        {"format": "carrot"},
        format_checker=FormatChecker(),
    )
    validator.validate("bugs")
def save_upload():
    """Ajax route to create a manifest for each uploaded file
    and insert it in the database.
    """
    if request.method == 'POST':
        print('Working session folder: ' + session['IMPORT_DIR'])
        errors = []

        # Make sure the collection exists before handling form data
        try:
            result = list(corpus_db.find({'name': request.json['collection']}))
            assert result != []

            # Handle the form data
            exclude = ['branch', 'category', 'collection']
            node_metadata = {}
            for key, value in request.json.items():
                if key not in exclude and value != '' and value != []:
                    node_metadata[key] = value

            # Set the name and metapath
            if request.json['collection'].startswith('Corpus,'):
                collection = request.json['collection']
            else:
                collection = 'Corpus,' + request.json['collection']
        except:
            errors.append(
                'The specified collection does not exist in the database. '
                'Check your entry or <a href="/corpus/create">create a collection</a> before importing data.')

        if len(errors) == 0:
            # Set the name and path for the new manifest
            node_metadata = {}
            if request.json['branch'] != '':
                node_metadata['name'] = request.json['branch']
                node_metadata['metapath'] = collection + ',' + request.json['category']
            else:
                node_metadata['name'] = request.json['category']
                node_metadata['metapath'] = collection

        # If the specified metapath does not exist, create it
        if len(errors) == 0:
            parent = list(corpus_db.find({
                'name': node_metadata['name'],
                'metapath': node_metadata['metapath']
            }))
            if len(parent) == 0:
                try:
                    corpus_db.insert_one(node_metadata)
                except:
                    errors.append(
                        '<p>The specified metapath does not exist and could not be created.</p>')

        # Now create a data manifest for each file and insert it
        if len(errors) == 0:
            for filename in os.listdir(session['IMPORT_DIR']):
                print('Creating manifest for ' + filename)
                if filename.endswith('.json'):
                    filepath = os.path.join(session['IMPORT_DIR'], filename)
                    metapath = node_metadata['metapath'] + ',' + node_metadata['name']
                    manifest = {
                        'name': os.path.splitext(filename)[0],
                        'namespace': 'we1sv2.0',
                        'metapath': metapath
                    }
                    try:
                        with open(filepath, 'rb') as f:
                            doc = json.loads(f.read())
                            for key, value in doc.items():
                                if key not in ['name', 'namespace', 'metapath']:
                                    manifest[key] = value
                    except:
                        errors.append(
                            '<p>The file <code>' + filename +
                            '</code> could not be loaded or it did not have a <code>content</code> property.</p>')

                    # Validate the manifest before inserting
                    schema_file = 'https://raw.githubusercontent.com/whatevery1says/manifest/master/schema/v2.0/Corpus/Data.json'
                    schema = json.loads(requests.get(schema_file).text)
                    print(manifest['name'])
                    print(manifest['metapath'])
                    try:
                        methods.validate(manifest, schema, format_checker=FormatChecker())
                        result = methods.create_record(manifest)
                        errors = errors + result
                        print(errors)
                    except:
                        errors.append(
                            '<p>A valid manifest could not be created from the file <code>' + filename +
                            '</code> or the manifest could not be added to the database due to an unknown error.</p>')
                else:
                    errors.append('<p>The file <code>' + filename +
                                  '</code> is an invalid format.</p>')

        # We're done. Delete the import directory
        shutil.rmtree(session['IMPORT_DIR'])

        # Refresh the session
        token = datetime.now().strftime('%Y%m%d_') + str(randint(0, 99))
        session['IMPORT_DIR'] = os.path.join(TEMP_DIR, token).replace('\\', '/')

        if len(errors) == 0:
            return json.dumps({'result': 'success', 'session_token': token})
        else:
            return json.dumps({'errors': errors})
def test_it_raises_a_key_error_for_unknown_formats(self):
    with self.assertRaises(KeyError):
        FormatChecker(formats=["o noes"])
def parse(filename): try: schema = json.loads(open("udl.schema").read()) schema = Draft7Validator(schema, format_checker=FormatChecker()) except ValueError as e: post_error("udl.schema - " + str(e)) return try: udlfile = json.loads(open(filename, encoding="utf8").read()) except ValueError as e: post_error(filename + " - " + str(e)) return for error in schema.iter_errors(udlfile): post_error(error.message) idnames = [] displaynames = [] repositories = [] response = [] for udl in udlfile["UDLs"]: print(udl["display-name"]) try: if udl["repository"] != "": response = requests.get(udl["repository"]) except requests.exceptions.RequestException as e: post_error(str(e)) continue if udl["repository"] != "" and response.status_code != 200: post_error( f'{udl["display-name"]}: failed to download udl. Returned code {response.status_code}' ) continue # Hash it and make sure its what is expected #hash = sha256(response.content).hexdigest() #if udl["id"].lower() != hash.lower(): # post_error(f'{udl["display-name"]}: Invalid hash. Got {hash.lower()} but expected {udl["id"]}') # continue #check uniqueness of json id-name, display-name and repository found = False for name in displaynames: if udl["display-name"] == name: post_error( f'{udl["display-name"]}: non unique display-name entry') found = True if found == False: displaynames.append(udl["display-name"]) found = False for idname in idnames: if udl["id-name"] == idname: post_error(f'{udl["id-name"]}: non unique id-name entry') found = True if found == False: idnames.append(udl["id-name"]) found = False for repo in repositories: if udl["repository"] != "" and udl["repository"] == repo: post_error(f'{udl["repository"]}: non unique repository entry') found = True if found == False: repositories.append(udl["repository"])