def test_validator_invalid_number():
    """
    A numeric field must fail validation for a non-numeric string and for
    a value that is not a number at all (here, a builtin function)
    """
    # a word, not a number
    word_record = {"number_field": "one hundred"}
    word_validator = Schema(
        {"fields": [{"name": "number_field", "type": "numeric"}]})
    assert not word_validator.validate(word_record)

    # a callable, not a number
    callable_record = {"number_field": print}
    callable_validator = Schema(
        {"fields": [{"name": "number_field", "type": "numeric"}]})
    assert not callable_validator.validate(callable_record)
def main(context):
    """
    Build the tweet processing flow and keep a Twitter stream feeding it.

    The flow is: extract tweet details > extract CVEs > save to bucket > end.
    The stream is filtered to English tweets tracking "CVE". Any error other
    than a keyboard interrupt is printed and the stream is recreated after a
    five second pause; a keyboard interrupt ends the process.

    Parameters:
        context: dict
            Keyword arguments forwarded to build_context. The resulting
            context must expose a 'config' mapping; 'target_project',
            'target_path' and 'compress' are read from it.

    NOTE(review): relies on a module-level `api` object (used for `api.auth`)
    that is not defined in this function - confirm it is initialised before
    main is called.
    """
    # create the run context from the config and context passed to main
    # this would allow dates etc to be passed from something external
    context = build_context(**context)

    extract_tweet = ExtractTweetDetailsOperator()
    extract_cves = ExtractCvesFromTweetOperator()
    save = SaveToBucketOperator(
        project=context['config'].get('target_project'),
        to_path=context['config'].get('target_path'),
        schema=Schema(context),
        compress=context['config'].get('compress'))
    end = EndOperator()

    flow = extract_tweet > extract_cves > save > end

    while True:
        try:
            listener = TwitterListener(api, flow)
            stream = tweepy.Stream(api.auth, listener, tweet_mode="extended")
            stream.filter(track=["CVE"], languages=["en"])
        except KeyboardInterrupt:
            print('Keyboard Interrupt')
            # quit() is an interactive-shell helper added by the site module
            # and may be missing (e.g. python -S, frozen apps); raising
            # SystemExit directly has the same effect without that dependency
            raise SystemExit
        except Exception as err:
            print(
                F"Error {type(err).__name__} {err} - restarting in 5 seconds")
            print(gva.errors.RenderErrorStack())
            time.sleep(5)
def test_validator_nonnative_types():
    """
    String representations of a number, a boolean and a date, plus an empty
    string for a nullable field, must all pass validation
    """
    field_types = [
        ("integer_field", "numeric"),
        ("boolean_field", "boolean"),
        ("date_field", "date"),
        ("nullable_field", "nullable"),
    ]
    definition = {
        "fields": [{"name": name, "type": kind} for name, kind in field_types]
    }
    record = {
        "integer_field": "100",
        "boolean_field": "True",
        "date_field": "2000-01-01T00:00:00.000",
        "nullable_field": "",
    }

    validator = Schema(definition)
    assert validator.validate(record)
def test_validator_invalid_boolean():
    """
    A boolean field must fail validation for the string "not true"
    """
    record = {"boolean_field": "not true"}
    definition = {"fields": [{"name": "boolean_field", "type": "boolean"}]}

    validator = Schema(definition)
    is_valid = validator.validate(record)
    assert not is_valid
def test_call_alias():
    """
    Calling a Schema instance directly must accept a valid record
    """
    record = {"number_field": 100}
    definition = {"fields": [{"name": "number_field", "type": "numeric"}]}

    validator = Schema(definition)
    # invoke the instance itself rather than .validate()
    assert validator(record)
def test_validator_invalid_string():
    """
    A string field must fail validation for an integer value
    """
    record = {"string_field": 100}
    definition = {"fields": [{"name": "string_field", "type": "string"}]}

    validator = Schema(definition)
    is_valid = validator.validate(record)
    assert not is_valid
def test_validator_invalid_schema():
    """
    Constructing a Schema from the definition {"name": "string"} must raise

    The test does not care which error type is raised, only that
    construction does not succeed.
    """
    raised = False
    try:
        Schema({"name": "string"})
    except Exception:
        # deliberately broad on error type, but not a bare except: that would
        # also trap KeyboardInterrupt and SystemExit and report them as a pass
        raised = True
    assert raised
def test_validator_date():
    """
    A date field must reject "tomorrow" and accept "2020-01-01"
    """
    definition = {"fields": [{"name": "key", "type": "date"}]}
    bad_record = {"key": "tomorrow"}
    good_record = {"key": "2020-01-01"}

    validator = Schema(definition)

    rejected = validator.validate(bad_record)
    assert not rejected
    accepted = validator.validate(good_record)
    assert accepted
def test_validator_list():
    """
    A list field must reject a plain string and accept a list of strings
    """
    definition = {"fields": [{"name": "key", "type": "list"}]}
    bad_record = {"key": "not a list"}
    good_record = {"key": ["is", "a", "list"]}

    validator = Schema(definition)

    rejected = validator.validate(bad_record)
    assert not rejected
    accepted = validator.validate(good_record)
    assert accepted
def test_unknown_type():
    """
    Building a Schema with an unrecognised field type must raise ValueError
    """
    definition = {"fields": [{"name": "key", "type": "not_a_known_type"}]}

    try:
        validator = Schema(definition)
    except ValueError:
        raised = True
    else:
        raised = False

    assert raised
def test_extract_qid_summary_operator_validates():
    """
    Test the output from the operator complies to the schema
    """
    # run the operator over the 'VULN' element of the sample document
    parsed_document = xmltodict.parse(VALID_XML)
    vuln_entry = parsed_document['VULN']
    operator = ExtractQidSummaryOperator()
    data, context = operator(data=vuln_entry, context={})

    # validate the operator's output against the metadata schema file
    validator = Schema(find_file('QID_SUMMARY.metadata'))
    assert validator.validate(data), validator.last_error
def test_validator_number_ranges():
    """
    Check numeric "min" and "max" limits, used together and on their own
    """
    too_high = {"number": 1000}
    too_low = {"number": 100}
    in_range = {"number": 500}

    # both limits set: only the middle value is acceptable
    both_limits = Schema({
        "fields": [
            {"name": "number", "type": "numeric", "min": 250, "max": 750}
        ]
    })
    assert not both_limits.validate(too_high)
    assert not both_limits.validate(too_low)
    assert both_limits.validate(in_range)

    # lower limit only: the high value passes, the low value fails
    lower_only = Schema({
        "fields": [{"name": "number", "type": "numeric", "min": 250}]
    })
    assert lower_only.validate(too_high), lower_only.last_error
    assert not lower_only.validate(too_low), lower_only.last_error

    # upper limit only: the low value passes, the high value fails
    upper_only = Schema({
        "fields": [{"name": "number", "type": "numeric", "max": 750}]
    })
    assert upper_only.validate(too_low), upper_only.last_error
    assert not upper_only.validate(too_high), upper_only.last_error
def test_raise_exception():
    """
    With raise_exception=True, validating a bad record must raise ValueError
    """
    record = {"number_field": "one hundred"}
    definition = {"fields": [{"name": "number_field", "type": "numeric"}]}
    validator = Schema(definition)

    try:
        validator.validate(record, raise_exception=True)
    except ValueError:
        raised = True
    else:
        raised = False

    assert raised
def test_validator_string_format():
    """
    A string field with a "format" pattern must reject a value that does
    not match it and accept one that does
    """
    cve_field = {
        "name": "cve",
        "type": "string",
        "format": r"(?i)CVE-\d{4}-\d{4,7}",
    }
    definition = {"fields": [cve_field]}
    bad_record = {"cve": "eternalblue"}
    good_record = {"cve": "CVE-2017-0144"}

    validator = Schema(definition)

    rejected = validator.validate(bad_record)
    assert not rejected
    accepted = validator.validate(good_record)
    assert accepted
def test_validator_enum():
    """
    An enum field must reject a value outside its symbols and accept one
    that is listed
    """
    compass_field = {
        "name": "key",
        "type": "enum",
        "symbols": ['north', 'south'],
    }
    definition = {"fields": [compass_field]}
    bad_record = {"key": "left"}
    good_record = {"key": "north"}

    validator = Schema(definition)

    rejected = validator.validate(bad_record)
    assert not rejected
    accepted = validator.validate(good_record)
    assert accepted
def test_validator_loaders():
    """
    Ensure dictionary, json and json files load

    The schema file is written inside a temporary directory which is removed
    when the test finishes, so the test leaves nothing behind in the working
    directory and cannot overwrite an existing file.
    """
    import json
    import os
    import tempfile

    schema_dict = {"fields": [{"name": "string_field", "type": "string"}]}
    schema_string = json.dumps(schema_dict)

    def loads_without_error(definition):
        # True when the definition can be loaded and used without raising
        try:
            loaded = Schema(definition)
            loaded.validate({"string_field": "pass"})
        except Exception:
            return False
        return True

    with tempfile.TemporaryDirectory() as folder:
        schema_file = os.path.join(folder, 'temp')
        with open(schema_file, 'w') as file:
            file.write(schema_string)

        assert loads_without_error(schema_dict), "load schema from dictionary"
        assert loads_without_error(schema_string), "load schema from string"
        assert loads_without_error(schema_file), "load schema from file"
def test_validator_multiple_types():
    """
    A field declared with a list of types must accept a string, a boolean
    and None
    """
    definition = {
        "fields": [
            {"name": "multi", "type": ["string", "boolean", "nullable"]}
        ]
    }
    string_record = {"multi": "True"}
    boolean_record = {"multi": True}
    null_record = {"multi": None}

    validator = Schema(definition)

    assert validator.validate(string_record)
    assert validator.validate(boolean_record)
    assert validator.validate(null_record)
def test_validator_extended_schema():
    """
    Ensure the validator will ignore additional fields in the schema
    """
    # "table", "description" and "last_updated" are the additional entries
    string_field = {
        "name": "string_field",
        "type": "string",
        "description": "character array",
        "last_updated": datetime.datetime.today(),
    }
    definition = {
        "table": "this is a test schema",
        "fields": [string_field],
    }
    record = {"string_field": "the"}

    validator = Schema(definition)
    assert validator.validate(record)
def build_flow(context: dict):
    """
    Create the flow: save to bucket > end, with the configured writers

    Parameters:
        context: dict
            Must contain a 'config' mapping; 'target_project', 'target_path',
            'compress' and (optionally) 'writers' are read from it. 'date' is
            read from the context itself. The whole context is passed to
            Schema.

    Returns:
        The chained flow, after attach_writers has been called on it
    """
    # the single working operation, followed by the terminator
    bucket_writer = SaveToBucketOperator(
        project=context['config'].get('target_project'),
        to_path=context['config'].get('target_path'),
        schema=Schema(context),
        date=context.get('date'),
        compress=context['config'].get('compress'),
    )
    terminator = EndOperator()

    # link the operations together
    flow = bucket_writer > terminator

    # writers default to none when the config does not list any
    configured_writers = context['config'].get('writers', [])
    flow.attach_writers(configured_writers)

    return flow
def test_validator_all_valid_values():
    """
    A record holding a native value for each field type in the schema must
    pass validation
    """
    record = {
        "string_field": "string",
        "integer_field": 100,
        "boolean_field": True,
        "date_field": datetime.datetime.today(),
        "other_field": ["abc"],
        "nullable_field": None,
        "list_field": ['a', 'b', 'c'],
        "enum_field": "RED",
    }

    # every field except the enum needs only a name and a type
    simple_fields = [
        ("string_field", "string"),
        ("integer_field", "numeric"),
        ("boolean_field", "boolean"),
        ("date_field", "date"),
        ("other_field", "other"),
        ("nullable_field", "nullable"),
        ("list_field", "list"),
    ]
    fields = [{"name": name, "type": kind} for name, kind in simple_fields]
    fields.append({
        "name": "enum_field",
        "type": "enum",
        "symbols": ['RED', 'GREEN', 'BLUE'],
    })

    validator = Schema({"fields": fields})
    assert validator.validate(record)
def execute_test(compress, schema, reader): writer = Writer( writer=file_writer, to_path='%datefolders', compress=compress, schema=schema ) #reader = read_jsonl('tweets.jsonl') start = time.perf_counter_ns() for record in reader: writer.append(record) writer.finalize() return (time.perf_counter_ns() - start) / 1e9 schema = Schema(schema_definition) lines = list(read_jsonl('tweets.jsonl')) print(len(lines)) print(lines[1]) results = [] result = { 'compression': False, 'validation': False, 'time': execute_test(False, None, lines) } results.append(result) shutil.rmtree("year_2021") result = {
def validate():
    """
    Build a Schema from schema_definition and validate data, one million
    times over

    Both schema_definition and data are names taken from the enclosing
    scope. The Schema is rebuilt on every pass, so construction is part of
    the repeated work.
    """
    iterations = 1000000
    for _ in range(iterations):
        validator = Schema(schema_definition)
        validator.validate(data)