import io
from collections import defaultdict

import fastavro


def ingestion_callback(message):
    data, metadata = pulsarclient.callback_info(message)
    user_id = metadata['user_id']
    patient_id = metadata['patient_id']
    ingestion_time = metadata['ingestion_time']
    app = metadata['app_name']
    log.info(
        f'DEBUG: {type(data["Patient"][0]["id"])} {data["Patient"][0]["id"]}')
    log.info(f'Received patient data: {type(data)}')
    log.info(f'Received patient data: {data["Patient"]}')
    # bundle = {
    #     'resourceType': 'Bundle',
    #     'type': 'transaction',
    #     'entry': []
    # }
    # for entry in data:
    #     bundle['entry'].append(entry)

    # Sanitize null values so the records pass Avro validation.
    sanitized_data = defaultdict(list)
    for resource_type, resources in data.items():
        if resources:
            for resource in resources:
                # TODO: hack to remove None, till I figure out how to make
                # null pass validation in avro
                sanitized_data[resource_type].append(remove_none(resource))
    log.info(sanitized_data)

    # Keep the fastavro schema dict separate from the Pulsar schema wrapper;
    # fastavro.validate and schemaless_writer need the dict, not the wrapper.
    bundle_schema = avroutil.get_bundle_schema()
    pulsar_schema = pulsarclient.AvroSchema(
        schema_definition=bundle_schema, schema_name=bundle_schema['name'])
    fastavro.validate(sanitized_data, bundle_schema)

    buffer = io.BytesIO()
    fastavro.schemaless_writer(buffer, bundle_schema, sanitized_data)
    buffer.seek(0)
    upload_file(
        buffer,
        f"{user_id}/{patient_id}/{ingestion_time}/ingested/bundle.avro",
        app, {})
    if PROFILING:
        profile.print_stats()
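# The remove_none helper used above is not shown in this snippet. A minimal
# sketch of what such a helper presumably does, recursively dropping
# None-valued entries so records pass Avro validation (an assumption, not
# the source's implementation):
def remove_none(obj):
    """Recursively drop None values from nested dicts and lists."""
    if isinstance(obj, dict):
        return {k: remove_none(v) for k, v in obj.items() if v is not None}
    if isinstance(obj, list):
        return [remove_none(item) for item in obj if item is not None]
    return obj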
def validate(self, record):
    """Validate packet contents against this schema.

    Parameters
    ----------
    record : `dict`
        The data to be checked for schema compliance.

    Returns
    -------
    valid : `bool`
        Whether or not the data complies with the schema.
    """
    # Parse once so schema errors surface here, then validate against the
    # parsed schema rather than discarding the parse result.
    parsed_schema = fastavro.parse_schema(self.definition)
    return fastavro.validate(record, parsed_schema)
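# Hypothetical usage of the validate() method above; the enclosing class is
# not shown in the source, so PacketSchema and its constructor are
# illustrative only:
import fastavro


class PacketSchema:
    def __init__(self, definition):
        self.definition = definition

    def validate(self, record):
        parsed_schema = fastavro.parse_schema(self.definition)
        return fastavro.validate(record, parsed_schema)


packet_schema = PacketSchema({
    'type': 'record',
    'name': 'Packet',
    'fields': [{'name': 'id', 'type': 'long'}],
})
assert packet_schema.validate({'id': 1})  # raises ValidationError if invalid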
def test_poll(self):
    topic, alert, key = self.consumer.poll()
    self.assertIsNotNone(alert)
    self.assertTrue(fastavro.validate(alert, self.consumer._parsed_schema))
import datetime
import json
import os
import random
import time

import fastavro
from fastavro.validation import ValidationError


def convert_to_avro(schema_path: str,
                    log_path: str,
                    output_path: str = None,
                    delete_existing_avro_file: bool = True,
                    validate_percentage: float = 100.0,
                    avro_batch_size: int = 2000,
                    offset: int = 0,
                    max_lines: int = None) -> dict:
    """Converts a log file to Avro format."""
    t0 = time.time()

    def _get_output_path(input_path: str):
        if input_path.endswith('.bz2') or input_path.endswith('.gz'):
            return f'{os.path.splitext(input_path)[0]}.avro'
        else:
            return f'{input_path}.avro'

    if not output_path:
        output_path = _get_output_path(log_path)
    print(f'{datetime.datetime.utcnow().isoformat()[:19]}Z '
          f'Converting log file {log_path!r} '
          f'to Avro file {output_path!r} '
          f'using schema {schema_path!r} '
          f'and {validate_percentage} % output validation '
          f'(PID {os.getpid()})...')
    with open(schema_path, 'rb') as schema_file:
        avro_schema = fastavro.parse_schema(json.loads(schema_file.read()))
    if delete_existing_avro_file and os.path.exists(output_path):
        os.remove(output_path)
    records, records_validated, batch_sizes = [], 0, []

    def _write_avro_output():
        # fastavro.writer writes a fresh container header for a new file and
        # appends records when the file is opened in 'a+b' mode.
        if not os.path.exists(output_path):
            with open(output_path, 'wb') as avro_file:
                fastavro.writer(avro_file, avro_schema, records,
                                codec='deflate')
        else:
            with open(output_path, 'a+b') as avro_file:
                fastavro.writer(avro_file, avro_schema, records,
                                codec='deflate')
        batch_sizes.append(len(records))
        print(f'{datetime.datetime.utcnow().isoformat()[:19]}Z '
              f'Wrote {len(records)} records '
              f'in batch {str(len(batch_sizes)).zfill(4)} '
              f'to {output_path!r} (PID {os.getpid()}).')
        records.clear()

    log_file = open_log_file(log_path)
    lines_in, lines_ignored = 0, 0
    decode_errors, validation_errors, total_errors = 0, 0, 0
    typecasting = {
        'field_238_to_int': 0,
        'field_256_to_int': 0,
        'field_256_to_null': 0,
        'field_255_to_str': 0
    }
    for log_line in log_file:
        lines_in += 1
        if lines_in < offset:
            continue
        if max_lines and (lines_in - offset) >= max_lines:
            break
        try:
            # maxsplit=4 yields exactly five fields and keeps any tabs inside
            # the JSON payload intact, so the unpacking below cannot fail on
            # payloads that happen to contain tab characters.
            server_date, versionstring, token, ip, raw_json_data = \
                log_line.decode().split('\t', maxsplit=4)
            if raw_json_data == '{"d":}\n':
                lines_ignored += 1
                continue
            json_data = json.loads(raw_json_data)['d']
            if len(json_data) != 7:
                raise ValueError(f'Data is not a 7-tuple ({json_data!r}).')
            (log_format, client_date_orig, project_id, version, uuid,
             event_name, fields) = json_data
            tz_offset = client_date_orig[-6:]
            # A well-formed offset is six characters starting with a sign,
            # e.g. '+02:00'.
            if len(tz_offset) != 6 or not tz_offset.startswith(('-', '+')):
                raise ValueError(f'Malformatted date {client_date_orig!r}.')
            avro_record = {
                'server_date': server_date,
                'datestamp': server_date.split(maxsplit=1)[0],
                'versionstring': versionstring,
                'token': token,
                'ip': ip,
                'log_format': log_format,
                'client_date_orig': client_date_orig,
                'client_date': client_date_orig.split(maxsplit=1)[0],
                'client_local_date': client_date_orig[:-6],
                'tz_offset': tz_offset,
                'project_id': project_id,
                'version': version,
                'uuid': uuid,
                'event_name': event_name
            }
            for field, value in fields.items():
                if value:
                    if field == '238':
                        if not isinstance(value, int):
                            typecasting['field_238_to_int'] += 1
                            value = int(value)
                    elif field == '256':
                        if not isinstance(value, int):
                            typecasting['field_256_to_int'] += 1
                            value = int(value)
                else:
                    if field == '256':
                        if not isinstance(value, int):
                            typecasting['field_256_to_null'] += 1
                            value = None
                if field == '255':
                    if not isinstance(value, str):
                        typecasting['field_255_to_str'] += 1
                        value = str(value)
                avro_record[f"c_{str(field).replace('.', '_')}"] = value
            # Always validate the first avro_batch_size records, then sample
            # according to validate_percentage.
            if validate_percentage and records_validated < avro_batch_size:
                records_validated += 1
                fastavro.validate(avro_record, avro_schema)
            elif random.random() * 100.0 <= validate_percentage:
                records_validated += 1
                fastavro.validate(avro_record, avro_schema)
            records.append(avro_record)
            if len(records) >= avro_batch_size:
                _write_avro_output()
        except ValueError as parse_error:
            decode_errors += 1
            total_errors += 1
            print(f'{datetime.datetime.utcnow().isoformat()[:19]}Z '
                  f'{parse_error.__class__.__name__}, line {lines_in} in '
                  f'{log_path!r}: {parse_error} / Content: {log_line!r}')
        except ValidationError as validation_err:
            validation_errors += 1
            total_errors += 1
            print(f'{datetime.datetime.utcnow().isoformat()[:19]}Z '
                  f'{validation_err.__class__.__name__}, line {lines_in} in '
                  f'{log_path!r}: {validation_err} / Content: {log_line!r}')
        finally:
            if total_errors > 50 and \
                    total_errors / (lines_in - offset) > 0.001:
                raise Exception(f'Excessive error rate '
                                f'{total_errors / (lines_in - offset)}.')
    _write_avro_output()
    duration_sec = int(time.time() - t0)
    print(f'{datetime.datetime.utcnow().isoformat()[:19]}Z '
          f'Converted {lines_in - offset} lines to Avro records '
          f'in {duration_sec} seconds, '
          f'dropping {lines_ignored} empty lines, '
          f'failing to decode {decode_errors} lines, '
          f'invalidating {validation_errors} records '
          f'(in {records_validated} validations), '
          f'with casting summary {typecasting!r} '
          f'(PID {os.getpid()}).')
    return {
        'output_path': os.path.abspath(output_path),
        'metrics': {
            'lines_in': lines_in,
            'lines_ignored': lines_ignored,
            'decode_errors': decode_errors,
            'validation_errors': validation_errors,
            'records_out': sum(batch_sizes),
            'records_validated': records_validated,
            'typecasting': typecasting,
            'duration_sec': duration_sec
        }
    }
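# Hypothetical invocation of convert_to_avro; the schema and log paths below
# are illustrative, not from the source:
if __name__ == '__main__':
    result = convert_to_avro(
        schema_path='events.avsc',
        log_path='events.log.gz',
        validate_percentage=5.0,  # fully validate the first batch, then sample 5 %
        avro_batch_size=1000)
    print(f"Wrote {result['metrics']['records_out']} records "
          f"to {result['output_path']}.")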
import io

import fastavro


def encode(obj, schema):
    # Raises ValidationError if obj does not conform to schema.
    fastavro.validate(obj, schema)
    buffer = io.BytesIO()
    # schemaless_writer emits only the record body, with no container
    # header; the reader must supply the same schema.
    fastavro.schemaless_writer(buffer, schema, obj)
    return buffer.getvalue()
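# A matching decoder for encode() above, using fastavro.schemaless_reader;
# a sketch assuming the reader is handed the writer's schema, since
# schemaless payloads carry no schema of their own:
def decode(data, schema):
    buffer = io.BytesIO(data)
    return fastavro.schemaless_reader(buffer, schema)


# Round-trip check with a minimal illustrative schema:
ping_schema = fastavro.parse_schema({
    'type': 'record',
    'name': 'Ping',
    'fields': [{'name': 'seq', 'type': 'int'}],
})
assert decode(encode({'seq': 1}, ping_schema), ping_schema) == {'seq': 1}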
def __init__(self, data, schema=None):
    """
    :param data: dict, list of dicts, JSON str, file, bytes
    :param schema: dict
    """
    self._last_error = None  # Last error captured
    self._object_data = None
    self._json_data = None
    self._avro_data = None
    self._origin = None
    self._schema = None
    self._schema_origin = None
    self._ok = False

    if schema is None:
        self._schema = None
    elif isinstance(schema, str):
        try:
            success, schema, origin = AvroTools.fetch_json(schema)
            if success:
                schema = json.loads(schema)
                self._schema_origin = origin
            else:
                schema = None
        except Exception as e:
            self._last_error = str(e)
            schema = None
    if schema is not None:
        try:
            self._schema = parse_schema(schema)
            if self._schema_origin is None:
                self._schema_origin = type(schema).__name__
        except Exception as e:
            self._last_error = str(e)
            schema = None

    if isinstance(data, bytes):
        b_avro = False
        try:
            bdata = io.BytesIO(data)
            if is_avro(bdata):
                self._origin = 'binary_avro'
                bdata.seek(0)
                b_avro = True
                avro_reader = reader(bdata)
                self._schema = avro_reader.schema
                obj_data = [record for record in avro_reader]
                # Unwrap single-record files; keep multi-record files as lists.
                if len(obj_data) == 0:
                    self._object_data = None
                elif len(obj_data) == 1:
                    self._object_data = obj_data[0]
                else:
                    self._object_data = obj_data
                self._ok = True
            else:
                self._origin = 'binary_string'
                data = data.decode('utf-8')
        except Exception as e:
            self._last_error = ('Avro binary' if b_avro
                                else 'String decoding') + f' error: {e}'

    if isinstance(data, str):
        success, json_data, origin = AvroTools.fetch_json(data)
        if not self._origin:
            self._origin = origin
        if not success:
            self._last_error = json_data
            return
        try:
            self._object_data = json.loads(json_data)
            self._json_data = json_data
            if self._schema is None:
                self._ok = True
        except Exception as e:
            self._last_error = f'JSON parsing error: {e}'
    elif isinstance(data, (dict, list)):
        self._origin = type(data).__name__
        self._object_data = data
        if self._schema is None:
            self._ok = True

    if self._object_data is not None and not self._ok \
            and self._schema is not None:
        try:
            validate(self._object_data, self._schema)
            self._ok = True
        except Exception as e:
            self._last_error = f'Schema error: {e}'
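# A standalone sketch of the validation step this constructor performs,
# using fastavro directly (check() and its names are illustrative, not part
# of the source class):
import json

from fastavro import parse_schema
from fastavro.validation import validate


def check(obj, schema_json):
    """Return (ok, error) for obj against a JSON-encoded Avro schema."""
    try:
        validate(obj, parse_schema(json.loads(schema_json)))
        return True, None
    except Exception as e:
        return False, f'Schema error: {e}'


ok, err = check({'name': 'x'},
                '{"type": "record", "name": "T",'
                ' "fields": [{"name": "name", "type": "string"}]}')
assert ok and err is None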
import json

import avro.schema
import fastavro

with open("./record-copy.avsc", "r") as fp:
    schema = json.load(fp)

with open("./payload.json", "r") as fp:
    payload = json.load(fp)

fastavro.validate(datum=payload[0], schema=schema)
avro.schema.SchemaFromJSONData(schema)