def __create_standard(out_path):
    os.makedirs(out_path)
    schema_path = os.path.join(os.path.dirname(__file__), 'data/user.avsc')
    schema = avro.schema.parse(open(schema_path).read())
    # Avro container files are binary; the part files must be opened in 'wb' mode.
    with DataFileWriter(open(os.path.join(out_path, 'part-m-00000.avro'), 'wb'),
                        DatumWriter(), schema) as writer:
        writer.append({'position': 0, 'name': 'Alyssa', 'favorite_number': 256})
        writer.append({'position': 1, 'name': 'Ben', 'favorite_number': 4,
                       'favorite_color': 'red'})
    with DataFileWriter(open(os.path.join(out_path, 'part-m-00001.avro'), 'wb'),
                        DatumWriter(), schema) as writer:
        writer.append({'position': 2, 'name': 'Alyssa2', 'favorite_number': 512})
        writer.append({'position': 3, 'name': 'Ben2', 'favorite_number': 8,
                       'favorite_color': 'blue', 'secret': b'0987654321'})
        writer.append({'position': 4, 'name': 'Ben3', 'favorite_number': 2,
                       'favorite_color': 'green', 'secret': b'12345abcd'})
    # part-m-00002.avro is intentionally left empty (header only).
    with DataFileWriter(open(os.path.join(out_path, 'part-m-00002.avro'), 'wb'),
                        DatumWriter(), schema) as writer:
        pass
    with DataFileWriter(open(os.path.join(out_path, 'part-m-00003.avro'), 'wb'),
                        DatumWriter(), schema) as writer:
        writer.append({'position': 5, 'name': 'Alyssa3', 'favorite_number': 16})
        writer.append({'position': 6, 'name': 'Mallet', 'favorite_color': 'blue',
                       'secret': b'asdfgf'})
        writer.append({'position': 7, 'name': 'Mikel', 'favorite_color': ''})
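# Hedged usage sketch (not part of the original module): exercise
# __create_standard against a fresh temp directory, then read one part file
# back with DataFileReader to confirm the records round-trip.
def _demo_create_standard():
    import tempfile
    from avro.datafile import DataFileReader
    from avro.io import DatumReader

    out = os.path.join(tempfile.mkdtemp(), 'standard')
    __create_standard(out)
    with DataFileReader(open(os.path.join(out, 'part-m-00000.avro'), 'rb'),
                        DatumReader()) as reader:
        for user in reader:
            print(user)  # e.g. {'position': 0, 'name': 'Alyssa', ...}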
def respond(self, call_request):
    buffer_reader = io.BytesIO(call_request)
    buffer_decoder = BinaryDecoder(buffer_reader)
    buffer_writer = io.BytesIO()
    buffer_encoder = BinaryEncoder(buffer_writer)
    error = None
    response_metadata = {}
    try:
        remote_protocol = self.process_handshake(buffer_decoder, buffer_encoder)
        if remote_protocol is None or self.local_protocol is None:
            return buffer_writer.getvalue()
        # read and discard the request metadata map
        DatumReader(schema.parse(
            '{"type": "map", "values": "bytes"}')).read(buffer_decoder)
        remote_message_name = buffer_decoder.read_utf8()
        remote_message = remote_protocol.messages.get(remote_message_name)
        if remote_message is None:
            fail_msg = 'Unknown remote message: %s' % remote_message_name
            raise schema.AvroException(fail_msg)
        local_message = self.local_protocol.messages.get(remote_message_name)
        if local_message is None:
            fail_msg = 'Unknown local message: %s' % remote_message_name
            raise schema.AvroException(fail_msg)
        writers_schema = remote_message.request
        readers_schema = local_message.request
        request = self.read_request(writers_schema, readers_schema, buffer_decoder)

        response = None
        try:
            response = self.invoke(self.local_protocol, local_message, request)
        except AvroRemoteException as e:
            error = e
        except Exception as e:
            error = AvroRemoteException(str(e))

        DatumWriter(
            schema.parse('{"type": "map", "values": "bytes"}')).write(
                response_metadata, buffer_encoder)
        buffer_encoder.write_boolean(error is not None)
        if error is None:
            writers_schema = local_message.response
            self.write_response(writers_schema, response, buffer_encoder)
        else:
            writers_schema = local_message.errors
            self.write_error(writers_schema, error, buffer_encoder)
    except schema.AvroException as e:
        # protocol-level failure: build an error response on a fresh encoder
        error = AvroRemoteException(str(e))
        buffer_encoder = BinaryEncoder(io.BytesIO())
        DatumWriter(
            schema.parse('{"type": "map", "values": "bytes"}')).write(
                response_metadata, buffer_encoder)
        buffer_encoder.write_boolean(True)
        self.write_error(schema.parse('["string"]'), error, buffer_encoder)
        return buffer_encoder.writer.getvalue()
    return buffer_writer.getvalue()
def process_handshake(self, decoder, encoder):
    handshake_response = {}
    try:
        handshake_request = DatumReader(_load_request_schema()).read(decoder)
    except SchemaResolutionException:
        if self.local_protocol is None:
            handshake_response['match'] = 'NONE'
            handshake_response['serverProtocol'] = str(NO_FOUND)
            handshake_response['serverHash'] = NO_FOUND.md5
            DatumWriter(_load_response_schema()).write(handshake_response, encoder)
            raise HandshakeError(encoder.writer.getvalue())
        # reset reader
        decoder.reader.seek(0, 0)
        return self.local_protocol

    client_hash = handshake_request.get('clientHash')
    client_protocol = handshake_request.get('clientProtocol')
    remote_protocol = self.get_protocol_cache(client_hash)

    # new handshake
    if remote_protocol is None and client_protocol is None:
        handshake_response['match'] = 'NONE'
        handshake_response['serverProtocol'] = str(NO_FOUND)
        handshake_response['serverHash'] = NO_FOUND.md5
        DatumWriter(_load_response_schema()).write(handshake_response, encoder)
        return remote_protocol

    # client request handshake
    if remote_protocol is None and client_protocol is not None:
        # compare with client_protocol and cache_protocol
        self._local_protocol = self.contains(client_protocol)
        if self.local_protocol is None:
            handshake_response['match'] = 'NONE'
            handshake_response['serverProtocol'] = str(NO_FOUND)
            handshake_response['serverHash'] = NO_FOUND.md5
            DatumWriter(_load_response_schema()).write(handshake_response, encoder)
            raise HandshakeError(encoder.writer.getvalue())
        else:
            remote_protocol = protocol.parse(client_protocol)
            self.set_protocol_cache(client_hash, remote_protocol)
            handshake_response['match'] = 'CLIENT'
            handshake_response['serverProtocol'] = str(self.local_protocol)
            handshake_response['serverHash'] = self.local_protocol.md5
            DatumWriter(_load_response_schema()).write(handshake_response, encoder)
            return remote_protocol

    # success handshake
    if remote_protocol is not None:
        handshake_response['match'] = 'BOTH'
        DatumWriter(_load_response_schema()).write(handshake_response, encoder)
        return remote_protocol
def main():
    parser = ArgumentParser(description="Simple AMS example of subscription pull/consume")
    parser.add_argument('--host', type=str, default='messaging-devel.argo.grnet.gr',
                        help='FQDN of AMS Service')
    parser.add_argument('--token', type=str, required=True, help='Given token')
    parser.add_argument('--project', type=str, required=True,
                        help='Project registered in AMS Service')
    parser.add_argument('--subscription', type=str, required=True,
                        help='Subscription name')
    parser.add_argument('--topic', type=str, required=True, help='Given topic')
    parser.add_argument('--nummsgs', type=int, default=3,
                        help='Number of messages to pull and ack')
    parser.add_argument('--schema', type=str, required=True, help='Avro schema')
    parser.add_argument('--outfile', type=str, required=True, help='Output avro file')
    args = parser.parse_args()

    # initialize service with given token and project
    ams = ArgoMessagingService(endpoint=args.host, token=args.token,
                               project=args.project)

    # ensure that the subscription is created on the first run. messages can
    # be pulled from the subscription only when the subscription already
    # exists for the given topic, prior to messages being published to it
    try:
        if not ams.has_sub(args.subscription):
            ams.create_sub(args.subscription, args.topic)
        subscription = ams.get_sub(args.subscription, retobj=True)
    except AmsException as e:
        print(e)
        raise SystemExit(1)

    # try to pull a number of messages from the subscription. the method
    # returns (ackIds, AmsMessage) tuples from which the ackIds and message
    # payloads are extracted.
    avro_payloads = list()
    for msg in subscription.pullack(args.nummsgs, retry=5, retrysleep=15,
                                    return_immediately=True):
        data = msg.get_data()
        msgid = msg.get_msgid()
        print('msgid={0}'.format(msgid))
        avro_payloads.append(data)

    try:
        schema = load_schema(args.schema)
        if os.path.exists(args.outfile):
            # appending to an existing container file: the schema is read
            # from the file header, so DataFileWriter gets no schema argument.
            # Avro files are binary, hence the 'ab+' / 'wb+' modes.
            avroFile = open(args.outfile, 'ab+')
            writer = DataFileWriter(avroFile, DatumWriter())
        else:
            avroFile = open(args.outfile, 'wb+')
            writer = DataFileWriter(avroFile, DatumWriter(), schema)
        for am in avro_payloads:
            msg = avro_deserialize(am, args.schema)
            writer.append(msg)
        writer.close()
        avroFile.close()
    except Exception as e:
        print(e)
        raise SystemExit(1)
def __init__(self, callback, service_name, param_schema, result_schema, version=0):
    self.callback = callback
    self.service_name = service_name
    self.param_schema = SchemaFromJSONData(param_schema, Names())
    self.result_schema = SchemaFromJSONData(result_schema, Names())
    self.version = version
    self._param_writer = DatumWriter(self.param_schema)
    self._param_reader = DatumReader(self.param_schema)
    self._result_writer = DatumWriter(self.result_schema)
    self._result_reader = DatumReader(self.result_schema)
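# Hedged sketch of the inputs the __init__ above expects: SchemaFromJSONData
# consumes already-parsed JSON data (dicts, lists, or bare type-name strings),
# not JSON text. The record shape below is illustrative only, not taken from
# the original service.
_example_param_schema = {
    "type": "record",
    "name": "AddRequest",
    "fields": [
        {"name": "a", "type": "int"},
        {"name": "b", "type": "int"},
    ],
}
_example_result_schema = "int"  # a bare type name is also valid schema JSON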
def encode(self, obj: BaseRecord) -> bytes:
    """
    Encode a *BaseHandlerEvent / BaseHandlerCommand / BaseHandlerResult* to bytes.

    This function is used by kafka-python.

    Args:
        obj (BaseModel): *BaseHandlerEvent / BaseHandlerCommand / BaseHandlerResult*

    Raises:
        MissingEventClass: can't find the BaseModel in the registered BaseModel list (self._schemas)
        AvroEncodeError: failed to encode the BaseModel to bytes

    Returns:
        bytes: the BaseModel serialized as bytes
    """
    try:
        schema = self._schemas[obj.event_name()]
    except KeyError as err:
        self.logger.exception('%s', err.__str__())
        raise MissingEventClass
    try:
        output = BytesIO()
        writer = DataFileWriter(output, DatumWriter(), schema)
        writer.append(obj.to_dict())
        writer.flush()
        encoded_event = output.getvalue()
        writer.close()
    except AvroTypeException as err:
        self.logger.exception('%s', err.__str__())
        raise AvroEncodeError
    return encoded_event
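# A hedged decode counterpart (assumed, not part of the original class):
# because encode() emits a full OCF container (header plus data block), the
# bytes can be read back with DataFileReader over a BytesIO, and no schema
# argument is needed since the reader recovers it from the embedded header.
def _decode_sketch(encoded_event: bytes) -> dict:
    from io import BytesIO
    from avro.datafile import DataFileReader
    from avro.io import DatumReader

    with DataFileReader(BytesIO(encoded_event), DatumReader()) as reader:
        return next(iter(reader))  # encode() wrote exactly one record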
def write_avro(rows, file_out, schema_path):
    schema = avro.schema.parse(open(schema_path, "rb").read())
    writer = DataFileWriter(open(file_out, "wb"), DatumWriter(), schema)
    for line in rows:
        print("INPUT LINE: ", line)
        writer.append({"name": line[0], "sex": line[1],
                       "count": line[2], "year": line[3]})
    writer.close()
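# Hedged usage sketch for write_avro: it writes a hypothetical 'names' schema
# to disk first so the call is self-contained; the field types are assumptions
# inferred from the record dict built above, not from the original project.
def _demo_write_avro():
    names_schema = '''{
        "type": "record", "name": "NameCount",
        "fields": [
            {"name": "name", "type": "string"},
            {"name": "sex", "type": "string"},
            {"name": "count", "type": "int"},
            {"name": "year", "type": "int"}
        ]
    }'''
    with open("names.avsc", "w") as f:
        f.write(names_schema)
    write_avro([("Mary", "F", 7065, 1880), ("Anna", "F", 2604, 1880)],
               "names.avro", "names.avsc")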
def hello_gcs(event, context):
    # set storage client
    client = storage.Client()
    # get bucket
    bucket = client.get_bucket(bucket_name)

    # get the data
    print('URL: {}'.format(url))
    response = urllib.request.urlopen(url)
    data = json.loads(response.read())
    # remove unneeded data AND convert to bytes
    #small_data = json.dumps( data['observations'] ).encode('utf-8')

    # write to local file
    file_name = '{}.{}'.format(series_id, file_type)
    local_path = '/tmp/{}'.format(file_name)
    writer = DataFileWriter(open(local_path, "wb"), DatumWriter(), schema)
    for record in data['observations']:
        days_since_epoch, data_point = convert_data_types(record)
        writer.append({"date": days_since_epoch, "value": data_point})
    writer.close()

    # set Blob
    file_name = '{}_{}.{}'.format(series_id, get_datetime(), file_type)
    blob = storage.Blob(file_name, bucket)

    # upload the file to GCS
    blob.upload_from_filename(local_path)

    print('Event ID: {}'.format(context.event_id))
    print('Event type: {}'.format(context.event_type))
    print("""This Function was triggered by messageId {} published at {}
    """.format(context.event_id, context.timestamp))
def handle_avro_client_print_to_file(connection, address):
    schema = avro.schema.Parse(open("schema/addressbook.avsc", "rb").read())

    # read the 4-byte big-endian length prefix, then the message body
    data = connection.recv(4)
    message_length, = struct.unpack('>I', data)
    message = connection.recv(message_length)
    message_buf = io.BytesIO(message)
    reader = avro.datafile.DataFileReader(message_buf, avro.io.DatumReader())

    # Create a data file using DataFileWriter
    dataFile = open("schema/addressbook.avro", "wb")
    writer = DataFileWriter(dataFile, DatumWriter(), schema)
    for thing in reader:
        writer.append(thing)
    reader.close()
    writer.close()
    return len(message)
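# Hedged client-side counterpart (assumed, not from the original server code):
# serialize records into an in-memory OCF container, then send it with the
# same 4-byte big-endian length prefix the handler above expects.
def _send_avro_sketch(sock, schema, records):
    buf = io.BytesIO()
    writer = DataFileWriter(buf, DatumWriter(), schema)
    for record in records:
        writer.append(record)
    writer.flush()                       # force the data block into buf
    payload = buf.getvalue()
    sock.sendall(struct.pack('>I', len(payload)) + payload)
    writer.close()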
def write(self, filename, records):
    # pick the codec from the file name, e.g. 'foo.snappy.avro' -> snappy
    if filename.split('.')[-2] == 'snappy':
        compress = 'snappy'
    else:
        compress = 'null'
    try:
        with client.write(filename, overwrite=True) as writer:
            with DataFileWriter(writer, DatumWriter(), self.schema,
                                codec=compress) as data_file_writer:
                for record in records:
                    self.exit()
                    _id = record['_id']['$oid']
                    etl(record)
                    self.log_count()
                    data_file_writer.append(record)
                    self.save_count += 1
    except AttributeError as e:
        logger.error(f'record: {_id}')
        logger.info(json.dumps(record, indent=4, ensure_ascii=False))
        traceback.print_exc()
        # raise e
    except AvroTypeException as e:
        logger.info(f'Save Count: {self.save_count}')
        logger.error(f'record: {_id}')
        logger.info(json.dumps(record, indent=4, ensure_ascii=False))
        raise e
def save_avro(data, file_name='data.avro', test=True):
    import json
    import avro.schema
    from avro.datafile import DataFileWriter
    from avro.io import DatumWriter

    schema_path = str(DATA_ROOT / 'schemas.avsc')
    with open(schema_path) as f:
        schema = avro.schema.SchemaFromJSONData(json.load(f))

    if test:
        file_name = "{}.{}".format(file_name, os.getpid())
    path = str(DATA_ROOT / file_name)
    writer = DataFileWriter(open(path, "wb"), DatumWriter(), schema)
    try:
        for datum in data:
            writer.append(datum)
    finally:
        writer.close()
    if test:
        os.remove(path)
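# Hedged usage sketch: the records must match whatever record type
# DATA_ROOT / 'schemas.avsc' defines; the field name below is purely
# illustrative, not from the original project.
# save_avro([{'example_field': 'value1'}, {'example_field': 'value2'}],
#           file_name='demo.avro', test=True)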
def _exp_wcctrn(p):
    global cnxpool, count, file_path, schema
    flag, dest = p
    print('{} [{}] exporting {}...'.format(strftime("%H:%M:%S"), os.getpid(), flag))
    cnx = cnxpool.get_connection()
    writer = None
    _schema = None
    # rotate to a new output file once the current one has absorbed enough rows
    if file_path is None or count >= parallel_threshold:
        file_path = os.path.join(
            dest, "wcc_trn",
            "{}_{}.avro".format(os.getpid(), strftime("%Y%m%d_%H%M%S")))
        print('{} allocating new file {}...'.format(strftime("%H:%M:%S"), file_path))
        count = 0
        # only a fresh file gets a schema; appending reads it from the header
        _schema = schema
    try:
        cursor = cnx.cursor(dictionary=True, buffered=True)
        cursor.execute("SELECT * from wcc_trn where flag = %s", (flag, ))
        rows = cursor.fetchall()
        total = cursor.rowcount
        cursor.close()
        writer = DataFileWriter(open(file_path, "ab+"), DatumWriter(), _schema)
        for row in rows:
            writer.append(row)
        count += total
    except:
        print(sys.exc_info()[0])
        raise
    finally:
        cnx.close()
        if writer:
            writer.close()
def serializeDataToOCFFile(schemaFile, outputFile, dataToSerialize):
    logging.debug("Parsing in avro schema: " + schemaFile)
    schema = parse_schema(schemaFile)
    logging.debug("Writing avro data to: " + outputFile)
    # OCF container files are binary, so the output must be opened in 'wb' mode
    writer = DataFileWriter(open(outputFile, "wb"), DatumWriter(), schema)
    writer.append(dataToSerialize)
    writer.close()
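# Hedged usage sketch: it assumes parse_schema() accepts a schema file path
# (its contract is not shown here) and that the datum matches that schema;
# the file names and record below are hypothetical.
# serializeDataToOCFFile("user.avsc", "user.avro",
#                        {"name": "Alyssa", "favorite_number": 256})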
def encode_avro_message(data):
    datum_writer = DatumWriter(get_media_avro_schema())
    bytes_writer = io.BytesIO()
    encoder = avro.io.BinaryEncoder(bytes_writer)
    datum_writer.write(data, encoder)
    raw_bytes = bytes_writer.getvalue()
    return raw_bytes
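# Hedged decode counterpart (assumed, not from the original module): unlike
# the OCF container format, this schema-less binary encoding carries no
# header, so the reader must supply the same schema the writer used.
def decode_avro_message(raw_bytes):
    datum_reader = avro.io.DatumReader(get_media_avro_schema())
    decoder = avro.io.BinaryDecoder(io.BytesIO(raw_bytes))
    return datum_reader.read(decoder)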
def toKey(self, x, avroType):
    x = jsonEncoder(avroType, x, False)
    buf = io.BytesIO()
    writer = DatumWriter(avroType.schema)
    writer.write(x, BinaryEncoder(buf))
    buf.flush()
    return base64.b64encode(buf.getvalue())
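# Hedged inverse sketch (assumed, mirroring toKey above; it presumes
# DatumReader and BinaryDecoder are imported alongside DatumWriter and
# BinaryEncoder): base64-decode the key and replay it through a binary
# decoder. Note this recovers the jsonEncoder-normalized datum, not
# necessarily the original x.
def fromKey(self, key, avroType):
    reader = DatumReader(avroType.schema)
    decoder = BinaryDecoder(io.BytesIO(base64.b64decode(key)))
    return reader.read(decoder)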
def main(args):
    log = logging.getLogger(__name__)
    log.setLevel(logging.INFO)
    sys_log = logging.handlers.SysLogHandler("/dev/log")
    sys_format = logging.Formatter(
        '%(name)s[%(process)d]: %(levelname)s %(message)s')
    sys_log.setFormatter(sys_format)
    log.addHandler(sys_log)

    # avro container files are binary; open in 'rb' mode
    reader = DataFileReader(open(args.avro_file, "rb"), DatumReader())
    schema = reader.datum_reader.writers_schema

    for i, row in enumerate(reader):
        log.debug("Consumer row: " + str(row))
        writer = DatumWriter(schema)
        bytes_writer = io.BytesIO()
        encoder = BinaryEncoder(bytes_writer)
        writer.write(row, encoder)
        raw_bytes = bytes_writer.getvalue()
        # b64encode returns bytes; decode so the payload is JSON-serializable
        b64enc = base64.b64encode(raw_bytes).decode('utf-8')
        msg = {"messages": [{"data": b64enc}]}
        json_str = json.dumps(msg)
        log.debug("json msg: " + json_str)
        publish(json_str, args.ams_endpoint, args.ams_project, args.ams_topic,
                args.ams_key, log)
def _loadAvro(config, superSchema, daysArray):
    print("**********************Loading ForecastDataAvro****************")
    autGenSchemaFile = config["ETL"]["Extract"]["AutGenSchemaFile"]
    forecastAvroFile = config["ETL"]["Load"]["Avro"]["File"]
    dWHForecastPath = config["ETL"]["Load"]["AvgData"]["DWHForecastPath"]

    dayAvroSchema = _autogenerateSchema(superSchema)
    with open(dWHForecastPath + autGenSchemaFile, "w") as file:
        file.write(json.dumps(dayAvroSchema, indent=4))

    # create avro.schema from json schema
    dayAvroSchemaString = json.dumps(dayAvroSchema)
    schema = avro.schema.Parse(dayAvroSchemaString)
    avroFile = dWHForecastPath + forecastAvroFile

    # create a writer for DWH
    writer = DataFileWriter(open(avroFile, "wb"), DatumWriter(), schema)
    # append each day
    for day in daysArray:
        # pp.pprint(day)
        writer.append(day)
    # close writer
    writer.close()
    # pp.pprint(writer)
    _readAvro(avroFile)
def test_infer_schema_avro():
    with tempfile.TemporaryFile(mode="w+b") as file:
        schema = avro_schema.parse(
            ujson.dumps(
                {
                    "type": "record",
                    "name": "test",
                    "fields": [
                        {"name": "boolean_field", "type": "boolean"},
                        {"name": "integer_field", "type": "int"},
                        {"name": "string_field", "type": "string"},
                    ],
                }
            )
        )

        writer = DataFileWriter(file, DatumWriter(), schema)
        records = test_table.to_dict(orient="records")
        for record in records:
            writer.append(record)
        writer.sync()
        file.seek(0)

        fields = avro.AvroInferrer().infer_schema(file)
        fields.sort(key=lambda x: x.fieldPath)

        assert_field_paths_match(fields, expected_field_paths_avro)
        assert_field_types_match(fields, expected_field_types)
def write_data_to_avro(raw_data, data_type):
    data_folder = Path('avro')
    avro_file = data_type + '.avro'
    avro_file_path = data_folder / avro_file
    avsc_file = data_type + '.avsc'
    avsc_file_path = data_folder / avsc_file

    schema = avro.schema.Parse(open(avsc_file_path.resolve(), "rb").read())
    writer = DataFileWriter(open(avro_file_path.resolve(), "wb"),
                            DatumWriter(), schema)
    for _, record in raw_data.iterrows():
        record_dict = record.to_dict()
        if data_type == "stops":
            # flatten the point object into the lon/lat record the schema expects
            record_dict['stop_lat_lon'] = {'stop_lon': record_dict['stop_lat_lon'].x,
                                           'stop_lat': record_dict['stop_lat_lon'].y}
        '''
        if data_type == "stop_times":
            #del record_dict['arrival_time']
            #del record_dict['departure_time']
            del record_dict['stop_id']
            del record_dict['stop_sequence']
            del record_dict['pickup_type']
            del record_dict['drop_off_type']
            del record_dict['timepoint']
        '''
        writer.append(record_dict)
    writer.close()