Example #1
def __create_standard(out_path):
    os.makedirs(out_path)
    schema_path = os.path.join(os.path.dirname(__file__), 'data/user.avsc')
    schema = avro.schema.parse(open(schema_path).read())
  
    with DataFileWriter(open(os.path.join(out_path, 'part-m-00000.avro'), 'wb'),
            DatumWriter(), schema) as writer:
        writer.append({'position': 0, 'name': 'Alyssa', 'favorite_number': 256})
        writer.append({'position': 1, 'name': 'Ben', 'favorite_number': 4, 'favorite_color': 'red'})
    
    with DataFileWriter(open(os.path.join(out_path, 'part-m-00001.avro'), 'wb'),
            DatumWriter(), schema) as writer:
        writer.append({'position': 2, 'name': 'Alyssa2', 'favorite_number': 512})
        writer.append({'position': 3, 'name': 'Ben2', 'favorite_number': 8, 'favorite_color': 'blue', 'secret':b'0987654321'})
        writer.append({'position': 4, 'name': 'Ben3', 'favorite_number': 2, 'favorite_color': 'green', 'secret':b'12345abcd'})
    
    with DataFileWriter(open(os.path.join(out_path, 'part-m-00002.avro'), 'wb'),
            DatumWriter(), schema) as writer:
        pass
    
    with DataFileWriter(open(os.path.join(out_path, 'part-m-00003.avro'), 'wb'),
            DatumWriter(), schema) as writer:
        writer.append({'position': 5, 'name': 'Alyssa3', 'favorite_number': 16})
        writer.append({'position': 6, 'name': 'Mallet', 'favorite_color': 'blue', 'secret': b'asdfgf'})
        writer.append({'position': 7, 'name': 'Mikel', 'favorite_color': ''})
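
Example #1 loads its schema from data/user.avsc without showing the file. The sketch below is a hypothetical schema that would accept the records appended above; the field names come from those records, but the exact types, unions, and record name are assumptions rather than the original file:

import avro.schema

# Hypothetical contents of data/user.avsc, inferred from the records written above.
USER_SCHEMA_JSON = '''
{
    "type": "record",
    "name": "User",
    "fields": [
        {"name": "position", "type": "int"},
        {"name": "name", "type": "string"},
        {"name": "favorite_number", "type": ["null", "int"], "default": null},
        {"name": "favorite_color", "type": ["null", "string"], "default": null},
        {"name": "secret", "type": ["null", "bytes"], "default": null}
    ]
}
'''

schema = avro.schema.parse(USER_SCHEMA_JSON)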
Example #2
    def respond(self, call_request):
        buffer_reader = io.BytesIO(call_request)
        buffer_decoder = BinaryDecoder(buffer_reader)
        buffer_writer = io.BytesIO()
        buffer_encoder = BinaryEncoder(buffer_writer)
        error = None
        response_metadata = {}
        try:
            remote_protocol = self.process_handshake(buffer_decoder,
                                                     buffer_encoder)
            if remote_protocol is None or self.local_protocol is None:
                return buffer_writer.getvalue()

            DatumReader(schema.parse(
                '{"type": "map", "values": "bytes"}')).read(buffer_decoder)
            remote_message_name = buffer_decoder.read_utf8()

            remote_message = remote_protocol.messages.get(remote_message_name)
            if remote_message is None:
                fail_msg = 'Unknown remote message: %s' % remote_message_name
                raise schema.AvroException(fail_msg)
            local_message = self.local_protocol.messages.get(
                remote_message_name)
            if local_message is None:
                fail_msg = 'Unknown local message: %s' % remote_message_name
                raise schema.AvroException(fail_msg)
            writers_schema = remote_message.request
            readers_schema = local_message.request
            request = self.read_request(writers_schema, readers_schema,
                                        buffer_decoder)

            response = None
            try:
                response = self.invoke(self.local_protocol, local_message,
                                       request)
            except AvroRemoteException as e:
                error = e
            except Exception as e:
                error = AvroRemoteException(str(e))

            DatumWriter(
                schema.parse('{"type": "map", "values": "bytes"}')).write(
                    response_metadata, buffer_encoder)
            buffer_encoder.write_boolean(error is not None)
            if error is None:
                writers_schema = local_message.response
                self.write_response(writers_schema, response, buffer_encoder)
            else:
                writers_schema = local_message.errors
                self.write_error(writers_schema, error, buffer_encoder)
        except schema.AvroException as e:
            error = AvroRemoteException(str(e))
            buffer_encoder = BinaryEncoder(io.BytesIO())
            DatumWriter(
                schema.parse('{"type": "map", "values": "bytes"}')).write(
                    response_metadata, buffer_encoder)
            buffer_encoder.write_boolean(True)
            self.write_error(schema.parse('["string"]'), error, buffer_encoder)
            return buffer_encoder.writer.getvalue()
        return buffer_writer.getvalue()
Example #3
    def process_handshake(self, decoder, encoder):
        handshake_response = {}
        try:
            handshake_request = DatumReader(
                _load_request_schema()).read(decoder)
        except SchemaResolutionException:
            if self.local_protocol is None:
                handshake_response['match'] = 'NONE'
                handshake_response['serverProtocol'] = str(NO_FOUND)
                handshake_response['serverHash'] = NO_FOUND.md5
                DatumWriter(_load_response_schema()).write(
                    handshake_response, encoder)
                raise HandshakeError(encoder.writer.getvalue())
            # reset reader
            decoder.reader.seek(0, 0)
            return self.local_protocol

        client_hash = handshake_request.get('clientHash')
        client_protocol = handshake_request.get('clientProtocol')
        remote_protocol = self.get_protocol_cache(client_hash)

        # new handshake
        if remote_protocol is None and client_protocol is None:
            handshake_response['match'] = 'NONE'
            handshake_response['serverProtocol'] = str(NO_FOUND)
            handshake_response['serverHash'] = NO_FOUND.md5
            DatumWriter(_load_response_schema()).write(handshake_response,
                                                       encoder)
            return remote_protocol

        # client request handshake
        if remote_protocol is None and client_protocol is not None:
            # compare with client_protocol and cache_protocol
            self._local_protocol = self.contains(client_protocol)
            if self.local_protocol is None:
                handshake_response['match'] = 'NONE'
                handshake_response['serverProtocol'] = str(NO_FOUND)
                handshake_response['serverHash'] = NO_FOUND.md5
                DatumWriter(_load_response_schema()).write(
                    handshake_response, encoder)
                raise HandshakeError(encoder.writer.getvalue())
            else:
                remote_protocol = protocol.parse(client_protocol)
                self.set_protocol_cache(client_hash, remote_protocol)
                handshake_response['match'] = 'CLIENT'
                handshake_response['serverProtocol'] = str(self.local_protocol)
                handshake_response['serverHash'] = self.local_protocol.md5
                DatumWriter(_load_response_schema()).write(
                    handshake_response, encoder)
                return remote_protocol

        # success handshake
        if remote_protocol is not None:
            handshake_response['match'] = 'BOTH'

        DatumWriter(_load_response_schema()).write(handshake_response, encoder)
        return remote_protocol
Example #4
def main():
    parser = ArgumentParser(description="Simple AMS example of subscription pull/consume")
    parser.add_argument('--host', type=str, default='messaging-devel.argo.grnet.gr', help='FQDN of AMS Service')
    parser.add_argument('--token', type=str, required=True, help='Given token')
    parser.add_argument('--project', type=str, required=True, help='Project registered in AMS Service')
    parser.add_argument('--subscription', type=str, required=True, help='Subscription name')
    parser.add_argument('--topic', type=str, required=True, help='Given topic')
    parser.add_argument('--nummsgs', type=int, default=3, help='Number of messages to pull and ack')
    parser.add_argument('--schema', type=str, required=True, help='Avro schema')
    parser.add_argument('--outfile', type=str, required=True, help='Output avro file')
    args = parser.parse_args()

    # initialize service with given token and project
    ams = ArgoMessagingService(endpoint=args.host, token=args.token, project=args.project)

    # Ensure that the subscription exists on the first run. Messages can be
    # pulled from a subscription only if it already existed for the given
    # topic before the messages were published to it.
    try:
        if not ams.has_sub(args.subscription):
            ams.create_sub(args.subscription, args.topic)
        subscription = ams.get_sub(args.subscription, retobj=True)
    except AmsException as e:
        print(e)
        raise SystemExit(1)

    # Try to pull the requested number of messages from the subscription.
    # The message id and payload are extracted from each returned message.
    avro_payloads = list()
    for msg in subscription.pullack(args.nummsgs, retry=5, retrysleep=15, return_immediately=True):
        data = msg.get_data()
        msgid = msg.get_msgid()
        print('msgid={0}'.format(msgid))
        avro_payloads.append(data)

    try:
        schema = load_schema(args.schema)
        if os.path.exists(args.outfile):
            avroFile = open(args.outfile, 'a+b')
            writer = DataFileWriter(avroFile, DatumWriter())
        else:
            avroFile = open(args.outfile, 'wb+')
            writer = DataFileWriter(avroFile, DatumWriter(), schema)

        for am in avro_payloads:
            msg = avro_deserialize(am, args.schema)
            writer.append(msg)

        writer.close()
        avroFile.close()

    except Exception as e:
        print(e)
        raise SystemExit(1)
Example #5
    def __init__(self,
                 callback,
                 service_name,
                 param_schema,
                 result_schema,
                 version=0):
        self.callback = callback
        self.service_name = service_name
        self.param_schema = SchemaFromJSONData(param_schema, Names())
        self.result_schema = SchemaFromJSONData(result_schema, Names())
        self.version = version
        self._param_writer = DatumWriter(self.param_schema)
        self._param_reader = DatumReader(self.param_schema)
        self._result_writer = DatumWriter(self.result_schema)
        self._result_reader = DatumReader(self.result_schema)
Example #6
    def encode(self, obj: BaseRecord) -> bytes:
        """ Encode *BaseHandlerEvent / BaseHandlerCommand / BaseHandlerResult* to bytes format

        This function is used by kafka-python

        Args:
            obj (BaseModel): *BaseHandlerEvent / BaseHandlerCommand / BaseHandlerResult*

        Raises:
            MissingEventClass: can’t find BaseModel in own registered BaseModel list (self._schema)
            AvroEncodeError: fail to encode BaseModel to bytes

        Returns:
            bytes: BaseModel in bytes
        """
        try:
            schema = self._schemas[obj.event_name()]
        except KeyError as err:
            self.logger.exception('%s', err.__str__())
            raise MissingEventClass

        try:
            output = BytesIO()
            writer = DataFileWriter(output, DatumWriter(), schema)
            writer.append(obj.to_dict())
            writer.flush()
            encoded_event = output.getvalue()
            writer.close()
        except AvroTypeException as err:
            self.logger.exception('%s', err.__str__())
            raise AvroEncodeError
        return encoded_event
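
Because encode() wraps the record in a full Avro object-container file (DataFileWriter over BytesIO), the returned bytes carry the schema in their header and can be decoded with DataFileReader without passing a schema. A minimal decoding sketch, not part of the original class, assuming BytesIO, DataFileReader and DatumReader are imported alongside the writer classes used above:

def decode_avro_bytes(encoded_event: bytes) -> dict:
    # Read back a single record from container-file bytes produced by encode();
    # the writer's schema is recovered from the container header.
    with DataFileReader(BytesIO(encoded_event), DatumReader()) as reader:
        return next(reader)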
Example #7
def write_avro(rows, file_out, schema_path):
    schema = avro.schema.parse(open(schema_path, "rb").read())
    writer = DataFileWriter(open(file_out, "wb"), DatumWriter(), schema)
    for line in rows:
        print("INPUT LINE: ", line)
        writer.append({"name": line[0], "sex": line[1], "count": line[2], "year": line[3]})
    writer.close()
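
Because write_avro() produces an Avro object-container file, it can be read back without passing the schema: DataFileReader recovers the writer's schema from the file header. A small hypothetical companion (read_avro is not part of the original example; it assumes avro.datafile.DataFileReader and avro.io.DatumReader are imported):

def read_avro(file_in):
    # The container header carries the writer's schema, so none is passed here.
    with DataFileReader(open(file_in, "rb"), DatumReader()) as reader:
        return [record for record in reader]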
Example #8
def hello_gcs(event, context):
    # set storage client
    client = storage.Client()

    # get bucket
    bucket = client.get_bucket(bucket_name)

    # get the data
    print('URL: {}'.format(url))
    response = urllib.request.urlopen(url)
    data = json.loads(response.read())

    # remove unneeded data AND convert to bytes
    #small_data = json.dumps( data['observations'] ).encode('utf-8')

    # write to local file
    file_name = '{}.{}'.format(series_id, file_type)
    local_path = '/tmp/{}'.format(file_name)
    writer = DataFileWriter(open(local_path, "wb"), DatumWriter(), schema)
    for record in data['observations']:
        days_since_epoch, data_point = convert_data_types(record)
        writer.append({"date": days_since_epoch, "value": data_point})
    writer.close()

    # set Blob
    file_name = '{}_{}.{}'.format(series_id, get_datetime(), file_type)
    blob = storage.Blob(file_name, bucket)

    # upload the file to GCS
    blob.upload_from_filename(local_path)

    print('Event ID: {}'.format(context.event_id))
    print('Event type: {}'.format(context.event_type))
    print("""This Function was triggered by messageId {} published at {}
  """.format(context.event_id, context.timestamp))
Example #9
def handle_avro_client_print_to_file(connection, address):

    schema = avro.schema.Parse(open("schema/addressbook.avsc", "rb").read())

    data = connection.recv(4)

    message_length, = struct.unpack('>I', data)

    message = connection.recv(message_length)

    message_buf = io.BytesIO(message)
    reader = avro.datafile.DataFileReader(message_buf, avro.io.DatumReader())

    # Create a data file using DataFileWriter

    dataFile = open("schema/addressbook.avro", "wb")

    writer = DataFileWriter(dataFile, DatumWriter(), schema)

    for thing in reader:
        writer.append(thing)
    reader.close()

    writer.close()
    return len(message)
Example #10
    def write(self, filename, records):
        if filename.split('.')[-2] == 'snappy':
            compress = 'snappy'
        else:
            compress = 'null'
        try:
            with client.write(filename, overwrite=True) as writer:
                with DataFileWriter(writer, DatumWriter(), self.schema, codec=compress) as data_file_writer:
                    for record in records:
                        self.exit()
                        _id = record['_id']['$oid']
                        etl(record)

                        self.log_count()

                        data_file_writer.append(record)
                        self.save_count += 1
        except AttributeError as e:
            logger.error(f'record: {_id}')

            logger.info(json.dumps(record, indent=4, ensure_ascii=False))
            traceback.print_exc()
            # raise e
        except AvroTypeException as e:
            logger.info(f'Save Count: {self.save_count}')
            logger.error(f'record: {_id}')
            logger.info(json.dumps(record, indent=4, ensure_ascii=False))
            raise e
Example #11
def save_avro(data, file_name='data.avro', test=True):
    import json

    import avro.schema

    from avro.datafile import DataFileWriter
    from avro.io import DatumWriter

    schema_path = str(DATA_ROOT / 'schemas.avsc')
    with open(schema_path) as f:
        schema = avro.schema.SchemaFromJSONData(json.load(f))

    if test:
        file_name = "{}.{}".format(file_name, os.getpid())

    path = str(DATA_ROOT / file_name)
    writer = DataFileWriter(open(path, "wb"), DatumWriter(), schema)

    try:
        for datum in data:
            writer.append(datum)
    finally:
        writer.close()

        if test:
            os.remove(path)
Example #12
def _exp_wcctrn(p):
    global cnxpool, count, file_path, schema
    flag, dest = p
    print('{} [{}] exporting {}...'.format(strftime("%H:%M:%S"), os.getpid(),
                                           flag))
    cnx = cnxpool.get_connection()
    writer = None
    _schema = None
    if file_path is None or count >= parallel_threshold:
        file_path = os.path.join(
            dest, "wcc_trn", "{}_{}.avro".format(os.getpid(),
                                                 strftime("%Y%m%d_%H%M%S")))
        print('{} allocating new file {}...'.format(strftime("%H:%M:%S"),
                                                    file_path))
        count = 0
        _schema = schema
    try:
        cursor = cnx.cursor(dictionary=True, buffered=True)
        cursor.execute("SELECT * from wcc_trn where flag = %s", (flag, ))
        rows = cursor.fetchall()
        total = cursor.rowcount
        cursor.close()
        writer = DataFileWriter(open(file_path, "ab+"), DatumWriter(), _schema)
        for row in rows:
            writer.append(row)
        count += total
    except:
        print(sys.exc_info()[0])
        raise
    finally:
        cnx.close()
        if writer:
            writer.close()
Example #13
def serializeDataToOCFFile(schemaFile, outputFile, dataToSerialize):
    logging.debug("Parsing in avro schema:" + schemaFile)
    schema = parse_schema(schemaFile)
    logging.debug("Writing avro data to:" + outputFile)
    writer = DataFileWriter(open(outputFile, "wb"), DatumWriter(), schema)
    writer.append(dataToSerialize)
    writer.close()
Example #14
def encode_avro_message(data):
    datum_writer = DatumWriter(get_media_avro_schema())
    bytes_writer = io.BytesIO()
    encoder = avro.io.BinaryEncoder(bytes_writer)
    datum_writer.write(data, encoder)
    raw_bytes = bytes_writer.getvalue()
    return raw_bytes
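
Unlike the container-file examples, encode_avro_message() emits a bare binary datum with no embedded schema, so a decoder must be constructed with the same schema that was used for writing. A hedged counterpart sketch (decode_avro_message is not part of the original code and reuses the get_media_avro_schema() helper assumed above):

def decode_avro_message(raw_bytes):
    # Schemaless binary decoding: the reader must use the writer's schema.
    datum_reader = avro.io.DatumReader(get_media_avro_schema())
    decoder = avro.io.BinaryDecoder(io.BytesIO(raw_bytes))
    return datum_reader.read(decoder)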
Example #15
    def toKey(self, x, avroType):
        x = jsonEncoder(avroType, x, False)
        buf = io.BytesIO()
        writer = DatumWriter(avroType.schema)
        writer.write(x, BinaryEncoder(buf))
        buf.flush()
        return base64.b64encode(buf.getvalue())
Example #16
def main(args):
    log = logging.getLogger(__name__)
    log.setLevel(logging.INFO)

    sys_log = logging.handlers.SysLogHandler("/dev/log")
    sys_format = logging.Formatter(
        '%(name)s[%(process)d]: %(levelname)s %(message)s')
    sys_log.setFormatter(sys_format)

    log.addHandler(sys_log)

    reader = DataFileReader(open(args.avro_file, "rb"), DatumReader())

    schema = reader.datum_reader.writers_schema

    for i, row in enumerate(reader):
        log.debug("Consumer row:" + str(row))
        writer = DatumWriter(schema)
        bytes_writer = io.BytesIO()
        encoder = BinaryEncoder(bytes_writer)
        writer.write(row, encoder)
        raw_bytes = bytes_writer.getvalue()
        b64enc = base64.b64encode(raw_bytes).decode('utf-8')
        msg = {"messages": [{"data": b64enc}]}

        json_str = json.dumps(msg)
        log.debug("json msg:" + json_str)
        publish(json_str, args.ams_endpoint, args.ams_project, args.ams_topic,
                args.ams_key, log)
Example #17
def _loadAvro(config, superSchema, daysArray):
    print("**********************Loading ForecastDataAvro****************")
    autGenSchemaFile = config["ETL"]["Extract"]["AutGenSchemaFile"]
    forecastAvroFile = config["ETL"]["Load"]["Avro"]["File"]
    dWHForecastPath = config["ETL"]["Load"]["AvgData"]["DWHForecastPath"]
    
    dayAvroSchema = _autogenerateSchema(superSchema)

    with open(dWHForecastPath+autGenSchemaFile, "w") as file:
        file.write(json.dumps(dayAvroSchema, indent=4))
    # create avro.schema from json schema
    dayAvroSchemaString = json.dumps(dayAvroSchema)
    schema = avro.schema.Parse(dayAvroSchemaString)

    avroFile = dWHForecastPath + forecastAvroFile
    # create a writer for DWH
    writer = DataFileWriter(open(avroFile, "wb"),
                            DatumWriter(), schema)

    # append each day
    for day in daysArray:
        # pp.pprint(day)
        writer.append(day)

    # close writer
    writer.close()
    # pp.pprint(writer)
    _readAvro(avroFile)
Example #18
def test_infer_schema_avro():
    with tempfile.TemporaryFile(mode="w+b") as file:
        schema = avro_schema.parse(
            ujson.dumps(
                {
                    "type": "record",
                    "name": "test",
                    "fields": [
                        {"name": "boolean_field", "type": "boolean"},
                        {"name": "integer_field", "type": "int"},
                        {"name": "string_field", "type": "string"},
                    ],
                }
            )
        )
        writer = DataFileWriter(file, DatumWriter(), schema)
        records = test_table.to_dict(orient="records")
        for record in records:
            writer.append(record)
        writer.sync()

        file.seek(0)

        fields = avro.AvroInferrer().infer_schema(file)
        fields.sort(key=lambda x: x.fieldPath)

        assert_field_paths_match(fields, expected_field_paths_avro)
        assert_field_types_match(fields, expected_field_types)
Example #19
def write_data_to_avro(raw_data, data_type):
    
    data_folder = Path('avro')
    avro_file = data_type + '.avro'
    avro_file_path = data_folder / avro_file
    avsc_file = data_type + '.avsc'
    avsc_file_path = data_folder / avsc_file

    schema = avro.schema.Parse(open(avsc_file_path.resolve(), "rb").read())

    writer = DataFileWriter(open(avro_file_path.resolve(), "wb"), DatumWriter(), schema)

    for _, record in raw_data.iterrows():
        row = record.to_dict()

        if data_type == "stops":
            row['stop_lat_lon'] = {'stop_lon': row['stop_lat_lon'].x, 'stop_lat': row['stop_lat_lon'].y}
        '''
        if data_type == "stop_times":
            #del row['arrival_time']
            #del row['departure_time']
            del row['stop_id']
            del row['stop_sequence']
            del row['pickup_type']
            del row['drop_off_type']
            del row['timepoint']
        '''
        writer.append(row)
    
    writer.close()