Example #1
    def run(self, n):
        # JSON Serializer
        # serializer = ajs.AvroJsonSerializer(self.movies_schema)
        # json_data = serializer.to_json(self.movies_data)
        total_ser = 0
        total_deser = 0
        bytes_len = 0
        for i in range(0, n):
            datum_writer = DatumWriter(self.movies_schema)
            bytes_writer = io.BytesIO()

            encoder = BinaryEncoder(bytes_writer)
            tic = timeit.default_timer()
            datum_writer.write(self.movies_data, encoder)
            elapsed = timeit.default_timer() - tic
            payload = bytes_writer.getvalue()
            total_ser = total_ser + elapsed
            bytes_len = len(payload)

            bytes_reader = io.BytesIO(payload)
            decoder = BinaryDecoder(bytes_reader)
            reader = DatumReader(self.movies_schema)
            tic2 = timeit.default_timer()
            movies = reader.read(decoder)
            elapsed2 = timeit.default_timer() - tic2
            total_deser = total_deser + elapsed2

        self.logger.log(logging.INFO, "serialized len: %s bytes", bytes_len)
        avg_ser = (total_ser * (10 ** 9)) / n      # average per iteration, in nanoseconds
        avg_deser = (total_deser * (10 ** 9)) / n
        self.logger.log(logging.INFO, "Average serialization time (ns): %s", avg_ser)
        self.logger.log(logging.INFO, "Average de-serialization time (ns): %s", avg_deser)
Example #2
def main(args):
    log = logging.getLogger(__name__)
    log.setLevel(logging.INFO)

    sys_log = logging.handlers.SysLogHandler("/dev/log")
    sys_format = logging.Formatter('%(name)s[%(process)d]: %(levelname)s %(message)s')
    sys_log.setFormatter(sys_format)

    log.addHandler(sys_log)

    reader = DataFileReader(open(args.avro_file, "rb"), DatumReader())  # Avro container files are binary

    schema = reader.datum_reader.writers_schema

    for i, row in enumerate(reader):
        log.debug("Consumer row:" + str(row))
        writer = DatumWriter(schema)
        bytes_writer = io.BytesIO()
        encoder = BinaryEncoder(bytes_writer)
        writer.write(row, encoder)
        raw_bytes = bytes_writer.getvalue()
        b64enc = base64.b64encode(raw_bytes).decode("utf-8")  # decode so the payload is JSON-serializable
        msg = {"messages": [{"data": b64enc}]}

        json_str = json.dumps(msg)
        log.debug("json msg:" + json_str)
        publish(json_str, args.ams_endpoint, args.ams_project, args.ams_topic, args.ams_key, log)
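The function above covers only the publishing side. A minimal sketch of the matching consume-and-decode step could look like the following; the helper name decode_messages and the payload layout are taken from the msg dict built above, and it assumes the reader has access to the same writer's schema.

import base64
import io
import json

from avro.io import BinaryDecoder, DatumReader


def decode_messages(json_str, schema):
    # Unwrap the {"messages": [{"data": <base64 Avro>}]} payload and decode each datum
    msg = json.loads(json_str)
    for entry in msg["messages"]:
        raw_bytes = base64.b64decode(entry["data"])
        decoder = BinaryDecoder(io.BytesIO(raw_bytes))
        yield DatumReader(schema).read(decoder)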
Example #3
 def toKey(self, x, avroType):
     x = jsonEncoder(avroType, x, False)
     buf = io.BytesIO()  # avoid shadowing the built-in "bytes"
     writer = DatumWriter(avroType.schema)
     writer.write(x, BinaryEncoder(buf))
     buf.flush()
     return base64.b64encode(buf.getvalue())
Example #4
 def __call__(self, state, scope, pos, paramTypes, x):
     schema = avro.schema.parse(json.dumps(paramTypes[0]))
     x = untagUnion(x, paramTypes[0])
     buf = io.BytesIO()  # avoid shadowing the built-in "bytes"
     writer = DatumWriter(schema)
     writer.write(x, BinaryEncoder(buf))
     buf.flush()
     return buf.getvalue()
Example #5
def compose_data(timestamp, src_vmtype, host_ip, account_id, dest_ip):
    writer = DatumWriter(get_schema())
    bytes_writer = io.BytesIO()
    encoder = avro.io.BinaryEncoder(bytes_writer)
    message = '{"eventName": "Neighbour_Unreachable", "accountId":"%s", "destIp":"%s"}' \
              % (account_id, dest_ip)
    raw_data = message.encode("utf-8")  # bytes(message) needs an explicit encoding on Python 3
    writer.write({"timestamp": timestamp, "src": src_vmtype, "host_ip": host_ip, "rawdata":raw_data}, encoder)
    raw_bytes = bytes_writer.getvalue()
    return raw_bytes
Example #6
def compose_data(timestamp, src_vmtype, host_ip, account_id, proc_name):
    writer = DatumWriter(get_schema())
    bytes_writer = io.BytesIO()
    encoder = avro.io.BinaryEncoder(bytes_writer)
    message = '{"eventName": "Process_Down", "accountId":"%s", "ProcName":"%s"}' \
              % (account_id, proc_name)
    raw_data = message.encode("utf-8")  # bytes(message) needs an explicit encoding on Python 3
    writer.write({"timestamp": timestamp, "src": src_vmtype, "host_ip": host_ip, "rawdata":raw_data}, encoder)
    raw_bytes = bytes_writer.getvalue()
    return raw_bytes
Example #7
class AvroSerializer(object):

    def __init__(self, schema):
        self.schema = schema
        self.datum_writer = DatumWriter(schema)

    def serialize(self, record):
        f = StringIO()
        encoder = BinaryEncoder(f)
        self.datum_writer.write(record, encoder)
        return f.getvalue()
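For symmetry, a companion deserializer could look like the sketch below (not part of the original example). Note that on Python 3 these buffers must be io.BytesIO rather than StringIO, since BinaryEncoder and BinaryDecoder operate on bytes.

import io

from avro.io import BinaryDecoder, DatumReader


class AvroDeserializer(object):

    def __init__(self, schema):
        self.schema = schema
        self.datum_reader = DatumReader(schema)

    def deserialize(self, payload):
        # payload: raw Avro binary as produced by AvroSerializer.serialize()
        decoder = BinaryDecoder(io.BytesIO(payload))
        return self.datum_reader.read(decoder)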
Example #8
class Serializer(object):

    def __init__(self, schema_str):
        schema = avro.schema.parse(schema_str)
        self.writer = DatumWriter(schema)

    def serialize(self, record):
        f = StringIO()
        encoder = BinaryEncoder(f)
        self.writer.write(record, encoder)
        return f.getvalue()
Example #9
class Serializer(object):

    def __init__(self, schema_str):
        if sys.version_info >= (3,):
            schema = avro.schema.Parse(schema_str)
        else:
            schema = avro.schema.parse(schema_str)
        self.writer = DatumWriter(schema)

    def serialize(self, record):
        f = string_io()
        encoder = BinaryEncoder(f)
        self.writer.write(record, encoder)
        return f.getvalue()
Example #10
 def __init__(self, context):
     super(AvroWriter, self).__init__(context)
     job_conf = context.job_conf
     part = int(job_conf['mapreduce.task.partition'])
     outdir = job_conf["mapreduce.task.output.dir"]
     outfn = "%s/part-r-%05d.avro" % (outdir, part)
     wh = hdfs.open(outfn, "w")
     self.writer = DataFileWriter(wh, DatumWriter(), self.schema)
Example #11
 def encode(self, raw_data):
     byte_stream = BytesIO()
     writer = DataFileWriter(byte_stream, DatumWriter(), self._schema)
     writer.append(raw_data)
     writer.flush()
     serialized_data = byte_stream.getvalue()
     writer.close()
     return serialized_data
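A possible decode counterpart: because encode() writes a complete Avro container file via DataFileWriter, the schema travels with the bytes, so DataFileReader can recover the records without being handed a schema. A minimal sketch:

from io import BytesIO

from avro.datafile import DataFileReader
from avro.io import DatumReader


def decode(serialized_data):
    # Read back the records from the container produced by encode()
    reader = DataFileReader(BytesIO(serialized_data), DatumReader())
    records = list(reader)
    reader.close()
    return records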
Example #12
def write_pipeline_template_to_avro(pipeline, rtasks_d, output_file):

    d = pipeline_template_to_dict(pipeline, rtasks_d)
    f = open(output_file, 'wb')  # Avro container files must be opened in binary mode
    with DataFileWriter(f, DatumWriter(), PT_SCHEMA) as writer:
        writer.append(d)

    return d
Example #13
 def write_avro_file(self, rec_creator, n_samples, sync_interval):
     avdf.SYNC_INTERVAL = sync_interval
     self.assertEqual(avdf.SYNC_INTERVAL, sync_interval)
     fo = self._mkf('data.avro')
     with avdf.DataFileWriter(fo, DatumWriter(), self.schema) as writer:
         for i in xrange(n_samples):
             writer.append(rec_creator(i))
     return fo.name
Example #14
def gen_avro(filename):
    schema = avro.schema.parse(SCHEMA)
    fo = open(filename, "wb")
    writer = DataFileWriter(fo, DatumWriter(), schema)
    for record in looney_records():
        writer.append(record)
    writer.close()
    fo.close()
Example #15
 def _write(self, data):
     "Internal write API"
     wmaid = self.wmaid(data)
     schema = self.schema
     fname = file_name(self.hdir, wmaid)
     with open_file(fname, 'w') as ostream:
         with DataFileWriter(ostream, DatumWriter(), schema) as writer:
             writer.append(data)
Example #16
def __create_nested(out_path):
    os.makedirs(out_path)
    schema_path = os.path.join(os.path.dirname(__file__), 'data/nested.avsc')
    schema = avro.schema.parse(open(schema_path).read())
    with DataFileWriter(open(os.path.join(out_path, 'part-m-00004.avro'), 'wb'),
                    DatumWriter(), schema) as writer:
            writer.append({'sup': 1, 'sub':{'level2':2}})
            writer.append({'sup': 2, 'sub':{'level2':1}})
Example #17
def prepare(producer, arr, root, level):
    for it in arr:
        buf = io.BytesIO()
        writer = DataFileWriter(buf, DatumWriter(), sch)
        item = Item(root, it, False)
        writer.append(item.get_dict())
        writer.flush()
        send(buf, level, producer)
Example #18
    def produce(self, msg):
        if self.ser_type == kfkcfg.SERIALIZATIO_JSON:
            # s = json.dumps(msg)
            s = json.dumps(msg, default=json_util.default)
            future = self.kfkprod.produce(bytes(s, 'utf-8'))
            # msg = json.dumps(msg, default=json_util.default).encode('utf-8')
            # future = self.kfkprod.produce(bytes(msg))

        elif self.ser_type == kfkcfg.SERIALIZATIO_AVRO:

            writer = DatumWriter(self.avro_schema)
            bytes_writer = io.BytesIO()
            encoder = BinaryEncoder(bytes_writer)
            writer.write(msg, encoder)
            raw_bytes = bytes_writer.getvalue()

            future = self.kfkprod.produce(raw_bytes)
Example #19
    def run(self):
        # for normalizing alcohol
        minimum, maximum, average = 100, 0, 0

        with open('raw.csv', 'r') as fd:
            csv_reader = csv.reader(fd, delimiter=',')

            collection = {}
            for i, row in enumerate(csv_reader):
                desc = row[3].lower().replace('.', '').replace(',', '')

                alc = float(row[-1])
                if alc < minimum:
                    minimum = alc
                if alc > maximum:
                    maximum = alc
                average += alc

                # Remove gifts or items without description
                if 'engin' in desc:
                    continue

                if 'gjafa' in desc or 'gjafa' in row[0]:
                    continue

                if 'öskju' in desc or 'öskju' in row[0]:
                    continue

                if 'flöskur m/glasi' in desc or 'kútur' in row[0]:
                    continue

                features = self.parse(desc.split(), row[0])
                features['alcohol'] = alc
                collection[row[0]] = features

        average = average / (i + 1)

        with open('beers.avsc', 'r') as fd:
            schema = avro.schema.Parse(fd.read())

        with open('beers.avro', 'wb') as fd:
            writer = DataFileWriter(fd, DatumWriter(), schema)

            denominator_alc = maximum - minimum

            for k, v in collection.items():
                v['bitterness'] = self.BITTERNESS['class'][
                    v['bitterness']] / self.BITTERNESS['maximum']
                v['color'] = self.COLOR['class'][
                    v['color']] / self.COLOR['maximum']
                v['clarity'] = self.CLARITY['class'][
                    v['clarity']] / self.CLARITY['maximum']
                v['sweetness'] = self.SWEETNESS['class'][
                    v['sweetness']] / self.SWEETNESS['maximum']
                v['alcohol'] = (v['alcohol'] - minimum) / denominator_alc
                v['name'] = k
                writer.append(v)
            writer.close()
Example #20
class AvroInference():
    """Class representing a sink of Avro inference data to Apache Kafka.

        Args:
            boostrap_servers (str): List of Kafka brokers
            topic (str): Kafka topic 
            data_scheme_filename (str): Filename of the AVRO scheme for training data
            group_id (str): Group ID of the Kafka consumer. Defaults to sink

    """
    def __init__(self,
                 boostrap_servers,
                 topic,
                 data_scheme_filename,
                 group_id='sink'):

        self.boostrap_servers = boostrap_servers
        self.topic = topic

        self.data_scheme_filename = data_scheme_filename

        self.data_schema = open(self.data_scheme_filename, "r").read()

        self.avro_data_schema = avro.schema.Parse(self.data_schema)
        self.data_writer = DatumWriter(self.avro_data_schema)

        self.data_io = io.BytesIO()
        self.data_encoder = BinaryEncoder(self.data_io)
        self.__producer = KafkaProducer(
            bootstrap_servers=self.boostrap_servers)

    def send(self, data):

        self.data_writer.write(data, self.data_encoder)
        data_bytes = self.data_io.getvalue()

        self.__producer.send(self.topic, data_bytes)

        # Clean the data buffer so the next message starts from an empty stream
        self.data_io.seek(0)
        self.data_io.truncate(0)

    def close(self):
        self.__producer.flush()
        self.__producer.close()
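Illustrative usage of the class above; the broker address, topic name, scheme filename, and record fields are placeholders, not values from the original project.

sink = AvroInference(
    boostrap_servers="localhost:9092",
    topic="inference",
    data_scheme_filename="inference.avsc")

sink.send({"prediction": 0.87})   # keys must match the fields of the Avro scheme
sink.close()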
Example #21
    def check_schema(self, data, schema_path):
        schema = avro.schema.Parse(
            open(schema_path, "rb").read().decode("utf-8"))

        writer = DataFileWriter(open('_test.avro', "wb"), DatumWriter(),
                                schema)

        writer.append(data)
        writer.close()
Example #22
    def _load_datawriter(self):
        try:
            lschema = load_schema(self.schema)
            self.avrofile = open(self.outfile, 'w+b')
            self.datawrite = DataFileWriter(self.avrofile, DatumWriter(), lschema)
        except Exception:
            return False

        return True
Example #23
def _write_items(base_name, schema_str, items):
    avro_schema = schema.Parse(schema_str)
    avro_file = base_name + '.avro'
    with DataFileWriter(open(avro_file, "wb"), DatumWriter(),
                        avro_schema) as writer:
        for i in items:
            writer.append(i)
    # the "with" block already closes the writer (the bare "writer.close" was a no-op)
    return avro_file
Example #24
def _create_avro_file(schema, items, file_prefix):
    _, result_file_path = tempfile.mkstemp(prefix=file_prefix, suffix='.avro')
    parsed_schema = avro.schema.Parse(schema)
    with open(result_file_path, 'wb') as f:
        writer = DataFileWriter(f, DatumWriter(), parsed_schema)
        for s in items:
            writer.append(s)
        writer.close()
    return result_file_path
Example #25
    def _create_or_update_table(
        self,
        data,
        table_name,
        schema_name,
        schema_suffix,
        columns_definition,
        load_strategy,
        upload_call_count,
        database_name=None,
        primary_key=None,
    ):
        # This method doesn't actually create or update a table. It just creates
        # and populates a single .avro file which is used in the data upload.
        # The actual upload happens when the commit() method is called.
        if upload_call_count == 1:
            # Create avro writer and file in temporary folder
            self.avro_folder = TemporaryDirectory()
            self.avro_file_name = self.avro_folder.name + os.sep + table_name + ".avro"
            avro_schema = avro.schema.parse(
                json.dumps({
                    "type": "record",
                    "name": table_name,
                    "namespace": table_name,
                    "fields": [{
                        "name": name,
                        "type": [
                            "null",
                            map_bq_data_type_to_avro(field["data_type"]),
                        ],
                    } for name, field in columns_definition.items()],
                }))
            # Create the avro_writer object to be used going forward
            self.avro_writer = DataFileWriter(open(self.avro_file_name, "wb"),
                                              DatumWriter(), avro_schema)
            # Save the relevant kwargs for later use in the commit() method
            self.table_creation_config = {
                "table_name": table_name,
                "schema_name": schema_name,
                "schema_suffix": schema_suffix,
                "columns_definition": columns_definition,
                "load_strategy": load_strategy,
                "database_name": database_name,
                "primary_key": primary_key,
            }

        self.log.info(
            "BigQuery Uploader writes data into Avro file for later one-off upload!"
        )
        while data:
            # Write records to .avro file
            self.avro_writer.append(data.pop(0))
Example #26
 def serialize_records(records, coin, avro_output=None):
     if avro_output is None:
         avro_output = str(coin) + ".avro"
     transformer = transform_data()
     schema = transformer.parse_schema()
     #avro_output=str(coin) + ".avro"
     with open(avro_output, 'wb') as out:
         # use the writer as a context manager so the file is flushed and closed
         with DataFileWriter(out, DatumWriter(), schema) as writer:
             for record in records:
                 writer.append(record)
Example #27
def test_sanity():
  """

  Ensures that our "base" and "good" schemas are actually forwards- and
  backwards-compatible

  """
  # fst schema / record
  fst_schema = schema.parse(open("%s/MyRecord.base.avsc" % BASE_DIR).read())
  fst_writer = DatumWriter(writers_schema=fst_schema)
  fst_record = {
      "fieldWithoutDefaultValue": 0,
      "properField": 0,
      "enumField": "A",
      "unionField": None,
      "arrayField": ["world"],
      "mapField": {"hello": "world"},
      "fixedField": "aaaaaaaaaaaaaaaa"
  }

  # sec schema / record
  sec_schema = schema.parse(open("%s/MyRecord.good.avsc" % BASE_DIR).read())
  sec_writer = DatumWriter(writers_schema=sec_schema)
  sec_record = {
      "fieldWithoutDefaultValue": 0,
      "properField2": 0,
      "enumField": "B",
      "unionField": None,
      "arrayField": ["world"],
      "fixedField": "bbbbbbbbbbbbbbbb"
  }

  # Encode record w/ fst
  fst_buf = StringIO.StringIO()
  fst_encoder = BinaryEncoder(fst_buf)
  fst_writer.write(fst_record, fst_encoder)
  fst_data = fst_buf.getvalue()

  # Encode record w/ sec
  sec_buf = StringIO.StringIO()
  sec_encoder = BinaryEncoder(sec_buf)
  sec_writer.write(sec_record, sec_encoder)
  sec_data = sec_buf.getvalue()

  # writers == fst, readers == sec
  sec_reader = DatumReader(writers_schema=fst_schema, readers_schema=sec_schema)
  sec_decoder = BinaryDecoder(StringIO.StringIO(fst_data))
  sec_from_fst = sec_reader.read(sec_decoder) # no exception -> good

  # writers == sec, readers == fst
  fst_reader = DatumReader(writers_schema=sec_schema, readers_schema=fst_schema)
  fst_decoder = BinaryDecoder(StringIO.StringIO(sec_data))
  fst_from_sec = fst_reader.read(fst_decoder) # no exception -> good
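The compatibility exercised by this test comes from Avro schema resolution: when the reader's schema adds a field with a default value, data written with the older schema can still be read and the default is filled in. A tiny self-contained illustration (the schemas below are made up, not the MyRecord schemas used above):

import io
import json

import avro.schema
from avro.io import DatumWriter, DatumReader, BinaryEncoder, BinaryDecoder

old = avro.schema.Parse(json.dumps({
    "type": "record", "name": "R",
    "fields": [{"name": "a", "type": "int"}]}))
new = avro.schema.Parse(json.dumps({
    "type": "record", "name": "R",
    "fields": [{"name": "a", "type": "int"},
               {"name": "b", "type": "string", "default": "missing"}]}))

buf = io.BytesIO()
DatumWriter(old).write({"a": 1}, BinaryEncoder(buf))

# DatumReader(writers_schema, readers_schema) resolves old data against the new schema
datum = DatumReader(old, new).read(BinaryDecoder(io.BytesIO(buf.getvalue())))
print(datum)   # -> {'a': 1, 'b': 'missing'}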
Example #28
def put_frame(video_name, video_number, pic):
    writer = DatumWriter(SCHEMA)
    bytes_writer = io.BytesIO()
    encoder = avro.io.BinaryEncoder(bytes_writer)
    writer.write(
        {
            "rtsp": "rtsp",
            "createTime": time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())),
            "videoName": video_name,
            "videoNumber": video_number,
            "picContents": pic
        }, encoder)
    raw_bytes = bytes_writer.getvalue()
    PRODUCER.send_messages(TOPIC, raw_bytes)
Example #29
 def _load_file(self, file_path, schema) -> DataFileWriter:
     f = open(file_path, 'ab+')
     self.cache[file_path] = dict()
     self.cache[file_path]['file_io'] = f
     writer = DataFileWriter(f, DatumWriter(), schema)
     self.cache[file_path]['datum_writer'] = writer
     self.cache.move_to_end(file_path)
     if len(self.cache) > self.capacity:
         self._remove_item()
     return writer
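The _remove_item helper is not shown in the listing; a plausible implementation (hypothetical, matching the cache layout built in _load_file) would close the evicted writer and its file handle:

 def _remove_item(self):
     # Evict the least recently used entry and release its resources
     _, entry = self.cache.popitem(last=False)
     entry['datum_writer'].close()
     entry['file_io'].close()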
Example #30
def serialize_records(records, outpath="funding.avro"):
    schema = parse_schema()
    # with open(outpath, 'wb') as out:
    out = StringIO()  # on Python 3 this must be an io.BytesIO, since Avro output is binary
    writer = DataFileWriter(out, DatumWriter(), schema)
    for record in records:
        record = dict((f, getattr(record, f)) for f in record._fields)
        record['fundedDate'] = record['fundedDate'].strftime('%Y-%m-%dT%H:%M:%S')
        writer.append(record)
    writer.flush()  # flush buffered blocks into the in-memory stream before returning it
    return out
Example #31
def objToBin2():
    buf = io.BytesIO()  # avoid shadowing the built-in "file"
    datum_writer = DatumWriter()
    fwriter = DataFileWriter(buf, datum_writer, sc)
    for d in datum:
        fwriter.append(d)
    fwriter.flush()          # flush pending blocks before reading the buffer
    ab = buf.getvalue()
    fwriter.close()          # closing also closes the underlying BytesIO

    return ab
Example #32
def import_data(schema, src, dest, index, debug):
    global next_update
    global verbose

    index = int(index)
    verbose = int(debug)
    in_file = os.path.join(src, "MLHD_%03d.tar" % index)
    out_file = os.path.join(dest, "MLHD_%03d.avro" % index)
    count = 0
    next_update = time() + UPDATE_INTERVAL

    schema = avro.schema.Parse(open(schema, "rb").read().decode('ascii'))

    with DataFileWriter(open(out_file, "wb"),
                        DatumWriter(),
                        schema,
                        codec='deflate') as writer:
        tar = tarfile.open(in_file)
        total = 0
        chunks = []
        size = 0
        for i, member in enumerate(tar.getnames()):
            count, data = handle_file(member, tar.extractfile(member).read())
            chunks.append(data)
            total += count
            size += len(data)
            if verbose:
                print(
                    "%03d: %d rows processed, %s total rows, %d bytes of output."
                    % (index, count, total, size))
                sys.stdout.flush()

            if size > MAX_SIZE:
                for chunk in chunks:
                    try:
                        for js in chunk:
                            writer.append(js)
                    except Exception as err:
                        print("%03d: err writing file: %s" % (index, err))
                        sys.exit(-1)
                chunks = []
                size = 0

        tar.close()

        if verbose:
            print("%03d: finish writing output file." % index)
            sys.stdout.flush()
        for chunk in chunks:
            try:
                for js in chunk:
                    writer.append(js)
            except Exception as err:
                print("%03d: err writing file: %s" % (index, err))
                sys.exit(-1)
Example #33
def main ():

    # Define schema of avro file.
    schema = avro.schema.Parse(open("logs_uuid.avsc", "rb").read())

    # Create a datum writer (note: unused below, since DataFileWriter creates its own DatumWriter).
    rwriter = DatumWriter(schema)

    files = ['logs_0.txt', 'logs_1.txt', 'logs_2.txt', 'logs_3.txt']
    
    # Loop to process the files 
    for f in files:

        # open file and store in a variable
        logfile = open(f, "r")
        text    = logfile.readlines()
        logfile.close()

        # Set the avro file name (new)
        newfile = str(f).replace('.txt','uuid.avro')

        # Create a data file writer.
        dfwriter = DataFileWriter (open(newfile, "wb"), DatumWriter(), schema)

        # Loop to get information from each line
        for line in text:

            # Get the variables from line.
            sdt, surl, suser = line.strip().split('\t')

            # Defines a dictionary structure
            data = {}
            data['timestamp'] = sdt
            data['url']       = surl
            data['user']      = suser
            data['uuid']      = str(uuid.uuid1())

            # Write the data in the file.
            dfwriter.append (data)

        # Close the file after the loop.
        dfwriter.close()
Example #34
def testWrite(filename, schema):
    fd = open(filename, 'wb')

    datum = DatumWriter()
    writer = DataFileWriter(fd, datum, schema)

    writer.append(makeObject("Person A", 23))
    writer.append(makeObject("Person B", 31))
    writer.append(makeObject("Person C", 28))

    writer.close()
Example #35
	def make_record_set(self, schema_path: str, items: list) -> bytes:
		if schema_path not in self.schemas:
			with open(schema_path, 'rb') as raw:
				self.schemas[schema_path] = avro.schema.Parse(raw.read())
		out = BytesIO()
		writer = DataFileWriter(out, DatumWriter(), self.schemas[schema_path])
		for item in items:
			writer.append(item)
		writer.flush()

		return out.getvalue()
Example #36
def _write_records_to_avro(schema, _d_or_ds, output_file):
    # FIXME. There's only one record being written here,
    # why does this not support a single item
    if isinstance(_d_or_ds, dict):
        _d_or_ds = [_d_or_ds]
    with open(output_file, 'wb') as outs:
        with DataFileWriter(outs, DatumWriter(), schema) as writer:
            for record in _d_or_ds:
                writer.append(record)
    log.debug("Write avro file to {p}".format(p=output_file))
    return _d_or_ds
Example #37
 def __init__(self, context):
     super(AvroWriter, self).__init__(context)
     self.logger = LOGGER.getChild('AvroWriter')
     job_conf = context.job_conf
     part = int(job_conf['mapreduce.task.partition'])
     outdir = job_conf["mapreduce.task.output.dir"]
     outfn = "%s/part-r-%05d.avro" % (outdir, part)
     wh = hdfs.open(outfn, "w")
     self.logger.debug('created hdfs file %s', outfn)
     self.writer = DataFileWriter(wh, DatumWriter(), self.schema)
     self.logger.debug('opened AvroWriter')
Example #38
    def __init__(self,
                 boostrap_servers,
                 topic,
                 data_scheme_filename,
                 group_id='sink'):

        self.boostrap_servers = boostrap_servers
        self.topic = topic

        self.data_scheme_filename = data_scheme_filename

        self.data_schema = open(self.data_scheme_filename, "r").read()

        self.avro_data_schema = avro.schema.Parse(self.data_schema)
        self.data_writer = DatumWriter(self.avro_data_schema)

        self.data_io = io.BytesIO()
        self.data_encoder = BinaryEncoder(self.data_io)
        self.__producer = KafkaProducer(
            bootstrap_servers=self.boostrap_servers)
Example #39
    def write(self, data):
        #Parsing data to select only keys in schema
        store_data = {}
        for key in self.keys:
            if key in data:
                store_data[key] = data[key]
            else:
                store_data[key] = None

        #Serialize data using AVRO
        writer = DatumWriter(self.schema)
        bytes_writer = io.BytesIO()
        encoder = avro.io.BinaryEncoder(bytes_writer)

        writer.write(store_data, encoder)
        raw_bytes = bytes_writer.getvalue()

        #Place into pipeline
        print(data)
        self.producer.send(self.topic, raw_bytes)
Example #40
 def __init__(self, schema_str):
     schema = avro.schema.parse(schema_str)
     self.writer = DatumWriter(schema)
Example #41
 def toKey(self, x, schema):
     buf = io.BytesIO()  # avoid shadowing the built-in "bytes"
     writer = DatumWriter(schema)
     writer.write(x, BinaryEncoder(buf))
     buf.flush()
     return base64.b64encode(buf.getvalue())
Example #42
def serialize(data):
    writer = DatumWriter(schema)
    bytes_writer = io.BytesIO()
    encoder = avro.io.BinaryEncoder(bytes_writer)
    writer.write(data, encoder)
    return bytes_writer.getvalue()
Example #43
 def __init__(self, schema_str):
     if sys.version_info >= (3,):
         schema = avro.schema.Parse(schema_str)
     else:
         schema = avro.schema.parse(schema_str)
     self.writer = DatumWriter(schema)
Example #44
"""Python avro official implementation encoding benchmark."""

from io import BytesIO
from itertools import repeat
from time import time
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter, BinaryEncoder, BinaryDecoder
import sys


LOOPS = 1

with open(sys.argv[1]) as reader:
  datum_reader = DatumReader()
  file_reader = DataFileReader(reader, datum_reader)
  SCHEMA = datum_reader.writers_schema
  RECORDS = list(file_reader)

buf = BytesIO()
datum_writer = DatumWriter(SCHEMA)
start = time()
n = 0
for _ in repeat(None, LOOPS):
  for record in RECORDS:
    buf.seek(0)
    encoder = BinaryEncoder(buf)
    datum_writer.write(record, encoder)
    n += 1
print 1000. * (time() - start) / n
Example #45
 def __init__(self, schema):
     self.schema = schema
     self.datum_writer = DatumWriter(schema)
Example #46
 def createAvroMemoryRecord(data,schema):
     f = StringIO()
     encoder = BinaryEncoder(f)
     writer = DatumWriter(schema)
     writer.write(dict(data),encoder)
     return f.getvalue()
Example #47
#
# NB: the AvroOutputReader specific portion begins here
#

def new_column(name, value):
    column = dict()
    column['name'] = '%s' % name
    column['value'] = '%s' % value
    column['timestamp'] = long(time.time() * 1e6)
    column['ttl'] = 0
    return column

# parse the current avro schema
proto = avro.protocol.parse(open('cassandra.avpr').read())
schema = proto.types_dict['StreamingMutation']
# open an avro encoder and writer for stdout
enc = BinaryEncoder(sys.stdout)
writer = DatumWriter(schema)

# output a series of objects matching 'StreamingMutation' in the Avro interface
smutation = dict()
try:
    for word, count in word2count.iteritems():
        smutation['key'] = word
        smutation['mutation'] = {'column_or_supercolumn': {'column': new_column('count', count)}}
        writer.write(smutation, enc)
finally:
    sys.stdout.flush()

Example #48
# Imports needed by this snippet; the original listing showed only the time
# import (kafka-python is assumed for KafkaProducer).
import io

import avro.io
import avro.schema
from avro.io import DatumWriter
from kafka import KafkaProducer

from time import time


# To send messages synchronously
producer = KafkaProducer(bootstrap_servers = "localhost:9092", compression_type = "gzip")

# Kafka topic
topic = "tnx"

# Path to user.avsc avro schema
schema_path = "/home/cloudera/workspace/kafka-clients-python/transactions.avsc"
schema = avro.schema.Parse(open(schema_path).read())
print("Schema", schema.to_json())

writer = DatumWriter(schema)
bytes_writer = io.BytesIO()
encoder = avro.io.BinaryEncoder(bytes_writer)

def get_record():
    return {"id": "123"
            , "merchant_id": "m123"
            , "customer_id": "c345"
            , "amount": 100.1
            , "category": "pos"
            , "timestamp": int(time())}


for i in range(10):
    record = get_record()
    writer.write(record, encoder)
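    # Added sketch, not in the original snippet: publish each encoded record
    # and reset the shared buffer before the next iteration.
    raw_bytes = bytes_writer.getvalue()
    producer.send(topic, raw_bytes)
    bytes_writer.seek(0)
    bytes_writer.truncate(0)

producer.flush()   # block until all buffered Kafka messages have been sent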