Code example #1
def get_schema_from_registry(schemaid, schemaversion, restapi, apikey, cert):
    url=urljoin(restapi, "/schemas/" + schemaid + "/versions/" + schemaversion)
    headers={ "Accept":"application/vnd.apache.avro+json" }

    r = requests.get(url, auth=("token", apikey), headers=headers, verify=cert)
    if r.status_code == 200:
        return Parse(r.text)
    else:
        raise Exception(r.text)
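
The registry snippet above omits its imports; a minimal, hypothetical call might look like the sketch below (the registry URL, schema id and version, API key, and CA-bundle path are placeholders, not values from the original project):

from urllib.parse import urljoin

import requests
from avro.schema import Parse

# Placeholder arguments for illustration only.
schema = get_schema_from_registry(
    schemaid="my-topic-value",
    schemaversion="1",
    restapi="https://schema-registry.example.com/",
    apikey="REDACTED",
    cert="/path/to/ca-bundle.pem",
)
print(schema.name)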
Code example #2
File: avro.py  Project: ohemelaar/tonga
 def _load_schema_from_file(self, file_path: str) -> None:
     with open(file_path, 'r') as fd:
         for s in yaml.load_all(fd, Loader=FullLoader):
             avro_schema_data = json.dumps(s)
             avro_schema = Parse(avro_schema_data)
             schema_name = avro_schema.namespace + '.' + avro_schema.name
             if schema_name in self._schemas:
                 raise AvroAlreadyRegister
             self._schemas[schema_name] = avro_schema
Code example #3
 def __init__(self, block_bytes, num_records, codec, schema_string,
              offset, size):
   # Decompress data early on (if needed) and thus decrease the number of
   # parallel copies of the data in memory at any given time during block
   # iteration.
   self._decompressed_block_bytes = self._decompress_bytes(block_bytes, codec)
   self._num_records = num_records
   self._schema = Parse(schema_string)
   self._offset = offset
   self._size = size
Code example #4
def create_athena_schema_from_avro(avro_schema_literal: str) -> str:
    avro_schema: RecordSchema = Parse(avro_schema_literal)

    column_schemas = []
    for field in avro_schema.fields:
        column_name = field.name.lower()
        column_type = create_athena_column_schema(field.type)
        column_schemas.append(f"`{column_name}` {column_type}")

    return ', '.join(column_schemas)
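
The helper create_athena_column_schema is not shown in the snippet. A self-contained sketch of the same idea, using a deliberately simplified Avro-to-Athena type mapping (an assumption; the real helper presumably also handles unions, logical types, and nested records), could look like this:

import json
from avro.schema import Parse

# Simplified primitive-type mapping, for illustration only.
_AVRO_TO_ATHENA = {
    "string": "string", "int": "int", "long": "bigint",
    "float": "float", "double": "double", "boolean": "boolean",
}

schema = Parse(json.dumps({
    "type": "record", "name": "User", "namespace": "example.avro",
    "fields": [
        {"name": "Id", "type": "long"},
        {"name": "Email", "type": "string"},
    ],
}))

columns = ", ".join(
    f"`{field.name.lower()}` {_AVRO_TO_ATHENA[field.type.type]}"
    for field in schema.fields)
print(columns)  # `id` bigint, `email` string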
Code example #5
 def _extract_schema_from_file(self, file_path: str) -> None:
     with open(file_path, 'r') as f:
         for s in yaml.load_all(f):
             avro_schema_data = json.dumps(s)
             avro_schema = Parse(avro_schema_data)
             schema_name = avro_schema.namespace + '.' + avro_schema.name
             if schema_name in self._schemas:
                 raise Exception(
                     f"Avro schema {schema_name} was defined more than once!")
             self._schemas[schema_name] = avro_schema
Code example #6
def deserialize(flight_info_bytes):
    if flight_info_bytes is not None:
        bytes_reader = BytesIO(flight_info_bytes)
        decoder = BinaryDecoder(bytes_reader)
        schema_flight_info = Parse(
            open(dir_path + "/flight-info.schema.avsc", "rb").read())
        reader = DatumReader(schema_flight_info)
        flight_info = reader.read(decoder)
        return [{"id": "1"}, {"id": "2"}]
    else:
        return None
Code example #7
File: stream.py  Project: xxsacxx/bigdata-playground
def deserialize(flight_info_bytes):
    if flight_info_bytes is not None:
        bytes_reader = BytesIO(flight_info_bytes)
        decoder = BinaryDecoder(bytes_reader)
        schema_flight_info = Parse(
            open(dir_path + "/flight-info.schema.avsc", "rb").read())
        reader = DatumReader(schema_flight_info)
        flight_info = reader.read(decoder)

        return json.dumps([{"id": 907955534287978496}])
    else:
        return None
Code example #8
File: datatype_inference.py  Project: xsm110/Beam15.0
def infer_avro_schema(data, use_fastavro=False):
    """For internal use only; no backwards-compatibility guarantees.

  Infer avro schema for tabular data.

  Args:
    data (List[dict]): A list of dictionaries representing rows in a table.
    use_fastavro (bool): A flag indicating whether the schema should be
        constructed using fastavro.

  Returns:
    An avro schema object.
  """
    _typehint_to_avro_type = {
        type(None): "null",
        int: "int",
        float: "double",
        str: "string",
        unicode: "string",
        bytes: "bytes",
        np.ndarray: "bytes",
        array.array: "bytes",
    }

    def typehint_to_avro_type(value):
        if isinstance(value, typehints.UnionConstraint):
            return sorted(
                typehint_to_avro_type(union_type)
                for union_type in value.union_types)
        else:
            return _typehint_to_avro_type[value]

    column_types = infer_typehints_schema(data)
    avro_fields = [{
        "name": str(key),
        "type": typehint_to_avro_type(value)
    } for key, value in column_types.items()]
    schema_dict = {
        "namespace": "example.avro",
        "name": "User",
        "type": "record",
        "fields": avro_fields
    }
    if use_fastavro:
        from fastavro import parse_schema
        return parse_schema(schema_dict)
    else:
        return Parse(json.dumps(schema_dict))
Code example #9
def create_athena_schema_from_avro(
        avro_schema_literal: str,
        partitions: List[str] = []) -> Tuple[str, str]:
    avro_schema: RecordSchema = Parse(avro_schema_literal)

    column_schemas = []
    partitions_schemas = []
    for field in avro_schema.fields:
        column_name = field.name.lower()
        column_type = create_athena_column_schema(field.type)

        if column_name in partitions:
            partitions_schemas.append(f"`{column_name}` {column_type}")
        else:
            column_schemas.append(f"`{column_name}` {column_type}")

    return ', \n'.join(column_schemas), ', '.join(partitions_schemas)
Code example #10
def consume(brokers: str, topic: str):

    value_schema = Parse(dumps({
        "type": "record",
        "namespace": "example.avro", #VERY IMPORTANT TO MAP MESSAGE TO JAVA OBJECT
        "name": "test_record",
        "fields": [
            {"name": "id", "type": "int"},
            {"name": "date", "type": ["int", "null"]},
            {"name": "info", "type": "string"}
        ]
    }))

    consumer_settings = {
        'bootstrap.servers': brokers,
        'group.id': 'raw_1',
        'client.id': 'client-1',
        'enable.auto.commit': True,
        'session.timeout.ms': 6000,
        'default.topic.config': {
            'auto.offset.reset': 'smallest'
        }
        #,'debug': 'consumer' #activate debug on consumer side
    }
    consumer = Consumer(consumer_settings)
    consumer.subscribe([topic])

    try:
        while True:
            msg = consumer.poll(10000.0)
            if msg is None:
                continue
            elif msg.error():
                print("Consumer error: {}".format(msg.error()))
            else:
                deserialized_value = avro_deserialization(value_schema, msg.value())
                print("Message Consumed: key = {} value = {}".format(
                    json_deserialization(msg.key()), deserialized_value))
    except Exception as e:
        print(e)
    finally:
        consumer.close()
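
avro_deserialization and json_deserialization are project helpers that are not shown here. A minimal sketch of the Avro half, assuming the payload is raw Avro binary (no Confluent wire-format framing), could be:

from io import BytesIO
from avro.io import BinaryDecoder, DatumReader

def avro_deserialization(schema, payload):
    # Decode a raw Avro binary payload against the given parsed schema.
    return DatumReader(schema).read(BinaryDecoder(BytesIO(payload)))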
Code example #11
    def _load_schema_from_file(self, file_path: str) -> None:
        """ AvroSerializer internal function, he was call by _scan_schema_folder for load schema file

        Args:
            file_path: Path to schema

        Raises:
            AvroAlreadyRegister: Raised when the Avro schema is already registered

        Returns:
            None
        """
        with open(file_path, 'r') as fd:
            for s in load_all(fd, Loader=FullLoader):
                avro_schema_data = json.dumps(s)
                avro_schema = Parse(avro_schema_data)
                schema_name = avro_schema.namespace + '.' + avro_schema.name
                if schema_name in self._schemas:
                    raise AvroAlreadyRegister
                self._schemas[schema_name] = avro_schema
Code example #12
def produce(brokers: str, topic: str):

    value_schema = Parse(dumps({
        "type": "record",
        "namespace": "example.avro",  # VERY IMPORTANT TO MAP MESSAGE TO JAVA OBJECT
        "name": "test_record",
        "fields": [
            {"name": "id", "type": "int"},
            {"name": "date", "type": ["int", "null"]},
            {"name": "info", "type": "string"}
        ]
    }))

    producer_settings = {'bootstrap.servers': brokers, 'group.id': 'groupid'}
    producer = Producer(producer_settings)

    i = 0
    while True:
        sleep(1)
        key = "message_key_" + str(i)
        value = {
            "id": i,
            "date": 100000 * (2 + i**3),
            "info": "sensor_" + str(i)
        }
        print("Message Produced: key = {} value = {}".format(key, value))
        producer.produce(topic=topic,
                         key=key,
                         value=avro_serialization(value_schema, value))
        i += 1
    producer.flush()
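
The complementary avro_serialization helper is likewise not shown. Under the same assumption (raw Avro binary, no Confluent framing), a sketch could be:

from io import BytesIO
from avro.io import BinaryEncoder, DatumWriter

def avro_serialization(schema, value):
    # Encode a dict as raw Avro binary using the given parsed schema.
    buffer = BytesIO()
    DatumWriter(schema).write(value, BinaryEncoder(buffer))
    return buffer.getvalue()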
Code example #13
def get_schema_from_avsc_file(avsc_filename):
    """
    Purpose:
        Get the file schema from an .avsc filename (with path in the filename)
    Args:
        avsc_filename (String): Path/filename of the .avsc file to get the schema from
    Return:
        avro_schema (AVRO Schema Object): Schema object from the avro library
    """
    logging.info(f"Getting AVRO Schema from {avsc_filename}")

    if not os.path.isfile(avsc_filename):
        raise AvscNotFound(f"{avsc_filename} not found")

    avro_schema = None
    try:
        with open(avsc_filename) as avsc_file_obj:
            avro_schema = Parse(avsc_file_obj.read())
    except Exception as err:
        error_msg = f"Error Reading {avsc_filename} into Schema: {err}"
        logging.exception(error_msg)
        raise AvscInvalid(error_msg)

    return avro_schema
Code example #14
File: fastavro_it_test.py  Project: l2pg/beam_moremmr
class FastavroIT(unittest.TestCase):

    SCHEMA = Parse('''
    {"namespace": "example.avro",
     "type": "record",
     "name": "User",
     "fields": [
         {"name": "label", "type": "string"},
         {"name": "number",  "type": ["int", "null"]},
         {"name": "number_str", "type": ["string", "null"]},
         {"name": "color", "type": ["string", "null"]}
     ]
    }
    ''')

    def setUp(self):
        self.test_pipeline = TestPipeline(is_integration_test=True)
        self.uuid = str(uuid.uuid4())
        self.output = '/'.join(
            [self.test_pipeline.get_option('output'), self.uuid])

    @attr('IT')
    def test_avro_it(self):
        num_records = self.test_pipeline.get_option('records')
        num_records = int(num_records) if num_records else 1000000

        # Seed a `PCollection` with indices that will each be FlatMap'd into
        # `batch_size` records, to avoid having a too-large list in memory at
        # the outset
        batch_size = self.test_pipeline.get_option('batch-size')
        batch_size = int(batch_size) if batch_size else 10000

        # pylint: disable=range-builtin-not-iterating
        batches = range(int(num_records / batch_size))

        def batch_indices(start):
            # pylint: disable=range-builtin-not-iterating
            return range(start * batch_size, (start + 1) * batch_size)

        # A `PCollection` with `num_records` avro records
        records_pcoll = \
            self.test_pipeline \
            | 'create-batches' >> Create(batches) \
            | 'expand-batches' >> FlatMap(batch_indices) \
            | 'create-records' >> Map(record)

        fastavro_output = '/'.join([self.output, 'fastavro'])
        avro_output = '/'.join([self.output, 'avro'])

        self.addCleanup(delete_files, [self.output + '*'])

        # pylint: disable=expression-not-assigned
        records_pcoll \
        | 'write_fastavro' >> WriteToAvro(
            fastavro_output,
            self.SCHEMA,
            use_fastavro=True
        )

        # pylint: disable=expression-not-assigned
        records_pcoll \
        | 'write_avro' >> WriteToAvro(
            avro_output,
            self.SCHEMA,
            use_fastavro=False
        )

        result = self.test_pipeline.run()
        result.wait_until_finish()
        assert result.state == PipelineState.DONE

        fastavro_read_pipeline = TestPipeline(is_integration_test=True)

        fastavro_records = \
            fastavro_read_pipeline \
            | 'create-fastavro' >> Create(['%s*' % fastavro_output]) \
            | 'read-fastavro' >> ReadAllFromAvro(use_fastavro=True) \
            | Map(lambda rec: (rec['number'], rec))

        avro_records = \
            fastavro_read_pipeline \
            | 'create-avro' >> Create(['%s*' % avro_output]) \
            | 'read-avro' >> ReadAllFromAvro(use_fastavro=False) \
            | Map(lambda rec: (rec['number'], rec))

        def check(elem):
            v = elem[1]

            def assertEqual(l, r):
                if l != r:
                    raise BeamAssertException('Assertion failed: %s == %s' %
                                              (l, r))

            assertEqual(v.keys(), ['avro', 'fastavro'])
            avro_values = v['avro']
            fastavro_values = v['fastavro']
            assertEqual(avro_values, fastavro_values)
            assertEqual(len(avro_values), 1)

        # pylint: disable=expression-not-assigned
        {
            'avro': avro_records,
            'fastavro': fastavro_records
        } \
        | CoGroupByKey() \
        | Map(check)

        fastavro_read_pipeline.run().wait_until_finish()
        assert result.state == PipelineState.DONE
Code example #15
File: fastavro_it_test.py  Project: wanwanzhu/beam
    def test_avro_it(self):
        num_records = self.test_pipeline.get_option('records')
        num_records = int(num_records) if num_records else 1000000

        # Seed a `PCollection` with indices that will each be FlatMap'd into
        # `batch_size` records, to avoid having a too-large list in memory at
        # the outset
        batch_size = self.test_pipeline.get_option('batch-size')
        batch_size = int(batch_size) if batch_size else 10000

        # pylint: disable=range-builtin-not-iterating
        batches = range(int(num_records / batch_size))

        def batch_indices(start):
            # pylint: disable=range-builtin-not-iterating
            return range(start * batch_size, (start + 1) * batch_size)

        # A `PCollection` with `num_records` avro records
        records_pcoll = \
            self.test_pipeline \
            | 'create-batches' >> Create(batches) \
            | 'expand-batches' >> FlatMap(batch_indices) \
            | 'create-records' >> Map(record)

        fastavro_output = '/'.join([self.output, 'fastavro'])
        avro_output = '/'.join([self.output, 'avro'])

        # pylint: disable=expression-not-assigned
        records_pcoll \
        | 'write_fastavro' >> WriteToAvro(
            fastavro_output,
            parse_schema(json.loads(self.SCHEMA_STRING)),
            use_fastavro=True
        )

        # pylint: disable=expression-not-assigned
        records_pcoll \
        | 'write_avro' >> WriteToAvro(
            avro_output,
            Parse(self.SCHEMA_STRING),
            use_fastavro=False
        )

        result = self.test_pipeline.run()
        result.wait_until_finish()
        assert result.state == PipelineState.DONE

        with TestPipeline(is_integration_test=True) as fastavro_read_pipeline:

            fastavro_records = \
                fastavro_read_pipeline \
                | 'create-fastavro' >> Create(['%s*' % fastavro_output]) \
                | 'read-fastavro' >> ReadAllFromAvro(use_fastavro=True) \
                | Map(lambda rec: (rec['number'], rec))

            avro_records = \
                fastavro_read_pipeline \
                | 'create-avro' >> Create(['%s*' % avro_output]) \
                | 'read-avro' >> ReadAllFromAvro(use_fastavro=False) \
                | Map(lambda rec: (rec['number'], rec))

            def check(elem):
                v = elem[1]

                def assertEqual(l, r):
                    if l != r:
                        raise BeamAssertException(
                            'Assertion failed: %s == %s' % (l, r))

                assertEqual(v.keys(), ['avro', 'fastavro'])
                avro_values = v['avro']
                fastavro_values = v['fastavro']
                assertEqual(avro_values, fastavro_values)
                assertEqual(len(avro_values), 1)

            # pylint: disable=expression-not-assigned
            {
                'avro': avro_records,
                'fastavro': fastavro_records
            } \
            | CoGroupByKey() \
            | Map(check)

            self.addCleanup(delete_files, [self.output])
        assert result.state == PipelineState.DONE
Code example #16
 def __init__(self, methodName='runTest'):
   super(TestAvro, self).__init__(methodName)
   self.use_fastavro = False
   self.SCHEMA = Parse(self.SCHEMA_STRING)
Code example #17
File: avro_bitcoin.py  Project: joshuawinter/beamer
            "block_id": elem["block_id"],
            "previous_block": elem["previous_block"],
            "num_inputs": num_inputs,
            "num_outputs": num_outputs,
            "sum_output": total,
        }]


SCHEMA = Parse('''
  {
    "namespace": "example.avro",
    "type": "record",
    "name": "Transaction",
    "fields": [
      {"name": "transaction_id", "type": "string"},
      {"name": "timestamp", "type": "long"},
      {"name": "block_id", "type": "string"},
      {"name": "previous_block", "type": "string"},
      {"name": "num_inputs", "type": "int"},
      {"name": "num_outputs", "type": "int"},
      {"name": "sum_output", "type": "long"}
    ]
  }
  ''')


def run(argv=None):
    """Test Avro IO (backed by fastavro or Apache Avro) on a simple pipeline
  that transforms bitcoin transactions"""
    parser = argparse.ArgumentParser()
    parser.add_argument('--input',
                        dest='input',
Code example #18
    def test_convert_bigquery_schema_to_avro_schema(self):
        subfields = [
            bigquery.TableFieldSchema(name="species",
                                      type="STRING",
                                      mode="NULLABLE"),
        ]

        fields = [
            bigquery.TableFieldSchema(name="number",
                                      type="INTEGER",
                                      mode="REQUIRED"),
            bigquery.TableFieldSchema(name="species",
                                      type="STRING",
                                      mode="NULLABLE"),
            bigquery.TableFieldSchema(name="quality",
                                      type="FLOAT"),  # default to NULLABLE
            bigquery.TableFieldSchema(name="quantity",
                                      type="INTEGER"),  # default to NULLABLE
            bigquery.TableFieldSchema(name="birthday",
                                      type="TIMESTAMP",
                                      mode="NULLABLE"),
            bigquery.TableFieldSchema(name="birthdayMoney",
                                      type="NUMERIC",
                                      mode="NULLABLE"),
            bigquery.TableFieldSchema(name="flighted",
                                      type="BOOL",
                                      mode="NULLABLE"),
            bigquery.TableFieldSchema(name="flighted2",
                                      type="BOOLEAN",
                                      mode="NULLABLE"),
            bigquery.TableFieldSchema(name="sound",
                                      type="BYTES",
                                      mode="NULLABLE"),
            bigquery.TableFieldSchema(name="anniversaryDate",
                                      type="DATE",
                                      mode="NULLABLE"),
            bigquery.TableFieldSchema(name="anniversaryDatetime",
                                      type="DATETIME",
                                      mode="NULLABLE"),
            bigquery.TableFieldSchema(name="anniversaryTime",
                                      type="TIME",
                                      mode="NULLABLE"),
            bigquery.TableFieldSchema(name="scion",
                                      type="RECORD",
                                      mode="NULLABLE",
                                      fields=subfields),
            bigquery.TableFieldSchema(name="associates",
                                      type="RECORD",
                                      mode="REPEATED",
                                      fields=subfields),
            bigquery.TableFieldSchema(name="geoPositions",
                                      type="GEOGRAPHY",
                                      mode="NULLABLE"),
        ]

        table_schema = bigquery.TableSchema(fields=fields)
        avro_schema = bigquery_avro_tools.get_record_schema_from_dict_table_schema(
            "root", bigquery_tools.get_dict_table_schema(table_schema))

        # Test that schema can be parsed correctly by fastavro
        fastavro.parse_schema(avro_schema)

        # Test that schema can be parsed correctly by avro
        parsed_schema = Parse(json.dumps(avro_schema))
        # Avro RecordSchema provides field_map in py3 and fields_dict in py2
        field_map = getattr(parsed_schema, "field_map", None) or \
          getattr(parsed_schema, "fields_dict", None)

        self.assertEqual(field_map["number"].type, Parse(json.dumps("long")))
        self.assertEqual(field_map["species"].type,
                         Parse(json.dumps(["null", "string"])))
        self.assertEqual(field_map["quality"].type,
                         Parse(json.dumps(["null", "double"])))
        self.assertEqual(field_map["quantity"].type,
                         Parse(json.dumps(["null", "long"])))
        self.assertEqual(
            field_map["birthday"].type,
            Parse(
                json.dumps([
                    "null", {
                        "type": "long",
                        "logicalType": "timestamp-micros"
                    }
                ])))
        self.assertEqual(
            field_map["birthdayMoney"].type,
            Parse(
                json.dumps([
                    "null", {
                        "type": "bytes",
                        "logicalType": "decimal",
                        "precision": 38,
                        "scale": 9
                    }
                ])))
        self.assertEqual(field_map["flighted"].type,
                         Parse(json.dumps(["null", "boolean"])))
        self.assertEqual(field_map["flighted2"].type,
                         Parse(json.dumps(["null", "boolean"])))
        self.assertEqual(field_map["sound"].type,
                         Parse(json.dumps(["null", "bytes"])))
        self.assertEqual(
            field_map["anniversaryDate"].type,
            Parse(json.dumps(["null", {
                "type": "int",
                "logicalType": "date"
            }])))
        self.assertEqual(field_map["anniversaryDatetime"].type,
                         Parse(json.dumps(["null", "string"])))
        self.assertEqual(
            field_map["anniversaryTime"].type,
            Parse(
                json.dumps(
                    ["null", {
                        "type": "long",
                        "logicalType": "time-micros"
                    }])))
        self.assertEqual(field_map["geoPositions"].type,
                         Parse(json.dumps(["null", "string"])))

        self.assertEqual(
            field_map["scion"].type,
            Parse(json.dumps([
                "null",
                {
                    "type": "record",
                    "name": "scion",
                    "fields": [
                        {"type": ["null", "string"], "name": "species"},
                    ],
                    "doc": "Translated Avro Schema for scion",
                    "namespace": "apache_beam.io.gcp.bigquery.root.scion",
                },
            ])))

        self.assertEqual(
            field_map["associates"].type,
            Parse(json.dumps({
                "type": "array",
                "items": {
                    "type": "record",
                    "name": "associates",
                    "fields": [
                        {"type": ["null", "string"], "name": "species"},
                    ],
                    "doc": "Translated Avro Schema for associates",
                    "namespace": "apache_beam.io.gcp.bigquery.root.associates",
                }
            })))
Code example #19
File: main.py  Project: robidev/rsyslog_avro
import os  # used by os.getenv below; the import is missing from the original snippet

from io import BytesIO

from typing import List

from abr.adapters.kafka import (KafkaOutputStream, KafkaConfiguration)

from pyspark.sql.types import (BinaryType, StringType, StructType, StructField)

from avro.io import (DatumWriter, BinaryEncoder)

from avro.schema import Parse

from abr.adapters.base import INFO

# BaseModel is referenced below but its import is not shown here; pydantic is assumed.
from pydantic import BaseModel

SYSLOG_AVRO_SCHEMA = Parse(open(os.getenv("SYSLOG_AVRO_SCHEMA"), "r").read())

SPARK_SCHEMA = StructType(
    [StructField('topic', StringType()),
     StructField('value', BinaryType())])

# --- LOAD CONFIGURATION ---
KAFKA_CONFIG = KafkaConfiguration.load()
TOPIC = os.getenv("TOPIC")


class MessageModel(BaseModel):
    """
    Represents a message
    """
    syslogmessage: str
Code example #20
File: avroio_test.py  Project: yuhonghong7035/beam
class TestAvro(unittest.TestCase):

  _temp_files = []

  def __init__(self, methodName='runTest'):
    super(TestAvro, self).__init__(methodName)
    self.use_fastavro = False

  @classmethod
  def setUpClass(cls):
    # Method has been renamed in Python 3
    if sys.version_info[0] < 3:
      cls.assertCountEqual = cls.assertItemsEqual

  def setUp(self):
    # Reduce the size of thread pools. Without this, test execution may fail in
    # environments with a limited amount of resources.
    filebasedsource.MAX_NUM_THREADS_FOR_SIZE_ESTIMATION = 2

  def tearDown(self):
    for path in self._temp_files:
      if os.path.exists(path):
        os.remove(path)
    self._temp_files = []

  RECORDS = [
      {'name': 'Thomas', 'favorite_number': 1, 'favorite_color': 'blue'},
      {'name': 'Henry', 'favorite_number': 3, 'favorite_color': 'green'},
      {'name': 'Toby', 'favorite_number': 7, 'favorite_color': 'brown'},
      {'name': 'Gordon', 'favorite_number': 4, 'favorite_color': 'blue'},
      {'name': 'Emily', 'favorite_number': -1, 'favorite_color': 'Red'},
      {'name': 'Percy', 'favorite_number': 6, 'favorite_color': 'Green'}]

  SCHEMA = Parse('''
  {"namespace": "example.avro",
   "type": "record",
   "name": "User",
   "fields": [
       {"name": "name", "type": "string"},
       {"name": "favorite_number",  "type": ["int", "null"]},
       {"name": "favorite_color", "type": ["string", "null"]}
   ]
  }
  ''')

  def _write_data(self,
                  directory=None,
                  prefix=tempfile.template,
                  codec='null',
                  count=len(RECORDS)):

    with tempfile.NamedTemporaryFile(
        delete=False, dir=directory, prefix=prefix) as f:
      writer = DataFileWriter(f, DatumWriter(), self.SCHEMA, codec=codec)
      len_records = len(self.RECORDS)
      for i in range(count):
        writer.append(self.RECORDS[i % len_records])
      writer.close()

      self._temp_files.append(f.name)
      return f.name

  def _write_pattern(self, num_files):
    assert num_files > 0
    temp_dir = tempfile.mkdtemp()

    file_name = None
    for _ in range(num_files):
      file_name = self._write_data(directory=temp_dir, prefix='mytemp')

    assert file_name
    file_name_prefix = file_name[:file_name.rfind(os.path.sep)]
    return file_name_prefix + os.path.sep + 'mytemp*'

  def _run_avro_test(self, pattern, desired_bundle_size, perform_splitting,
                     expected_result):
    source = _create_avro_source(pattern, use_fastavro=self.use_fastavro)

    read_records = []
    if perform_splitting:
      assert desired_bundle_size
      splits = [
          split
          for split in source.split(desired_bundle_size=desired_bundle_size)
      ]
      if len(splits) < 2:
        raise ValueError('Test is trivial. Please adjust it so that at least '
                         'two splits get generated')

      sources_info = [
          (split.source, split.start_position, split.stop_position)
          for split in splits
      ]
      source_test_utils.assert_sources_equal_reference_source(
          (source, None, None), sources_info)
    else:
      read_records = source_test_utils.read_from_source(source, None, None)
      self.assertCountEqual(expected_result, read_records)

  def test_read_without_splitting(self):
    file_name = self._write_data()
    expected_result = self.RECORDS
    self._run_avro_test(file_name, None, False, expected_result)

  def test_read_with_splitting(self):
    file_name = self._write_data()
    expected_result = self.RECORDS
    self._run_avro_test(file_name, 100, True, expected_result)

  def test_source_display_data(self):
    file_name = 'some_avro_source'
    source = \
        _create_avro_source(
            file_name,
            validate=False,
            use_fastavro=self.use_fastavro
        )
    dd = DisplayData.create_from(source)

    # No extra avro parameters for AvroSource.
    expected_items = [
        DisplayDataItemMatcher('compression', 'auto'),
        DisplayDataItemMatcher('file_pattern', file_name)]
    hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))

  def test_read_display_data(self):
    file_name = 'some_avro_source'
    read = \
        avroio.ReadFromAvro(
            file_name,
            validate=False,
            use_fastavro=self.use_fastavro)
    dd = DisplayData.create_from(read)

    # No extra avro parameters for AvroSource.
    expected_items = [
        DisplayDataItemMatcher('compression', 'auto'),
        DisplayDataItemMatcher('file_pattern', file_name)]
    hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))

  def test_sink_display_data(self):
    file_name = 'some_avro_sink'
    sink = _create_avro_sink(
        file_name,
        self.SCHEMA,
        'null',
        '.end',
        0,
        None,
        'application/x-avro',
        use_fastavro=self.use_fastavro)
    dd = DisplayData.create_from(sink)
    expected_items = [
        DisplayDataItemMatcher(
            'schema',
            str(self.SCHEMA)),
        DisplayDataItemMatcher(
            'file_pattern',
            'some_avro_sink-%(shard_num)05d-of-%(num_shards)05d.end'),
        DisplayDataItemMatcher(
            'codec',
            'null'),
        DisplayDataItemMatcher(
            'compression',
            'uncompressed')]
    hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))

  def test_write_display_data(self):
    file_name = 'some_avro_sink'
    write = avroio.WriteToAvro(file_name,
                               self.SCHEMA,
                               use_fastavro=self.use_fastavro)
    dd = DisplayData.create_from(write)
    expected_items = [
        DisplayDataItemMatcher(
            'schema',
            str(self.SCHEMA)),
        DisplayDataItemMatcher(
            'file_pattern',
            'some_avro_sink-%(shard_num)05d-of-%(num_shards)05d'),
        DisplayDataItemMatcher(
            'codec',
            'deflate'),
        DisplayDataItemMatcher(
            'compression',
            'uncompressed')]
    hc.assert_that(dd.items, hc.contains_inanyorder(*expected_items))

  def test_read_reentrant_without_splitting(self):
    file_name = self._write_data()
    source = _create_avro_source(file_name, use_fastavro=self.use_fastavro)
    source_test_utils.assert_reentrant_reads_succeed((source, None, None))

  def test_read_reentrant_with_splitting(self):
    file_name = self._write_data()
    source = _create_avro_source(file_name, use_fastavro=self.use_fastavro)
    splits = [
        split for split in source.split(desired_bundle_size=100000)]
    assert len(splits) == 1
    source_test_utils.assert_reentrant_reads_succeed(
        (splits[0].source, splits[0].start_position, splits[0].stop_position))

  def test_read_without_splitting_multiple_blocks(self):
    file_name = self._write_data(count=12000)
    expected_result = self.RECORDS * 2000
    self._run_avro_test(file_name, None, False, expected_result)

  def test_read_with_splitting_multiple_blocks(self):
    file_name = self._write_data(count=12000)
    expected_result = self.RECORDS * 2000
    self._run_avro_test(file_name, 10000, True, expected_result)

  def test_split_points(self):
    file_name = self._write_data(count=12000)
    source = _create_avro_source(file_name, use_fastavro=self.use_fastavro)

    splits = [
        split
        for split in source.split(desired_bundle_size=float('inf'))
    ]
    assert len(splits) == 1

    range_tracker = splits[0].source.get_range_tracker(
        splits[0].start_position, splits[0].stop_position)

    split_points_report = []

    for _ in splits[0].source.read(range_tracker):
      split_points_report.append(range_tracker.split_points())

    # There are a total of three blocks. Each block has more than 10 records.

    # When reading records of the first block, range_tracker.split_points()
    # should return (0, iobase.RangeTracker.SPLIT_POINTS_UNKNOWN)
    self.assertEquals(
        split_points_report[:10],
        [(0, iobase.RangeTracker.SPLIT_POINTS_UNKNOWN)] * 10)

    # When reading records of last block, range_tracker.split_points() should
    # return (2, 1)
    self.assertEquals(split_points_report[-10:], [(2, 1)] * 10)

  def test_read_without_splitting_compressed_deflate(self):
    file_name = self._write_data(codec='deflate')
    expected_result = self.RECORDS
    self._run_avro_test(file_name, None, False, expected_result)

  def test_read_with_splitting_compressed_deflate(self):
    file_name = self._write_data(codec='deflate')
    expected_result = self.RECORDS
    self._run_avro_test(file_name, 100, True, expected_result)

  @unittest.skipIf(snappy is None, 'python-snappy not installed.')
  def test_read_without_splitting_compressed_snappy(self):
    file_name = self._write_data(codec='snappy')
    expected_result = self.RECORDS
    self._run_avro_test(file_name, None, False, expected_result)

  @unittest.skipIf(snappy is None, 'python-snappy not installed.')
  def test_read_with_splitting_compressed_snappy(self):
    file_name = self._write_data(codec='snappy')
    expected_result = self.RECORDS
    self._run_avro_test(file_name, 100, True, expected_result)

  def test_read_without_splitting_pattern(self):
    pattern = self._write_pattern(3)
    expected_result = self.RECORDS * 3
    self._run_avro_test(pattern, None, False, expected_result)

  def test_read_with_splitting_pattern(self):
    pattern = self._write_pattern(3)
    expected_result = self.RECORDS * 3
    self._run_avro_test(pattern, 100, True, expected_result)

  def test_dynamic_work_rebalancing_exhaustive(self):
    # Adjusting block size so that we can perform a exhaustive dynamic
    # work rebalancing test that completes within an acceptable amount of time.
    old_sync_interval = avro.datafile.SYNC_INTERVAL
    try:
      avro.datafile.SYNC_INTERVAL = 2
      file_name = self._write_data(count=5)
      source = _create_avro_source(file_name, use_fastavro=self.use_fastavro)
      splits = [split
                for split in source.split(desired_bundle_size=float('inf'))]
      assert len(splits) == 1
      source_test_utils.assert_split_at_fraction_exhaustive(splits[0].source)
    finally:
      avro.datafile.SYNC_INTERVAL = old_sync_interval

  def test_corrupted_file(self):
    file_name = self._write_data()
    with open(file_name, 'rb') as f:
      data = f.read()

    # Corrupt the last character of the file which is also the last character of
    # the last sync_marker.
    last_char_index = len(data) - 1
    corrupted_data = data[:last_char_index]
    corrupted_data += b'A' if data[last_char_index] == b'B' else b'B'
    with tempfile.NamedTemporaryFile(
        delete=False, prefix=tempfile.template) as f:
      f.write(corrupted_data)
      corrupted_file_name = f.name

    source = _create_avro_source(
        corrupted_file_name, use_fastavro=self.use_fastavro)
    with self.assertRaises(ValueError) as exn:
      source_test_utils.read_from_source(source, None, None)
      self.assertEqual(0, exn.exception.message.find('Unexpected sync marker'))

  def test_read_from_avro(self):
    path = self._write_data()
    with TestPipeline() as p:
      assert_that(
          p | avroio.ReadFromAvro(path, use_fastavro=self.use_fastavro),
          equal_to(self.RECORDS))

  def test_read_all_from_avro_single_file(self):
    path = self._write_data()
    with TestPipeline() as p:
      assert_that(
          p \
          | Create([path]) \
          | avroio.ReadAllFromAvro(use_fastavro=self.use_fastavro),
          equal_to(self.RECORDS))

  def test_read_all_from_avro_many_single_files(self):
    path1 = self._write_data()
    path2 = self._write_data()
    path3 = self._write_data()
    with TestPipeline() as p:
      assert_that(
          p \
          | Create([path1, path2, path3]) \
          | avroio.ReadAllFromAvro(use_fastavro=self.use_fastavro),
          equal_to(self.RECORDS * 3))

  def test_read_all_from_avro_file_pattern(self):
    file_pattern = self._write_pattern(5)
    with TestPipeline() as p:
      assert_that(
          p \
          | Create([file_pattern]) \
          | avroio.ReadAllFromAvro(use_fastavro=self.use_fastavro),
          equal_to(self.RECORDS * 5))

  def test_read_all_from_avro_many_file_patterns(self):
    file_pattern1 = self._write_pattern(5)
    file_pattern2 = self._write_pattern(2)
    file_pattern3 = self._write_pattern(3)
    with TestPipeline() as p:
      assert_that(
          p \
          | Create([file_pattern1, file_pattern2, file_pattern3]) \
          | avroio.ReadAllFromAvro(use_fastavro=self.use_fastavro),
          equal_to(self.RECORDS * 10))

  def test_sink_transform(self):
    with tempfile.NamedTemporaryFile() as dst:
      path = dst.name
      with TestPipeline() as p:
        # pylint: disable=expression-not-assigned
        p \
        | beam.Create(self.RECORDS) \
        | avroio.WriteToAvro(path, self.SCHEMA, use_fastavro=self.use_fastavro)
      with TestPipeline() as p:
        # json used for stable sortability
        readback = \
            p \
            | avroio.ReadFromAvro(path + '*', use_fastavro=self.use_fastavro) \
            | beam.Map(json.dumps)
        assert_that(readback, equal_to([json.dumps(r) for r in self.RECORDS]))

  @unittest.skipIf(snappy is None, 'python-snappy not installed.')
  def test_sink_transform_snappy(self):
    with tempfile.NamedTemporaryFile() as dst:
      path = dst.name
      with TestPipeline() as p:
        # pylint: disable=expression-not-assigned
        p \
        | beam.Create(self.RECORDS) \
        | avroio.WriteToAvro(
            path,
            self.SCHEMA,
            codec='snappy',
            use_fastavro=self.use_fastavro)
      with TestPipeline() as p:
        # json used for stable sortability
        readback = \
            p \
            | avroio.ReadFromAvro(path + '*', use_fastavro=self.use_fastavro) \
            | beam.Map(json.dumps)
        assert_that(readback, equal_to([json.dumps(r) for r in self.RECORDS]))
Code example #21
from io import BytesIO
from avro.schema import Parse
from avro.io import DatumReader, BinaryDecoder

schema = Parse("""{
"namespace": "org.buildroot.package.python_avro",
"type": "record",
"name": "Developer",
"fields": [
    {"name": "email", "type": "string"},
    {"name": "maintainer_of", "type": "string"}
]
}""")

example = b'<[email protected]\x16python_avro'

reader = DatumReader(schema)
deserialized = reader.read(BinaryDecoder(BytesIO(example)))

assert deserialized == {
    'email': '*****@*****.**',
    'maintainer_of': 'python_avro',
}
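
For completeness, the inverse direction with the same parsed schema is a short sketch; the field values below are placeholders rather than the redacted ones from the example, and it reuses schema, DatumReader, BinaryDecoder, and BytesIO from the block above:

from avro.io import BinaryEncoder, DatumWriter

buffer = BytesIO()
DatumWriter(schema).write(
    {"email": "dev@example.com", "maintainer_of": "python_avro"},
    BinaryEncoder(buffer))
serialized = buffer.getvalue()

# Round trip through the reader shown above.
assert DatumReader(schema).read(BinaryDecoder(BytesIO(serialized))) == {
    "email": "dev@example.com",
    "maintainer_of": "python_avro",
}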
Code example #22
File: util.py  Project: jjdoor/goods
def write_avro(write_schema_str, writer, datum):
    write_schema = Parse(write_schema_str)
    datum_writer = DatumWriter(write_schema)
    encoder = BinaryEncoder(writer)
    datum_writer.write(datum, encoder)
Code example #23
File: util.py  Project: jjdoor/goods
def read_avro(read_schema_str, write_schema_str, reader):
    write_schema = Parse(write_schema_str)
    read_schema = Parse(read_schema_str)
    datum_reader = DatumReader(write_schema, read_schema)
    decoder = BinaryDecoder(reader)
    return datum_reader.read(decoder)
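
A hypothetical round trip through write_avro and read_avro with an in-memory buffer might look like the sketch below (the schema is a placeholder, and the util.py snippets are assumed to import Parse, DatumWriter, DatumReader, BinaryEncoder, and BinaryDecoder elsewhere in the file):

import json
from io import BytesIO

schema_str = json.dumps({
    "type": "record", "name": "Pair", "namespace": "example.avro",
    "fields": [
        {"name": "key", "type": "string"},
        {"name": "value", "type": "long"},
    ],
})

buffer = BytesIO()
write_avro(schema_str, buffer, {"key": "clicks", "value": 42})
buffer.seek(0)
assert read_avro(schema_str, schema_str, buffer) == {"key": "clicks", "value": 42}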
Code example #24
    def process_partition(self, partition):

        now = int(time.time() * 1e3)
        hdfs_client = InsecureClient(self.host, user=self.user)
        json_list = []
        for row in partition:
            profile_version = 393320  # TODO: remove magic constant, validate its value
            system = [{
                'id': {
                    'primary': 10000,
                    'secondary': 10000
                },
                'value': self.cid,
                'confidence': 1.0
            }, {
                'id': {
                    'primary': 10000,
                    'secondary': 10001
                },
                'confidence': 1.0,
                'value': now
            }]
            id = []
            for k in self.mapping['id']:
                if k in row and row[k] is not None:
                    id.append({
                        'id': self.mapping['id'][k],
                        'confidence': 1.0,
                        'value': str(row[k])
                    })
            attributes = []
            for k in self.mapping['attributes']:
                if k in row and row[k] is not None:
                    if 'mapping' in self.mapping['attributes'][k]:
                        if row[k] in self.mapping['attributes'][k]['mapping']:
                            attributes.append({
                                'id': {
                                    'secondary':
                                    self.mapping['attributes'][k]['mapping'][
                                        row[k]],
                                    'primary':
                                    self.mapping['attributes'][k]['primary']
                                },
                                'confidence': 1.0
                            })
                    else:
                        attributes.append({
                            'id': {
                                'secondary': -1,
                                'primary':
                                self.mapping['attributes'][k]['primary']
                            },
                            'confidence': 1.0,
                            'value': row[k]
                        })
            json_dict = {'id': id, 'attributes': attributes, 'system': system}
            json_list.append(json_dict)
        filename = 'part_{}.avro'.format(str(uuid4()))
        clever_schema = Parse(self.raw_schema)
        with DataFileWriter(open(filename, "wb"), DatumWriter(),
                            clever_schema) as writer:
            for record in json_list:
                writer.append(record)
        hdfs_client.upload(
            '/data/{}/.dmpkit/profiles/{}/cdm/ts={}/{}'.format(
                self.cid, self.source, self.timestamp, filename), filename)
        os.remove(filename)
Code example #25
def get_schema_from_file(filename):
    # Read the .avsc file as text and close the handle via a context manager;
    # Parse builds the schema object from the JSON string.
    with open(filename, "r") as schema_file:
        return Parse(schema_file.read())