Example #1
 def test_other_attributes(self):
     print_name('TEST OTHER ATTRIBUTES')
     correct = 0
     props = {}
     for example in OTHER_PROP_EXAMPLES:
         original_schema = schema.parse(example.schema_string)
         round_trip_schema = schema.parse(str(original_schema))
         self.assertEqual(original_schema.other_props,
                          round_trip_schema.other_props)
         if original_schema.type == "record":
             field_props = 0
             for f in original_schema.fields:
                 if f.other_props:
                     props.update(f.other_props)
                     field_props += 1
             self.assertEqual(field_props, len(original_schema.fields))
         if original_schema.other_props:
             props.update(original_schema.other_props)
             correct += 1
     for k, v in props.items():
         if k == "cp_boolean":
             self.assertEqual(type(v), bool)
         elif k == "cp_int":
             self.assertEqual(type(v), int)
         elif k == "cp_object":
             self.assertEqual(type(v), dict)
         elif k == "cp_float":
             self.assertEqual(type(v), float)
         elif k == "cp_array":
             self.assertEqual(type(v), list)
     self.assertEqual(correct, len(OTHER_PROP_EXAMPLES))
Example #2
 def test_correct_recursive_extraction(self):
     s = schema.parse(
         '{"type": "record", "name": "X", "fields": [{"name": "y", "type": {"type": "record", "name": "Y", "fields": [{"name": "Z", "type": "X"}]}}]}'
     )
     t = schema.parse(str(s.fields[0].type))
     # If we've made it this far, the subschema was reasonably stringified; it could be reparsed.
     self.assertEqual("X", t.fields[0].type.name)
Example #3
def validate_avro_schema(value):
    '''
    Attempt to parse ``value`` into an Avro schema.
    Raise ``ValidationError`` on error.
    '''
    try:
        parse(json.dumps(value))
    except SchemaParseException as e:
        raise ValidationError(str(e))
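A quick usage sketch, assuming the surrounding module imports `parse` from avro/spavro and `ValidationError` from its web framework; the sample schemas are hypothetical:

good = {"type": "record", "name": "Person",
        "fields": [{"name": "name", "type": "string"}]}
bad = {"type": "record", "name": "Person"}  # "fields" is required

validate_avro_schema(good)  # returns None: the schema parsed cleanly
try:
    validate_avro_schema(bad)
except ValidationError as err:
    print('rejected: %s' % err)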
Example #4
    def test_exception_is_not_swallowed_on_parse_error(self):
        print_name('TEST EXCEPTION NOT SWALLOWED ON PARSE ERROR')

        try:
            schema.parse('/not/a/real/file')
            caught_exception = False
        except schema.SchemaParseException as e:
            expected_message = 'Error parsing JSON: /not/a/real/file, error = ' \
                               'No JSON object could be decoded'
            self.assertEqual(expected_message, e.args[0])
            caught_exception = True
        self.assertTrue(caught_exception, 'SchemaParseException was not raised')
Example #5
    def test_valid_cast_to_string_after_parse(self):
        # Test that the string generated by an Avro Schema object
        # is, in fact, a valid Avro schema.
        print_name('TEST CAST TO STRING AFTER PARSE')
        correct = 0
        for example in VALID_EXAMPLES:
            schema_data = schema.parse(example.schema_string)
            schema.parse(str(schema_data))
            correct += 1

        fail_msg = "Cast to string success on %d out of %d schemas" % \
          (correct, len(VALID_EXAMPLES))
        self.assertEqual(correct, len(VALID_EXAMPLES), fail_msg)
Example #6
  def test_unknown_symbol(self):
    print_name('TEST UNKNOWN SYMBOL')
    writers_schema = schema.parse("""\
      {"type": "enum", "name": "Test",
       "symbols": ["FOO", "BAR"]}""")
    datum_to_write = 'FOO'

    readers_schema = schema.parse("""\
      {"type": "enum", "name": "Test",
       "symbols": ["BAR", "BAZ"]}""")

    with self.assertRaises(io.SchemaResolutionException) as context:
        writer, encoder, datum_writer = write_datum(datum_to_write, writers_schema)
        reader = StringIO(writer.getvalue())
        decoder = io.BinaryDecoder(reader)
        datum_reader = io.DatumReader(writers_schema, readers_schema)
        # Reading triggers resolution: 'FOO' is absent from the reader's symbols.
        datum_reader.read(decoder)
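Several of these tests call write_datum and read_datum helpers that never appear in the excerpts. A minimal sketch consistent with every call site above, assuming the same StringIO and io imports the tests already use; the exact bodies in the real test module may differ:

def write_datum(datum, writers_schema):
    # Encode one datum into an in-memory buffer and hand back all three
    # objects, since callers inspect the buffer and reuse the datum_writer.
    writer = StringIO()
    encoder = io.BinaryEncoder(writer)
    datum_writer = io.DatumWriter(writers_schema)
    datum_writer.write(datum, encoder)
    return writer, encoder, datum_writer

def read_datum(buf, writers_schema, readers_schema=None):
    # Decode one datum back out, optionally resolving against a reader schema.
    reader = StringIO(buf.getvalue())
    decoder = io.BinaryDecoder(reader)
    datum_reader = io.DatumReader(writers_schema, readers_schema)
    return datum_reader.read(decoder)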
Example #7
def check_skip_number(number_type):
    print_name('TEST SKIP %s' % number_type.upper())
    correct = 0
    for value_to_skip, hex_encoding in BINARY_ENCODINGS:
        VALUE_TO_READ = 6253
        print('Value to Skip: %d' % value_to_skip)

        # write the value to skip and a known value
        writers_schema = schema.parse('"%s"' % number_type.lower())
        writer, encoder, datum_writer = write_datum(value_to_skip,
                                                    writers_schema)
        datum_writer.write(VALUE_TO_READ, encoder)

        # skip the value
        reader = StringIO(writer.getvalue())
        decoder = io.BinaryDecoder(reader)
        decoder.skip_long()

        # read data from string buffer
        datum_reader = io.DatumReader(writers_schema)
        read_value = datum_reader.read(decoder)

        print('Read Value: %d' % read_value)
        if read_value == VALUE_TO_READ: correct += 1
        print('')
    return correct
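BINARY_ENCODINGS itself is not shown. Avro writes ints and longs as zigzag-encoded varints, so a plausible excerpt of the table, paired as (value, expected hex bytes), looks like this; the entries are illustrative, not copied from the test module:

BINARY_ENCODINGS = (
    # zigzag maps n to (n << 1) ^ (n >> 63), then emits 7 bits per byte,
    # least-significant group first, with the high bit set on continuation bytes
    (0, '00'),
    (-1, '01'),
    (1, '02'),
    (-64, '7f'),
    (64, '80 01'),
    (8192, '80 80 01'),
)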
Example #8
 def test_schema_promotion(self):
   print_name('TEST SCHEMA PROMOTION')
   # note that checking writers_schema.type in read_data
   # allows us to handle promotion correctly
   promotable_schemas = ['"int"', '"long"', '"float"', '"double"']
   incorrect = 0
   for i, ws in enumerate(promotable_schemas):
     writers_schema = schema.parse(ws)
     datum_to_write = 219
     for rs in promotable_schemas[i + 1:]:
       readers_schema = schema.parse(rs)
       writer, enc, dw = write_datum(datum_to_write, writers_schema)
       datum_read = read_datum(writer, writers_schema, readers_schema)
       print('Writer: %s Reader: %s' % (writers_schema, readers_schema))
       print('Datum Read: %s' % datum_read)
       if datum_read != datum_to_write: incorrect += 1
   self.assertEquals(incorrect, 0)
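A standalone view of one promotion step, reusing the write_datum/read_datum sketch from earlier; only the two schema literals come from the test above, the rest is an assumed demonstration:

writers_schema = schema.parse('"int"')
readers_schema = schema.parse('"double"')
writer, encoder, datum_writer = write_datum(219, writers_schema)
# int -> double promotion: the reader hands back a float
assert read_datum(writer, writers_schema, readers_schema) == 219.0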
Example #9
 def test_type_exception(self):
     print_name('TEST TYPE EXCEPTION')
     writers_schema = schema.parse("""\
   {"type": "record", "name": "Test",
    "fields": [{"name": "F", "type": "int"},
               {"name": "E", "type": "int"}]}""")
     datum_to_write = {'E': 5, 'F': 'Bad'}
     with self.assertRaises(io.AvroTypeException) as context:
         write_datum(datum_to_write, writers_schema)
Example #10
def validate_entity_payload(schema_definition, payload):
    # Use spavro to validate payload against the linked schema
    try:
        avro_schema = parse(json.dumps(schema_definition))
        valid = validate(avro_schema, payload)
        if not valid:
            raise ValidationError(MESSAGE_NOT_VALID)
        return True
    except Exception as err:
        raise ValidationError(str(err))
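A hedged usage sketch; MESSAGE_NOT_VALID and ValidationError belong to the surrounding module, and the record below is made up:

person_schema = {"type": "record", "name": "Person",
                 "fields": [{"name": "name", "type": "string"}]}

validate_entity_payload(person_schema, {'name': 'Ada'})  # returns True
try:
    validate_entity_payload(person_schema, {'name': 42})  # wrong field type
except ValidationError as err:
    print('invalid payload: %s' % err)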
Example #11
    def configure(self, taskType, inSchemaText, outSchemaText):
        """

    Parameters
    -------------------------------------------------------------------
    taskType - What type of task (e.g. map, reduce)
             - This is an enumeration which is specified in the input protocol
    inSchemaText -  string containing the input schema
                 - This is the actual schema with which the data was encoded,
                   i.e. it is the writer_schema (see http://avro.apache.org/docs/current/spec.html#Schema+Resolution).
                   This is the schema the parent process is using, which might differ
                   from the one provided by the subclass of tether_task.

    outSchemaText - string containing the output schema
                  - This is the schema expected by the parent process for the output
    """
        self.taskType = taskType

        try:
            inSchema = schema.parse(inSchemaText)
            outSchema = schema.parse(outSchemaText)

            if (taskType == TaskType.MAP):
                self.inReader = avio.DatumReader(writers_schema=inSchema,
                                                 readers_schema=self.inschema)
                self.midCollector = Collector(outSchemaText, self.outputClient)

            elif (taskType == TaskType.REDUCE):
                self.midReader = avio.DatumReader(
                    writers_schema=inSchema, readers_schema=self.midschema)
                # this.outCollector = new Collector<OUT>(outSchema);
                self.outCollector = Collector(outSchemaText, self.outputClient)

                # determine which fields in the input record are the keys for the reducer
                self._red_fkeys = [
                    f.name for f in self.midschema.fields
                    if not (f.order == 'ignore')
                ]

        except Exception as e:

            estr = traceback.format_exc()
            self.fail(estr)
Example #12
 def test_validate(self):
     print_name('TEST VALIDATE')
     passed = 0
     for example_schema, datum in SCHEMAS_TO_VALIDATE:
         print('Schema: %s' % example_schema)
         print('Datum: %s' % datum)
         validated = io.validate(schema.parse(example_schema), datum)
         print('Valid: %s' % validated)
         if validated: passed += 1
     self.assertEqual(passed, len(SCHEMAS_TO_VALIDATE))
Example #13
def validate_avro(schema, datum):
    result = tools.AvroValidator(
        schema=parse(json.dumps(schema)),
        datum=datum,
    )
    errors = []
    for error in result.errors:
        errors.append({
            'description': tools.format_validation_error(error),
        })
    return errors
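Unlike the validators above, validate_avro collects errors instead of raising. A sketch of a caller, assuming tools.AvroValidator and tools.format_validation_error behave as used here; the sample datum is hypothetical:

errors = validate_avro(
    schema={'type': 'record', 'name': 'Person',
            'fields': [{'name': 'name', 'type': 'string'}]},
    datum={'name': 42},  # deliberately the wrong type
)
for error in errors:
    print(error['description'])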
Example #14
    def test_parse(self):
        correct = 0
        for example in EXAMPLES:
            # Keep self.fail() outside the try block so its AssertionError
            # cannot be swallowed by the broad except below.
            try:
                schema.parse(example.schema_string)
                parsed = True
            except Exception:
                parsed = False

            if parsed == example.valid:
                correct += 1
            elif parsed:
                self.fail("Invalid schema was parsed: " +
                          example.schema_string)
            else:
                self.fail("Valid schema failed to parse: " +
                          example.schema_string)

        fail_msg = "Parse behavior correct on %d out of %d schemas." % \
          (correct, len(EXAMPLES))
        self.assertEqual(correct, len(EXAMPLES), fail_msg)
Example #15
    def __init__(self,
                 writer,
                 datum_writer,
                 writers_schema=None,
                 codec='null'):
        """
        If the schema is not present, presume we're appending.

        @param writer: File-like object to write into.
        """
        self._writer = writer
        self._encoder = io.BinaryEncoder(writer)
        self._datum_writer = datum_writer
        self._buffer_writer = StringIO()
        self._buffer_encoder = io.BinaryEncoder(self._buffer_writer)
        self._block_count = 0
        self._meta = {}
        self._header_written = False

        if writers_schema is not None:
            if codec not in VALID_CODECS:
                raise DataFileException("Unknown codec: %r" % codec)
            self._sync_marker = DataFileWriter.generate_sync_marker()
            self.set_meta('avro.codec', codec)
            self.set_meta('avro.schema', str(writers_schema))
            self.datum_writer.writers_schema = writers_schema
        else:
            appending_error = (
                "When appending records to an Avro data file, the file object "
                "passed into DataFileWriter must be opened in read/write mode, "
                "e.g. for files: \"rb+\" or \"ab+\"")
            # Use getattr: buffer-like writers (e.g. BytesIO) have no 'mode'
            # attribute, so fall back to the readable/writable probes.
            if getattr(writer, 'mode', None):
                if writer.mode not in ('rb+', 'ab+'):
                    raise DataFileException(appending_error)
            elif not (writer.readable() and writer.writable()):
                raise DataFileException(appending_error)
            # open writer for reading to collect metadata
            dfr = DataFileReader(writer, io.DatumReader())

            # TODO(hammer): collect arbitrary metadata
            # collect metadata
            self._sync_marker = dfr.sync_marker
            self.set_meta('avro.codec', dfr.get_meta('avro.codec'))

            # get schema used to write existing file
            schema_from_file = dfr.get_meta('avro.schema')
            self.set_meta('avro.schema', schema_from_file)
            self.datum_writer.writers_schema = schema.parse(schema_from_file)

            # seek to the end of the file and prepare for writing
            writer.seek(0, 2)
            self._header_written = True
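Both constructor paths in action; a minimal sketch assuming the usual avro datafile/io/schema module names and a scratch file:

# Create: a schema is supplied, so a fresh header and sync marker are written.
writer = open('events.avro', 'wb')
dfw = DataFileWriter(writer, io.DatumWriter(),
                     writers_schema=schema.parse('"string"'))
dfw.append('first')
dfw.close()

# Append: no schema, so the file must also be readable ('ab+'); the writer
# recovers schema, codec, and sync marker from the existing header.
writer = open('events.avro', 'ab+')
dfw = DataFileWriter(writer, io.DatumWriter())
dfw.append('second')
dfw.close()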
Example #16
    def test_equivalence_after_round_trip(self):
        # 1. Given a string, parse it to get Avro schema "original".
        # 2. Serialize "original" to a string and parse that string
        #      to generate Avro schema "round trip".
        # 3. Ensure "original" and "round trip" schemas are equivalent.
        print_name('TEST ROUND TRIP')
        correct = 0
        for example in VALID_EXAMPLES:
            original_schema = schema.parse(example.schema_string)
            round_trip_schema = schema.parse(str(original_schema))
            if original_schema == round_trip_schema:
                correct += 1
                debug_msg = "%s: ROUND TRIP SUCCESS" % example.name
            else:
                debug_msg = "%s: ROUND TRIP FAILURE" % example.name
                self.fail(
                    "Round trip failure: %s, %s, %s" %
                    (example.name, original_schema, round_trip_schema))

        fail_msg = "Round trip success on %d out of %d schemas" % \
          (correct, len(VALID_EXAMPLES))
        self.assertEqual(correct, len(VALID_EXAMPLES), fail_msg)
Example #17
 def test_doc_attributes(self):
     print_name('TEST DOC ATTRIBUTES')
     correct = 0
     for example in DOC_EXAMPLES:
         original_schema = schema.parse(example.schema_string)
         if original_schema.doc is not None:
             correct += 1
         if original_schema.type == 'record':
             for f in original_schema.fields:
                 if f.doc is None:
                     self.fail("Failed to preserve 'doc' in fields: " +
                               example.schema_string)
     self.assertEqual(correct, len(DOC_EXAMPLES))
Example #18
  def test_round_trip(self):
    print_name('TEST ROUND TRIP')
    correct = 0
    for example_schema, datum in SCHEMAS_TO_VALIDATE:
      print('Schema: %s' % example_schema)
      print('Datum: %s' % datum)

      writers_schema = schema.parse(example_schema)
      writer, encoder, datum_writer = write_datum(datum, writers_schema)
      round_trip_datum = read_datum(writer, writers_schema)

      print('Round Trip Datum: %s' % round_trip_datum)
      if datum == round_trip_datum: correct += 1
    self.assertEquals(correct, len(SCHEMAS_TO_VALIDATE))
Example #19
  def test_no_default_value(self):
    print_name('TEST NO DEFAULT VALUE')
    writers_schema = LONG_RECORD_SCHEMA
    datum_to_write = LONG_RECORD_DATUM

    readers_schema = schema.parse("""\
      {"type": "record", "name": "Test",
       "fields": [{"name": "H", "type": "int"}]}""")

    writer, encoder, datum_writer = write_datum(datum_to_write, writers_schema)
    with self.assertRaises(io.SchemaResolutionException) as context:
        reader = StringIO(writer.getvalue())
        decoder = io.BinaryDecoder(reader)
        datum_reader = io.DatumReader(writers_schema, readers_schema)
        # Field "H" has no default value, so schema resolution fails on read.
        datum_reader.read(decoder)
Example #20
  def test_field_order(self):
    print_name('TEST FIELD ORDER')
    writers_schema = LONG_RECORD_SCHEMA
    datum_to_write = LONG_RECORD_DATUM

    readers_schema = schema.parse("""\
      {"type": "record", "name": "Test",
       "fields": [{"name": "F", "type": "int"},
                  {"name": "E", "type": "int"}]}""")
    datum_to_read = {'E': 5, 'F': 6}

    writer, encoder, datum_writer = write_datum(datum_to_write, writers_schema)
    datum_read = read_datum(writer, writers_schema, readers_schema)
    print('Datum Read: %s' % datum_read)
    self.assertEquals(datum_to_read, datum_read)
Example #21
    def test_append(self):
        print('')
        print('TEST APPEND')
        print('===========')
        print('')
        correct = 0
        for i, (example_schema, datum) in enumerate(SCHEMAS_TO_VALIDATE):
            for codec in CODECS_TO_VALIDATE:
                print('')
                print('SCHEMA NUMBER %d' % (i + 1))
                print('================')
                print('')
                print('Schema: %s' % example_schema)
                print('Datum: %s' % datum)
                print('Codec: %s' % codec)

                # write data in binary to file once
                writer = open(FILENAME, 'wb')
                datum_writer = io.DatumWriter()
                schema_object = schema.parse(example_schema)
                dfw = datafile.DataFileWriter(writer, datum_writer, schema_object, codec=codec)
                dfw.append(datum)
                dfw.close()

                # open file, append, and close nine more times; use '_' so the
                # outer schema index 'i' is not clobbered
                for _ in range(9):
                    writer = open(FILENAME, 'ab+')
                    dfw = datafile.DataFileWriter(writer, io.DatumWriter())
                    dfw.append(datum)
                    dfw.close()

                # read data in binary from file; 'read_datum' avoids shadowing
                # the 'datum' we compare against below
                appended_data = []
                reader = open(FILENAME, 'rb')
                datum_reader = io.DatumReader()
                dfr = datafile.DataFileReader(reader, datum_reader)
                for read_datum in dfr:
                    appended_data.append(read_datum)

                print('Appended Data: %s' % appended_data)
                print('Appended Data Length: %d' % len(appended_data))
                is_correct = [datum] * 10 == appended_data
                if is_correct:
                    correct += 1
                print('Correct Appended: %s' % is_correct)
                print('')
        os.remove(FILENAME)
        self.assertEquals(correct, len(CODECS_TO_VALIDATE)*len(SCHEMAS_TO_VALIDATE))
Example #22
def check_binary_encoding(number_type):
    print_name('TEST BINARY %s ENCODING' % number_type.upper())
    correct = 0
    for datum, hex_encoding in BINARY_ENCODINGS:
        print('Datum: %d' % datum)
        print('Correct Encoding: %s' % hex_encoding)

        writers_schema = schema.parse('"%s"' % number_type.lower())
        writer, encoder, datum_writer = write_datum(datum, writers_schema)
        writer.seek(0)
        hex_val = avro_hexlify(writer)

        print('Read Encoding: %s' % hex_val)
        if hex_encoding == hex_val: correct += 1
        print('')
    return correct
Example #23
    def test_round_trip(self):
        print_name('TEST ROUND TRIP')
        correct = 0
        for example_schema, datum in SCHEMAS_TO_VALIDATE:
            print('Schema: %s' % example_schema)
            print('Datum: %s' % datum)

            writers_schema = schema.parse(example_schema)
            writer, encoder, datum_writer = write_datum(datum, writers_schema)
            round_trip_datum = read_datum(writer, writers_schema)

            print('Round Trip Datum: %s' % round_trip_datum)
            if datum == round_trip_datum:
                correct += 1
            else:
                print("Mismatch: {} != {}".format(datum, round_trip_datum))
        self.assertEqual(correct, len(SCHEMAS_TO_VALIDATE))
Example #24
  def test_default_value(self):
    print_name('TEST DEFAULT VALUE')
    writers_schema = LONG_RECORD_SCHEMA
    datum_to_write = LONG_RECORD_DATUM

    correct = 0
    for field_type, default_json, default_datum in DEFAULT_VALUE_EXAMPLES:
      readers_schema = schema.parse("""\
        {"type": "record", "name": "Test",
         "fields": [{"name": "H", "type": %s, "default": %s}]}
        """ % (field_type, default_json))
      datum_to_read = {'H': default_datum}

      writer, encoder, datum_writer = write_datum(datum_to_write, writers_schema)
      datum_read = read_datum(writer, writers_schema, readers_schema)
      print('Datum Read: %s' % datum_read)
      if datum_to_read == datum_read: correct += 1
    self.assertEquals(correct, len(DEFAULT_VALUE_EXAMPLES))
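DEFAULT_VALUE_EXAMPLES is not shown; each row pairs a field type with its default as JSON text and the Python value the reader should materialize. A plausible excerpt (illustrative, not copied from the test module):

DEFAULT_VALUE_EXAMPLES = (
    # (field type, default as JSON text, expected Python datum)
    ('"null"', 'null', None),
    ('"boolean"', 'true', True),
    ('"string"', '"foo"', 'foo'),
    ('"int"', '5', 5),
    ('"double"', '5.5', 5.5),
    ('{"type": "array", "items": "int"}', '[5]', [5]),
)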
Example #25
def create_remote_kafka_assets(request, sample_generator, *args):
    # @mark annotation does not work with autouse=True, so filter manually:
    # outside the integration suite, yield without creating any assets.
    if 'integration' not in request.config.invocation_params.args:
        LOG.debug('NOT creating Kafka Assets')
        yield None
        return
    LOG.debug('Creating Kafka Assets')
    kafka_security = config.get_kafka_admin_config()
    kadmin = get_admin_client(kafka_security)
    new_topic = f'{TENANT}.{TEST_TOPIC}'
    create_topic(kadmin, new_topic)
    GENERATED_SAMPLES[new_topic] = []
    producer = get_producer(kafka_security)
    schema = parse(json.dumps(ANNOTATED_SCHEMA))
    for subset in sample_generator(max=100, chunk=10):
        GENERATED_SAMPLES[new_topic].extend(subset)
        produce(subset, schema, new_topic, producer)
    yield None  # end of work before clean-up
    LOG.debug(f'deleting topic: {new_topic}')
    delete_topic(kadmin, new_topic)
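The comment at the top implies this generator is registered as an autouse pytest fixture. A hypothetical registration consistent with that comment, with the fixture name and scope assumed:

import pytest

@pytest.fixture(scope='session', autouse=True)
def remote_kafka_assets(request, sample_generator):
    # Delegate to the generator above; autouse means it wraps every session,
    # so the 'integration' guard inside the body does the filtering that a
    # @pytest.mark annotation cannot.
    yield from create_remote_kafka_assets(request, sample_generator)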
Example #26
    def test_round_trip(self):
        print('')
        print('TEST ROUND TRIP')
        print('===============')
        print('')
        correct = 0
        print(SCHEMAS_TO_VALIDATE)
        for i, (example_schema, datum) in enumerate(SCHEMAS_TO_VALIDATE):
            for codec in CODECS_TO_VALIDATE:
                print('')
                print('SCHEMA NUMBER %d' % (i + 1))
                print('================')
                print('')
                print('Schema: %s' % example_schema)
                print('Datum: %s' % datum)
                print('Codec: %s' % codec)

                # write data in binary to file 10 times
                writer = open(FILENAME, 'wb')
                datum_writer = io.DatumWriter()
                schema_object = schema.parse(example_schema)
                dfw = datafile.DataFileWriter(writer, datum_writer, schema_object, codec=codec)
                for datum_counter in range(10):
                    dfw.append(datum)
                dfw.close()

                # read data in binary from file
                reader = open(FILENAME, 'rb')
                datum_reader = io.DatumReader()
                dfr = datafile.DataFileReader(reader, datum_reader)
                round_trip_data = []
                for read_datum in dfr:
                    round_trip_data.append(read_datum)

                print('Round Trip Data: %s' % round_trip_data)
                print('Round Trip Data Length: %d' % len(round_trip_data))
                is_correct = [datum] * 10 == round_trip_data
                if is_correct:
                    correct += 1
                print('Correct Round Trip: %s' % is_correct)
                print('')
        os.remove(FILENAME)
        self.assertEquals(correct, len(CODECS_TO_VALIDATE) * len(SCHEMAS_TO_VALIDATE))
Example #27
def _send_kafka(objs: List[Any],
                schema,
                _type,
                max_size=MAX_KAFKA_MESSAGE_SIZE,
                callback=None):
    # check size
    total_size = fb_utils.utf8size(schema) + fb_utils.utf8size(objs)
    _logger.debug(
        f'Sending {len(objs)} of {_type} to kafka @ size {total_size}')
    if total_size >= max_size:
        raise RuntimeError(
            f'Message size: {total_size} exceeds maximum: {max_size}. Chunking.'
        )
    if not get_broker_info(KADMIN):
        raise ConnectionError('Could not connect to Kafka.')
    schema = parse(schema)
    TENANT = CONF.get('tenant')
    topic = fb_utils.sanitize_topic(f'{TENANT}.fbs.{_type}')
    produce(objs, schema, topic, PRODUCER, callback=callback)
    return
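The 'Chunking.' hint in the RuntimeError suggests the caller is expected to split oversized batches and retry. A hypothetical wrapper doing exactly that; the halving strategy is an assumption, not the project's actual recovery path:

def _send_kafka_chunked(objs, schema, _type):
    # Recursively halve the batch until each piece fits under the size cap.
    try:
        _send_kafka(objs, schema, _type)
    except RuntimeError:
        if len(objs) <= 1:
            raise  # a single object that is too large cannot be chunked
        mid = len(objs) // 2
        _send_kafka_chunked(objs[:mid], schema, _type)
        _send_kafka_chunked(objs[mid:], schema, _type)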
Example #28
    def test_metadata(self):
        # Test the writer with a 'with' statement.
        writer = open(FILENAME, 'wb')
        datum_writer = io.DatumWriter()
        sample_schema, sample_datum = SCHEMAS_TO_VALIDATE[1]
        schema_object = schema.parse(sample_schema)
        with datafile.DataFileWriter(writer, datum_writer, schema_object) as dfw:
            dfw.set_meta('test.string', 'foo')
            dfw.set_meta('test.number', '1')
            dfw.append(sample_datum)
        self.assertTrue(writer.closed)

        # Test the reader with a 'with' statement.
        datums = []
        reader = open(FILENAME, 'rb')
        datum_reader = io.DatumReader()
        with datafile.DataFileReader(reader, datum_reader) as dfr:
            self.assertEquals('foo', dfr.get_meta('test.string'))
            self.assertEquals('1', dfr.get_meta('test.number'))
            for datum in dfr:
                datums.append(datum)
        self.assertTrue(reader.closed)
Example #29
    def __init__(self, scheme=None, outputClient=None):
        """

    Parameters
    ---------------------------------------------
    scheme - The schema for the datums to output - can be a JSON string
           - or an instance of Schema
    outputClient - The output client used to send messages to the parent
    """

        if not isinstance(scheme, schema.Schema):
            scheme = schema.parse(scheme)

        if outputClient is None:
            raise ValueError("output client can't be None.")

        self.scheme = scheme
        self.buff = StringIO()
        self.encoder = avio.BinaryEncoder(self.buff)

        self.datum_writer = avio.DatumWriter(writers_schema=self.scheme)
        self.outputClient = outputClient
Example #30
    def __init__(self, reader, datum_reader):
        self._reader = reader
        self._raw_decoder = io.BinaryDecoder(reader)
        self._datum_decoder = None # Maybe reset at every block.
        self._datum_reader = datum_reader
        
        # read the header: magic, meta, sync
        self._read_header()

        # ensure codec is valid
        self.codec = self.get_meta(CODEC_KEY)
        if self.codec is None:
            self.codec = "null"
        if self.codec not in VALID_CODECS:
            raise DataFileException('Unknown codec: %s.' % self.codec)

        # get file length
        self._file_length = self.determine_file_length()

        # get ready to read
        self._block_count = 0
        self.datum_reader.writers_schema = schema.parse(self.get_meta(SCHEMA_KEY))
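Typical use of the reader whose constructor is shown above: iteration yields one datum at a time, with the writer's schema already parsed from the header metadata. The file name is assumed:

reader = open('events.avro', 'rb')
dfr = DataFileReader(reader, io.DatumReader())
print('codec: %s' % dfr.codec)
for datum in dfr:
    print(datum)
reader.close()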