Example 1
 def test_other_attributes(self):
     correct = 0
     props = {}
     for example in OTHER_PROP_EXAMPLES:
         original_schema = schema.parse(example.schema_string)
         round_trip_schema = schema.parse(str(original_schema))
         self.assertEqual(original_schema.other_props,round_trip_schema.other_props)
         if original_schema.type == "record":
             field_props = 0
             for f in original_schema.fields:
                 if f.other_props:
                     props.update(f.other_props)
                     field_props += 1
             self.assertEqual(field_props,len(original_schema.fields))
         if original_schema.other_props:
             props.update(original_schema.other_props)
             correct += 1
     for k in props:
         v = props[k]
         if k == "cp_boolean":
             self.assertEqual(type(v), bool)
         elif k == "cp_int":
             self.assertEqual(type(v), int)
         elif k == "cp_object":
             self.assertEqual(type(v), dict)
         elif k == "cp_float":
             self.assertEqual(type(v), float)
         elif k == "cp_array":
             self.assertEqual(type(v), list)
     self.assertEqual(correct,len(OTHER_PROP_EXAMPLES))
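The OTHER_PROP_EXAMPLES fixture is not shown in this excerpt; each entry pairs a schema string carrying non-reserved ("custom") attributes with the expectation that schema.parse preserves them in other_props, at both the schema and the field level. A hypothetical entry of that shape, for illustration only:

# Hypothetical fixture entry: custom properties on the record ("cp_boolean")
# and on a field ("cp_int"); parse() should surface both via other_props.
SCHEMA_WITH_OTHER_PROPS = """
{"type": "record", "name": "Interop", "cp_boolean": true,
 "fields": [{"name": "f1", "type": "int", "cp_int": 7}]}
"""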
Example 2
    def test_parse(self):
        correct = 0
        for iexample, example in enumerate(EXAMPLES):
            logging.debug('Testing example #%d\n%s', iexample, example.schema_string)
            try:
                schema.parse(example.schema_string)
                if example.valid:
                    correct += 1
                else:
                    self.fail('Invalid schema was parsed:\n%s' % example.schema_string)
            except Exception as exn:
                if example.valid:
                    self.fail(
                            'Valid schema failed to parse: %r\n%s'
                            % (example.schema_string, traceback.format_exc()))
                else:
                    if logging.getLogger().getEffectiveLevel() <= 5:
                        logging.debug('Expected error:\n%s', traceback.format_exc())
                    else:
                        logging.debug('Expected error: %r', exn)
                    correct += 1

        self.assertEqual(
                correct,
                len(EXAMPLES),
                'Parse behavior correct on %d out of %d schemas.'
                % (correct, len(EXAMPLES)),
        )
Example 3
 def testreadfiles(self):
   origschm = schema.parse(open("src/test/schemata/interop.avsc").read())
   for file in os.listdir(_DATAFILE_DIR):
     print "Validating:", file.__str__()
     dr = io.DataFileReader(open(_DATAFILE_DIR+file, "rb"), 
                            self.__datumreader())
     count = int(dr.getmeta("count"))
     decodedSchm = schema.parse(dr.getmeta("schema"))
     self.assertEquals(origschm, decodedSchm)
     for i in range(0,count):
       datum = dr.next()
       self.assertTrue(self.__validator(origschm, datum))
   # validate reading of blocking arrays, blocking maps
   for file in os.listdir(_BLOCKINGFILE_DIR):
     print "Validating:", file.__str__()
     reader = open(_BLOCKINGFILE_DIR+file, "rb")
     decoder = io.Decoder(reader)
     dreader = self.__datumreader()
     dreader.setschema(origschm)
     count = int(decoder.readlong()) #metadata:the count of objects in the file
     blockcount = decoder.readlong()
     for i in range(0,count):
       while blockcount == 0:
         blockcount = decoder.readlong()
       blockcount -= 1
       datum = dreader.read(decoder)
       self.assertTrue(self.__validator(origschm, datum))
Example 4
  def test_equivalence_after_round_trip(self):
    """
    1. Given a string, parse it to get Avro schema "original".
    2. Serialize "original" to a string and parse that string
         to generate Avro schema "round trip".
    3. Ensure "original" and "round trip" schemas are equivalent.
    """
    print_test_name('TEST ROUND TRIP')
    correct = 0
    for example in VALID_EXAMPLES:
      try:
        original_schema = schema.parse(example.schema_string)
        round_trip_schema = schema.parse(str(original_schema))
        if original_schema == round_trip_schema:
          correct += 1
          debug_msg = "%s: ROUND TRIP SUCCESS" % example.name
        else:       
          debug_msg = "%s: ROUND TRIP FAILURE" % example.name
      except:
        debug_msg = "%s: ROUND TRIP FAILURE" % example.name
      finally:
        print debug_msg

    fail_msg = "Round trip success on %d out of %d schemas" % \
      (correct, len(VALID_EXAMPLES))
    self.assertEqual(correct, len(VALID_EXAMPLES), fail_msg)
Example 5
    def test_name(self):
        int_schema = schema.parse("\"int\"")
        self.assertEqual("int", int_schema.name)
        self.assertEqual("int", int_schema.fullname)

        int_array_schema = schema.parse("""{"type": "array", "items": "int"}""")
        self.assertEqual("array", int_array_schema.name)
        self.assertEqual("array", int_array_schema.fullname)
Example 6
 def checkdefault(self, schemajson, defaultjson, defaultvalue):
   self.check(schemajson)
   actual = schema.parse("{\"type\":\"record\", \"name\":\"Foo\","
                         + "\"fields\":[]}")
   expected = schema.parse("{\"type\":\"record\", \"name\":\"Foo\"," 
                            +"\"fields\":[{\"name\":\"f\", "
                            +"\"type\":"+schemajson+", "
                            +"\"default\":"+defaultjson+"}]}")
   reader = genericio.DatumReader(actual, expected)
   record = reader.read(io.Decoder(cStringIO.StringIO()))
   self.assertEquals(defaultvalue, record.get("f"))
Example 7
  def test_exception_is_not_swallowed_on_parse_error(self):
    print_test_name('TEST EXCEPTION NOT SWALLOWED ON PARSE ERROR')

    try:
        schema.parse('/not/a/real/file')
        caught_exception = False
    except schema.SchemaParseException, e:
        expected_message = 'Error parsing JSON: /not/a/real/file, error = ' \
                           'No JSON object could be decoded'
        self.assertEqual(expected_message, e.args[0])
        caught_exception = True

    self.assertTrue(caught_exception, 'Exception was not caught')
Example 8
def test_sanity():
  """

  Ensures that our "base" and "good" schemas are actually forwards- and
  backwards-compatible

  """
  # fst schema / record
  fst_schema = schema.parse(open("%s/MyRecord.base.avsc" % BASE_DIR).read())
  fst_writer = DatumWriter(writers_schema=fst_schema)
  fst_record = {
      "fieldWithoutDefaultValue": 0,
      "properField": 0,
      "enumField": "A",
      "unionField": None,
      "arrayField": ["world"],
      "mapField": {"hello": "world"},
      "fixedField": "aaaaaaaaaaaaaaaa"
  }

  # sec schema / record
  sec_schema = schema.parse(open("%s/MyRecord.good.avsc" % BASE_DIR).read())
  sec_writer = DatumWriter(writers_schema=sec_schema)
  sec_record = {
      "fieldWithoutDefaultValue": 0,
      "properField2": 0,
      "enumField": "B",
      "unionField": None,
      "arrayField": ["world"],
      "fixedField": "bbbbbbbbbbbbbbbb"
  }

  # Encode record w/ fst
  fst_buf = StringIO.StringIO()
  fst_encoder = BinaryEncoder(fst_buf)
  fst_writer.write(fst_record, fst_encoder)
  fst_data = fst_buf.getvalue()

  # Encode record w/ sec
  sec_buf = StringIO.StringIO()
  sec_encoder = BinaryEncoder(sec_buf)
  sec_writer.write(sec_record, sec_encoder)
  sec_data = sec_buf.getvalue()

  # writers == fst, readers == sec
  sec_reader = DatumReader(writers_schema=fst_schema, readers_schema=sec_schema)
  sec_decoder = BinaryDecoder(StringIO.StringIO(fst_data))
  sec_from_fst = sec_reader.read(sec_decoder) # no exception -> good

  # writers == sec, readers == fst
  fst_reader = DatumReader(writers_schema=sec_schema, readers_schema=fst_schema)
  fst_decoder = BinaryDecoder(StringIO.StringIO(sec_data))
  fst_from_sec = fst_reader.read(fst_decoder) # no exception -> good
Example 9
 def check(self, string):
   schm = schema.parse(string)
   st = schema.stringval(schm)
   self.assertEquals(string.replace(" ",""), st.replace(" ",""))
   #test __eq__
   self.assertEquals(schm, schema.parse(string))
   #test hashcode doesn't generate infinite recursion
   schm.__hash__()
   randomdata = self.__random(schm)
   for i in range(1,10):
     self.checkser(schm, randomdata)
   self.checkdatafile(schm)
Example 10
def _check(fst_name, sec_name):
  """

  Tests evolution from schema named MyRecord.{fst_name}.avsc to schema named
  MyRecord.{sec_name}.avsc in BASE_DIR

  """
  fst = schema.parse(open("%s/MyRecord.%s.avsc" % (BASE_DIR, fst_name)).read())
  sec = schema.parse(open("%s/MyRecord.%s.avsc" % (BASE_DIR, sec_name)).read())
  try:
    validator.check([fst, sec])
  except:
    "good"
Example 11
    def test_valid_cast_to_string_after_parse(self):
        """
        Test that the string generated by an Avro Schema object
        is, in fact, a valid Avro schema.
        """
        correct = 0
        for example in VALID_EXAMPLES:
            schema_data = schema.parse(example.schema_string)
            schema.parse(str(schema_data))
            correct += 1

        fail_msg = "Cast to string success on %d out of %d schemas" % \
            (correct, len(VALID_EXAMPLES))
        self.assertEqual(correct, len(VALID_EXAMPLES), fail_msg)
Example 12
    def test_unknown_symbol(self):
        writer_schema = schema.parse("""\
            {"type": "enum", "name": "Test",
       "symbols": ["FOO", "BAR"]}""")
        datum_to_write = 'FOO'

        reader_schema = schema.parse("""\
            {"type": "enum", "name": "Test",
       "symbols": ["BAR", "BAZ"]}""")

        writer, encoder, datum_writer = write_datum(datum_to_write, writer_schema)
        reader = io.BytesIO(writer.getvalue())
        decoder = avro_io.BinaryDecoder(reader)
        datum_reader = avro_io.DatumReader(writer_schema, reader_schema)
        self.assertRaises(avro_io.SchemaResolutionException, datum_reader.read, decoder)
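The write_datum and read_datum helpers referenced throughout these tests are defined elsewhere in the test module. A minimal sketch of what they might look like, inferred from the call sites above and assuming the standard avro.io API:

import io

from avro import io as avro_io

def write_datum(datum, writers_schema):
    # Encode a single datum with the writer's schema; return the buffer,
    # encoder, and datum writer so callers can append more or grab the bytes.
    writer = io.BytesIO()
    encoder = avro_io.BinaryEncoder(writer)
    datum_writer = avro_io.DatumWriter(writers_schema)
    datum_writer.write(datum, encoder)
    return writer, encoder, datum_writer

def read_datum(writer, writers_schema, readers_schema=None):
    # Decode the bytes produced by write_datum, optionally resolving the
    # writer's schema against a different reader's schema.
    reader = io.BytesIO(writer.getvalue())
    decoder = avro_io.BinaryDecoder(reader)
    datum_reader = avro_io.DatumReader(writers_schema, readers_schema)
    return datum_reader.read(decoder)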
Example 13
 def test_schema_promotion(self):
     # note that checking writer_schema.type in read_data
     # allows us to handle promotion correctly
     promotable_schemas = ['"int"', '"long"', '"float"', '"double"']
     incorrect = 0
     for i, ws in enumerate(promotable_schemas):
         writer_schema = schema.parse(ws)
         datum_to_write = 219
         for rs in promotable_schemas[i + 1:]:
             reader_schema = schema.parse(rs)
             writer, enc, dw = write_datum(datum_to_write, writer_schema)
             datum_read = read_datum(writer, writer_schema, reader_schema)
             logging.debug('Writer: %s Reader: %s', writer_schema, reader_schema)
             logging.debug('Datum Read: %s', datum_read)
             if datum_read != datum_to_write: incorrect += 1
     self.assertEqual(incorrect, 0)
Example 14
def process_files(output_path, hdfs_path, batch):
    """Process all files in batch a produce an avro file. """
    now = datetime.datetime.now()
    ts = now.strftime("%Y-%m-%d-%H-%M-%S-%f")
    output_filename = FILE_PREFIX + "-" + ts + '.avro'
    print "* creating new avro file: " + output_filename
    xschema = schema.parse(open(SCHEMA_FILE).read())
    rec_writer = io.DatumWriter(xschema)
    df_writer = datafile.DataFileWriter(
                open(output_path + output_filename, 'wb'),
                rec_writer,
                writers_schema = xschema,
                codec = 'deflate')

    for file_path in batch:
        bytes = read_binary(file_path)
        content = base64.b64encode(bytes)
        data = {}
        data['doc_uuid'] = str(uuid.uuid4())
        data['file_path'] = file_path
        data['content'] = content
        df_writer.append(data)

    df_writer.close()
    time.sleep(1)
    hdfs_put(output_path + output_filename, hdfs_path)
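The read_binary and hdfs_put helpers used above are project-specific and not part of this excerpt; a plausible sketch, assuming the HDFS upload simply shells out to the hadoop CLI:

import subprocess

def read_binary(file_path):
    # Read the whole source file as raw bytes.
    with open(file_path, 'rb') as f:
        return f.read()

def hdfs_put(local_path, hdfs_path):
    # Assumed behavior: push the finished Avro file into HDFS via the CLI.
    subprocess.check_call(['hadoop', 'fs', '-put', local_path, hdfs_path])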
Example 15
def main():
    # Check the number of arguments
    if len(sys.argv) != 3:
        sys.exit('Usage %s <Schema file> <Data_file>' % (sys.argv[0]))

    # Read the schema from the avsc file
    schema_string = open(sys.argv[1], "r").read()

    # Open the avro file for writing
    avro_file = open(sys.argv[2], "wb")

    # Get a DatumWriter object
    datum_writer = io.DatumWriter()

    # Parse the schema
    schema_object = schema.parse(schema_string)

    # Get a DataFileWriter object
    data_file_writer = datafile.DataFileWriter(avro_file, datum_writer, schema_object)

    # Populate records from the input
    for line in sys.stdin:
        (left, right) = line[:-1].split(",")
        data_file_writer.append({'left':left, "right":right})

    # Close the DataFileWriter
    data_file_writer.close()
Example 16
def test_view_avro():
  cluster = mini_cluster.shared_cluster(conf=True)
  try:
    c = make_logged_in_client()
    cluster.fs.setuser(cluster.superuser)
    if cluster.fs.isdir("/test-avro-filebrowser"):
      cluster.fs.rmtree('/test-avro-filebrowser/')

    cluster.fs.mkdir('/test-avro-filebrowser/')

    test_schema = schema.parse("""
      {
        "name": "test",
        "type": "record",
        "fields": [
          { "name": "name", "type": "string" },
          { "name": "integer", "type": "int" }
        ]
      }
    """)

    f = cluster.fs.open('/test-avro-filebrowser/test-view.avro', "w")
    data_file_writer = datafile.DataFileWriter(f, io.DatumWriter(),
                                                writers_schema=test_schema,
                                                codec='deflate')
    dummy_datum = {
      'name': 'Test',
      'integer': 10,
    }
    data_file_writer.append(dummy_datum)
    data_file_writer.close()

    # autodetect
    response = c.get('/filebrowser/view/test-avro-filebrowser/test-view.avro')
    # (Note: we use eval here because of an incompatibility between the
    # string representation of JSON dicts in simplejson vs. json)
    assert_equal(eval(response.context['view']['contents']), dummy_datum)

    # offsetting should work as well
    response = c.get('/filebrowser/view/test-avro-filebrowser/test-view.avro?offset=1')
    assert_true(response.context.has_key('view'))

    f = cluster.fs.open('/test-avro-filebrowser/test-view2.avro', "w")
    f.write("hello")
    f.close()

    # we shouldn't autodetect non avro files
    response = c.get('/filebrowser/view/test-avro-filebrowser/test-view2.avro')
    assert_equal(response.context['view']['contents'], "hello")

    # we should fail to do a bad thing if they specify compression when it's not set.
    response = c.get('/filebrowser/view/test-avro-filebrowser/test-view2.avro?compression=gzip')
    assert_false(response.context.has_key('view'))

  finally:
    try:
      cluster.fs.rmtree('/test-avro-filebrowser/')
    except:
      pass      # Don't let cleanup errors mask earlier failures
    cluster.shutdown()
Example 17
    def test_context_manager(self):
        # Context manager was introduced as a first class
        # member only in Python 2.6 and above.
        import sys

        if sys.version_info < (2, 6):
            print "Skipping context manager tests on this Python version."
            return
        # Test the writer with a 'with' statement.
        writer = open(FILENAME, "wb")
        datum_writer = io.DatumWriter()
        sample_schema, sample_datum = SCHEMAS_TO_VALIDATE[1]
        schema_object = schema.parse(sample_schema)
        with datafile.DataFileWriter(writer, datum_writer, schema_object) as dfw:
            dfw.append(sample_datum)
        self.assertTrue(writer.closed)

        # Test the reader with a 'with' statement.
        datums = []
        reader = open(FILENAME, "rb")
        datum_reader = io.DatumReader()
        with datafile.DataFileReader(reader, datum_reader) as dfr:
            for datum in dfr:
                datums.append(datum)
        self.assertTrue(reader.closed)
Example 18
    def __init__(self, reader, datum_reader):
        """Initializes a new data file reader.

        Args:
            reader: Open file to read from.
            datum_reader: Avro datum reader.
        """
        self._reader = reader
        self._raw_decoder = avro_io.BinaryDecoder(reader)
        self._datum_decoder = None # Maybe reset at every block.
        self._datum_reader = datum_reader

        # read the header: magic, meta, sync
        self._read_header()

        # ensure codec is valid
        self.codec = self.GetMeta('avro.codec')
        if self.codec is None:
            self.codec = "null"
        else:
            self.codec = self.codec.decode('utf-8')
        if self.codec not in VALID_CODECS:
            raise DataFileException('Unknown codec: %s.' % self.codec)

        self._file_length = self._GetInputFileLength()

        # get ready to read
        self._block_count = 0
        self.datum_reader.writer_schema = (
                schema.parse(self.GetMeta(SCHEMA_KEY).decode('utf-8')))
Example 19
def init_avro(output_path, part_id, schema_path):
  print("************* init_avro ***************")
  output_dir = None
  output_dirtmp = None	# Handle Avro Write Error
  if(type(output_path) is str):
    output_dir = init_directory(output_path)
    output_dirtmp = init_directory(output_path + 'tmp') # Handle Avro Write Error
  out_filename = '%(output_dir)s/part-%(part_id)s.avro' % \
    {"output_dir": output_dir, "part_id": str(part_id)}
  out_filenametmp = '%(output_dirtmp)s/part-%(part_id)s.avro' % \
    {"output_dirtmp": output_dirtmp, "part_id": str(part_id)}  # Handle Avro Write Error
  schemas = open(schema_path, 'r').read()
  email_schema = schema.parse(schemas)

  rec_writer = io.DatumWriter(email_schema)
  avro_writer = datafile.DataFileWriter(
    open(out_filename, 'wb'),
    rec_writer,
    email_schema
  )
  # CREATE A TEMP AvroWriter that can be used to workaround the UnicodeDecodeError when writing into AvroStorage
  avro_writertmp = datafile.DataFileWriter(
      open(out_filenametmp, 'wb'),
    rec_writer,
    email_schema
  )
  print("*************end init_avro ***************")
  return avro_writer, avro_writertmp
Example 20
 def __init__(self, block_bytes, num_records, codec, schema_string):
   # Decompress data early on (if needed) and thus decrease the number of
   # parallel copies of the data in memory at any given time during
   # block iteration.
   self._decompressed_block_bytes = self._decompress_bytes(block_bytes, codec)
   self._num_records = num_records
   self._schema = schema.parse(schema_string)
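The _decompress_bytes helper is not included in the excerpt. A sketch of what it could look like for the 'null' and 'deflate' codecs (Avro's deflate blocks are raw RFC 1951 streams with no zlib header, hence the negative window bits):

import zlib

def _decompress_bytes(block_bytes, codec):
    # 'null' blocks are stored as-is; 'deflate' blocks are raw DEFLATE data.
    if codec == 'null':
        return block_bytes
    if codec == 'deflate':
        return zlib.decompress(block_bytes, -15)
    raise ValueError('Unknown codec: %r' % codec)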
Example 21
def main():
	if len(sys.argv) < 2:
		print "Usage: cat input.json | python2.7 JSONtoAvro.py output"
		return

	s = schema.parse(open("tweet.avsc").read())
	f = open(sys.argv[1], "wb")

	writer = datafile.DataFileWriter(f, io.DatumWriter(), s, codec = 'deflate')

	failed = 0

	for line in sys.stdin:
		line = line.strip()

		try:
			data = json.loads(line)
		except ValueError as detail:
			continue

		try:
			writer.append(data)
		except io.AvroTypeException as detail:
			print line
			failed += 1

	writer.close()

	print str(failed) + " failed in schema"
Example 22
 def __init__(self, reader, dreader):
   self.__reader = reader
   self.__decoder = Decoder(reader)
   mag = struct.unpack(len(_MAGIC).__str__()+'s', 
                self.__reader.read(len(_MAGIC)))[0]
   if mag != _MAGIC:
     raise schema.AvroException("Not an avro data file")
   #find the length
   self.__reader.seek(0,2)
   self.__length = self.__reader.tell()
   self.__reader.seek(-4, 2)
   footersize = (int(ord(self.__reader.read(1)) << 24) +
           int(ord(self.__reader.read(1)) << 16) +
           int(ord(self.__reader.read(1)) << 8) +
           int(ord(self.__reader.read(1))))
   seekpos = self.__reader.seek(self.__length-footersize)
   metalength = self.__decoder.readlong()
   if metalength < 0:
     metalength = -metalength
     self.__decoder.readlong() #ignore byteCount if this is a blocking map
   self.__meta = dict()
   for i in range(0, metalength):
     key = self.__decoder.readutf8()
     self.__meta[key] = self.__decoder.readbytes()
   self.__sync = self.__meta.get("sync")
   self.__count = int(self.__meta.get("count"))
   self.__codec = self.__meta.get("codec")
   if (self.__codec != None) and (self.__codec != "null"):
     raise schema.AvroException("Unknown codec: " + self.__codec)
   self.__schema = schema.parse(self.__meta.get("schema").encode("utf-8"))
   self.__blockcount = 0
   self.__dreader = dreader
   self.__dreader.setschema(self.__schema)
   self.__reader.seek(len(_MAGIC))
Example 23
  def test_unknown_symbol(self):
    print_test_name('TEST UNKNOWN SYMBOL')
    writers_schema = schema.parse("""\
      {"type": "enum", "name": "Test",
       "symbols": ["FOO", "BAR"]}""")
    datum_to_write = 'FOO'

    readers_schema = schema.parse("""\
      {"type": "enum", "name": "Test",
       "symbols": ["BAR", "BAZ"]}""")

    writer, encoder, datum_writer = write_datum(datum_to_write, writers_schema)
    reader = StringIO(writer.getvalue())
    decoder = io.BinaryDecoder(reader)
    datum_reader = io.DatumReader(writers_schema, readers_schema)
    self.assertRaises(io.SchemaResolutionException, datum_reader.read, decoder)
Example 24
  def _write_lines(self,lines,fname):
    """
    Write the lines to an avro file named fname

    Parameters
    --------------------------------------------------------
    lines - list of strings to write
    fname - the name of the file to write to.
    """
    import avro.io as avio
    from avro.datafile import DataFileReader,DataFileWriter
    from avro import schema

    #recursively make all directories
    dparts=fname.split(os.sep)[:-1]
    for i in range(len(dparts)):
      pdir=os.sep+os.sep.join(dparts[:i+1])
      if not(os.path.exists(pdir)):
        os.mkdir(pdir)


    with file(fname,'w') as hf:
      inschema="""{"type":"string"}"""
      writer=DataFileWriter(hf,avio.DatumWriter(inschema),writers_schema=schema.parse(inschema))

      #encoder = avio.BinaryEncoder(writer)
      #datum_writer = avio.DatumWriter()
      for datum in lines:
        writer.append(datum)

      writer.close()
Example 25
  def test_container(self):
    writer = open('data.avro', 'wb')
    datum_writer = io.DatumWriter()
    schema_object = schema.parse("""\
{ "type": "record",
  "name": "Pair",
  "doc": "A pair of strings.",
  "fields": [
    {"name": "left", "type": "string"},
    {"name": "right", "type": "string"}
  ]
}
    """)
    dfw = datafile.DataFileWriter(writer, datum_writer, schema_object)
    datum = {'left':'L', 'right':'R'}
    dfw.append(datum)
    dfw.close()
    
    reader = open('data.avro', 'rb')
    datum_reader = io.DatumReader()
    dfr = datafile.DataFileReader(reader, datum_reader)
    data = []
    for datum in dfr:
      data.append(datum)
      
    self.assertEquals(1, len(data))
    self.assertEquals(datum, data[0])
Example 26
def check_skip_number(number_type):
  print_test_name('TEST SKIP %s' % number_type.upper())
  correct = 0
  for value_to_skip, hex_encoding in BINARY_ENCODINGS:
    VALUE_TO_READ = 6253
    print 'Value to Skip: %d' % value_to_skip

    # write the value to skip and a known value
    writers_schema = schema.parse('"%s"' % number_type.lower())
    writer, encoder, datum_writer = write_datum(value_to_skip, writers_schema)
    datum_writer.write(VALUE_TO_READ, encoder)

    # skip the value
    reader = StringIO(writer.getvalue())
    decoder = io.BinaryDecoder(reader)
    decoder.skip_long()

    # read data from string buffer
    datum_reader = io.DatumReader(writers_schema)
    read_value = datum_reader.read(decoder)

    print 'Read Value: %d' % read_value
    if read_value == VALUE_TO_READ: correct += 1
    print ''
  return correct
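BINARY_ENCODINGS pairs each value with the hex form of its zig-zag/varint encoding. A few representative entries of that shape (computed from the Avro wire format, not copied from the original fixture):

# (value, hex bytes of its zig-zag varint encoding);
# e.g. 64 zig-zags to 128, which encodes as 0x80 0x01.
SAMPLE_BINARY_ENCODINGS = (
    (0, '00'),
    (-1, '01'),
    (1, '02'),
    (64, '80 01'),
)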
Example 27
def _write_items(base_name, schema_str, items):
    avro_schema = schema.parse(schema_str)
    avro_file = base_name + '.avro'
    with DataFileWriter(open(avro_file, "wb"), DatumWriter(), avro_schema) as writer:
        for i in items:
            writer.append(i)
    return avro_file
Example 28
 def test_type_exception(self):
   print_test_name('TEST TYPE EXCEPTION')
   writers_schema = schema.parse("""\
     {"type": "record", "name": "Test",
      "fields": [{"name": "F", "type": "int"},
                 {"name": "E", "type": "int"}]}""")
   datum_to_write = {'E': 5, 'F': 'Bad'}
   self.assertRaises(io.AvroTypeException, write_datum, datum_to_write, writers_schema)
Example 29
 def test_schema_promotion(self):
   print_test_name('TEST SCHEMA PROMOTION')
   # note that checking writers_schema.type in read_data
   # allows us to handle promotion correctly
   promotable_schemas = ['"int"', '"long"', '"float"', '"double"']
   incorrect = 0
   for i, ws in enumerate(promotable_schemas):
     writers_schema = schema.parse(ws)
     datum_to_write = 219
     for rs in promotable_schemas[i + 1:]:
       readers_schema = schema.parse(rs)
       writer, enc, dw = write_datum(datum_to_write, writers_schema)
       datum_read = read_datum(writer, writers_schema, readers_schema)
       print 'Writer: %s Reader: %s' % (writers_schema, readers_schema)
       print 'Datum Read: %s' % datum_read
       if datum_read != datum_to_write: incorrect += 1
   self.assertEquals(incorrect, 0)
Example 30
 def test_null(self):
     schema_null = schema.parse('"null"')
     avro_io.check_schema(datum=None, schema=schema_null)
     try:
         avro_io.check_schema(datum=1, schema=schema_null)
         self.fail("Should have failed")
     except schema.AvroException as exn:
         pass
Example 31
  def testContextManager(self):
    file_path = self.NewTempFile()

    # Test the writer with a 'with' statement.
    with open(file_path, 'wb') as writer:
      datum_writer = io.DatumWriter()
      sample_schema, sample_datum = SCHEMAS_TO_VALIDATE[1]
      schema_object = schema.parse(sample_schema)
      with datafile.DataFileWriter(writer, datum_writer, schema_object) as dfw:
        dfw.append(sample_datum)
      self.assertTrue(writer.closed)

    # Test the reader with a 'with' statement.
    datums = []
    with open(file_path, 'rb') as reader:
      datum_reader = io.DatumReader()
      with datafile.DataFileReader(reader, datum_reader) as dfr:
        for datum in dfr:
          datums.append(datum)
      self.assertTrue(reader.closed)
Example 32
    def test_round_trip(self):
        print_test_name('TEST ROUND TRIP')
        correct = 0
        for example_schema, datum in SCHEMAS_TO_VALIDATE:
            print 'Schema: %s' % example_schema
            print 'Datum: %s' % datum

            writers_schema = schema.parse(example_schema)
            writer, encoder, datum_writer = write_datum(datum, writers_schema)
            round_trip_datum = read_datum(writer, writers_schema)

            print 'Round Trip Datum: %s' % round_trip_datum
            if isinstance(round_trip_datum, Decimal):
                round_trip_datum = round_trip_datum.to_eng_string()
                datum = str(datum)
            elif isinstance(round_trip_datum, datetime.datetime):
                datum = datum.astimezone(tz=timezones.utc)
            if datum == round_trip_datum:
                correct += 1
        self.assertEquals(correct, len(SCHEMAS_TO_VALIDATE))
Example 33
 def init_avro(self, output_path, part_id, schema_path):
     output_dir = None
     output_dirtmp = None  # Handle Avro Write Error
     if type(output_path) is str:
         output_dir = self.init_directory(output_path)
         output_dirtmp = self.init_directory(
             output_path + 'tmp')  # Handle Avro Write Error
     out_filename = '%(output_dir)s/part-%(part_id)s.avro' % \
                    {"output_dir": output_dir, "part_id": str(part_id)}
     out_filenametmp = '%(output_dirtmp)s/part-%(part_id)s.avro' % \
                       {"output_dirtmp": output_dirtmp, "part_id": str(part_id)}  # Handle Avro Write ERROR
     self.schema = open(schema_path, 'r').read()
     email_schema = schema.parse(self.schema)
     rec_writer = io.DatumWriter(email_schema)
     self.avro_writer = datafile.DataFileWriter(open(out_filename, 'wb'),
                                                rec_writer, email_schema)
     # CREATE A TEMP AvroWriter that can be used to workaround the UnicodeDecodeError
     # when writing into AvroStorage
     self.avro_writertmp = datafile.DataFileWriter(
         open(out_filenametmp, 'wb'), rec_writer, email_schema)
Example 34
    def test_default_value(self):
        print_test_name('TEST DEFAULT VALUE')
        writers_schema = LONG_RECORD_SCHEMA
        datum_to_write = LONG_RECORD_DATUM

        correct = 0
        for field_type, default_json, default_datum in DEFAULT_VALUE_EXAMPLES:
            readers_schema = schema.parse("""\
        {"type": "record", "name": "Test",
         "fields": [{"name": "H", "type": %s, "default": %s}]}
        """ % (field_type, default_json))
            datum_to_read = {'H': default_datum}

            writer, encoder, datum_writer = write_datum(
                datum_to_write, writers_schema)
            datum_read = read_datum(writer, writers_schema, readers_schema)
            print('Datum Read: %s' % datum_read)
            if datum_to_read == datum_read:
                correct += 1
        self.assertEqual(correct, len(DEFAULT_VALUE_EXAMPLES))
Example 35
    def __init__(self, scheme=None, outputClient=None):
        """

    Parameters
    ---------------------------------------------
    scheme - The schema for the datums to output - can be a JSON string
             or an instance of Schema
    outputClient - The output client used to send messages to the parent
    """

        if not (isinstance(scheme, schema.Schema)):
            scheme = schema.parse(scheme)

        if (outputClient is None):
            raise ValueError("output client can't be none.")

        self.scheme = scheme

        self.datum_writer = avro.io.DatumWriter(writers_schema=self.scheme)
        self.outputClient = outputClient
Example 36
  def test_round_trip(self):
    print('')
    print('TEST ROUND TRIP')
    print('===============')
    print('')
    correct = 0
    for i, (example_schema, datum) in enumerate(SCHEMAS_TO_VALIDATE):
      for codec in CODECS_TO_VALIDATE:
        print('')
        print('SCHEMA NUMBER %d' % (i + 1))
        print('================')
        print('')
        print('Schema: %s' % example_schema)
        print('Datum: %s' % datum)
        print('Codec: %s' % codec)

        # write data in binary to file 10 times
        writer = open(FILENAME, 'wb')
        datum_writer = io.DatumWriter()
        schema_object = schema.parse(example_schema)
        dfw = datafile.DataFileWriter(writer, datum_writer, schema_object, codec=codec)
        for i in range(10):
          dfw.append(datum)
        dfw.close()

        # read data in binary from file
        reader = open(FILENAME, 'rb')
        datum_reader = io.DatumReader()
        dfr = datafile.DataFileReader(reader, datum_reader)
        round_trip_data = []
        for read_datum in dfr:
          round_trip_data.append(read_datum)

        print('Round Trip Data: %s' % round_trip_data)
        print('Round Trip Data Length: %d' % len(round_trip_data))
        is_correct = [datum] * 10 == round_trip_data
        if is_correct: correct += 1
        print('Correct Round Trip: %s' % is_correct)
        print('')
    os.remove(FILENAME)
    self.assertEquals(correct, len(CODECS_TO_VALIDATE)*len(SCHEMAS_TO_VALIDATE))
Example 37
def generateMultiFieldsCaseStatements(avro_text, proto_schema_fields,
                                      module_name, version):

    proto_schema_fields_names_to_numbers_map = {}
    id_field_number = 0
    for field in proto_schema_fields:
        proto_schema_fields_names_to_numbers_map[field.name] = field.number
        if (field.name == "id"):
            id_field_number = field.number

    avro_schema = schema.parse(avro_text)
    multi_fields_tags_to_names_map = {}
    for field in avro_schema.fields:

        sc, parent = checkRecordSchema(field)
        if (sc != None and field.name != "metadata"):
            #print("multiField")
            #print(field)
            for structField in sc.fields:
                if structField.name == "VALUE":
                    v_name = field.name
                    #print("struct field name" + v_name)
                    fieldNum = proto_schema_fields_names_to_numbers_map.get(
                        field.name)
                    v = fieldNum
                    multi_fields_tags_to_names_map[v] = v_name
            #print("---------------------------------------------------------")

    output = "case VALUE_TAG \n"
    for key in multi_fields_tags_to_names_map:
        name = multi_fields_tags_to_names_map.get(key)
        cName = businessNameToCNameMap.get(name)
        output = output + "when '" + str(key) + "'  then '" + cName + "'\n"
    output = output + "END"
    print(output)
    name = "{0}_{1}_caseStatements.txt".format(module_name, version)

    #file = open(name,"w")
    #file.write(output)
    #file.close()
    return output
Example 38
def write_avro_file(args, outsource='args.avro'):
    SCHEMA = schema.parse(makeSchema(args))
    rec_writer = io.DatumWriter(SCHEMA)   
        
    if outsource == sys.stdout:
        df_writer = datafile.DataFileWriter(sys.stdout, rec_writer, 
                                        writers_schema = SCHEMA, codec = 'deflate')
    
    else:
        df_writer = datafile.DataFileWriter(open(outsource,'wb'), rec_writer, 
                                        writers_schema = SCHEMA, codec = 'deflate')
    data = {}
    count = 1
    data['size'] = len(args)
    for arg in args:
        if type(arg) == tuple:
            arg = tupleToList(arg)
        data["arg%s"%(count)] = arg
        count +=1
    df_writer.append(data)
    df_writer.close()
Example 39
def main(argv):
    valid = set()
    invalid_avro = set()
    invalid_json = set()

    if len(argv) < 3:
        print "Give me an avro schema file and a whitespace-separated list of json files to validate against it."
    else:
        schema = parse(open(argv[1]).read())
        for arg in argv[2:]:
            try:
                json = loads(open(arg, 'r').read())
                if validate(schema, json):
                    valid.add(arg)
                else:
                    invalid_avro.add(arg)
            except ValueError:
                invalid_json.add(arg)
    print 'Valid files:\n\t' + '\n\t'.join(valid)
    print 'Invalid avro:\n\t' + '\n\t'.join(invalid_avro)
    print 'Invalid json:\n\t' + '\n\t'.join(invalid_json)
Example 40
  def test_metadata(self):
    # Test the writer with a 'with' statement.
    writer = open(FILENAME, 'wb')
    datum_writer = io.DatumWriter()
    sample_schema, sample_datum = SCHEMAS_TO_VALIDATE[1]
    schema_object = schema.parse(sample_schema)
    with datafile.DataFileWriter(writer, datum_writer, schema_object) as dfw:
      dfw.set_meta('test.string', 'foo')
      dfw.set_meta('test.number', '1')
      dfw.append(sample_datum)
    self.assertTrue(writer.closed)

    # Test the reader with a 'with' statement.
    datums = []
    reader = open(FILENAME, 'rb')
    datum_reader = io.DatumReader()
    with datafile.DataFileReader(reader, datum_reader) as dfr:
      self.assertEquals('foo', dfr.get_meta('test.string'))
      self.assertEquals('1', dfr.get_meta('test.number'))
      for datum in dfr:
        datums.append(datum)
    self.assertTrue(reader.closed)
Example 41
def check_topic_key_schema_existence(SCHEMA_REGISTRY_URL, topic):
    try:
        # This is the second way of Getting Schema
        subject = topic + '-key'
        url = "{}/subjects/{}/versions".format(SCHEMA_REGISTRY_URL, subject),
        headers = {
            'Content-Type': 'application/vnd.schemaregistry.v1+json',
        }

        print "\nINFO: Making the API Call to SR"
        versions_response = requests.get(
            url="{}/subjects/{}/versions".format(SCHEMA_REGISTRY_URL, subject),
            headers={
                "Content-Type": "application/vnd.schemaregistry.v1+json",
            },
        )

        latest_version = versions_response.json()[-1]
        schema_response = requests.get(
            url="{}/subjects/{}/versions/{}".format(SCHEMA_REGISTRY_URL,
                                                    subject, latest_version),
            headers={
                "Content-Type": "application/vnd.schemaregistry.v1+json",
            },
        )

        key_schema_response_json = schema_response.json()

        print "\nINFO: Schema Found. Returning with Details"
        return schema.parse(key_schema_response_json["schema"])

    except Exception, e:
        print "\nWARN: Failed to get any Schema"
        print "\nINFO: Creating new by Calling save_new_key_schema_in_SR()"
        key_schema = save_new_key_schema_in_SR(SCHEMA_REGISTRY_URL, topic)
        # Here we are just preparing the schema and same will be sent with Producer mesg. If you try checking the key_schema in SR, you will fail as there will be NO ENTRIES.
        # Entry will happen when any message is written. ==> {u'message': u'Subject not found.', u'error_code': 40401}
        print "\nINFO: Schema Created. Returning with Details"
        return key_schema
Example 42
  def __init__(self, reader, datum_reader):
    self._reader = reader
    self._raw_decoder = io.BinaryDecoder(reader)
    self._datum_decoder = None # Maybe reset at every block.
    self._datum_reader = datum_reader
    
    # read the header: magic, meta, sync
    self._read_header()

    # ensure codec is valid
    self.codec = self.get_meta('avro.codec')
    if self.codec is None:
      self.codec = "null"
    if self.codec not in VALID_CODECS:
      raise DataFileException('Unknown codec: %s.' % self.codec)

    # get file length
    self._file_length = self.determine_file_length()

    # get ready to read
    self._block_count = 0
    self.datum_reader.writers_schema = schema.parse(self.get_meta(SCHEMA_KEY))
Example 43
  def test_round_trip(self):
    print_test_name('TEST ROUND TRIP')
    correct = 0
    for example_schema, datum in SCHEMAS_TO_VALIDATE:
      print 'Schema: %s' % example_schema
      print 'Datum: %s' % datum

      writers_schema = schema.parse(example_schema)
      writer, encoder, datum_writer = write_datum(datum, writers_schema)
      round_trip_datum = read_datum(writer, writers_schema)
      if example_schema == '{"type": "long", "logicalType": "timestamp-micros"}' and isinstance(datum, (int, long)):
          timedelta = datetime.timedelta(microseconds=datum)
          unix_epoch_datetime = datetime.datetime(1970, 1, 1, 0, 0, 0, 0)
          datum = unix_epoch_datetime + timedelta
      elif example_schema == '{"type": "long", "logicalType": "timestamp-millis"}' and isinstance(datum, (int, long)):
          timedelta = datetime.timedelta(microseconds=datum * 1000)
          unix_epoch_datetime = datetime.datetime(1970, 1, 1, 0, 0, 0, 0)
          datum = unix_epoch_datetime + timedelta
      print 'Round Trip Datum: %s' % round_trip_datum
      self.assertEquals(datum, round_trip_datum)
      if datum == round_trip_datum: correct += 1
    self.assertEquals(correct, len(SCHEMAS_TO_VALIDATE))
Example 44
  def __init__(self, writer, datum_writer, writers_schema=None, codec='null'):
    """
    If the schema is not present, presume we're appending.

    @param writer: File-like object to write into.
    """
    self._writer = writer
    self._encoder = io.BinaryEncoder(writer)
    self._datum_writer = datum_writer
    self._buffer_writer = StringIO()
    self._buffer_encoder = io.BinaryEncoder(self._buffer_writer)
    self._block_count = 0
    self._meta = {}
    self._header_written = False

    if writers_schema is not None:
      if codec not in VALID_CODECS:
        raise DataFileException("Unknown codec: %r" % codec)
      self._sync_marker = DataFileWriter.generate_sync_marker()
      self.set_meta('avro.codec', codec)
      self.set_meta('avro.schema', str(writers_schema))
      self.datum_writer.writers_schema = writers_schema
    else:
      # open writer for reading to collect metadata
      dfr = DataFileReader(writer, io.DatumReader())
      
      # TODO(hammer): collect arbitrary metadata
      # collect metadata
      self._sync_marker = dfr.sync_marker
      self.set_meta('avro.codec', dfr.get_meta('avro.codec'))

      # get schema used to write existing file
      schema_from_file = dfr.get_meta('avro.schema')
      self.set_meta('avro.schema', schema_from_file)
      self.datum_writer.writers_schema = schema.parse(schema_from_file)

      # seek to the end of the file and prepare for writing
      writer.seek(0, 2)
      self._header_written = True
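The schema-less branch above is what enables append mode: the constructor opens the file it was handed as a DataFileReader, recovers the codec, schema, and sync marker from the header, then seeks to the end. Typical usage, assuming an existing container file (it must be opened readable as well as writable):

from avro import datafile, io

# Append more records to an existing Avro container file; the writer's schema
# and sync marker are read back from the file's own header.
writer = open('data.avro', 'a+b')
dfw = datafile.DataFileWriter(writer, io.DatumWriter())
dfw.append({'left': 'L2', 'right': 'R2'})  # datum shape borrowed from the Pair example above
dfw.close()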
Example 45
 def testUnionSchemaSpecificity(self):
     union_schema = schema.parse("""
     [{
      "type" : "record",
      "name" : "A",
      "fields" : [{"name" : "foo", "type" : ["string", "null"]}]
     },
     {
      "type" : "record",
      "name" : "B",
      "fields" : [{"name" : "bar", "type" : ["string", "null"]}]
     },
     {
      "type" : "record",
      "name" : "AOrB",
      "fields" : [{"name" : "entity", "type" : ["A", "B"]}]
     }]
 """)
     sch = {s.name: s for s in union_schema.schemas}.get('AOrB')
     datum_to_read = {'entity': {'foo': 'this is an instance of schema A'}}
     writer, encoder, datum_writer = write_datum(datum_to_read, sch)
     datum_read = read_datum(writer, sch, sch)
     self.assertEqual(datum_to_read, datum_read)
Example 46
    def test_round_trip(self):
        print_test_name('TEST ROUND TRIP')
        correct = 0

        def are_equal(datum, round_trip_datum):
            if datum != round_trip_datum:
                return False
            if type(datum) == bool:
                return type(round_trip_datum) == bool
            else:
                return True

        for example_schema, datum in SCHEMAS_TO_VALIDATE:
            print 'Schema: %s' % example_schema
            print 'Datum: %s' % datum

            writers_schema = schema.parse(example_schema)
            writer, encoder, datum_writer = write_datum(datum, writers_schema)
            round_trip_datum = read_datum(writer, writers_schema)

            print 'Round Trip Datum: %s' % round_trip_datum
            if are_equal(datum, round_trip_datum): correct += 1
        self.assertEquals(correct, len(SCHEMAS_TO_VALIDATE))
Example 47
def init_avro(output_path, part_id, schema_path):
    print("************* init_avro ***************")
    output_dir = None
    output_dirtmp = None  # Handle Avro Write Error
    if (type(output_path) is str):
        output_dir = init_directory(output_path)
        output_dirtmp = init_directory(output_path +
                                       'tmp')  # Handle Avro Write Error
    out_filename = '%(output_dir)s/part-%(part_id)s.avro' % \
      {"output_dir": output_dir, "part_id": str(part_id)}
    out_filenametmp = '%(output_dirtmp)s/part-%(part_id)s.avro' % \
      {"output_dirtmp": output_dirtmp, "part_id": str(part_id)}  # Handle Avro Write Error
    schemas = open(schema_path, 'r').read()
    email_schema = schema.parse(schemas)

    rec_writer = io.DatumWriter(email_schema)
    avro_writer = datafile.DataFileWriter(open(out_filename, 'wb'), rec_writer,
                                          email_schema)
    # CREATE A TEMP AvroWriter that can be used to workaround the UnicodeDecodeError when writing into AvroStorage
    avro_writertmp = datafile.DataFileWriter(open(out_filenametmp, 'wb'),
                                             rec_writer, email_schema)
    print("*************end init_avro ***************")
    return avro_writer, avro_writertmp
Example 48
  def testMetadata(self):
    file_path = self.NewTempFile()

    # Test the writer with a 'with' statement.
    with open(file_path, 'wb') as writer:
      datum_writer = io.DatumWriter()
      sample_schema, sample_datum = SCHEMAS_TO_VALIDATE[1]
      schema_object = schema.parse(sample_schema)
      with datafile.DataFileWriter(writer, datum_writer, schema_object) as dfw:
        dfw.SetMeta('test.string', 'foo')
        dfw.SetMeta('test.number', '1')
        dfw.append(sample_datum)
      self.assertTrue(writer.closed)

    # Test the reader with a 'with' statement.
    datums = []
    with open(file_path, 'rb') as reader:
      datum_reader = io.DatumReader()
      with datafile.DataFileReader(reader, datum_reader) as dfr:
        self.assertEqual(b'foo', dfr.GetMeta('test.string'))
        self.assertEqual(b'1', dfr.GetMeta('test.number'))
        for datum in dfr:
          datums.append(datum)
      self.assertTrue(reader.closed)
Example 49
 def generate_avro_file(cls, schema_str: str, out_file,
                        num_rows: int) -> str:
     """Creates an avro file and saves to tmp folder to be used by test cases
     :param schema_str: valid avro schema as a string
     :param out_file: name of file to be created
     :param num_rows: number of rows to be generated
     :return: string with path to the file created
     """
     filename = os.path.join(TMP_FOLDER, out_file + "." + cls.filetype)
     parsed_schema = schema.parse(schema_str)
     rec_writer = io.DatumWriter(parsed_schema)
     file_writer = datafile.DataFileWriter(open(filename, "wb"), rec_writer,
                                           parsed_schema)
     for _ in range(num_rows):
         data = {}
         data["name"] = "".join(
             random.choice(string.ascii_letters) for i in range(10))
         data["age"] = randrange(-100, 100)
         data["address"] = random.uniform(1.1, 100.10)
         data["street"] = random.uniform(1.1, 100.10)
         data["valid"] = random.choice([True, False])
         file_writer.append(data)
     file_writer.close()
     return filename
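The schema_str arguments come from the calling test cases and are not shown here; a schema consistent with the fields populated in the loop above might look like this (illustrative only):

SAMPLE_SCHEMA_STR = """
{
  "type": "record",
  "name": "sample_record",
  "fields": [
    {"name": "name", "type": "string"},
    {"name": "age", "type": "int"},
    {"name": "address", "type": "double"},
    {"name": "street", "type": "double"},
    {"name": "valid", "type": "boolean"}
  ]
}
"""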
Example 50
  def test_context_manager(self):
    # Context manager was introduced as a first class
    # member only in Python 2.6 and above.
    import sys
    if sys.version_info < (2,6):
      print 'Skipping context manager tests on this Python version.'
      return
    # Test the writer with a 'with' statement.
    writer = open(FILENAME, 'wb')
    datum_writer = io.DatumWriter()
    sample_schema, sample_datum = SCHEMAS_TO_VALIDATE[1]
    schema_object = schema.parse(sample_schema)
    with datafile.DataFileWriter(writer, datum_writer, schema_object) as dfw:
      dfw.append(sample_datum)
    self.assertTrue(writer.closed)

    # Test the reader with a 'with' statement.
    datums = []
    reader = open(FILENAME, 'rb')
    datum_reader = io.DatumReader()
    with datafile.DataFileReader(reader, datum_reader) as dfr:
      for datum in dfr:
        datums.append(datum)
    self.assertTrue(reader.closed)
Example 51
from avro import schema, datafile, io
import pprint

OUTFILE_NAME = 'output/product.avro'
INPUT_SCHEMA_NAME = 'product.avsc'

fo = open(INPUT_SCHEMA_NAME, "r+")
SCHEMA_STR = fo.read()
print "Read String is : ", SCHEMA_STR
fo.close()

SCHEMA = schema.parse(SCHEMA_STR)

rec_writer = io.DatumWriter(SCHEMA)

df_writer = datafile.DataFileWriter(open(OUTFILE_NAME, 'wb'),
                                    rec_writer,
                                    writers_schema=SCHEMA)

df_writer.append({
    "product_id": 1000,
    "product_name": "Hugo Boss XY",
    "product_description": "Hugo Xy Men 100 ml",
    "product_status": "AVAILABLE",
    "product_category": ["fragrance", "perfume"],
    "price": 10.35,
    "product_hash": "XY123"
})

df_writer.append({
    "product_id": 1001,
Example 52
def test_view_avro():
    cluster = pseudo_hdfs4.shared_cluster()
    try:
        c = make_logged_in_client()
        cluster.fs.setuser(cluster.superuser)
        if cluster.fs.isdir("/test-avro-filebrowser"):
            cluster.fs.rmtree('/test-avro-filebrowser/')

        cluster.fs.mkdir('/test-avro-filebrowser/')

        test_schema = schema.parse("""
      {
        "name": "test",
        "type": "record",
        "fields": [
          { "name": "name", "type": "string" },
          { "name": "integer", "type": "int" }
        ]
      }
    """)

        f = cluster.fs.open('/test-avro-filebrowser/test-view.avro', "w")
        data_file_writer = datafile.DataFileWriter(f,
                                                   io.DatumWriter(),
                                                   writers_schema=test_schema,
                                                   codec='deflate')
        dummy_datum = {
            'name': 'Test',
            'integer': 10,
        }
        data_file_writer.append(dummy_datum)
        data_file_writer.close()

        # autodetect
        response = c.get(
            '/filebrowser/view/test-avro-filebrowser/test-view.avro')
        # (Note: we use eval here because of an incompatibility between the
        # string representation of JSON dicts in simplejson vs. json)
        assert_equal(eval(response.context['view']['contents']), dummy_datum)

        # offsetting should work as well
        response = c.get(
            '/filebrowser/view/test-avro-filebrowser/test-view.avro?offset=1')
        assert_equal('avro', response.context['view']['compression'])

        f = cluster.fs.open('/test-avro-filebrowser/test-view2.avro', "w")
        f.write("hello")
        f.close()

        # we shouldn't autodetect non avro files
        response = c.get(
            '/filebrowser/view/test-avro-filebrowser/test-view2.avro')
        assert_equal(response.context['view']['contents'], "hello")

        # we should fail to do a bad thing if they specify compression when it's not set.
        response = c.get(
            '/filebrowser/view/test-avro-filebrowser/test-view2.avro?compression=gzip'
        )
        assert_true('Failed to decompress' in response.context['message'])

    finally:
        try:
            cluster.fs.rmtree('/test-avro-filebrowser/')
        except:
            pass  # Don't let cleanup errors mask earlier failures
Example 53
def test_view_snappy_compressed_avro():
    if not snappy_installed():
        raise SkipTest
    import snappy

    cluster = pseudo_hdfs4.shared_cluster()
    finish = []
    try:
        c = make_logged_in_client()
        cluster.fs.setuser(cluster.superuser)
        if cluster.fs.isdir("/test-snappy-avro-filebrowser"):
            cluster.fs.rmtree('/test-snappy-avro-filebrowser/')

        cluster.fs.mkdir('/test-snappy-avro-filebrowser/')

        test_schema = schema.parse("""
      {
        "name": "test",
        "type": "record",
        "fields": [
          { "name": "name", "type": "string" },
          { "name": "integer", "type": "int" }
        ]
      }
    """)

        # Cannot use StringIO with datafile writer!
        f = cluster.fs.open('/test-snappy-avro-filebrowser/test-view.avro',
                            "w")
        data_file_writer = datafile.DataFileWriter(f,
                                                   io.DatumWriter(),
                                                   writers_schema=test_schema,
                                                   codec='deflate')
        dummy_datum = {
            'name': 'Test',
            'integer': 10,
        }
        data_file_writer.append(dummy_datum)
        data_file_writer.close()

        fh = cluster.fs.open('/test-snappy-avro-filebrowser/test-view.avro',
                             'r')
        f = cluster.fs.open(
            '/test-snappy-avro-filebrowser/test-view.compressed.avro', "w")
        f.write(snappy.compress(fh.read()))
        f.close()
        fh.close()

        # Snappy compressed fail
        response = c.get(
            '/filebrowser/view/test-snappy-avro-filebrowser/test-view.avro?compression=snappy_avro'
        )
        assert_true('Failed to decompress' in response.context['message'],
                    response)

        # Snappy compressed succeed
        response = c.get(
            '/filebrowser/view/test-snappy-avro-filebrowser/test-view.compressed.avro'
        )
        assert_equal('snappy_avro', response.context['view']['compression'])
        assert_equal(eval(response.context['view']['contents']), dummy_datum,
                     response)
        response = c.get(
            '/filebrowser/view/test-snappy-avro-filebrowser/test-view.compressed.avro?compression=snappy_avro'
        )
        assert_equal('snappy_avro', response.context['view']['compression'])
        assert_equal(eval(response.context['view']['contents']), dummy_datum,
                     response)

        # Avro should also decompress snappy
        response = c.get(
            '/filebrowser/view/test-snappy-avro-filebrowser/test-view.compressed.avro?compression=avro'
        )
        assert_equal('snappy_avro', response.context['view']['compression'])
        assert_equal(eval(response.context['view']['contents']), dummy_datum,
                     response)

        # Largest snappy compressed file
        finish.append(MAX_SNAPPY_DECOMPRESSION_SIZE.set_for_testing(1))
        response = c.get(
            '/filebrowser/view/test-snappy-avro-filebrowser/test-view.avro?compression=snappy_avro'
        )
        assert_true(
            'File size is greater than allowed max snappy decompression size of 1'
            in response.context['message'], response)

    finally:
        for done in finish:
            done()
        try:
            cluster.fs.rmtree('/test-snappy-avro-filebrowser/')
        except:
            pass  # Don't let cleanup errors mask earlier failures
Example 54
 def test_correct_recursive_extraction(self):
   s = schema.parse('{"type": "record", "name": "X", "fields": [{"name": "y", "type": {"type": "record", "name": "Y", "fields": [{"name": "Z", "type": "X"}]}}]}')
   t = schema.parse(str(s.fields[0].type))
   # If we've made it this far, the subschema was reasonably stringified; it could be reparsed.
   self.assertEqual("X", t.fields[0].type.name)
Example 55
def _load_schema(path):
    return _schema.parse(open(path).read())
Example 56
def test_allowed_operations():
  fst = schema.parse(open("%s/MyRecord.base.avsc" % BASE_DIR).read())
  sec = schema.parse(open("%s/MyRecord.good.avsc" % BASE_DIR).read())
  validator.check([fst, sec])
Example 57
  ('"float"', '1.1', 1.1),
  ('"double"', '1.1', 1.1),
  ('{"type": "fixed", "name": "F", "size": 2}', '"\u00FF\u00FF"', u'\xff\xff'),
  ('{"type": "enum", "name": "F", "symbols": ["FOO", "BAR"]}', '"FOO"', 'FOO'),
  ('{"type": "array", "items": "int"}', '[1, 2, 3]', [1, 2, 3]),
  ('{"type": "map", "values": "int"}', '{"a": 1, "b": 2}', {'a': 1, 'b': 2}),
  ('["int", "null"]', '5', 5),
  ('{"type": "record", "name": "F", "fields": [{"name": "A", "type": "int"}]}',
   '{"A": 5}', {'A': 5}),
)

LONG_RECORD_SCHEMA = schema.parse("""\
  {"type": "record",
   "name": "Test",
   "fields": [{"name": "A", "type": "int"},
              {"name": "B", "type": "int"},
              {"name": "C", "type": "int"},
              {"name": "D", "type": "int"},
              {"name": "E", "type": "int"},
              {"name": "F", "type": "int"},
              {"name": "G", "type": "int"}]}""")

LONG_RECORD_DATUM = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}

def avro_hexlify(reader):
  """Return the hex value, as a string, of a binary-encoded int or long."""
  bytes = []
  current_byte = reader.read(1)
  bytes.append(hexlify(current_byte))
  while (ord(current_byte) & 0x80) != 0:
    current_byte = reader.read(1)
    bytes.append(hexlify(current_byte))
  return ' '.join(bytes)
Example 58
    def __init__(self, inschema=None, midschema=None, outschema=None):
        """

    Parameters
    ---------------------------------------------------------
    inschema - The schema for the input to the mapper
    midschema - The schema for the output of the mapper
    outschema - The schema for the output of the reducer

    An example scheme for the prototypical word count example would be
    inscheme='{"type":"record", "name":"Pair","namespace":"org.apache.avro.mapred","fields":[
              {"name":"key","type":"string"},
              {"name":"value","type":"long","order":"ignore"}]
              }'

    Important: The records are split into (key,value) pairs as required by map reduce
    by using all fields with "order"=ignore for the key and the remaining fields for the value.

    The subclass provides these schemas in order to tell this class which schemas it expects.
    The configure request will also provide the schemas that the parent process is using.
    This allows us to check whether the schemas match and if not whether we can resolve
    the differences (see https://avro.apache.org/docs/current/spec.html#Schema+Resolution)

    """

        if (inschema is None):
            raise ValueError("inschema can't be None")

        if (midschema is None):
            raise ValueError("midschema can't be None")

        if (outschema is None):
            raise ValueError("outschema can't be None")

        # make sure we can parse the schemas
        # Should we call fail if we can't parse the schemas?
        self.inschema = schema.parse(inschema)
        self.midschema = schema.parse(midschema)
        self.outschema = schema.parse(outschema)

        # declare various variables
        self.clienTransciever = None

        # output client is used to communicate with the parent process
        # in particular to transmit the outputs of the mapper and reducer
        self.outputClient = None

        # collectors for the output of the mapper and reducer
        self.midCollector = None
        self.outCollector = None

        self._partitions = None

        # cache a list of the fields used by the reducer as the keys
        # we need the fields to decide when we have finished processing all values for
        # a given key. We cache the fields to be more efficient
        self._red_fkeys = None

        # We need to keep track of the previous record fed to the reducer
        # b\c we need to be able to determine when we start processing a new group
        # in the reducer
        self.midRecord = None

        # create an event object to signal when
        # http server is ready to be shutdown
        self.ready_for_shutdown = threading.Event()
        self.log = logging.getLogger("TetherTask")
Example 59
 def __init__(self, avro_schema_file, avro_data_file):
     self.avro_data_file = avro_data_file
     self.schema = parse(open(avro_schema_file, "rb").read())
Example 60
from avro import schema
from avro import io

#
# Constants
#

VERSION = 1
MAGIC = 'Obj' + chr(VERSION)
MAGIC_SIZE = len(MAGIC)
SYNC_SIZE = 16
SYNC_INTERVAL = 1000 * SYNC_SIZE  # TODO(hammer): make configurable
META_SCHEMA = schema.parse("""\
{"type": "record", "name": "org.apache.avro.file.Header",
 "fields" : [
   {"name": "magic", "type": {"type": "fixed", "name": "magic", "size": %d}},
   {"name": "meta", "type": {"type": "map", "values": "bytes"}},
   {"name": "sync", "type": {"type": "fixed", "name": "sync", "size": %d}}]}
""" % (MAGIC_SIZE, SYNC_SIZE))
VALID_CODECS = ['null', 'deflate']
VALID_ENCODINGS = ['binary']  # not used yet

CODEC_KEY = "avro.codec"
SCHEMA_KEY = "avro.schema"

#
# Exceptions
#


class DataFileException(schema.AvroException):