def test_other_attributes(self):
    """Check that custom schema properties survive a str()/parse round trip.

    Every schema in OTHER_PROP_EXAMPLES is parsed, rendered with str() and
    parsed again; both parses must expose equal ``other_props``.  For record
    schemas, every field must carry custom properties as well.  The property
    values gathered along the way are then checked against the Python type
    expected for their key.
    """
    expected_types = {
        "cp_boolean": bool,
        "cp_int": int,
        "cp_object": dict,
        "cp_float": float,
        "cp_array": list,
    }
    schemas_with_props = 0
    collected = {}

    for example in OTHER_PROP_EXAMPLES:
        parsed = schema.parse(example.schema_string)
        reparsed = schema.parse(str(parsed))
        self.assertEqual(parsed.other_props, reparsed.other_props)

        if parsed.type == "record":
            fields_with_props = 0
            for field in parsed.fields:
                if field.other_props:
                    collected.update(field.other_props)
                    fields_with_props += 1
            # Every field of a record example must define custom properties.
            self.assertEqual(fields_with_props, len(parsed.fields))

        if parsed.other_props:
            collected.update(parsed.other_props)
            schemas_with_props += 1

    for key, value in collected.items():
        if key in expected_types:
            self.assertEqual(type(value), expected_types[key])

    self.assertEqual(schemas_with_props, len(OTHER_PROP_EXAMPLES))
def test_parse(self):
    """Check that valid schemas parse and invalid schemas are rejected.

    Each entry of EXAMPLES carries a ``schema_string`` and a ``valid`` flag.
    A valid schema that fails to parse, or an invalid schema that parses,
    fails the test immediately.
    """
    correct = 0
    for iexample, example in enumerate(EXAMPLES):
        logging.debug('Testing example #%d\n%s', iexample, example.schema_string)
        try:
            schema.parse(example.schema_string)
        except Exception as exn:
            if example.valid:
                self.fail(
                    'Valid schema failed to parse: %r\n%s'
                    % (example.schema_string, traceback.format_exc()))
            # Full traceback only at very verbose log levels.
            if logging.getLogger().getEffectiveLevel() <= 5:
                logging.debug('Expected error:\n%s', traceback.format_exc())
            else:
                logging.debug('Expected error: %r', exn)
        else:
            # Checked outside the ``try`` so that the AssertionError raised
            # by fail() cannot be caught by the handler above and be
            # miscounted as an expected parse error.
            if not example.valid:
                self.fail('Invalid schema was parsed:\n%s' % example.schema_string)
        correct += 1

    self.assertEqual(
        correct,
        len(EXAMPLES),
        'Parse behavior correct on %d out of %d schemas.' % (correct, len(EXAMPLES)),
    )
def testreadfiles(self): origschm = schema.parse(open("src/test/schemata/interop.avsc").read()) for file in os.listdir(_DATAFILE_DIR): print "Validating:", file.__str__() dr = io.DataFileReader(open(_DATAFILE_DIR+file, "rb"), self.__datumreader()) count = int(dr.getmeta("count")) decodedSchm = schema.parse(dr.getmeta("schema")) self.assertEquals(origschm, decodedSchm) for i in range(0,count): datum = dr.next() self.assertTrue(self.__validator(origschm, datum)) # validate reading of blocking arrays, blocking maps for file in os.listdir(_BLOCKINGFILE_DIR): print "Validating:", file.__str__() reader = open(_BLOCKINGFILE_DIR+file, "rb") decoder = io.Decoder(reader) dreader = self.__datumreader() dreader.setschema(origschm) count = int(decoder.readlong()) #metadata:the count of objects in the file blockcount = decoder.readlong() for i in range(0,count): while blockcount == 0: blockcount = decoder.readlong() blockcount -= 1 datum = dreader.read(decoder) self.assertTrue(self.__validator(origschm, datum))
def test_equivalence_after_round_trip(self):
    """
    1. Given a string, parse it to get Avro schema "original".
    2. Serialize "original" to a string and parse that string
       to generate Avro schema "round trip".
    3. Ensure "original" and "round trip" schemas are equivalent.

    A per-example SUCCESS/FAILURE line is printed; the final assertion
    requires every example in VALID_EXAMPLES to succeed.
    """
    print_test_name('TEST ROUND TRIP')
    correct = 0
    for example in VALID_EXAMPLES:
        try:
            original_schema = schema.parse(example.schema_string)
            round_trip_schema = schema.parse(str(original_schema))
            if original_schema == round_trip_schema:
                correct += 1
                debug_msg = "%s: ROUND TRIP SUCCESS" % example.name
            else:
                debug_msg = "%s: ROUND TRIP FAILURE" % example.name
        except Exception:
            # Parse errors count as a failed round trip and are reported by
            # the final assertion.  KeyboardInterrupt/SystemExit are no
            # longer swallowed here.
            debug_msg = "%s: ROUND TRIP FAILURE" % example.name
        print(debug_msg)

    fail_msg = "Round trip success on %d out of %d schemas" % \
        (correct, len(VALID_EXAMPLES))
    self.assertEqual(correct, len(VALID_EXAMPLES), fail_msg)
def test_name(self):
    """Primitive and array schemas report their type as name and fullname."""
    cases = (
        ("int", "\"int\""),
        ("array", """{"type": "array", "items": "int"}"""),
    )
    for expected, schema_json in cases:
        parsed = schema.parse(schema_json)
        self.assertEqual(expected, parsed.name)
        self.assertEqual(expected, parsed.fullname)
def checkdefault(self, schemajson, defaultjson, defaultvalue):
    """Check that a field default is applied when the writer lacks the field.

    Args:
      schemajson: JSON text of the field's schema (also run through check()).
      defaultjson: JSON text of the field's default value.
      defaultvalue: the value expected for field "f" after reading.
    """
    self.check(schemajson)

    # Schema with no fields: nothing is read from the (empty) input.
    empty_record_json = '{"type":"record", "name":"Foo",' + '"fields":[]}'
    # Same record with a single defaulted field "f".
    defaulted_record_json = "".join([
        '{"type":"record", "name":"Foo",',
        '"fields":[{"name":"f", ',
        '"type":', schemajson, ', ',
        '"default":', defaultjson, '}]}',
    ])

    actual = schema.parse(empty_record_json)
    expected = schema.parse(defaulted_record_json)
    reader = genericio.DatumReader(actual, expected)
    empty_input = io.Decoder(cStringIO.StringIO())
    record = reader.read(empty_input)
    self.assertEquals(defaultvalue, record.get("f"))
def test_exception_is_not_swallowed_on_parse_error(self):
    """Parsing non-JSON text must raise SchemaParseException with its message.

    The test fails if no exception is raised at all; previously the
    ``caught_exception`` flag was set but never checked.
    """
    print_test_name('TEST EXCEPTION NOT SWALLOWED ON PARSE ERROR')
    caught_exception = False
    try:
        schema.parse('/not/a/real/file')
    except schema.SchemaParseException as e:
        expected_message = 'Error parsing JSON: /not/a/real/file, error = ' \
                           'No JSON object could be decoded'
        self.assertEqual(expected_message, e.args[0])
        caught_exception = True
    self.assertTrue(caught_exception)
def test_sanity():
    """
    Ensures that our "base" and "good" schemas are actually forwards- and
    backwards-compatible.

    One record is encoded with each schema and then decoded with the other
    schema as reader; the test passes when neither read raises.
    """
    # fst schema / record
    with open("%s/MyRecord.base.avsc" % BASE_DIR) as base_file:
        fst_schema = schema.parse(base_file.read())
    fst_writer = DatumWriter(writers_schema=fst_schema)
    fst_record = {
        "fieldWithoutDefaultValue": 0,
        "properField": 0,
        "enumField": "A",
        "unionField": None,
        "arrayField": ["world"],
        "mapField": {"hello": "world"},
        "fixedField": "aaaaaaaaaaaaaaaa"
    }

    # sec schema / record
    with open("%s/MyRecord.good.avsc" % BASE_DIR) as good_file:
        sec_schema = schema.parse(good_file.read())
    sec_writer = DatumWriter(writers_schema=sec_schema)
    sec_record = {
        "fieldWithoutDefaultValue": 0,
        "properField2": 0,
        "enumField": "B",
        "unionField": None,
        "arrayField": ["world"],
        "fixedField": "bbbbbbbbbbbbbbbb"
    }

    # Encode record w/ fst
    fst_buf = StringIO.StringIO()
    fst_encoder = BinaryEncoder(fst_buf)
    fst_writer.write(fst_record, fst_encoder)
    fst_data = fst_buf.getvalue()

    # Encode record w/ sec
    sec_buf = StringIO.StringIO()
    sec_encoder = BinaryEncoder(sec_buf)
    sec_writer.write(sec_record, sec_encoder)
    sec_data = sec_buf.getvalue()

    # writers == fst, readers == sec
    sec_reader = DatumReader(writers_schema=fst_schema, readers_schema=sec_schema)
    sec_decoder = BinaryDecoder(StringIO.StringIO(fst_data))
    sec_reader.read(sec_decoder)  # no exception -> good

    # writers == sec, readers == fst
    fst_reader = DatumReader(writers_schema=sec_schema, readers_schema=fst_schema)
    fst_decoder = BinaryDecoder(StringIO.StringIO(sec_data))
    fst_reader.read(fst_decoder)  # no exception -> good
def check(self, string):
    """Run the standard battery of checks for one schema given as JSON text.

    Verifies that the schema renders back to the same text (ignoring
    spaces), that equal text parses to equal schemas, that hashing
    terminates, and then runs the serialization and data file checks.
    """
    parsed = schema.parse(string)
    rendered = schema.stringval(parsed)
    self.assertEquals(string.replace(" ", ""), rendered.replace(" ", ""))

    # test __eq__
    self.assertEquals(parsed, schema.parse(string))

    # test hashcode doesn't generate infinite recursion
    parsed.__hash__()

    sample = self.__random(parsed)
    for _ in range(9):
        self.checkser(parsed, sample)
    self.checkdatafile(parsed)
def _check(fst_name, sec_name):
    """
    Tests evolution from schema named MyRecord.{fst_name}.avsc to
    schema named MyRecord.{sec_name}.avsc in BASE_DIR.

    Both schema files are parsed and handed to validator.check() as a
    two-element list.
    """
    with open("%s/MyRecord.%s.avsc" % (BASE_DIR, fst_name)) as fst_file:
        fst = schema.parse(fst_file.read())
    with open("%s/MyRecord.%s.avsc" % (BASE_DIR, sec_name)) as sec_file:
        sec = schema.parse(sec_file.read())

    try:
        validator.check([fst, sec])
    except Exception:
        # NOTE(review): errors from the validator are deliberately ignored
        # here (the original handler body was the bare string "good"), so
        # this helper can never fail on the validator's verdict.  If callers
        # expect a rejection, an explicit failure should be added after
        # validator.check() -- confirm the intent before changing.
        pass
def test_valid_cast_to_string_after_parse(self):
    """
    Test that the string generated by an Avro Schema object
    is, in fact, a valid Avro schema.

    Any parse error propagates and fails the test directly.
    """
    reparsed_ok = 0
    for example in VALID_EXAMPLES:
        rendered = str(schema.parse(example.schema_string))
        schema.parse(rendered)
        reparsed_ok += 1

    total = len(VALID_EXAMPLES)
    fail_msg = "Cast to string success on %d out of %d schemas" % \
        (reparsed_ok, total)
    self.assertEqual(reparsed_ok, total, fail_msg)
def test_unknown_symbol(self):
    """Reading an enum symbol the reader schema lacks must fail resolution.

    'FOO' is written with an enum of FOO/BAR and read with an enum of
    BAR/BAZ; the read is expected to raise SchemaResolutionException.
    """
    writer_schema = schema.parse("""\
      {"type": "enum", "name": "Test",
       "symbols": ["FOO", "BAR"]}""")
    reader_schema = schema.parse("""\
      {"type": "enum", "name": "Test",
       "symbols": ["BAR", "BAZ"]}""")

    encoded, _encoder, _datum_writer = write_datum('FOO', writer_schema)
    decoder = avro_io.BinaryDecoder(io.BytesIO(encoded.getvalue()))
    resolving_reader = avro_io.DatumReader(writer_schema, reader_schema)

    self.assertRaises(
        avro_io.SchemaResolutionException, resolving_reader.read, decoder)
def test_schema_promotion(self):
    """A number written with a narrower type reads back under a wider one.

    For every pair (writer, reader) where the reader type comes later in
    int < long < float < double, the value 219 must read back unchanged.
    """
    # note that checking writer_schema.type in read_data
    # allows us to handle promotion correctly
    promotable_schemas = ['"int"', '"long"', '"float"', '"double"']
    datum_to_write = 219
    mismatches = 0

    for position, writer_json in enumerate(promotable_schemas):
        writer_schema = schema.parse(writer_json)
        wider_types = promotable_schemas[position + 1:]
        for reader_json in wider_types:
            reader_schema = schema.parse(reader_json)
            encoded, _enc, _dw = write_datum(datum_to_write, writer_schema)
            datum_read = read_datum(encoded, writer_schema, reader_schema)
            logging.debug('Writer: %s Reader: %s', writer_schema, reader_schema)
            logging.debug('Datum Read: %s', datum_read)
            if datum_read != datum_to_write:
                mismatches += 1

    self.assertEqual(mismatches, 0)
def process_files(output_path, hdfs_path, batch):
    """Process all files in batch and produce an avro file.

    A timestamped, deflate-compressed Avro file is written under
    ``output_path`` with one record per input file (a fresh UUID, the file
    path and the base64-encoded content), then handed to hdfs_put().

    Args:
      output_path: prefix the output file name is appended to as-is.
      hdfs_path: destination passed through to hdfs_put().
      batch: iterable of paths of the files to pack.
    """
    now = datetime.datetime.now()
    ts = now.strftime("%Y-%m-%d-%H-%M-%S-%f")
    output_filename = FILE_PREFIX + "-" + ts + '.avro'
    print("* creating new avro file: " + output_filename)

    with open(SCHEMA_FILE) as schema_file:
        xschema = schema.parse(schema_file.read())

    rec_writer = io.DatumWriter(xschema)
    df_writer = datafile.DataFileWriter(
        open(output_path + output_filename, 'wb'),
        rec_writer,
        writers_schema=xschema,
        codec='deflate')
    try:
        for file_path in batch:
            raw = read_binary(file_path)
            content = base64.b64encode(raw)
            data = {}
            data['doc_uuid'] = str(uuid.uuid4())
            data['file_path'] = file_path
            data['content'] = content
            df_writer.append(data)
    finally:
        # Close the writer even when reading or appending a file fails.
        df_writer.close()

    time.sleep(1)
    hdfs_put(output_path + output_filename, hdfs_path)
def main():
    """Convert "left,right" lines read from stdin into an Avro data file.

    Usage: <script> <Schema file> <Data_file>

    Each input line must contain exactly one comma; the two parts become
    the 'left' and 'right' fields of one record.
    """
    # Check the number of arguments
    if len(sys.argv) != 3:
        sys.exit('Usage %s <Schema file> <Data_file>' % (sys.argv[0]))

    # Read the schema from the avsc file
    with open(sys.argv[1], "r") as schema_file:
        schema_string = schema_file.read()

    # Parse the schema before touching the output, so that a bad schema
    # does not leave a truncated data file behind.
    schema_object = schema.parse(schema_string)

    # Open the avro file
    avro_file = open(sys.argv[2], "wb")

    # Create the DatumWriter and the DataFileWriter
    datum_writer = io.DatumWriter()
    data_file_writer = datafile.DataFileWriter(avro_file, datum_writer, schema_object)

    try:
        # Fill records from the input
        for line in sys.stdin:
            # Strip only the line terminator: a final line without a
            # trailing newline must not lose its last character.
            (left, right) = line.rstrip("\n").split(",")
            data_file_writer.append({'left': left, "right": right})
    finally:
        # Close the DataFileWriter
        data_file_writer.close()
def test_view_avro():
    """Check that the file browser view decodes Avro files.

    A one-record, deflate-compressed Avro file and a plain text file are
    written under /test-avro-filebrowser on the shared mini cluster, then
    fetched through the /filebrowser/view/ URL with and without extra query
    parameters.  The test directory is removed and the cluster shut down in
    all cases.
    """
    cluster = mini_cluster.shared_cluster(conf=True)
    try:
        c = make_logged_in_client()
        cluster.fs.setuser(cluster.superuser)
        # Start from an empty test directory.
        if cluster.fs.isdir("/test-avro-filebrowser"):
            cluster.fs.rmtree('/test-avro-filebrowser/')

        cluster.fs.mkdir('/test-avro-filebrowser/')

        test_schema = schema.parse("""
          {
            "name": "test",
            "type": "record",
            "fields": [
              { "name": "name", "type": "string" },
              { "name": "integer", "type": "int" }
            ]
          }
        """)

        # Write a single record into a deflate-compressed Avro file.
        f = cluster.fs.open('/test-avro-filebrowser/test-view.avro', "w")
        data_file_writer = datafile.DataFileWriter(f, io.DatumWriter(),
                                                   writers_schema=test_schema,
                                                   codec='deflate')
        dummy_datum = {
            'name': 'Test',
            'integer': 10,
        }
        data_file_writer.append(dummy_datum)
        data_file_writer.close()

        # autodetect
        response = c.get('/filebrowser/view/test-avro-filebrowser/test-view.avro')
        # (Note: we use eval here cause of an incompatibility issue between
        # the representation string of JSON dicts in simplejson vs. json)
        # NOTE(review): eval() executes whatever the view returned; that is
        # only tolerable because the content was written by this test.
        assert_equal(eval(response.context['view']['contents']), dummy_datum)

        # offsetting should work as well
        response = c.get('/filebrowser/view/test-avro-filebrowser/test-view.avro?offset=1')
        assert_true(response.context.has_key('view'))

        # A file that is not Avro, despite its extension.
        f = cluster.fs.open('/test-avro-filebrowser/test-view2.avro', "w")
        f.write("hello")
        f.close()

        # we shouldn't autodetect non avro files
        response = c.get('/filebrowser/view/test-avro-filebrowser/test-view2.avro')
        assert_equal(response.context['view']['contents'], "hello")

        # we should fail to do a bad thing if they specify compression when it's not set.
        response = c.get('/filebrowser/view/test-avro-filebrowser/test-view2.avro?compression=gzip')
        assert_false(response.context.has_key('view'))

    finally:
        try:
            cluster.fs.rmtree('/test-avro-filebrowser/')
        except:
            pass      # Don't let cleanup errors mask earlier failures
        cluster.shutdown()
def test_context_manager(self):
    """DataFileWriter and DataFileReader close their file when used in ``with``.

    One sample datum is written and read back through context managers;
    after each ``with`` block the underlying file object must be closed.
    """
    # Context manager was introduced as a first class
    # member only in Python 2.6 and above.
    import sys
    if sys.version_info < (2, 6):
        print("Skipping context manager tests on this Python version.")
        return

    # Test the writer with a 'with' statement.
    out_file = open(FILENAME, "wb")
    sample_schema, sample_datum = SCHEMAS_TO_VALIDATE[1]
    parsed_schema = schema.parse(sample_schema) if False else None
    datum_writer = io.DatumWriter()
    parsed_schema = schema.parse(sample_schema)
    with datafile.DataFileWriter(out_file, datum_writer, parsed_schema) as dfw:
        dfw.append(sample_datum)
    self.assertTrue(out_file.closed)

    # Test the reader with a 'with' statement.
    datums = []
    in_file = open(FILENAME, "rb")
    datum_reader = io.DatumReader()
    with datafile.DataFileReader(in_file, datum_reader) as dfr:
        for record in dfr:
            datums.append(record)
    self.assertTrue(in_file.closed)
def __init__(self, reader, datum_reader):
    """Initializes a new data file reader.

    Reads the file header, validates the codec recorded in the metadata
    and installs the file's schema as the datum reader's writer schema.

    Args:
      reader: Open file to read from.
      datum_reader: Avro datum reader.

    Raises:
      DataFileException: if the file names a codec not in VALID_CODECS.
    """
    self._reader = reader
    self._raw_decoder = avro_io.BinaryDecoder(reader)
    self._datum_decoder = None  # Maybe reset at every block.
    self._datum_reader = datum_reader

    # read the header: magic, meta, sync
    self._read_header()

    # ensure codec is valid
    # The metadata entry is optional: test for None *before* decoding,
    # otherwise a file without 'avro.codec' fails with AttributeError
    # instead of falling back to the "null" codec.
    raw_codec = self.GetMeta('avro.codec')
    if raw_codec is None:
        self.codec = "null"
    else:
        self.codec = raw_codec.decode('utf-8')
    if self.codec not in VALID_CODECS:
        raise DataFileException('Unknown codec: %s.' % self.codec)

    self._file_length = self._GetInputFileLength()

    # get ready to read
    self._block_count = 0
    self.datum_reader.writer_schema = (
        schema.parse(self.GetMeta(SCHEMA_KEY).decode('utf-8')))
def init_avro(output_path, part_id, schema_path):
    """Open the main and the temporary Avro writers for one output part.

    Args:
      output_path: base output directory; a sibling ``<output_path>tmp``
        directory is used for the temporary writer.  Directories are only
        initialised when this is a ``str``.
      part_id: identifier used in the ``part-<part_id>.avro`` file names.
      schema_path: path of the Avro schema (JSON) file.

    Returns:
      Tuple ``(avro_writer, avro_writertmp)`` of DataFileWriter objects that
      share one DatumWriter and schema.
    """
    print("************* init_avro ***************")
    output_dir = None
    output_dirtmp = None  # Handle Avro Write Error
    if(type(output_path) is str):
        output_dir = init_directory(output_path)
        output_dirtmp = init_directory(output_path + 'tmp')  # Handle Avro Write Error
    # NOTE(review): when output_path is not a str both directories stay None
    # and the file names below start with "None/" -- confirm callers always
    # pass a str.
    out_filename = '%(output_dir)s/part-%(part_id)s.avro' % \
        {"output_dir": output_dir, "part_id": str(part_id)}
    out_filenametmp = '%(output_dirtmp)s/part-%(part_id)s.avro' % \
        {"output_dirtmp": output_dirtmp, "part_id": str(part_id)}  # Handle Avro Write Error

    with open(schema_path, 'r') as schema_file:
        schemas = schema_file.read()
    email_schema = schema.parse(schemas)
    rec_writer = io.DatumWriter(email_schema)
    avro_writer = datafile.DataFileWriter(
        open(out_filename, 'wb'),
        rec_writer,
        email_schema
    )
    # CREATE A TEMP AvroWriter that can be used to workaround the UnicodeDecodeError when writing into AvroStorage
    avro_writertmp = datafile.DataFileWriter(
        open(out_filenametmp, 'wb'),
        rec_writer,
        email_schema
    )
    # Printed before returning; it used to sit after the return statement
    # and could never run.
    print("*************end init_avro ***************")
    return avro_writer, avro_writertmp
def __init__(self, block_bytes, num_records, codec, schema_string):
    """Store one block's decompressed bytes, record count and parsed schema.

    Args:
      block_bytes: raw block bytes, passed to _decompress_bytes() with codec.
      num_records: number of records, stored as given.
      codec: codec name handed to _decompress_bytes().
      schema_string: schema text, parsed with schema.parse().
    """
    # Decompress data early on (if needed) and thus decrease the number of
    # parallel copies of the data in memory at any given in time during
    # block iteration.
    decompressed = self._decompress_bytes(block_bytes, codec)
    self._decompressed_block_bytes = decompressed
    self._num_records = num_records
    parsed_schema = schema.parse(schema_string)
    self._schema = parsed_schema
def main():
    """Convert JSON lines from stdin into a deflate-compressed Avro file.

    The schema is read from ``tweet.avsc`` in the current directory and the
    output path is taken from ``sys.argv[1]``.  Lines that are not valid
    JSON are skipped silently; lines that do not match the schema are
    echoed and counted.
    """
    if len(sys.argv) < 2:
        print("Usage: cat input.json | python2.7 JSONtoAvro.py output")
        return

    with open("tweet.avsc") as schema_file:
        s = schema.parse(schema_file.read())

    f = open(sys.argv[1], "wb")
    writer = datafile.DataFileWriter(f, io.DatumWriter(), s, codec='deflate')
    failed = 0
    try:
        for line in sys.stdin:
            line = line.strip()
            try:
                data = json.loads(line)
            except ValueError:
                # Not JSON (e.g. a blank line): skip it.
                continue
            try:
                writer.append(data)
            except io.AvroTypeException:
                print(line)
                failed += 1
    finally:
        # Close the writer even if reading stdin or appending fails.
        writer.close()

    print(str(failed) + " failed in schema")
def __init__(self, reader, dreader):
    """Open a data file: check the magic, load the footer metadata, rewind.

    reader must support read/seek/tell; dreader is the datum reader that
    receives the schema stored in the file's metadata.

    Raises schema.AvroException when the file does not start with _MAGIC or
    names a codec other than "null".
    """
    self.__reader = reader
    self.__decoder = Decoder(reader)
    # The file must begin with the magic byte string.
    mag = struct.unpack(len(_MAGIC).__str__()+'s',
                        self.__reader.read(len(_MAGIC)))[0]
    if mag != _MAGIC:
        raise schema.AvroException("Not an avro data file")
    #find the length
    self.__reader.seek(0,2)
    self.__length = self.__reader.tell()
    # The last four bytes hold the footer size, most significant byte first.
    self.__reader.seek(-4, 2)
    footersize = (int(ord(self.__reader.read(1)) << 24) +
                  int(ord(self.__reader.read(1)) << 16) +
                  int(ord(self.__reader.read(1)) << 8) +
                  int(ord(self.__reader.read(1))))
    # Jump to the start of the footer; the metadata map is decoded from there.
    seekpos = self.__reader.seek(self.__length-footersize)
    metalength = self.__decoder.readlong()
    if metalength < 0:
        metalength = -metalength
        self.__decoder.readlong() #ignore byteCount if this is a blocking map
    self.__meta = dict()
    for i in range(0, metalength):
        key = self.__decoder.readutf8()
        self.__meta[key] = self.__decoder.readbytes()
    self.__sync = self.__meta.get("sync")
    self.__count = int(self.__meta.get("count"))
    self.__codec = self.__meta.get("codec")
    # Only an absent codec or "null" is accepted.
    if (self.__codec != None) and (self.__codec != "null"):
        raise schema.AvroException("Unknown codec: " + self.__codec)
    self.__schema = schema.parse(self.__meta.get("schema").encode("utf-8"))
    self.__blockcount = 0
    self.__dreader = dreader
    self.__dreader.setschema(self.__schema)
    # Rewind to just after the magic, ready for reading.
    self.__reader.seek(len(_MAGIC))
def test_unknown_symbol(self):
    """Reading an enum symbol the reader schema lacks must fail resolution.

    'FOO' is written with an enum of FOO/BAR and read with an enum of
    BAR/BAZ; the read is expected to raise SchemaResolutionException.
    """
    print_test_name('TEST UNKNOWN SYMBOL')
    writers_schema = schema.parse("""\
      {"type": "enum", "name": "Test",
       "symbols": ["FOO", "BAR"]}""")
    readers_schema = schema.parse("""\
      {"type": "enum", "name": "Test",
       "symbols": ["BAR", "BAZ"]}""")

    encoded, _encoder, _datum_writer = write_datum('FOO', writers_schema)
    decoder = io.BinaryDecoder(StringIO(encoded.getvalue()))
    resolving_reader = io.DatumReader(writers_schema, readers_schema)

    self.assertRaises(
        io.SchemaResolutionException, resolving_reader.read, decoder)
def _write_lines(self,lines,fname):
    """
    Write the lines to an avro file named fname

    Missing parent directories of fname are created first.  The file is
    written in binary mode with the schema {"type":"string"}.

    Parameters
    --------------------------------------------------------
    lines - list of strings to write
    fname - the name of the file to write to.
    """
    import avro.io as avio
    from avro.datafile import DataFileWriter
    from avro import schema

    # Recursively make all directories.  Works for relative paths too; the
    # previous hand-rolled loop always prefixed os.sep and therefore created
    # the directories at the filesystem root.
    parent = os.path.dirname(fname)
    if parent and not os.path.isdir(parent):
        os.makedirs(parent)

    inschema = """{"type":"string"}"""
    parsed = schema.parse(inschema)
    # Avro container files are binary: open with 'wb'.
    with open(fname, 'wb') as hf:
        writer = DataFileWriter(hf, avio.DatumWriter(parsed), writers_schema=parsed)
        for datum in lines:
            writer.append(datum)
        writer.close()
def test_container(self):
    """Write one Pair record to 'data.avro' and read it back unchanged."""
    schema_object = schema.parse("""\
{ "type": "record", "name": "Pair",
  "doc": "A pair of strings.",
  "fields": [
      {"name": "left", "type": "string"},
      {"name": "right", "type": "string"}
  ]
}
""")
    writer = open('data.avro', 'wb')
    datum_writer = io.DatumWriter()
    dfw = datafile.DataFileWriter(writer, datum_writer, schema_object)
    datum = {'left':'L', 'right':'R'}
    try:
        dfw.append(datum)
    finally:
        dfw.close()

    reader = open('data.avro', 'rb')
    try:
        datum_reader = io.DatumReader()
        dfr = datafile.DataFileReader(reader, datum_reader)
        # Collect without rebinding ``datum``: the previous read loop reused
        # that name, so the last assertion compared data[0] with itself.
        data = list(dfr)
    finally:
        reader.close()

    self.assertEquals(1, len(data))
    self.assertEquals(datum, data[0])
def check_skip_number(number_type):
    """Count the encodings for which skipping one number lands on the next.

    For every value in BINARY_ENCODINGS the value and a known marker are
    written back to back with the ``number_type`` schema; the first is
    skipped with skip_long() and the second must read back as the marker.

    Returns the number of values for which the marker was read correctly.
    """
    print_test_name('TEST SKIP %s' % number_type.upper())
    matches = 0
    for value_to_skip, _hex_encoding in BINARY_ENCODINGS:
        marker = 6253
        print('Value to Skip: %d' % value_to_skip)

        # write the value to skip and a known value
        writers_schema = schema.parse('"%s"' % number_type.lower())
        buf, encoder, datum_writer = write_datum(value_to_skip, writers_schema)
        datum_writer.write(marker, encoder)

        # skip the value
        decoder = io.BinaryDecoder(StringIO(buf.getvalue()))
        decoder.skip_long()

        # read data from string buffer
        read_value = io.DatumReader(writers_schema).read(decoder)

        print('Read Value: %d' % read_value)
        if read_value == marker:
            matches += 1
        print('')
    return matches
def _write_items(base_name, schema_str, items):
    """Write ``items`` to ``<base_name>.avro`` and return that file name.

    Args:
      base_name: output path without the ``.avro`` extension.
      schema_str: JSON text of the Avro schema for the items.
      items: iterable of datums to append.
    """
    avro_schema = schema.parse(schema_str)
    avro_file = base_name + '.avro'
    # Avro container files are binary, hence "wb".  Leaving the ``with``
    # block closes the writer; the previous ``writer.close`` (no call
    # parentheses) did nothing.
    with DataFileWriter(open(avro_file, "wb"), DatumWriter(), avro_schema) as writer:
        for item in items:
            writer.append(item)
    return avro_file
def test_type_exception(self):
    """Writing a string into an int field must raise AvroTypeException."""
    print_test_name('TEST TYPE EXCEPTION')
    record_schema = schema.parse("""\
      {"type": "record", "name": "Test",
       "fields": [{"name": "F", "type": "int"},
                  {"name": "E", "type": "int"}]}""")
    # Field F is declared int but carries a string.
    bad_record = {'E': 5, 'F': 'Bad'}
    self.assertRaises(
        io.AvroTypeException, write_datum, bad_record, record_schema)
def test_schema_promotion(self):
    """A number written with a narrower type reads back under a wider one.

    For every pair (writer, reader) where the reader type comes later in
    int < long < float < double, the value 219 must read back unchanged.
    """
    print_test_name('TEST SCHEMA PROMOTION')
    # note that checking writers_schema.type in read_data
    # allows us to handle promotion correctly
    promotable_schemas = ['"int"', '"long"', '"float"', '"double"']
    datum_to_write = 219
    mismatches = 0

    for position, writer_json in enumerate(promotable_schemas):
        writers_schema = schema.parse(writer_json)
        wider_types = promotable_schemas[position + 1:]
        for reader_json in wider_types:
            readers_schema = schema.parse(reader_json)
            encoded, _enc, _dw = write_datum(datum_to_write, writers_schema)
            datum_read = read_datum(encoded, writers_schema, readers_schema)
            print('Writer: %s Reader: %s' % (writers_schema, readers_schema))
            print('Datum Read: %s' % datum_read)
            if datum_read != datum_to_write:
                mismatches += 1

    self.assertEquals(mismatches, 0)
def test_null(self):
    """check_schema accepts None for the "null" schema and rejects 1."""
    null_schema = schema.parse('"null"')

    # None conforms to the null schema: must not raise.
    avro_io.check_schema(datum=None, schema=null_schema)

    # Any other datum must be rejected with an AvroException.
    try:
        avro_io.check_schema(datum=1, schema=null_schema)
    except schema.AvroException:
        pass
    else:
        self.fail("Should have failed")
def testContextManager(self):
    """DataFileWriter and DataFileReader close their file when used in ``with``.

    One sample datum is written to a temporary file and read back; after
    each inner ``with`` block the wrapped file object must be closed.
    """
    file_path = self.NewTempFile()
    sample_schema, sample_datum = SCHEMAS_TO_VALIDATE[1]

    # Test the writer with a 'with' statement.
    with open(file_path, 'wb') as out_file:
        datum_writer = io.DatumWriter()
        parsed_schema = schema.parse(sample_schema)
        with datafile.DataFileWriter(out_file, datum_writer, parsed_schema) as dfw:
            dfw.append(sample_datum)
        self.assertTrue(out_file.closed)

    # Test the reader with a 'with' statement.
    datums = []
    with open(file_path, 'rb') as in_file:
        datum_reader = io.DatumReader()
        with datafile.DataFileReader(in_file, datum_reader) as dfr:
            for record in dfr:
                datums.append(record)
        self.assertTrue(in_file.closed)
def test_round_trip(self):
    """Every (schema, datum) pair must read back equal to what was written.

    Decimal results are compared through their string forms and datetime
    results against the original datum converted to UTC.
    """
    print_test_name('TEST ROUND TRIP')
    matches = 0
    for example_schema, datum in SCHEMAS_TO_VALIDATE:
        print('Schema: %s' % example_schema)
        print('Datum: %s' % datum)

        writers_schema = schema.parse(example_schema)
        encoded, _encoder, _datum_writer = write_datum(datum, writers_schema)
        round_trip_datum = read_datum(encoded, writers_schema)
        print('Round Trip Datum: %s' % round_trip_datum)

        # Normalise both sides before comparing.
        if isinstance(round_trip_datum, Decimal):
            round_trip_datum = round_trip_datum.to_eng_string()
            datum = str(datum)
        elif isinstance(round_trip_datum, datetime.datetime):
            datum = datum.astimezone(tz=timezones.utc)

        if datum == round_trip_datum:
            matches += 1

    self.assertEquals(matches, len(SCHEMAS_TO_VALIDATE))
def init_avro(self, output_path, part_id, schema_path):
    """Open the main and the temporary Avro writers for one output part.

    Sets ``self.schema`` (schema text), ``self.avro_writer`` and
    ``self.avro_writertmp``; both writers share one DatumWriter and schema.

    Args:
      output_path: base output directory; a sibling ``<output_path>tmp``
        directory is used for the temporary writer.  Directories are only
        initialised when this is a ``str``.
      part_id: identifier used in the ``part-<part_id>.avro`` file names.
      schema_path: path of the Avro schema (JSON) file.
    """
    output_dir = None
    output_dirtmp = None  # Handle Avro Write Error
    if type(output_path) is str:
        output_dir = self.init_directory(output_path)
        output_dirtmp = self.init_directory(
            output_path + 'tmp')  # Handle Avro Write Error
    # NOTE(review): when output_path is not a str both directories stay None
    # and the file names below start with "None/" -- confirm callers always
    # pass a str.
    out_filename = '%(output_dir)s/part-%(part_id)s.avro' % \
        {"output_dir": output_dir, "part_id": str(part_id)}
    out_filenametmp = '%(output_dirtmp)s/part-%(part_id)s.avro' % \
        {"output_dirtmp": output_dirtmp, "part_id": str(part_id)}  # Handle Avro Write ERROR

    # Read the schema text and close the file right away.
    with open(schema_path, 'r') as schema_file:
        self.schema = schema_file.read()
    email_schema = schema.parse(self.schema)
    rec_writer = io.DatumWriter(email_schema)
    self.avro_writer = datafile.DataFileWriter(open(out_filename, 'wb'),
                                               rec_writer, email_schema)
    # CREATE A TEMP AvroWriter that can be used to workaround the UnicodeDecodeError
    # when writing into AvroStorage
    self.avro_writertmp = datafile.DataFileWriter(
        open(out_filenametmp, 'wb'), rec_writer, email_schema)
def test_default_value(self):
    """A reader-only field takes its declared default for every example.

    LONG_RECORD_DATUM is written with LONG_RECORD_SCHEMA and read with a
    record schema whose single field "H" has the example's type and
    default; the result must be ``{'H': <default datum>}``.
    """
    print_test_name('TEST DEFAULT VALUE')
    matches = 0
    for field_type, default_json, default_datum in DEFAULT_VALUE_EXAMPLES:
        readers_schema = schema.parse("""\
        {"type": "record", "name": "Test",
         "fields": [{"name": "H", "type": %s, "default": %s}]}
        """ % (field_type, default_json))
        expected = {'H': default_datum}

        encoded, _encoder, _datum_writer = write_datum(
            LONG_RECORD_DATUM, LONG_RECORD_SCHEMA)
        datum_read = read_datum(encoded, LONG_RECORD_SCHEMA, readers_schema)
        print('Datum Read: %s' % datum_read)
        if expected == datum_read:
            matches += 1

    self.assertEqual(matches, len(DEFAULT_VALUE_EXAMPLES))
def __init__(self, scheme=None, outputClient=None):
    """
    Parameters
    ---------------------------------------------
    scheme - The scheme for the datums to output - can be a json string
           - or an instance of Schema
    outputClient - The output client used to send messages to the parent

    Raises ValueError when outputClient is None.
    """
    # Accept either a ready Schema or its JSON text.
    if isinstance(scheme, schema.Schema):
        parsed_scheme = scheme
    else:
        parsed_scheme = schema.parse(scheme)

    if outputClient is None:
        raise ValueError("output client can't be none.")

    self.scheme = parsed_scheme
    self.datum_writer = avro.io.DatumWriter(writers_schema=self.scheme)
    self.outputClient = outputClient
def test_round_trip(self):
    """Write each datum ten times with each codec and read it back.

    For every (schema, datum) pair in SCHEMAS_TO_VALIDATE and every codec in
    CODECS_TO_VALIDATE the data read from FILENAME must equal ten copies of
    the datum that was written.
    """
    print('')
    print('TEST ROUND TRIP')
    print('===============')
    print('')
    correct = 0
    for i, (example_schema, datum) in enumerate(SCHEMAS_TO_VALIDATE):
        for codec in CODECS_TO_VALIDATE:
            print('')
            print('SCHEMA NUMBER %d' % (i + 1))
            print('================')
            print('')
            print('Schema: %s' % example_schema)
            print('Datum: %s' % datum)
            print('Codec: %s' % codec)

            # write data in binary to file 10 times
            schema_object = schema.parse(example_schema)
            writer = open(FILENAME, 'wb')
            datum_writer = io.DatumWriter()
            dfw = datafile.DataFileWriter(writer, datum_writer, schema_object, codec=codec)
            try:
                # ``_`` keeps the schema index ``i`` intact for the banner
                # printed for the next codec.
                for _ in range(10):
                    dfw.append(datum)
            finally:
                dfw.close()

            # read data in binary from file
            reader = open(FILENAME, 'rb')
            try:
                datum_reader = io.DatumReader()
                dfr = datafile.DataFileReader(reader, datum_reader)
                # Collected without rebinding ``datum``: the old read loop
                # reused that name, so the comparison below checked the
                # read data against itself.
                round_trip_data = list(dfr)
            finally:
                reader.close()

            print('Round Trip Data: %s' % round_trip_data)
            print('Round Trip Data Length: %d' % len(round_trip_data))
            is_correct = [datum] * 10 == round_trip_data
            if is_correct:
                correct += 1
            print('Correct Round Trip: %s' % is_correct)
            print('')
    os.remove(FILENAME)
    self.assertEquals(correct, len(CODECS_TO_VALIDATE)*len(SCHEMAS_TO_VALIDATE))
def generateMultiFieldsCaseStatements(avro_text, proto_schema_fields, module_name, version):
    """Build (and print) a CASE expression mapping value tags to column names.

    A top-level Avro field counts as a "multi field" when checkRecordSchema()
    yields a record schema for it, it is not named "metadata", and that
    record has a field named "VALUE".  Each such field contributes one
    ``when '<proto field number>' then '<cName>'`` line, where cName comes
    from businessNameToCNameMap.

    Args:
      avro_text: JSON text of the Avro record schema.
      proto_schema_fields: iterable of objects with ``name`` and ``number``.
      module_name: currently unused (only needed by the disabled file output).
      version: currently unused (only needed by the disabled file output).

    Returns:
      The generated statement text, which is also printed.
    """
    field_numbers = {}
    for proto_field in proto_schema_fields:
        field_numbers[proto_field.name] = proto_field.number

    avro_schema = schema.parse(avro_text)
    multi_fields_tags_to_names_map = {}
    for field in avro_schema.fields:
        sc, _parent = checkRecordSchema(field)
        # ``is None`` rather than ``!= None``: a comparison operator would
        # invoke the schema's own equality logic against None.
        if sc is None or field.name == "metadata":
            continue
        for structField in sc.fields:
            if structField.name == "VALUE":
                # NOTE(review): a field missing from proto_schema_fields is
                # keyed by None and rendered as 'None' -- confirm intended.
                fieldNum = field_numbers.get(field.name)
                multi_fields_tags_to_names_map[fieldNum] = field.name

    parts = ["case VALUE_TAG \n"]
    for tag, name in multi_fields_tags_to_names_map.items():
        cName = businessNameToCNameMap.get(name)
        parts.append("when '" + str(tag) + "' then '" + cName + "'\n")
    parts.append("END")
    output = "".join(parts)
    print(output)
    return output
def write_avro_file(args, outsource='args.avro'):
    """Write ``args`` as a single deflate-compressed Avro record.

    The record holds ``size`` (the number of arguments) and one ``argN``
    entry per argument, numbered from 1; tuples are converted with
    tupleToList().  The schema comes from makeSchema(args).

    Args:
      args: sequence of argument values.
      outsource: output file path, or ``sys.stdout`` to write there.
    """
    SCHEMA = schema.parse(makeSchema(args))
    rec_writer = io.DatumWriter(SCHEMA)

    if outsource == sys.stdout:
        sink = sys.stdout
    else:
        sink = open(outsource, 'wb')
    df_writer = datafile.DataFileWriter(sink, rec_writer,
                                        writers_schema=SCHEMA,
                                        codec='deflate')

    data = {'size': len(args)}
    for position, arg in enumerate(args, 1):
        if type(arg) == tuple:
            arg = tupleToList(arg)
        data["arg%s" % (position)] = arg

    df_writer.append(data)
    df_writer.close()
def main(argv):
    """Validate JSON files against an Avro schema and print a summary.

    Args:
      argv: ``[program, schema_file, json_file, ...]``.

    Files are reported in three groups: valid, invalid against the schema,
    and not parseable as JSON.  With fewer than three arguments only the
    usage message is printed.
    """
    if len(argv) < 3:
        print("Give me an avro schema file and a whitespace-separated list of json files to validate against it.")
        return

    valid = set()
    invalid_avro = set()
    invalid_json = set()

    with open(argv[1]) as schema_file:
        avro_schema = parse(schema_file.read())

    for arg in argv[2:]:
        try:
            with open(arg, 'r') as json_file:
                datum = loads(json_file.read())
            if validate(avro_schema, datum):
                valid.add(arg)
            else:
                invalid_avro.add(arg)
        except ValueError:
            invalid_json.add(arg)

    print('Valid files:\n\t' + '\n\t'.join(valid))
    print('Invalid avro:\n\t' + '\n\t'.join(invalid_avro))
    print('Invalid json:\n\t' + '\n\t'.join(invalid_json))
def test_metadata(self):
    """Custom metadata set on the writer is readable from the data file."""
    sample_schema, sample_datum = SCHEMAS_TO_VALIDATE[1]

    # Test the writer with a 'with' statement.
    out_file = open(FILENAME, 'wb')
    datum_writer = io.DatumWriter()
    parsed_schema = schema.parse(sample_schema)
    with datafile.DataFileWriter(out_file, datum_writer, parsed_schema) as dfw:
        dfw.set_meta('test.string', 'foo')
        dfw.set_meta('test.number', '1')
        dfw.append(sample_datum)
    self.assertTrue(out_file.closed)

    # Test the reader with a 'with' statement.
    datums = []
    in_file = open(FILENAME, 'rb')
    datum_reader = io.DatumReader()
    with datafile.DataFileReader(in_file, datum_reader) as dfr:
        self.assertEquals('foo', dfr.get_meta('test.string'))
        self.assertEquals('1', dfr.get_meta('test.number'))
        for record in dfr:
            datums.append(record)
    self.assertTrue(in_file.closed)
def check_topic_key_schema_existence(SCHEMA_REGISTRY_URL, topic):
    """Return the key schema for ``topic``, creating one if the lookup fails.

    The subject ``<topic>-key`` is looked up in the Schema Registry: first
    its list of versions, then the last entry of that list.  Any failure
    along the way falls back to save_new_key_schema_in_SR().

    Args:
      SCHEMA_REGISTRY_URL: base URL of the Schema Registry.
      topic: topic name; the subject queried is ``topic + '-key'``.

    Returns:
      The parsed schema from the registry, or the value returned by
      save_new_key_schema_in_SR() when the lookup fails.
    """
    try:
        # This is the second way of Getting Schema
        subject = topic + '-key'
        headers = {
            'Content-Type': 'application/vnd.schemaregistry.v1+json',
        }

        print("\nINFO: Making the API Call to SR")
        versions_response = requests.get(
            url="{}/subjects/{}/versions".format(SCHEMA_REGISTRY_URL, subject),
            headers=headers,
        )

        latest_version = versions_response.json()[-1]
        schema_response = requests.get(
            url="{}/subjects/{}/versions/{}".format(SCHEMA_REGISTRY_URL, subject, latest_version),
            headers=headers,
        )

        key_schema_response_json = schema_response.json()
        print("\nINFO: Schema Found. Returning with Details")
        return schema.parse(key_schema_response_json["schema"])
    except Exception:
        # Best effort by design: any lookup problem (network, unknown
        # subject, unexpected payload) leads to a freshly prepared schema.
        print("\nWARN: Failed to get any Schema")
        print("\nINFO: Creating new by Calling save_new_key_schema_in_SR()")
        key_schema = save_new_key_schema_in_SR(SCHEMA_REGISTRY_URL, topic)
        # Here we are just preparing the schema and same will be sent with Producer mesg. If you try checking the key_schema in SR, you will fail as there will be NO ENTRIES.
        # Entry will happen when any message is written. ==> {u'message': u'Subject not found.', u'error_code': 40401}
        print("\nINFO: Schema Created. Returning with Details")
        return key_schema
def __init__(self, reader, datum_reader):
    """Initialise a data file reader on an open file.

    Reads the header, validates the codec recorded in the metadata (a
    missing entry means "null") and installs the file's schema as the
    datum reader's writers schema.

    Raises DataFileException if the codec is not in VALID_CODECS.
    """
    self._reader = reader
    self._raw_decoder = io.BinaryDecoder(reader)
    self._datum_decoder = None  # Maybe reset at every block.
    self._datum_reader = datum_reader

    # read the header: magic, meta, sync
    self._read_header()

    # ensure codec is valid
    codec_name = self.get_meta('avro.codec')
    if codec_name is None:
        codec_name = "null"
    self.codec = codec_name
    if codec_name not in VALID_CODECS:
        raise DataFileException('Unknown codec: %s.' % codec_name)

    # get file length
    self._file_length = self.determine_file_length()

    # get ready to read
    self._block_count = 0
    stored_schema_json = self.get_meta(SCHEMA_KEY)
    self.datum_reader.writers_schema = schema.parse(stored_schema_json)
def test_round_trip(self):
    """Every (schema, datum) pair must read back equal to what was written.

    Integer datums of the two timestamp logical types are first converted
    to the datetime obtained by adding them to the Unix epoch (as
    microseconds, or as milliseconds scaled by 1000).
    """
    print_test_name('TEST ROUND TRIP')
    # Microseconds represented by one unit of each timestamp schema.
    micros_per_unit = {
        '{"type": "long", "logicalType": "timestamp-micros"}': 1,
        '{"type": "long", "logicalType": "timestamp-millis"}': 1000,
    }
    matches = 0
    for example_schema, datum in SCHEMAS_TO_VALIDATE:
        print('Schema: %s' % example_schema)
        print('Datum: %s' % datum)
        writers_schema = schema.parse(example_schema)
        encoded, _encoder, _datum_writer = write_datum(datum, writers_schema)
        round_trip_datum = read_datum(encoded, writers_schema)

        scale = micros_per_unit.get(example_schema)
        if scale is not None and isinstance(datum, (int, long)):
            offset = datetime.timedelta(microseconds=datum * scale)
            unix_epoch_datetime = datetime.datetime(1970, 1, 1, 0, 0, 0, 0)
            datum = unix_epoch_datetime + offset

        print('Round Trip Datum: %s' % round_trip_datum)
        self.assertEquals(datum, round_trip_datum)
        if datum == round_trip_datum:
            matches += 1
    self.assertEquals(matches, len(SCHEMAS_TO_VALIDATE))
def __init__(self, writer, datum_writer, writers_schema=None, codec='null'):
    """
    Create a data file writer on top of ``writer``.

    With a schema, a new file is started: the codec is validated, a sync
    marker is generated and codec/schema are stored as metadata. Without a
    schema, appending is presumed: sync marker, codec and schema are taken
    from the existing file and the writer is positioned at its end.

    @param writer: File-like object to write into.
    @param datum_writer: Datum writer that receives the writer's schema.
    @param writers_schema: Schema for a new file, or None to append.
    @param codec: Codec name; must be in VALID_CODECS when a schema is given.
    """
    self._writer = writer
    self._encoder = io.BinaryEncoder(writer)
    self._datum_writer = datum_writer
    self._buffer_writer = StringIO()
    self._buffer_encoder = io.BinaryEncoder(self._buffer_writer)
    self._block_count = 0
    self._meta = {}
    self._header_written = False

    if writers_schema is None:
        # Appending: open the same object for reading to collect metadata.
        existing = DataFileReader(writer, io.DatumReader())

        # TODO(hammer): collect arbitrary metadata
        self._sync_marker = existing.sync_marker
        self.set_meta('avro.codec', existing.get_meta('avro.codec'))

        # Reuse the schema the existing file was written with.
        stored_schema = existing.get_meta('avro.schema')
        self.set_meta('avro.schema', stored_schema)
        self.datum_writer.writers_schema = schema.parse(stored_schema)

        # Position at end of file (whence=2); the header is already on disk.
        writer.seek(0, 2)
        self._header_written = True
        return

    # New file: validate the codec before recording any metadata.
    if codec not in VALID_CODECS:
        raise DataFileException("Unknown codec: %r" % codec)
    self._sync_marker = DataFileWriter.generate_sync_marker()
    self.set_meta('avro.codec', codec)
    self.set_meta('avro.schema', str(writers_schema))
    self.datum_writer.writers_schema = writers_schema
def testUnionSchemaSpecificity(self):
    """Round-trip a record whose field is a union of two record types.

    The datum matches record "A"; the value read back must equal the
    value written.
    """
    union_schema = schema.parse("""
        [{
            "type" : "record",
            "name" : "A",
            "fields" : [{"name" : "foo", "type" : ["string", "null"]}]
        },
        {
            "type" : "record",
            "name" : "B",
            "fields" : [{"name" : "bar", "type" : ["string", "null"]}]
        },
        {
            "type" : "record",
            "name" : "AOrB",
            "fields" : [{"name" : "entity", "type" : ["A", "B"]}]
        }]
    """)
    # Index the union's member schemas by name and pick the wrapper record.
    schemas_by_name = {}
    for member in union_schema.schemas:
        schemas_by_name[member.name] = member
    sch = schemas_by_name.get('AOrB')

    datum_to_read = {'entity': {'foo': 'this is an instance of schema A'}}
    writer, encoder, datum_writer = write_datum(datum_to_read, sch)
    datum_read = read_datum(writer, sch, sch)
    self.assertEqual(datum_to_read, datum_read)
def test_round_trip(self):
    """Write and re-read every (schema, datum) pair, counting exact matches.

    A pair matches when the values compare equal and, if the written datum
    is a bool, the value read back is a bool as well.
    """
    print_test_name('TEST ROUND TRIP')

    def are_equal(datum, round_trip_datum):
        # Equality alone is not enough for booleans: the result must also
        # come back with type bool.
        if datum != round_trip_datum:
            return False
        return type(datum) != bool or type(round_trip_datum) == bool

    matched = 0
    for example_schema, datum in SCHEMAS_TO_VALIDATE:
        print('Schema: %s' % example_schema)
        print('Datum: %s' % datum)

        writers_schema = schema.parse(example_schema)
        writer, encoder, datum_writer = write_datum(datum, writers_schema)
        round_trip_datum = read_datum(writer, writers_schema)

        print('Round Trip Datum: %s' % round_trip_datum)
        if are_equal(datum, round_trip_datum):
            matched += 1
    self.assertEquals(matched, len(SCHEMAS_TO_VALIDATE))
def init_avro(output_path, part_id, schema_path):
    """Create the main and the temporary Avro writers for one output part.

    Both writers use the schema read from ``schema_path`` and write to
    ``part-<part_id>.avro`` inside the directories returned by
    ``init_directory(output_path)`` and ``init_directory(output_path + 'tmp')``.

    :param output_path: base output path; the directories are only
        initialised when this is a ``str``.
    :param part_id: part identifier used in the file name.
    :param schema_path: path of the Avro schema file.
    :return: tuple ``(avro_writer, avro_writertmp)``.
    """
    print("************* init_avro ***************")
    output_dir = None
    output_dirtmp = None  # Handle Avro Write Error
    if isinstance(output_path, str):
        output_dir = init_directory(output_path)
        output_dirtmp = init_directory(output_path + 'tmp')  # Handle Avro Write Error
    # NOTE(review): for a non-str output_path both directories stay None and
    # the file names below start with "None/" - confirm callers never do this.
    out_filename = '%(output_dir)s/part-%(part_id)s.avro' % \
        {"output_dir": output_dir, "part_id": str(part_id)}
    out_filenametmp = '%(output_dirtmp)s/part-%(part_id)s.avro' % \
        {"output_dirtmp": output_dirtmp, "part_id": str(part_id)}  # Handle Avro Write Error

    # Read the schema text and close the schema file right away.
    with open(schema_path, 'r') as schema_file:
        schemas = schema_file.read()
    email_schema = schema.parse(schemas)
    rec_writer = io.DatumWriter(email_schema)

    # The output file objects are handed to the writers and stay open for the caller.
    avro_writer = datafile.DataFileWriter(open(out_filename, 'wb'), rec_writer, email_schema)
    # CREATE A TEMP AvroWriter that can be used to workaround the
    # UnicodeDecodeError when writing into AvroStorage
    avro_writertmp = datafile.DataFileWriter(open(out_filenametmp, 'wb'), rec_writer, email_schema)

    # Previously placed after the return statement and therefore never printed.
    print("*************end init_avro ***************")
    return avro_writer, avro_writertmp
def testMetadata(self):
    """Check that metadata set on the writer is visible to the reader.

    Writer and reader are both used through 'with' statements, and the
    underlying file objects are checked to be closed afterwards.
    """
    file_path = self.NewTempFile()

    # Writing side: store two metadata entries and a single datum.
    with open(file_path, 'wb') as writer:
        datum_writer = io.DatumWriter()
        sample_schema, sample_datum = SCHEMAS_TO_VALIDATE[1]
        schema_object = schema.parse(sample_schema)
        with datafile.DataFileWriter(writer, datum_writer, schema_object) as dfw:
            for meta_key, meta_value in (('test.string', 'foo'), ('test.number', '1')):
                dfw.SetMeta(meta_key, meta_value)
            dfw.append(sample_datum)
        self.assertTrue(writer.closed)

    # Reading side: metadata values come back as bytes.
    datums = []
    with open(file_path, 'rb') as reader:
        datum_reader = io.DatumReader()
        with datafile.DataFileReader(reader, datum_reader) as dfr:
            self.assertEqual(b'foo', dfr.GetMeta('test.string'))
            self.assertEqual(b'1', dfr.GetMeta('test.number'))
            datums.extend(dfr)
        self.assertTrue(reader.closed)
def generate_avro_file(cls, schema_str: str, out_file, num_rows: int) -> str:
    """Creates an avro file and saves to tmp folder to be used by test cases

    The writer and the underlying file are managed by ``with`` so both are
    closed even when appending a row fails.

    :param schema_str: valid avro schema as a string
    :param out_file: name of file to be created
    :param num_rows: number of rows to be generated
    :return: string with path to the file created
    """
    filename = os.path.join(TMP_FOLDER, out_file + "." + cls.filetype)
    parsed_schema = schema.parse(schema_str)
    rec_writer = io.DatumWriter(parsed_schema)
    with open(filename, "wb") as out, \
            datafile.DataFileWriter(out, rec_writer, parsed_schema) as file_writer:
        for _ in range(num_rows):
            # Values are generated in the same order as the keys are listed.
            file_writer.append({
                "name": "".join(random.choice(string.ascii_letters) for _ in range(10)),
                "age": randrange(-100, 100),
                "address": random.uniform(1.1, 100.10),
                "street": random.uniform(1.1, 100.10),
                "valid": random.choice([True, False]),
            })
    return filename
def test_context_manager(self):
    """Use the data file writer and reader as context managers.

    Each 'with' block must leave the underlying file object closed.
    """
    # Context manager was introduced as a first class
    # member only in Python 2.6 and above.
    import sys
    if sys.version_info < (2,6):
        print('Skipping context manager tests on this Python version.')
        return

    # Writer side.
    writer = open(FILENAME, 'wb')
    datum_writer = io.DatumWriter()
    sample_schema, sample_datum = SCHEMAS_TO_VALIDATE[1]
    schema_object = schema.parse(sample_schema)
    with datafile.DataFileWriter(writer, datum_writer, schema_object) as dfw:
        dfw.append(sample_datum)
    self.assertTrue(writer.closed)

    # Reader side: drain every datum from the file.
    datums = []
    reader = open(FILENAME, 'rb')
    datum_reader = io.DatumReader()
    with datafile.DataFileReader(reader, datum_reader) as dfr:
        datums.extend(dfr)
    self.assertTrue(reader.closed)
from avro import schema, datafile, io import pprint OUTFILE_NAME = 'output/product.avro' INPUT_SCHEMA_NAME = 'product.avsc' fo = open(INPUT_SCHEMA_NAME, "r+") SCHEMA_STR = fo.read() print "Read String is : ", SCHEMA_STR fo.close() SCHEMA = schema.parse(SCHEMA_STR) rec_writer = io.DatumWriter(SCHEMA) df_writer = datafile.DataFileWriter(open(OUTFILE_NAME, 'wb'), rec_writer, writers_schema=SCHEMA) df_writer.append({ "product_id": 1000, "product_name": "Hugo Boss XY", "product_description": "Hugo Xy Men 100 ml", "product_status": "AVAILABLE", "product_category": ["fragrance", "perfume"], "price": 10.35, "product_hash": "XY123" }) df_writer.append({ "product_id": 1001,
def test_view_avro():
    """Check the filebrowser view on an Avro file and on a plain text file.

    An Avro file holding one record is written to the shared cluster and
    fetched through the view URL with and without an offset. A non-Avro file
    is then fetched without a compression argument and with
    ``compression=gzip``. The test directory is removed at the end on a
    best-effort basis.
    """
    cluster = pseudo_hdfs4.shared_cluster()
    try:
        c = make_logged_in_client()
        cluster.fs.setuser(cluster.superuser)
        if cluster.fs.isdir("/test-avro-filebrowser"):
            cluster.fs.rmtree('/test-avro-filebrowser/')

        cluster.fs.mkdir('/test-avro-filebrowser/')

        test_schema = schema.parse("""
            {
                "name": "test",
                "type": "record",
                "fields": [
                    { "name": "name", "type": "string" },
                    { "name": "integer", "type": "int" }
                ]
            }
        """)

        f = cluster.fs.open('/test-avro-filebrowser/test-view.avro', "w")
        data_file_writer = datafile.DataFileWriter(f, io.DatumWriter(),
                                                   writers_schema=test_schema,
                                                   codec='deflate')
        dummy_datum = {
            'name': 'Test',
            'integer': 10,
        }
        data_file_writer.append(dummy_datum)
        data_file_writer.close()

        # autodetect
        response = c.get('/filebrowser/view/test-avro-filebrowser/test-view.avro')
        # (Note: we use eval here cause of an incompatibility issue between
        # the representation string of JSON dicts in simplejson vs. json)
        assert_equal(eval(response.context['view']['contents']), dummy_datum)

        # offsetting should work as well
        response = c.get('/filebrowser/view/test-avro-filebrowser/test-view.avro?offset=1')
        assert_equal('avro', response.context['view']['compression'])

        f = cluster.fs.open('/test-avro-filebrowser/test-view2.avro', "w")
        f.write("hello")
        f.close()

        # we shouldn't autodetect non avro files
        response = c.get('/filebrowser/view/test-avro-filebrowser/test-view2.avro')
        assert_equal(response.context['view']['contents'], "hello")

        # we should fail to do a bad thing if they specify compression when it's not set.
        response = c.get('/filebrowser/view/test-avro-filebrowser/test-view2.avro?compression=gzip')
        assert_true('Failed to decompress' in response.context['message'])
    finally:
        try:
            cluster.fs.rmtree('/test-avro-filebrowser/')
        except Exception:
            # Don't let cleanup errors mask earlier failures. Narrowed from a
            # bare except so KeyboardInterrupt/SystemExit still propagate.
            pass
def test_view_snappy_compressed_avro():
    """Check the filebrowser view on a snappy-compressed Avro file.

    Skipped when snappy is not installed. An Avro file is written, a
    snappy-compressed copy of its bytes is stored next to it, and the view is
    requested with several ``compression`` arguments. Finally the maximum
    snappy decompression size is overridden to 1 to check the size-limit
    message. Overrides and the test directory are cleaned up at the end.
    """
    if not snappy_installed():
        raise SkipTest
    import snappy

    cluster = pseudo_hdfs4.shared_cluster()
    finish = []  # callables that undo configuration overrides made below
    try:
        c = make_logged_in_client()
        cluster.fs.setuser(cluster.superuser)
        if cluster.fs.isdir("/test-snappy-avro-filebrowser"):
            cluster.fs.rmtree('/test-snappy-avro-filebrowser/')

        cluster.fs.mkdir('/test-snappy-avro-filebrowser/')

        test_schema = schema.parse("""
            {
                "name": "test",
                "type": "record",
                "fields": [
                    { "name": "name", "type": "string" },
                    { "name": "integer", "type": "int" }
                ]
            }
        """)

        # Cannot use StringIO with datafile writer!
        f = cluster.fs.open('/test-snappy-avro-filebrowser/test-view.avro', "w")
        data_file_writer = datafile.DataFileWriter(f, io.DatumWriter(),
                                                   writers_schema=test_schema,
                                                   codec='deflate')
        dummy_datum = {
            'name': 'Test',
            'integer': 10,
        }
        data_file_writer.append(dummy_datum)
        data_file_writer.close()

        # Store a snappy-compressed copy of the file just written.
        fh = cluster.fs.open('/test-snappy-avro-filebrowser/test-view.avro', 'r')
        f = cluster.fs.open('/test-snappy-avro-filebrowser/test-view.compressed.avro', "w")
        f.write(snappy.compress(fh.read()))
        f.close()
        fh.close()

        # Snappy compressed fail
        response = c.get('/filebrowser/view/test-snappy-avro-filebrowser/test-view.avro?compression=snappy_avro')
        assert_true('Failed to decompress' in response.context['message'], response)

        # Snappy compressed succeed
        response = c.get('/filebrowser/view/test-snappy-avro-filebrowser/test-view.compressed.avro')
        assert_equal('snappy_avro', response.context['view']['compression'])
        assert_equal(eval(response.context['view']['contents']), dummy_datum, response)
        response = c.get('/filebrowser/view/test-snappy-avro-filebrowser/test-view.compressed.avro?compression=snappy_avro')
        assert_equal('snappy_avro', response.context['view']['compression'])
        assert_equal(eval(response.context['view']['contents']), dummy_datum, response)

        # Avro should also decompress snappy
        response = c.get('/filebrowser/view/test-snappy-avro-filebrowser/test-view.compressed.avro?compression=avro')
        assert_equal('snappy_avro', response.context['view']['compression'])
        assert_equal(eval(response.context['view']['contents']), dummy_datum, response)

        # Largest snappy compressed file
        finish.append(MAX_SNAPPY_DECOMPRESSION_SIZE.set_for_testing(1))
        response = c.get('/filebrowser/view/test-snappy-avro-filebrowser/test-view.avro?compression=snappy_avro')
        assert_true('File size is greater than allowed max snappy decompression size of 1' in response.context['message'], response)
    finally:
        for done in finish:
            done()
        try:
            cluster.fs.rmtree('/test-snappy-avro-filebrowser/')
        except Exception:
            # Don't let cleanup errors mask earlier failures. Narrowed from a
            # bare except so KeyboardInterrupt/SystemExit still propagate.
            pass
def test_correct_recursive_extraction(self):
    """Stringify a nested record that refers back to its parent and reparse it."""
    outer = schema.parse('{"type": "record", "name": "X", "fields": [{"name": "y", "type": {"type": "record", "name": "Y", "fields": [{"name": "Z", "type": "X"}]}}]}')
    inner_as_text = str(outer.fields[0].type)
    reparsed = schema.parse(inner_as_text)
    # Reaching this point means the subschema was stringified well enough
    # that it could be reparsed.
    self.assertEqual("X", reparsed.fields[0].type.name)
def _load_schema(path):
    """Read the file at ``path`` and return ``_schema.parse`` of its contents.

    The file is closed as soon as it has been read.
    """
    with open(path) as schema_file:
        return _schema.parse(schema_file.read())
def test_allowed_operations():
    """Parse the base and the "good" MyRecord schemas and run the validator on the pair."""
    # Each schema file is closed as soon as it has been read.
    with open("%s/MyRecord.base.avsc" % BASE_DIR) as base_file:
        fst = schema.parse(base_file.read())
    with open("%s/MyRecord.good.avsc" % BASE_DIR) as good_file:
        sec = schema.parse(good_file.read())
    validator.check([fst, sec])
('"float"', '1.1', 1.1), ('"double"', '1.1', 1.1), ('{"type": "fixed", "name": "F", "size": 2}', '"\u00FF\u00FF"', u'\xff\xff'), ('{"type": "enum", "name": "F", "symbols": ["FOO", "BAR"]}', '"FOO"', 'FOO'), ('{"type": "array", "items": "int"}', '[1, 2, 3]', [1, 2, 3]), ('{"type": "map", "values": "int"}', '{"a": 1, "b": 2}', {'a': 1, 'b': 2}), ('["int", "null"]', '5', 5), ('{"type": "record", "name": "F", "fields": [{"name": "A", "type": "int"}]}', '{"A": 5}', {'A': 5}), ) LONG_RECORD_SCHEMA = schema.parse("""\ {"type": "record", "name": "Test", "fields": [{"name": "A", "type": "int"}, {"name": "B", "type": "int"}, {"name": "C", "type": "int"}, {"name": "D", "type": "int"}, {"name": "E", "type": "int"}, {"name": "F", "type": "int"}, {"name": "G", "type": "int"}]}""") LONG_RECORD_DATUM = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7} def avro_hexlify(reader): """Return the hex value, as a string, of a binary-encoded int or long.""" bytes = [] current_byte = reader.read(1) bytes.append(hexlify(current_byte)) while (ord(current_byte) & 0x80) != 0: current_byte = reader.read(1) bytes.append(hexlify(current_byte))
def __init__(self, inschema=None, midschema=None, outschema=None):
    """
    Parameters
    ---------------------------------------------------------
    inschema  - The schema for the input to the mapper
    midschema - The schema for the output of the mapper
    outschema - The schema for the output of the reducer

    All three are required schema strings; a ValueError is raised if any of
    them is None. Each one is parsed here.

    An example schema for the prototypical word count example would be
    inschema='{"type":"record", "name":"Pair","namespace":"org.apache.avro.mapred","fields":[
              {"name":"key","type":"string"},
              {"name":"value","type":"long","order":"ignore"}]
              }'

    NOTE(review): the previous documentation stated that records are split
    into (key, value) pairs using the fields with "order"=ignore for the key
    and the remaining fields for the value - confirm against the code that
    performs the split.

    The subclass provides these schemas to declare what it expects. Per the
    previous documentation, the configure request also carries the parent
    process's schemas so that mismatches can be detected and possibly
    resolved (see
    https://avro.apache.org/docs/current/spec.html#Schema+Resolution).
    """
    required = (
        (inschema, "inschema can't be None"),
        (midschema, "midschema can't be None"),
        (outschema, "outschema can't be None"),
    )
    for text, message in required:
        if text is None:
            raise ValueError(message)

    # make sure we can parse the schemas
    # Should we call fail if we can't parse the schemas?
    self.inschema = schema.parse(inschema)
    self.midschema = schema.parse(midschema)
    self.outschema = schema.parse(outschema)

    # State that is filled in later; everything starts out unset.
    self.clienTransciever = None

    # Client used to talk to the parent process, in particular to transmit
    # the outputs of the mapper and the reducer.
    self.outputClient = None

    # Collectors for the mapper and the reducer output.
    self.midCollector = None
    self.outCollector = None

    self._partitions = None

    # Cached list of the fields the reducer uses as keys. They are needed to
    # decide when all values of a given key have been processed; caching
    # them is more efficient.
    self._red_fkeys = None

    # Previous record fed to the reducer, kept so that the start of a new
    # group can be detected.
    self.midRecord = None

    # Event signalling that the http server is ready to be shut down.
    self.ready_for_shutdown = threading.Event()

    self.log = logging.getLogger("TetherTask")
def __init__(self, avro_schema_file, avro_data_file):
    """Remember the data file path and parse the schema file.

    :param avro_schema_file: path of the schema file; it is read in binary
        mode and its contents are passed to ``parse``.
    :param avro_data_file: stored unchanged on ``self.avro_data_file``.
    """
    self.avro_data_file = avro_data_file
    # Close the schema file as soon as its contents have been read.
    with open(avro_schema_file, "rb") as schema_file:
        self.schema = parse(schema_file.read())
from avro import schema from avro import io # # Constants # VERSION = 1 MAGIC = 'Obj' + chr(VERSION) MAGIC_SIZE = len(MAGIC) SYNC_SIZE = 16 SYNC_INTERVAL = 1000 * SYNC_SIZE # TODO(hammer): make configurable META_SCHEMA = schema.parse("""\ {"type": "record", "name": "org.apache.avro.file.Header", "fields" : [ {"name": "magic", "type": {"type": "fixed", "name": "magic", "size": %d}}, {"name": "meta", "type": {"type": "map", "values": "bytes"}}, {"name": "sync", "type": {"type": "fixed", "name": "sync", "size": %d}}]} """ % (MAGIC_SIZE, SYNC_SIZE)) VALID_CODECS = ['null', 'deflate'] VALID_ENCODINGS = ['binary'] # not used yet CODEC_KEY = "avro.codec" SCHEMA_KEY = "avro.schema" # # Exceptions # class DataFileException(schema.AvroException):