def serialize(items):
    from avro import schema, io
    import io as io2

    schema_path = "data/files/fb_scheam.avsc"
    schema = schema.Parse(open(schema_path).read())
    writer = io.DatumWriter(schema)
    bytes_writer = io2.BytesIO()
    encoder = io.BinaryEncoder(bytes_writer)
    # There must be a better way of writing this item that isn't so long
    print(get_as_json(items))
    writer.write(get_as_json(items), encoder)
    raw_bytes = bytes_writer.getvalue()
    return raw_bytes
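# A minimal counterpart sketch (not part of the original code): decode the
# raw_bytes produced by serialize() back into a Python object, assuming the
# same "data/files/fb_scheam.avsc" schema file and an avro version that
# provides schema.Parse, io.DatumReader and io.BinaryDecoder.
def deserialize(raw_bytes):
    from avro import schema, io
    import io as io2

    parsed_schema = schema.Parse(open("data/files/fb_scheam.avsc").read())
    reader = io.DatumReader(parsed_schema)
    decoder = io.BinaryDecoder(io2.BytesIO(raw_bytes))
    return reader.read(decoder)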
def test_empty_datafile(self):
    """A reader should not fail to read a file consisting of a single empty block."""
    sample_schema = schema.parse(SCHEMAS_TO_VALIDATE[1][0])
    with datafile.DataFileWriter(open(FILENAME, 'wb'), io.DatumWriter(),
                                 sample_schema) as dfw:
        dfw.flush()
        # Write an empty block.
        dfw.encoder.write_long(0)
        dfw.encoder.write_long(0)
        dfw.writer.write(dfw.sync_marker)

    with datafile.DataFileReader(open(FILENAME, 'rb'), io.DatumReader()) as dfr:
        self.assertEqual([], list(dfr))
def test_write_data(self):
    writer = open('pairs.avro', 'wb')
    datum_writer = io.DatumWriter()
    schema_object = schema.parse(
        open('/Users/tom/workspace/hadoop-book-avro/src/main/java/Pair.avsc').read())
    dfw = datafile.DataFileWriter(writer, datum_writer, schema_object)
    dfw.append({'left': 'a', 'right': '1'})
    dfw.append({'left': 'c', 'right': '2'})
    dfw.append({'left': 'b', 'right': '3'})
    dfw.append({'left': 'b', 'right': '2'})
    dfw.close()
def test_write_data(self):
    writer = open('pairs.avro', 'wb')
    datum_writer = io.DatumWriter()
    schema_object = schema.Parse(
        open('/Users/zzy/Docs/hadoop_book/ch12-avro/src/main/resources/StringPair.avsc').read())
    dfw = datafile.DataFileWriter(writer, datum_writer, schema_object)
    dfw.append({'left': 'a', 'right': '1'})
    dfw.append({'left': 'c', 'right': '2'})
    dfw.append({'left': 'b', 'right': '3'})
    dfw.append({'left': 'b', 'right': '2'})
    dfw.close()
def testRoundTrip(self):
    correct = 0
    for iexample, (writer_schema, datum) in enumerate(SCHEMAS_TO_VALIDATE):
        for codec in CODECS_TO_VALIDATE:
            file_path = self.NewTempFile()

            # Write the datum this many times in the data file:
            nitems = 10

            logging.debug(
                'Performing round-trip with codec %r in file %s for example #%d\n'
                'Writing datum: %r using writer schema:\n%s',
                codec, file_path, iexample, datum, writer_schema)

            logging.debug('Creating data file %r', file_path)
            with open(file_path, 'wb') as writer:
                datum_writer = io.DatumWriter()
                schema_object = schema.parse(writer_schema)
                with datafile.DataFileWriter(
                    writer=writer,
                    datum_writer=datum_writer,
                    writer_schema=schema_object,
                    codec=codec,
                ) as dfw:
                    for _ in range(nitems):
                        dfw.append(datum)

            logging.debug('Reading data from %r', file_path)
            with open(file_path, 'rb') as reader:
                datum_reader = io.DatumReader()
                with datafile.DataFileReader(reader, datum_reader) as dfr:
                    round_trip_data = list(dfr)

            logging.debug(
                'Round-trip data has %d items: %r',
                len(round_trip_data), round_trip_data)

            if ([datum] * nitems) == round_trip_data:
                correct += 1
            else:
                logging.error(
                    'Round-trip data does not match:\n'
                    'Expect: %r\n'
                    'Actual: %r',
                    [datum] * nitems, round_trip_data)

    self.assertEqual(
        correct,
        len(CODECS_TO_VALIDATE) * len(SCHEMAS_TO_VALIDATE))
def init_avro(self, output_path, part_id, schema_path):
    output_dir = None
    if type(output_path) is str:
        output_dir = self.init_directory(output_path)
    out_filename = '%(output_dir)s/part-%(part_id)s.avro' % \
        {"output_dir": output_dir, "part_id": str(part_id)}

    self.schema = open(schema_path, 'r').read()
    email_schema = schema.parse(self.schema)
    rec_writer = io.DatumWriter(email_schema)
    self.avro_writer = datafile.DataFileWriter(
        open(out_filename, 'wb'),
        rec_writer,
        email_schema
    )
def dump_report(datum):
    # have to diddle with some of the values so avro doesn't choke
    uuids = map(convert_uuids, datum.itervalues())
    map(convert_readings, datum.itervalues())

    # then just dump it to a string
    out = StringIO()
    dwriter = io.DatumWriter(writers_schema=REPORT_SCHEMA)
    dwriter.write(datum, io.BinaryEncoder(out))

    for id, p in zip(uuids, datum.itervalues()):
        if id:
            p['uuid'] = id

    return out.getvalue()
def merge_output_records_to_file(records):
    bio = BytesIO()
    schema = avs.Parse(json.dumps(output_schema))
    writer = aio.DatumWriter()
    # Bypass per-datum validation/encoding: each record is written to the
    # block buffer as-is, so it must already be Avro-encoded bytes.
    writer.write = lambda datum, encoder: encoder.write(datum)
    dw = adf.DataFileWriter(bio, writer, schema)
    for r in records:
        dw.append(r)
    dw.flush()
    return bio.getvalue()
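# Usage note (assumed, not from the original module): because writer.write is
# replaced with a pass-through lambda above, every element of `records` must
# already be Avro-encoded bytes matching output_schema. A record could be
# pre-encoded like this, reusing the avs/aio/json/BytesIO/output_schema names
# from the snippet above.
def encode_output_record(record):
    buf = BytesIO()
    encoder = aio.BinaryEncoder(buf)
    datum_writer = aio.DatumWriter(avs.Parse(json.dumps(output_schema)))
    datum_writer.write(record, encoder)
    return buf.getvalue()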
def test_round_trip(self):
    print ''
    print 'TEST ROUND TRIP'
    print '==============='
    print ''
    correct = 0
    for i, (example_schema, datum) in enumerate(SCHEMAS_TO_VALIDATE):
        for codec in CODECS_TO_VALIDATE:
            if codec == 'snappy':
                try:
                    import snappy
                except:
                    print 'Snappy not present. Skipping.'
                    correct += 1
                    continue
            print ''
            print 'SCHEMA NUMBER %d' % (i + 1)
            print '================'
            print ''
            print 'Schema: %s' % example_schema
            print 'Datum: %s' % datum
            print 'Codec: %s' % codec

            # write data in binary to file 10 times
            writer = open(FILENAME, 'wb')
            datum_writer = io.DatumWriter()
            schema_object = schema.parse(example_schema)
            dfw = datafile.DataFileWriter(writer, datum_writer, schema_object,
                                          codec=codec)
            for i in range(10):
                dfw.append(datum)
            dfw.close()

            # read data in binary from file
            reader = open(FILENAME, 'rb')
            datum_reader = io.DatumReader()
            dfr = datafile.DataFileReader(reader, datum_reader)
            round_trip_data = []
            for datum in dfr:
                round_trip_data.append(datum)

            print 'Round Trip Data: %s' % round_trip_data
            print 'Round Trip Data Length: %d' % len(round_trip_data)
            is_correct = [datum] * 10 == round_trip_data
            if is_correct:
                correct += 1
            print 'Correct Round Trip: %s' % is_correct
            print ''
    os.remove(FILENAME)
    self.assertEquals(correct,
                      len(CODECS_TO_VALIDATE) * len(SCHEMAS_TO_VALIDATE))
def test_view_avro(self):
    prefix = self.cluster.fs_prefix + '/test_view_avro'
    self.cluster.fs.mkdir(prefix)

    test_schema = schema.parse("""
      {
        "name": "test",
        "type": "record",
        "fields": [
          { "name": "name", "type": "string" },
          { "name": "integer", "type": "int" }
        ]
      }
    """)

    f = self.cluster.fs.open(prefix + '/test-view.avro', "w")
    data_file_writer = datafile.DataFileWriter(f, io.DatumWriter(),
                                               writers_schema=test_schema,
                                               codec='deflate')
    dummy_datum = {
        'name': 'Test',
        'integer': 10,
    }
    data_file_writer.append(dummy_datum)
    data_file_writer.close()

    # autodetect
    response = self.c.get('/filebrowser/view=%s/test-view.avro' % prefix)
    # (Note: we use eval here cause of an incompatibility issue between
    # the representation string of JSON dicts in simplejson vs. json)
    assert_equal(eval(response.context['view']['contents']), dummy_datum)

    # offsetting should work as well
    response = self.c.get('/filebrowser/view=%s/test-view.avro?offset=1' % prefix)
    assert_equal('avro', response.context['view']['compression'])

    f = self.cluster.fs.open(prefix + '/test-view2.avro', "w")
    f.write("hello")
    f.close()

    # we shouldn't autodetect non avro files
    response = self.c.get('/filebrowser/view=%s/test-view2.avro' % prefix)
    assert_equal(response.context['view']['contents'], "hello")

    # we should fail to do a bad thing if they specify compression when it's not set.
    response = self.c.get('/filebrowser/view=%s/test-view2.avro?compression=gzip' % prefix)
    assert_true('Failed to decompress' in response.context['message'])
def test_view_snappy_compressed_avro(self):
    if not snappy_installed():
        raise SkipTest
    import snappy

    finish = []
    try:
        prefix = self.cluster.fs_prefix + '/test-snappy-avro-filebrowser'
        self.cluster.fs.mkdir(prefix)

        test_schema = schema.parse("""
          {
            "name": "test",
            "type": "record",
            "fields": [
              { "name": "name", "type": "string" },
              { "name": "integer", "type": "int" }
            ]
          }
        """)

        # Cannot use StringIO with datafile writer!
        f = self.cluster.fs.open(prefix + '/test-view.compressed.avro', "w")
        data_file_writer = datafile.DataFileWriter(f, io.DatumWriter(),
                                                   writers_schema=test_schema,
                                                   codec='snappy')
        dummy_datum = {
            'name': 'Test',
            'integer': 10,
        }
        data_file_writer.append(dummy_datum)
        data_file_writer.close()
        f.close()

        # Check to see if snappy is the codec
        f = self.cluster.fs.open(prefix + '/test-view.compressed.avro', "r")
        assert_true('snappy' in f.read())
        f.close()

        # Snappy compressed succeed
        response = self.c.get('/filebrowser/view=%s/test-view.compressed.avro' % prefix)
        assert_equal('avro', response.context['view']['compression'])
        assert_equal(eval(response.context['view']['contents']), dummy_datum, response)
    finally:
        for done in finish:
            done()
def test_round_trip(self):
    print('')
    print('TEST ROUND TRIP')
    print('===============')
    print('')
    correct = 0
    for i, (example_schema, datum) in enumerate(SCHEMAS_TO_VALIDATE):
        for codec in CODECS_TO_VALIDATE:
            print('')
            print('SCHEMA NUMBER %d' % (i + 1))
            print('================')
            print('')
            print('Schema: %s' % example_schema)
            print('Datum: %s' % datum)
            print('Codec: %s' % codec)

            # write data in binary to file 10 times
            writer = open(FILENAME, 'wb')
            datum_writer = io.DatumWriter()
            schema_object = schema.parse(example_schema)
            dfw = datafile.DataFileWriter(writer, datum_writer, schema_object,
                                          codec=codec)
            for i in range(10):
                dfw.append(datum)
            dfw.close()

            # read data in binary from file
            reader = open(FILENAME, 'rb')
            datum_reader = io.DatumReader()
            dfr = datafile.DataFileReader(reader, datum_reader)
            round_trip_data = []
            for datum in dfr:
                round_trip_data.append(datum)

            print('Round Trip Data: %s' % round_trip_data)
            print('Round Trip Data Length: %d' % len(round_trip_data))
            is_correct = [datum] * 10 == round_trip_data
            if is_correct:
                correct += 1
            print('Correct Round Trip: %s' % is_correct)
            print('')
    os.remove(FILENAME)
    self.assertEquals(correct,
                      len(CODECS_TO_VALIDATE) * len(SCHEMAS_TO_VALIDATE))
def test_context_manager(self):
    """Test the writer with a 'with' statement."""
    writer = open(FILENAME, 'wb')
    datum_writer = io.DatumWriter()
    sample_schema, sample_datum = SCHEMAS_TO_VALIDATE[1]
    schema_object = schema.parse(sample_schema)
    with datafile.DataFileWriter(writer, datum_writer, schema_object) as dfw:
        dfw.append(sample_datum)
    self.assertTrue(writer.closed)

    # Test the reader with a 'with' statement.
    datums = []
    reader = open(FILENAME, 'rb')
    datum_reader = io.DatumReader()
    with datafile.DataFileReader(reader, datum_reader) as dfr:
        for datum in dfr:
            datums.append(datum)
    self.assertTrue(reader.closed)
def init_avro(self, output_path, part_id, schema_path):
    output_dir = None
    output_dirtmp = None  # Handle Avro Write Error
    if type(output_path) is str:
        output_dir = self.init_directory(output_path)
        output_dirtmp = self.init_directory(output_path + 'tmp')  # Handle Avro Write Error
    out_filename = '%(output_dir)s/part-%(part_id)s.avro' % \
        {"output_dir": output_dir, "part_id": str(part_id)}
    out_filenametmp = '%(output_dirtmp)s/part-%(part_id)s.avro' % \
        {"output_dirtmp": output_dirtmp, "part_id": str(part_id)}  # Handle Avro Write Error

    self.schema = open(schema_path, 'r').read()
    email_schema = schema.parse(self.schema)
    rec_writer = io.DatumWriter(email_schema)
    self.avro_writer = datafile.DataFileWriter(open(out_filename, 'wb'),
                                               rec_writer,
                                               email_schema)
    # Create a temporary AvroWriter that can be used to work around the
    # UnicodeDecodeError raised when writing into AvroStorage.
    self.avro_writertmp = datafile.DataFileWriter(open(out_filenametmp, 'wb'),
                                                  rec_writer,
                                                  email_schema)
def encode_record_for_topic(self, topic, record, is_key=False):
    """
    Encode a record for a given topic.

    This is expensive as it fetches the latest schema for a given topic.
    """
    if not isinstance(record, dict):
        raise SerializerError("record must be a dictionary")
    subject_suffix = '-key' if is_key else '-value'
    # get the latest schema for the subject
    subject = topic + subject_suffix
    try:
        schema_id, schema, version = self.registry_client.get_latest_schema(subject)
    except ClientError as e:
        message = "Unable to retrieve schema id for subject %s" % subject
        raise SerializerError(message)
    else:
        # cache writer
        self.id_to_writers[schema_id] = io.DatumWriter(schema)
        return self.encode_record_with_schema_id(schema_id, record)
def testContextManager(self):
    file_path = self.NewTempFile()

    # Test the writer with a 'with' statement.
    with open(file_path, 'wb') as writer:
        datum_writer = io.DatumWriter()
        sample_schema, sample_datum = SCHEMAS_TO_VALIDATE[1]
        schema_object = schema.parse(sample_schema)
        with datafile.DataFileWriter(writer, datum_writer, schema_object) as dfw:
            dfw.append(sample_datum)
        self.assertTrue(writer.closed)

    # Test the reader with a 'with' statement.
    datums = []
    with open(file_path, 'rb') as reader:
        datum_reader = io.DatumReader()
        with datafile.DataFileReader(reader, datum_reader) as dfr:
            for datum in dfr:
                datums.append(datum)
        self.assertTrue(reader.closed)
def write_avro_file(args, outsource='args.avro'):
    SCHEMA = schema.parse(makeSchema(args))
    rec_writer = io.DatumWriter(SCHEMA)

    if outsource == sys.stdout:
        df_writer = datafile.DataFileWriter(sys.stdout, rec_writer,
                                            writers_schema=SCHEMA,
                                            codec='deflate')
    else:
        df_writer = datafile.DataFileWriter(open(outsource, 'wb'), rec_writer,
                                            writers_schema=SCHEMA,
                                            codec='deflate')

    data = {}
    count = 1
    data['size'] = len(args)
    for arg in args:
        if type(arg) == tuple:
            arg = tupleToList(arg)
        data["arg%s" % count] = arg
        count += 1

    df_writer.append(data)
    df_writer.close()
def encode_record(schema_id, schema, record):
    # construct avro writer
    writer = io.DatumWriter(schema)
    outf = StringIO.StringIO()

    # write the header:
    # the magic byte
    outf.write(struct.pack('b', MAGIC_BYTE))
    # then the schema ID in network byte order (big endian)
    outf.write(struct.pack('>I', schema_id))

    # write the record to the rest of the buffer
    # create an encoder that we'll write to
    encoder = io.BinaryEncoder(outf)
    # write the object in 'record' as Avro
    writer.write(record, encoder)

    return outf.getvalue()
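# Assumed counterpart (not in the original): decode a payload produced by
# encode_record above. The layout is one magic byte, a 4-byte schema id in
# network byte order, then the Avro-encoded body. Reuses the StringIO, struct
# and io (avro.io) imports the snippet above relies on.
def decode_record(schema, payload):
    inf = StringIO.StringIO(payload)
    magic_byte, = struct.unpack('b', inf.read(1))
    schema_id, = struct.unpack('>I', inf.read(4))
    decoder = io.BinaryDecoder(inf)
    reader = io.DatumReader(schema)
    return magic_byte, schema_id, reader.read(decoder)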
def __init__(self, scheme=None, outputClient=None):
    """
    Parameters
    ---------------------------------------------
    scheme - The schema for the datums to output; can be a JSON string
             or an instance of Schema.
    outputClient - The output client used to send messages to the parent.
    """
    if not isinstance(scheme, schema.Schema):
        scheme = schema.parse(scheme)

    if outputClient is None:
        raise ValueError("output client can't be None.")

    self.scheme = scheme
    self.buff = StringIO()
    self.encoder = avio.BinaryEncoder(self.buff)
    self.datum_writer = avio.DatumWriter(writers_schema=self.scheme)
    self.outputClient = outputClient
def test_metadata(self):
    # Test the writer with a 'with' statement.
    writer = open(FILENAME, 'wb')
    datum_writer = io.DatumWriter()
    sample_schema, sample_datum = SCHEMAS_TO_VALIDATE[1]
    schema_object = schema.parse(sample_schema)
    with datafile.DataFileWriter(writer, datum_writer, schema_object) as dfw:
        dfw.set_meta('test.string', 'foo')
        dfw.set_meta('test.number', '1')
        dfw.append(sample_datum)
    self.assertTrue(writer.closed)

    # Test the reader with a 'with' statement.
    datums = []
    reader = open(FILENAME, 'rb')
    datum_reader = io.DatumReader()
    with datafile.DataFileReader(reader, datum_reader) as dfr:
        self.assertEquals('foo', dfr.get_meta('test.string'))
        self.assertEquals('1', dfr.get_meta('test.number'))
        for datum in dfr:
            datums.append(datum)
    self.assertTrue(reader.closed)
def send_event(exchange):
    """Serialize and publish the event body for the given "exchange"."""
    # Get the Avro schema and serialize the event body into raw_bytes
    event_schema = schema.Parse(open(f"schemas/{exchange}.avsc", "rb").read())
    writer = avro_io.DatumWriter(event_schema)
    bytes_writer = io.BytesIO()
    encoder = avro_io.BinaryEncoder(bytes_writer)
    writer.write(event_bodies[exchange], encoder)
    raw_bytes = bytes_writer.getvalue()

    # create connection, declare exchange
    connection = pika.BlockingConnection(
        pika.ConnectionParameters(host='localhost'))
    channel = connection.channel()
    channel.exchange_declare(exchange=exchange, exchange_type='fanout')

    # publish message, close connection
    channel.basic_publish(exchange=exchange, routing_key='', body=raw_bytes)
    connection.close()
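# A matching consumer-side sketch (assumed, not part of the original): decode
# a message body published by send_event back into a dict, using the same
# schemas/<exchange>.avsc file and the io/avro_io/schema imports above.
def decode_event(exchange, raw_bytes):
    event_schema = schema.Parse(open(f"schemas/{exchange}.avsc").read())
    reader = avro_io.DatumReader(event_schema)
    decoder = avro_io.BinaryDecoder(io.BytesIO(raw_bytes))
    return reader.read(decoder)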
def write_avro_file():
    # Let's generate our data
    data = {}
    data['name'] = 'Foo'
    data['age'] = 19
    data['address'] = '10, Bar Eggs Spam'
    data['value'] = 800

    # Create a 'record' (datum) writer
    rec_writer = io.DatumWriter(SCHEMA)

    # Create a 'data file' (avro file) writer
    df_writer = datafile.DataFileWriter(
        # The file to contain the records
        open(OUTFILE_NAME, 'wb'),
        # The 'record' (datum) writer
        rec_writer,
        # Schema is only needed when writing a new file (not appending):
        # it is stored in the file itself, so the writer can recover it
        # when appending.
        writers_schema=SCHEMA,
        # An optional codec name for compression ('null' for none)
        codec='deflate'
    )

    # Write our data. (You can call append multiple times to write
    # more than one record, of course.)
    df_writer.append(data)

    # Close to ensure writing is complete
    df_writer.close()
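# A companion sketch (assumed, not from the original): read the records back
# from OUTFILE_NAME. DataFileReader recovers the writer's schema and codec
# from the file header, so neither needs to be supplied here.
def read_avro_file():
    rec_reader = io.DatumReader()
    df_reader = datafile.DataFileReader(open(OUTFILE_NAME, 'rb'), rec_reader)
    for record in df_reader:
        print(record)
    df_reader.close()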
def _writer(_schema=self._schema):
    """Records coroutine."""
    writer = None
    try:
        while True:
            obj = (yield)
            if not writer:
                if not _schema:
                    # no schema implies no writer
                    _schema = _get_schema(obj)
                    self._schema = _schema
                datum_writer = avi.DatumWriter(_schema)
                buf = BytesIO()
                writer = avd.DataFileWriter(buf, datum_writer, _schema)
            writer.append(obj)
    except GeneratorExit:
        # we are ready to send the data to HDFS
        if writer:
            writer.flush()  # make sure everything has been written to the buffer
            buf.seek(0)
            self._client.write(hdfs_path, buf, overwrite=overwrite)
    finally:
        if writer:
            writer.close()
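# Illustration of the coroutine protocol the records coroutine above relies on
# (generic sketch with hypothetical names, not from the original module): the
# caller primes the generator, sends records into it, and close() raises
# GeneratorExit inside it, which is what triggers the final flush-and-upload.
def records_sink():
    received = []
    try:
        while True:
            received.append((yield))
    except GeneratorExit:
        print("flushing %d records" % len(received))

sink = records_sink()
next(sink)             # prime the coroutine up to the first yield
sink.send({"a": 1})
sink.send({"a": 2})
sink.close()           # raises GeneratorExit inside the coroutine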
def test_context_manager(self):
    # Context manager was introduced as a first class
    # member only in Python 2.6 and above.
    import sys
    if sys.version_info < (2, 6):
        print 'Skipping context manager tests on this Python version.'
        return

    # Test the writer with a 'with' statement.
    writer = open(FILENAME, 'wb')
    datum_writer = io.DatumWriter()
    sample_schema, sample_datum = SCHEMAS_TO_VALIDATE[1]
    schema_object = schema.parse(sample_schema)
    with datafile.DataFileWriter(writer, datum_writer, schema_object) as dfw:
        dfw.append(sample_datum)
    self.assertTrue(writer.closed)

    # Test the reader with a 'with' statement.
    datums = []
    reader = open(FILENAME, 'rb')
    datum_reader = io.DatumReader()
    with datafile.DataFileReader(reader, datum_reader) as dfr:
        for datum in dfr:
            datums.append(datum)
    self.assertTrue(reader.closed)
def generate_avro_file(cls, schema_str: str, out_file, num_rows: int) -> str:
    """Creates an avro file and saves to tmp folder to be used by test cases

    :param schema_str: valid avro schema as a string
    :param out_file: name of file to be created
    :param num_rows: number of rows to be generated
    :return: string with path to the file created
    """
    filename = os.path.join(TMP_FOLDER, out_file + "." + cls.filetype)
    parsed_schema = schema.parse(schema_str)
    rec_writer = io.DatumWriter(parsed_schema)
    file_writer = datafile.DataFileWriter(open(filename, "wb"), rec_writer, parsed_schema)
    for _ in range(num_rows):
        data = {}
        data["name"] = "".join(random.choice(string.ascii_letters) for i in range(10))
        data["age"] = randrange(-100, 100)
        data["address"] = random.uniform(1.1, 100.10)
        data["street"] = random.uniform(1.1, 100.10)
        data["valid"] = random.choice([True, False])
        file_writer.append(data)
    file_writer.close()
    return filename
def testMetadata(self):
    file_path = self.NewTempFile()

    # Test the writer with a 'with' statement.
    with open(file_path, 'wb') as writer:
        datum_writer = io.DatumWriter()
        sample_schema, sample_datum = SCHEMAS_TO_VALIDATE[1]
        schema_object = schema.parse(sample_schema)
        with datafile.DataFileWriter(writer, datum_writer, schema_object) as dfw:
            dfw.SetMeta('test.string', 'foo')
            dfw.SetMeta('test.number', '1')
            dfw.append(sample_datum)
        self.assertTrue(writer.closed)

    # Test the reader with a 'with' statement.
    datums = []
    with open(file_path, 'rb') as reader:
        datum_reader = io.DatumReader()
        with datafile.DataFileReader(reader, datum_reader) as dfr:
            self.assertEqual(b'foo', dfr.GetMeta('test.string'))
            self.assertEqual(b'1', dfr.GetMeta('test.number'))
            for datum in dfr:
                datums.append(datum)
        self.assertTrue(reader.closed)
def test_view_avro():
    cluster = pseudo_hdfs4.shared_cluster()
    try:
        c = make_logged_in_client()
        cluster.fs.setuser(cluster.superuser)
        if cluster.fs.isdir("/test-avro-filebrowser"):
            cluster.fs.rmtree('/test-avro-filebrowser/')

        cluster.fs.mkdir('/test-avro-filebrowser/')

        test_schema = schema.parse("""
          {
            "name": "test",
            "type": "record",
            "fields": [
              { "name": "name", "type": "string" },
              { "name": "integer", "type": "int" }
            ]
          }
        """)

        f = cluster.fs.open('/test-avro-filebrowser/test-view.avro', "w")
        data_file_writer = datafile.DataFileWriter(f, io.DatumWriter(),
                                                   writers_schema=test_schema,
                                                   codec='deflate')
        dummy_datum = {
            'name': 'Test',
            'integer': 10,
        }
        data_file_writer.append(dummy_datum)
        data_file_writer.close()

        # autodetect
        response = c.get('/filebrowser/view/test-avro-filebrowser/test-view.avro')
        # (Note: we use eval here cause of an incompatibility issue between
        # the representation string of JSON dicts in simplejson vs. json)
        assert_equal(eval(response.context['view']['contents']), dummy_datum)

        # offsetting should work as well
        response = c.get('/filebrowser/view/test-avro-filebrowser/test-view.avro?offset=1')
        assert_equal('avro', response.context['view']['compression'])

        f = cluster.fs.open('/test-avro-filebrowser/test-view2.avro', "w")
        f.write("hello")
        f.close()

        # we shouldn't autodetect non avro files
        response = c.get('/filebrowser/view/test-avro-filebrowser/test-view2.avro')
        assert_equal(response.context['view']['contents'], "hello")

        # we should fail to do a bad thing if they specify compression when it's not set.
        response = c.get('/filebrowser/view/test-avro-filebrowser/test-view2.avro?compression=gzip')
        assert_true('Failed to decompress' in response.context['message'])
    finally:
        try:
            cluster.fs.rmtree('/test-avro-filebrowser/')
        except:
            pass  # Don't let cleanup errors mask earlier failures
def test_view_snappy_compressed_avro():
    if not snappy_installed():
        raise SkipTest
    import snappy

    cluster = pseudo_hdfs4.shared_cluster()
    finish = []
    try:
        c = make_logged_in_client()
        cluster.fs.setuser(cluster.superuser)
        if cluster.fs.isdir("/test-snappy-avro-filebrowser"):
            cluster.fs.rmtree('/test-snappy-avro-filebrowser/')

        cluster.fs.mkdir('/test-snappy-avro-filebrowser/')

        test_schema = schema.parse("""
          {
            "name": "test",
            "type": "record",
            "fields": [
              { "name": "name", "type": "string" },
              { "name": "integer", "type": "int" }
            ]
          }
        """)

        # Cannot use StringIO with datafile writer!
        f = cluster.fs.open('/test-snappy-avro-filebrowser/test-view.avro', "w")
        data_file_writer = datafile.DataFileWriter(f, io.DatumWriter(),
                                                   writers_schema=test_schema,
                                                   codec='deflate')
        dummy_datum = {
            'name': 'Test',
            'integer': 10,
        }
        data_file_writer.append(dummy_datum)
        data_file_writer.close()

        fh = cluster.fs.open('/test-snappy-avro-filebrowser/test-view.avro', 'r')
        f = cluster.fs.open('/test-snappy-avro-filebrowser/test-view.compressed.avro', "w")
        f.write(snappy.compress(fh.read()))
        f.close()
        fh.close()

        # Snappy compressed fail
        response = c.get('/filebrowser/view/test-snappy-avro-filebrowser/test-view.avro?compression=snappy_avro')
        assert_true('Failed to decompress' in response.context['message'], response)

        # Snappy compressed succeed
        response = c.get('/filebrowser/view/test-snappy-avro-filebrowser/test-view.compressed.avro')
        assert_equal('snappy_avro', response.context['view']['compression'])
        assert_equal(eval(response.context['view']['contents']), dummy_datum, response)

        response = c.get('/filebrowser/view/test-snappy-avro-filebrowser/test-view.compressed.avro?compression=snappy_avro')
        assert_equal('snappy_avro', response.context['view']['compression'])
        assert_equal(eval(response.context['view']['contents']), dummy_datum, response)

        # Avro should also decompress snappy
        response = c.get('/filebrowser/view/test-snappy-avro-filebrowser/test-view.compressed.avro?compression=avro')
        assert_equal('snappy_avro', response.context['view']['compression'])
        assert_equal(eval(response.context['view']['contents']), dummy_datum, response)

        # Largest snappy compressed file
        finish.append(MAX_SNAPPY_DECOMPRESSION_SIZE.set_for_testing(1))
        response = c.get('/filebrowser/view/test-snappy-avro-filebrowser/test-view.avro?compression=snappy_avro')
        assert_true('File size is greater than allowed max snappy decompression size of 1' in response.context['message'], response)
    finally:
        for done in finish:
            done()
        try:
            cluster.fs.rmtree('/test-snappy-avro-filebrowser/')
        except:
            pass  # Don't let cleanup errors mask earlier failures
def test1(self):
    from word_count_task import WordCountTask
    from avro.tether import TaskRunner, find_port, HTTPRequestor, inputProtocol, TaskType
    from avro import io as avio
    import mock_tether_parent
    import subprocess
    import StringIO
    import logging

    # set the logging level to debug so that debug messages are printed
    logging.basicConfig(level=logging.DEBUG)

    proc = None
    try:
        # launch the server in a separate process
        env = dict()
        env["PYTHONPATH"] = ':'.join(sys.path)
        parent_port = find_port()

        pyfile = mock_tether_parent.__file__
        proc = subprocess.Popen(
            ["python", pyfile, "start_server", "{0}".format(parent_port)])
        input_port = find_port()

        print "Mock server started process pid={0}".format(proc.pid)

        # Possible race condition? open tries to connect to the subprocess
        # before the subprocess is fully started, so we give the subprocess
        # time to start up.
        time.sleep(1)

        runner = TaskRunner(WordCountTask())
        runner.start(outputport=parent_port, join=False)

        # Test sending various messages to the server and ensuring they are
        # processed correctly
        requestor = HTTPRequestor("localhost",
                                  runner.server.server_address[1],
                                  inputProtocol)

        # TODO: We should validate that open worked by grabbing the STDOUT of
        # the subprocess and ensuring that it outputted the correct message.

        # Test the mapper
        requestor.request(
            "configure", {
                "taskType": TaskType.MAP,
                "inSchema": str(runner.task.inschema),
                "outSchema": str(runner.task.midschema)
            })

        # Serialize some data so we can send it to the input function
        datum = "This is a line of text"
        writer = StringIO.StringIO()
        encoder = avio.BinaryEncoder(writer)
        datum_writer = avio.DatumWriter(runner.task.inschema)
        datum_writer.write(datum, encoder)

        writer.seek(0)
        data = writer.read()

        # Call input to simulate calling map
        requestor.request("input", {"data": data, "count": 1})

        # Test the reducer
        requestor.request(
            "configure", {
                "taskType": TaskType.REDUCE,
                "inSchema": str(runner.task.midschema),
                "outSchema": str(runner.task.outschema)
            })

        # Serialize some data so we can send it to the input function
        datum = {"key": "word", "value": 2}
        writer = StringIO.StringIO()
        encoder = avio.BinaryEncoder(writer)
        datum_writer = avio.DatumWriter(runner.task.midschema)
        datum_writer.write(datum, encoder)

        writer.seek(0)
        data = writer.read()

        # Call input to simulate calling reduce
        requestor.request("input", {"data": data, "count": 1})

        requestor.request("complete", {})

        runner.task.ready_for_shutdown.wait()
        runner.server.shutdown()
        # time.sleep(2)
        # runner.server.shutdown()

        sthread = runner.sthread

        # Possible race condition?
        time.sleep(1)

        # make sure the other thread terminated
        self.assertFalse(sthread.isAlive())

        # shutdown the logging
        logging.shutdown()
    except Exception as e:
        raise
    finally:
        # close the process
        if not (proc is None):
            proc.kill()
def write(self, fp, datum, schema):
    sch = self.names.get_name('edu.berkeley.cs.local.' + schema, None)
    dwriter = io.DatumWriter(writers_schema=sch)
    dwriter.write(datum, io.BinaryEncoder(fp))