def serialize(items):
    import io as stdio
    from avro import io as avro_io, schema

    # Parse the writer schema once; the path comes from the original project.
    schema_path = "data/files/fb_scheam.avsc"
    with open(schema_path) as schema_file:
        parsed_schema = schema.Parse(schema_file.read())

    # Encode the record into an in-memory buffer and return the raw Avro bytes.
    writer = avro_io.DatumWriter(parsed_schema)
    bytes_writer = stdio.BytesIO()
    encoder = avro_io.BinaryEncoder(bytes_writer)
    writer.write(get_as_json(items), encoder)
    return bytes_writer.getvalue()
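For symmetry, a matching deserializer might look like the sketch below. It is not part of the snippet above: it assumes the same schema file and that the payload was produced by serialize().

def deserialize(raw_bytes):
    # Hypothetical counterpart to serialize() above: decode the Avro-encoded
    # payload back into a Python object using the same writer schema.
    import io as stdio
    from avro import io as avro_io, schema

    with open("data/files/fb_scheam.avsc") as schema_file:
        parsed_schema = schema.Parse(schema_file.read())

    decoder = avro_io.BinaryDecoder(stdio.BytesIO(raw_bytes))
    return avro_io.DatumReader(parsed_schema).read(decoder)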
Example #2
  def test_empty_datafile(self):
    """A reader should not fail to read a file consisting of a single empty block."""
    sample_schema = schema.parse(SCHEMAS_TO_VALIDATE[1][0])
    with datafile.DataFileWriter(open(FILENAME, 'wb'), io.DatumWriter(),
        sample_schema) as dfw:
      dfw.flush()
      # Write an empty block
      dfw.encoder.write_long(0)
      dfw.encoder.write_long(0)
      dfw.writer.write(dfw.sync_marker)

    with datafile.DataFileReader(open(FILENAME, 'rb'), io.DatumReader()) as dfr:
      self.assertEqual([], list(dfr))
Example #3
 def test_write_data(self):
     writer = open('pairs.avro', 'wb')
     datum_writer = io.DatumWriter()
     schema_object = schema.parse(
         open(
             '/Users/tom/workspace/hadoop-book-avro/src/main/java/Pair.avsc'
         ).read())
     dfw = datafile.DataFileWriter(writer, datum_writer, schema_object)
     dfw.append({'left': 'a', 'right': '1'})
     dfw.append({'left': 'c', 'right': '2'})
     dfw.append({'left': 'b', 'right': '3'})
     dfw.append({'left': 'b', 'right': '2'})
     dfw.close()
Example #4
 def test_write_data(self):
     writer = open('pairs.avro', 'wb')
     datum_writer = io.DatumWriter()
     schema_object = schema.Parse(
         open(
             '/Users/zzy/Docs/hadoop_book/ch12-avro/src/main/resources/StringPair.avsc'
         ).read())
     dfw = datafile.DataFileWriter(writer, datum_writer, schema_object)
     dfw.append({'left': 'a', 'right': '1'})
     dfw.append({'left': 'c', 'right': '2'})
     dfw.append({'left': 'b', 'right': '3'})
     dfw.append({'left': 'b', 'right': '2'})
     dfw.close()
Example #5
  def testRoundTrip(self):
    correct = 0
    for iexample, (writer_schema, datum) in enumerate(SCHEMAS_TO_VALIDATE):
      for codec in CODECS_TO_VALIDATE:
        file_path = self.NewTempFile()

        # Write the datum this many times in the data file:
        nitems = 10

        logging.debug(
            'Performing round-trip with codec %r in file %s for example #%d\n'
            'Writing datum: %r using writer schema:\n%s',
            codec, file_path, iexample,
            datum, writer_schema)

        logging.debug('Creating data file %r', file_path)
        with open(file_path, 'wb') as writer:
          datum_writer = io.DatumWriter()
          schema_object = schema.parse(writer_schema)
          with datafile.DataFileWriter(
              writer=writer,
              datum_writer=datum_writer,
              writer_schema=schema_object,
              codec=codec,
          ) as dfw:
            for _ in range(nitems):
              dfw.append(datum)

        logging.debug('Reading data from %r', file_path)
        with open(file_path, 'rb') as reader:
          datum_reader = io.DatumReader()
          with datafile.DataFileReader(reader, datum_reader) as dfr:
            round_trip_data = list(dfr)

        logging.debug(
            'Round-trip data has %d items: %r',
            len(round_trip_data), round_trip_data)

        if ([datum] * nitems) == round_trip_data:
          correct += 1
        else:
          logging.error(
              'Round-trip data does not match:\n'
              'Expect: %r\n'
              'Actual: %r',
              [datum] * nitems,
              round_trip_data)

    self.assertEqual(
        correct,
        len(CODECS_TO_VALIDATE) * len(SCHEMAS_TO_VALIDATE))
Example #6
 def init_avro(self, output_path, part_id, schema_path):
   output_dir = None
   if(type(output_path) is str):
     output_dir = self.init_directory(output_path)
   out_filename = '%(output_dir)s/part-%(part_id)s.avro' % \
     {"output_dir": output_dir, "part_id": str(part_id)}
   self.schema = open(schema_path, 'r').read()
   email_schema = schema.parse(self.schema)
   rec_writer = io.DatumWriter(email_schema)
   self.avro_writer = datafile.DataFileWriter(
     open(out_filename, 'wb'),
     rec_writer,
     email_schema
   )
Example #7
def dump_report(datum):
    # have to diddle with some of the values so avro doesn't choke

    uuids = map(convert_uuids, datum.itervalues())
    map(convert_readings, datum.itervalues())

    # then just dump it to a string
    out = StringIO()
    dwriter = io.DatumWriter(writers_schema=REPORT_SCHEMA)
    dwriter.write(datum, io.BinaryEncoder(out))

    for id, p in zip(uuids, datum.itervalues()):
        if id: p['uuid'] = id

    return out.getvalue()
Example #8
def merge_output_records_to_file(records):
    bio = BytesIO()

    schema = avs.Parse(json.dumps(output_schema))

    writer = aio.DatumWriter()
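    # Pass records straight through: overriding DatumWriter's schema-driven
    # encoding like this suggests each record is already Avro-encoded bytes.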
    writer.write = lambda datum, encoder: encoder.write(datum)

    dw = adf.DataFileWriter(bio, writer, schema)

    for r in records:
        dw.append(r)
    dw.flush()

    return bio.getvalue()
Example #9
  def test_round_trip(self):
    print ''
    print 'TEST ROUND TRIP'
    print '==============='
    print ''
    correct = 0
    for i, (example_schema, datum) in enumerate(SCHEMAS_TO_VALIDATE):
      for codec in CODECS_TO_VALIDATE:
        if codec == 'snappy':
          try:
            import snappy
          except ImportError:
            print 'Snappy not present. Skipping.'
            correct += 1
            continue
        print ''
        print 'SCHEMA NUMBER %d' % (i + 1)
        print '================'
        print ''
        print 'Schema: %s' % example_schema
        print 'Datum: %s' % datum
        print 'Codec: %s' % codec

        # write data in binary to file 10 times
        writer = open(FILENAME, 'wb')
        datum_writer = io.DatumWriter()
        schema_object = schema.parse(example_schema)
        dfw = datafile.DataFileWriter(writer, datum_writer, schema_object, codec=codec)
        for _ in range(10):  # don't clobber the enumerate index 'i' used above
          dfw.append(datum)
        dfw.close()

        # read data in binary from file
        reader = open(FILENAME, 'rb')
        datum_reader = io.DatumReader()
        dfr = datafile.DataFileReader(reader, datum_reader)
        round_trip_data = list(dfr)  # avoid rebinding 'datum', which is compared against below

        print 'Round Trip Data: %s' % round_trip_data
        print 'Round Trip Data Length: %d' % len(round_trip_data)
        is_correct = [datum] * 10 == round_trip_data
        if is_correct: correct += 1
        print 'Correct Round Trip: %s' % is_correct
        print ''
    os.remove(FILENAME)
    self.assertEqual(correct, len(CODECS_TO_VALIDATE) * len(SCHEMAS_TO_VALIDATE))
Example #10
  def test_view_avro(self):
    prefix = self.cluster.fs_prefix + '/test_view_avro'
    self.cluster.fs.mkdir(prefix)

    test_schema = schema.parse("""
      {
        "name": "test",
        "type": "record",
        "fields": [
          { "name": "name", "type": "string" },
          { "name": "integer", "type": "int" }
        ]
      }
    """)

    f = self.cluster.fs.open(prefix + '/test-view.avro', "w")
    data_file_writer = datafile.DataFileWriter(f, io.DatumWriter(),
                                                writers_schema=test_schema,
                                                codec='deflate')
    dummy_datum = {
      'name': 'Test',
      'integer': 10,
    }
    data_file_writer.append(dummy_datum)
    data_file_writer.close()

    # autodetect
    response = self.c.get('/filebrowser/view=%s/test-view.avro' % prefix)
    # (Note: we use eval here because of an incompatibility issue between
    # the representation string of JSON dicts in simplejson vs. json)
    assert_equal(eval(response.context['view']['contents']), dummy_datum)

    # offsetting should work as well
    response = self.c.get('/filebrowser/view=%s/test-view.avro?offset=1' % prefix)
    assert_equal('avro', response.context['view']['compression'])

    f = self.cluster.fs.open(prefix + '/test-view2.avro', "w")
    f.write("hello")
    f.close()

    # we shouldn't autodetect non avro files
    response = self.c.get('/filebrowser/view=%s/test-view2.avro' % prefix)
    assert_equal(response.context['view']['contents'], "hello")

    # we should fail to do a bad thing if they specify compression when it's not set.
    response = self.c.get('/filebrowser/view=%s/test-view2.avro?compression=gzip' % prefix)
    assert_true('Failed to decompress' in response.context['message'])
Example #11
  def test_view_snappy_compressed_avro(self):
    if not snappy_installed():
      raise SkipTest
    import snappy

    finish = []
    try:
      prefix = self.cluster.fs_prefix + '/test-snappy-avro-filebrowser'
      self.cluster.fs.mkdir(prefix)

      test_schema = schema.parse("""
        {
          "name": "test",
          "type": "record",
          "fields": [
            { "name": "name", "type": "string" },
            { "name": "integer", "type": "int" }
          ]
        }
      """)

      # Cannot use StringIO with datafile writer!
      f = self.cluster.fs.open(prefix +'/test-view.compressed.avro', "w")
      data_file_writer = datafile.DataFileWriter(f, io.DatumWriter(),
                                                  writers_schema=test_schema,
                                                  codec='snappy')
      dummy_datum = {
        'name': 'Test',
        'integer': 10,
      }
      data_file_writer.append(dummy_datum)
      data_file_writer.close()
      f.close()

      # Check to see if snappy is the codec
      f = self.cluster.fs.open(prefix + '/test-view.compressed.avro', "r")
      assert_true('snappy' in f.read())
      f.close()

      # Snappy compressed succeed
      response = self.c.get('/filebrowser/view=%s/test-view.compressed.avro' % prefix)
      assert_equal('avro', response.context['view']['compression'])
      assert_equal(eval(response.context['view']['contents']), dummy_datum, response)

    finally:
      for done in finish:
        done()
Example #12
    def test_round_trip(self):
        print('')
        print('TEST ROUND TRIP')
        print('===============')
        print('')
        correct = 0
        for i, (example_schema, datum) in enumerate(SCHEMAS_TO_VALIDATE):
            for codec in CODECS_TO_VALIDATE:
                print('')
                print('SCHEMA NUMBER %d' % (i + 1))
                print('================')
                print('')
                print('Schema: %s' % example_schema)
                print('Datum: %s' % datum)
                print('Codec: %s' % codec)

                # write data in binary to file 10 times
                writer = open(FILENAME, 'wb')
                datum_writer = io.DatumWriter()
                schema_object = schema.parse(example_schema)
                dfw = datafile.DataFileWriter(writer,
                                              datum_writer,
                                              schema_object,
                                              codec=codec)
                for _ in range(10):  # don't clobber the enumerate index 'i' used above
                    dfw.append(datum)
                dfw.close()

                # read data in binary from file
                reader = open(FILENAME, 'rb')
                datum_reader = io.DatumReader()
                dfr = datafile.DataFileReader(reader, datum_reader)
                round_trip_data = list(dfr)  # avoid rebinding 'datum', which is compared against below

                print('Round Trip Data: %s' % round_trip_data)
                print('Round Trip Data Length: %d' % len(round_trip_data))
                is_correct = [datum] * 10 == round_trip_data
                if is_correct:
                    correct += 1
                print('Correct Round Trip: %s' % is_correct)
                print('')
        os.remove(FILENAME)
        self.assertEqual(correct,
                         len(CODECS_TO_VALIDATE) * len(SCHEMAS_TO_VALIDATE))
Example #13
  def test_context_manager(self):
    """Test the writer with a 'with' statement."""
    writer = open(FILENAME, 'wb')
    datum_writer = io.DatumWriter()
    sample_schema, sample_datum = SCHEMAS_TO_VALIDATE[1]
    schema_object = schema.parse(sample_schema)
    with datafile.DataFileWriter(writer, datum_writer, schema_object) as dfw:
      dfw.append(sample_datum)
    self.assertTrue(writer.closed)

    # Test the reader with a 'with' statement.
    datums = []
    reader = open(FILENAME, 'rb')
    datum_reader = io.DatumReader()
    with datafile.DataFileReader(reader, datum_reader) as dfr:
      for datum in dfr:
        datums.append(datum)
    self.assertTrue(reader.closed)
Example #14
 def init_avro(self, output_path, part_id, schema_path):
     output_dir = None
     output_dirtmp = None  # Handle Avro Write Error
     if (type(output_path) is str):
         output_dir = self.init_directory(output_path)
         output_dirtmp = self.init_directory(
             output_path + 'tmp')  # Handle Avro Write Error
     out_filename = '%(output_dir)s/part-%(part_id)s.avro' % \
       {"output_dir": output_dir, "part_id": str(part_id)}
     out_filenametmp = '%(output_dirtmp)s/part-%(part_id)s.avro' % \
       {"output_dirtmp": output_dirtmp, "part_id": str(part_id)}  # Handle Avro Write Error
     self.schema = open(schema_path, 'r').read()
     email_schema = schema.parse(self.schema)
     rec_writer = io.DatumWriter(email_schema)
     self.avro_writer = datafile.DataFileWriter(open(out_filename, 'wb'),
                                                rec_writer, email_schema)
     # CREATE A TEMP AvroWriter that can be used to workaround the UnicodeDecodeError when writing into AvroStorage
     self.avro_writertmp = datafile.DataFileWriter(
         open(out_filenametmp, 'wb'), rec_writer, email_schema)
Example #15
    def encode_record_for_topic(self, topic, record, is_key=False):
        """
        Encode a record for a given topic.

        This is expensive as it fetches the latest schema for a given topic.
        """
        if not isinstance(record, dict):
            raise SerializerError("record must be a dictionary")
        subject_suffix = ('-key' if is_key else '-value')
        # get the latest schema for the subject
        subject = topic + subject_suffix
        try:
            schema_id, schema, version = self.registry_client.get_latest_schema(subject)
        except ClientError as e:
            message = "Unable to retrieve schema id for subject %s" % (subject)
            raise SerializerError(message)
        else:
            # cache writer
            self.id_to_writers[schema_id] = io.DatumWriter(schema)
            return self.encode_record_with_schema_id(schema_id, record)
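The method above delegates to encode_record_with_schema_id(), which is not shown here. A plausible shape for it, consistent with the magic-byte framing used by encode_record() later in this collection, is sketched below; this is an assumption, not the library's actual implementation, and MAGIC_BYTE is assumed to be defined as in that later example.

    def encode_record_with_schema_id(self, schema_id, record):
        # Sketch only: frame the Avro-encoded record with a magic byte and the
        # 4-byte big-endian schema ID, reusing the DatumWriter cached above.
        import io as stdio
        import struct
        from avro import io as avro_io

        writer = self.id_to_writers[schema_id]
        outf = stdio.BytesIO()
        outf.write(struct.pack('b', MAGIC_BYTE))  # assumed constant, as in encode_record()
        outf.write(struct.pack('>I', schema_id))
        writer.write(record, avro_io.BinaryEncoder(outf))
        return outf.getvalue()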
Example #16
  def testContextManager(self):
    file_path = self.NewTempFile()

    # Test the writer with a 'with' statement.
    with open(file_path, 'wb') as writer:
      datum_writer = io.DatumWriter()
      sample_schema, sample_datum = SCHEMAS_TO_VALIDATE[1]
      schema_object = schema.parse(sample_schema)
      with datafile.DataFileWriter(writer, datum_writer, schema_object) as dfw:
        dfw.append(sample_datum)
      self.assertTrue(writer.closed)

    # Test the reader with a 'with' statement.
    datums = []
    with open(file_path, 'rb') as reader:
      datum_reader = io.DatumReader()
      with datafile.DataFileReader(reader, datum_reader) as dfr:
        for datum in dfr:
          datums.append(datum)
      self.assertTrue(reader.closed)
Example #17
def write_avro_file(args, outsource='args.avro'):
    SCHEMA = schema.parse(makeSchema(args))
    rec_writer = io.DatumWriter(SCHEMA)

    if outsource == sys.stdout:
        df_writer = datafile.DataFileWriter(sys.stdout, rec_writer,
                                            writers_schema=SCHEMA, codec='deflate')
    else:
        df_writer = datafile.DataFileWriter(open(outsource, 'wb'), rec_writer,
                                            writers_schema=SCHEMA, codec='deflate')

    data = {}
    count = 1
    data['size'] = len(args)
    for arg in args:
        if type(arg) == tuple:
            arg = tupleToList(arg)
        data["arg%s" % count] = arg
        count += 1
    df_writer.append(data)
    df_writer.close()
Example #18
def encode_record(schema_id, schema, record):
    #construct avro writer
    writer = io.DatumWriter(schema)
    outf = StringIO.StringIO()

    # write the header
    # magic byte
    outf.write(struct.pack('b', MAGIC_BYTE))

    # write the schema ID in network byte order (big end)
    outf.write(struct.pack('>I', schema_id))

    # create a binary encoder over the rest of the output buffer
    encoder = io.BinaryEncoder(outf)

    # write the record itself as Avro binary
    writer.write(record, encoder)

    return outf.getvalue()
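A decoding counterpart is sketched below for context. It is not part of the project above: it assumes the caller has already looked up the schema for the embedded ID, and it uses BytesIO in place of the original's Python 2 StringIO.

def decode_record(payload, reader_schema):
    # Hypothetical inverse of encode_record(): strip the 1-byte magic marker and
    # the 4-byte big-endian schema ID, then decode the remaining Avro binary body.
    import io as stdio
    import struct
    from avro import io as avro_io

    magic, schema_id = struct.unpack('>bI', payload[:5])
    decoder = avro_io.BinaryDecoder(stdio.BytesIO(payload[5:]))
    return schema_id, avro_io.DatumReader(reader_schema).read(decoder)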
Example #19
    def __init__(self, scheme=None, outputClient=None):
        """
        Parameters
        ----------
        scheme - the schema for the datums to output; can be a JSON string
                 or an instance of Schema
        outputClient - the output client used to send messages to the parent
        """

        if not (isinstance(scheme, schema.Schema)):
            scheme = schema.parse(scheme)

        if (outputClient is None):
            raise ValueError("output client can't be none.")

        self.scheme = scheme
        self.buff = StringIO()
        self.encoder = avio.BinaryEncoder(self.buff)

        self.datum_writer = avio.DatumWriter(writers_schema=self.scheme)
        self.outputClient = outputClient
Example #20
  def test_metadata(self):
    # Test the writer with a 'with' statement.
    writer = open(FILENAME, 'wb')
    datum_writer = io.DatumWriter()
    sample_schema, sample_datum = SCHEMAS_TO_VALIDATE[1]
    schema_object = schema.parse(sample_schema)
    with datafile.DataFileWriter(writer, datum_writer, schema_object) as dfw:
      dfw.set_meta('test.string', 'foo')
      dfw.set_meta('test.number', '1')
      dfw.append(sample_datum)
    self.assertTrue(writer.closed)

    # Test the reader with a 'with' statement.
    datums = []
    reader = open(FILENAME, 'rb')
    datum_reader = io.DatumReader()
    with datafile.DataFileReader(reader, datum_reader) as dfr:
      self.assertEqual('foo', dfr.get_meta('test.string'))
      self.assertEqual('1', dfr.get_meta('test.number'))
      for datum in dfr:
        datums.append(datum)
    self.assertTrue(reader.closed)
Example #21
def send_event(exchange):
    """Send an event to publish at an input "exchange"."""

    # Get Avro schema, create serialized raw_bytes version of event body
    event_schema = schema.Parse(open(f"schemas/{exchange}.avsc", "rb").read())
    writer = avro_io.DatumWriter(event_schema)

    bytes_writer = io.BytesIO()
    encoder = avro_io.BinaryEncoder(bytes_writer)

    writer.write(event_bodies[exchange], encoder)
    raw_bytes = bytes_writer.getvalue()

    # create connection, declare exchange
    connection = pika.BlockingConnection(
        pika.ConnectionParameters(host='localhost'))
    channel = connection.channel()
    channel.exchange_declare(exchange=exchange, exchange_type='fanout')

    # publish message, close connection
    channel.basic_publish(exchange=exchange, routing_key='', body=raw_bytes)
    connection.close()
Example #22
def write_avro_file():
    # Lets generate our data
    data = {}
    data['name']    = 'Foo'
    data['age']     = 19
    data['address'] = '10, Bar Eggs Spam'
    data['value']   = 800
 
    # Create a 'record' (datum) writer
    rec_writer = io.DatumWriter(SCHEMA)
 
    # Create a 'data file' (avro file) writer
    df_writer = datafile.DataFileWriter(
                    # The file to contain
                    # the records
                    open(OUTFILE_NAME, 'wb'),
                    # The 'record' (datum) writer
                    rec_writer,
                    # Schema, if writing a new file
                    # (aka not 'appending')
                    # (Schema is stored into
                    # the file, so not needed
                    # when you want the writer
                    # to append instead)
                    writers_schema = SCHEMA,
                    # An optional codec name
                    # for compression
                    # ('null' for none)
                    codec = 'deflate'
                )
 
    # Write our data
    # (You can call append multiple times
    # to write more than one record, of course)
    df_writer.append(data)
 
    # Close to ensure writing is complete
    df_writer.close()
Example #23
File: avro.py Project: wesm/hdfs
 def _writer(_schema=self._schema):
     """Records coroutine."""
     writer = None
     try:
         while True:
             obj = (yield)
             if not writer:
                 if not _schema:  # no schema implies no writer
                     _schema = _get_schema(obj)
                     self._schema = _schema
                 datum_writer = avi.DatumWriter(_schema)
                 buf = BytesIO()
                 writer = avd.DataFileWriter(buf, datum_writer, _schema)
             writer.append(obj)
     except GeneratorExit:  # we are ready to send the data to HDFS
         if writer:
              writer.flush()  # make sure everything has been written to the buffer
             buf.seek(0)
             self._client.write(hdfs_path, buf, overwrite=overwrite)
     finally:
         if writer:
             writer.close()
Example #24
  def test_context_manager(self):
    # Context manager was introduced as a first class
    # member only in Python 2.6 and above.
    import sys
    if sys.version_info < (2,6):
      print 'Skipping context manager tests on this Python version.'
      return
    # Test the writer with a 'with' statement.
    writer = open(FILENAME, 'wb')
    datum_writer = io.DatumWriter()
    sample_schema, sample_datum = SCHEMAS_TO_VALIDATE[1]
    schema_object = schema.parse(sample_schema)
    with datafile.DataFileWriter(writer, datum_writer, schema_object) as dfw:
      dfw.append(sample_datum)
    self.assertTrue(writer.closed)

    # Test the reader with a 'with' statement.
    datums = []
    reader = open(FILENAME, 'rb')
    datum_reader = io.DatumReader()
    with datafile.DataFileReader(reader, datum_reader) as dfr:
      for datum in dfr:
        datums.append(datum)
    self.assertTrue(reader.closed)
Example #25
 def generate_avro_file(cls, schema_str: str, out_file,
                        num_rows: int) -> str:
     """Creates an avro file and saves to tmp folder to be used by test cases
     :param schema_str: valid avro schema as a string
     :param out_file: name of file to be created
     :param num_rows: number of rows to be generated
     :return: string with path to the file created
     """
     filename = os.path.join(TMP_FOLDER, out_file + "." + cls.filetype)
     parsed_schema = schema.parse(schema_str)
     rec_writer = io.DatumWriter(parsed_schema)
     file_writer = datafile.DataFileWriter(open(filename, "wb"), rec_writer,
                                           parsed_schema)
     for _ in range(num_rows):
         data = {}
         data["name"] = "".join(
             random.choice(string.ascii_letters) for i in range(10))
         data["age"] = randrange(-100, 100)
         data["address"] = random.uniform(1.1, 100.10)
         data["street"] = random.uniform(1.1, 100.10)
         data["valid"] = random.choice([True, False])
         file_writer.append(data)
     file_writer.close()
     return filename
Example #26
  def testMetadata(self):
    file_path = self.NewTempFile()

    # Test the writer with a 'with' statement.
    with open(file_path, 'wb') as writer:
      datum_writer = io.DatumWriter()
      sample_schema, sample_datum = SCHEMAS_TO_VALIDATE[1]
      schema_object = schema.parse(sample_schema)
      with datafile.DataFileWriter(writer, datum_writer, schema_object) as dfw:
        dfw.SetMeta('test.string', 'foo')
        dfw.SetMeta('test.number', '1')
        dfw.append(sample_datum)
      self.assertTrue(writer.closed)

    # Test the reader with a 'with' statement.
    datums = []
    with open(file_path, 'rb') as reader:
      datum_reader = io.DatumReader()
      with datafile.DataFileReader(reader, datum_reader) as dfr:
        self.assertEqual(b'foo', dfr.GetMeta('test.string'))
        self.assertEqual(b'1', dfr.GetMeta('test.number'))
        for datum in dfr:
          datums.append(datum)
      self.assertTrue(reader.closed)
Example #27
def test_view_avro():
    cluster = pseudo_hdfs4.shared_cluster()
    try:
        c = make_logged_in_client()
        cluster.fs.setuser(cluster.superuser)
        if cluster.fs.isdir("/test-avro-filebrowser"):
            cluster.fs.rmtree('/test-avro-filebrowser/')

        cluster.fs.mkdir('/test-avro-filebrowser/')

        test_schema = schema.parse("""
      {
        "name": "test",
        "type": "record",
        "fields": [
          { "name": "name", "type": "string" },
          { "name": "integer", "type": "int" }
        ]
      }
    """)

        f = cluster.fs.open('/test-avro-filebrowser/test-view.avro', "w")
        data_file_writer = datafile.DataFileWriter(f,
                                                   io.DatumWriter(),
                                                   writers_schema=test_schema,
                                                   codec='deflate')
        dummy_datum = {
            'name': 'Test',
            'integer': 10,
        }
        data_file_writer.append(dummy_datum)
        data_file_writer.close()

        # autodetect
        response = c.get(
            '/filebrowser/view/test-avro-filebrowser/test-view.avro')
        # (Note: we use eval here because of an incompatibility issue between
        # the representation string of JSON dicts in simplejson vs. json)
        assert_equal(eval(response.context['view']['contents']), dummy_datum)

        # offsetting should work as well
        response = c.get(
            '/filebrowser/view/test-avro-filebrowser/test-view.avro?offset=1')
        assert_equal('avro', response.context['view']['compression'])

        f = cluster.fs.open('/test-avro-filebrowser/test-view2.avro', "w")
        f.write("hello")
        f.close()

        # we shouldn't autodetect non avro files
        response = c.get(
            '/filebrowser/view/test-avro-filebrowser/test-view2.avro')
        assert_equal(response.context['view']['contents'], "hello")

        # we should fail to do a bad thing if they specify compression when it's not set.
        response = c.get(
            '/filebrowser/view/test-avro-filebrowser/test-view2.avro?compression=gzip'
        )
        assert_true('Failed to decompress' in response.context['message'])

    finally:
        try:
            cluster.fs.rmtree('/test-avro-filebrowser/')
        except:
            pass  # Don't let cleanup errors mask earlier failures
Example #28
def test_view_snappy_compressed_avro():
    if not snappy_installed():
        raise SkipTest
    import snappy

    cluster = pseudo_hdfs4.shared_cluster()
    finish = []
    try:
        c = make_logged_in_client()
        cluster.fs.setuser(cluster.superuser)
        if cluster.fs.isdir("/test-snappy-avro-filebrowser"):
            cluster.fs.rmtree('/test-snappy-avro-filebrowser/')

        cluster.fs.mkdir('/test-snappy-avro-filebrowser/')

        test_schema = schema.parse("""
      {
        "name": "test",
        "type": "record",
        "fields": [
          { "name": "name", "type": "string" },
          { "name": "integer", "type": "int" }
        ]
      }
    """)

        # Cannot use StringIO with datafile writer!
        f = cluster.fs.open('/test-snappy-avro-filebrowser/test-view.avro',
                            "w")
        data_file_writer = datafile.DataFileWriter(f,
                                                   io.DatumWriter(),
                                                   writers_schema=test_schema,
                                                   codec='deflate')
        dummy_datum = {
            'name': 'Test',
            'integer': 10,
        }
        data_file_writer.append(dummy_datum)
        data_file_writer.close()

        fh = cluster.fs.open('/test-snappy-avro-filebrowser/test-view.avro',
                             'r')
        f = cluster.fs.open(
            '/test-snappy-avro-filebrowser/test-view.compressed.avro', "w")
        f.write(snappy.compress(fh.read()))
        f.close()
        fh.close()

        # Snappy compressed fail
        response = c.get(
            '/filebrowser/view/test-snappy-avro-filebrowser/test-view.avro?compression=snappy_avro'
        )
        assert_true('Failed to decompress' in response.context['message'],
                    response)

        # Snappy compressed succeed
        response = c.get(
            '/filebrowser/view/test-snappy-avro-filebrowser/test-view.compressed.avro'
        )
        assert_equal('snappy_avro', response.context['view']['compression'])
        assert_equal(eval(response.context['view']['contents']), dummy_datum,
                     response)
        response = c.get(
            '/filebrowser/view/test-snappy-avro-filebrowser/test-view.compressed.avro?compression=snappy_avro'
        )
        assert_equal('snappy_avro', response.context['view']['compression'])
        assert_equal(eval(response.context['view']['contents']), dummy_datum,
                     response)

        # Avro should also decompress snappy
        response = c.get(
            '/filebrowser/view/test-snappy-avro-filebrowser/test-view.compressed.avro?compression=avro'
        )
        assert_equal('snappy_avro', response.context['view']['compression'])
        assert_equal(eval(response.context['view']['contents']), dummy_datum,
                     response)

        # Largest snappy compressed file
        finish.append(MAX_SNAPPY_DECOMPRESSION_SIZE.set_for_testing(1))
        response = c.get(
            '/filebrowser/view/test-snappy-avro-filebrowser/test-view.avro?compression=snappy_avro'
        )
        assert_true(
            'File size is greater than allowed max snappy decompression size of 1'
            in response.context['message'], response)

    finally:
        for done in finish:
            done()
        try:
            cluster.fs.rmtree('/test-snappy-avro-filebrowser/')
        except:
            pass  # Don't let cleanup errors mask earlier failures
Example #29
    def test1(self):
        from word_count_task import WordCountTask
        from avro.tether import TaskRunner, find_port, HTTPRequestor, inputProtocol, TaskType
        from avro import io as avio
        import mock_tether_parent
        import subprocess
        import StringIO
        import logging

        # set the logging level to debug so that debug messages are printed
        logging.basicConfig(level=logging.DEBUG)

        proc = None
        try:
            # launch the server in a separate process
            env = dict()
            env["PYTHONPATH"] = ':'.join(sys.path)
            parent_port = find_port()

            pyfile = mock_tether_parent.__file__
            proc = subprocess.Popen(
                ["python", pyfile, "start_server", "{0}".format(parent_port)])
            input_port = find_port()

            print "Mock server started process pid={0}".format(proc.pid)
            # Possible race condition? open tries to connect to the subprocess before the subprocess is fully started
            # so we give the subprocess time to start up
            time.sleep(1)

            runner = TaskRunner(WordCountTask())

            runner.start(outputport=parent_port, join=False)

            # Test sending various messages to the server and ensuring they are
            # processed correctly
            requestor = HTTPRequestor("localhost",
                                      runner.server.server_address[1],
                                      inputProtocol)

            # TODO: We should validate that open worked by grabbing the STDOUT of the subprocess
            # and ensuring that it outputted the correct message.

            # Test the mapper
            requestor.request(
                "configure", {
                    "taskType": TaskType.MAP,
                    "inSchema": str(runner.task.inschema),
                    "outSchema": str(runner.task.midschema)
                })

            # Serialize some data so we can send it to the input function
            datum = "This is a line of text"
            writer = StringIO.StringIO()
            encoder = avio.BinaryEncoder(writer)
            datum_writer = avio.DatumWriter(runner.task.inschema)
            datum_writer.write(datum, encoder)

            writer.seek(0)
            data = writer.read()

            # Call input to simulate calling map
            requestor.request("input", {"data": data, "count": 1})

            #Test the reducer
            requestor.request(
                "configure", {
                    "taskType": TaskType.REDUCE,
                    "inSchema": str(runner.task.midschema),
                    "outSchema": str(runner.task.outschema)
                })

            #Serialize some data so we can send it to the input function
            datum = {"key": "word", "value": 2}
            writer = StringIO.StringIO()
            encoder = avio.BinaryEncoder(writer)
            datum_writer = avio.DatumWriter(runner.task.midschema)
            datum_writer.write(datum, encoder)

            writer.seek(0)
            data = writer.read()

            #Call input to simulate calling reduce
            requestor.request("input", {"data": data, "count": 1})

            requestor.request("complete", {})

            runner.task.ready_for_shutdown.wait()
            runner.server.shutdown()
            #time.sleep(2)
            #runner.server.shutdown()

            sthread = runner.sthread

            #Possible race condition?
            time.sleep(1)

            #make sure the other thread terminated
            self.assertFalse(sthread.isAlive())

            #shutdown the logging
            logging.shutdown()

        except Exception as e:
            raise
        finally:
            #close the process
            if not (proc is None):
                proc.kill()
Example #30
 def write(self, fp, datum, schema):
     sch = self.names.get_name('edu.berkeley.cs.local.' + schema, None)
     dwriter = io.DatumWriter(writers_schema=sch)
     dwriter.write(datum, io.BinaryEncoder(fp))
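For completeness, a read-side mirror of this method might look like the sketch below. It assumes, as in the snippet above, that io refers to avro.io, that self.names resolves the named schema, and that this avro version accepts the writers_schema keyword.

 def read(self, fp, schema):
     # Hypothetical mirror of write() above: resolve the named schema the same
     # way, then decode a single datum from the file-like object.
     sch = self.names.get_name('edu.berkeley.cs.local.' + schema, None)
     dreader = io.DatumReader(writers_schema=sch)
     return dreader.read(io.BinaryDecoder(fp))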