Code Example #1
  def test_append(self):
    print ''
    print 'TEST APPEND'
    print '==========='
    print ''
    correct = 0
    for i, (example_schema, datum) in enumerate(SCHEMAS_TO_VALIDATE):
      for codec in CODECS_TO_VALIDATE:
        if (codec == 'snappy'):
          try:
            import snappy
          except ImportError:
            print 'Snappy not present. Skipping.'
            correct += 1
            continue
        print ''
        print 'SCHEMA NUMBER %d' % (i + 1)
        print '================'
        print ''
        print 'Schema: %s' % example_schema
        print 'Datum: %s' % datum
        print 'Codec: %s' % codec

        # write data in binary to file once
        writer = open(FILENAME, 'wb')
        datum_writer = io.DatumWriter()
        schema_object = schema.parse(example_schema)
        dfw = datafile.DataFileWriter(writer, datum_writer, schema_object, codec=codec)
        dfw.append(datum)
        dfw.close()

        # open file, write, and close nine times
        for _ in range(9):
          writer = open(FILENAME, 'ab+')
          dfw = datafile.DataFileWriter(writer, io.DatumWriter())
          dfw.append(datum)
          dfw.close()

        # read data in binary from file
        reader = open(FILENAME, 'rb')
        datum_reader = io.DatumReader()
        dfr = datafile.DataFileReader(reader, datum_reader)
        appended_data = []
        # use a distinct name so the written datum is preserved for the check below
        for read_datum in dfr:
          appended_data.append(read_datum)
        dfr.close()

        print 'Appended Data: %s' % appended_data
        print 'Appended Data Length: %d' % len(appended_data)
        is_correct = [datum] * 10 == appended_data
        if is_correct: correct += 1
        print 'Correct Appended: %s' % is_correct
        print ''
    os.remove(FILENAME)
    self.assertEquals(correct, len(CODECS_TO_VALIDATE)*len(SCHEMAS_TO_VALIDATE))
Code Example #2
File: test_datafile.py Project: timjroberts/avro
  def testAppend(self):
    correct = 0
    codecs_to_validate = get_codecs_to_validate()
    for iexample, (writer_schema, datum) in enumerate(SCHEMAS_TO_VALIDATE):
      for codec in codecs_to_validate:
        file_path = self.NewTempFile()

        logging.debug(
            'Performing append with codec %r in file %s for example #%d\n'
            'Writing datum: %r using writer schema:\n%s',
            codec, file_path, iexample,
            datum, writer_schema)

        logging.debug('Creating data file %r', file_path)
        with open(file_path, 'wb') as writer:
          datum_writer = io.DatumWriter()
          schema_object = schema.parse(writer_schema)
          with datafile.DataFileWriter(
              writer=writer,
              datum_writer=datum_writer,
              writer_schema=schema_object,
              codec=codec,
          ) as dfw:
            dfw.append(datum)

        logging.debug('Appending data to %r', file_path)
        for i in range(9):
          with open(file_path, 'ab+') as writer:
            with datafile.DataFileWriter(writer, io.DatumWriter()) as dfw:
              dfw.append(datum)

        logging.debug('Reading appended data from %r', file_path)
        with open(file_path, 'rb') as reader:
          datum_reader = io.DatumReader()
          with datafile.DataFileReader(reader, datum_reader) as dfr:
            appended_data = list(dfr)

        logging.debug(
            'Appended data has %d items: %r',
            len(appended_data), appended_data)

        if ([datum] * 10) == appended_data:
          correct += 1
        else:
          logging.error(
              'Appended data does not match:\n'
              'Expect: %r\n'
              'Actual: %r',
              [datum] * 10,
              appended_data)

    self.assertEqual(
        correct,
        len(codecs_to_validate) * len(SCHEMAS_TO_VALIDATE))
Code Example #3
def obtain_df_writer(filename):
    '''Return a DataFileWriter object used to write data to an .avro file.'''
    return datafile.DataFileWriter(
        open(filename, 'wb'),
        rec_writer,
        writers_schema=SCHEMA
        )
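
The snippet above references a module-level rec_writer and SCHEMA that are not shown. A minimal sketch of how they might be set up, assuming a hypothetical user.avsc schema with a string field "name" and an int field "age" (the file names, fields, and import style are illustrative assumptions, not taken from the original project):

from avro import schema, io, datafile

# Hypothetical globals assumed by obtain_df_writer() above.
SCHEMA = schema.parse(open('user.avsc', 'r').read())  # parsed writer schema
rec_writer = io.DatumWriter()                         # serializes individual records

# Usage sketch: the appended datum must conform to the (assumed) schema.
writer = obtain_df_writer('users.avro')
writer.append({'name': 'Alice', 'age': 30})
writer.close()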
Code Example #4
File: test_datafile.py Project: sowmitra/avro
    def test_context_manager(self):
        # Context manager was introduced as a first class
        # member only in Python 2.6 and above.
        import sys
        if sys.version_info < (2, 6):
            print 'Skipping context manager tests on this Python version.'
            return
        # Test the writer with a 'with' statement.
        writer = open(FILENAME, 'wb')
        datum_writer = io.DatumWriter()
        sample_schema, sample_datum = SCHEMAS_TO_VALIDATE[1]
        schema_object = schema.parse(sample_schema)
        with datafile.DataFileWriter(writer, datum_writer,
                                     schema_object) as dfw:
            dfw.append(sample_datum)
        self.assertTrue(writer.closed)

        # Test the reader with a 'with' statement.
        datums = []
        reader = open(FILENAME, 'rb')
        datum_reader = io.DatumReader()
        with datafile.DataFileReader(reader, datum_reader) as dfr:
            for datum in dfr:
                datums.append(datum)
        self.assertTrue(reader.closed)
Code Example #5
File: serializer.py Project: cpiva/avro-serializer
def process_files(output_path, hdfs_path, batch):
    """Process all files in batch a produce an avro file. """
    now = datetime.datetime.now()
    ts = now.strftime("%Y-%m-%d-%H-%M-%S-%f")
    output_filename = FILE_PREFIX + "-" + ts + '.avro'
    print "* creating new avro file: " + output_filename
    xschema = schema.parse(open(SCHEMA_FILE).read())
    rec_writer = io.DatumWriter(xschema)
    df_writer = datafile.DataFileWriter(
                open(output_path + output_filename, 'wb'),
                rec_writer,
                writers_schema=xschema,
                codec='deflate')

    for file_path in batch:
        bytes = read_binary(file_path)
        content = base64.b64encode(bytes)
        data = {}
        data['doc_uuid'] = str(uuid.uuid4())
        data['file_path'] = file_path
        data['content'] = content
        df_writer.append(data)

    df_writer.close()
    time.sleep(1)
    hdfs_put(output_path + output_filename, hdfs_path)
Code Example #6
def sample(args):
    """
    Select a random sample of the records from the input files
    and write them into an output file.

    This command assumes that all the input files have the same
    schema.
    
    Arguments:
        infiles: Input files
        outfile: Output file
        sample_ratio: Ratio of records selected (0 <= ratio <= 1).
        codec:   Compression codec for the output
    
    """
    # Get the schema from the first file.
    json_schema = args.infiles[0].meta[datafile.SCHEMA_KEY]
    writers_schema = schema.parse(json_schema)

    rec_writer = io.DatumWriter()
    writer = datafile.DataFileWriter(args.outfile,
                                     rec_writer,
                                     writers_schema=writers_schema,
                                     codec=args.out_codec)

    for infile in args.infiles:
        try:
            for record in infile:
                if args.sample_ratio >= random.random():
                    writer.append(record)
        except:
            print >> sys.stderr, "Error reading file. Skipping", infile
            logging.exception('Error reading input file: %s' % infile)
            continue

    # Finalize the output container file (mirrors the cat() example below).
    writer.close()
Code Example #7
def cat(args):
    """
    Concatenates files and stores the result in an output file.

    It assumes that all the input files have the same schema.
    """

    # Get the schema from the first file.
    json_schema = args.infiles[0].meta[datafile.SCHEMA_KEY]
    writers_schema = schema.parse(json_schema)

    rec_writer = io.DatumWriter()
    writer = datafile.DataFileWriter(args.outfile,
                                     rec_writer,
                                     writers_schema=writers_schema,
                                     codec=args.out_codec)

    for infile in args.infiles:
        try:
            for record in infile:
                writer.append(record)
        except:
            print >> sys.stderr, "Error reading file. Skipping", infile
            logging.exception('Error reading input file: %s' % infile)
            continue

    writer.close()
Code Example #8
def main():
    # Check the number of arguments
    if len(sys.argv) != 3:
        sys.exit('Usage %s <Schema file> <Data_file>' % (sys.argv[0]))

    # Read the schema from the .avsc file
    schema_string = open(sys.argv[1], "r").read()

    # Open the Avro output file
    avro_file = open(sys.argv[2], "wb")

    # Get a DatumWriter object
    datum_writer = io.DatumWriter()

    # Parse the schema
    schema_object = schema.parse(schema_string)

    # Get a DataFileWriter object
    data_file_writer = datafile.DataFileWriter(avro_file, datum_writer,
                                               schema_object)

    # Append records read from standard input
    for line in sys.stdin:
        (left, right) = line[:-1].split(",")
        data_file_writer.append({'left': left, "right": right})

    # Close the DataFileWriter
    data_file_writer.close()
Code Example #9
File: avro_tool.py Project: rajeshmr/kiji
def write(opts, files):
    if not opts.schema:
        raise AvroError('No schema specified')

    input_type = opts.input_type or guess_input_type(files)
    if not input_type:
        raise AvroError('Cannot guess input file type (not .json or .csv)')

    try:
        with open(opts.schema, 'rt') as f:
            json_schema = f.read()
        writer_schema = schema.parse(json_schema)
        out = _open(opts.output, 'wb')
    except (IOError, OSError) as e:
        raise AvroError('Cannot open file - %s' % e)

    record_parser_map = {
        'json': iter_json,
        'csv': iter_csv,
    }

    with datafile.DataFileWriter(
            writer=out,
            datum_writer=avro_io.DatumWriter(),
            writer_schema=writer_schema,
    ) as writer:
        iter_records = record_parser_map[input_type]
        for filename in (files or ['-']):
            reader = _open(filename, 'rt')
            for record in iter_records(reader, writer_schema):
                writer.append(record)
Code Example #10
    def test_container(self):
        writer = open('data.avro', 'wb')
        datum_writer = io.DatumWriter()
        schema_object = schema.parse("""\
{ "type": "record",
  "name": "StringPair",
  "doc": "A pair of strings.",
  "fields": [
    {"name": "left", "type": "string"},
    {"name": "right", "type": "string"}
  ]
}
    """)
        dfw = datafile.DataFileWriter(writer, datum_writer, schema_object)
        datum = {'left': 'L', 'right': 'R'}
        dfw.append(datum)
        dfw.close()

        reader = open('data.avro', 'rb')
        datum_reader = io.DatumReader()
        dfr = datafile.DataFileReader(reader, datum_reader)
        data = []
        # use a distinct name so the written datum is preserved for the assertions below
        for read_datum in dfr:
            data.append(read_datum)
        dfr.close()

        self.assertEquals(1, len(data))
        self.assertEquals(datum, data[0])
Code Example #11
File: test_datafile.py Project: zhulanovaanna/avro
    def testMetadata(self):
        file_path = self.NewTempFile()

        # Test the writer with a 'with' statement.
        with open(file_path, 'wb') as writer:
            datum_writer = io.DatumWriter()
            sample_schema, sample_datum = SCHEMAS_TO_VALIDATE[1]
            schema_object = schema.parse(sample_schema)
            with datafile.DataFileWriter(writer, datum_writer,
                                         schema_object) as dfw:
                dfw.SetMeta('test.string', 'foo')
                dfw.SetMeta('test.number', '1')
                dfw.append(sample_datum)
            self.assertTrue(writer.closed)

        # Test the reader with a 'with' statement.
        datums = []
        with open(file_path, 'rb') as reader:
            datum_reader = io.DatumReader()
            with datafile.DataFileReader(reader, datum_reader) as dfr:
                self.assertEqual(b'foo', dfr.GetMeta('test.string'))
                self.assertEqual(b'1', dfr.GetMeta('test.number'))
                for datum in dfr:
                    datums.append(datum)
            self.assertTrue(reader.closed)
Code Example #12
File: JSONtoAvro.py Project: siddarthkotnala/hadoop
def main():
	if len(sys.argv) < 2:
		print "Usage: cat input.json | python2.7 JSONtoAvro.py output"
		return

	s = schema.parse(open("tweet.avsc").read())
	f = open(sys.argv[1], "wb")

	writer = datafile.DataFileWriter(f, io.DatumWriter(), s, codec = 'deflate')

	failed = 0

	for line in sys.stdin:
		line = line.strip()

		try:
			data = json.loads(line)
		except ValueError as detail:
			continue

		try:
			writer.append(data)
		except io.AvroTypeException as detail:
			print line
			failed += 1

	writer.close()

	print str(failed) + " failed in schema"
Code Example #13
File: views_test.py Project: ymc/hue
def test_view_avro():
  cluster = pseudo_hdfs4.shared_cluster()
  try:
    c = make_logged_in_client()
    cluster.fs.setuser(cluster.superuser)
    if cluster.fs.isdir("/test-avro-filebrowser"):
      cluster.fs.rmtree('/test-avro-filebrowser/')

    cluster.fs.mkdir('/test-avro-filebrowser/')

    test_schema = schema.parse("""
      {
        "name": "test",
        "type": "record",
        "fields": [
          { "name": "name", "type": "string" },
          { "name": "integer", "type": "int" }
        ]
      }
    """)

    f = cluster.fs.open('/test-avro-filebrowser/test-view.avro', "w")
    data_file_writer = datafile.DataFileWriter(f, io.DatumWriter(),
                                                writers_schema=test_schema,
                                                codec='deflate')
    dummy_datum = {
      'name': 'Test',
      'integer': 10,
    }
    data_file_writer.append(dummy_datum)
    data_file_writer.close()

    # autodetect
    response = c.get('/filebrowser/view/test-avro-filebrowser/test-view.avro')
    # (Note: we use eval here because of an incompatibility between the
    # string representations of JSON dicts in simplejson vs. json)
    assert_equal(eval(response.context['view']['contents']), dummy_datum)

    # offsetting should work as well
    response = c.get('/filebrowser/view/test-avro-filebrowser/test-view.avro?offset=1')
    assert_equal('avro', response.context['view']['compression'])

    f = cluster.fs.open('/test-avro-filebrowser/test-view2.avro', "w")
    f.write("hello")
    f.close()

    # we shouldn't autodetect non avro files
    response = c.get('/filebrowser/view/test-avro-filebrowser/test-view2.avro')
    assert_equal(response.context['view']['contents'], "hello")

    # we should fail to do a bad thing if they specify compression when it's not set.
    response = c.get('/filebrowser/view/test-avro-filebrowser/test-view2.avro?compression=gzip')
    assert_true('Failed to decompress' in response.context['message'])

  finally:
    try:
      cluster.fs.rmtree('/test-avro-filebrowser/')
    except:
      pass      # Don't let cleanup errors mask earlier failures
Code Example #14
File: gen_interop_data.py Project: behe/avrocado
def write(interop_schema, writer, codec):
    datum_writer = io.DatumWriter()
    dfw = datafile.DataFileWriter(writer,
                                  datum_writer,
                                  interop_schema,
                                  codec=codec)
    dfw.append(DATUM)
    dfw.close()
Code Example #15
 def write_avro_file(self, file_object, schema, rec_creator, n_samples,
                     sync_interval):
     avdf.SYNC_INTERVAL = sync_interval
     self.assertEqual(avdf.SYNC_INTERVAL, sync_interval)
     writer = avdf.DataFileWriter(file_object, DatumWriter(), schema)
     for i in xrange(n_samples):
         writer.append(rec_creator(i))
     writer.close()
Code Example #16
 def write_avro_file(self, rec_creator, n_samples, sync_interval):
     avdf.SYNC_INTERVAL = sync_interval
     self.assertEqual(avdf.SYNC_INTERVAL, sync_interval)
     fo = self._mkf('data.avro', mode='wb')
     with avdf.DataFileWriter(fo, DatumWriter(), self.schema) as writer:
         for i in range(n_samples):
             writer.append(rec_creator(i))
     return fo.name
Code Example #17
def test_view_snappy_compressed_avro():
  if not snappy_installed():
    raise SkipTest
  import snappy

  cluster = pseudo_hdfs4.shared_cluster()
  finish = []
  try:
    c = make_logged_in_client()
    cluster.fs.setuser(cluster.superuser)
    if cluster.fs.isdir("/test-snappy-avro-filebrowser"):
      cluster.fs.rmtree('/test-snappy-avro-filebrowser/')

    cluster.fs.mkdir('/test-snappy-avro-filebrowser/')

    test_schema = schema.parse("""
      {
        "name": "test",
        "type": "record",
        "fields": [
          { "name": "name", "type": "string" },
          { "name": "integer", "type": "int" }
        ]
      }
    """)

    # Cannot use StringIO with datafile writer!
    f = cluster.fs.open('/test-snappy-avro-filebrowser/test-view.compressed.avro', "w")
    data_file_writer = datafile.DataFileWriter(f, io.DatumWriter(),
                                                writers_schema=test_schema,
                                                codec='snappy')
    dummy_datum = {
      'name': 'Test',
      'integer': 10,
    }
    data_file_writer.append(dummy_datum)
    data_file_writer.close()
    f.close()

    # Check to see if snappy is the codec
    f = cluster.fs.open('/test-snappy-avro-filebrowser/test-view.compressed.avro', "r")
    assert_true('snappy' in f.read())
    f.close()

    # Reading the Snappy-compressed file should succeed
    response = c.get('/filebrowser/view/test-snappy-avro-filebrowser/test-view.compressed.avro')
    assert_equal('avro', response.context['view']['compression'])
    assert_equal(eval(response.context['view']['contents']), dummy_datum, response)

  finally:
    for done in finish:
      done()
    try:
      cluster.fs.rmtree('/test-snappy-avro-filebrowser/')
    except:
      pass      # Don't let cleanup errors mask earlier failures
Code Example #18
 def test_write_data(self):
     writer = open('pairs.avro', 'wb')
     datum_writer = io.DatumWriter()
     schema_object = schema.Parse(open('Pair.avsc').read())
     dfw = datafile.DataFileWriter(writer, datum_writer, schema_object)
     dfw.append({'left': 'a', 'right': '1'})
     dfw.append({'left': 'c', 'right': '2'})
     dfw.append({'left': 'b', 'right': '3'})
     dfw.append({'left': 'b', 'right': '2'})
     dfw.close()
Code Example #19
def generate(schema_file, output_path):
    interop_schema = schema.Parse(open(schema_file, 'r').read())
    datum_writer = io.DatumWriter()
    for codec in datafile.VALID_CODECS:
        filename = 'py3'
        if codec != 'null':
            filename += '_' + codec
        with Path(output_path, filename).with_suffix('.avro').open('wb') as writer, \
          datafile.DataFileWriter(writer, datum_writer, interop_schema, codec) as dfw:
            dfw.append(DATUM)
Code Example #20
 def init_avro(self, output_path, part_id, schema_path):
     output_dir = None
     output_dirtmp = None  # Handle Avro Write Error
     if (type(output_path) is str):
         output_dir = self.init_directory(output_path)
         output_dirtmp = self.init_directory(
             output_path + 'tmp')  # Handle Avro Write Error
     out_filename = '%(output_dir)s/part-%(part_id)s.avro' % \
       {"output_dir": output_dir, "part_id": str(part_id)}
     out_filenametmp = '%(output_dirtmp)s/part-%(part_id)s.avro' % \
       {"output_dirtmp": output_dirtmp, "part_id": str(part_id)}  # Handle Avro Write Error
     self.schema = open(schema_path, 'r').read()
     email_schema = schema.parse(self.schema)
     rec_writer = io.DatumWriter(email_schema)
     self.avro_writer = datafile.DataFileWriter(open(out_filename, 'wb'),
                                                rec_writer, email_schema)
     # CREATE A TEMP AvroWriter that can be used to workaround the UnicodeDecodeError when writing into AvroStorage
     self.avro_writertmp = datafile.DataFileWriter(
         open(out_filenametmp, 'wb'), rec_writer, email_schema)
Code Example #21
 def init_avro(self, output_path, part_id, schema_path):
     output_dir = None
     if (type(output_path) is str):
         output_dir = self.init_directory(output_path)
     out_filename = '%(output_dir)s/part-%(part_id)s.avro' % \
       {"output_dir": output_dir, "part_id": str(part_id)}
     self.schema = open(schema_path, 'r').read()
     email_schema = schema.parse(self.schema)
     rec_writer = io.DatumWriter(email_schema)
     self.avro_writer = datafile.DataFileWriter(open(out_filename, 'wb'),
                                                rec_writer, email_schema)
Code Example #22
File: py_avro_inter.py Project: pbirsinger/aspNew
def write_avro_file(args, outsource='args.avro'):
    SCHEMA = schema.parse(makeSchema(args))
    rec_writer = io.DatumWriter(SCHEMA)

    if outsource == sys.stdout:
        df_writer = datafile.DataFileWriter(sys.stdout, rec_writer,
                                            writers_schema=SCHEMA, codec='deflate')
    else:
        df_writer = datafile.DataFileWriter(open(outsource, 'wb'), rec_writer,
                                            writers_schema=SCHEMA, codec='deflate')

    data = {}
    count = 1
    data['size'] = len(args)
    for arg in args:
        if type(arg) == tuple:
            arg = tupleToList(arg)
        data["arg%s" % count] = arg
        count += 1
    df_writer.append(data)
    df_writer.close()
Code Example #23
File: test_datafile.py Project: timjroberts/avro
  def testRoundTrip(self):
    correct = 0
    codecs_to_validate = get_codecs_to_validate()
    for iexample, (writer_schema, datum) in enumerate(SCHEMAS_TO_VALIDATE):
      for codec in codecs_to_validate:
        file_path = self.NewTempFile()

        # Write the datum this many times in the data file:
        nitems = 10

        logging.debug(
            'Performing round-trip with codec %r in file %s for example #%d\n'
            'Writing datum: %r using writer schema:\n%s',
            codec, file_path, iexample,
            datum, writer_schema)

        logging.debug('Creating data file %r', file_path)
        with open(file_path, 'wb') as writer:
          datum_writer = io.DatumWriter()
          schema_object = schema.parse(writer_schema)
          with datafile.DataFileWriter(
              writer=writer,
              datum_writer=datum_writer,
              writer_schema=schema_object,
              codec=codec,
          ) as dfw:
            for _ in range(nitems):
              dfw.append(datum)

        logging.debug('Reading data from %r', file_path)
        with open(file_path, 'rb') as reader:
          datum_reader = io.DatumReader()
          with datafile.DataFileReader(reader, datum_reader) as dfr:
            round_trip_data = list(dfr)

        logging.debug(
            'Round-trip data has %d items: %r',
            len(round_trip_data), round_trip_data)

        if ([datum] * nitems) == round_trip_data:
          correct += 1
        else:
          logging.error(
              'Round-trip data does not match:\n'
              'Expect: %r\n'
              'Actual: %r',
              [datum] * nitems,
              round_trip_data)

    self.assertEqual(
        correct,
        len(codecs_to_validate) * len(SCHEMAS_TO_VALIDATE))
Code Example #24
 def test_write_data(self):
     writer = open('pairs.avro', 'wb')
     datum_writer = io.DatumWriter()
     schema_object = schema.parse(
         open(
             '/Users/tom/workspace/hadoop-book-avro/src/main/java/Pair.avsc'
         ).read())
     dfw = datafile.DataFileWriter(writer, datum_writer, schema_object)
     dfw.append({'left': 'a', 'right': '1'})
     dfw.append({'left': 'c', 'right': '2'})
     dfw.append({'left': 'b', 'right': '3'})
     dfw.append({'left': 'b', 'right': '2'})
     dfw.close()
Code Example #25
  def test_empty_datafile(self):
    """A reader should not fail to read a file consisting of a single empty block."""
    sample_schema = schema.parse(SCHEMAS_TO_VALIDATE[1][0])
    with datafile.DataFileWriter(open(FILENAME, 'wb'), io.DatumWriter(),
        sample_schema) as dfw:
      dfw.flush()
      # Write an empty block
      dfw.encoder.write_long(0)
      dfw.encoder.write_long(0)
      dfw.writer.write(dfw.sync_marker)

    with datafile.DataFileReader(open(FILENAME, 'rb'), io.DatumReader()) as dfr:
      self.assertEqual([], list(dfr))
Code Example #26
 def test_write_data(self):
     writer = open('pairs.avro', 'wb')
     datum_writer = io.DatumWriter()
     schema_object = schema.Parse(
         open(
             '/Users/zzy/Docs/hadoop_book/ch12-avro/src/main/resources/StringPair.avsc'
         ).read())
     dfw = datafile.DataFileWriter(writer, datum_writer, schema_object)
     dfw.append({'left': 'a', 'right': '1'})
     dfw.append({'left': 'c', 'right': '2'})
     dfw.append({'left': 'b', 'right': '3'})
     dfw.append({'left': 'b', 'right': '2'})
     dfw.close()
Code Example #27
File: output_avro.py Project: raider377/qnt-python
def merge_output_records_to_file(records):
    bio = BytesIO()

    schema = avs.Parse(json.dumps(output_schema))

    writer = aio.DatumWriter()
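    # Bypass schema-driven encoding: with this patch, each datum passed to
    # dw.append() is written to the block as-is (presumably pre-encoded bytes).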
    writer.write = lambda datum, encoder: encoder.write(datum)

    dw = adf.DataFileWriter(bio, writer, schema)

    for r in records:
        dw.append(r)
    dw.flush()

    return bio.getvalue()
Code Example #28
  def test_round_trip(self):
    print ''
    print 'TEST ROUND TRIP'
    print '==============='
    print ''
    correct = 0
    for i, (example_schema, datum) in enumerate(SCHEMAS_TO_VALIDATE):
      for codec in CODECS_TO_VALIDATE:
        if (codec == 'snappy'):
          try:
            import snappy
          except ImportError:
            print 'Snappy not present. Skipping.'
            correct += 1
            continue
        print ''
        print 'SCHEMA NUMBER %d' % (i + 1)
        print '================'
        print ''
        print 'Schema: %s' % example_schema
        print 'Datum: %s' % datum
        print 'Codec: %s' % codec

        # write data in binary to file 10 times
        writer = open(FILENAME, 'wb')
        datum_writer = io.DatumWriter()
        schema_object = schema.parse(example_schema)
        dfw = datafile.DataFileWriter(writer, datum_writer, schema_object, codec=codec)
        for _ in range(10):
          dfw.append(datum)
        dfw.close()

        # read data in binary from file
        reader = open(FILENAME, 'rb')
        datum_reader = io.DatumReader()
        dfr = datafile.DataFileReader(reader, datum_reader)
        round_trip_data = []
        # use a distinct name so the written datum is preserved for the check below
        for read_datum in dfr:
          round_trip_data.append(read_datum)
        dfr.close()

        print 'Round Trip Data: %s' % round_trip_data
        print 'Round Trip Data Length: %d' % len(round_trip_data)
        is_correct = [datum] * 10 == round_trip_data
        if is_correct: correct += 1
        print 'Correct Round Trip: %s' % is_correct
        print ''
    os.remove(FILENAME)
    self.assertEquals(correct, len(CODECS_TO_VALIDATE)*len(SCHEMAS_TO_VALIDATE))
Code Example #29
  def test_view_snappy_compressed_avro(self):
    if not snappy_installed():
      raise SkipTest
    import snappy

    finish = []
    try:
      prefix = self.cluster.fs_prefix + '/test-snappy-avro-filebrowser'
      self.cluster.fs.mkdir(prefix)

      test_schema = schema.parse("""
        {
          "name": "test",
          "type": "record",
          "fields": [
            { "name": "name", "type": "string" },
            { "name": "integer", "type": "int" }
          ]
        }
      """)

      # Cannot use StringIO with datafile writer!
      f = self.cluster.fs.open(prefix +'/test-view.compressed.avro', "w")
      data_file_writer = datafile.DataFileWriter(f, io.DatumWriter(),
                                                  writers_schema=test_schema,
                                                  codec='snappy')
      dummy_datum = {
        'name': 'Test',
        'integer': 10,
      }
      data_file_writer.append(dummy_datum)
      data_file_writer.close()
      f.close()

      # Check to see if snappy is the codec
      f = self.cluster.fs.open(prefix + '/test-view.compressed.avro', "r")
      assert_true('snappy' in f.read())
      f.close()

      # Reading the Snappy-compressed file should succeed
      response = self.c.get('/filebrowser/view=%s/test-view.compressed.avro' % prefix)
      assert_equal('avro', response.context['view']['compression'])
      assert_equal(eval(response.context['view']['contents']), dummy_datum, response)

    finally:
      for done in finish:
        done()
Code Example #30
  def test_view_avro(self):
    prefix = self.cluster.fs_prefix + '/test_view_avro'
    self.cluster.fs.mkdir(prefix)

    test_schema = schema.parse("""
      {
        "name": "test",
        "type": "record",
        "fields": [
          { "name": "name", "type": "string" },
          { "name": "integer", "type": "int" }
        ]
      }
    """)

    f = self.cluster.fs.open(prefix + '/test-view.avro', "w")
    data_file_writer = datafile.DataFileWriter(f, io.DatumWriter(),
                                                writers_schema=test_schema,
                                                codec='deflate')
    dummy_datum = {
      'name': 'Test',
      'integer': 10,
    }
    data_file_writer.append(dummy_datum)
    data_file_writer.close()

    # autodetect
    response = self.c.get('/filebrowser/view=%s/test-view.avro' % prefix)
    # (Note: we use eval here because of an incompatibility between the
    # string representations of JSON dicts in simplejson vs. json)
    assert_equal(eval(response.context['view']['contents']), dummy_datum)

    # offsetting should work as well
    response = self.c.get('/filebrowser/view=%s/test-view.avro?offset=1' % prefix)
    assert_equal('avro', response.context['view']['compression'])

    f = self.cluster.fs.open(prefix + '/test-view2.avro', "w")
    f.write("hello")
    f.close()

    # we shouldn't autodetect non avro files
    response = self.c.get('/filebrowser/view=%s/test-view2.avro' % prefix)
    assert_equal(response.context['view']['contents'], "hello")

    # we should fail to do a bad thing if they specify compression when it's not set.
    response = self.c.get('/filebrowser/view=%s/test-view2.avro?compression=gzip' % prefix)
    assert_true('Failed to decompress' in response.context['message'])