def _write(self, data): "Internal Write API" schema = self.schema wmaid = self.wmaid(data) year, month, _ = today() hdir = '%s/%s/%s' % (self.hdir, year, month) if not hdfs.path.isdir(hdir): hdfs.mkdir(hdir) fname = file_name(hdir, wmaid, self.compress) # create Avro writer and binary encoder writer = avro.io.DatumWriter(schema) bytes_writer = io.BytesIO() if self.compress: # use gzip'ed writer with BytesIO file object gzip_writer = gzip.GzipFile(fileobj=bytes_writer, mode='wb') encoder = avro.io.BinaryEncoder(gzip_writer) else: # plain binary reader encoder = avro.io.BinaryEncoder(bytes_writer) # write records from given data stream to binary writer writer.write(data, encoder) # close gzip stream if necessary if self.compress: gzip_writer.flush() gzip_writer.close() # store raw data to hadoop via HDFS hdfs.dump(bytes_writer.getvalue(), fname) # close bytes stream bytes_writer.close()
def _read(self, spec, fields=None):
    "Internal read API"
    if PAT_UID.match(str(spec)):  # requested to read concrete file
        out = []
        year, month, _ = today()
        hdir = '%s/%s/%s' % (self.hdir, year, month)
        fname = file_name(hdir, spec, self.compress)
        data = hdfs.load(fname)
        bytes_reader = io.BytesIO(data)

        if self.compress:
            # use gzip'ed reader and pass to it BytesIO as file object
            gzip_reader = gzip.GzipFile(fileobj=bytes_reader)
            decoder = avro.io.BinaryDecoder(gzip_reader)
        else:
            # use non-compressed reader
            decoder = avro.io.BinaryDecoder(bytes_reader)

        reader = avro.io.DatumReader(self.schema)
        while True:
            try:
                rec = reader.read(decoder)
                out.append(rec)
            except Exception:
                # end of stream reached, stop reading
                break

        # close gzip stream if necessary
        if self.compress:
            gzip_reader.close()

        # close bytes stream
        bytes_reader.close()
        return out
    return self.empty_data
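# --- A minimal, self-contained sketch (not part of the original code) of the
# gzip + Avro binary round trip that the HDFS _write/_read pair above relies
# on. The schema and record are made up for illustration; it assumes the
# `avro` package exposes avro.schema.parse (some releases name it Parse).
import gzip
import io
import json

import avro.io
import avro.schema

_SCHEMA = avro.schema.parse(json.dumps({
    "type": "record", "name": "Doc",
    "fields": [{"name": "wmaid", "type": "string"}]}))

def _roundtrip(record):
    "Encode a record as gzip'ed Avro binary and decode it back"
    # encode: gzip stream wraps the in-memory bytes buffer
    buf = io.BytesIO()
    gzip_writer = gzip.GzipFile(fileobj=buf, mode='wb')
    avro.io.DatumWriter(_SCHEMA).write(record, avro.io.BinaryEncoder(gzip_writer))
    gzip_writer.close()
    # decode: mirror image of the encode path
    gzip_reader = gzip.GzipFile(fileobj=io.BytesIO(buf.getvalue()))
    return avro.io.DatumReader(_SCHEMA).read(avro.io.BinaryDecoder(gzip_reader))

assert _roundtrip({"wmaid": "abc"}) == {"wmaid": "abc"}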
def _write(self, data): "Internal write API" wmaid = self.wmaid(data) schema = self.schema fname = file_name(self.hdir, wmaid) with open_file(fname, "w") as ostream: with DataFileWriter(ostream, DatumWriter(), schema) as writer: writer.append(data)
def _write(self, data): "Internal write API" wmaid = self.wmaid(data) schema = self.schema fname = file_name(self.hdir, wmaid) with open_file(fname, 'w') as ostream: with DataFileWriter(ostream, DatumWriter(), schema) as writer: writer.append(data)
def test_file_name(self):
    "Test file_name function"
    uri = 'test'
    wmaid = 123
    for compress in ['', 'bz2', 'gz']:
        fname = file_name(uri, wmaid, compress)
        if compress:
            tname = '%s/%s.avro.%s' % (uri, wmaid, compress)
        else:
            tname = '%s/%s.avro' % (uri, wmaid)
        self.assertEqual(fname, tname)
    # extra arguments to assertRaises are passed to file_name itself;
    # wrapping them in a tuple would test the wrong call
    self.assertRaises(Exception, file_name, uri, wmaid, 'gzip')
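# --- A sketch of the file_name helper implied by the test above; the real
# implementation may differ in details such as the exact exception raised.
# It joins uri and wmaid into an .avro file name and appends a supported
# compression extension, rejecting anything but '', 'bz2' or 'gz'.
def file_name(uri, wmaid, compress=''):
    "Construct file name for given uri and wmaid"
    if compress:
        if compress not in ('bz2', 'gz'):
            raise Exception('Unsupported compress method %s' % compress)
        return '%s/%s.avro.%s' % (uri, wmaid, compress)
    return '%s/%s.avro' % (uri, wmaid)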
def _read(self, spec, fields=None):
    "Internal read API"
    if PAT_UID.match(str(spec)):  # requested to read concrete file
        out = []
        fname = file_name(self.hdir, spec)
        with open_file(fname) as istream:
            with DataFileReader(istream, DatumReader()) as reader:
                for data in reader:
                    # a record may itself be a list of documents
                    if isinstance(data, list):
                        for rec in data:
                            self.check(rec)
                        return data
                    self.check(data)
                    out.append(data)
        return out
    return self.empty_data
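# --- The readers above gate on PAT_UID to decide whether spec names a single
# document. The pattern itself is not shown in this section; a hypothetical
# stand-in matching hex-digest style uids would be:
import re

PAT_UID = re.compile(r'^[a-f0-9]{32}$')   # assumption: md5-hex style wmaid

assert PAT_UID.match('0' * 32)
assert not PAT_UID.match('not-a-uid')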