def _read(self, spec, fields=None):
    """
    Internal read API.

    Read a single Avro record file from HDFS identified by *spec* and
    return its records as a list of dicts.

    :param spec: record identifier; must match ``PAT_UID`` to be treated
        as a concrete file id, otherwise ``self.empty_data`` is returned
    :param fields: unused here; kept for API compatibility with callers
    :return: list of decoded Avro records, or ``self.empty_data`` when
        *spec* does not look like a valid uid
    """
    # Guard clause: only a uid-looking spec maps to a concrete file.
    if not PAT_UID.match(str(spec)):
        return self.empty_data

    # Files are laid out by year/month under the base HDFS dir.
    year, month, _ = today()
    hdir = '%s/%s/%s' % (self.hdir, year, month)
    fname = file_name(hdir, spec, self.compress)
    data = hdfs.load(fname)

    out = []
    bytes_reader = io.BytesIO(data)
    gzip_reader = None
    try:
        if self.compress:
            # wrap the raw bytes in a gzip reader and decode from it
            gzip_reader = gzip.GzipFile(fileobj=bytes_reader)
            decoder = avro.io.BinaryDecoder(gzip_reader)
        else:
            # non-compressed payload: decode straight from the bytes
            decoder = avro.io.BinaryDecoder(bytes_reader)
        reader = avro.io.DatumReader(self.schema)
        while True:
            try:
                out.append(reader.read(decoder))
            except Exception:
                # avro raises when the decoder hits end-of-stream; this
                # is the loop's termination condition. Narrowed from a
                # bare except so KeyboardInterrupt/SystemExit propagate.
                break
    finally:
        # close streams even if decoding blows up unexpectedly
        if gzip_reader is not None:
            gzip_reader.close()
        bytes_reader.close()
    return out
def _write(self, data):
    """
    Internal write API.

    Serialize *data* with the storage Avro schema (gzip-compressed when
    ``self.compress`` is set) and dump the resulting bytes to HDFS under
    the current year/month directory.

    :param data: record (or records structure) accepted by
        ``avro.io.DatumWriter.write`` for ``self.schema``
    """
    schema = self.schema
    wmaid = self.wmaid(data)

    # Files are laid out by year/month under the base HDFS dir;
    # create the directory on first use.
    year, month, _ = today()
    hdir = '%s/%s/%s' % (self.hdir, year, month)
    if not hdfs.path.isdir(hdir):
        hdfs.mkdir(hdir)
    fname = file_name(hdir, wmaid, self.compress)

    # create Avro writer and binary encoder over an in-memory buffer
    writer = avro.io.DatumWriter(schema)
    bytes_writer = io.BytesIO()
    gzip_writer = None
    try:
        if self.compress:
            # gzip'ed writer layered on top of the BytesIO buffer
            gzip_writer = gzip.GzipFile(fileobj=bytes_writer, mode='wb')
            encoder = avro.io.BinaryEncoder(gzip_writer)
        else:
            # plain binary encoder writing directly to the buffer
            encoder = avro.io.BinaryEncoder(bytes_writer)

        # serialize the records into the (possibly compressed) stream
        writer.write(data, encoder)

        # the gzip stream must be closed BEFORE reading the buffer so
        # the gzip trailer is written and the payload is complete
        if gzip_writer is not None:
            gzip_writer.flush()
            gzip_writer.close()

        # store the raw serialized bytes to hadoop via HDFS
        hdfs.dump(bytes_writer.getvalue(), fname)
    finally:
        # close the byte buffer even when serialization or the HDFS
        # dump fails, so we do not leak the in-memory stream
        bytes_writer.close()