Example #1
    def _read(self, spec, fields=None):
        "Internal read API"
        if PAT_UID.match(str(spec)):  # request to read a concrete file
            out = []
            year, month, _ = today()
            hdir = '%s/%s/%s' % (self.hdir, year, month)
            fname = file_name(hdir, spec, self.compress)
            data = hdfs.load(fname)
            bytes_reader = io.BytesIO(data)

            if self.compress:
                # use gzip'ed reader, passing the BytesIO stream as its file object
                gzip_reader = gzip.GzipFile(fileobj=bytes_reader)
                decoder = avro.io.BinaryDecoder(gzip_reader)
            else:
                # use non-compressed reader
                decoder = avro.io.BinaryDecoder(bytes_reader)

            reader = avro.io.DatumReader(self.schema)
            while True:
                try:
                    rec = reader.read(decoder)
                    out.append(rec)
                except Exception:  # a raw Avro stream has no record count; EOF surfaces as a read error
                    break
            # close gzip stream if necessary
            if self.compress:
                gzip_reader.close()
            # close bytes stream
            bytes_reader.close()
            return out
        return self.empty_data
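The read path above decodes a raw Avro binary stream: individual datums written back to back, not an Avro object container file, so there is no record count to loop over and the end of input only shows up as a failed read. Below is a minimal, self-contained sketch of the same decode loop. The schema and record layout are illustrative stand-ins (the class gets its real schema from self.schema and its bytes from hdfs.load), and the encoding step borrows the pattern from the write example that follows.

    import io
    import gzip
    import avro.io
    import avro.schema

    # Illustrative schema only; the class above reads its real schema from self.schema.
    # Older avro releases expose avro.schema.parse, newer ones avro.schema.Parse.
    SCHEMA = avro.schema.parse(
        '{"type": "record", "name": "Rec",'
        ' "fields": [{"name": "wmaid", "type": "string"}]}')

    def decode_records(data, schema, compress=True):
        "Decode a raw (optionally gzip'ed) Avro byte stream, as _read does"
        bytes_reader = io.BytesIO(data)
        stream = gzip.GzipFile(fileobj=bytes_reader) if compress else bytes_reader
        decoder = avro.io.BinaryDecoder(stream)
        reader = avro.io.DatumReader(schema)
        out = []
        while True:
            try:
                out.append(reader.read(decoder))
            except Exception:  # raw stream carries no record count; EOF surfaces as a read error
                break
        if compress:
            stream.close()
        return out

    # produce test input the same way the write path (next example) does
    buf = io.BytesIO()
    gzip_writer = gzip.GzipFile(fileobj=buf, mode='wb')
    encoder = avro.io.BinaryEncoder(gzip_writer)
    writer = avro.io.DatumWriter(SCHEMA)
    for rec in [{'wmaid': 'a1'}, {'wmaid': 'b2'}]:
        writer.write(rec, encoder)
    gzip_writer.close()

    assert decode_records(buf.getvalue(), SCHEMA) == [{'wmaid': 'a1'}, {'wmaid': 'b2'}]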
Example #2
    def _write(self, data):
        "Internal Write API"
        schema = self.schema
        wmaid = self.wmaid(data)
        year, month, _ = today()
        hdir = '%s/%s/%s' % (self.hdir, year, month)
        if not hdfs.path.isdir(hdir):
            hdfs.mkdir(hdir)
        fname = file_name(hdir, wmaid, self.compress)

        # create Avro writer and binary encoder
        writer = avro.io.DatumWriter(schema)
        bytes_writer = io.BytesIO()

        if self.compress:
            # use gzip'ed writer with BytesIO file object
            gzip_writer = gzip.GzipFile(fileobj=bytes_writer, mode='wb')
            encoder = avro.io.BinaryEncoder(gzip_writer)
        else:
            # plain binary encoder
            encoder = avro.io.BinaryEncoder(bytes_writer)

        # write records from given data stream to binary writer
        writer.write(data, encoder)

        # close gzip stream if necessary
        if self.compress:
            gzip_writer.flush()
            gzip_writer.close()

        # store raw data to hadoop via HDFS
        hdfs.dump(bytes_writer.getvalue(), fname)

        # close bytes stream
        bytes_writer.close()
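One ordering detail in this write path is easy to miss: bytes_writer.getvalue() runs only after gzip_writer.close(), because GzipFile emits its CRC-32/size trailer on close (flush alone does not write it), while closing the GzipFile leaves the underlying BytesIO open for reading. A small sketch, independent of Avro and HDFS, showing why the order matters (gzip.decompress is the Python 3 helper, used here only for the check):

    import gzip
    import io

    buf = io.BytesIO()
    gzip_writer = gzip.GzipFile(fileobj=buf, mode='wb')
    gzip_writer.write(b'payload')
    gzip_writer.flush()
    truncated = buf.getvalue()   # compressed data flushed, but no gzip trailer yet
    gzip_writer.close()          # writes the CRC-32/size trailer; buf itself stays open
    complete = buf.getvalue()    # this is what _write hands to hdfs.dump

    assert gzip.decompress(complete) == b'payload'
    try:
        gzip.decompress(truncated)
    except EOFError:             # stream ends before the end-of-stream marker
        pass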