Exemple #1
0
    def _write(self, data):
        """Internal write API: encode ``data`` with Avro and store it in HDFS.

        The target directory is ``<self.hdir>/<year>/<month>``, where year and
        month are the first two items returned by ``today()``; the directory
        is created when it does not exist. The file name is produced by
        ``file_name(hdir, wmaid, self.compress)``. When ``self.compress`` is
        truthy the encoded payload is wrapped into a gzip stream.

        :param data: record passed to ``self.wmaid`` and to the Avro datum
            writer created from ``self.schema``
        """
        schema = self.schema
        wmaid = self.wmaid(data)
        year, month, _ = today()
        hdir = '%s/%s/%s' % (self.hdir, year, month)
        if not hdfs.path.isdir(hdir):
            hdfs.mkdir(hdir)
        fname = file_name(hdir, wmaid, self.compress)

        # create Avro writer; the encoder is bound to the stream chosen below
        writer = avro.io.DatumWriter(schema)

        # context managers guarantee the in-memory streams are released even
        # if encoding or the HDFS upload raises
        with io.BytesIO() as bytes_writer:
            if self.compress:
                # use gzip'ed writer with BytesIO file object; leaving the
                # block closes the gzip stream, which flushes pending data
                # and writes the gzip trailer into bytes_writer
                with gzip.GzipFile(fileobj=bytes_writer, mode='wb') as gzip_writer:
                    writer.write(data, avro.io.BinaryEncoder(gzip_writer))
            else:
                # plain binary writer
                writer.write(data, avro.io.BinaryEncoder(bytes_writer))

            # store raw data to hadoop via HDFS (must happen before the
            # bytes stream is closed, getvalue() fails on a closed stream)
            hdfs.dump(bytes_writer.getvalue(), fname)
Exemple #2
0
    def _read(self, spec, fields=None):
        """Internal read API: load the Avro records of one stored document.

        When ``str(spec)`` matches ``PAT_UID`` the file
        ``file_name(<self.hdir>/<year>/<month>, spec, self.compress)`` is
        loaded from HDFS and decoded record by record with ``self.schema``.
        Year and month come from ``today()``, so only that directory is
        consulted.
        NOTE(review): presumably ``today()`` is the current date, which would
        make documents written in earlier months unreachable -- confirm.

        :param spec: document identifier; anything not matching ``PAT_UID``
            yields ``self.empty_data``
        :param fields: not used by this implementation
        :return: list of decoded records, or ``self.empty_data``
        """
        if not PAT_UID.match(str(spec)):
            return self.empty_data

        # requested to read concrete file
        out = []
        year, month, _ = today()
        hdir = '%s/%s/%s' % (self.hdir, year, month)
        fname = file_name(hdir, spec, self.compress)
        data = hdfs.load(fname)

        # the with-block closes the bytes stream even if decoding setup fails
        with io.BytesIO(data) as bytes_reader:
            if self.compress:
                # use gzip'ed reader and pass to it BytesIO as file object
                stream = gzip.GzipFile(fileobj=bytes_reader)
            else:
                # use non-compressed reader
                stream = bytes_reader
            try:
                decoder = avro.io.BinaryDecoder(stream)
                reader = avro.io.DatumReader(self.schema)
                while True:
                    try:
                        out.append(reader.read(decoder))
                    except Exception:
                        # A failed read is how end-of-stream is detected here.
                        # Catching Exception rather than using a bare except
                        # lets KeyboardInterrupt/SystemExit propagate.
                        # NOTE(review): genuine decoding errors also end the
                        # loop silently and yield a truncated result.
                        break
            finally:
                # close gzip stream if necessary
                if stream is not bytes_reader:
                    stream.close()
        return out
Exemple #3
0
    def _read(self, spec, fields=None):
        """Internal read API: fetch and decode one Avro file from HDFS.

        A ``spec`` whose string form matches ``PAT_UID`` is treated as a
        request for a concrete file. The file is looked up under
        ``<self.hdir>/<year>/<month>`` (year/month taken from ``today()``),
        loaded via ``hdfs.load`` and decoded with ``self.schema``; the payload
        is gunzipped first when ``self.compress`` is truthy.

        :param spec: document identifier to read
        :param fields: accepted for interface compatibility, not used here
        :return: list of decoded records, or ``self.empty_data`` when ``spec``
            does not match ``PAT_UID``
        """
        if  PAT_UID.match(str(spec)): # requested to read concrete file
            out = []
            year, month, _ = today()
            hdir = '%s/%s/%s' % (self.hdir, year, month)
            fname = file_name(hdir, spec, self.compress)
            data = hdfs.load(fname)
            bytes_reader = io.BytesIO(data)
            gzip_reader = None

            try:
                if  self.compress:
                    # use gzip'ed reader and pass to it BytesIO as file object
                    gzip_reader = gzip.GzipFile(fileobj=bytes_reader)
                    decoder = avro.io.BinaryDecoder(gzip_reader)
                else:
                    # use non-compressed reader
                    decoder = avro.io.BinaryDecoder(bytes_reader)

                reader = avro.io.DatumReader(self.schema)
                while True:
                    try:
                        rec = reader.read(decoder)
                    except Exception:
                        # reading past the last record raises; this is the
                        # loop's end-of-data signal. Exception (not a bare
                        # except) keeps Ctrl-C and SystemExit working.
                        # NOTE(review): real decode errors are hidden too.
                        break
                    out.append(rec)
            finally:
                # release streams on success and on failure alike
                if  gzip_reader is not None:
                    gzip_reader.close()
                bytes_reader.close()
            return out
        return self.empty_data
Exemple #4
0
    def _write(self, data):
        """Internal write API: serialize ``data`` to Avro and dump it to HDFS.

        The document id is ``self.wmaid(data)``. The output goes to
        ``file_name(<self.hdir>/<year>/<month>, wmaid, self.compress)``, with
        year/month taken from ``today()``; the directory is created if
        ``hdfs.path.isdir`` reports it missing. With a truthy
        ``self.compress`` the Avro bytes are gzip-compressed before upload.

        :param data: record to encode with ``self.schema``
        """
        schema = self.schema
        wmaid = self.wmaid(data)
        year, month, _ = today()
        hdir = '%s/%s/%s' % (self.hdir, year, month)
        if  not hdfs.path.isdir(hdir):
            hdfs.mkdir(hdir)
        fname = file_name(hdir, wmaid, self.compress)

        # create Avro writer and the in-memory output buffer
        writer = avro.io.DatumWriter(schema)
        bytes_writer = io.BytesIO()

        try:
            if  self.compress:
                # use gzip'ed writer with BytesIO file object
                gzip_writer = gzip.GzipFile(fileobj=bytes_writer, mode='wb')
                try:
                    writer.write(data, avro.io.BinaryEncoder(gzip_writer))
                finally:
                    # close() flushes remaining compressed data and appends
                    # the gzip trailer; it leaves bytes_writer open
                    gzip_writer.close()
            else:
                # plain binary writer
                writer.write(data, avro.io.BinaryEncoder(bytes_writer))

            # store raw data to hadoop via HDFS
            hdfs.dump(bytes_writer.getvalue(), fname)
        finally:
            # close bytes stream, also when encoding or the upload failed
            bytes_writer.close()
Exemple #5
0
 def _write(self, data):
     """Internal write API: append ``data`` to a new Avro file.

     The file path is ``file_name(self.hdir, self.wmaid(data))`` and the
     record is written through ``DataFileWriter`` using ``self.schema``.
     """
     doc_id = self.wmaid(data)
     avro_schema = self.schema
     target = file_name(self.hdir, doc_id)
     # a single with-statement manages both the file and the Avro writer
     with open_file(target, "w") as ostream, \
             DataFileWriter(ostream, DatumWriter(), avro_schema) as avro_writer:
         avro_writer.append(data)
Exemple #6
0
 def _write(self, data):
     """Internal write API.

     Computes the document id with ``self.wmaid``, derives the output path
     from ``self.hdir`` via ``file_name`` and writes ``data`` as a single
     record with ``DataFileWriter`` bound to ``self.schema``.
     """
     uid = self.wmaid(data)
     record_schema = self.schema
     out_path = file_name(self.hdir, uid)
     with open_file(out_path, 'w') as ostream:
         datum_writer = DatumWriter()
         with DataFileWriter(ostream, datum_writer, record_schema) as sink:
             sink.append(data)
Exemple #7
0
 def test_file_name(self):
     """Test file_name function.

     Checks the generated name for every supported compression suffix and
     that an unsupported suffix ("gzip") is rejected with an exception.
     """
     uri = "test"
     wmaid = 123
     for compress in ["", "bz2", "gz"]:
         fname = file_name(uri, wmaid, compress)
         if compress:
             tname = "%s/%s.avro.%s" % (uri, wmaid, compress)
         else:
             tname = "%s/%s.avro" % (uri, wmaid)
         self.assertEqual(fname, tname)
     # assertRaises forwards positional arguments to the callable, so they
     # must be passed individually. Wrapping them in a tuple called
     # file_name with a single argument, and the assertion was satisfied by
     # the resulting arity error instead of by the "gzip" check.
     self.assertRaises(Exception, file_name, uri, wmaid, "gzip")
Exemple #8
0
 def test_file_name(self):
     """Test file_name function.

     Verifies ``<uri>/<wmaid>.avro[.<compress>]`` for the accepted compress
     values and that the unsupported value 'gzip' raises.
     """
     uri = 'test'
     wmaid = 123
     for compress in ['', 'bz2', 'gz']:
         fname = file_name(uri, wmaid, compress)
         if  compress:
             tname = '%s/%s.avro.%s' % (uri, wmaid, compress)
         else:
             tname = '%s/%s.avro' % (uri, wmaid)
         self.assertEqual(fname, tname)
     # Arguments are unpacked here on purpose: passing one tuple made
     # file_name fail on a missing-argument error, so the test passed
     # without ever exercising the unsupported-compression path.
     self.assertRaises(Exception, file_name, uri, wmaid, 'gzip')
Exemple #9
0
 def _read(self, spec, fields=None):
     """Internal read API: load records of one document from its Avro file.

     Returns ``self.empty_data`` unless ``str(spec)`` matches ``PAT_UID``.
     Otherwise every datum read from ``file_name(self.hdir, spec)`` is passed
     to ``self.check`` and collected. If a datum is itself a list, its items
     are checked and that list is returned immediately.
     ``fields`` is not used by this implementation.
     """
     if not PAT_UID.match(str(spec)):
         return self.empty_data

     # requested to read concrete file
     records = []
     path = file_name(self.hdir, spec)
     with open_file(path) as istream:
         for item in DataFileReader(istream, DatumReader()):
             if not isinstance(item, list):
                 self.check(item)
                 records.append(item)
                 continue
             # a list datum is validated item by item and returned as is
             for rec in item:
                 self.check(rec)
             return item
     return records
Exemple #10
0
 def _read(self, spec, fields=None):
     """Internal read API.

     For a ``spec`` matching ``PAT_UID`` the Avro file located by
     ``file_name(self.hdir, spec)`` is iterated with ``DataFileReader``;
     each datum goes through ``self.check`` and is appended to the result.
     A datum that is a list short-circuits: its elements are checked and the
     list itself is returned. Non-matching ``spec`` gives ``self.empty_data``.
     The ``fields`` argument is ignored here.
     """
     matched = PAT_UID.match(str(spec))
     if not matched:
         return self.empty_data

     # requested to read concrete file
     fname = file_name(self.hdir, spec)
     collected = []
     with open_file(fname) as istream:
         avro_reader = DataFileReader(istream, DatumReader())
         for entry in avro_reader:
             if isinstance(entry, list):
                 for rec in entry:
                     self.check(rec)
                 # the list datum replaces anything collected so far
                 return entry
             self.check(entry)
             collected.append(entry)
     return collected