def file_write(self, fname, data):
    "Write documents in append mode to given file name"
    # perform input data validation; bad records go into a side file
    good_data = []
    bdir = os.path.dirname(fname)
    bdir = '%s/bad' % bdir if bdir else '/tmp/bad'
    if not os.path.exists(bdir):
        os.makedirs(bdir)
    bfname = '%s/%s_bad.txt' % (bdir, os.path.basename(fname))
    count = ecount = edocs = 0
    with open(bfname, 'a') as bstream:
        for rec in data:
            validator = RecordValidator()
            validator.run(self.schema_json, rec)
            if validator.errors:
                bstream.write(json.dumps(rec) + '\n')
                for err in validator.errors:
                    msg = 'SCHEMA ERROR '
                    for key, val in err.items():
                        msg += '%s: %s ' % (key.upper(), json.dumps(val))
                    bstream.write(msg + '\n')
                bstream.write('-------------\n')
                ecount += len(validator.errors)
                edocs += 1
            else:
                good_data.append(rec)
            count += 1
    if ecount:
        print("WARNING: received %s docs, found %s bad docs, %s errors, see %s"
              % (count, edocs, ecount, bfname))
    # use only the good portion of the data
    data = good_data
    try:
        schema = self.schema
        wmaids = []
        if not hasattr(data, '__iter__') or isinstance(data, dict):
            data = [data]
        if os.path.exists(fname):
            schema = None  # appending to an existing file, its schema is embedded
            mode = 'a+' if fname.endswith('.avro') else 'a'
            if mode == 'a':
                raise NotImplementedError(
                    "append mode is not yet implemented for compressed avro files")
        else:
            # new file: open in write mode (`mode` would otherwise be undefined)
            mode = 'w+' if fname.endswith('.avro') else 'w'
        rec = None  # keep the current doc around for the error message below
        with DataFileWriter(open_file(fname, mode), DatumWriter(), schema) as writer:
            for rec in data:
                writer.append(rec)
                writer.flush()
                wmaid = rec.get('wmaid', wmaHash(rec))
                wmaids.append(wmaid)
        return wmaids
    except Exception as exc:
        err = traceback.format_exc(limit=1).splitlines()[-1]
        line = ' '.join(str(exc).replace('\n', '').split())
        msg = 'Failure in %s storage, error=%s, exception=%s' \
                % (self.stype, err, line)
        msg += ' Failed document: '
        msg += json.dumps(rec)
        raise WriteError(msg)
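# Usage sketch for file_write. Assumptions: `storage` is an instance of this
# Avro-backed storage class (exposing self.schema, self.schema_json and
# self.stype), and the path and record below are hypothetical:
#
#     records = [{'wmaid': 'abc123', 'task': '/prod/task'}]
#     wmaids = storage.file_write('/data/archive/chunk.avro', records)
#     # records failing schema validation land in
#     # /data/archive/bad/chunk.avro_bad.txt; valid ones are appended to the
#     # avro file and their wmaids are returned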
def _write(self, data):
    "Internal write API"
    wmaid = self.wmaid(data)
    schema = self.schema
    fname = file_name(self.hdir, wmaid)
    with open_file(fname, 'w') as ostream:
        with DataFileWriter(ostream, DatumWriter(), schema) as writer:
            writer.append(data)
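# Minimal sketch of the one-document-per-file layout _write produces.
# Assumptions: file_name() maps (self.hdir, wmaid) to a path under self.hdir,
# and the document below is hypothetical:
#
#     storage._write({'wmaid': 'abc123', 'task': '/prod/task'})
#     # writes a single-record avro file for wmaid 'abc123' under self.hdir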
def _read(self, spec, fields=None):
    "Internal read API"
    if PAT_UID.match(str(spec)):  # requested to read a concrete file
        fname = '%s/%s.gz' % (self.uri, spec)
        with open_file(fname) as istream:
            data = json.load(istream)
        if isinstance(data, list):
            for rec in data:
                self.check(rec)
            return data
        self.check(data)
        return [data]
    return self.empty_data
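# Usage sketch for the JSON-backed _read. Assumptions: PAT_UID matches
# wmaid-like identifiers and self.uri points at a directory of gzipped JSON
# documents; the spec values below are hypothetical:
#
#     docs = storage._read('abc123')      # loads <self.uri>/abc123.gz
#     docs = storage._read('not-a-uid')   # no UID match -> self.empty_data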
def file_read(self, fname):
    "Read documents from given file name"
    try:
        out = []
        # the schema is embedded in the avro file, so the reader infers it
        with DataFileReader(open_file(fname), DatumReader()) as reader:
            for rec in reader:
                out.append(rec)
        return out
    except Exception:
        err = traceback.format_exc(limit=1).splitlines()[-1]
        msg = 'Failure in %s storage, error=%s' % (self.stype, err)
        raise ReadError(msg)
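# Round-trip sketch pairing file_read with file_write above. Assumptions:
# both methods live on the same Avro storage instance, each record carries a
# 'wmaid' field, and the file name is hypothetical:
#
#     written = storage.file_write('/data/archive/chunk.avro', records)
#     read_back = storage.file_read('/data/archive/chunk.avro')
#     assert [r['wmaid'] for r in read_back] == written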
def _read(self, spec, fields=None):
    "Internal read API"
    if PAT_UID.match(str(spec)):  # requested to read a concrete file
        out = []
        fname = file_name(self.hdir, spec)
        with open_file(fname) as istream:
            reader = DataFileReader(istream, DatumReader())
            for data in reader:
                if isinstance(data, list):
                    for rec in data:
                        self.check(rec)
                    return data
                self.check(data)
                out.append(data)
        return out
    return self.empty_data
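# Usage sketch for the Avro-backed _read. Assumption: spec is a wmaid matching
# PAT_UID whose file was produced by the avro _write above:
#
#     docs = storage._read('abc123')   # -> list of checked records from the file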
def _write(self, data):
    "Internal write API"
    wmaid = self.wmaid(data)
    fname = '%s/%s.gz' % (self.uri, wmaid)
    with open_file(fname, 'w') as ostream:
        ostream.write(json.dumps(data))
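# Sketch of the artifact the JSON-backed _write produces. Assumption:
# open_file() gzip-compresses paths ending in '.gz':
#
#     storage._write({'wmaid': 'abc123'})
#     # -> <self.uri>/abc123.gz holding the JSON-serialized document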