def read(self, format):
    time_start = time.time()
    if format == 'json':
        with open('./output/output.json') as file:
            json.loads(file.read())
    elif format == 'jsch':
        with open('./output/output.json') as file:
            validate(json.loads(file.read()), self._schema_json)
    elif format == 'avro':
        # Avro container files are binary; open in 'rb' mode.
        reader = DataFileReader(open('./output/output.avro', 'rb'), DatumReader())
        for user in reader:
            pass
        reader.close()
    elif format == 'protobuf':
        with open('./output/output.pb', 'rb') as file:
            addressbook_pb2.AddressBook().ParseFromString(file.read())
    elif format == 'gzjson':
        with gzip.open('./output/output.jsz', 'rb') as file:
            json.loads(file.read())
    time_end = time.time()
    return time_end - time_start
def deserializeDataFromFile2Str(inputFile):
    logging.debug("Deserializing file: %s", inputFile)
    # Avro container files are binary; open in 'rb' mode.
    reader = DataFileReader(open(inputFile, "rb"), DatumReader())
    data = ""
    for item in reader:
        data = data + str(item)
    reader.close()
    return data
def generic_dataframe(self, df, avro_schema, assert_fns=None):
    """Generic test running function for arbitrary avro schemas.

    Writes a dataframe containing the records to avro.
    Reads back and compares with the original.
    """
    print(avro_schema)
    cyavro.write_avro_file_from_dataframe(df, self.filename,
                                          json.dumps(avro_schema),
                                          codec='null')
    if assert_fns is None:
        assert_fns = {}

    df_read = cyavro.read_avro_file_as_dataframe(self.filename)

    import avro.schema
    from avro.datafile import DataFileReader, DataFileWriter
    from avro.io import DatumReader, DatumWriter

    with open(self.filename, 'rb') as fo:
        reader = DataFileReader(fo, DatumReader())
        records = []
        for user in reader:
            records.append(user)
        df_reference = pd.DataFrame(records)
        reader.close()

    success = True

    for col in avro_schema["fields"]:
        colname = col['name']
        assert_fn = assert_fns.get(colname, np.testing.assert_array_equal)

        def print_fail_header(s):
            print('#' * len(s))
            print("FAIL: Column {}".format(col))
            print('#' * len(s))
            print(s)

        try:
            assert_fn(df_read[colname], df[colname])
        except AssertionError:
            print_fail_header("Failed for cyavro read comparison {}\n".format(col))
            traceback.print_exc(file=sys.stdout)
            success = False

        try:
            assert_fn(df_reference[colname], df[colname])
        except AssertionError:
            print_fail_header("Failed for cyavro write comparison {}\n".format(col))
            traceback.print_exc(file=sys.stdout)
            success = False

    assert success
def testRead(filename):
    fd = open(filename, 'rb')
    datum_reader = DatumReader()
    freader = DataFileReader(fd, datum_reader)
    for datum in freader:
        print(datum['name'], datum['company'])
        print(datum['website'])
    freader.close()
def testRead(filename):
    fd = open(filename, 'rb')
    datum_reader = DatumReader()
    reader = DataFileReader(fd, datum_reader)
    for record in reader:
        print(record['name'], record['age'])
    reader.close()
def main():
    if len(sys.argv) < 3:
        print("Usage:", sys.argv[0])
        print("add [num of events to add] filename")
        print("list filename")
        exit(1)

    command = sys.argv[1]

    if command == 'add':
        noEvents = sys.argv[2]
        filename = sys.argv[3]

        # load existing events
        existingEvents = []
        try:
            reader = DataFileReader(open(filename, "rb"), DatumReader())
            existingEvents = list(reader)
            reader.close()
        except IOError:
            print(filename + ": Could not open file. Creating a new one.")

        # Write back out to disk
        try:
            schema = avro.schema.parse(open("etc/userevent.avsc").read())
            # Avro container files are binary; open in 'wb' mode.
            f = open(filename, "wb")
            writer = DataFileWriter(f, DatumWriter(), schema)

            # Preserve the previously stored events before appending new ones,
            # since reopening the file in write mode truncates it.
            for event in existingEvents:
                writer.append(event)

            # Append new user events
            for i in range(0, int(noEvents)):
                newEvent = createUserEvent()
                print(newEvent)
                writer.append(newEvent)
            writer.close()
            print("Wrote {0} user events".format(noEvents))
        except IOError:
            print(filename + ": Could not save file.")

    elif command == 'list':
        listAllUserEvents(sys.argv[2])
    else:
        print("Unregistered command. Exiting")
        sys.exit(1)
def loadOldData(filename):
    oldDataDict = dict()
    if not os.path.isfile(filename):
        return oldDataDict
    # Avro container files are binary; open in 'rb' mode.
    reader = DataFileReader(open(filename, "rb"), DatumReader())
    for weight in reader:
        oldDataDict[weight["site"]] = weight["weight"]
    reader.close()
    return oldDataDict
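# A minimal counterpart sketch (an assumption, not from the source) showing how the
# {"site": ..., "weight": ...} records that loadOldData() expects could be produced.
# The schema below is inferred from the two fields the reader accesses; the real
# schema may differ. Note the schema-parsing entry point is spelled parse or Parse
# depending on the avro release.
import json
import avro.schema
from avro.datafile import DataFileWriter
from avro.io import DatumWriter

def saveData(filename, dataDict):
    schema = avro.schema.parse(json.dumps({
        "type": "record", "name": "SiteWeight",
        "fields": [{"name": "site", "type": "string"},
                   {"name": "weight", "type": "double"}]
    }))
    with DataFileWriter(open(filename, "wb"), DatumWriter(), schema) as writer:
        for site, weight in dataDict.items():
            writer.append({"site": site, "weight": weight})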
def main():
    try:
        opts, args = getopt.getopt(sys.argv[1:], "hi:s:",
                                   ["help", "input-file=", "schema="])
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err))  # will print something like "option -a not recognized"
        usage(sys.argv[0])
        sys.exit(2)

    avro_file = None
    avro_schema_file = None
    required_cl = 0

    for o, a in opts:
        if o in ("-h", "--help"):
            usage(sys.argv[0])
            sys.exit()
        elif o in ("-i", "--input-file"):
            required_cl += 1
            avro_file = a
        elif o in ("-s", "--schema"):
            avro_schema_file = a
        else:
            assert False, "unhandled option"

    if required_cl < 1:
        print("ERROR: Missing required argument")
        usage(sys.argv[0])
        sys.exit(1)

    if not avro_schema_file:
        # No explicit schema given: read the container file with its embedded schema.
        reader = DataFileReader(open(avro_file, "rb"), DatumReader())
        for datum in reader:
            print(datum)
        reader.close()
    else:
        # Explicit schema given: decode the raw datum stream with that schema.
        with open(avro_schema_file, "r") as reader_schema:
            avro_schema = reader_schema.read()
        parsed_avro_schema = avro.schema.parse(avro_schema)
        with open(avro_file, "rb") as reader_data:
            inputio = io.BytesIO(reader_data.read())
            decoder = avro.io.BinaryDecoder(inputio)
            reader = avro.io.DatumReader(parsed_avro_schema)
            while inputio.tell() < len(inputio.getvalue()):
                avro_datum = reader.read(decoder)
                print(avro_datum)
def listAllUserEvents(filename):
    try:
        reader = DataFileReader(open(filename, "rb"), DatumReader())
        for event in reader:
            # Query uuids of events
            print("event id: {0}, event data extra fields: {1}".format(
                event["uuid"], event["eventData"]["otherEventData"]))
        reader.close()
    except IOError:
        print(filename + ": Could not open file. Exiting")
        sys.exit(1)
def processBlob(filename):
    reader = DataFileReader(open(filename, 'rb'), DatumReader())
    readings_by_id = {}
    for reading in reader:
        parsed_json = json.loads(reading["Body"])
        if 'id' not in parsed_json:
            return
        # Group readings by device id.
        readings_by_id.setdefault(parsed_json['id'], []).append(parsed_json)
    reader.close()
    for device in readings_by_id:
        with open(device + '.csv', "a") as device_file:
            for r in readings_by_id[device]:
                device_file.write(", ".join(str(r[x]) for x in r) + '\n')
def evaluate_file(fname: str):
    logger.info("Opening file %s", fname)
    reader = DataFileReader(open(fname, "rb"), DatumReader())
    logger.info("Counting lines...")
    i = 0
    for val in reader:
        i += 1
        if i % 1000 == 0:
            logger.debug("Read %d lines", i)
    reader.close()
    logger.info("Found %d lines in file", i)
def doKmeans(self):
    numpy.seterr(divide="ignore", invalid="ignore")

    # get a dataset for the k-means generator
    dataset = []
    reader = DataFileReader(open("test/prettypfa/exoplanets.avro", "rb"), DatumReader())
    for record in reader:
        mag, dist, mass, radius = (record.get("mag"), record.get("dist"),
                                   record.get("mass"), record.get("radius"))
        if mag is not None and dist is not None and mass is not None and radius is not None:
            dataset.append([mag, dist, mass, radius])
    reader.close()

    # set up and run the k-means generator
    TestClustering.kmeansResult = KMeans(len(self.clusterNames), numpy.array(dataset))
    TestClustering.kmeansResult.optimize(whileall(moving(), maxIterations(1000)))
def read_log(topic, log):
    base_dir = os.path.abspath(os.path.dirname(__file__))
    schema = avro.schema.parse(
        open(os.path.join(base_dir, "avro_schema", topic + ".avsc")).read())
    print("schema:", schema)

    # Avro container files are binary; open in 'wb'/'rb' mode.
    avro_path = os.path.join(base_dir, topic + ".avro")
    writer = DataFileWriter(open(avro_path, "wb"), DatumWriter(), schema)
    for i in range(5):
        writer.append(log)
    writer.close()

    reader = DataFileReader(open(avro_path, "rb"), DatumReader())
    for log in reader:
        print(log)
    reader.close()
def readAndWriteAvro():
    """Unlike Java, Avro does not let you generate code for Tweet in Python,
    so the only way to read and write data is without code generation."""

    # Read the schema
    schema = avro.schema.parse(open("tweet.avsc").read())

    # Write some data (container files are binary, so use 'wb')
    writer = DataFileWriter(open("tweets.avro", "wb"), DatumWriter(), schema)
    writer.append({"tweetId": 5, "user": "******", "text": "Tweeting from python as well"})
    writer.close()

    # Read the same data back
    tweets = DataFileReader(open("tweets.avro", "rb"), DatumReader())
    for tweet in tweets:
        print(tweet)
    tweets.close()
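# A plausible tweet.avsc for the snippet above (an assumption; the actual schema file
# is not shown in the source). Field names mirror the record appended above.
TWEET_AVSC = """
{
  "type": "record",
  "name": "Tweet",
  "fields": [
    {"name": "tweetId", "type": "long"},
    {"name": "user", "type": "string"},
    {"name": "text", "type": "string"}
  ]
}
"""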
def main(fn, out_fn, avro_mode=''):
    with open(out_fn, 'w') as fo:
        with open(fn, 'rb') as f:
            reader = DataFileReader(f, DatumReader())
            for r in reader:
                if avro_mode.upper() == 'KV':
                    r = r['key']
                fo.write('%s\t%r\n' % (r['office'], r['counts']))
    print('wrote', out_fn)
def read_corpus(corpus_path):
    avro_files_path = [
        os.path.join(corpus_path, filename)
        for filename in os.listdir(corpus_path)
        if os.path.splitext(filename)[1] == '.avro'
    ]
    for avro_file in avro_files_path:
        small_corpus = DataFileReader(open(avro_file, 'rb'), DatumReader())
        for article in small_corpus:
            yield article
        small_corpus.close()
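# Hypothetical usage of the read_corpus() generator above; the directory path is
# illustrative and not taken from the source.
for article in read_corpus("./corpus"):
    print(article)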
def handle(self):
    data = self.request.recv(8024).strip()
    data = StringIO(data)
    reader = DataFileReader(data, DatumReader())
    for fileData in reader:
        id = fileData['id']
        data = fileData['data']
        print(fileData)
        if id not in fileDict:
            fileDict[id] = open("./" + id, "w")
        f = fileDict[id]
        f.write(data)
        f.flush()
    reader.close()
def processBlob(filename):
    reader = DataFileReader(open(filename, 'rb'), DatumReader())
    events_by_id = {}
    readingNb = 0
    for reading in reader:
        readingNb += 1
        try:
            parsed_json = json.loads(reading["Body"])
            print("-----------------------------")
            print("id:")
            print(parsed_json[0]["id"])
            if 'id' not in parsed_json[0]:
                print("no id found...")
                return
            # Group events by id.
            events_by_id.setdefault(parsed_json[0]['id'], []).append(parsed_json[0])
            first = events_by_id[parsed_json[0]['id']][0]
            print("id:")
            print(first["id"])
            print("eventTime:")
            print(first["eventTime"])
            print("eventType:")
            print(first["eventType"])
            print("resourceUri:")
            print(first["data"]["resourceUri"])
            print("operationName:")
            print(first["data"]["operationName"])
            print("resourceProvider:")
            print(first["data"]["resourceProvider"])
            print("status:")
            print(first["data"]["status"])
            print("subject:")
            print(first["subject"])
        except Exception:
            print("exception in converting blob to json")
    reader.close()
    print(readingNb)
def main(): """Start of execution""" #combine the schemas known_schemas = avro.schema.Names() types_schema = LoadAvsc("parameter_types.avsc", known_schemas) param_schema = LoadAvsc("parameter.avsc", known_schemas) print json.dumps(param_schema.to_json(avro.schema.Names()), indent=2) #test the schema works param_file = open("parameters.avro", "w") writer = DataFileWriter(param_file, DatumWriter(), param_schema) param_1 = {"name": "test", "description":"An Avro test.", "type":"int"} param_2 = {"name": "test", "description":"An Avro test.", "type":"boolean"} writer.append(param_1) writer.append(param_2) writer.close() reader = DataFileReader(open("parameters.avro", "r"), DatumReader()) for parameter in reader: print parameter reader.close()
def read_avro(iostream, runs=1):
    times = []
    for _ in range(runs):
        iostream.seek(0)
        start = time.time()
        records = list(DataFileReader(iostream, DatumReader()))
        end = time.time()
        times.append(end - start)
    print(f'... {runs} runs averaged {sum(times) / runs} seconds')
    return records
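# A minimal harness sketch for exercising read_avro() on an in-memory Avro buffer.
# The schema and record contents are illustrative assumptions, not from the source;
# the schema-parsing entry point is avro.schema.parse in recent releases and
# avro.schema.Parse in some older avro-python3 versions.
import io
import json
import avro.schema
from avro.datafile import DataFileWriter
from avro.io import DatumWriter

point_schema = avro.schema.parse(json.dumps({
    "type": "record", "name": "Point",
    "fields": [{"name": "x", "type": "double"}, {"name": "y", "type": "double"}]
}))
buf = io.BytesIO()
writer = DataFileWriter(buf, DatumWriter(), point_schema)
for i in range(1000):
    writer.append({"x": float(i), "y": 2.0 * i})
writer.flush()  # keep the buffer open; writer.close() would close it too
records = read_avro(buf, runs=3)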
def deserialize(value):
    """Deserialize AVRO encoded binary string and yield records.

    Args:
        value (bytes): binary string value.

    Yields:
        dict: deserialized record.
    """
    with DataFileReader(io.BytesIO(value), DatumReader()) as reader:
        for record in reader:
            yield record
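# Hypothetical usage of deserialize(): read an Avro container file as raw bytes and
# iterate the decoded records. The file name is illustrative, not from the source.
with open("events.avro", "rb") as fh:
    for record in deserialize(fh.read()):
        print(record)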
def read_data():
    # read avro file into an array of dicts
    reader = DataFileReader(open(DATA_FILE_PATH, "rb"), DatumReader())
    try:
        data = []
        for row in reader:
            data.append(row)

        # pandas can only read json or csv, so convert the data to a json string
        json_data = json.dumps(data)
        # read the json into a pandas dataframe
        dataset = pd.read_json(json_data)

        # separate features and labels
        features = dataset.copy().drop('rating', axis=1)
        labels = dataset.copy().pop('rating')

        # normalize features
        features = normalize_features(features)

        # split into train and test data
        train_features = features.sample(frac=0.8, random_state=0)
        test_features = features.drop(train_features.index)
        train_labels = labels[labels.index.isin(train_features.index)]
        test_labels = labels.drop(train_features.index)

        # convert features to numpy arrays
        train_features = train_features.to_numpy()
        test_features = test_features.to_numpy()

        # left shift labels to convert them from the range [1, 10] to the range [0, 9]
        train_labels = left_shift_labels(train_labels)
        test_labels = left_shift_labels(test_labels)

        return train_features, train_labels, test_features, test_labels
    finally:
        reader.close()
def get_manifest_hdfs_path_list(self, tmp_path_prefix, manifest_list_hdfs_path):
    local_path = '%s_%s.manifest_list.avro' % (tmp_path_prefix, random.randint(0, 10000))
    check_call([
        'hadoop', 'fs', '-copyToLocal', manifest_list_hdfs_path, local_path
    ])
    manifest_hdfs_path_list = []
    reader = None
    try:
        with open(local_path, 'rb') as fp:
            reader = DataFileReader(fp, DatumReader())
            for manifest in reader:
                manifest_hdfs_path_list.append(manifest['manifest_path'])
    finally:
        if reader:
            reader.close()
        os.remove(local_path)
    return manifest_hdfs_path_list
def read_avro_file(file):
    reader = DataFileReader(open(file, "rb"), DatumReader())
    data = []
    # 'avro.schema' in the container metadata is the embedded writer schema.
    fields = json.loads(reader.meta['avro.schema'])['fields']
    # reader.file_length is the file size in bytes, not a record count; it is only
    # used here as an upper bound on how many records to show.
    for i in range(min(100, reader.file_length)):
        rec = next(reader)
        data.append(rec)
    reader.close()
    json_fields = json.dumps(fields)
    json_data = json.dumps(data)
    return render_template('tables.html', columns=len(fields),
                           rows=reader.file_length,
                           shown_rows=min(100, reader.file_length)) + \
        json2html.convert(json=json_fields) + \
        json2html.convert(json=json_data)
def read_avro_with_schema(avro_filepath, schema_filepath):
    print("\nfile:{}\nschema:{}".format(avro_filepath, schema_filepath))
    with open(schema_filepath) as f:
        schema = avro.schema.Parse(f.read())
    datum_reader = DatumReader(reader_schema=schema)
    with open(avro_filepath, 'rb') as f:
        with DataFileReader(f, datum_reader) as dfr:
            for record in dfr:
                print(record)
def read(fin, fout=None, nrecords=0):
    "Read given avro file according to its schema and dump on stdout its content"
    reader = DataFileReader(open(fin, "rb"), DatumReader())
    fobj = open(fout, 'w') if fout else None
    count = 0
    if fobj:
        fobj.write("[\n")
    for rec in reader:
        if fobj:
            if count:
                fobj.write(",\n")
            fobj.write(json.dumps(rec))
        else:
            pprint.pprint(rec)
        if nrecords and count >= nrecords:
            break
        count += 1
    if fobj:
        fobj.write("]\n")
        fobj.close()
    reader.close()
def json_avro_schema(self):
    if self._json_avro_schema is None:
        # dependency on the avro python reference implementation since getting the
        # full json avro schema from the c-api is elusive
        from avro.datafile import DataFileReader
        from avro.io import DatumReader
        import json
        # Avro container files are binary; open in 'rb' mode.
        with open(self.filename, 'rb') as fo:
            with DataFileReader(fo, DatumReader()) as avf:
                self._json_avro_schema = json.loads(avf.meta['avro.schema'])
    return self._json_avro_schema
def main(args):
    global in_file_name, out_file_name

    processParams(args)
    print(' * Processing ' + in_file_name)
    ifh = openFile(in_file_name, "r")
    reader = DataFileReader(ifh, DatumReader())

    if out_file_name is None:
        print(' * Sending Output to STDOUT')
        ofh = sys.stdout
        print_progress_status = False
    else:
        print(' * Sending Output to ' + out_file_name)
        ofh = openFile(out_file_name, "w")
        print_progress_status = True

    rec_count = 0
    start_time = time.time()
    prev_time = start_time
    for rec in reader:
        rec_count += 1
        if is_pretty_print:
            rec_str = json.dumps(rec, indent=4, sort_keys=True)
            ofh.write("[" if (rec_count == 1) else ",\n")
        else:
            rec_str = json.dumps(rec)
            ofh.write("[" if (rec_count == 1) else ",")
        ofh.write(rec_str)

        cur_time = time.time()
        if print_progress_status and (int(cur_time - prev_time) >= STATUS_IN_TERMINAL_AFTER_SECONDS):
            print(" .... Processed record # " + str(rec_count))
            prev_time = cur_time

    ofh.write("]")
    reader.close()
    cur_time = time.time()
    print('\n * Processed ' + str(rec_count) + ' records in ' +
          str(int(round(cur_time - start_time))) + ' seconds.')
def runEngine(self, engine):
    last = [None]
    if engine.config.method == "emit":
        def emit(x):
            last[0] = x
        engine.emit = emit
        for record in DataFileReader(
                open("test/prettypfa/exoplanets.avro", "rb"), DatumReader()):
            engine.action(record)
    else:
        for record in DataFileReader(
                open("test/prettypfa/exoplanets.avro", "rb"), DatumReader()):
            last[0] = engine.action(record)
    return last[0]
def check_avro(self, filehandle):
    try:
        DataFileReader(filehandle, DatumReader())
        print(self.valid_avro_msg)
    except avro.datafile.DataFileException as _:
        if 'snappy' in str(_):
            die("%s => ERROR: %s - Is the python-snappy module installed? ('pip install python-snappy')"
                % (filehandle.name, _))
        die("%s => ERROR: %s" % (filehandle.name, _))
    except TypeError as _:
        if self.verbose > 2:
            print(_)
        die(self.invalid_avro_msg)
def file_read(self, fname):
    "Read documents from given file name"
    try:
        out = []
        with DataFileReader(open_file(fname), DatumReader()) as reader:
            for rec in reader:
                out.append(rec)
        return out
    except Exception:
        err = traceback.format_exc(limit=1).splitlines()[-1]
        msg = 'Failure in %s storage, error=%s' % (self.stype, err)
        raise ReadError(msg)
def read_orders(in_filename):
    sample = None
    counter = 0
    t0 = time()
    reader = DataFileReader(open(in_filename, 'rb'), DatumReader())
    for pedido in reader:
        if counter == 0:
            print("First iteration in {:0.8f}s".format(time() - t0))
            sample = pedido
        counter += 1
    reader.close()
    delta = time() - t0
    print("{} records read in {:0.3f}s".format(counter, delta))
    print("Example record:")
    pprint(sample)
def _get_jc_for_avro_input(self, file_in, job_conf):
    jc = dict(job_conf)
    if self.avro_input:
        jc[AVRO_INPUT] = self.avro_input
        reader = DataFileReader(file_in, DatumReader())
        schema = reader.get_meta('avro.schema')
        file_in.seek(0)
        if self.avro_input == 'v':
            jc[AVRO_VALUE_INPUT_SCHEMA] = schema
        elif self.avro_input == 'k':
            jc[AVRO_KEY_INPUT_SCHEMA] = schema
        else:
            schema_obj = json.loads(schema)
            for field in schema_obj['fields']:
                if field['name'] == 'key':
                    key_schema = field['type']
                else:
                    value_schema = field['type']
            jc[AVRO_KEY_INPUT_SCHEMA] = json.dumps(key_schema)
            jc[AVRO_VALUE_INPUT_SCHEMA] = json.dumps(value_schema)
    return jc
def test_HKMA_Bondtrades(self):
    avroFile = "testbondtrade.avro"
    numOfInvestors = 10
    numOfTradesEach = 100
    generateHKMATrades(numOfInvestors, numOfTradesEach,
                       "HKMA/SelectedSecurity.json", avroFile)
    reader = DataFileReader(open(avroFile, "rb"), DatumReader())
    cnt = 0
    for bondtrade in reader:
        self.assertIsNotNone(bondtrade["cust"])
        self.assertIsNotNone(bondtrade["tradeDate"])
        self.assertIsNotNone(bondtrade["asset"]["securityId"])
        self.assertGreater(bondtrade["asset"]["notional"], 1000000)
        dt = datetime.datetime.fromtimestamp(bondtrade["timestamp"] / 1000)
        nowdt = datetime.datetime.now()
        self.assertEqual(dt.year, nowdt.year)
        self.assertEqual(dt.month, nowdt.month)
        self.assertEqual(dt.day, nowdt.day)
        cnt += 1
    self.assertEqual(numOfInvestors * numOfTradesEach, cnt)
    reader.close()
def binary_roundtrip(self, model_class, data):
    model = model_class(data)
    schema_dumper = self.mk_schema_dumper()
    schema = avro.schema.parse(schema_dumper.dump_schema(model_class))
    fp, file_name = self.get_tempfile(text=False)
    with DataFileWriter(fp, DatumWriter(), schema) as writer:
        writer.append(dict(model))

    with DataFileReader(open(file_name, 'rb'),
                        DatumReader(readers_schema=schema)) as reader:
        [row] = reader
        return row
def avro2dataframe(path, verbose=False):
    '''
    Transforms DNA snapshot data into a pandas DataFrame object.
    '''
    read_schema = avro.schema.Parse(json.dumps(djdna_avro_schema))
    file_content = list()
    files = sorted(os.listdir(path))
    for avro_file in files:
        if (os.path.isfile(os.path.join(path, avro_file))
                and avro_file.split('.')[-1] == 'avro'):
            if verbose:
                print('Reading file {} \r'.format(avro_file), end='')
            file_path = os.path.join(path, avro_file)
            reader = DataFileReader(open(file_path, 'rb'), DatumReader(read_schema))
            users = []
            for user in reader:
                users.append(user)
            file_content.append(users)
            reader.close()
    data = [pd.DataFrame(content) for content in file_content]
    data = pd.concat(data, ignore_index=True)
    return data
def new_schema_create_new_table(filename, table_name, database_name="braze"):
    reader = DataFileReader(open(filename, "rb"), DatumReader())
    # The embedded writer schema is stored in the container metadata.
    schema = json.loads(reader.meta['avro.schema'])
    reader.close()
    create_table = "CREATE TABLE IF NOT EXISTS " + table_name
    all_field_string = ''
    for field in schema['fields']:
        comma = ', '
        if all_field_string == "":
            comma = ' '
        all_field_string = all_field_string + comma + convert_schema_to_Presto(field)
    create_table = create_table + ' ( ' + all_field_string + ' ); '
    td = tdclient.Client(os.environ['td_apikey'])
    job = td.query(database_name, create_table, type="presto")
    job.wait()
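# The helper convert_schema_to_Presto() is referenced above but not shown. A minimal
# sketch under assumed Avro-to-Presto type mappings (the real helper may differ):
def convert_schema_to_Presto(field):
    avro_to_presto = {
        "string": "VARCHAR",
        "int": "INTEGER",
        "long": "BIGINT",
        "float": "REAL",
        "double": "DOUBLE",
        "boolean": "BOOLEAN",
        "bytes": "VARBINARY",
    }
    avro_type = field['type']
    # Unions such as ["null", "string"] are collapsed to their non-null branch here.
    if isinstance(avro_type, list):
        avro_type = next(t for t in avro_type if t != "null")
    return '"%s" %s' % (field['name'], avro_to_presto.get(avro_type, "VARCHAR"))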
def decode(
    self, encoded_obj: Any
) -> Dict[str, Union[BaseRecord, StoreRecord, BaseHandler, BaseStoreRecordHandler]]:
    """Decode bytes format to BaseModel and return a dict which contains the decoded
    *BaseModel / BaseStoreRecord*.

    This function is used by kafka-python / internal call.

    Args:
        encoded_obj (Any): Bytes-encoded BaseModel / BaseStoreRecord

    Raises:
        AvroDecodeError: fail to decode bytes in BaseModel
        MissingEventClass: can't find BaseModel in own registered BaseModel list
                           (self._events)
        MissingHandlerClass: can't find BaseHandlerModel in own registered
                             BaseHandlerModel list (self._handlers)

    Returns:
        Dict[str, Union[BaseModel, BaseStoreRecord, BaseHandler, BaseStoreRecordHandler]]:
        example: {'record_class': ..., 'handler_class': ...}
    """
    try:
        reader = DataFileReader(BytesIO(encoded_obj), DatumReader())
        schema = json.loads(reader.meta.get('avro.schema').decode('utf-8'))
        schema_name = schema['namespace'] + '.' + schema['name']
        dict_data = next(reader)
    except AvroTypeException as err:
        self.logger.exception('%s', err.__str__())
        raise AvroDecodeError

    # Find a matching event name
    for e_name, event in self._events.items():
        if e_name.match(schema_name):  # type: ignore
            record_class = event
            break
    else:
        raise MissingEventClass

    # Find a matching handler name
    for e_name, handler in self._handlers.items():
        if e_name.match(schema_name):  # type: ignore
            handler_class = handler
            break
    else:
        raise MissingHandlerClass

    return {
        'record_class': record_class.from_dict(dict_data=dict_data),
        'handler_class': handler_class
    }
def _read(self, spec, fields=None):
    "Internal read API"
    if PAT_UID.match(str(spec)):
        # requested to read concrete file
        out = []
        fname = file_name(self.hdir, spec)
        with open_file(fname) as istream:
            reader = DataFileReader(istream, DatumReader())
            for data in reader:
                if isinstance(data, list):
                    for rec in data:
                        self.check(rec)
                    return data
                self.check(data)
                out.append(data)
        return out
    return self.empty_data
def process(self, elem):
    # TODO: figure out how to cache the client locally
    gcs = google.cloud.storage.Client()
    event = json.loads(elem)
    bucket = event["bucket"]
    name = event["name"]
    blob = gcs.get_bucket(bucket).blob(name)
    contents = blob.download_as_string()
    print("fetched {}/{}: {} bytes".format(bucket, name, len(contents)))
    # download_as_string() returns bytes, so wrap them in a binary buffer
    rd = DataFileReader(io.BytesIO(contents), DatumReader())
    for record in rd:
        print(record)
    rd.close()
    return []
def test1(self):
    """
    Run a tethered map-reduce job.

    Assumptions: 1) bash is available in /bin/bash
    """
    from word_count_task import WordCountTask
    from avro.tether import tether_task_runner
    from avro.datafile import DataFileReader
    from avro.io import DatumReader
    import avro
    import subprocess
    import shutil
    import tempfile
    import inspect

    proc = None
    exfile = None
    try:
        # TODO we use the tempfile module to generate random names for the files
        base_dir = "/tmp/test_tether_word_count"
        if os.path.exists(base_dir):
            shutil.rmtree(base_dir)

        inpath = os.path.join(base_dir, "in")
        infile = os.path.join(inpath, "lines.avro")
        lines = ["the quick brown fox jumps over the lazy dog",
                 "the cow jumps over the moon",
                 "the rain in spain falls mainly on the plains"]

        self._write_lines(lines, infile)
        true_counts = self._count_words(lines)

        if not os.path.exists(infile):
            self.fail("Missing the input file {0}".format(infile))

        # The schema for the output of the mapper and reducer
        oschema = """
{"type":"record",
 "name":"Pair","namespace":"org.apache.avro.mapred","fields":[
     {"name":"key","type":"string"},
     {"name":"value","type":"long","order":"ignore"}
 ]
}
"""

        # write the schema to a temporary file
        osfile = tempfile.NamedTemporaryFile(mode='w', suffix=".avsc",
                                             prefix="wordcount", delete=False)
        outschema = osfile.name
        osfile.write(oschema)
        osfile.close()

        if not os.path.exists(outschema):
            self.fail("Missing the schema file")

        outpath = os.path.join(base_dir, "out")

        args = []
        args.append("java")
        args.append("-jar")
        args.append(os.path.abspath("@TOPDIR@/../java/tools/target/avro-tools-@[email protected]"))
        args.append("tether")
        args.extend(["--in", inpath])
        args.extend(["--out", outpath])
        args.extend(["--outschema", outschema])
        args.extend(["--protocol", "http"])

        # form the arguments for the subprocess
        srcfile = inspect.getsourcefile(tether_task_runner)

        # Create a shell script to act as the program we want to execute.
        # We do this so we can set the python path appropriately.
        script = """#!/bin/bash
export PYTHONPATH={0}
python -m avro.tether.tether_task_runner word_count_task.WordCountTask
"""
        # We need to make sure avro is on the path
        # getsourcefile(avro) returns .../avro/__init__.py
        asrc = inspect.getsourcefile(avro)
        apath = asrc.rsplit(os.sep, 2)[0]

        # path to where the tests lie
        tpath = os.path.split(__file__)[0]

        exhf = tempfile.NamedTemporaryFile(mode='w', prefix="exec_word_count_",
                                           delete=False)
        exfile = exhf.name
        exhf.write(script.format((os.pathsep).join([apath, tpath]), srcfile))
        exhf.close()

        # make it world executable
        os.chmod(exfile, 0o755)

        args.extend(["--program", exfile])

        print("Command:\n\t{0}".format(" ".join(args)))
        proc = subprocess.Popen(args)
        proc.wait()

        # read the output
        with open(os.path.join(outpath, "part-00000.avro"), "rb") as hf:
            reader = DataFileReader(hf, DatumReader())
            for record in reader:
                self.assertEqual(record["value"], true_counts[record["key"]])
            reader.close()
    finally:
        # close the process
        if proc is not None and proc.returncode is None:
            proc.kill()
        if os.path.exists(base_dir):
            shutil.rmtree(base_dir)
        if exfile and os.path.exists(exfile):
            os.remove(exfile)
def main():
    parser = optparse.OptionParser(
        description="""Filters consumer messages based on various criteria
                       (allowed NGIs, service flavours, metrics...)""")
    parser.add_option('-g', dest='gloconf', nargs=1, metavar='global.conf',
                      help='path to global configuration file', type=str)
    group = optparse.OptionGroup(parser, 'Compute Engine usage')
    group.add_option('-d', dest='date', nargs=1, metavar='YEAR-MONTH-DAY')
    parser.add_option_group(group)
    group = optparse.OptionGroup(parser, 'Debugging usage')
    group.add_option('-f', dest='cfile', nargs=1,
                     metavar='consumer_log_YEAR-MONTH-DAY.avro')
    parser.add_option_group(group)

    (options, args) = parser.parse_args()

    global logger
    logger = Logger(os.path.basename(sys.argv[0]))

    prefilter = {'Prefilter': ['ConsumerFilePath', 'PoemExpandedProfiles',
                               'PoemNameMapping', 'LookbackPoemExpandedProfiles']}
    schemas = {'AvroSchemas': ['Prefilter']}
    output = {'Output': ['Prefilter']}
    confpath = options.gloconf if options.gloconf else None
    cglob = Global(confpath, schemas, output, prefilter)
    global globopts
    globopts = cglob.parse()

    stats = ()

    if options.cfile and options.date:
        parser.print_help()
        raise SystemExit(1)
    elif options.cfile:
        fname = options.cfile
        date = options.cfile.split('_')[-1]
        date = date.split('.')[0]
        date = date.split('-')
    elif options.date:
        date = options.date.split('-')
    else:
        parser.print_help()
        raise SystemExit(1)

    if len(date) == 0 or len(date) != 3:
        logger.error('Consumer file does not end with correctly formatted date')
        parser.print_help()
        raise SystemExit(1)

    year, month, day = date

    # avro files
    if options.cfile:
        inputFile = options.cfile
    else:
        inputFile = gen_fname_repdate(logger, year + '-' + month + '-' + day,
                                      globopts['PrefilterConsumerFilePath'.lower()], '')
    outputFile = gen_fname_repdate(logger, year + '_' + month + '_' + day,
                                   globopts['OutputPrefilter'.lower()], '')

    try:
        schema = avro.schema.parse(open(globopts['AvroSchemasPrefilter'.lower()]).read())
        # Avro container files are binary; open in 'wb'/'rb' mode.
        writer = DataFileWriter(open(outputFile, "wb"), DatumWriter(), schema)
        reader = DataFileReader(open(inputFile, "rb"), DatumReader())
    except IOError as e:
        logger.error(str(e))
        raise SystemExit(1)

    # load poem data
    ngis = loadNGIs(year, month, day)
    profiles = loadFilteredProfiles(year, month, day)
    nameMapping = loadNameMapping(year, month, day)

    s = time.time()
    msgs, msgswrit, msgsfilt, falsemonhost, falseroc, falseprofile = prefilterit(
        reader, writer, ngis, profiles, nameMapping)
    e = time.time()

    logger.info('ExecTime:%.2fs ConsumerDate:%s Read:%d Written:%d Filtered:%d(Monitoring_Host:%d,ROC:%d,ServiceTypes_Metrics:%d)'
                % (round(e - s, 2), year + '-' + month + '-' + day, msgs, msgswrit,
                   msgsfilt, falsemonhost, falseroc, falseprofile))

    reader.close()
    writer.close()
def readFile():
    reader = DataFileReader(open("part-00000.avro", "rb"), DatumReader())
    for user in reader:
        print(user)
    reader.close()
import avro.schema
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter

with open("blog.avsc") as schema_file:
    schema = avro.schema.parse(schema_file.read())

with open("blog.avro", "wb") as out_file:
    writer = DataFileWriter(out_file, DatumWriter(), schema)
    writer.append({
        "title": "Avro is awesome",
        "content": "Let's learn Avro!",
        "is_published": False
    })
    writer.close()

# Container files are binary; open in 'rb' mode for reading.
with open("blog.avro", "rb") as in_file:
    reader = DataFileReader(in_file, DatumReader())
    for blog in reader:
        print(blog)
    reader.close()
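# A plausible blog.avsc for the snippet above (an assumption; the actual schema file
# is not shown in the source). Field names and types mirror the record appended above.
BLOG_AVSC = """
{
  "type": "record",
  "name": "Blog",
  "fields": [
    {"name": "title", "type": "string"},
    {"name": "content", "type": "string"},
    {"name": "is_published", "type": "boolean"}
  ]
}
"""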
def getit(avroType):
    # The HTTP response is not seekable, so buffer it before handing it to
    # DataFileReader; also return the decoded records, since DataFileReader is an
    # iterator over records rather than a stream with a read() method.
    data = io.BytesIO(urllib.request.urlopen(url).read())
    reader = DataFileReader(data, DatumReader())
    records = list(reader)
    reader.close()
    return records
def main():
    known_schemas = avro.schema.Names()

    with open("point.avsc", "rb") as fp:
        point = avro.schema.make_avsc_object(json.loads(fp.read()), known_schemas)

    with open("review.avsc", "rb") as fp:
        review = avro.schema.make_avsc_object(json.loads(fp.read()), known_schemas)

    with open("place.avsc", "rb") as fp:
        place = avro.schema.make_avsc_object(json.loads(fp.read()), known_schemas)

    # Round-trip a Point record through an in-memory buffer.
    output = io.BytesIO()
    writer = DataFileWriter(output, DatumWriter(), point)
    writer.append({'x': 1.5, 'y': 2.75})
    writer.flush()
    serialized = output.getvalue()
    reader = DataFileReader(io.BytesIO(serialized), DatumReader())
    deserialized = tuple(reader)[0]
    assert deserialized['x'] == 1.5
    assert deserialized['y'] == 2.75
    reader.close()
    writer.close()

    # A record missing a required field must be rejected.
    try:
        output = io.BytesIO()
        writer = DataFileWriter(output, DatumWriter(), point)
        writer.append({'x': 1.5})
        assert False
    except AvroTypeException as e:
        pass

    # A record with a wrongly typed field must be rejected.
    try:
        output = io.BytesIO()
        writer = DataFileWriter(output, DatumWriter(), point)
        writer.append({'x': 1.5, 'y': "wtanaka.com"})
        assert False
    except AvroTypeException as e:
        pass

    # Round-trip a Place record with a nested location.
    output = io.BytesIO()
    writer = DataFileWriter(output, DatumWriter(), place)
    writer.append({
        'name': 'wtanaka.com',
        'location': {'x': 1.5, 'y': 2.75}
    })
    writer.flush()
    serialized = output.getvalue()
    reader = DataFileReader(io.BytesIO(serialized), DatumReader())
    deserialized = tuple(reader)[0]
    assert deserialized['location']['x'] == 1.5
    assert deserialized['location']['y'] == 2.75
    reader.close()
    writer.close()

    # Round-trip a Place record that also carries a review.
    output = io.BytesIO()
    writer = DataFileWriter(output, DatumWriter(), place)
    writer.append({
        'name': 'wtanaka.com',
        'location': {'x': 1.5, 'y': 2.75},
        'review': {'rating': 4, 'text': '4 stars would come again'},
    })
    writer.flush()
    serialized = output.getvalue()
    reader = DataFileReader(io.BytesIO(serialized), DatumReader())
    deserialized = tuple(reader)[0]
    assert deserialized['location']['x'] == 1.5
    assert deserialized['location']['y'] == 2.75
    reader.close()
    writer.close()

    # A review that does not match the review schema must be rejected.
    try:
        output = io.BytesIO()
        writer = DataFileWriter(output, DatumWriter(), place)
        writer.append({
            'name': 'wtanaka.com',
            'location': {'x': 1.5, 'y': 2.75},
            'review': {'x': 1.5, 'y': 2.75},
        })
        assert False
    except AvroTypeException as e:
        pass