def write_avro(rows, file_out, schema_path):
    """Write name/sex/count/year rows to an Avro container file.

    Parameters:
        rows: iterable of indexable rows; positions 0-3 are stored as the
            ``name``, ``sex``, ``count`` and ``year`` fields.
        file_out: path of the Avro file to create (opened with ``"wb"``).
        schema_path: path of the Avro schema used for the records.

    Each row is echoed to stdout before it is appended.
    """
    # Read the schema through a context manager so its handle is released
    # immediately instead of whenever the garbage collector gets to it.
    with open(schema_path, "rb") as schema_file:
        schema = avro.schema.parse(schema_file.read())

    writer = DataFileWriter(open(file_out, "wb"), DatumWriter(), schema)
    try:
        for line in rows:
            print("INPUT LINE: ", line)
            writer.append({"name": line[0], "sex": line[1],
                           "count": line[2], "year": line[3]})
    finally:
        # Always close: a row that fails to serialise must not leave the
        # output file open.
        writer.close()
def _write_lines(self, lines, fname):
    """
    Write the lines to an avro file named fname

    Parameters
    --------------------------------------------------------
    lines - list of strings to write
    fname - the name of the file to write to.
    """
    import avro.io as avio
    from avro.datafile import DataFileWriter
    from avro import schema

    # Create every missing parent directory.  The previous hand-rolled loop
    # prepended os.sep to each prefix, which turned relative paths into
    # absolute ones.
    parent = os.path.dirname(fname)
    if parent and not os.path.isdir(parent):
        os.makedirs(parent)

    inschema = """{"type":"string"}"""
    parsed_schema = schema.parse(inschema)

    # Avro container files are binary, so open with 'wb'; the builtin open()
    # replaces the Python-2-only file().
    with open(fname, 'wb') as hf:
        writer = DataFileWriter(hf, avio.DatumWriter(parsed_schema),
                                writers_schema=parsed_schema)
        try:
            for datum in lines:
                writer.append(datum)
        finally:
            writer.close()
def generate_sample_datasets(host_ips, metric_ids, year, month, day, hour):
    """Generate one hour of per-second sample records and copy them to HDFS.

    The Avro schema is read from HDFS with ``hadoop fs -cat``. One record per
    second and per host IP is written to a temporary local Avro file, which
    is then handed to ``hadoop fs -copyFromLocal``.

    Parameters:
        host_ips: iterable of host IPs; one record per IP for every second.
        metric_ids: passed to ``generate_random_metrics`` for each record.
        year, month, day, hour: start of the generated hour.

    Returns:
        The target path: the directory returned by ``create_hdfs_dirs``
        concatenated with the generated file name.
    """
    # Load the schema from HDFS in one read instead of concatenating
    # line by line.
    cat = subprocess.Popen(
        ['sudo', '-u', 'hdfs', 'hadoop', 'fs', '-cat',
         '/user/pnda/PNDA_datasets/datasets/.metadata/schema.avsc'],
        stdout=subprocess.PIPE)
    avro_schema = cat.communicate()[0]
    schema = avro.schema.parse(avro_schema)

    # Create the HDFS folder structure.
    hdfs_dir = create_hdfs_dirs(year, month, day, hour)
    filename = str(uuid.uuid4()) + '.avro'
    filepath = hdfs_dir + filename
    tmp_file = '/tmp/' + filename

    start_dt = datetime.datetime(year, month, day, hour, 0, 0)
    # timedelta rolls over midnight; replace(hour=hour + 1) raised
    # ValueError for hour == 23.
    end_dt = start_dt + datetime.timedelta(hours=1)
    start_ts = int(time.mktime(start_dt.timetuple()))
    end_ts = int(time.mktime(end_dt.timetuple()))

    # Avro container files are binary: open with "wb".
    writer = DataFileWriter(open(tmp_file, "wb"), DatumWriter(), schema)
    try:
        for ts in range(start_ts, end_ts):
            # Generate a random record on a per host IP basis.
            for host_ip in host_ips:
                writer.append({
                    'timestamp': ts * 1000,
                    'src': 'test',
                    'host_ip': host_ip,
                    'rawdata': generate_random_metrics(metric_ids),
                })
    finally:
        writer.close()

    # NOTE(review): the copy is started but not waited for, so the function
    # may return before the file exists in HDFS, and the temporary file is
    # never removed - confirm this is intended.
    subprocess.Popen(['sudo', '-u', 'hdfs', 'hadoop', 'fs',
                      '-copyFromLocal', tmp_file, hdfs_dir])
    return filepath
def outputManager(self, output, key, bucket, output_format='json'):
    """Serialise ``output`` to a local file and upload it to S3.

    Parameters:
        output: iterable of records (Avro) or JSON-serialisable rows.
        key, bucket: accepted for interface compatibility; the upload uses
            ``self.s3_key`` and ``self.s3_bucket``.
        output_format: ``'avro'`` writes ``<endpoint>.avro`` in the working
            directory; ``'json'`` writes newline-delimited JSON to a
            temporary file.

    Raises:
        ValueError: for any other ``output_format`` (previously this ended
            in an UnboundLocalError just before the upload).
    """
    if output_format == 'avro':
        avro_schema = avro.schema.Parse(json.dumps(schema[self.endpoint]))
        output_file = "{0}.avro".format(self.endpoint)
        writer = DataFileWriter(open(output_file, "wb"), DatumWriter(),
                                avro_schema)
        try:
            for record in output:
                writer.append(record)
        finally:
            # Release the file even when a record fails to serialise.
            writer.close()
    elif output_format == 'json':
        # `tmp` must stay referenced until the upload below has finished:
        # the temporary file is deleted as soon as it is closed.
        tmp = NamedTemporaryFile("w")
        for row in output:
            tmp.write(json.dumps(row) + '\n')
        tmp.flush()
        output_file = tmp.name
    else:
        raise ValueError(
            "Unsupported output_format: {0!r}".format(output_format))

    s3 = S3Hook(s3_conn_id=self.s3_conn_id)
    s3.load_file(
        filename=output_file,
        key=self.s3_key,
        bucket_name=self.s3_bucket,
        replace=True
    )
def dump_data(bso_number, schema, dsn, args):
    """Dump the rows of one BSO table into one Avro file per chunk.

    Parameters:
        bso_number: table number, used in queries and in the file names.
        schema: parsed Avro schema handed to every ``DataFileWriter``.
        dsn: connection description passed to ``conf_db``.
        args: options object; ``args.offset`` and ``args.output`` are read
            here and the whole object is forwarded to ``dump_rows``.

    Returns:
        The row count reported by ``dump_rows`` for the last chunk written.
        NOTE(review): this is not the grand total - confirm callers expect
        that.
    """
    offset = args.offset or 0
    # things time out around 1_500_000 rows.
    db = conf_db(dsn)
    out_file = args.output.rsplit('.', 1)
    row_count = count_rows(db, bso_number)
    # At least one chunk is always attempted, even for an empty table.
    for chunk in range(
            max(1, math.trunc(math.ceil(row_count / MAX_ROWS)))):
        print("Dumping {} rows from bso#{} into chunk {}".format(
            row_count, bso_number, chunk))
        out_file_name = "{}_{}_{}.{}".format(out_file[0], bso_number,
                                             hex(chunk), out_file[1])
        writer = DataFileWriter(open(out_file_name, "wb"), DatumWriter(),
                                schema)
        try:
            rows = dump_rows(bso_number=bso_number, chunk_offset=offset,
                             db=db, writer=writer, args=args)
        finally:
            # Close the chunk file even when the dump fails part-way.
            writer.close()
        if rows == 0:
            break
        offset = offset + rows
    return rows
def encode(self, obj: BaseRecord) -> bytes:
    """Serialise a record into an Avro container held in memory.

    The schema is looked up in ``self._schemas`` under ``obj.event_name()``
    and the payload is taken from ``obj.to_dict()``.

    Args:
        obj (BaseRecord): record to serialise.

    Raises:
        MissingEventClass: no schema is registered for the record's name.
        AvroEncodeError: the record's dict does not match the schema.

    Returns:
        bytes: the Avro container bytes.
    """
    try:
        record_schema = self._schemas[obj.event_name()]
    except KeyError as lookup_error:
        self.logger.exception('%s', str(lookup_error))
        raise MissingEventClass

    try:
        buffer = BytesIO()
        container = DataFileWriter(buffer, DatumWriter(), record_schema)
        container.append(obj.to_dict())
        # Flush before reading the buffer; it is closed together with the
        # writer afterwards.
        container.flush()
        payload = buffer.getvalue()
        container.close()
    except AvroTypeException as type_error:
        self.logger.exception('%s', str(type_error))
        raise AvroEncodeError
    else:
        return payload
def serializeDataToOCFFile(schemaFile, outputFile, dataToSerialize):
    """Write a single datum to an Avro object container file.

    Parameters:
        schemaFile: schema location handed to ``parse_schema``.
        outputFile: path of the container file to create.
        dataToSerialize: the one datum appended to the file.
    """
    logging.debug("Parsing in avro schema:" + schemaFile)
    schema = parse_schema(schemaFile)
    logging.debug("Writing avro data to:" + outputFile)
    # Container files are binary: text mode ("w") fails on Python 3 and
    # corrupts the data on platforms that translate newlines.
    writer = DataFileWriter(open(outputFile, "wb"), DatumWriter(), schema)
    try:
        writer.append(dataToSerialize)
    finally:
        writer.close()
class AvroWriter(object):
    """Single-use helper that writes a batch of records to one Avro file.

    Construction never raises: when the schema cannot be loaded or the
    output cannot be opened, the writer stays uninitialised and ``write``
    reports the failure through its return value.
    """

    def __init__(self, schema, outfile):
        self.schema = schema
        self.outfile = outfile
        self.datawrite = None
        self.avrofile = None
        self._load_datawriter()

    def _load_datawriter(self):
        """Open the output file and build the writer.

        Returns:
            True on success, False when any step raised.
        """
        try:
            lschema = load_schema(self.schema)
            self.avrofile = open(self.outfile, 'w+b')
            self.datawrite = DataFileWriter(self.avrofile, DatumWriter(),
                                            lschema)
        except Exception:
            # Do not leak the handle when the file was opened but the
            # writer could not be created.
            if self.avrofile is not None:
                self.avrofile.close()
                self.avrofile = None
            self.datawrite = None
            return False
        return True

    def write(self, data):
        """Append every element of ``data``, then close writer and file.

        Returns:
            ``(True, None)`` on success, ``(False, exception)`` otherwise.
        """
        try:
            if (not self.datawrite or not self.avrofile):
                raise AvroWriteException('AvroFileWriter not initalized')
            for elem in data:
                self.datawrite.append(elem)
            self.datawrite.close()
            self.avrofile.close()
        except Exception as e:
            return False, e
        return True, None
def serializeDataToOCFFile(schemaFile, outputFile, dataToSerialize):
    """Write a single datum to an Avro object container file.

    Parameters:
        schemaFile: schema location handed to ``parse_schema``.
        outputFile: path of the container file to create.
        dataToSerialize: the one datum appended to the file.
    """
    logging.debug("Parsing in avro schema:" + schemaFile)
    schema = parse_schema(schemaFile)
    logging.debug("Writing avro data to:" + outputFile)
    # Container files are binary: text mode ("w") fails on Python 3 and
    # corrupts the data on platforms that translate newlines.
    writer = DataFileWriter(open(outputFile, "wb"), DatumWriter(), schema)
    try:
        writer.append(dataToSerialize)
    finally:
        writer.close()
def hello_gcs(event, context):
    """Download a JSON series, store it as Avro and upload it to GCS.

    The source URL, bucket name, series id, file type and Avro schema are
    read from module-level names (``url``, ``bucket_name``, ``series_id``,
    ``file_type``, ``schema``).

    Parameters:
        event: unused by the body.
        context: trigger context; ``event_id``, ``event_type`` and
            ``timestamp`` are printed at the end.
    """
    # set storage client and get bucket
    client = storage.Client()
    bucket = client.get_bucket(bucket_name)

    # get the data; the context manager closes the HTTP response
    print('URL: {}'.format(url))
    with urllib.request.urlopen(url) as response:
        data = json.loads(response.read())

    # write to local file
    file_name = '{}.{}'.format(series_id, file_type)
    local_path = '/tmp/{}'.format(file_name)
    writer = DataFileWriter(open(local_path, "wb"), DatumWriter(), schema)
    try:
        for record in data['observations']:
            days_since_epoch, data_point = convert_data_types(record)
            writer.append({"date": days_since_epoch, "value": data_point})
    finally:
        # Release the local file even when a record cannot be converted.
        writer.close()

    # set Blob; the uploaded object name carries a timestamp
    file_name = '{}_{}.{}'.format(series_id, get_datetime(), file_type)
    blob = storage.Blob(file_name, bucket)

    # upload the file to GCS
    blob.upload_from_filename(local_path)

    print('Event ID: {}'.format(context.event_id))
    print('Event type: {}'.format(context.event_type))
    print("""This Function was triggered by messageId {} published at {} """.format(context.event_id, context.timestamp))
def save_records_to_avrofile(
        flows_towrite, fn_output,
        avro_schema=cons.DEFAULT_AVRO_NFCAP_FLOWS_SCHEMA_FILEPATH):
    """
    Write to an AVRO file a given dictionary or list of dicts containing
    flow records.

    :param flows_towrite: dict (its values are written) or list of flow
        records. Subclasses such as ``OrderedDict`` are accepted as well.
        Any other type produces a file without records.
    :param fn_output: .avro output filepath and name.
    :param avro_schema: path of the schema used to write the records.
    :return: none
    """
    # load schema
    with open(avro_schema, "rb") as schema_file:
        schema = avro.schema.parse(schema_file.read())

    if isinstance(flows_towrite, dict):
        records = flows_towrite.values()
    elif isinstance(flows_towrite, list):
        records = flows_towrite
    else:
        records = ()

    # create object writer
    writer = DataFileWriter(open(fn_output, "wb"), DatumWriter(), schema,
                            codec="deflate")
    try:
        for record in records:
            writer.append(record)
    finally:
        # Close on every path; previously the writer stayed open for
        # unsupported input types and when a record failed.
        writer.close()
def write_data_to_avro(raw_data, data_type):
    """Write every row of a data frame to ``avro/<data_type>.avro``.

    Parameters:
        raw_data: object exposing ``iterrows()`` whose rows provide
            ``to_dict()`` (presumably a pandas DataFrame - confirm).
        data_type: base name of the schema file ``avro/<data_type>.avsc``
            and of the output file. For ``"stops"`` the ``stop_lat_lon``
            value is expanded into a ``stop_lon``/``stop_lat`` record built
            from its ``x`` and ``y`` attributes.
    """
    data_folder = Path('avro')
    avro_file_path = data_folder / (data_type + '.avro')
    avsc_file_path = data_folder / (data_type + '.avsc')

    with open(avsc_file_path.resolve(), "rb") as schema_file:
        schema = avro.schema.Parse(schema_file.read())

    writer = DataFileWriter(open(avro_file_path.resolve(), "wb"),
                            DatumWriter(), schema)
    try:
        for _, record in raw_data.iterrows():
            row = record.to_dict()
            if data_type == "stops":
                point = row['stop_lat_lon']
                row['stop_lat_lon'] = {'stop_lon': point.x,
                                       'stop_lat': point.y}
            writer.append(row)
    finally:
        writer.close()
def _recv_exactly(connection, size):
    """Read exactly ``size`` bytes from ``connection`` or raise EOFError."""
    chunks = []
    remaining = size
    while remaining > 0:
        chunk = connection.recv(remaining)
        if not chunk:
            raise EOFError(
                "connection closed with %d bytes still expected" % remaining)
        chunks.append(chunk)
        remaining -= len(chunk)
    return b"".join(chunks)


def handle_avro_client_print_to_file(connection, address):
    """Receive one length-prefixed Avro container and store its records.

    The message is a 4-byte big-endian length followed by that many bytes
    of Avro container data. Its records are rewritten to
    ``schema/addressbook.avro`` using ``schema/addressbook.avsc``.

    Parameters:
        connection: connected socket-like object providing ``recv``.
        address: peer address; unused by the body.

    Returns:
        The length of the received message body in bytes.

    Raises:
        EOFError: the peer closed the connection before the full header
            or body arrived.
    """
    with open("schema/addressbook.avsc", "rb") as schema_file:
        schema = avro.schema.Parse(schema_file.read())

    # recv() may return fewer bytes than requested, so loop until the
    # whole header and the whole body have arrived.
    header = _recv_exactly(connection, 4)
    message_length, = struct.unpack('>I', header)
    message = _recv_exactly(connection, message_length)

    message_buf = io.BytesIO(message)
    reader = avro.datafile.DataFileReader(message_buf, avro.io.DatumReader())
    try:
        # Create a data file using DataFileWriter
        dataFile = open("schema/addressbook.avro", "wb")
        writer = DataFileWriter(dataFile, DatumWriter(), schema)
        try:
            for thing in reader:
                writer.append(thing)
        finally:
            writer.close()
    finally:
        reader.close()
    return len(message)
def write_to_hdfs(rows: List[Tuple[str, str]]):
    """Write (title, description) rows to ``videos.avro`` and upload it.

    The HDFS endpoint and login come from the ``local_hdfs`` connection.
    The file is written locally and then uploaded to ``/tmp/videos.avro``.

    Parameters:
        rows: sequence of ``(title, description)`` pairs.
    """
    conn: Connection = Connection.get_connection_from_secrets('local_hdfs')
    # Strip "user[:password]@" from the URI. Only the stripped form is
    # printed so credentials do not end up in the task log.
    pat = re.compile(r"http://(\w+(:\w+)?)?@")
    uri = pat.sub("http://", conn.get_uri())
    print(uri)
    print(conn.login)
    client = InsecureClient(uri, user=conn.login)

    sch = avro.schema.make_avsc_object({
        'type': 'record',
        'name': 'Video',
        'fields': [
            {'type': {'type': 'string', 'avro.java.string': 'String'},
             'name': 'title'},
            {'type': ["null",
                      {'type': 'string', 'avro.java.string': 'String'}],
             'name': 'description'},
        ]
    })

    local_file_name = 'videos.avro'
    writer = DataFileWriter(open(local_file_name, "wb"), DatumWriter(), sch)
    try:
        for row in rows:
            print(row)
            writer.append({"title": row[0], "description": row[1]})
    finally:
        writer.close()
    client.upload('/tmp/videos.avro', local_file_name)
def _produce_test_input(self):
    """Create ``tmp.avro`` with one nested test record and upload it.

    The file is uploaded to ``self.gcs_dir_url + "/tmp.avro"`` through
    ``self.gcs_client`` and removed again by a registered cleanup.
    """
    schema = avro.schema.parse("""
    {
      "name": "TestQueryTask_record",
      "type": "record",
      "doc": "The description",
      "fields": [
        {"name": "col0", "type": "int", "doc": "The bold"},
        {"name": "col1", "type": {
          "name": "inner_record",
          "type": "record",
          "doc": "This field shall be an inner",
          "fields": [
            {"name": "inner", "type": "int", "doc": "A inner field"},
            {"name": "col0", "type": "int", "doc": "Same name as outer but different doc"},
            {"name": "col1", "type": ["null", "string"], "default": null, "doc": "Nullable primitive"},
            {"name": "col2", "type": ["null", { "type": "map", "values": "string" }], "default": null, "doc": "Nullable map"}
          ]
        }, "doc": "This field shall be an inner"},
        {"name": "col2", "type": "int", "doc": "The beautiful"},
        {"name": "col3", "type": "double"}
      ]
    }""")
    avro_file = open("tmp.avro", "wb")
    # Register the cleanup only once the file exists; otherwise a failed
    # open would be followed by a second error from os.remove.
    self.addCleanup(os.remove, "tmp.avro")
    writer = DataFileWriter(avro_file, DatumWriter(), schema)
    try:
        writer.append({'col0': 1000,
                       'col1': {'inner': 1234, 'col0': 3000},
                       'col2': 1001,
                       'col3': 1.001})
    finally:
        writer.close()
    self.gcs_client.put("tmp.avro", self.gcs_dir_url + "/tmp.avro")
def score(graphs, schema, url, port):
    """POST a batch of graphs, Avro-encoded, to a prediction endpoint.

    Each entry of ``graphs`` is a mapping from which these keys are read:
      * ``idx``      -> stored as the record's ``index``
      * ``vertices`` -> stored as ``vertices``
      * ``edges``    -> stored as ``edges``
      * ``label``    -> optional, stored as ``label`` (``None`` if absent)

    The request goes to ``<url>:<port>/predictUnstructured/?ret_mode=binary``
    and the ``requests`` response object is returned unchanged.
    """
    buffered = BufferedWriter(BytesIO())
    avro_out = DataFileWriter(buffered, avro.io.DatumWriter(), schema)
    for entry in graphs:
        record = {
            "edges": entry["edges"],
            "vertices": entry["vertices"],
            "index": entry["idx"],
            "label": entry.get("label"),
        }
        avro_out.append(record)
    # Flush before reading the raw buffer; closing the writer afterwards
    # also closes the stream.
    avro_out.flush()
    payload = buffered.raw.getvalue()
    avro_out.close()

    url = "{}:{}/predictUnstructured/?ret_mode=binary".format(
        url.strip("/"), port)
    return requests.request(
        "POST", url,
        headers={'Content-Type': 'application/octet-stream'},
        data=payload)
def export(table, args):
    """Export one database table to ``<dest>/<table>.avro``.

    When ``exp_dict[table]`` holds a custom exporter, the work is delegated
    to its ``export`` method. Otherwise all rows are fetched and written
    using the schema in ``schema/<table>.avsc``.

    Parameters:
        table: table name. It is interpolated into the SQL text, so it must
            come from a trusted source.
        args: options object; only ``args.dest`` is read here.
    """
    global cnxpool
    dest = args.dest
    print('{} exporting table {}...'.format(strftime("%H:%M:%S"), table))

    if exp_dict[table] is not None:
        exp_dict[table].export(table, dest, args)
        return

    cnx = cnxpool.get_connection()
    writer = None
    try:
        query = "SELECT * from {}".format(table)
        cursor = cnx.cursor(dictionary=True)
        try:
            cursor.execute(query)
            rows = cursor.fetchall()
        finally:
            # All rows are already in memory, so the cursor can go now.
            cursor.close()

        schema_path = os.path.join("schema", "{}.avsc".format(table))
        with open(schema_path, "rb") as schema_file:
            schema = avro.schema.parse(schema_file.read())

        file_path = os.path.join(dest, "{}.avro".format(table))
        print('{} exporting to {}'.format(strftime("%H:%M:%S"), file_path))
        writer = DataFileWriter(open(file_path, "wb"), DatumWriter(), schema)
        for i, row in enumerate(rows):
            if i != 0 and i % 5000 == 0:
                print('{} {} records exported...'.format(
                    strftime("%H:%M:%S"), i))
            writer.append(row)
    except BaseException:
        # Report the exception type, then let it propagate unchanged.
        print(sys.exc_info()[0])
        raise
    finally:
        try:
            cnx.close()
        finally:
            # Runs even if returning the connection fails.
            if writer is not None:
                writer.close()
def testAppend(filename):
    """Append test persons 10 through 19 to an existing Avro file."""
    # The writer is built without a schema and the file is opened in
    # append mode, so the records are added after the existing content.
    appender = DataFileWriter(open(filename, 'a+b'), DatumWriter())
    for person_id in xrange(10, 20):
        person = _makeTestPerson(person_id)
        appender.append(person)
    appender.close()
def write(self, format):
    """Serialise the held data in the requested format and time it.

    Parameters:
        format: ``'json'``/``'jsch'`` (compact JSON objects written back
            to back), ``'avro'``, ``'protobuf'`` or ``'gzjson'`` (gzip
            compressed JSON). Any other value writes nothing.

    Output goes to fixed paths below ``./output``.

    Returns:
        Elapsed wall-clock seconds as a float.
    """
    time_start = time.time()
    if format == 'json' or format == 'jsch':
        with open('./output/output.json', 'w') as out:
            for base_person_obj in self._base_person_list:
                out.write(json.dumps(self._get_json_person(base_person_obj),
                                     separators=(',', ':')))
    elif format == 'avro':
        writer = DataFileWriter(open('./output/output.avro', 'wb'),
                                DatumWriter(), self._schema_avro)
        try:
            for user in self._data_dict:
                writer.append(user)
        finally:
            writer.close()
    elif format == 'protobuf':
        with open('./output/output.pb', 'wb') as out:
            for base_person_obj in self._base_person_list:
                protobuf_person = self._get_proto_buf_person(base_person_obj)
                out.write(protobuf_person.SerializeToString())
    elif format == 'gzjson':
        with gzip.open('./output/output.jsz', 'wb') as out:
            # The gzip stream is binary, so the JSON text must be encoded;
            # writing str here raises TypeError on Python 3.
            text = json.dumps(self._data_dict, separators=(',', ':'))
            out.write(text.encode('utf-8'))
    time_end = time.time()
    return time_end - time_start
def ExportToBin(self, data, schema=None) -> tuple:
    '''
    Export ``data`` as Avro container bytes.

    Parameters:
        data: one record, or a list of records written one after another.
        schema: optional schema handed to ``self._parseschema`` first. If
            that call reports failure, its result is returned unchanged.
            The schema used for writing is always ``self._data['schema']``.

    Returns:
        ``(True, bytes, schema_infos)`` on success. The bytes are also
        stored in ``self._data['data']``. On any exception the result is
        ``(False, error_message, schema_infos)``.
    '''
    if schema is not None:
        pschema = self._parseschema(schema)
        if not pschema[0]:
            return pschema
    schema = self._data['schema']

    # Anything but a record schema is discarded; the writer is then built
    # without a schema and the resulting error is reported below.
    if not type(schema) is avro.schema.RecordSchema:
        schema = None

    try:
        with tempfile.SpooledTemporaryFile(suffix='.avro') as tmp:
            writer = DataFileWriter(tmp, DatumWriter(), schema)
            # `data is list` compared against the type object itself and
            # was never true, so lists were appended as a single datum.
            records = data if isinstance(data, list) else [data]
            for record in records:
                writer.append(record)
            writer.flush()
            tmp.seek(0)
            export_bin = tmp.read()
            writer.close()
        self._data['data'] = export_bin
        return (True, export_bin, self.getSchemaInfos())
    except Exception as e:
        return (False, str(e), self.getSchemaInfos())
def _exp_wcctrn(p):
    """Export the ``wcc_trn`` rows of one flag into the current Avro file.

    ``p`` is a ``(flag, dest)`` pair. The module-level ``file_path`` and
    ``count`` track the file being filled. A new file is allocated under
    ``<dest>/wcc_trn`` when none exists yet or ``count`` has reached
    ``parallel_threshold``; only then is the schema handed to the writer.
    """
    global cnxpool, count, file_path, schema
    flag, dest = p
    print('{} [{}] exporting {}...'.format(strftime("%H:%M:%S"),
                                           os.getpid(), flag))
    cnx = cnxpool.get_connection()
    writer = None
    header_schema = None

    needs_new_file = file_path is None or count >= parallel_threshold
    if needs_new_file:
        new_name = "{}_{}.avro".format(os.getpid(),
                                       strftime("%Y%m%d_%H%M%S"))
        file_path = os.path.join(dest, "wcc_trn", new_name)
        print('{} allocating new file {}...'.format(strftime("%H:%M:%S"),
                                                    file_path))
        count = 0
        header_schema = schema

    try:
        cursor = cnx.cursor(dictionary=True, buffered=True)
        cursor.execute("SELECT * from wcc_trn where flag = %s", (flag, ))
        records = cursor.fetchall()
        fetched = cursor.rowcount
        cursor.close()

        target = open(file_path, "ab+")
        writer = DataFileWriter(target, DatumWriter(), header_schema)
        for record in records:
            writer.append(record)
        count += fetched
    except:
        print(sys.exc_info()[0])
        raise
    finally:
        cnx.close()
        if writer:
            writer.close()
def produce_kafka_messages(topic, cluster, message, data_format):
    """Publish ``message`` to ``topic`` encoded according to ``data_format``.

    * plain formats (XML, CSV, JSON, ...): the message is sent unchanged
    * ``WITH_KEY``: sent unchanged with a random 10-letter key
    * ``AVRO``: binary-encoded with a ``DatumWriter`` built from ``SCHEMA``
    * ``AVRO_WITHOUT_SCHEMA``: wrapped in a container via ``DataFileWriter``

    Any other format sends nothing. The producer is flushed in every case.
    """
    producer = cluster.kafka.producer()
    passthrough_formats = ('XML', 'CSV', 'SYSLOG', 'NETFLOW', 'COLLECTD',
                           'BINARY', 'LOG', 'TEXT', 'JSON')

    def _parsed_schema():
        return avro.schema.Parse(json.dumps(SCHEMA))

    if data_format in passthrough_formats:
        producer.send(topic, message)
    elif data_format == 'WITH_KEY':
        random_key = get_random_string(string.ascii_letters, 10).encode()
        producer.send(topic, message, key=random_key)
    elif data_format == 'AVRO':
        datum_writer = avro.io.DatumWriter(_parsed_schema())
        buffer = io.BytesIO()
        datum_writer.write(message, avro.io.BinaryEncoder(buffer))
        producer.send(topic, buffer.getvalue())
    elif data_format == 'AVRO_WITHOUT_SCHEMA':
        buffer = io.BytesIO()
        container = DataFileWriter(
            writer=buffer,
            datum_writer=avro.io.DatumWriter(_parsed_schema()),
            writer_schema=_parsed_schema())
        container.append(message)
        container.flush()
        encoded = buffer.getvalue()
        container.close()
        producer.send(topic, encoded)

    producer.flush()
def save_avro(data, file_name='data.avro', test=True):
    """Write ``data`` to an Avro file under ``DATA_ROOT``.

    The schema is loaded from ``DATA_ROOT / 'schemas.avsc'``. In test mode
    the process id is appended to the file name and the file is deleted
    again once it has been written.
    """
    import json
    import avro.schema
    from avro.datafile import DataFileWriter
    from avro.io import DatumWriter

    with open(str(DATA_ROOT / 'schemas.avsc')) as schema_file:
        schema = avro.schema.SchemaFromJSONData(json.load(schema_file))

    target_name = file_name
    if test:
        target_name = "{}.{}".format(file_name, os.getpid())
    path = str(DATA_ROOT / target_name)

    avro_out = DataFileWriter(open(path, "wb"), DatumWriter(), schema)
    try:
        for record in data:
            avro_out.append(record)
    finally:
        avro_out.close()

    if test:
        os.remove(path)
def _loadAvro(config, superSchema, daysArray):
    """Write the forecast days to an Avro file and read it back.

    The schema is generated from ``superSchema``, saved as indented JSON
    next to the data, and used to write every entry of ``daysArray``.
    The finished file is then passed to ``_readAvro``.

    Parameters:
        config: nested mapping; the schema file name, Avro file name and
            target path prefix are read from its ``"ETL"`` section.
        superSchema: input for ``_autogenerateSchema``.
        daysArray: iterable of day records to append.
    """
    print("**********************Loading ForecastDataAvro****************")
    autGenSchemaFile = config["ETL"]["Extract"]["AutGenSchemaFile"]
    forecastAvroFile = config["ETL"]["Load"]["Avro"]["File"]
    dWHForecastPath = config["ETL"]["Load"]["AvgData"]["DWHForecastPath"]

    dayAvroSchema = _autogenerateSchema(superSchema)
    with open(dWHForecastPath + autGenSchemaFile, "w") as schema_out:
        schema_out.write(json.dumps(dayAvroSchema, indent=4))

    # create avro.schema from json schema
    schema = avro.schema.Parse(json.dumps(dayAvroSchema))

    avroFile = dWHForecastPath + forecastAvroFile
    # create a writer for DWH
    writer = DataFileWriter(open(avroFile, "wb"), DatumWriter(), schema)
    try:
        # append each day
        for day in daysArray:
            writer.append(day)
    finally:
        # Close on every path so a bad record cannot leak the file handle.
        writer.close()

    _readAvro(avroFile)
def position_sorter(key_val, output_dir):
    """Write one key's records, grouped by position, to an Avro file.

    ``key_val`` is a ``(key, records)`` pair. The records are sorted by the
    integer value of their ``position`` entry and grouped on it. Each
    output record holds the position and the group's records, stripped of
    ``None`` values and of the ``position`` entry itself.

    Returns the path of the written file: ``output_dir.get()`` followed by
    the zero-padded key and ``.avro``.
    """
    from apache_beam.io.gcp import gcsio
    import avro.schema
    from avro.datafile import DataFileWriter
    from avro.io import DatumWriter
    from itertools import groupby

    def position_of(record):
        return int(record['position'])

    def clean_record(record):
        return {name: value for name, value in record.items()
                if value is not None and name != 'position'}

    key = key_val[0]
    ordered = sorted(key_val[1], key=position_of)

    out_file_path = output_dir.get() + "{:06d}.avro".format(key)
    out_file = gcsio.GcsIO().open(out_file_path, 'wb')
    writer = DataFileWriter(out_file, DatumWriter(),
                            avro.schema.parse(schema_string))

    for position, group in groupby(ordered, position_of):
        writer.append({
            "position": position,
            "values": [clean_record(member) for member in group],
        })
    writer.close()
    return out_file_path
class AvroRecordWriter(TrivialRecordWriter):
    """Record writer that deserialises key/value output into an Avro file.

    The simulator's ``avro_output`` mode selects what is written: ``'k'``
    for keys, ``'v'`` for values, anything else for ``{'key', 'value'}``
    pairs. In ``'kv'`` mode a combined ``kv`` record schema is built.
    """

    def __init__(self, simulator, stream):
        super(AvroRecordWriter, self).__init__(simulator, stream)
        self.deserializers = {}
        sim = self.simulator
        schema = None

        if sim.avro_output_key_schema:
            self.deserializers['k'] = AvroDeserializer(
                sim.avro_output_key_schema)
            schema = avro.schema.parse(sim.avro_output_key_schema)

        if sim.avro_output_value_schema:
            self.deserializers['v'] = AvroDeserializer(
                sim.avro_output_value_schema)
            schema = avro.schema.parse(sim.avro_output_value_schema)

        if sim.avro_output == 'kv':
            key_parsed = avro.schema.parse(sim.avro_output_key_schema)
            value_parsed = avro.schema.parse(sim.avro_output_value_schema)

            key_json = json.loads(sim.avro_output_key_schema)
            key_json.pop('namespace', None)
            value_json = json.loads(sim.avro_output_value_schema)
            value_json.pop('namespace', None)

            # When both sides have the same full name, the value refers to
            # the key's type by name instead of embedding the definition.
            if key_parsed.fullname != value_parsed.fullname:
                value_type = value_json
            else:
                value_type = key_parsed.name

            combined = {
                'type': 'record',
                'name': 'kv',
                'fields': [
                    {'name': 'key', 'type': key_json},
                    {'name': 'value', 'type': value_type}
                ]
            }
            schema = avro.schema.parse(json.dumps(combined))

        self.writer = DataFileWriter(self.stream, DatumWriter(), schema)

    def send(self, cmd, *vals):
        """Close the Avro writer on ``'done'``, then forward the command."""
        if cmd == 'done':
            self.writer.close()
        super(AvroRecordWriter, self).send(cmd, *vals)

    def output(self, key, value):
        """Deserialise key and/or value and append the result."""
        mode = self.simulator.avro_output
        if mode == 'k':
            datum = self.deserializers['k'].deserialize(key)
        elif mode == 'v':
            datum = self.deserializers['v'].deserialize(value)
        else:
            datum = {
                'key': self.deserializers['k'].deserialize(key),
                'value': self.deserializers['v'].deserialize(value)
            }
        self.writer.append(datum)

    def close(self):
        """Close the writer (tolerating a repeated close) and the stream."""
        try:
            self.writer.close()
        except ValueError:
            # let's ignore if already closed
            pass
        self.stream.close()
def gen_avro(filename):
    """Write every record from ``looney_records()`` to ``filename``.

    The records are serialised with the module-level ``SCHEMA``.
    """
    parsed_schema = avro.schema.parse(SCHEMA)
    out_handle = open(filename, "wb")
    avro_out = DataFileWriter(out_handle, DatumWriter(), parsed_schema)
    for entry in looney_records():
        avro_out.append(entry)
    avro_out.close()
    out_handle.close()
def encode(self, raw_data):
    """Return ``raw_data`` serialised as an in-memory Avro container.

    The datum is written with ``self._schema``.
    """
    buffer = BytesIO()
    container = DataFileWriter(buffer, DatumWriter(), self._schema)
    container.append(raw_data)
    # Flush before reading the buffer; it is closed together with the
    # writer afterwards.
    container.flush()
    encoded = buffer.getvalue()
    container.close()
    return encoded
def run(self):
    """Convert ``raw.csv`` into normalised beer records in ``beers.avro``.

    Every CSV row updates the alcohol minimum and maximum. Gift items and
    rows without a usable description are then skipped. The remaining rows
    are parsed with ``self.parse`` and written with each feature scaled by
    its class table's maximum. Alcohol is min-max normalised.
    """
    # for normalizing alcohol
    minimum, maximum = 100, 0
    collection = {}
    with open('raw.csv', 'r') as fd:
        for row in csv.reader(fd, delimiter=','):
            desc = row[3].lower().replace('.', '').replace(',', '')
            alc = float(row[-1])
            minimum = min(minimum, alc)
            maximum = max(maximum, alc)

            # Remove gifts or items without description
            if 'engin' in desc:
                continue
            if 'gjafa' in desc or 'gjafa' in row[0]:
                continue
            if 'öskju' in desc or 'öskju' in row[0]:
                continue
            if 'flöskur m/glasi' in desc or 'kútur' in row[0]:
                continue

            features = self.parse(desc.split(), row[0])
            features['alcohol'] = alc
            collection[row[0]] = features

    with open('beers.avsc', 'r') as fd:
        schema = avro.schema.Parse(fd.read())

    denominator_alc = maximum - minimum
    with open('beers.avro', 'wb') as fd:
        writer = DataFileWriter(fd, DatumWriter(), schema)
        try:
            for k, v in collection.items():
                v['bitterness'] = self.BITTERNESS['class'][
                    v['bitterness']] / self.BITTERNESS['maximum']
                v['color'] = self.COLOR['class'][
                    v['color']] / self.COLOR['maximum']
                v['clarity'] = self.CLARITY['class'][
                    v['clarity']] / self.CLARITY['maximum']
                # Scale sweetness by its own maximum; dividing by
                # CLARITY's maximum was a copy-paste slip.
                v['sweetness'] = self.SWEETNESS['class'][
                    v['sweetness']] / self.SWEETNESS['maximum']
                # Every beer has the same alcohol value when the range is
                # zero, so normalise to 0.0 instead of dividing by zero.
                if denominator_alc:
                    v['alcohol'] = (v['alcohol'] - minimum) / denominator_alc
                else:
                    v['alcohol'] = 0.0
                v['name'] = k
                writer.append(v)
        finally:
            writer.close()
def _create_avro_file(schema, items, file_prefix):
    """Write ``items`` to a new temporary ``.avro`` file.

    Parameters:
        schema: Avro schema as JSON text.
        items: iterable of records to append.
        file_prefix: prefix of the temporary file name.

    Returns:
        Path of the created file. The caller is responsible for deleting it.
    """
    import os

    # Parse first so an invalid schema does not leave a stray temp file.
    parsed_schema = avro.schema.Parse(schema)
    fd, result_file_path = tempfile.mkstemp(prefix=file_prefix,
                                            suffix='.avro')
    # mkstemp returns an already open descriptor that the caller owns.
    # Wrap it instead of opening the path a second time, which leaked it.
    with os.fdopen(fd, 'wb') as f:
        writer = DataFileWriter(f, DatumWriter(), parsed_schema)
        try:
            for s in items:
                writer.append(s)
        finally:
            writer.close()
    return result_file_path
def gen_single_day_data(date, schema):
    """Write 100,000 random tagged events to ``events2-<date>.avro``.

    Each event carries four random tags (``t1``..``t10``), a random cookie
    id, the given ``date`` and a count of 1.
    """
    # Avro container files are binary: open with "wb".
    writer = DataFileWriter(open("events2-{}.avro".format(date), "wb"),
                            DatumWriter(), schema)
    N = 10 ** 5
    try:
        for _ in range(N):
            tag1, tag2, tag3, tag4 = [
                "t{}".format(random.randint(1, 10)) for _ in range(4)]
            cookie = 'CK.{}'.format(random.randint(1, 10 ** 5))
            writer.append({"tag1": tag1, "tag2": tag2, "tag3": tag3,
                           "tag4": tag4, "date": date, "cookie": cookie,
                           "count": 1})
    finally:
        writer.close()
def testWrite(filename):
    """Create ``filename`` holding test persons 0 through 9.

    The records are written with the schema parsed from ``TEST_SCHEMA``.
    """
    parsed = avro.schema.parse(TEST_SCHEMA)
    person_file = DataFileWriter(open(filename, 'wb'), DatumWriter(), parsed)
    for person_id in xrange(10):
        person = _makeTestPerson(person_id)
        person_file.append(person)
    person_file.close()
class Avro_Merger(object):
    """Merge the ``.avro`` files of a directory into one deflate file.

    Each input is treated as: a leading metadata record, data records, and
    a trailing stats record. The merged file keeps the metadata record of
    the first input only and ends with the stats record of the last input.

    NOTE(review): the schema path, output path and input directory used
    when opening files are still the module-level names ``schema_file``,
    ``output_file`` and ``input_dir``, as before. ``path`` is only used to
    list the inputs and ``new_filename`` is not used at all - confirm
    whether the constructor arguments should replace those names.
    """

    _merge_started = False
    _avro_extention = '.avro'
    _avro_stats_record = None

    def __init__(self, path, new_filename):
        try:
            self._avro_files = [
                name for name in os.listdir(path)
                if name.endswith(self._avro_extention)]
            with open(schema_file) as schema_handle:
                schema = avro.schema.parse(schema_handle.read())
            # Avro container files are binary: open with 'wb'.
            self._writter = DataFileWriter(open(output_file, 'wb'),
                                           DatumWriter(), schema, 'deflate')
        except Exception as e:
            # The sys.exit(1) that followed this raise was unreachable.
            raise avro.schema.AvroException(e)

    def flog_metadata_handler(func):
        """
        This is a decorator that handles avro meta data as well as the very
        last stats record in each file during merging
        """
        def wrapper(self, avro_records):
            """ Wrapper method for consuming flog avro file """
            records = iter(avro_records)

            # Handle meta data: every file after the first has its leading
            # record dropped. The old check called tell() on the writer.
            if self._merge_started:
                next(records, None)
            self._merge_started = True

            def without_stats():
                # Hold each record back by one so the final (stats) record
                # is remembered instead of written. The old deque() call
                # consumed the whole iterator before anything was written.
                missing = object()
                held = missing
                for record in records:
                    if held is not missing:
                        yield held
                    held = record
                if held is not missing:
                    self._avro_stats_record = held

            # The wrapped method is a plain function here, so `self` must
            # be passed explicitly.
            return func(self, without_stats())

        wrapper.__name__ = func.__name__
        wrapper.__doc__ = func.__doc__
        return wrapper

    @flog_metadata_handler
    def consume_avro(self, avro_records):
        """ Write the avro data from the buffer to file """
        # An explicit loop: map() is lazy on Python 3 and wrote nothing.
        for record in avro_records:
            self._writter.append(record)

    def merge(self):
        """ Loop through the avros and merge each file """
        try:
            for file_ in self._avro_files:
                try:
                    avro_records = DataFileReader(
                        open(os.path.join(input_dir, file_), "rb"),
                        DatumReader())
                except Exception as e:
                    raise avro.schema.AvroException(e)
                try:
                    # Consume the records!
                    self.consume_avro(avro_records)
                finally:
                    avro_records.close()
            # Write stats data to the last of the file
            if self._avro_stats_record is not None:
                self._writter.append(self._avro_stats_record)
        finally:
            self._writter.close()
class AvroFileWriter(Writer):
    """Writer that appends objects to an Avro container file."""

    def __init__(self, schemaFile, avroFile):
        """Parse the schema in ``schemaFile`` and open ``avroFile``."""
        # The context manager releases the schema file handle right away.
        with open(schemaFile, "rb") as schema_handle:
            self.schema = avro.schema.Parse(schema_handle.read())
        self.writer = DataFileWriter(open(avroFile, "wb"), DatumWriter(),
                                     self.schema)

    def write(self, obj):
        """Append one object to the file."""
        self.writer.append(obj)

    def close(self):
        """Close the underlying Avro writer."""
        self.writer.close()
def check_schema(self, data, schema_path):
    """Check ``data`` against a schema by writing it to ``_test.avro``.

    Parameters:
        data: the datum to serialise.
        schema_path: path of the UTF-8 encoded Avro schema file.

    Any error raised while parsing the schema or appending the datum
    propagates to the caller. ``_test.avro`` is left in the working
    directory.
    """
    with open(schema_path, "rb") as schema_file:
        schema = avro.schema.Parse(schema_file.read().decode("utf-8"))
    writer = DataFileWriter(open('_test.avro', "wb"), DatumWriter(), schema)
    try:
        writer.append(data)
    finally:
        # Release the file even when the datum does not match the schema.
        writer.close()
def create_archive(basedir, destdir):
    """Bundle ``basedir`` into Avro part files in ``destdir``, then empty it.

    Every directory becomes a ``makedir`` record and every file is split
    into chunk records via ``get_file_chunks``/``makefile``. The output
    rotates to a new part file once the current one would exceed
    ``maxfilesize * 1.1``. After a successful run the archived files and
    directories are deleted from ``basedir``.
    """
    all_files = []
    all_dirs = []
    # make a snapshot in case the output directory is the bundle source -
    # so we don't recursively bundle the output
    for path, dirs, files in os.walk(basedir):
        all_dirs.extend(os.path.join(path, d) for d in dirs)
        all_files.extend(os.path.join(path, f) for f in files)

    schema_path = os.path.join(os.path.dirname(os.path.realpath(__file__)),
                               "avro-schemas.json")
    with open(schema_path) as schema_file:
        schema = avro.schema.parse(schema_file.read())

    fileprefix = time.strftime("%Y%m%d-%H%M%S")
    avrofile = fileprefix + "-part-0001.avro"
    iteration = 1
    fd = open(os.path.join(destdir, avrofile), 'wb')
    datum = avro.io.DatumWriter()
    writer = DataFileWriter(fd, datum, schema, codec='deflate')

    try:
        for d in all_dirs:
            val = makedir(os.path.basename(os.path.normpath(d)),
                          os.path.relpath(d, basedir))
            writer.append(val)

        for f in all_files:
            for sibling, numsiblings, chunk in get_file_chunks(f):
                if (fd.tell() + len(chunk)) > maxfilesize * 1.1:
                    fd, writer, iteration = rotate_avro_file(
                        fd, writer, iteration, fileprefix, destdir,
                        datum, schema)
                entry = makefile(os.path.basename(os.path.normpath(f)),
                                 os.path.relpath(f, basedir),
                                 numsiblings, sibling, chunk)
                writer.append(entry)
                writer.flush()
                # Drop the reference so the chunk can be freed promptly.
                del entry

        for f in all_files:
            os.remove(f)
        # os.walk lists parents before their children, so remove in
        # reverse: os.rmdir refuses a directory that still has
        # subdirectories in it.
        for d in reversed(all_dirs):
            os.rmdir(d)
    finally:
        writer.close()
        fd.close()
def read_log(topic, log):
    """Write ``log`` five times to the topic's Avro file and print it back.

    The schema is read from ``avro_schema/<topic>.avsc`` next to this
    module.

    NOTE(review): the data file path is the module directory concatenated
    directly with ``<topic>.avro``, with no separator, so the file lands
    beside the directory rather than inside it. Left unchanged because
    write and read agree - confirm the intended location.
    """
    here = os.path.abspath(os.path.dirname(__file__))
    with open(here + "/avro_schema/" + topic + ".avsc") as schema_file:
        schema = avro.schema.parse(schema_file.read())
    # Single-argument print calls give the same output on Python 2 and 3.
    print("schema: %s" % (schema,))

    avro_path = here + topic + ".avro"
    # Avro container files are binary: use "wb"/"rb" rather than text mode.
    writer = DataFileWriter(open(avro_path, "wb"), DatumWriter(), schema)
    try:
        for _ in range(5):
            writer.append(log)
    finally:
        writer.close()

    reader = DataFileReader(open(avro_path, "rb"), DatumReader())
    try:
        for entry in reader:
            print(entry)
    finally:
        reader.close()
def objToBin2():
    """Serialise the module-level ``datum`` records to Avro bytes.

    The records are written with the module-level schema ``sc``.

    Returns:
        The complete Avro container as bytes.
    """
    buffer = io.BytesIO()
    fwriter = DataFileWriter(buffer, DatumWriter(), sc)
    for d in datum:
        fwriter.append(d)
    # Appended records sit in the writer's block buffer until a flush, so
    # reading the stream before flushing returned the header without the
    # data. close() also closes the stream, hence getvalue() comes first.
    fwriter.flush()
    ab = buffer.getvalue()
    fwriter.close()
    return ab
def write_json_to_avro(schema_uri, output_uri, json_str):
    """Write the rows of a JSON array to an Avro container file.

    Parameters:
        schema_uri: path of the Avro schema file.
        output_uri: path of the Avro file to create.
        json_str: JSON text that decodes to an iterable of records.
    """
    with open(schema_uri) as schema_file:
        schema = avro.schema.parse(schema_file.read())
    json_list = json.loads(json_str)
    # Container files are binary: text mode ("w") fails on Python 3 and
    # corrupts the data on platforms that translate newlines.
    writer = DataFileWriter(open(output_uri, "wb"), DatumWriter(), schema)
    try:
        for row in json_list:
            writer.append(row)
    finally:
        writer.close()
def main():
    """Command-line entry point for the user-event Avro file.

    ``add <num> <filename>`` adds ``num`` generated events to ``filename``,
    keeping the events already stored there. ``list <filename>`` prints
    all stored events.
    """
    def usage():
        # Single-argument print calls behave the same on Python 2 and 3.
        print("Usage: %s" % sys.argv[0])
        print("add [num of events to add] filename")
        print("list filename")
        sys.exit(1)

    if len(sys.argv) < 3:
        usage()

    command = sys.argv[1]
    if command == 'add':
        # 'add' needs one argument more than 'list'.
        if len(sys.argv) < 4:
            usage()
        noEvents = sys.argv[2]
        filename = sys.argv[3]

        # load existing events; they must be read before the reader is
        # closed, otherwise rewriting the file below discards them
        existingEvents = []
        try:
            reader = DataFileReader(open(filename, "rb"), DatumReader())
            try:
                existingEvents = list(reader)
            finally:
                reader.close()
        except IOError:
            print(filename + ": Could not open file. Creating a new one.")

        # Write back out to disk
        try:
            with open("etc/userevent.avsc") as schema_file:
                schema = avro.schema.parse(schema_file.read())
            f = open(filename, "wb")
            writer = DataFileWriter(f, DatumWriter(), schema)
            try:
                for oldEvent in existingEvents:
                    writer.append(oldEvent)
                # Append new user events
                for _ in range(int(noEvents)):
                    newEvent = createUserEvent()
                    print(newEvent)
                    writer.append(newEvent)
            finally:
                writer.close()
            print("Wrote {0} user events".format(noEvents))
        except IOError:
            print(filename + ": Could not save file.")
    elif command == 'list':
        listAllUserEvents(sys.argv[2])
    else:
        print("Unregistered command. Exiting")
        sys.exit(1)
def main():
    """Pull messages from an AMS subscription and store them as Avro.

    The subscription is created on first use. Up to ``--nummsgs`` messages
    are pulled and acknowledged, deserialised with the given schema and
    appended to ``--outfile`` (created when missing). The process exits
    with status 1 on AMS or write errors.
    """
    parser = ArgumentParser(description="Simple AMS example of subscription pull/consume")
    parser.add_argument('--host', type=str, default='messaging-devel.argo.grnet.gr', help='FQDN of AMS Service')
    parser.add_argument('--token', type=str, required=True, help='Given token')
    parser.add_argument('--project', type=str, required=True, help='Project registered in AMS Service')
    parser.add_argument('--subscription', type=str, required=True, help='Subscription name')
    parser.add_argument('--topic', type=str, required=True, help='Given topic')
    parser.add_argument('--nummsgs', type=int, default=3, help='Number of messages to pull and ack')
    parser.add_argument('--schema', type=str, required=True, help='Avro schema')
    parser.add_argument('--outfile', type=str, required=True, help='Output avro file')
    args = parser.parse_args()

    # initialize service with given token and project
    ams = ArgoMessagingService(endpoint=args.host, token=args.token,
                               project=args.project)

    # ensure that subscription is created in first run. messages can be
    # pulled from the subscription only when subscription already exists
    # for given topic prior messages being published to topic
    try:
        if not ams.has_sub(args.subscription):
            ams.create_sub(args.subscription, args.topic)
        subscription = ams.get_sub(args.subscription, retobj=True)
    except AmsException as e:
        print(e)
        raise SystemExit(1)

    # try to pull number of messages from subscription. method will
    # return (ackIds, AmsMessage) tuples from which ackIds and messages
    # payload will be extracted.
    avro_payloads = list()
    for msg in subscription.pullack(args.nummsgs, retry=5, retrysleep=15,
                                    return_immediately=True):
        data = msg.get_data()
        msgid = msg.get_msgid()
        print('msgid={0}'.format(msgid))
        avro_payloads.append(data)

    try:
        schema = load_schema(args.schema)
        # Avro container files are binary, so open in binary mode. An
        # existing file is extended by building the writer without a
        # schema; a new file gets the loaded schema.
        if os.path.exists(args.outfile):
            avroFile = open(args.outfile, 'a+b')
            try:
                writer = DataFileWriter(avroFile, DatumWriter())
            except Exception:
                avroFile.close()
                raise
        else:
            avroFile = open(args.outfile, 'w+b')
            try:
                writer = DataFileWriter(avroFile, DatumWriter(), schema)
            except Exception:
                avroFile.close()
                raise

        try:
            for am in avro_payloads:
                msg = avro_deserialize(am, args.schema)
                writer.append(msg)
        finally:
            # Close writer and file even when a payload cannot be decoded.
            writer.close()
            avroFile.close()
    except Exception as e:
        print(e)
        raise SystemExit(1)
def write(fin, fout, schema):
    """write json to avro

    Parameters:
        fin: path of the JSON input; either one document or a list of them.
        fout: path of the Avro file to create.
        schema: path of the Avro schema file.
    """
    with open(schema) as schema_file:
        parsed_schema = avro.schema.parse(schema_file.read())
    with open(fin, 'r') as json_file:
        data = json.load(json_file)

    # Container files are binary: text mode ("w") fails on Python 3 and
    # corrupts the data on platforms that translate newlines.
    writer = DataFileWriter(open(fout, "wb"), DatumWriter(), parsed_schema)
    try:
        if isinstance(data, list):
            for doc in data:
                writer.append(doc)
        else:
            writer.append(data)
    finally:
        writer.close()
def _write_to_avro(self, log, fields):
    """Convert one metric result into Avro record(s) appended to ``log``.

    Parameters:
        log: path of the Avro file; created with the schema from
            ``self.avroSchema`` when missing, otherwise appended to.
        fields: mapping describing the result. ``serviceType``,
            ``timestamp``, ``hostName``, ``metricName`` and
            ``metricStatus`` are required; ``detailsData``,
            ``summaryData``, ``nagios_host``, ``ROC``, ``voName`` and
            ``voFqan`` are optional.

    A comma-separated ``serviceType`` produces one record for each of its
    first two entries. Writing is serialised through ``sh.thlock``. An
    I/O error is logged and ends the process with status 1.
    """
    msglist = []
    tags = {}
    msg = {'service': fields['serviceType'],
           'timestamp': fields['timestamp'],
           'hostname': fields['hostName'],
           'metric': fields['metricName'],
           'status': fields['metricStatus']}

    msgattrmap = {'detailsData': 'message',
                  'summaryData': 'summary',
                  'nagios_host': 'monitoring_host'}
    for attr, target in msgattrmap.items():
        if attr in fields:
            msg[target] = fields[attr]

    tagattrmap = {'ROC': 'roc', 'voName': 'voName', 'voFqan': 'voFqan'}
    for attr, target in tagattrmap.items():
        tags[target] = fields.get(attr, None)
    if tags:
        msg['tags'] = tags

    if ',' in fields['serviceType']:
        servtype = fields['serviceType'].split(',')
        msg['service'] = servtype[0].strip()
        msglist.append(msg)
        copymsg = msg.copy()
        copymsg['service'] = servtype[1].strip()
        msglist.append(copymsg)
    else:
        msglist.append(msg)

    sh.thlock.acquire(True)
    try:
        with open(self.avroSchema) as schema_file:
            schema = avro.schema.parse(schema_file.read())

        # Avro container files are binary, so open in binary mode.
        if path.exists(log):
            avroFile = open(log, 'a+b')
            writer_args = (avroFile, DatumWriter())
        else:
            avroFile = open(log, 'w+b')
            writer_args = (avroFile, DatumWriter(), schema)

        try:
            writer = DataFileWriter(*writer_args)
            try:
                for m in msglist:
                    writer.append(m)
            finally:
                writer.close()
        finally:
            # Also covers a failing DataFileWriter constructor.
            avroFile.close()
    except (IOError, OSError) as e:
        sh.Logger.error(e)
        raise SystemExit(1)
    finally:
        sh.thlock.release()
def testWrite(filename, schema):
    """Create the Avro file *filename* holding three sample objects.

    Parameters:
        filename: path of the Avro container file to create.
        schema: parsed Avro schema handed to the DataFileWriter.
    """
    # Arguments passed to makeObject(), in write order.
    samples = (
        ("Person A", 23),
        ("Person B", 31),
        ("Person C", 28),
    )
    sink = DataFileWriter(open(filename, 'wb'), DatumWriter(), schema)
    for first, second in samples:
        sink.append(makeObject(first, second))
    sink.close()
def main(schema_fn, csv_fn, avro_fn):
    """Convert a semicolon-delimited CSV file into an Avro container file.

    Parameters:
        schema_fn: path of the Avro schema file.
        csv_fn: path of the input CSV file (';' delimiter).
        avro_fn: path of the Avro file to create.

    Each CSV row is paired positionally with the module-level FIELDS names
    to form one record.
    """
    with open(schema_fn) as schema_file:
        schema = avro.schema.parse(schema_file.read())

    with open(csv_fn) as csv_file:
        rows = csv.reader(csv_file, delimiter=';')
        with open(avro_fn, 'wb') as avro_file:
            sink = DataFileWriter(avro_file, DatumWriter(), schema)
            # Records are built lazily, one per CSV row, as they are written.
            for record in (dict(zip(FIELDS, row)) for row in rows):
                sink.append(record)
            sink.close()
def _write_data(self, directory=None, prefix=tempfile.template, codec='null', count=len(RECORDS)):
    """Write *count* records to a new temporary Avro file and return its path.

    Parameters:
        directory: directory for the temporary file (None = system default).
        prefix: file-name prefix for the temporary file.
        codec: codec name forwarded to DataFileWriter.
        count: number of records to write; self.RECORDS is cycled through
            when count exceeds its length.

    Returns:
        The path of the file, which is also appended to self._temp_files.
    """
    tmp = tempfile.NamedTemporaryFile(delete=False, dir=directory, prefix=prefix)
    with tmp as handle:
        sink = DataFileWriter(handle, DatumWriter(), self.SCHEMA, codec=codec)
        pool_size = len(self.RECORDS)
        for position in range(count):
            # Wrap around so any count can be served from the fixed pool.
            sink.append(self.RECORDS[position % pool_size])
        sink.close()

    path_written = handle.name
    self._temp_files.append(path_written)
    return path_written
class AvroWriter(RecordWriter):
    """RecordWriter that owns an Avro DataFileWriter on an HDFS part file.

    The output path is built from the job configuration as
    ``<output dir>/part-r-<partition, 5 digits>.avro``.
    """

    # Avro schema passed to the DataFileWriter; None unless a subclass or
    # caller sets it.
    schema = None

    def __init__(self, context):
        """Open the part file for this task and create the Avro writer."""
        super(AvroWriter, self).__init__(context)
        conf = context.job_conf
        partition = int(conf['mapreduce.task.partition'])
        output_dir = conf["mapreduce.task.output.dir"]
        part_path = "%s/part-r-%05d.avro" % (output_dir, partition)
        handle = hdfs.open(part_path, "w")
        self.writer = DataFileWriter(handle, DatumWriter(), self.schema)

    def close(self):
        """Close the Avro writer, then the filesystem behind its file."""
        avro_writer = self.writer
        avro_writer.close()
        # FIXME do we really need to explicitly close the filesystem?
        avro_writer.writer.fs.close()
class AvroAppender(threading.Thread): def __init__(self, file): threading.Thread.__init__(self) self.avro_writer = DataFileWriter(open(file, "w"), DatumWriter(), schema) self.queue = Queue.Queue() self.should_stop = False self.mutex = threading.Lock() self.start() def log_append(self, user, advertiser, **kwargs): if user is not None and advertiser is not None: record = dict(user=user, advertiser=advertiser) if kwargs["ip"]: record["ip"] = kwargs["ip"] if kwargs["agent"]: record["agent"] = kwargs["agent"] if kwargs["time"]: record["timestamp"] = float(kwargs["time"]) else: record["timestamp"] = float(time.time()) if kwargs["keywords"]: record["keywords"] = list(set([string.strip() for string in kwargs["keywords"].split(",")])) self.queue.put_nowait(record) else: print "Missing user" def close_appender(self): self.mutex.acquire() self.should_stop = True self.mutex.release() def run(self): while True: try: record = self.queue.get(False, 1000) self.avro_writer.append(record) except Queue.Empty: self.mutex.acquire() stop = self.should_stop self.mutex.release() if stop: break self.avro_writer.close()
def write(self):
    """Write every element of self.listdata to the Avro file self.outfile.

    The schema is read from the file named by self.schema and the output
    file is created or truncated.

    Raises:
        SystemExit: with status 1 after logging, when the schema cannot be
            parsed, a record does not match it, or an I/O error occurs.
    """
    try:
        with open(self.schema) as schema_file:
            schema = avro.schema.parse(schema_file.read())
        # Avro container files are binary; `with` releases the handle on
        # every path, including a rejected record.
        with open(self.outfile, 'w+b') as avrofile:
            datawrite = DataFileWriter(avrofile, DatumWriter(), schema)
            for elem in self.listdata:
                datawrite.append(elem)
            datawrite.close()

    except (avro.schema.SchemaParseException, avro.io.AvroTypeException):
        self.logger.error(" couldn't parse %s" % self.schema)
        raise SystemExit(1)

    except IOError as e:
        self.logger.error(e)
        raise SystemExit(1)
def main(): """Start of execution""" #combine the schemas known_schemas = avro.schema.Names() types_schema = LoadAvsc("parameter_types.avsc", known_schemas) param_schema = LoadAvsc("parameter.avsc", known_schemas) print json.dumps(param_schema.to_json(avro.schema.Names()), indent=2) #test the schema works param_file = open("parameters.avro", "w") writer = DataFileWriter(param_file, DatumWriter(), param_schema) param_1 = {"name": "test", "description":"An Avro test.", "type":"int"} param_2 = {"name": "test", "description":"An Avro test.", "type":"boolean"} writer.append(param_1) writer.append(param_2) writer.close() reader = DataFileReader(open("parameters.avro", "r"), DatumReader()) for parameter in reader: print parameter reader.close()
def readAndWriteAvro(): """ Unlike java, avro does not let you generate code for Tweet in python. So only way to read and write data is without using code generation""" #Read the schema schema = avro.schema.parse(open("tweet.avsc").read()) #write some data writer = DataFileWriter(open("tweets.avro", "w"), DatumWriter(), schema) writer.append({"tweetId": 5, "user": "******", "text" : "Tweeting from python as well"}) writer.close() #read the same data tweets = DataFileReader(open("tweets.avro", "r"), DatumReader()) for tweet in tweets: print tweet tweets.close()
def main(argv):
    """Generate an Avro file of random user records.

    Parameters:
        argv: command-line style list:
            [program, SCHEMA_FILE, N_USERS, AVRO_FILE].

    Exits with the usage message when an argument is missing or N_USERS is
    not an integer.
    """
    try:
        schema_fn = argv[1]
        n_users = int(argv[2])
        avro_fn = argv[3]
    except (IndexError, ValueError):
        # ValueError: N_USERS was supplied but is not a valid integer.
        sys.exit('Usage: %s SCHEMA_FILE N_USERS AVRO_FILE' % argv[0])

    with open(schema_fn) as f_in:
        schema = avro.schema.parse(f_in.read())

    with open(avro_fn, 'wb') as f_out:
        writer = DataFileWriter(f_out, DatumWriter(), schema)
        for i in xrange(n_users):
            writer.append({
                'name': random.choice(NAME_POOL),
                'office': random.choice(OFFICE_POOL),
                'favorite_color': random.choice(COLOR_POOL),
                'favorite_number': i,
            })
        writer.close()
def main(): parser = argparse.ArgumentParser() parser.add_argument('-s', nargs=1, help='new schema', required=True, metavar='avro schema') parser.add_argument('-i', nargs='+', help='avro files', required=True, metavar='avro file') parser.add_argument('-ts', action='store_true', help='convert int tag values to str', required=False) parser.add_argument('-o', nargs=1, help='output directory', required=True, metavar='output directory') args = parser.parse_args() for f in args.i: out = [] if args.o[0].startswith('/'): dest = args.o[0] else: dest = os.path.abspath('.') + '/' + args.o[0] try: os.makedirs(dest) except OSError as e: if e.args[0] != errno.EEXIST: print os.strerror(e.args[0]), e.args[1], args.o[0] raise SystemExit(1) schema = avro.schema.parse(open(args.s[0]).read()) writer = DataFileWriter(open(dest + '/' + os.path.basename(f), 'w'), DatumWriter(), schema) reader = DataFileReader(open(f, 'r'), DatumReader()) try: for i, entry in enumerate(reader): if args.ts: for t in entry['tags']: if isinstance(entry['tags'][t], int): entry['tags'][t] = str(entry['tags'][t]) writer.append(entry) writer.close() except UnicodeDecodeError as e: pprint.pprint(e) print f
def _produce_test_input(self):
    """Write one sample record to tmp.avro and upload it to the GCS test dir.

    The local file is registered for removal via addCleanup and uploaded to
    ``self.gcs_dir_url + "/tmp.avro"``.
    """
    schema_json = """ { "type":"record", "name":"TrackEntity2", "namespace":"com.spotify.entity.schema", "doc":"Track entity merged from various sources", "fields":[ { "name":"map_record", "type":{ "type":"map", "values":{ "type":"record", "name":"MapNestedRecordObj", "doc":"Nested Record in a map doc", "fields":[ { "name":"element1", "type":"string", "doc":"element 1 doc" }, { "name":"element2", "type":[ "null", "string" ], "doc":"element 2 doc" } ] } }, "doc":"doc for map" }, { "name":"additional", "type":{ "type":"map", "values":"string" }, "doc":"doc for second map record" }, { "name":"track_gid", "type":"string", "doc":"Track GID in hexadecimal string" }, { "name":"track_uri", "type":"string", "doc":"Track URI in base62 string" }, { "name":"Suit", "type":{ "type":"enum", "name":"Suit", "doc":"enum documentation broz", "symbols":[ "SPADES", "HEARTS", "DIAMONDS", "CLUBS" ] } }, { "name":"FakeRecord", "type":{ "type":"record", "name":"FakeRecord", "namespace":"com.spotify.data.types.coolType", "doc":"My Fake Record doc", "fields":[ { "name":"coolName", "type":"string", "doc":"Cool Name doc" } ] } }, { "name":"master_metadata", "type":[ "null", { "type":"record", "name":"MasterMetadata", "namespace":"com.spotify.data.types.metadata", "doc":"metadoc", "fields":[ { "name":"track", "type":[ "null", { "type":"record", "name":"Track", "doc":"Sqoop import of track", "fields":[ { "name":"id", "type":[ "null", "int" ], "doc":"id description field", "default":null, "columnName":"id", "sqlType":"4" }, { "name":"name", "type":[ "null", "string" ], "doc":"name description field", "default":null, "columnName":"name", "sqlType":"12" } ], "tableName":"track" } ], "default":null } ] } ] }, { "name":"children", "type":{ "type":"array", "items":{ "type":"record", "name":"Child", "doc":"array of children documentation", "fields":[ { "name":"name", "type":"string", "doc":"my specific child\'s doc" } ] } } } ] }"""
    schema = avro.schema.parse(schema_json)

    # One record populating every field declared in the schema above.
    sample = {
        u'track_gid': u'Cool guid',
        u'map_record': {
            u'Cool key': {
                u'element1': u'element 1 data',
                u'element2': u'element 2 data',
            },
        },
        u'additional': {u'key1': u'value1'},
        u'master_metadata': {
            u'track': {u'id': 1, u'name': u'Cool Track Name'},
        },
        u'track_uri': u'Totally a url here',
        u'FakeRecord': {u'coolName': u'Cool Fake Record Name'},
        u'Suit': u'DIAMONDS',
        u'children': [{u'name': u'Bob'}, {u'name': u'Joe'}],
    }

    local_name = "tmp.avro"
    # Cleanup is registered before the file is created.
    self.addCleanup(os.remove, local_name)
    sink = DataFileWriter(open(local_name, "wb"), DatumWriter(), schema)
    sink.append(sample)
    sink.close()
    self.gcs_client.put(local_name, self.gcs_dir_url + "/tmp.avro")
def main():
    """Exercise the point/review/place schemas with in-memory Avro files.

    Valid data is written to a StringIO buffer, read back and compared;
    invalid data must be rejected with AvroTypeException.
    """
    known_schemas = avro.schema.Names()

    def load(avsc_name):
        """Build a schema object from *avsc_name* using the shared registry."""
        with open(avsc_name, "rb") as fp:
            return avro.schema.make_avsc_object(json.loads(fp.read()), known_schemas)

    def round_trip(schema, datum):
        """Serialize *datum* under *schema* and return the first record read back."""
        buf = StringIO.StringIO()
        sink = DataFileWriter(buf, DatumWriter(), schema)
        sink.append(datum)
        sink.flush()
        source = DataFileReader(StringIO.StringIO(buf.getvalue()), DatumReader())
        first = tuple(source)[0]
        source.close()
        sink.close()
        return first

    def expect_rejected(schema, datum):
        """Assert that appending *datum* under *schema* raises AvroTypeException."""
        try:
            DataFileWriter(StringIO.StringIO(), DatumWriter(), schema).append(datum)
        except AvroTypeException:
            return
        assert False

    point = load("point.avsc")
    # The review schema object is not used directly below.
    # NOTE(review): presumably it is loaded so that place.avsc can refer to
    # it through known_schemas -- confirm.
    load("review.avsc")
    place = load("place.avsc")

    # A complete point survives the round trip.
    restored = round_trip(point, {'x': 1.5, 'y': 2.75})
    assert restored['x'] == 1.5
    assert restored['y'] == 2.75

    # Missing field, then wrongly typed field.
    expect_rejected(point, {'x': 1.5})
    expect_rejected(point, {'x': 1.5, 'y': "wtanaka.com"})

    # A place without a review.
    restored = round_trip(place, {
        'name': 'wtanaka.com',
        'location': {'x': 1.5, 'y': 2.75},
    })
    assert restored['location']['x'] == 1.5
    assert restored['location']['y'] == 2.75

    # A place with a review.
    restored = round_trip(place, {
        'name': 'wtanaka.com',
        'location': {'x': 1.5, 'y': 2.75},
        'review': {'rating': 4, 'text': '4 stars would come again'},
    })
    assert restored['location']['x'] == 1.5
    assert restored['location']['y'] == 2.75

    # A point-shaped value where a review is expected.
    expect_rejected(place, {
        'name': 'wtanaka.com',
        'location': {'x': 1.5, 'y': 2.75},
        'review': {'x': 1.5, 'y': 2.75},
    })
'Cc': cleanCc, 'Date': message['Date'], 'Subject': message['Subject'], 'Body': get_body(message) }) print(cleanFrom) schema = avro.schema.Parse(open("email.avro.schema").read()) writer = DataFileWriter(open("email.avro", "wb"), DatumWriter(), schema) pathToEmails = '../emails/Archives' pathToCleanup = '../emails/name_to_address.csv' mboxfiles = [os.path.join(dirpath, f) for dirpath, dirnames, files in os.walk(pathToEmails) for f in files if f.endswith('mbox')] mailTable = [] #print(mboxfiles) for mboxfile in mboxfiles: # print(mboxfile) write_avro(mboxfile, writer, pathToCleanup) writer.close() #reader = DataFileReader(open("email.avro", "rb"), DatumReader()) #for email in reader: # print(email['Subject']) # reader.close()
def writeFile():
    """Create part-00000.avro containing a single log-line record.

    Uses the module-level ``schema``.
    """
    # Avro container files are binary, so open in "wb"; `with` releases the
    # handle even when the record is rejected.
    with open("part-00000.avro", "wb") as out:
        writer = DataFileWriter(out, DatumWriter(), schema)
        writer.append({"logline": "2016\t30"})
        writer.close()