def print_all_events(path, limit=10):
    """example stepping through all the data files and parsing them
    1. iterate through all data files
    2. open files with avro
    3. parse event JSON
    4. pretty-print events
    """
    printed = 0
    for parent, dirs, files in os.walk(path):
        for fname in sorted(files):
            printed += 1
            if printed >= limit:
                print("...")
                return
            with open(os.path.join(parent, fname), 'rb') as f:
                # this is how you open an avro file
                reader = DataFileReader(f, DatumReader())
                # an avro file provides an iterable of events
                for reading in reader:
                    # the uuid we want to use is reading.SystemProperties.connectionDeviceId
                    print(f"uuid={reading['SystemProperties']['connectionDeviceId']}")
                    # the actual payload from the app is the json body (as a bytestring)
                    try:
                        # parse it out so it looks nicer when we print:
                        reading['Body'] = json.loads(reading['Body'].decode('utf8'))
                    except ValueError:
                        # leave non-JSON bodies as bytes. This shouldn't happen!
                        pass
                    pprint.pprint(reading)

def runEngine(self, engine):
    if engine.config.method == "emit":
        engine.emit = lambda x: x
    # Avro containers must be opened in binary mode
    for record in DataFileReader(
            open("test/prettypfa/exoplanets.avro", "rb"), DatumReader()):
        engine.action(record)

def lambda_handler(event, context):
    source_bucket = event['Records'][0]['s3']['bucket']['name']
    key = urllib.parse.unquote(event['Records'][0]['s3']['object']['key'])
    splitStr = source_bucket.split(".")
    account = splitStr[0]
    profile = splitStr[1]
    processed_bucket = source_bucket + "-processed"
    processed_key = key
    stream = get_object(source_bucket, key)
    success = copy_object(source_bucket, key, processed_bucket, processed_key)
    if success:
        s3.delete_object(Bucket=source_bucket, Key=key)
    if stream is not None:
        raw_bytes = stream.read()
        avro_bytes = io.BytesIO(raw_bytes)
        reader = DataFileReader(avro_bytes, DatumReader())
        for line in reader:
            send_to_tealium(line, account, profile)
    return ""

def decode(self, encoded_event: Any) -> Dict[str, Union[BaseModel, BaseStoreRecord, BaseHandler, BaseStoreRecordHandler]]:
    try:
        reader = DataFileReader(BytesIO(encoded_event), DatumReader())
        schema = json.loads(reader.meta.get('avro.schema').decode('utf-8'))
        schema_name = schema['namespace'] + '.' + schema['name']
        event_data = next(reader)
    except AvroTypeException as err:
        self.logger.exception(f'{err.__str__()}')
        raise AvroDecodeError

    # Finds a matching event name
    for e_name, event in self._events.items():
        if e_name.match(schema_name):  # type: ignore
            event_class = event
            break
    else:
        raise MissingEventClass

    # Finds a matching handler name
    for e_name, handler in self._handlers.items():
        if e_name.match(schema_name):  # type: ignore
            handler_class = handler
            break
    else:
        raise MissingHandlerClass

    return {'event_class': event_class.from_data(event_data=event_data),
            'handler_class': handler_class}

def build_cars_in_time(files):
    cars = {}
    for idx, file in enumerate(files):
        print_progress(idx, len(files))
        try:
            reader = DataFileReader(open(file, "rb"), DatumReader())
            for car in reader:
                car_reg_number = select_reg_number(car)
                if car_reg_number not in cars:
                    cars[car_reg_number] = []
                cars[car_reg_number].append({
                    'timestamp': select_timestamp(car),
                    'distanceAccumulated': select_distance_accumulated(car),
                    'regNumber': select_reg_number(car)
                })
        except TypeError:
            print("Error reading file {0}".format(file))
        finally:
            reader.close()
    return cars

def handle_file(path):
    print("Reading file from: " + path)
    reader = DataFileReader(open(path, "rb"), DatumReader())
    for record in reader:
        cset = CollectionSet()
        cset.ParseFromString(record['Body'])
        handle_collection_set(cset)

def read(self, format):
    time_start = time.time()
    if format == 'json':
        with open('./output/output.json') as file:
            json.loads(file.read())
    elif format == 'jsch':
        with open('./output/output.json') as file:
            validate(json.loads(file.read()), self._schema_json)
    elif format == 'avro':
        reader = DataFileReader(open('./output/output.avro', 'rb'), DatumReader())
        for user in reader:
            pass
        reader.close()
    elif format == 'protobuf':
        with open('./output/output.pb', 'rb') as file:
            addressbook_pb2.AddressBook().ParseFromString(file.read())
    elif format == 'gzjson':
        with gzip.open('./output/output.jsz', 'rb') as file:
            json.loads(file.read())
    time_end = time.time()
    return time_end - time_start

def get_data_in_batches(
    self,
    bucket_name,
    prefix=None,
    data_after=None,
    data_until=None,
    batch_size=10000,
):
    rows = []
    for blob in self.generate_blob_list(bucket_name, prefix, data_after, data_until):
        # download file content as bytes, read via avro
        blob_meta = {
            "blob_name": blob.name,
            "blob_modified_at": blob.updated,
        }
        bytes_data = blob.download_as_string()
        bytes_object = BytesIO(bytes_data)
        # need to "fake" the mode attribute because avro checks the mode of
        # the file given for some reason, fails otherwise
        bytes_object.mode = "rb+"
        reader = DataFileReader(bytes_object, DatumReader())
        for row in reader:
            # add blob-level metadata
            row.update(blob_meta)
            rows.append(row)
            if len(rows) >= batch_size:
                yield rows
                rows = []
    if rows:
        # yield any data that was left after the last iteration
        yield rows

def _read_avro_file(self) -> List[dict]:
    records = []
    for file in glob.glob(os.path.join(self.avro_dir_name, '**/*')):
        for record in DataFileReader(open(file, 'rb'), DatumReader()):
            records.append(record)
    return records

def read_then_to_json(client, file_names, bucket, error_keys_table):
    temp_json_output = []
    for file in file_names:
        filename = "/tmp/temp.avro"
        try:
            client.download_file(Bucket=bucket, Key=file, Filename=filename)
        except Exception as e:
            # files which could not be downloaded
            print("File could not be downloaded: " + file)
            error_keys_table['aws']['files'].append(file)
            continue
        try:
            reader = DataFileReader(open(filename, "rb"), DatumReader())
        except Exception as e:
            # files that couldn't be opened
            print("File could not be opened: " + file)
            error_keys_table['open']['files'].append(file)
            continue
        for user in reader:
            if user not in temp_json_output:
                temp_json_output.append(user)
    return temp_json_output

def _from_avro_generic(avro_container_uri: str):
    datum_counter = 0
    datum_to_return = None
    # DET TODO add other exception handling around the double with clause
    with open(avro_container_uri, "rb") as avro_fp:
        with DataFileReader(avro_fp, DatumReader()) as reader:
            # This static method can only initialize one datum from the file -
            # scan through and raise an error if more than one is found.
            # Not sure if there is lazy access to the datum - if so, returning
            # the datum to the caller for subsequent loading would be problematic.
            for datum_counter, datum in enumerate(reader, start=1):
                print('Reading datum #' + str(datum_counter))
                print('The message datum = ' + str(datum))
                if datum_counter == 1:
                    datum_to_return = datum
                if datum_counter > 1:
                    raise EmeraldMessageDeserializationError(
                        'Unable to deserialize from AVRO container "'
                        + avro_container_uri
                        + '" - this deserializer can only have one datum per file'
                        + os.linesep
                        + 'Total element count in this file = ' + str(datum_counter))
    if datum_to_return is None:
        raise EmeraldMessageDeserializationError(
            'Data could not be loaded from AVRO file "' + str(avro_container_uri)
            + '" using schema '
            + AbstractContainer.get_avro_schema_record().avro_schema_name)
    print('Datum to return = ' + str(datum_to_return))
    print('Type of data to return = ' + str(type(datum_to_return)))
    return datum_to_return

def main(args):
    log = logging.getLogger(__name__)
    log.setLevel(logging.INFO)
    sys_log = logging.handlers.SysLogHandler("/dev/log")
    sys_format = logging.Formatter(
        '%(name)s[%(process)d]: %(levelname)s %(message)s')
    sys_log.setFormatter(sys_format)
    log.addHandler(sys_log)

    # open the Avro container in binary mode and grab the writer schema
    reader = DataFileReader(open(args.avro_file, "rb"), DatumReader())
    schema = reader.datum_reader.writers_schema
    for i, row in enumerate(reader):
        log.debug("Consumer row:" + str(row))
        # re-encode each record as a single binary datum
        writer = DatumWriter(schema)
        bytes_writer = io.BytesIO()
        encoder = BinaryEncoder(bytes_writer)
        writer.write(row, encoder)
        raw_bytes = bytes_writer.getvalue()
        # base64-encode and decode to str so the message is JSON-serializable
        b64enc = base64.b64encode(raw_bytes).decode('ascii')
        msg = {"messages": [{"data": b64enc}]}
        json_str = json.dumps(msg)
        log.debug("json msg:" + json_str)
        publish(json_str, args.ams_endpoint, args.ams_project, args.ams_topic,
                args.ams_key, log)

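# A hedged sketch (not part of the original script) of the reverse path for the
# rows published above: decode one base64 datum back into a record with
# BinaryDecoder, given the same writer schema. `b64enc` and `schema` stand in
# for the values produced in main().
def decode_row(b64enc, schema):
    import base64
    import io

    from avro.io import BinaryDecoder, DatumReader

    raw_bytes = base64.b64decode(b64enc)
    decoder = BinaryDecoder(io.BytesIO(raw_bytes))
    # pass the writer schema positionally to avoid keyword-name differences
    # between avro library versions
    return DatumReader(schema).read(decoder)
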
def generic_dataframe(self, df, avro_schema, assert_fns=None):
    """Generic test running function for arbitrary avro schemas.

    Writes a dataframe containing the records to avro. Reads back and
    compares with the original.
    """
    print(avro_schema)
    cyavro.write_avro_file_from_dataframe(df, self.filename,
                                          json.dumps(avro_schema),
                                          codec='null')

    if assert_fns is None:
        assert_fns = {}

    df_read = cyavro.read_avro_file_as_dataframe(self.filename)

    import avro.schema
    from avro.datafile import DataFileReader, DataFileWriter
    from avro.io import DatumReader, DatumWriter

    with open(self.filename, 'rb') as fo:
        reader = DataFileReader(fo, DatumReader())
        records = []
        for user in reader:
            records.append(user)
        df_reference = pd.DataFrame(records)
        reader.close()

    success = True

    for col in avro_schema["fields"]:
        colname = col['name']
        assert_fn = assert_fns.get(colname, np.testing.assert_array_equal)

        def print_fail_header(s):
            print('#' * len(s))
            print("FAIL: Column {}".format(col))
            print('#' * len(s))
            print(s)

        try:
            assert_fn(df_read[colname], df[colname])
        except AssertionError:
            print_fail_header(
                "Failed for cyavro read comparison {}\n".format(col))
            traceback.print_exc(file=sys.stdout)
            success = False

        try:
            assert_fn(df_reference[colname], df[colname])
        except AssertionError:
            print_fail_header(
                "Failed for cyavro write comparison {}\n".format(col))
            traceback.print_exc(file=sys.stdout)
            success = False

    assert success

def get_schema(self):
    """Lazy accessor for the data store schema.

    If a schema is given as a run parameter, return it. Otherwise extract
    the schema from the Avro data store files.
    """
    if self._schema:
        return self._schema
    else:
        if not self._schema_path:
            # no schema supplied - read it from the first Avro file
            paths = self.__get_paths_to_avro_files()
            with DataFileReader(
                    paths[0].open("rb"),
                    _FieldsOrderPreservingDatumReader()) as reader:
                self._schema = avro.schema.parse(
                    reader.get_meta('avro.schema'))
            return self._schema
        else:
            # a schema is given
            try:
                self._schema = avro.schema.parse(
                    self._schema_path.open("r").read())
                return self._schema
            except TypeError:
                error("supplied schema cannot be parsed!")
                raise

def get_flowrecords_from_flowdata_file(filename_path_input):
    """
    Create a Python generator to read the csv/txt/avro file, yielding the
    records for processing.
    *Important: when considering CSV/TXT files remember to use files without
    header/statistics as input files*
    :param filename_path_input: exported csv/txt/avro flow input file from the
        original nfpcap file via NFDUMP
    :return: generator of records from the file
    """
    if filename_path_input.lower().endswith(('.csv', '.txt')):
        with open(filename_path_input) as csvfile:
            reader = csv.reader(csvfile)
            for line in reader:
                yield create_flow_record_from_csv(line)
    # >> default extension Apache AVRO <<
    else:
        # prepare to read binary
        flowsrecords_reader = DataFileReader(open(filename_path_input, "rb"),
                                             DatumReader())
        try:
            for flow in flowsrecords_reader:
                yield flow
        except zlib.error as ze:
            print(ze)
        except IOError as ioe:
            print(ioe)

def deserializeDataFromFile2Str(inputFile):
    logging.debug("Deserializing file: " + inputFile)
    # Avro containers must be opened in binary mode
    reader = DataFileReader(open(inputFile, "rb"), DatumReader())
    data = ""
    for item in reader:
        data = data + str(item)
    reader.close()
    return data

def open_avrofile(fn_input):
    """
    Return the data file reader for a given AVRO file.
    *note that to open an .avro file it is not necessary to supply the schema,
    because it is embedded in the file*
    :param fn_input: path to the input .avro file
    :return: record reader object
    """
    return DataFileReader(open(fn_input, "rb"), DatumReader())

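# A minimal usage sketch for open_avrofile (illustrative only, not part of the
# original module): iterate the returned reader and close it when done.
def count_avro_records(fn_input):
    reader = open_avrofile(fn_input)
    try:
        return sum(1 for _ in reader)
    finally:
        reader.close()
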
def run(argv=None, save_main_session=True):
    '''Main entry point; defines and runs the wordcount pipeline.'''
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--input',
        dest='input',
        default='gs://dataflow-samples/shakespeare/kinglear.txt',
        help='Input file to process.')
    parser.add_argument(
        '--output',
        dest='output',
        required=True,
        help='Output file to write results to.')
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    pipeline_options.view_as(SetupOptions).save_main_session = save_main_session
    p = beam.Pipeline(options=pipeline_options)

    # Read the text file[pattern] into a PCollection.
    lines = p | 'read' >> ReadFromText(known_args.input)

    processed_users = (lines
                       | 'splits' >> beam.Map(split_and_lower)
                       | 'noNum' >> beam.Map(no_num_format)
                       | 'formatOut' >> beam.Map(format_output))

    (processed_users
     | 'uniqueUser' >> beam.Distinct()
     | 'writeUnique' >> WriteToText(known_args.output, file_name_suffix='.csv'))

    schema = avro.schema.parse(open("user.avsc", "rb").read())
    processed_users | 'avro_write' >> beam.io.avroio.WriteToAvro(
        'output_avro', schema, file_name_suffix='.avro')

    reader = DataFileReader(open("output_avro-00000-of-00001.avro", "rb"),
                            DatumReader())
    for user in reader:
        print(user)
    reader.close()

    result = p.run()
    result.wait_until_finish()

    # Do not query metrics when creating a template which doesn't run
    if (not hasattr(result, 'has_job')  # direct runner
            or result.has_job):  # not just a template creation
        empty_lines_filter = MetricsFilter().with_name('empty_lines')
        query_result = result.metrics().query(empty_lines_filter)
        if query_result['counters']:
            empty_lines_counter = query_result['counters'][0]
            logging.info('number of empty lines: %d', empty_lines_counter.result)

        word_lengths_filter = MetricsFilter().with_name('word_len_dist')
        query_result = result.metrics().query(word_lengths_filter)
        if query_result['distributions']:
            word_lengths_dist = query_result['distributions'][0]
            logging.info('average word length: %d', word_lengths_dist.result.mean)

def main(event, context):
    for record in event['Records']:
        key = record['s3']['object']['key']
        key = urllib.parse.unquote(key)
        print(key)
        json_file_key = create_json_s3_path(key)
        print(json_file_key)

        obj = client.get_object(Bucket=BUCKET_NAME, Key=key)
        obj = io.BytesIO(obj['Body'].read())
        reader = DataFileReader(obj, DatumReader())

        # derive the event name from the writer schema embedded in the file
        schema = reader.datum_reader.writers_schema
        schema = schema.__dict__
        try:
            event_name = schema['_props']['name'].lower()
        except Exception:
            event_name = None

        converted_avro_data = [
            dict(row, **add_on_dict(obj=row, event=event_name)) for row in reader
        ]
        json_data = json.dumps(converted_avro_data, indent=1)
        print(json.dumps(converted_avro_data[:10]))

        try:
            resource.Object(BUCKET_NAME, json_file_key).put(
                Body=(bytes(json_data.encode('UTF-8'))))
            print("Objects Uploaded to S3")
        except Exception as e:
            print("Objects failed to send to s3")
            print(e)

        # Batch records in chunks of 500 and json.dumps each row
        if event_name in [
                'uninstall', 'conversion', 'click', 'impression', 'bounce',
                'open', 'send'
        ]:
            try:
                firehose_records = create_payload(converted_avro_data)
                firehose_records = list(divide_chunks(firehose_records, 500))
                for chunk in firehose_records:
                    response = firehost_client.put_record_batch(
                        DeliveryStreamName=STREAM_NAME, Records=chunk)
                    print("Objects sent to Firehose stream: {0}".format(STREAM_NAME))
                    print('{0} firehose records failed'.format(
                        response['FailedPutCount']))
            except Exception as e:
                print("Objects failed to send to Firehose")
                print(e)

def deserialize_records(record) -> list:
    # print(f"DESRECORD {record}")
    with io.BytesIO(record) as buf:
        reader = DataFileReader(buf, DatumReader())
        msgs = [msg for msg in reader]
        # print(f'WHAT? {msgs}')
        reader.close()
    return msgs

def binToObjSChema(ab):
    datum = io.BytesIO(ab)
    reader = DataFileReader(datum, DatumReader())
    # read the embedded writer schema from the container metadata
    cschema = reader.get_meta('avro.schema')
    print(cschema)
    for user in reader:
        print(user)
    reader.close()

def read_log(topic, log):
    base_dir = os.path.abspath(os.path.dirname(__file__))
    schema = avro.schema.parse(
        open(base_dir + "/avro_schema/" + topic + ".avsc").read())
    print("schema:", schema)

    avro_path = base_dir + topic + ".avro"
    writer = DataFileWriter(open(avro_path, "wb"), DatumWriter(), schema)
    for i in range(5):
        writer.append(log)
    writer.close()

    reader = DataFileReader(open(avro_path, "rb"), DatumReader())
    for log in reader:
        print(log)

def testRead(filename):
    fd = open(filename, 'rb')
    datum = DatumReader()
    reader = DataFileReader(fd, datum)
    for record in reader:
        print(record['name'], record['age'])
    reader.close()

def read_avro(iostream, runs=1):
    times = []
    for _ in range(runs):
        iostream.seek(0)
        start = time.time()
        records = list(DataFileReader(iostream, DatumReader()))
        end = time.time()
        times.append(end - start)
    print(f'... {runs} runs averaged {sum(times) / runs} seconds')
    return records

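# A hedged sketch of how an in-memory stream for read_avro might be built with
# DataFileWriter; the schema and record below are illustrative only. Depending
# on the installed avro package, the parser is avro.schema.parse or
# avro.schema.Parse.
def build_sample_stream():
    import io
    import json

    import avro.schema
    from avro.datafile import DataFileWriter
    from avro.io import DatumWriter

    schema = avro.schema.parse(json.dumps({
        "type": "record",
        "name": "User",
        "fields": [{"name": "name", "type": "string"}],
    }))
    buf = io.BytesIO()
    writer = DataFileWriter(buf, DatumWriter(), schema)
    writer.append({"name": "example"})
    writer.flush()  # flush instead of close so the buffer stays open
    buf.seek(0)
    return buf

# e.g. records = read_avro(build_sample_stream(), runs=3)
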
def evaluate_file(fname: str):
    logger.info("Opening file %s", fname)
    reader = DataFileReader(open(fname, "rb"), DatumReader())
    logger.info("Counting lines...")
    i = 0
    for val in reader:
        i += 1
        if i % 1000 == 0:
            logger.debug("Read %d lines", i)
    logger.info("Found %d lines in file", i)

def deserialize(value):
    """Deserialize an AVRO encoded binary string and yield records.

    Args:
        value (bytes): binary string value.

    Yields:
        dict: deserialized record.
    """
    with DataFileReader(io.BytesIO(value), DatumReader()) as reader:
        for record in reader:
            yield record

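# Hypothetical caller for the deserialize generator above; `payload` stands in
# for a raw Avro container read from a file or message bus.
def first_record(payload):
    for record in deserialize(payload):
        return record
    return None
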
def main(fn, out_fn, avro_mode=''):
    with open(out_fn, 'w') as fo:
        with open(fn, 'rb') as f:
            reader = DataFileReader(f, DatumReader())
            for r in reader:
                if avro_mode.upper() == 'KV':
                    r = r['key']
                fo.write('%s\t%r\n' % (r['office'], r['counts']))
    print('wrote', out_fn)

def read_corpus(corpus_path):
    avro_files_path = [
        os.path.join(corpus_path, filename)
        for filename in os.listdir(corpus_path)
        if os.path.splitext(filename)[1] == '.avro'
    ]
    for avro_file in avro_files_path:
        small_corpus = DataFileReader(open(avro_file, 'rb'), DatumReader())
        for article in small_corpus:
            yield article

def load_avro(file_name='data.avro'):
    from avro.datafile import DataFileReader
    from avro.io import DatumReader

    path = str(DATA_ROOT / file_name)
    reader = DataFileReader(open(path, "rb"), DatumReader())
    try:
        return list(reader)
    finally:
        reader.close()

def read_avro_with_schema(avro_filepath, schema_filepath):
    print("\nfile:{}\nschema:{}".format(avro_filepath, schema_filepath))
    with open(schema_filepath) as f:
        schema = avro.schema.Parse(f.read())
    datum_reader = DatumReader(reader_schema=schema)
    with open(avro_filepath, 'rb') as f:
        with DataFileReader(f, datum_reader) as dfr:
            for record in dfr:
                print(record)